# Check PCA Clustering
- **Author(s)** - Frank Grenn
- **Date Started** - March 2021
- **Quick Description:** Calculate principal components for the AMP-PD chrY data and plot them to visualize clustering by major haplogroup.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Base locations.  "$PATH" is kept as literal text inside these strings
# (Python does not expand it).
# NOTE(review): looks like a redacted placeholder — substitute the real root.
CARDDIR = "$PATH"
WRKDIR = "$PATH/chrY"

# Locations derived from the chrY working directory.
OUTDIR = f"{WRKDIR}/output_male_hemizygous_only_het_filter_run"
BFILEDIR = f"{WRKDIR}/y_male_only_bfiles"

### Get European Samples

In [None]:
# Load the genetic-ancestry table and preview its dimensions and first rows.
ancestry_csv = f"{CARDDIR}/PD/AMP-PD/Plink/2021_v2_5release/euro_king_pca_v2.5_July2021/genetic_ancestry_all_pca.csv"
anc = pd.read_csv(ancestry_csv)
for preview in (anc.shape, anc.head()):
    print(preview)

In [None]:
# Write the '#FID' and 'IID' columns of every row whose InfPop is "EUROPE"
# to a tab-separated file with no header row and no index column.
is_european = anc["InfPop"] == "EUROPE"
eur_ids = anc.loc[is_european, ["#FID", "IID"]]
eur_ids.to_csv(f"{BFILEDIR}/eur_samples.txt", sep="\t", header=None, index=None)

### Get PCs with Plink


In [None]:
# Print (not run) the plink command that subsets the chrY bfiles to the
# European samples.  The output prefix is anchored in BFILEDIR because the
# .bim it produces is subsequently read from {BFILEDIR}/amppd_male_eur.bim;
# a bare prefix would be written to whatever directory plink is launched from.
print(f"plink --bfile {BFILEDIR}/chrY_male_hemizygous_only_het_filter_hg19_final --keep {BFILEDIR}/eur_samples.txt --make-bed --out {BFILEDIR}/amppd_male_eur")

In [None]:
# Load the tab-separated, headerless .bim file and label its six columns.
# NOTE(review): these labels are local conveniences only (the file is written
# back without a header); check them against the PLINK .bim column definitions.
bim_columns = ['chr','snp','pos','bp','ref','alt']
temp = pd.read_csv(f"{BFILEDIR}/amppd_male_eur.bim", header=None, sep="\t")
temp.columns = bim_columns
for preview in (temp.shape, temp.head()):
    print(preview)

In [None]:
# Set the chromosome code of every variant to 22, then overwrite the .bim
# file in place (tab-separated, no header, no index).
# NOTE(review): presumably done so `plink --pca` accepts the chrY variants as
# autosomal — confirm.
temp = temp.assign(chr=22)
temp.to_csv(f"{BFILEDIR}/amppd_male_eur.bim", header=None, index=None, sep="\t")

In [None]:
# Print (not run) the plink PCA command.  The output prefix is anchored in
# BFILEDIR because the resulting .eigenvec is read from
# {BFILEDIR}/amppd_male_eur.eigenvec; a bare prefix would be written to
# whatever directory plink is launched from.
print(f"plink --bfile {BFILEDIR}/amppd_male_eur --pca --out {BFILEDIR}/amppd_male_eur")

In [None]:
# Read the .eigenvec file: whitespace-delimited, no header row; the first two
# columns are labelled fid/iid and every remaining column pc1, pc2, ...
#  - the separator is a raw string: "\s" in a plain literal is an invalid
#    escape sequence (a warning today, slated to become an error)
#  - the number of PC labels follows the file's column count instead of
#    assuming exactly 20 components, so a different `--pca` count still loads
pcs = pd.read_table(f"{BFILEDIR}/amppd_male_eur.eigenvec", sep=r"\s+", header=None)
n_pcs = pcs.shape[1] - 2
pcs.columns = ['fid', 'iid'] + [f"pc{i}" for i in range(1, n_pcs + 1)]
print(pcs.shape)
print(pcs.head())

In [None]:
# Load the yhaplo haplogroup calls (whitespace-delimited, no header row).
yhaplo = pd.read_csv(
    f"{OUTDIR}/output_yhaplo/haplogroups.chrY_male_hemizygous_only_het_filter_hg19_final.txt",
    sep=r"\s+",  # raw string: "\s" in a plain literal is an invalid escape sequence
    header=None,
)
yhaplo.columns = ['id', 'haplo_short', 'haplo_short_rep_snp', 'haplo']

# Major haplogroup = first character of the full haplogroup label.
yhaplo['haplo_major'] = yhaplo['haplo'].str[0]

# Keep only the first half of each ID string.
# NOTE(review): presumably the IDs are "<FID>_<IID>" with FID == IID, so the
# first half is the bare sample ID — confirm against the yhaplo output.
yhaplo['id'] = [sample_id[:len(sample_id) // 2] for sample_id in yhaplo['id']]

print(yhaplo.shape)
print(yhaplo.head())

# Keep id + haplogroup columns and prefix the latter with the tool name.
yhaplo = yhaplo[['id', 'haplo', 'haplo_major']]
yhaplo.columns = ['id', 'yhaplo_haplo', 'yhaplo_haplo_major']

In [None]:
#use Ylineagetracker haplogroups since that tool assigned the most unique haplogroups compared to other tools
# Whitespace-delimited; the file's first line is consumed as the header row
# and the two columns are then relabelled.
ltrack = pd.read_csv(
    f"{OUTDIR}/output_ltracker/ltrack_hg19.hapresult.hg",
    sep=r"\s+",  # raw string: "\s" in a plain literal is an invalid escape sequence
)
ltrack.columns = ['id', 'haplo']

# Major haplogroup = first character of the haplogroup label.
ltrack['haplo_major'] = ltrack['haplo'].str[0]

# Keep only the first half of each ID string.
# NOTE(review): presumably the IDs are "<FID>_<IID>" with FID == IID — confirm.
ltrack['id'] = [sample_id[:len(sample_id) // 2] for sample_id in ltrack['id']]

print(ltrack.shape)
print(ltrack.head())
print(len(set(ltrack.haplo)))  # number of distinct haplogroups assigned

# Keep id + haplogroup columns and prefix the latter with the tool name.
ltrack = ltrack[['id', 'haplo', 'haplo_major']]
ltrack.columns = ['id', 'ltrack_haplo', 'ltrack_haplo_major']

In [None]:
# Load the snappy haplogroup calls (tab-separated, no header row).
snappy = pd.read_csv(f"{OUTDIR}/chrY_hgs_snappy.out", header=None, sep="\t")
snappy.columns = ['id','haplo','haplo_score','info_alleles']

# Some samples (e.g. "PD-PDNZ095VCJ") carry extra text after the haplogroup,
# such as "B2a1a M109,M152/Page60,P32,P50"; keep only the part before the
# first space ("B2a1a").
haplo_label = snappy['haplo'].str.split(" ").str[0]
snappy['haplo'] = haplo_label
# Major haplogroup = first character of the cleaned label.
snappy['haplo_major'] = haplo_label.str[0]

for preview in (snappy.shape, snappy.head()):
    print(preview)

# Keep id + haplogroup columns and prefix the latter with the tool name.
snappy = snappy[['id','haplo','haplo_major']].rename(
    columns={'haplo': 'snappy_haplo', 'haplo_major': 'snappy_haplo_major'}
)

In [None]:
# Combine the three per-tool haplogroup tables on 'id' (default inner joins,
# so only ids present in all three tables are kept).
sample_haplos = yhaplo.merge(ltrack, on='id').merge(snappy, on='id')
for preview in (sample_haplos.shape, sample_haplos.head()):
    print(preview)

In [None]:
# Attach the haplogroup calls to the first five principal components by
# matching the eigenvec 'fid' against the haplogroup table's 'id'.
pc_columns = ['fid', 'iid'] + [f"pc{k}" for k in range(1, 6)]
meta_merge = pcs[pc_columns].merge(sample_haplos, left_on='fid', right_on='id')

for preview in (meta_merge.shape, meta_merge.head()):
    print(preview)

In [None]:
# One groupby per tool, keyed on that tool's major-haplogroup column.
snappy_groups = meta_merge.groupby(by='snappy_haplo_major')
yhaplo_groups = meta_merge.groupby(by='yhaplo_haplo_major')
ltrack_groups = meta_merge.groupby(by='ltrack_haplo_major')

In [None]:
# PC1 vs PC2 scatter with one fixed colour per major haplogroup.
#
# This cell used to iterate over `groups`, a name that is never defined in
# this notebook (NameError).  It now uses `ltrack_groups`, the
# Y-LineageTracker grouping used by the plots that follow.
# NOTE(review): confirm that Y-LineageTracker is the grouping intended here.
fig, ax = plt.subplots()

# 'K' added so this map matches the one used by the other Y-LineageTracker
# colour plot in this notebook; a missing key would raise KeyError below.
colors = {'A':'black','B':'pink','C':'orange', 'E':'blue', 'G':'turquoise','H':'yellow','I':'purple','J':'cyan','K':'lightgreen','L':'brown','N':'grey','O':'magenta','Q':'green','R':'red','T':'violet'}

for key, group in ltrack_groups:
    group.plot(ax=ax, kind='scatter', x='pc1', y='pc2', label=key, color=colors[key])
ax.legend(loc='best',bbox_to_anchor=(1.2,1))
#plt.savefig(f"{OUTDIR}/haplo_major_pc_no_prune_biplot2.png")
plt.show()

In [None]:
# Y-LineageTracker: PC1 vs PC2, one marker series per major haplogroup,
# coloured by matplotlib's default colour cycle.
fig, ax = plt.subplots()

# Colour map kept for parity with the coloured plots; it is not referenced by
# the plotting loop in this cell.
colors = {
    'A':'black', 'B':'pink', 'C':'orange', 'E':'blue', 'G':'turquoise',
    'H':'yellow', 'I':'purple', 'J':'cyan', 'L':'brown', 'N':'grey',
    'O':'magenta', 'Q':'green', 'R':'red', 'T':'violet',
}

ax.margins(0.05)  # optional: 5% padding around the autoscaled data limits
for haplogroup, members in ltrack_groups:
    ax.plot(members['pc1'], members['pc2'], label=haplogroup,
            marker='o', linestyle='', ms=5)
ax.legend(bbox_to_anchor=(1.2,1), loc='best')

#plt.savefig(f"{OUTDIR}/haplo_major_pc_no_prune_biplot.png")
plt.show()

In [None]:
# Y-LineageTracker: PC1 vs PC2 scatter with a fixed colour per major haplogroup.
fig, ax = plt.subplots()

colors = {
    'A':'black', 'B':'pink', 'C':'orange', 'E':'blue', 'G':'turquoise',
    'H':'yellow', 'I':'purple', 'J':'cyan', 'K':'lightgreen', 'L':'brown',
    'N':'grey', 'O':'magenta', 'Q':'green', 'R':'red', 'T':'violet',
}

for haplogroup, members in ltrack_groups:
    # colors[...] raises KeyError for a haplogroup missing from the map
    members.plot.scatter(x='pc1', y='pc2', ax=ax, label=haplogroup,
                         color=colors[haplogroup])
ax.legend(bbox_to_anchor=(1.2,1), loc='best')
#plt.savefig(f"{OUTDIR}/haplo_major_pc_no_prune_biplot2.png")
plt.show()

In [None]:
# yhaplo: PC1 vs PC2, one marker series per major haplogroup, coloured by
# matplotlib's default colour cycle.
fig, ax = plt.subplots()
ax.margins(0.05)  # optional: 5% padding around the autoscaled data limits
for haplogroup, members in yhaplo_groups:
    ax.plot(members['pc1'], members['pc2'], label=haplogroup,
            marker='o', linestyle='', ms=5)
ax.legend(bbox_to_anchor=(1.2,1), loc='best')

#plt.savefig(f"{OUTDIR}/haplo_major_pc_no_prune_biplot.png")
plt.show()

In [None]:
# yhaplo: PC1 vs PC2 scatter with a fixed colour per major haplogroup.
fig, ax = plt.subplots()

colors = {
    'A':'black', 'B':'pink', 'C':'orange', 'E':'blue', 'G':'turquoise',
    'H':'yellow', 'I':'purple', 'J':'cyan', 'K':'lightgreen', 'L':'brown',
    'N':'grey', 'O':'magenta', 'Q':'green', 'R':'red', 'T':'violet',
}

for haplogroup, members in yhaplo_groups:
    # colors[...] raises KeyError for a haplogroup missing from the map
    members.plot.scatter(x='pc1', y='pc2', ax=ax, label=haplogroup,
                         color=colors[haplogroup])
ax.legend(bbox_to_anchor=(1.2,1), loc='best')
#plt.savefig(f"{OUTDIR}/haplo_major_pc_no_prune_biplot2.png")
plt.show()

In [None]:
# snappy: PC1 vs PC2, one marker series per major haplogroup, coloured by
# matplotlib's default colour cycle.
fig, ax = plt.subplots()
ax.margins(0.05)  # optional: 5% padding around the autoscaled data limits
for haplogroup, members in snappy_groups:
    ax.plot(members['pc1'], members['pc2'], label=haplogroup,
            marker='o', linestyle='', ms=5)
ax.legend(bbox_to_anchor=(1.2,1), loc='best')

#plt.savefig(f"{OUTDIR}/haplo_major_pc_no_prune_biplot.png")
plt.show()

In [None]:
# snappy: PC1 vs PC2 scatter with a fixed colour per major haplogroup.
fig, ax = plt.subplots()

colors = {
    'A':'black', 'B':'pink', 'C':'orange', 'D':'lightgrey', 'E':'blue',
    'G':'turquoise', 'H':'yellow', 'I':'purple', 'J':'cyan', 'K':'lightgreen',
    'L':'brown', 'N':'grey', 'O':'magenta', 'P':'darkred', 'Q':'green',
    'R':'red', 'T':'violet',
}

for haplogroup, members in snappy_groups:
    # colors[...] raises KeyError for a haplogroup missing from the map
    members.plot.scatter(x='pc1', y='pc2', ax=ax, label=haplogroup,
                         color=colors[haplogroup])
ax.legend(bbox_to_anchor=(1.2,1), loc='best')
#plt.savefig(f"{OUTDIR}/haplo_major_pc_no_prune_biplot2.png")
plt.show()

### Optionally drop outlier haplogroups to improve the plots

In [None]:
# Major haplogroups to leave out of the "removed" variants of the plots.
haplo_drop = list("AB")

In [None]:
# Which samples are outliers?  For each tool (Y-LineageTracker, snappy,
# yhaplo — in that order) print the fid of every sample whose major
# haplogroup is listed in haplo_drop.
for major_col in ('ltrack_haplo_major', 'snappy_haplo_major', 'yhaplo_haplo_major'):
    in_dropped_group = meta_merge[major_col].isin(haplo_drop)
    print(meta_merge.loc[in_dropped_group, 'fid'])

In [None]:
# Individual samples (matched on fid) that may be excluded from some plots.
samples_to_remove = [
    'PD-PDRK310ZWB',
]

In [None]:
# List the distinct major haplogroups each tool assigned, in sorted order
# (Y-LineageTracker, then yhaplo, then snappy).
for major_col in ('ltrack_haplo_major', 'yhaplo_haplo_major', 'snappy_haplo_major'):
    distinct_majors = set(meta_merge[major_col])
    print(sorted(distinct_majors))

In [None]:
# Colour per major haplogroup for the seaborn figures.
# 'O' is included because every per-tool colour map in this notebook lists
# it; seaborn rejects a dict palette that lacks a hue level present in the
# data, while extra keys are harmless.
palette = {'A':'black','B':'grey','C':'darkred', 'D':'brown','E':'red', 'G':'orange','H':'yellow','I':'lightgreen','J':'green','K':'turquoise','L':'blue','N':'lightgrey','O':'magenta','P':'purple','Q':'violet','R':'pink','T':'cyan'}
#order to draw colors if we want certain ones to display on top of others
# Every palette key must appear here: the figure code sorts rows by
# hue_order.index(label), which raises ValueError for a label that is not in
# the list ('K' and 'O' were previously absent).  The relative order of the
# pre-existing entries — including Q before P — is unchanged.
hue_order = ['A', 'B', 'C', 'D', 'E', 'G', 'H', 'I', 'J', 'K', 'L', 'N', 'O', 'Q', 'P', 'R', 'T']

In [None]:
# Six-panel PC1-vs-PC2 figure: one row per haplogroup caller
# (Y-LineageTracker, yhaplo, snappy).  The left column shows every sample,
# the right column hides the major haplogroups listed in haplo_drop.
#
# Fixes relative to the previous version of this cell:
#  * DataFrame.append was removed in pandas 2.0 -> pd.concat
#  * sns.set() now runs before any axes are created, so the first panel gets
#    the same theme as the other five
#  * the figure is saved before plt.show(), with bbox_inches="tight" so the
#    shared legend (anchored outside the axes grid) is kept in the PNG
#  * the hue_order sort uses a lookup table instead of
#    np.vectorize(hue_order.index), which raised on labels missing from
#    hue_order and on empty input
#  * the six copy-pasted plotting stanzas share one helper
sns.set()

fig = plt.figure(figsize=(12, 12), dpi=80)
fig.subplots_adjust(hspace=0.3, wspace=0.2)


def _draw_pc_panel(ax, data, hue_col, title):
    """Scatter pc1 vs pc2 of `data` on `ax`, coloured by `hue_col` via `palette`.

    The per-axes legend is removed because one shared legend is attached to
    the first panel afterwards.  Returns the Axes seaborn drew on.
    """
    drawn = sns.scatterplot(data=data, x="pc1", y="pc2", hue=hue_col, palette=palette, ax=ax)
    panel_legend = drawn.get_legend()
    if panel_legend is not None:  # get_legend() returns None when no legend exists
        panel_legend.remove()
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.set_title(title)
    return drawn


# --- Row 1: Y-LineageTracker -------------------------------------------------
ax1 = fig.add_subplot(3, 2, 1)
temp = meta_merge.sort_values(by=['ltrack_haplo_major'])
sns_plot = _draw_pc_panel(ax1, temp, "ltrack_haplo_major",
                          "All Y-LineageTracker Major Haplogroups")

ax2 = fig.add_subplot(3, 2, 2)
temp = meta_merge.sort_values(by=['ltrack_haplo_major'])
temp = temp[~temp.ltrack_haplo_major.isin(haplo_drop)]
# NOTE(review): this panel also hides samples_to_remove, which the title does
# not mention and the yhaplo/snappy panels do not do — confirm this is intended.
temp = temp[~temp.fid.isin(samples_to_remove)]
sns_plot = _draw_pc_panel(ax2, temp, "ltrack_haplo_major",
                          "Y-LineageTracker Major Haplogroups With A and B Removed")

# --- Row 2: yhaplo -----------------------------------------------------------
ax3 = fig.add_subplot(3, 2, 3)
temp = meta_merge.sort_values(by=['yhaplo_haplo_major'])
sns_plot = _draw_pc_panel(ax3, temp, "yhaplo_haplo_major",
                          "All Yhaplo Major Haplogroups")

ax4 = fig.add_subplot(3, 2, 4)
temp = meta_merge.sort_values(by=['yhaplo_haplo_major'])
temp = temp[~temp.yhaplo_haplo_major.isin(haplo_drop)]
sns_plot = _draw_pc_panel(ax4, temp, "yhaplo_haplo_major",
                          "Yhaplo Major Haplogroups With A and B Removed")

# --- Row 3: snappy -----------------------------------------------------------
ax5 = fig.add_subplot(3, 2, 5)
temp = meta_merge.sort_values(by=['snappy_haplo_major'])
sns_plot = _draw_pc_panel(ax5, temp, "snappy_haplo_major",
                          "All Snappy Major Haplogroups")

ax6 = fig.add_subplot(3, 2, 6)
temp = meta_merge.sort_values(by=['snappy_haplo_major'])
temp = temp[~temp.snappy_haplo_major.isin(haplo_drop)]
# Re-sort rows by position in hue_order so the drawing order follows that
# list; labels absent from hue_order map to NaN and sort last.
hue_rank = {label: position for position, label in enumerate(hue_order)}
temp = temp.sort_values('snappy_haplo_major', key=lambda labels: labels.map(hue_rank))
sns_plot = _draw_pc_panel(ax6, temp, "snappy_haplo_major",
                          "Snappy Major Haplogroups With A and B Removed")

# --- Shared legend -----------------------------------------------------------
# Collect legend entries from the three "all samples" panels, keep the first
# handle seen for each label, and order the entries by label.
h1,l1 = ax1.get_legend_handles_labels()
df1 = pd.DataFrame({'handles':h1,'labels':l1})
print(df1.head())
h3,l3 = ax3.get_legend_handles_labels()
df3 = pd.DataFrame({'handles':h3,'labels':l3})
print(df3.head())
h5,l5 = ax5.get_legend_handles_labels()
df5 = pd.DataFrame({'handles':h5,'labels':l5})
print(df5.head())

df_legend = pd.concat([df1, df3, df5])
print(df_legend.shape)
df_legend = df_legend.drop_duplicates(subset = ['labels'],keep = 'first').sort_values(['labels'])
print(df_legend.shape)

ax1.legend(list(df_legend['handles']),list(df_legend['labels']),title="",bbox_to_anchor=(2.4, 1),borderaxespad=0)

fig.suptitle("AMP-PD Major Haplogroup Principal Components")
# Save first: after a blocking show() the figure is closed by pyplot.
fig.savefig(f"{WRKDIR}/major_haplogroup_all_tools_pc_plot.png", bbox_inches="tight")
plt.show()

In [None]:
# Two-panel Y-LineageTracker figure: every sample on the left; on the right,
# the haplo_drop haplogroups and the samples_to_remove samples are hidden.
#
# Fixes relative to the previous version of this cell:
#  * sns.set() now runs before any axes are created, so both panels are
#    guaranteed to share the seaborn theme
#  * the figure is saved before plt.show(), with bbox_inches="tight" so the
#    legend (anchored to the right of both panels) is kept in the PNG
#  * removing the right-hand legend no longer fails when no legend was drawn
sns.set()

fig = plt.figure(figsize=(14, 8), dpi=80)
fig.subplots_adjust(hspace=0.5, wspace=0.2)

# Left panel: all samples.  Its legend is kept and re-anchored outside the
# axes so it serves both panels.
ax1 = fig.add_subplot(1, 2, 1)
temp = meta_merge.sort_values(by=['ltrack_haplo_major'])
sns_plot = sns.scatterplot(data = temp, x = "pc1", y = "pc2", hue = "ltrack_haplo_major",  palette = palette, ax = ax1)
ax1.legend(bbox_to_anchor=(2.4, 1),borderaxespad=0)
ax1.set_xlabel("PC1")
ax1.set_ylabel("PC2")
ax1.set_title("All Y-LineageTracker Major Haplogroups")

# Right panel: same data minus the dropped haplogroups and outlier samples.
ax2 = fig.add_subplot(1, 2, 2)
temp = meta_merge.sort_values(by=['ltrack_haplo_major'])
temp = temp[~temp.ltrack_haplo_major.isin(haplo_drop)]
temp = temp[~temp.fid.isin(samples_to_remove)]
sns_plot = sns.scatterplot(data = temp, x = "pc1", y = "pc2", hue = "ltrack_haplo_major",  palette = palette, ax = ax2)
right_legend = sns_plot.get_legend()
if right_legend is not None:  # get_legend() returns None when no legend exists
    right_legend.remove()
ax2.set_xlabel("PC1")
ax2.set_ylabel("PC2")
ax2.set_title("Major Haplogroups With A, B and Outliers Removed")

fig.suptitle("AMP-PD Major Haplogroup Principal Components")
# Save first: after a blocking show() the figure is closed by pyplot.
fig.savefig(f"{WRKDIR}/major_haplogroup_pc_plot.png", bbox_inches="tight")
plt.show()