# Check PCA Clustering
- **Author(s)** - Frank Grenn
- **Date Started** - March 2021
- **Quick Description** - Calculate principal components for AMP-PD chrY data and plot them to visualize clustering.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Root locations used throughout the notebook.
# NOTE(review): "$PATH" looks like a placeholder for the real base directory;
# Python does not expand it, so substitute the actual path before running.
CARDDIR = "$PATH"
WRKDIR = "$PATH/chrY"

# Sub-directories of the chrY working directory.
BFILEDIR = WRKDIR + "/y_male_only_bfiles"
OUTDIR = WRKDIR + "/output_male_hemizygous_only_het_filter_run"

### Get European Samples

In [None]:
# Load the genetic-ancestry table and show its dimensions and first rows.
anc_csv = f"{CARDDIR}/PD/AMP-PD/Plink/2021_v2_5release/euro_king_pca_v2.5_July2021/genetic_ancestry_all_pca.csv"
anc = pd.read_csv(anc_csv)
for summary in (anc.shape, anc.head()):
    print(summary)

In [None]:
# Write the '#FID' and 'IID' columns of the rows whose inferred population is
# "EUROPE" to a tab-separated file with no header row and no index column.
is_european = anc["InfPop"] == "EUROPE"
eur_ids = anc.loc[is_european, ["#FID", "IID"]]
eur_ids.to_csv(f"{BFILEDIR}/eur_samples.txt", sep="\t", header=None, index=None)

### Get PCs with Plink


In [None]:
# Print (this cell does not execute it) the PLINK command that keeps only the
# samples listed in eur_samples.txt and writes a new binary fileset.
# The output prefix is given explicitly under BFILEDIR because the following
# cells read {BFILEDIR}/amppd_male_eur.bim; a bare "--out amppd_male_eur"
# would write to whatever directory the command happens to be run from.
print(f"plink --bfile {BFILEDIR}/chrY_male_hemizygous_only_het_filter_hg19_final --keep {BFILEDIR}/eur_samples.txt --make-bed --out {BFILEDIR}/amppd_male_eur")

In [None]:
# Read the tab-separated .bim file (no header row) and name its six columns,
# then show the table's dimensions and first rows.
bim_path = f"{BFILEDIR}/amppd_male_eur.bim"
temp = pd.read_csv(bim_path, header=None, sep="\t")
temp.columns = ['chr', 'snp', 'pos', 'bp', 'ref', 'alt']
for summary in (temp.shape, temp.head()):
    print(summary)

In [None]:
# Set the chromosome code of every variant to 22 and overwrite the .bim file
# in place (tab-separated, no header row, no index column).
# NOTE(review): presumably done so that `plink --pca` treats the chrY variants
# as autosomal - confirm.
temp = temp.assign(chr=22)
temp.to_csv(f"{BFILEDIR}/amppd_male_eur.bim", header=None, index=None, sep="\t")

In [None]:
# Print (this cell does not execute it) the PLINK command that computes the
# principal components. The output prefix is given explicitly under BFILEDIR
# because the next cell reads {BFILEDIR}/amppd_male_eur.eigenvec; a bare
# "--out amppd_male_eur" would write to the current working directory instead.
print(f"plink --bfile {BFILEDIR}/amppd_male_eur --pca --out {BFILEDIR}/amppd_male_eur")

In [None]:
# Read the .eigenvec file: whitespace-delimited, read without a header row.
# The separator is a raw string because "\s" in a plain string literal is an
# invalid escape sequence (a warning today, an error in a future Python).
pcs = pd.read_table(f"{BFILEDIR}/amppd_male_eur.eigenvec", sep=r"\s+", header=None)

# The first two columns are named fid/iid; every remaining column is a PC.
# Derive the number of PCs from the file rather than assuming exactly 20, so
# the cell also works when PLINK was asked for a different number of PCs.
n_pcs = pcs.shape[1] - 2
pcs.columns = ['fid', 'iid'] + [f'pc{i}' for i in range(1, n_pcs + 1)]
print(pcs.shape)
print(pcs.head())

In [None]:
# Only the yhaplo calls are loaded for now, because the first character of the
# haplogroup is the same for all samples between the yhaplo and snappy tools.
yhaplo = pd.read_csv(
    f"{OUTDIR}/output_yhaplo/haplogroups.chrY_male_hemizygous_only_het_filter_hg19_final.txt",
    sep=r"\s+",  # raw string: "\s" is an invalid escape in a plain literal
    header=None,
)
yhaplo.columns = ['id', 'haplo_short', 'haplo_short_rep_snp', 'haplo']

# Major haplogroup = first character of the full haplogroup string.
yhaplo['haplo_major'] = yhaplo['haplo'].str[0]

# Keep only the first half of each id string.
# NOTE(review): presumably the ids are a doubled FID/IID string and the first
# half matches the `fid` used in the PC table - confirm.
yhaplo['id'] = [sample_id[:len(sample_id) // 2] for sample_id in yhaplo['id']]

# NOTE(review): the disabled filter below refers to a 'haplo_long' column,
# which is not one of the columns defined in this cell.
#yhaplo = yhaplo[yhaplo['haplo_long']!='A']#assume samples with "A" haplogroup were not assigned one.
print(yhaplo.shape)
print(yhaplo.head())

In [None]:
# Join the first five PCs to each sample's major haplogroup, matching the PC
# table's `fid` against the yhaplo `id` (pandas' default inner join), then
# order the rows by major haplogroup.
pc_cols = ['fid', 'iid', 'pc1', 'pc2', 'pc3', 'pc4', 'pc5']
haplo_cols = ['id', 'haplo_major']
meta_merge = pcs[pc_cols].merge(yhaplo[haplo_cols], left_on=['fid'], right_on=['id'])

# Drop the redundant `id` key column and sort.
meta_merge = meta_merge[pc_cols + ['haplo_major']]
meta_merge = meta_merge.sort_values(by=['haplo_major'])
for summary in (meta_merge.shape, meta_merge.head()):
    print(summary)

In [None]:
# One group of rows per major haplogroup, used by the plotting cells below.
groups = meta_merge.groupby(by='haplo_major')

In [None]:
# PC1 vs PC2, drawn as one marker-only series per major haplogroup.
fig, ax = plt.subplots()
ax.margins(0.05)  # optional: 5% padding around the autoscaled limits
for haplo_label, haplo_rows in groups:
    ax.plot(
        haplo_rows['pc1'],
        haplo_rows['pc2'],
        linestyle='',
        marker='o',
        ms=5,
        label=haplo_label,
    )
ax.legend(bbox_to_anchor=(1.2, 1), loc='best')

#plt.savefig(f"{OUTDIR}/haplo_major_pc_no_prune_biplot.png")
plt.show()

In [None]:
# Same PC1 vs PC2 plot, with an explicit colour per major haplogroup.
fig, ax = plt.subplots()

colors = {'A':'black','B':'pink','C':'orange', 'E':'blue', 'G':'turquoise','H':'yellow','I':'purple','J':'cyan','L':'brown','N':'grey','O':'magenta','Q':'green','R':'red','T':'violet'}
# Colour for any major haplogroup that is not listed in `colors`; without a
# fallback, an unlisted haplogroup in the data raises KeyError mid-plot.
fallback_color = 'lightgrey'

for key, group in groups:
    group.plot(ax=ax, kind='scatter', x='pc1', y='pc2', label=key,
               color=colors.get(key, fallback_color))
ax.legend(loc='best', bbox_to_anchor=(1.2, 1))
#plt.savefig(f"{OUTDIR}/haplo_major_pc_no_prune_biplot2.png")
plt.show()

### Maybe drop some outlier haplogroups to improve the plot

In [None]:
# Major haplogroups to exclude from the reduced plots.
haplo_drop = list("AB")

In [None]:
# Copy of the merged table without the haplogroups listed in `haplo_drop`,
# regrouped by major haplogroup.
keep_rows = ~meta_merge['haplo_major'].isin(haplo_drop)
temp = meta_merge.loc[keep_rows].copy()
groups = temp.groupby(by='haplo_major')


In [None]:
# Display the distinct major haplogroups present in the full merged table.
set(meta_merge['haplo_major'])

In [None]:
# Seaborn scatter of the reduced table, coloured by major haplogroup.
sns.set_style("darkgrid")
# Pass the palette by name so seaborn sizes it to the number of haplogroups
# actually present, instead of hard-coding 11 colours (which only matches the
# data when exactly 11 haplogroups remain after filtering).
g = sns.scatterplot(data=temp, x="pc1", y="pc2", hue="haplo_major", palette="Paired")
# Anchor the legend outside the right edge of the axes.
g.legend(bbox_to_anchor=(1.2, 1), borderaxespad=0)

In [None]:
# Two-panel figure: all major haplogroups (left) and the reduced table `temp`
# (right), saved to {WRKDIR}/major_haplogroup_pc_plot.png.
palette = ['black','grey','darkred','red','orange','yellow','lightgreen','green','blue','lightgrey','violet','pink','cyan']#,'teal']

# Apply the seaborn theme once, before the figure and axes are created, so
# both panels pick up the same settings.
sns.set()

# Give every major haplogroup a fixed colour so that the two panels agree no
# matter which haplogroups were dropped from `temp`. Colours are assigned in
# sorted haplogroup order and wrap around if there are more haplogroups than
# palette entries (instead of relying on positional slicing of the list).
levels = sorted(meta_merge["haplo_major"].dropna().unique())
haplo_colors = {level: palette[i % len(palette)] for i, level in enumerate(levels)}

fig = plt.figure(figsize=(14, 8), dpi=80)
fig.subplots_adjust(hspace=0.5, wspace=0.2)

# Left panel: every major haplogroup.
ax1 = fig.add_subplot(1, 2, 1)
sns_plot = sns.scatterplot(data=meta_merge, x="pc1", y="pc2", hue="haplo_major", palette=haplo_colors, ax=ax1)
# Single shared legend, anchored in ax1's coordinates far to the right so it
# lands beyond the second panel.
ax1.legend(bbox_to_anchor=(2.4, 1), borderaxespad=0)
ax1.set_xlabel("PC1")
ax1.set_ylabel("PC2")
ax1.set_title("All Major Haplogroups")

# Right panel: reduced table, same colour per haplogroup, no second legend.
ax2 = fig.add_subplot(1, 2, 2)
sns_plot = sns.scatterplot(data=temp, x="pc1", y="pc2", hue="haplo_major", palette=haplo_colors, ax=ax2)
sns_plot.get_legend().remove()
ax2.set_xlabel("PC1")
ax2.set_ylabel("PC2")
ax2.set_title("Major Haplogroups A and B Removed")

fig.suptitle("AMP-PD Major Haplogroup Principal Components")
# Save before showing, and use a tight bounding box so the legend placed
# outside the axes is included in the image.
fig.savefig(f"{WRKDIR}/major_haplogroup_pc_plot.png", bbox_inches="tight")
plt.show()