# Check Haplogroup Calling Tool Variants in Our Chromosome Y Datasets
- **Author(s)** - Frank Grenn
- **Quick Description:** Check whether most of the variants used by each haplogroup calling tool are actually available in our datasets. This may give some insight into how reliable/accurate the tool results are in some cases.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Base directory used to build every input path read in this notebook and the
# output path of the final figure. Replace the placeholder with the real location.
WRKDIR = "/PATH/chrY"

In [None]:
# Empty placeholder frame.
# NOTE(review): `counts_df` is never read in the visible notebook; the summary
# table that is actually filled in later is named `count_df` -- confirm whether
# this is a leftover or a typo.
counts_df = pd.DataFrame()

## 1) Snappy Tool

In [None]:
# Snappy reference table: position plus two alleles per row, whitespace separated.
# The first line of the file is skipped and the columns are named here
# (second column -> ref, third column -> alt).
snappy_vars = pd.read_csv(
    f"{WRKDIR}/snappy/ref_files/pos_to_allele.txt",
    sep=r"\s+",  # raw string: "\s" in a plain literal is an invalid escape sequence
    names=["pos", "ref", "alt"],
    skiprows=1,
    dtype={"pos": str},  # keep positions as text so the range split below always has strings
)

#some positions are a range (ex: 14036721-14036724) so move original positions to another col and split to get start position
snappy_vars['pos_original'] = snappy_vars['pos']
snappy_vars['pos'] = snappy_vars['pos_original'].str.split("-").str[0].astype('int64')
print(snappy_vars.shape)
print(snappy_vars.head())

In [None]:
# Same snappy reference file, read with the allele labels exchanged
# (second column -> alt, third column -> ref) so matches in the opposite
# allele orientation can be counted as well.
snappy_vars2 = pd.read_csv(
    f"{WRKDIR}/snappy/ref_files/pos_to_allele.txt",
    sep=r"\s+",  # raw string: "\s" in a plain literal is an invalid escape sequence
    names=["pos", "alt", "ref"],
    skiprows=1,
    dtype={"pos": str},  # keep positions as text so the range split below always has strings
)

#some positions are a range (ex: 14036721-14036724) so move original positions to another col and split to get start position
snappy_vars2['pos_original'] = snappy_vars2['pos']
snappy_vars2['pos'] = snappy_vars2['pos_original'].str.split("-").str[0].astype('int64')
print(snappy_vars2.shape)
print(snappy_vars2.head())

#### AMP PD

In [None]:
# AMP PD variant table (.bim). The file is read without a header row, so the
# six columns are named here.
# NOTE(review): 'idk' is presumably the genetic-distance column of the PLINK
# .bim layout -- confirm.
amppd_bim_path = f"{WRKDIR}/y_male_only_bfiles/chrY_male_hemizygous_only_het_filter_hg19_final.bim"
amppd = pd.read_table(amppd_bim_path, header=None)
amppd.columns = ['chr', 'snp', 'idk', 'pos', 'alt', 'ref']

# quick look at the size and the first rows
for amppd_preview in (amppd.shape, amppd.head()):
    print(amppd_preview)

In [None]:
# A variant counts as shared when position and both alleles agree.
variant_key = ['pos', 'ref', 'alt']

# snappy reference vs AMP PD, alleles as labelled in snappy_vars
merge_amp = snappy_vars.merge(amppd, on=variant_key, how='inner')
print(merge_amp.shape)

# same comparison using the allele-swapped snappy table; only its size is reported
swapped_merge_amp = snappy_vars2.merge(amppd, on=variant_key, how='inner')
print(swapped_merge_amp.shape)

#### UKBB

In [None]:
# UKBB variant table (.bim). The file is read without a header row, so the
# six columns are named here.
# NOTE(review): 'idk' is presumably the genetic-distance column of the PLINK
# .bim layout -- confirm.
ukbb_bim_path = f"{WRKDIR}/y_ukbb/chrY_male_only.bim"
ukbb = pd.read_table(ukbb_bim_path, header=None)
ukbb.columns = ['chr', 'snp', 'idk', 'pos', 'alt', 'ref']

# quick look at the size and the first rows
for ukbb_preview in (ukbb.shape, ukbb.head()):
    print(ukbb_preview)

In [None]:
# A variant counts as shared when position and both alleles agree.
variant_key = ['pos', 'ref', 'alt']

# snappy reference vs UKBB, alleles as labelled in snappy_vars
merge_ukbb = snappy_vars.merge(ukbb, on=variant_key, how='inner')
print(merge_ukbb.shape)

# same comparison using the allele-swapped snappy table; only its size is reported
swapped_merge_ukbb = snappy_vars2.merge(ukbb, on=variant_key, how='inner')
print(swapped_merge_ukbb.shape)

#### NABEC

In [None]:
# NABEC variant table (.bim). The file is read without a header row, so the
# six columns are named here.
# NOTE(review): 'idk' is presumably the genetic-distance column of the PLINK
# .bim layout -- confirm.
nabec_bim_path = f"{WRKDIR}/y_nabec_files/nabec_males_only_hg19_chrY.bim"
nabec = pd.read_table(nabec_bim_path, header=None)
nabec.columns = ['chr', 'snp', 'idk', 'pos', 'alt', 'ref']

# quick look at the size and the first rows
for nabec_preview in (nabec.shape, nabec.head()):
    print(nabec_preview)

In [None]:
# A variant counts as shared when position and both alleles agree.
variant_key = ['pos', 'ref', 'alt']

# snappy reference vs NABEC, alleles as labelled in snappy_vars
merge_nabec = snappy_vars.merge(nabec, on=variant_key, how='inner')
print(merge_nabec.shape)

# same comparison using the allele-swapped snappy table; only its size is reported
swapped_merge_nabec = snappy_vars2.merge(nabec, on=variant_key, how='inner')
print(swapped_merge_nabec.shape)

#### NEUROX

In [None]:
# NEUROX variant table (.bim). The file is read without a header row, so the
# six columns are named here.
# NOTE(review): 'idk' is presumably the genetic-distance column of the PLINK
# .bim layout -- confirm.
neurox_bim_path = f"{WRKDIR}/y_neurox/neurox_chrY_male_only.bim"
neurox = pd.read_table(neurox_bim_path, header=None)
neurox.columns = ['chr', 'snp', 'idk', 'pos', 'alt', 'ref']

# quick look at the size and the first rows
for neurox_preview in (neurox.shape, neurox.head()):
    print(neurox_preview)

In [None]:
# A variant counts as shared when position and both alleles agree.
variant_key = ['pos', 'ref', 'alt']

# snappy reference vs NEUROX, alleles as labelled in snappy_vars
merge_neurox = snappy_vars.merge(neurox, on=variant_key, how='inner')
print(merge_neurox.shape)

# same comparison using the allele-swapped snappy table; only its size is reported
swapped_merge_neurox = snappy_vars2.merge(neurox, on=variant_key, how='inner')
print(swapped_merge_neurox.shape)

In [None]:
# Start the summary table with the snappy row: size of the tool's reference
# and the number of reference rows matched in each cohort.
snappy_counts = {
    'tool': ['snappy'],
    'tool_var_count': [snappy_vars.shape[0]],
    'AMPPD_var_count': [merge_amp.shape[0]],
    'UKBB_var_count': [merge_ukbb.shape[0]],
    'NABEC_var_count': [merge_nabec.shape[0]],
    'NEUROX_var_count': [merge_neurox.shape[0]],
}
count_df = pd.DataFrame(snappy_counts)
count_df

## 2) Yhaplo Tool

In [None]:
# yhaplo reference (ISOGG table). Malformed rows are skipped with a warning.
isogg_path = f"{WRKDIR}/yhaplo/yhaplo/input/isogg.2016.01.04.txt"
try:
    # pandas >= 1.3 spelling of "skip bad rows but warn about them"
    yhaplo_vars = pd.read_table(isogg_path, on_bad_lines="warn")
except TypeError:
    # older pandas only accepts error_bad_lines (removed in pandas 2.0)
    yhaplo_vars = pd.read_table(isogg_path, error_bad_lines=False)

# "Mutation" is split on "->": first piece -> ref, second piece -> alt
mutation_parts = yhaplo_vars['Mutation'].str.split('->')
yhaplo_vars['ref'] = mutation_parts.str[0]
yhaplo_vars['alt'] = mutation_parts.str[1]

#format the positions
yhaplo_vars['pos'] = yhaplo_vars['Y-position (GRCh37)'].str.strip()
print(yhaplo_vars.shape)
# drop rows with an empty position; .copy() yields an independent frame so the
# column assignments below do not raise SettingWithCopyWarning
yhaplo_vars = yhaplo_vars[yhaplo_vars['pos'] != ""].copy()

# report how many positions contain each multi-value separator: ";", ".." and "-"
for separator_pattern in (";", r"\.\.", "-"):
    print(yhaplo_vars.loc[yhaplo_vars['pos'].str.contains(separator_pattern), 'pos'].shape)

# keep only the text before the first separator; taking the part before ";",
# then before "..", then before "-" is the same as cutting at the earliest of them
yhaplo_vars['pos'] = yhaplo_vars['pos'].str.split(r";|\.\.|-").str[0]
print(yhaplo_vars[yhaplo_vars['pos'] == ""])

yhaplo_vars['pos'] = yhaplo_vars['pos'].astype('int64')

print(yhaplo_vars.shape)
print(yhaplo_vars.head())
print(yhaplo_vars.tail())

In [None]:
# Same ISOGG table as yhaplo_vars but with the allele labels exchanged, so
# matches in the opposite allele orientation can be counted as well.
# Malformed rows are skipped with a warning.
isogg_path = f"{WRKDIR}/yhaplo/yhaplo/input/isogg.2016.01.04.txt"
try:
    # pandas >= 1.3 spelling of "skip bad rows but warn about them"
    yhaplo_vars2 = pd.read_table(isogg_path, on_bad_lines="warn")
except TypeError:
    # older pandas only accepts error_bad_lines (removed in pandas 2.0)
    yhaplo_vars2 = pd.read_table(isogg_path, error_bad_lines=False)

# "Mutation" is split on "->": first piece -> alt, second piece -> ref
mutation_parts2 = yhaplo_vars2['Mutation'].str.split('->')
yhaplo_vars2['alt'] = mutation_parts2.str[0]
yhaplo_vars2['ref'] = mutation_parts2.str[1]

#format the positions
yhaplo_vars2['pos'] = yhaplo_vars2['Y-position (GRCh37)'].str.strip()

# drop rows with an empty position; .copy() yields an independent frame so the
# column assignments below do not raise SettingWithCopyWarning
yhaplo_vars2 = yhaplo_vars2[yhaplo_vars2['pos'] != ""].copy()

# keep only the text before the first ";", ".." or "-" separator
yhaplo_vars2['pos'] = yhaplo_vars2['pos'].str.split(r";|\.\.|-").str[0]

yhaplo_vars2['pos'] = yhaplo_vars2['pos'].astype('int64')


In [None]:
# AMPPD
# A variant counts as shared when position and both alleles agree.
variant_key = ['pos', 'ref', 'alt']

# yhaplo reference vs AMP PD, alleles as labelled in yhaplo_vars
merge_amp = yhaplo_vars.merge(amppd, on=variant_key, how='inner')
print(merge_amp.shape)

# same comparison using the allele-swapped yhaplo table; only its size is reported
swapped_merge_amp = yhaplo_vars2.merge(amppd, on=variant_key, how='inner')
print(swapped_merge_amp.shape)

In [None]:
# A variant counts as shared when position and both alleles agree.
variant_key = ['pos', 'ref', 'alt']

# yhaplo reference vs UKBB, alleles as labelled in yhaplo_vars
merge_ukbb = yhaplo_vars.merge(ukbb, on=variant_key, how='inner')
print(merge_ukbb.shape)

# same comparison using the allele-swapped yhaplo table; only its size is reported
swapped_merge_ukbb = yhaplo_vars2.merge(ukbb, on=variant_key, how='inner')
print(swapped_merge_ukbb.shape)

In [None]:
# A variant counts as shared when position and both alleles agree.
variant_key = ['pos', 'ref', 'alt']

# yhaplo reference vs NABEC, alleles as labelled in yhaplo_vars
merge_nabec = yhaplo_vars.merge(nabec, on=variant_key, how='inner')
print(merge_nabec.shape)

# same comparison using the allele-swapped yhaplo table; only its size is reported
swapped_merge_nabec = yhaplo_vars2.merge(nabec, on=variant_key, how='inner')
print(swapped_merge_nabec.shape)

In [None]:
# A variant counts as shared when position and both alleles agree.
variant_key = ['pos', 'ref', 'alt']

# yhaplo reference vs NEUROX, alleles as labelled in yhaplo_vars
merge_neurox = yhaplo_vars.merge(neurox, on=variant_key, how='inner')
print(merge_neurox.shape)

# same comparison using the allele-swapped yhaplo table; only its size is reported
swapped_merge_neurox = yhaplo_vars2.merge(neurox, on=variant_key, how='inner')
print(swapped_merge_neurox.shape)

In [None]:
# Add the yhaplo row to the summary table: size of the tool's reference and
# the number of reference rows matched in each cohort.
yhaplo_counts = pd.DataFrame({
    'tool': ['yhaplo'],
    'tool_var_count': [len(yhaplo_vars.index)],
    'AMPPD_var_count': [len(merge_amp.index)],
    'UKBB_var_count': [len(merge_ukbb.index)],
    'NABEC_var_count': [len(merge_nabec.index)],
    'NEUROX_var_count': [len(merge_neurox.index)],
})
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
count_df = pd.concat([count_df, yhaplo_counts])
count_df

## 3) YLineageTracker Tool

In [None]:
# Y-LineageTracker marker table.
ltrack_info_path = f"{WRKDIR}/../Y-LineageTracker/LineageTracker/Data/HaplogroupInfo.csv"
ltrack_vars = pd.read_csv(ltrack_info_path)

# Position is taken from the Build37 column; "MutationInfo" is split on "->"
# with the first piece used as ref and the second as alt.
ltrack_mutation = ltrack_vars['MutationInfo'].str.split("->")
ltrack_vars['pos'] = ltrack_vars['Build37']
ltrack_vars['ref'] = ltrack_mutation.str.get(0)
ltrack_vars['alt'] = ltrack_mutation.str.get(1)

# column types, size and first rows
for ltrack_preview in (ltrack_vars.dtypes, ltrack_vars.shape, ltrack_vars.head()):
    print(ltrack_preview)

In [None]:
# Y-LineageTracker marker table again, this time with the allele labels
# exchanged so matches in the opposite orientation can be counted as well.
ltrack_info_path = f"{WRKDIR}/../Y-LineageTracker/LineageTracker/Data/HaplogroupInfo.csv"
ltrack_vars2 = pd.read_csv(ltrack_info_path)

# Position is taken from the Build37 column; "MutationInfo" is split on "->"
# with the first piece used as alt and the second as ref.
ltrack_mutation2 = ltrack_vars2['MutationInfo'].str.split("->")
ltrack_vars2['pos'] = ltrack_vars2['Build37']
ltrack_vars2['alt'] = ltrack_mutation2.str.get(0)
ltrack_vars2['ref'] = ltrack_mutation2.str.get(1)

# column types, size and first rows
for ltrack_preview2 in (ltrack_vars2.dtypes, ltrack_vars2.shape, ltrack_vars2.head()):
    print(ltrack_preview2)

In [None]:
# AMPPD
# A variant counts as shared when position and both alleles agree.
variant_key = ['pos', 'ref', 'alt']

# Y-LineageTracker reference vs AMP PD, alleles as labelled in ltrack_vars
merge_amp = ltrack_vars.merge(amppd, on=variant_key, how='inner')
print(merge_amp.shape)

# same comparison using the allele-swapped table; only its size is reported
swapped_merge_amp = ltrack_vars2.merge(amppd, on=variant_key, how='inner')
print(swapped_merge_amp.shape)

In [None]:
# A variant counts as shared when position and both alleles agree.
variant_key = ['pos', 'ref', 'alt']

# Y-LineageTracker reference vs UKBB, alleles as labelled in ltrack_vars
merge_ukbb = ltrack_vars.merge(ukbb, on=variant_key, how='inner')
print(merge_ukbb.shape)

# same comparison using the allele-swapped table; only its size is reported
swapped_merge_ukbb = ltrack_vars2.merge(ukbb, on=variant_key, how='inner')
print(swapped_merge_ukbb.shape)

In [None]:
# A variant counts as shared when position and both alleles agree.
variant_key = ['pos', 'ref', 'alt']

# Y-LineageTracker reference vs NABEC, alleles as labelled in ltrack_vars
merge_nabec = ltrack_vars.merge(nabec, on=variant_key, how='inner')
print(merge_nabec.shape)

# same comparison using the allele-swapped table; only its size is reported
swapped_merge_nabec = ltrack_vars2.merge(nabec, on=variant_key, how='inner')
print(swapped_merge_nabec.shape)

In [None]:
# A variant counts as shared when position and both alleles agree.
variant_key = ['pos', 'ref', 'alt']

# Y-LineageTracker reference vs NEUROX, alleles as labelled in ltrack_vars
merge_neurox = ltrack_vars.merge(neurox, on=variant_key, how='inner')
print(merge_neurox.shape)

# same comparison using the allele-swapped table; only its size is reported
swapped_merge_neurox = ltrack_vars2.merge(neurox, on=variant_key, how='inner')
print(swapped_merge_neurox.shape)

In [None]:
# Add the Y-LineageTracker row to the summary table: size of the tool's
# reference and the number of reference rows matched in each cohort.
ltrack_counts = pd.DataFrame({
    'tool': ['ltrack'],
    'tool_var_count': [len(ltrack_vars.index)],
    'AMPPD_var_count': [len(merge_amp.index)],
    'UKBB_var_count': [len(merge_ukbb.index)],
    'NABEC_var_count': [len(merge_nabec.index)],
    'NEUROX_var_count': [len(merge_neurox.index)],
})
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
count_df = pd.concat([count_df, ltrack_counts])
count_df

In [None]:
#add dataset variant counts
# This row carries the total number of variants in each cohort's .bim table;
# the tool columns hold the placeholder string 'NA'.
dataset_counts = pd.DataFrame({
    'tool': ['NA'],
    'tool_var_count': ['NA'],
    'AMPPD_var_count': [len(amppd.index)],
    'UKBB_var_count': [len(ukbb.index)],
    'NABEC_var_count': [len(nabec.index)],
    'NEUROX_var_count': [len(neurox.index)],
})
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
count_df = pd.concat([count_df, dataset_counts])
count_df

In [None]:
# Display the assembled summary table (one row per tool plus the dataset-totals row).
count_df

In [None]:
# Reshape the wide summary table into long format for plotting:
# one row per (tool, Cohort) pair with a single 'variant count' column.
# The tool's own reference size is stored under the pseudo-cohort 'tool'.
count_columns = [
    ('AMPPD_var_count', 'AMPPD'),
    ('UKBB_var_count', 'UKBB'),
    ('NABEC_var_count', 'NABEC'),
    ('NEUROX_var_count', 'NEUROX'),
    ('tool_var_count', 'tool'),
]

long_frames = []
for count_column, cohort_label in count_columns:
    # .copy() so renaming/adding columns works on an independent frame
    # (avoids SettingWithCopyWarning on a column subset of count_df)
    temp_df = count_df[['tool', count_column]].copy()
    temp_df.columns = ['tool', 'variant count']
    temp_df['Cohort'] = cohort_label
    print(temp_df)
    long_frames.append(temp_df)

# pd.concat replaces the chain of DataFrame.append calls (removed in pandas 2.0);
# ignore_index gives every row a unique label instead of the repeated ones
# inherited from count_df
plot_df = pd.concat(long_frames, ignore_index=True)

# the dataset-totals row was stored with tool == 'NA'; relabel it and turn its
# placeholder tool count into 0 so the column holds numbers only
plot_df.loc[plot_df['tool']=='NA','tool']='Cohort'
plot_df.loc[plot_df['variant count']=='NA','variant count']=0

# keep a full copy (including the per-cohort totals) before dropping those rows
plot_df_dataset_count = plot_df.copy()
plot_df = plot_df.loc[plot_df['tool']!='Cohort'].copy()

print(plot_df)

In [None]:
# Cast the counts to 32-bit integers. plot_df comes from a boolean filter, so
# assigning a column into it in place can raise SettingWithCopyWarning;
# DataFrame.astype returns a new frame with the same result instead.
plot_df = plot_df.astype({'variant count': 'int32'})

In [None]:
# Display the long-format table after the integer cast.
plot_df

In [None]:
# Per-cohort variant totals for the first panel: rows labelled 'Cohort' in the
# tool column, leaving out the 'tool' pseudo-cohort and NABEC.
is_cohort_total = plot_df_dataset_count['tool'].eq('Cohort')
is_reported_cohort = ~plot_df_dataset_count['Cohort'].isin(['tool', 'NABEC'])
dataset_count_df = plot_df_dataset_count[is_cohort_total & is_reported_cohort]
print(dataset_count_df)

In [None]:
# Size of each tool's own reference: the rows stored under the 'tool' pseudo-cohort.
is_tool_total = plot_df['Cohort'].eq('tool')
tool_total_df = plot_df[is_tool_total]
print(tool_total_df)

In [None]:
# Percentage of each tool's reference variants that were matched in each cohort.
percent_df = plot_df.copy()

# Take each tool's reference size from the table itself (the rows stored under
# the 'tool' pseudo-cohort) instead of hard-coding the totals, so the
# percentages stay correct when the input files change.
tool_totals = plot_df.loc[plot_df['Cohort']=='tool'].set_index('tool')['variant count']

percent_df['percent tool variants'] = (
    percent_df['variant count'] / percent_df['tool'].map(tool_totals) * 100
).fillna(0.0)  # rows without a known tool total stay at 0


print(percent_df)
# the tool totals themselves (always 100%) and NABEC are not plotted
percent_df = percent_df.loc[(percent_df['Cohort']!='tool') &(percent_df['Cohort']!='NABEC')]
print(percent_df)


In [None]:
# Matched-variant counts per tool and cohort, leaving out the 'tool'
# pseudo-cohort (the tool's own total) and NABEC.
excluded_cohorts = ['tool', 'NABEC']
no_total_count_df = plot_df.copy()
no_total_count_df = no_total_count_df[~no_total_count_df['Cohort'].isin(excluded_cohorts)]
print(no_total_count_df)

## 4) Plot

In [None]:
# 2x2 summary figure:
#   1) variants per cohort          2) variants per tool reference
#   3) matched variants per tool    4) matched variants as % of tool reference
# Every panel is drawn on an explicit Axes object rather than relying on
# pyplot's "current axes" state.

# apply the seaborn theme once, before anything is created
sns.set()

fig = plt.figure(figsize=(14, 8), dpi=80)
fig.subplots_adjust(hspace=0.5, wspace=0.2)

# --- panel 1: total chromosome Y variants in each cohort ---
ax1 = fig.add_subplot(2,2,1)
sns_plot = sns.barplot(x="Cohort", y="variant count", data=dataset_count_df, ax=ax1)
ax1.set_xlabel("Cohort")
ax1.set_ylabel("Variant Count")
ax1.set_title("Number of Chromosome Y Variants Per Cohort")

# write each bar's value inside the bar: anchored at half height, shifted up 20 points
for p in ax1.patches:
    ax1.annotate(f"{p.get_height():.0f}", (p.get_x() + p.get_width() / 2., p.get_height()/2),
                 ha='center', va='center', fontsize=11, color='black', xytext=(0, 20),
                 textcoords='offset points')

# --- panel 2: size of each tool's reference ---
ax2 = fig.add_subplot(2,2,2)
sns_plot = sns.barplot(x="tool", y="variant count", data=tool_total_df, ax=ax2)
ax2.set_xlabel("Tool")
ax2.set_ylabel("Variant Count")
ax2.set_title("Number of Chromosome Y Variants in Tool Reference")

# --- panel 3: cohort variants matched in each tool's reference ---
ax3 = fig.add_subplot(2,2,3)
sns_plot = sns.barplot(x="tool", y="variant count", hue="Cohort", data=no_total_count_df, ax=ax3)
ax3.set_xlabel("Tool")
ax3.set_ylabel("Variant Count")
ax3.set_title("Number of Cohort Variants Found in Tool Reference")

# --- panel 4: matched variants as a share of the tool's reference ---
# 'percent tool variants' is matched count / tool reference size, so the title
# says "tool reference variants found in cohort", not the reverse
ax4 = fig.add_subplot(2,2,4)
sns_plot = sns.barplot(x="tool", y="percent tool variants", hue="Cohort", data=percent_df, ax=ax4)
ax4.set_xlabel("Tool")
ax4.set_ylabel("Percent")
ax4.set_title("Percent of Tool Reference Variants Found in Cohort")

fig.suptitle("Y Chromosome Variant Counts in Cohorts and Haplogroup Calling Tools")

fig.savefig(f"{WRKDIR}/dataset_tool_variant_counts.png",bbox_inches='tight')