# Get Reference Data for Y Chromosome Haplogroup Calling Tools
- **Author(s)** - Frank Grenn
- **Date Started** - April 2022
- **Quick Description:** compare the reference data available for snappy, yhaplo and y-lineagetracker and attempt to generate reference files with identical data for each tool

In [None]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Base working directory; every other path in this notebook is built from it.
# NOTE(review): "$PATH" looks like a redacted placeholder - Python will not expand it, so set the real base path here.
WRKDIR = "$PATH/chrY"


## 1. Get Reference Data for Each Tool

#### snappy  
https://github.com/chrisgene/snappy

In [None]:
# snappy directory under WRKDIR; its reference files are read from the ref_files/ subfolder below.
SNAPPY_DIR = f"{WRKDIR}/snappy"

In [None]:
# snappy's id_to_pos.txt lookup table (tab separated, first row is the header)
snappy_id_to_pos = pd.read_csv(f"{SNAPPY_DIR}/ref_files/id_to_pos.txt", sep="\t")
print(snappy_id_to_pos.shape, snappy_id_to_pos.head(), sep="\n")

In [None]:
# snappy's pos_to_allele.txt: the first line is skipped and the columns are named here
snappy_pos_to_allele = pd.read_csv(
    f"{SNAPPY_DIR}/ref_files/pos_to_allele.txt",
    sep="\t",
    header=None,
    skiprows=1,
)
snappy_pos_to_allele.columns = ['pos', 'anc', 'der']
print(snappy_pos_to_allele.shape, snappy_pos_to_allele.head(), sep="\n")

In [None]:
# snappy's tree_structure.txt: headerless two-column table of parent / child haplogroups
snappy_tree = pd.read_csv(
    f"{SNAPPY_DIR}/ref_files/tree_structure.txt",
    sep="\t",
    header=None,
)
snappy_tree.columns = ['parent', 'child']
print(snappy_tree.shape, snappy_tree.head(), sep="\n")

In [None]:
# snappy's y_hg_and_snps.sort: the first line is skipped; columns are haplogroup and its SNPs
snappy_y_hg_and_snps = pd.read_csv(
    f"{SNAPPY_DIR}/ref_files/y_hg_and_snps.sort",
    sep="\t",
    header=None,
    skiprows=1,
)
snappy_y_hg_and_snps.columns = ['haplogroup', 'SNPs']
print(snappy_y_hg_and_snps.shape, snappy_y_hg_and_snps.head(), sep="\n")

#### yhaplo  
https://github.com/23andMe/yhaplo

In [None]:
# yhaplo's bundled input directory; the yhaplo reference files below are read from here.
YHAPLO_DIR = f"{WRKDIR}/yhaplo/yhaplo/input"

In [None]:
# yhaplo's preferred.snpNames.txt (headerless, tab separated)
yhaplo_pref = pd.read_csv(f"{YHAPLO_DIR}/preferred.snpNames.txt", sep="\t", header=None)
print(yhaplo_pref.shape, yhaplo_pref.head(), sep="\n")

In [None]:
# Representative SNPs per haplogroup from the ISOGG 2015 tree (headerless, whitespace separated).
# The separator is a regular expression, so it is written as a raw string to avoid the
# invalid "\s" escape in a normal string literal.
yhaplo_rep_snp = pd.read_table(f"{YHAPLO_DIR}/representative.SNPs.isogg.2015tree.txt", header=None, sep=r"\s+")
yhaplo_rep_snp.columns = ['haplo', 'snps']
print(yhaplo_rep_snp.shape)
# number of distinct haplogroups in the file
print(len(set(yhaplo_rep_snp.haplo)))
print(yhaplo_rep_snp.head())

In [None]:
# Additional representative SNPs shipped with yhaplo (headerless, whitespace separated).
# Raw string: the separator is a regex and "\s" is not a valid escape in a normal string.
yhaplo_rep_snp_additional = pd.read_table(f"{YHAPLO_DIR}/representative.SNPs.additional.txt", header=None, sep=r"\s+")
yhaplo_rep_snp_additional.columns = ['haplo', 'snps']
print(yhaplo_rep_snp_additional.shape)
print(yhaplo_rep_snp_additional.head())

In [None]:
# The input file holds the first two columns of yhaplo's ISOGG table and was created beforehand with:
#   cut -f 1,2 {WRKDIR}/yhaplo/yhaplo/input/isogg.2016.01.04.txt > temp.isogg.2016.rep.snps.txt
# The file's own first row is used as the header, so column names are whatever that row contains.
yhaplo_snps = pd.read_table(f"{YHAPLO_DIR}/temp.isogg.2016.rep.snps.txt")
print(yhaplo_snps.shape)
print(yhaplo_snps.head())

In [None]:
# Show the exact column names; the cells below index them with a trailing space ('SNP ', 'Haplogroup ').
yhaplo_snps.columns

In [None]:
# Number of distinct values in the 'SNP ' column (the column name includes a trailing space).
len(set(yhaplo_snps['SNP ']))

In [None]:
# Number of distinct values in the 'Haplogroup ' column (the column name includes a trailing space).
len(set(yhaplo_snps['Haplogroup ']))

#### y-lineagetracker  
https://codeocean.com/capsule/7424381/tree/v2

In [None]:
# Y-LineageTracker's HaplogroupInfo.csv, located next to (not inside) WRKDIR
ltrack_haplos = pd.read_csv(
    f"{WRKDIR}/../Y-LineageTracker/LineageTracker/Data/HaplogroupInfo.csv"
)
print(ltrack_haplos.shape, ltrack_haplos.head(), sep="\n")

In [None]:
# Shape of the subset of rows whose KeyInfo column equals "Key".
ltrack_haplos[ltrack_haplos.KeyInfo=="Key"].shape

In [None]:
# Number of distinct haplogroups among the rows whose KeyInfo column equals "Key".
len(set(ltrack_haplos[ltrack_haplos.KeyInfo=="Key"].Haplogroup))

## 2. Get Newest Haplogroup Data from ISOGG  
https://isogg.org/  
https://isogg.org/tree/ISOGG_YDNA_SNP_Index.html


In [None]:
# SNP index table saved as SNP_Index_Human.csv in WRKDIR
isogg_snps = pd.read_csv(
    f"{WRKDIR}/SNP_Index_Human.csv"
)
print(isogg_snps.shape, isogg_snps.head(), sep="\n")

## 3. Reformat the Reference Data for the Different Tools
Y-LineageTracker appears to have the most up-to-date data, including the representative (key) variants and the haplogroup tree, so we use it to rebuild the reference files used by snappy and yhaplo.

### Check the Y-LineageTracker Data

In [None]:
# Re-read HaplogroupInfo.csv so this section starts from an unmodified table
ltrack_haplos = pd.read_csv(
    f"{WRKDIR}/../Y-LineageTracker/LineageTracker/Data/HaplogroupInfo.csv"
)
print(ltrack_haplos.shape, ltrack_haplos.head(), sep="\n")

### Format for Snappy Tool
needs:  
`id_to_pos.txt` with snp id and hg19 positions  
`pos_to_allele.txt` with hg19 position, ref and alt alleles  
`tree_structure.txt` with parent and child haplogroup pairs describing the tree  
`y_hg_and_snps.sort` with haplogroup and snps


#### id_to_pos.txt

In [None]:
# Start from the variant name and hg19 (Build37) position columns.
# .copy() makes this an independent frame: later cells assign into it, which on a
# plain column slice of ltrack_haplos raises SettingWithCopyWarning.
new_id_to_pos = ltrack_haplos[['Mutation', 'Build37']].copy()
new_id_to_pos.columns = ['id', 'pos']


In [None]:
# Some variant names contain stray newline characters; remove them before the names are split.
new_id_to_pos['id']=new_id_to_pos['id'].str.replace("\n","")

In [None]:
# A variant with several names is stored as "name1/name2/...". Split on '/' and
# give every name its own row; explode() repeats the original row label for each
# name, so the result can be joined straight back onto new_id_to_pos by index.
temp = new_id_to_pos['id'].str.split('/').explode().rename('id')


In [None]:
# Swap the combined 'id' column for the one-name-per-row version (joined on the row index)
new_id_to_pos = new_id_to_pos.drop(columns='id').join(temp)
new_id_to_pos.head()

In [None]:
# Write id / pos as a tab-separated file with a header row and no index column
new_id_to_pos[['id', 'pos']].to_csv(
    f"{SNAPPY_DIR}/ref_files_new/id_to_pos.txt",
    sep="\t",
    index=False,
)

#### pos_to_allele.txt

In [None]:
# Position plus the "ref->alt" mutation string; .copy() so that adding columns below
# does not write into a slice of ltrack_haplos (SettingWithCopyWarning).
new_pos_to_allele = ltrack_haplos[['Build37', 'MutationInfo']].copy()
# Split "ref->alt" at the first "->" only. `n` is passed by keyword because
# newer pandas versions no longer accept it positionally.
new_pos_to_allele[['ref', 'alt']] = new_pos_to_allele['MutationInfo'].str.split("->", n=1, expand=True)
new_pos_to_allele

In [None]:
# Write position / ref / alt as tab-separated rows without a header or index column.
new_pos_to_allele[['Build37','ref','alt']].to_csv(f"{SNAPPY_DIR}/ref_files_new/pos_to_allele.txt",index=None,header=None, sep = "\t")
# Prepend a header line in place with sed.
# NOTE(review): the header written is "id<TAB>pos" although the file holds position, ref and alt -
# confirm snappy only skips this first line and does not use the names.
!sed -i '1s/^/id\tpos\n/' {SNAPPY_DIR}/ref_files_new/pos_to_allele.txt

#### tree_structure.txt
(converted manually)

#### y_hg_and_snps.sort

In [None]:
# Haplogroup and its (possibly '/'-joined) variant names
new_hg = ltrack_haplos.loc[:, ['Haplogroup', 'Mutation']]
new_hg

In [None]:
# Variant names are stored as "name1/name2/..."; split on '/' and give each name its
# own row. explode() repeats the original row label, so the result joins back by index.
temp = new_hg['Mutation'].str.split('/').explode().rename('Mutation')


In [None]:
# Swap the combined 'Mutation' column for the one-name-per-row version (joined on the row index)
new_hg = new_hg.drop(columns='Mutation').join(temp)
new_hg.head()

In [None]:
# Preview the one-name-per-row table before the names are grouped per haplogroup.
new_hg.head()

In [None]:
# Haplogroup labels can carry the same stray "\n" characters as the variant names
# (they are stripped for the yhaplo representative-SNP table as well). Remove them
# before grouping so one haplogroup is not split into two groups and no label with
# an embedded newline reaches the output file.
new_hg['Haplogroup'] = new_hg['Haplogroup'].str.replace("\n", "")
# Collect all variant names of a haplogroup into one comma-separated string
new_hg['Mutations'] = new_hg.groupby(['Haplogroup'])['Mutation'].transform(lambda x: ','.join(x))
# Every row of a haplogroup now carries the same string; keep one row per haplogroup
new_hg = new_hg[['Haplogroup', 'Mutations']].drop_duplicates()
new_hg.columns = ['#haplogroup', 'SNPs']
new_hg.head()

In [None]:
# Remove stray newline characters from the comma-separated SNP lists before writing the file.
new_hg['SNPs']=new_hg['SNPs'].str.replace("\n","")

In [None]:
# Write haplogroup / SNP list as a tab-separated file with a header row and no index column
new_hg.to_csv(
    f"{SNAPPY_DIR}/ref_files_new/y_hg_and_snps.sort",
    sep="\t",
    index=False,
)

### Format for Yhaplo Tool
needs:  
`y.tree.primary.[date].nwk` file with haplogroup tree structure, similar to Y-LineageTracker format  
`isogg.[date].txt` with variant name, haplogroup, other variant names, rsid, hg19 position, mutation  
`isogg.multiallelic.txt` list of variant positions for variants with multiple alleles  
`representative.SNPs.isogg.[date]tree.txt` list of haplogroups and their representative SNPs (bold on isogg tables, "key" variants in Y-LineageTracker data)  
`representative.SNPs.additional.txt` additional list of haplogroups and their representative SNPS. Can probably leave this empty  
`preferred.snpNames.txt` list of preferred variant names for variants with multiple names. Can probably leave this empty  

#### y.tree.primary.[date].nwk
can copy this from y-lineagetracker

#### isogg.[date].txt

In [None]:
# Show the first lines of yhaplo's bundled ISOGG table as a template for the new file.
!head {YHAPLO_DIR}/isogg.2016.01.04.txt

In [None]:
# Preview the Y-LineageTracker table whose columns are reshaped below.
ltrack_haplos.head()

In [None]:

# Columns needed for the yhaplo ISOGG table. .copy() makes this an independent frame:
# later cells assign into it, which on a plain column slice raises SettingWithCopyWarning.
new_isogg = ltrack_haplos[['Mutation', 'Haplogroup', 'rs', 'MutationInfo', 'Build37']].copy()


In [None]:
#get rid of random \n endline characters in the data
new_isogg['Mutation'] = new_isogg['Mutation'].str.replace("\n", "")
# Haplogroup is cleaned the same way (as is done for the representative-SNP table):
# it is used as a grouping key below and is written to the output file, so a label
# with an embedded newline would split groups and break a row.
new_isogg['Haplogroup'] = new_isogg['Haplogroup'].str.replace("\n", "")

In [None]:
# Variant names are stored as "name1/name2/..."; split on '/' and give each name its
# own row. explode() repeats the original row label, so the result joins back by index.
temp = new_isogg['Mutation'].str.split('/').explode().rename('Mutation')

In [None]:
# Swap the combined 'Mutation' column for the one-name-per-row version (joined on the row index)
new_isogg = new_isogg.drop(columns='Mutation').join(temp)
new_isogg.head()

In [None]:
# After the join several rows share one index label; replace the index with a fresh 0..n-1 range.
new_isogg = new_isogg.reset_index(drop = True)
new_isogg.head()

In [None]:
# A variant with several names gets one row per name; the remaining names of the same
# variant (same haplogroup, mutation and position) go into an extra column.
# 'rs' is not part of the grouping key.

# Step 1: every row receives all names of its variant, joined with "; "
new_isogg['Other_Names'] = new_isogg.groupby(['Haplogroup', 'MutationInfo', 'Build37'])['Mutation'].transform(lambda x: '; '.join(x))

print(new_isogg.head())

# Step 2: drop the row's own name from that list. Names are compared as whole
# entries rather than with substring replacement, so a name that ends with another
# name (e.g. "AM1" vs "M1") is not damaged. A variant with a single name ends up with
# an empty list, and rows that were not grouped (missing key values) get an empty
# string instead of the text "nan".
new_isogg['Other_Names'] = [
    '; '.join(name for name in all_names.split('; ') if name != own_name)
    if isinstance(all_names, str) else ''
    for all_names, own_name in zip(new_isogg['Other_Names'], new_isogg['Mutation'])
]
print(new_isogg.head())

In [None]:
# Spot check: rows at Build37 position 2662361, a variant with three different names.
new_isogg[new_isogg.Build37==2662361]

In [None]:
# Put the columns in the order of yhaplo's ISOGG table and give them its header names
new_isogg = new_isogg[
    ['Mutation', 'Haplogroup', 'Other_Names', 'rs', 'Build37', 'MutationInfo']
]
new_isogg.columns = [
    'SNP',
    'Haplogroup',
    'Other Names',
    'RefSNP ID',
    'Y-position (GRCh37)',
    'Mutation',
]
print(new_isogg.head())
# Tab-separated output with a header row and no index column
new_isogg.to_csv(
    f"{YHAPLO_DIR}/../input_new/isogg.updated.txt",
    sep="\t",
    index=False,
)

#### isogg.multiallelic.txt

In [None]:
# Preview rows whose position occurs more than once (keep=False marks every occurrence).
new_isogg[new_isogg.duplicated(['Y-position (GRCh37)'],keep=False)].head()

In [None]:
# Boolean flag per position: True where more than one distinct mutation string is listed
multiallelic = new_isogg.groupby(['Y-position (GRCh37)'])['Mutation'].nunique() > 1
# Keep every row that sits on one of the flagged positions
multiallelic_variants = new_isogg[
    new_isogg['Y-position (GRCh37)'].isin(multiallelic.index[multiallelic])
]
print(multiallelic_variants.shape, multiallelic_variants.head(), sep="\n")

In [None]:
# Spot check: all rows at position 14622354.
new_isogg[new_isogg['Y-position (GRCh37)']==14622354]

In [None]:
# One flagged position per line, without header or index column
multiallelic_variants[['Y-position (GRCh37)']].drop_duplicates().to_csv(
    f"{YHAPLO_DIR}/../input_new/isogg.multiallelic.txt",
    sep="\t",
    index=False,
    header=False,
)

#### representative.SNPs.isogg.[date]tree.txt and representative.SNPs.additional.txt

In [None]:
# Preview the Y-LineageTracker table again; Mutation, Haplogroup and KeyInfo are used below.
ltrack_haplos.head()

In [None]:
# Variant name, haplogroup and key flag. .copy() makes this an independent frame:
# the next cells assign into it, which on a plain column slice raises SettingWithCopyWarning.
rep_snps = ltrack_haplos[['Mutation', 'Haplogroup', 'KeyInfo']].copy()

In [None]:
# Remove stray newline characters from both the variant names and the haplogroup labels.
rep_snps['Mutation']=rep_snps['Mutation'].str.replace("\n","")
rep_snps['Haplogroup']=rep_snps['Haplogroup'].str.replace("\n","")
print(rep_snps.head())

In [None]:
# Variant names are stored as "name1/name2/..."; split on '/' and give each name its
# own row. explode() repeats the original row label, so the result joins back by index.
temp = rep_snps['Mutation'].str.split('/').explode().rename('Mutation')

# Swap the combined 'Mutation' column for the one-name-per-row version
rep_snps = rep_snps.drop(columns='Mutation').join(temp)
rep_snps.head()

In [None]:
# Rows without a KeyInfo value are not key variants: replace their variant name with the placeholder '.'.
rep_snps.loc[rep_snps['KeyInfo'].isna(),'Mutation']='.'
print(rep_snps.head())

In [None]:
# Rows that do have a KeyInfo value
key_snps = rep_snps.loc[rep_snps['KeyInfo'].notna()]
print(key_snps.shape, key_snps.head(), sep="\n")

In [None]:
# Haplogroups that have no key row at all; keep one de-duplicated row set for them
other_snps = rep_snps.loc[
    ~rep_snps['Haplogroup'].isin(key_snps['Haplogroup'])
].drop_duplicates()
print(other_snps.shape, other_snps.head(), sep="\n")

In [None]:
# Stack the key rows and the rows of haplogroups without a key variant, sorted by haplogroup.
# pd.concat is used because DataFrame.append is deprecated and removed in newer pandas versions.
rep_snps = pd.concat([key_snps, other_snps]).sort_values(['Haplogroup'])
print(rep_snps.shape)
print(rep_snps.head())

In [None]:
# Keep haplogroup / variant name only and write the distinct pairs,
# tab separated, without header or index column
rep_snps = rep_snps.loc[:, ['Haplogroup', 'Mutation']]
rep_snps.drop_duplicates().to_csv(
    f"{YHAPLO_DIR}/../input_new/representative.SNPs.isogg.txt",
    sep="\t",
    index=False,
    header=False,
)

In [None]:
# Create representative.SNPs.additional.txt in the new input directory without writing any content to it.
!touch {YHAPLO_DIR}/../input_new/representative.SNPs.additional.txt

#### preferred.snpNames.txt

In [None]:
# Create preferred.snpNames.txt in the new input directory without writing any content to it.
!touch {YHAPLO_DIR}/../input_new/preferred.snpNames.txt

## 4. Run Using New Reference Data
Only snappy and yhaplo need to be rerun, because the new reference files are built from the data Y-LineageTracker already uses.  
Rerun for all datasets (AMP-PD, UKBB, NABEC, NeuroX).

### Rerun with Snappy

In [None]:
# Print (do not run) an example snappy command for NeuroX using the new reference directory.
print(f"cd {SNAPPY_DIR}; module load python/2.7; module load plink; python SNAPPY_v0.2.2.py --infile {WRKDIR}/y_neurox/neurox_chrY_male_only --ref_files_dir ref_files_new --out {WRKDIR}/test_haplo_call/snappy_test/neurox")

In [None]:
# Append one snappy command per dataset (NeuroX, NABEC, UKBB, AMP-PD) to run_snappy.swarm.
# The commands cd into WRKDIR and use paths relative to it.
# NOTE: ">>" appends, so running this cell again adds the same four commands a second time.
!echo "cd {WRKDIR}; module load python/2.7; module load plink; python snappy/SNAPPY_v0.2.2.py --infile y_neurox/neurox_chrY_male_only --ref_files_dir snappy/ref_files_new --out test_haplo_call/snappy_test/neurox" >> {WRKDIR}/scripts/2_call_haplogroups/run_snappy.swarm
!echo "cd {WRKDIR}; module load python/2.7; module load plink; python snappy/SNAPPY_v0.2.2.py --infile y_nabec_files/nabec_males_only_hg19_chrY --ref_files_dir snappy/ref_files_new --out test_haplo_call/snappy_test/nabec" >> {WRKDIR}/scripts/2_call_haplogroups/run_snappy.swarm
!echo "cd {WRKDIR}; module load python/2.7; module load plink; python snappy/SNAPPY_v0.2.2.py --infile y_ukbb/chrY_male_only --ref_files_dir snappy/ref_files_new --out test_haplo_call/snappy_test/ukbb" >> {WRKDIR}/scripts/2_call_haplogroups/run_snappy.swarm
!echo "cd {WRKDIR}; module load python/2.7; module load plink; python snappy/SNAPPY_v0.2.2.py --infile y_male_only_bfiles/chrY_male_hemizygous_only_het_filter_hg19_final --ref_files_dir snappy/ref_files_new --out test_haplo_call/snappy_test/amppd" >> {WRKDIR}/scripts/2_call_haplogroups/run_snappy.swarm

In [None]:
#swarm -g 100 -f run_snappy.swarm

### Rerun with Yhaplo
need to modify the yhaplo/config.py file with new reference data file paths before running

In [None]:
!echo "module load python; mkdir {WRKDIR}/test_haplo_call/yhaplo_test/neurox; yhaplo -i {WRKDIR}/y_neurox/neurox_chrY_male_only.vcf -o {WRKDIR}/test_haplo_call/yhaplo_test/neurox" >> {WRKDIR}/scripts/2_call_haplogroups/run_yhaplo.swarm
!echo "module load python; mkdir {WRKDIR}/test_haplo_call/yhaplo_test/nabec; yhaplo -i {WRKDIR}/y_nabec_files/nabec_males_only_hg19_chrY.vcf -o {WRKDIR}/test_haplo_call/yhaplo_test/nabec" >> {WRKDIR}/scripts/2_call_haplogroups/run_yhaplo.swarm
!echo "module load python; mkdir {WRKDIR}/test_haplo_call/yhaplo_test/ukbb; yhaplo -i {WRKDIR}/y_ukbb/chrY_male_only.vcf -o {WRKDIR}/test_haplo_call/yhaplo_test/ukbb" >> {WRKDIR}/scripts/2_call_haplogroups/run_yhaplo.swarm
!echo "module load python; mkdir {WRKDIR}/test_haplo_call/yhaplo_test/amppd; yhaplo -i {WRKDIR}/y_male_only_vcf/chrY_male_hemizygous_only_het_filter_hg19_final.vcf -o {WRKDIR}/test_haplo_call/yhaplo_test/amppd" >> {WRKDIR}/scripts/2_call_haplogroups/run_yhaplo.swarm

In [None]:
#swarm -g 20 -f run_yhaplo.swarm

## 5. Read Old and New Results Before Comparing

In [None]:
# Directories holding the per-dataset input files (the .fam files are read from here below)
UKBB_BFILE = f"{WRKDIR}/y_ukbb"
AMPPD_BFILE = f"{WRKDIR}/y_male_only_bfiles"
NABEC_BFILE = f"{WRKDIR}/y_nabec_files"
NEUROX_BFILE = f"{WRKDIR}/y_neurox"

# Directories holding the results produced with the original reference data
UKBB_OUT = f"{WRKDIR}/output_ukbb"
AMPPD_OUT = f"{WRKDIR}/output_male_hemizygous_only_het_filter_run"
NABEC_OUT = f"{WRKDIR}/output_nabec"
NEUROX_OUT = f"{WRKDIR}/output_neurox"

# Directory holding the results produced with the new reference data (all datasets)
NEW_OUT = f"{WRKDIR}/test_haplo_call"

### Get Sample Names Per Dataset

In [None]:
#AMP-PD
# .fam files are whitespace delimited; r"\s+" is a raw-string regex that accepts tabs
# as well as runs of spaces (the previous "\s" matched exactly one character and was an
# invalid escape in a normal string literal).
samples = pd.read_csv(f"{AMPPD_BFILE}/chrY_male_hemizygous_only_het_filter_hg19_final.fam", sep=r"\s+", header=None)
samples.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']

anc = pd.read_csv("$PATH/euro_king_pca_v2.5_July2021/genetic_ancestry_all_pca.csv")

# keep samples that have an ancestry record and whose inferred population is EUROPE
eur_samples = pd.merge(left=samples, right=anc, left_on="fid", right_on="IID")
eur_samples = eur_samples[eur_samples.InfPop == "EUROPE"]

# ids (as strings) of the samples coded sex == 1
amp_males = [str(fid) for fid in eur_samples[eur_samples.sex == 1]['fid'].tolist()]
print(len(amp_males))

# "<id>_<id>" form of the same ids, used to match the yhaplo and Y-LineageTracker outputs
amp_males_double_id = [f"{iid}_{iid}" for iid in amp_males]
print(len(amp_males_double_id))
print(amp_males_double_id[0:10])

In [None]:
#UKBB
# .fam files are whitespace delimited; r"\s+" is a raw-string regex that accepts tabs
# as well as runs of spaces (the previous "\s" matched exactly one character and was an
# invalid escape in a normal string literal).
samples = pd.read_csv(f"{UKBB_BFILE}/chrY_male_only.fam", sep=r"\s+", header=None)
samples.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']

anc = pd.read_table("$PATH/covariates_phenome_to_use.txt")

# keep samples that have a covariate record and are flagged EUROPEAN == 1
eur_samples = pd.merge(left=samples, right=anc, left_on="fid", right_on="IID")
eur_samples = eur_samples[eur_samples.EUROPEAN == 1]

# ids (as strings) of the samples coded sex == 1
ukbb_males = [str(fid) for fid in eur_samples[eur_samples.sex == 1]['fid'].tolist()]
print(len(ukbb_males))

# "<id>_<id>" form of the same ids, used to match the yhaplo and Y-LineageTracker outputs
ukbb_males_double_id = [f"{iid}_{iid}" for iid in ukbb_males]
print(len(ukbb_males_double_id))
print(ukbb_males_double_id[0:10])

In [None]:
#NABEC
# .fam files are whitespace delimited; r"\s+" is a raw-string regex that accepts tabs
# as well as runs of spaces (the previous "\s" matched exactly one character and was an
# invalid escape in a normal string literal).
samples = pd.read_csv(f"{NABEC_BFILE}/nabec_males_only_hg19_chrY.fam", sep=r"\s+", header=None)
samples.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']
print(samples.shape)
print(samples.head())

# ids (as strings) of the samples coded sex == 1; no ancestry filter is applied here
nabec_males = [str(fid) for fid in samples[samples.sex == 1]['fid'].tolist()]
print(len(nabec_males))

# "<id>_<id>" form of the same ids, used to match the yhaplo and Y-LineageTracker outputs
nabec_males_double_id = [f"{iid}_{iid}" for iid in nabec_males]
print(len(nabec_males_double_id))

In [None]:
#NEUROX
# .fam files are whitespace delimited; r"\s+" is a raw-string regex that accepts tabs
# as well as runs of spaces (the previous "\s" matched exactly one character and was an
# invalid escape in a normal string literal).
samples = pd.read_csv(f"{NEUROX_BFILE}/neurox_chrY_male_only.fam", sep=r"\s+", header=None)
samples.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']
print(samples.shape)
print(samples.head())

# ids (as strings) of the samples coded sex == 1; no ancestry filter is applied here
neurox_males = [str(fid) for fid in samples[samples.sex == 1]['fid'].tolist()]
print(len(neurox_males))

# "<id>_<id>" form of the same ids, used to match the yhaplo and Y-LineageTracker outputs
neurox_males_double_id = [f"{iid}_{iid}" for iid in neurox_males]
print(len(neurox_males_double_id))

### Read Snappy Data

In [None]:
def get_snappy_frequencies(out_path, haplo_file, sample_names):
    """Load snappy calls for the given samples and tabulate haplogroup frequencies.

    Parameters
    ----------
    out_path : str
        Not used by this function; kept so existing call sites keep working.
    haplo_file : str
        Path to a tab-separated snappy output file without a header row. It is
        read as four columns: id, haplo, haplo_score, info_alleles.
    sample_names : list of str
        Sample ids to keep (compared with the ``id`` column as strings).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(snappy_male, snappy_haplo_freqs, snappy_haplo_major_freqs)``: the
        calls of the kept samples with an added ``snappy_haplo_major`` column,
        the count and percent of each full haplogroup, and the count and
        percent of each major haplogroup. Percentages are relative to the
        number of kept samples.
    """
    snappy = pd.read_csv(haplo_file, sep="\t", header=None)
    snappy.columns = ['id', 'haplo', 'haplo_score', 'info_alleles']
    snappy['id'] = snappy['id'].astype(str)

    # Some samples have extra data after the haplogroup, e.g.
    # "B2a1a M109,M152/Page60,P32,P50"; keep only the leading "B2a1a".
    # "no match" is excluded because it contains a space itself.
    called = snappy['haplo'] != "no match"
    snappy.loc[called, 'haplo'] = snappy.loc[called, 'haplo'].str.split(" ").str[0]

    # .copy() gives an independent frame, so adding a column below does not write
    # into a slice of `snappy` (which raises SettingWithCopyWarning).
    snappy_male = snappy[snappy['id'].isin(sample_names)].copy()
    print(sample_names[0:10])

    # Major haplogroup = first character of the haplogroup; "no match" stays as is.
    snappy_male['snappy_haplo_major'] = snappy_male['haplo'].str[0]
    snappy_male.loc[snappy_male['haplo'] == "no match", 'snappy_haplo_major'] = "no match"
    print(snappy_male.shape)
    print(snappy_male.head())

    n_samples = len(snappy_male.index)

    # counts and percents for full haplogroups
    snappy_haplo_freqs = snappy_male['haplo'].value_counts().to_frame()
    snappy_haplo_freqs.columns = ['snappy_haplo_count']
    snappy_haplo_freqs['haplo'] = snappy_haplo_freqs.index
    snappy_haplo_freqs['snappy_haplo_percent'] = snappy_haplo_freqs['snappy_haplo_count'] / n_samples * 100
    print(snappy_haplo_freqs.shape)
    print(snappy_haplo_freqs.head())

    # counts and percents for major haplogroups
    snappy_haplo_major_freqs = snappy_male['snappy_haplo_major'].value_counts().to_frame()
    snappy_haplo_major_freqs.columns = ['snappy_haplo_major_count']
    snappy_haplo_major_freqs['haplo_major'] = snappy_haplo_major_freqs.index
    snappy_haplo_major_freqs['snappy_haplo_major_percent'] = snappy_haplo_major_freqs['snappy_haplo_major_count'] / n_samples * 100
    print(snappy_haplo_major_freqs.shape)
    print(snappy_haplo_major_freqs.head())

    return snappy_male, snappy_haplo_freqs, snappy_haplo_major_freqs

In [None]:
# NeuroX, snappy run with the original reference files
(
    neurox_snappy_haplos,
    neurox_snappy_haplo_freqs,
    neurox_snappy_haplo_major_freqs,
) = get_snappy_frequencies(NEUROX_OUT, f"{NEUROX_OUT}/chrY_hgs_snappy.out", neurox_males)

In [None]:
# NeuroX, snappy run with the new reference files
(
    neurox_new_snappy_haplos,
    neurox_new_snappy_haplo_freqs,
    neurox_new_snappy_haplo_major_freqs,
) = get_snappy_frequencies(NEW_OUT, f"{NEW_OUT}/snappy_test/neurox.out", neurox_males)

In [None]:
# NABEC, snappy run with the original reference files
(
    nabec_snappy_haplos,
    nabec_snappy_haplo_freqs,
    nabec_snappy_haplo_major_freqs,
) = get_snappy_frequencies(NABEC_OUT, f"{NABEC_OUT}/snappy.out", nabec_males)

In [None]:
# NABEC, snappy run with the new reference files
(
    nabec_new_snappy_haplos,
    nabec_new_snappy_haplo_freqs,
    nabec_new_snappy_haplo_major_freqs,
) = get_snappy_frequencies(NEW_OUT, f"{NEW_OUT}/snappy_test/nabec.out", nabec_males)

In [None]:
# UKBB, snappy run with the original reference files
(
    ukbb_snappy_haplos,
    ukbb_snappy_haplo_freqs,
    ukbb_snappy_haplo_major_freqs,
) = get_snappy_frequencies(UKBB_OUT, f"{UKBB_OUT}/chrY_hgs_snappy.out", ukbb_males)

In [None]:
# UKBB, snappy run with the new reference files
(
    ukbb_new_snappy_haplos,
    ukbb_new_snappy_haplo_freqs,
    ukbb_new_snappy_haplo_major_freqs,
) = get_snappy_frequencies(NEW_OUT, f"{NEW_OUT}/snappy_test/ukbb.out", ukbb_males)

In [None]:
# Display the UKBB major-haplogroup frequency table from the original snappy run.
ukbb_snappy_haplo_major_freqs

In [None]:
# AMP-PD, snappy run with the original reference files
(
    amppd_snappy_haplos,
    amppd_snappy_haplo_freqs,
    amppd_snappy_haplo_major_freqs,
) = get_snappy_frequencies(AMPPD_OUT, f"{AMPPD_OUT}/chrY_hgs_snappy.out", amp_males)

In [None]:
# AMP-PD, snappy run with the new reference files
(
    amppd_new_snappy_haplos,
    amppd_new_snappy_haplo_freqs,
    amppd_new_snappy_haplo_major_freqs,
) = get_snappy_frequencies(NEW_OUT, f"{NEW_OUT}/snappy_test/amppd.out", amp_males)

### Read Yhaplo Data

In [None]:
def get_yhaplo_frequencies(out_path, haplo_file, sample_names):
    """Load yhaplo calls for the given samples and tabulate haplogroup frequencies.

    Parameters
    ----------
    out_path : str
        Not used by this function; kept so existing call sites keep working.
    haplo_file : str
        Path to a whitespace-separated yhaplo output file without a header row.
        It is read as four columns: id, haplo_short, haplo_short_rep_snp,
        haplo_long.
    sample_names : list of str
        Sample ids to keep (compared with the ``id`` column as strings).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(yhaplo_male, yhaplo_haplo_freqs, yhaplo_haplo_major_freqs)``: the
        calls of the kept samples with an added ``yhaplo_haplo_major`` column,
        the count and percent of each ``haplo_long`` value, and the count and
        percent of each major haplogroup. Percentages are relative to the
        number of kept samples.
    """
    # Raw string: the separator is a regex and "\s" is not a valid escape in a
    # normal string literal.
    yhaplo = pd.read_csv(
        haplo_file,
        sep=r"\s+",
        header=None,
        names=['id', 'haplo_short', 'haplo_short_rep_snp', 'haplo_long'],
    )
    yhaplo['id'] = yhaplo['id'].astype(str)
    print(yhaplo.shape)
    print(yhaplo.head())

    # Assume samples whose long haplogroup is exactly "A" were not assigned one.
    yhaplo.loc[yhaplo['haplo_long'] == 'A', 'haplo_long'] = 'no match'

    # .copy() gives an independent frame, so adding a column below does not write
    # into a slice of `yhaplo` (which raises SettingWithCopyWarning).
    yhaplo_male = yhaplo[yhaplo['id'].isin(sample_names)].copy()

    # Major haplogroup = first character of the long haplogroup; "no match" stays as is.
    yhaplo_male['yhaplo_haplo_major'] = yhaplo_male['haplo_long'].str[0]
    yhaplo_male.loc[yhaplo_male['haplo_long'] == 'no match', 'yhaplo_haplo_major'] = 'no match'
    print(yhaplo_male.shape)
    print(yhaplo_male.head())

    n_samples = len(yhaplo_male.index)

    # counts and percents for full haplogroups
    yhaplo_haplo_freqs = yhaplo_male['haplo_long'].value_counts().to_frame()
    yhaplo_haplo_freqs.columns = ['yhaplo_haplo_count']
    yhaplo_haplo_freqs['haplo'] = yhaplo_haplo_freqs.index
    yhaplo_haplo_freqs['yhaplo_haplo_percent'] = yhaplo_haplo_freqs['yhaplo_haplo_count'] / n_samples * 100
    print(yhaplo_haplo_freqs.shape)
    print(yhaplo_haplo_freqs.head())

    # counts and percents for major haplogroups
    yhaplo_haplo_major_freqs = yhaplo_male['yhaplo_haplo_major'].value_counts().to_frame()
    yhaplo_haplo_major_freqs.columns = ['yhaplo_haplo_major_count']
    yhaplo_haplo_major_freqs['haplo_major'] = yhaplo_haplo_major_freqs.index
    yhaplo_haplo_major_freqs['yhaplo_haplo_major_percent'] = yhaplo_haplo_major_freqs['yhaplo_haplo_major_count'] / n_samples * 100
    print(yhaplo_haplo_major_freqs.shape)
    print(yhaplo_haplo_major_freqs.head())

    return yhaplo_male, yhaplo_haplo_freqs, yhaplo_haplo_major_freqs

In [None]:
# NeuroX, yhaplo run with the original reference files
(
    neurox_yhaplo_haplos,
    neurox_yhaplo_haplo_freqs,
    neurox_yhaplo_haplo_major_freqs,
) = get_yhaplo_frequencies(
    NEUROX_OUT,
    NEUROX_OUT + "/yhaplo_output/haplogroups.neurox_chrY_male_only.txt",
    neurox_males_double_id,
)

In [None]:
# NeuroX, yhaplo run with the new reference files
(
    neurox_new_yhaplo_haplos,
    neurox_new_yhaplo_haplo_freqs,
    neurox_new_yhaplo_haplo_major_freqs,
) = get_yhaplo_frequencies(
    NEW_OUT,
    f"{NEW_OUT}/yhaplo_test/neurox/haplogroups.neurox_chrY_male_only.txt",
    neurox_males_double_id,
)

In [None]:
# NABEC, yhaplo run with the original reference files
(
    nabec_yhaplo_haplos,
    nabec_yhaplo_haplo_freqs,
    nabec_yhaplo_haplo_major_freqs,
) = get_yhaplo_frequencies(
    NABEC_OUT,
    NABEC_OUT + "/yhaplo_output/haplogroups.nabec_males_only_hg19_chrY.txt",
    nabec_males_double_id,
)

In [None]:
# NABEC, yhaplo run with the new reference files
(
    nabec_new_yhaplo_haplos,
    nabec_new_yhaplo_haplo_freqs,
    nabec_new_yhaplo_haplo_major_freqs,
) = get_yhaplo_frequencies(
    NEW_OUT,
    f"{NEW_OUT}/yhaplo_test/nabec/haplogroups.nabec_males_only_hg19_chrY.txt",
    nabec_males_double_id,
)

In [None]:
# AMP-PD, yhaplo run with the original reference files
(
    amppd_yhaplo_haplos,
    amppd_yhaplo_haplo_freqs,
    amppd_yhaplo_haplo_major_freqs,
) = get_yhaplo_frequencies(
    AMPPD_OUT,
    AMPPD_OUT + "/output_yhaplo/haplogroups.chrY_male_hemizygous_only_het_filter_hg19_final.txt",
    amp_males_double_id,
)

In [None]:
# AMP-PD, yhaplo run with the new reference files
(
    amppd_new_yhaplo_haplos,
    amppd_new_yhaplo_haplo_freqs,
    amppd_new_yhaplo_haplo_major_freqs,
) = get_yhaplo_frequencies(
    NEW_OUT,
    f"{NEW_OUT}/yhaplo_test/amppd/haplogroups.chrY_male_hemizygous_only_het_filter_hg19_final.txt",
    amp_males_double_id,
)

In [None]:
# UKBB, yhaplo run with the original reference files
(
    ukbb_yhaplo_haplos,
    ukbb_yhaplo_haplo_freqs,
    ukbb_yhaplo_haplo_major_freqs,
) = get_yhaplo_frequencies(
    UKBB_OUT,
    UKBB_OUT + "/yhaplo_output/haplogroups.chrY_male_only.txt",
    ukbb_males_double_id,
)

In [None]:
# UKBB, yhaplo run with the new reference files
(
    ukbb_new_yhaplo_haplos,
    ukbb_new_yhaplo_haplo_freqs,
    ukbb_new_yhaplo_haplo_major_freqs,
) = get_yhaplo_frequencies(
    NEW_OUT,
    f"{NEW_OUT}/yhaplo_test/ukbb/haplogroups.chrY_male_only.txt",
    ukbb_males_double_id,
)

### Read Y-LineageTracker Data

In [None]:
def get_ltrack_frequencies(out_path, haplo_file, sample_names):
    """Load Y-LineageTracker calls for the given samples and tabulate frequencies.

    Parameters
    ----------
    out_path : str
        Not used by this function; kept so existing call sites keep working.
    haplo_file : str
        Path to a tab-separated result file with a header row. The columns
        ``SampleID``, ``Haplogroup`` and ``KeyHaplogroup`` are used.
    sample_names : list of str
        Sample ids to keep (compared with ``SampleID`` as strings).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(ltrack_male, ltrack_haplo_freqs, ltrack_haplo_major_freqs)``: the
        calls of the kept samples with added ``ltrack_haplo_major`` and
        ``ltrack_keyhaplo_major`` columns, the count and percent of each
        haplogroup, and the count and percent of each major haplogroup.
        Percentages are relative to the number of kept samples.
    """
    ltrack = pd.read_table(haplo_file)
    ltrack['SampleID'] = ltrack['SampleID'].astype(str)

    # .copy() gives an independent frame, so the assignments below do not write
    # into a slice of `ltrack` (which raises SettingWithCopyWarning).
    ltrack_male = ltrack[ltrack['SampleID'].isin(sample_names)].copy()

    # A haplogroup of "." is relabelled "no match"
    ltrack_male.loc[ltrack_male['Haplogroup'] == ".", 'Haplogroup'] = "no match"

    # Major haplogroup = first character of the haplogroup; "no match" stays as is.
    ltrack_male['ltrack_haplo_major'] = ltrack_male['Haplogroup'].str[0]
    ltrack_male.loc[ltrack_male['Haplogroup'] == "no match", 'ltrack_haplo_major'] = "no match"
    # First character of the key haplogroup (no "no match" relabelling is applied here)
    ltrack_male['ltrack_keyhaplo_major'] = ltrack_male['KeyHaplogroup'].str[0]
    print(ltrack_male.shape)
    print(ltrack_male.head())

    n_samples = len(ltrack_male.index)

    # counts and percents for full haplogroups
    ltrack_haplo_freqs = ltrack_male['Haplogroup'].value_counts().to_frame()
    ltrack_haplo_freqs.columns = ['ltrack_haplo_count']
    ltrack_haplo_freqs['haplo'] = ltrack_haplo_freqs.index
    print(n_samples)
    ltrack_haplo_freqs['ltrack_haplo_percent'] = ltrack_haplo_freqs['ltrack_haplo_count'] / n_samples * 100
    print(ltrack_haplo_freqs.shape)
    print(ltrack_haplo_freqs.head())

    # counts and percents for major haplogroups
    ltrack_haplo_major_freqs = ltrack_male['ltrack_haplo_major'].value_counts().to_frame()
    ltrack_haplo_major_freqs.columns = ['ltrack_haplo_major_count']
    ltrack_haplo_major_freqs['haplo_major'] = ltrack_haplo_major_freqs.index
    print(n_samples)
    ltrack_haplo_major_freqs['ltrack_haplo_major_percent'] = ltrack_haplo_major_freqs['ltrack_haplo_major_count'] / n_samples * 100
    print(ltrack_haplo_major_freqs.shape)
    print(ltrack_haplo_major_freqs.head())

    return ltrack_male, ltrack_haplo_freqs, ltrack_haplo_major_freqs

In [None]:
# UKBB, Y-LineageTracker results
(
    ukbb_ltrack_haplos,
    ukbb_ltrack_haplo_freqs,
    ukbb_ltrack_haplo_major_freqs,
) = get_ltrack_frequencies(
    UKBB_OUT,
    UKBB_OUT + "/ltrack_ukbb_hg19.lineageresult.txt",
    ukbb_males_double_id,
)

In [None]:
# AMP-PD, Y-LineageTracker results
(
    amppd_ltrack_haplos,
    amppd_ltrack_haplo_freqs,
    amppd_ltrack_haplo_major_freqs,
) = get_ltrack_frequencies(
    AMPPD_OUT,
    AMPPD_OUT + "/output_ltracker/ltrack_hg19.lineageresult.txt",
    amp_males_double_id,
)

In [None]:
# NABEC, Y-LineageTracker results
(
    nabec_ltrack_haplos,
    nabec_ltrack_haplo_freqs,
    nabec_ltrack_haplo_major_freqs,
) = get_ltrack_frequencies(
    NABEC_OUT,
    NABEC_OUT + "/output_ltrack/ltrack_hg19.lineageresult.txt",
    nabec_males_double_id,
)

In [None]:
# NeuroX, Y-LineageTracker results
(
    neurox_ltrack_haplos,
    neurox_ltrack_haplo_freqs,
    neurox_ltrack_haplo_major_freqs,
) = get_ltrack_frequencies(
    NEUROX_OUT,
    NEUROX_OUT + "/ltrack_neurox_hg19.lineageresult.txt",
    neurox_males_double_id,
)

## 6. Compare New Result to Old Results
compare:  
 - number of identical samples by full haplogroup between refs used
 - number of identical samples by major haplogroup between refs used 
 - major haplogroup frequencies between refs used
 - major haplogroup frequencies between tools for new refs only
 - anything else done in the main paper for haplogroup frequency checks

### Number of Identical Samples by Full and Major Haplogroup Between References Used

In [None]:
# Empty table that collects one summary row per tool and dataset further below.
comparison_df = pd.DataFrame()

In [None]:
def get_snappy_comparison_df(df1,df2,dataset_name):
    """Summarise how snappy calls agree between two reference-file sets.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Calls made with the original reference files; the columns ``id``,
        ``haplo`` and ``snappy_haplo_major`` are used.
    df2 : pandas.DataFrame
        Calls made with the new reference files, with the same columns.
        The frame passed in is not modified.
    dataset_name : str
        Label stored in the ``dataset`` column.

    Returns
    -------
    pandas.DataFrame
        One row with the tool name, dataset name, number of rows in ``df1``,
        the number of distinct full and major haplogroups in each input, and
        the count and percentage of samples (matched on ``id``) that have the
        same full or major haplogroup in both inputs. Percentages use the
        number of rows in ``df1`` as denominator and are NaN when ``df1`` is
        empty.
    """
    # Prefix every column of the new calls except the join key. rename() returns a
    # new frame, so the caller's df2 keeps its original column names.
    new_calls = df2.rename(columns=lambda c: c if c == 'id' else 'new_' + c)
    ref_comp = pd.merge(left=df1, right=new_calls, on='id')
    print(ref_comp.head())

    ret_dict = {'tool': 'snappy'}
    print(ret_dict)

    n_samples = len(df1.index)

    def _percent(count):
        # share of df1 rows; NaN instead of ZeroDivisionError when df1 is empty
        return (count / n_samples) * 100 if n_samples else float('nan')

    same_full = int((ref_comp['haplo'] == ref_comp['new_haplo']).sum())
    same_major = int((ref_comp['snappy_haplo_major'] == ref_comp['new_snappy_haplo_major']).sum())

    ret_dict['dataset'] = dataset_name
    ret_dict['sample_count'] = n_samples

    ret_dict['original_full_haplo_count'] = len(set(df1['haplo']))
    ret_dict['new_full_haplo_count'] = len(set(new_calls['new_haplo']))
    ret_dict['samples_with_same_full_haplo_count'] = same_full
    ret_dict['%samples_with_same_full_haplo_count'] = _percent(same_full)

    ret_dict['original_major_haplo_count'] = len(set(df1['snappy_haplo_major']))
    ret_dict['new_major_haplo_count'] = len(set(new_calls['new_snappy_haplo_major']))
    ret_dict['samples_with_same_major_haplo_count'] = same_major
    ret_dict['%samples_with_same_major_haplo_count'] = _percent(same_major)

    return pd.DataFrame(data=[ret_dict])

In [None]:
def get_yhaplo_comparison_df(df1,df2,dataset_name):
    """Summarise how yhaplo calls agree between two reference-file sets.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Calls made with the original reference files; the columns ``id``,
        ``haplo_long`` and ``yhaplo_haplo_major`` are used.
    df2 : pandas.DataFrame
        Calls made with the new reference files, with the same columns.
        The frame passed in is not modified.
    dataset_name : str
        Label stored in the ``dataset`` column.

    Returns
    -------
    pandas.DataFrame
        One row with the tool name, dataset name, number of rows in ``df1``,
        the number of distinct full and major haplogroups in each input, and
        the count and percentage of samples (matched on ``id``) that have the
        same full or major haplogroup in both inputs. Percentages use the
        number of rows in ``df1`` as denominator and are NaN when ``df1`` is
        empty.
    """
    # Prefix every column of the new calls except the join key. rename() returns a
    # new frame, so the caller's df2 keeps its original column names.
    new_calls = df2.rename(columns=lambda c: c if c == 'id' else 'new_' + c)
    ref_comp = pd.merge(left=df1, right=new_calls, on='id')
    print(ref_comp.head())

    ret_dict = {'tool': 'yhaplo'}
    print(ret_dict)

    n_samples = len(df1.index)

    def _percent(count):
        # share of df1 rows; NaN instead of ZeroDivisionError when df1 is empty
        return (count / n_samples) * 100 if n_samples else float('nan')

    same_full = int((ref_comp['haplo_long'] == ref_comp['new_haplo_long']).sum())
    same_major = int((ref_comp['yhaplo_haplo_major'] == ref_comp['new_yhaplo_haplo_major']).sum())

    ret_dict['dataset'] = dataset_name
    ret_dict['sample_count'] = n_samples

    ret_dict['original_full_haplo_count'] = len(set(df1['haplo_long']))
    ret_dict['new_full_haplo_count'] = len(set(new_calls['new_haplo_long']))
    ret_dict['samples_with_same_full_haplo_count'] = same_full
    ret_dict['%samples_with_same_full_haplo_count'] = _percent(same_full)

    ret_dict['original_major_haplo_count'] = len(set(df1['yhaplo_haplo_major']))
    ret_dict['new_major_haplo_count'] = len(set(new_calls['new_yhaplo_haplo_major']))
    ret_dict['samples_with_same_major_haplo_count'] = same_major
    ret_dict['%samples_with_same_major_haplo_count'] = _percent(same_major)

    return pd.DataFrame(data=[ret_dict])

In [None]:
def get_ltrack_comparison_df(df1,df2,dataset_name):
    """Summarise how Y-LineageTracker calls agree between two result sets.

    Parameters
    ----------
    df1 : pandas.DataFrame
        First set of calls; the columns ``SampleID``, ``Haplogroup`` and
        ``ltrack_haplo_major`` are used.
    df2 : pandas.DataFrame
        Second set of calls, with the same columns. The frame passed in is
        not modified.
    dataset_name : str
        Label stored in the ``dataset`` column.

    Returns
    -------
    pandas.DataFrame
        One row with the tool name, dataset name, number of rows in ``df1``,
        the number of distinct full and major haplogroups in each input, and
        the count and percentage of samples (matched on ``SampleID``) that
        have the same full or major haplogroup in both inputs. Percentages
        use the number of rows in ``df1`` as denominator and are NaN when
        ``df1`` is empty.
    """
    # Prefix every column of the second set except the join key. rename() returns a
    # new frame, so the caller's df2 keeps its original column names.
    new_calls = df2.rename(columns=lambda c: c if c == 'SampleID' else 'new_' + c)
    ref_comp = pd.merge(left=df1, right=new_calls, on='SampleID')
    print(ref_comp.head())

    ret_dict = {'tool': 'ltrack'}
    print(ret_dict)

    n_samples = len(df1.index)

    def _percent(count):
        # share of df1 rows; NaN instead of ZeroDivisionError when df1 is empty
        return (count / n_samples) * 100 if n_samples else float('nan')

    same_full = int((ref_comp['Haplogroup'] == ref_comp['new_Haplogroup']).sum())
    same_major = int((ref_comp['ltrack_haplo_major'] == ref_comp['new_ltrack_haplo_major']).sum())

    ret_dict['dataset'] = dataset_name
    ret_dict['sample_count'] = n_samples

    ret_dict['original_full_haplo_count'] = len(set(df1['Haplogroup']))
    ret_dict['new_full_haplo_count'] = len(set(new_calls['new_Haplogroup']))
    ret_dict['samples_with_same_full_haplo_count'] = same_full
    ret_dict['%samples_with_same_full_haplo_count'] = _percent(same_full)

    ret_dict['original_major_haplo_count'] = len(set(df1['ltrack_haplo_major']))
    ret_dict['new_major_haplo_count'] = len(set(new_calls['new_ltrack_haplo_major']))
    ret_dict['samples_with_same_major_haplo_count'] = same_major
    ret_dict['%samples_with_same_major_haplo_count'] = _percent(same_major)

    return pd.DataFrame(data=[ret_dict])

In [None]:
# One summary row per tool and dataset, stacked in a single pd.concat call
# (DataFrame.append is deprecated and removed in newer pandas versions).
# Copies are passed so the comparison helpers cannot alter the result tables.
comparison_df = pd.concat([
    comparison_df,
    # snappy: original vs new reference files
    get_snappy_comparison_df(neurox_snappy_haplos.copy(), neurox_new_snappy_haplos.copy(), 'neurox'),
    get_snappy_comparison_df(nabec_snappy_haplos.copy(), nabec_new_snappy_haplos.copy(), 'nabec'),
    get_snappy_comparison_df(ukbb_snappy_haplos.copy(), ukbb_new_snappy_haplos.copy(), 'ukbb'),
    get_snappy_comparison_df(amppd_snappy_haplos.copy(), amppd_new_snappy_haplos.copy(), 'amppd'),
    # yhaplo: original vs new reference files
    get_yhaplo_comparison_df(neurox_yhaplo_haplos.copy(), neurox_new_yhaplo_haplos.copy(), 'neurox'),
    get_yhaplo_comparison_df(nabec_yhaplo_haplos.copy(), nabec_new_yhaplo_haplos.copy(), 'nabec'),
    get_yhaplo_comparison_df(ukbb_yhaplo_haplos.copy(), ukbb_new_yhaplo_haplos.copy(), 'ukbb'),
    get_yhaplo_comparison_df(amppd_yhaplo_haplos.copy(), amppd_new_yhaplo_haplos.copy(), 'amppd'),
    # Y-LineageTracker: each result is compared with itself, because only one set of
    # Y-LineageTracker results is loaded in this notebook
    get_ltrack_comparison_df(neurox_ltrack_haplos.copy(), neurox_ltrack_haplos.copy(), 'neurox'),
    get_ltrack_comparison_df(nabec_ltrack_haplos.copy(), nabec_ltrack_haplos.copy(), 'nabec'),
    get_ltrack_comparison_df(ukbb_ltrack_haplos.copy(), ukbb_ltrack_haplos.copy(), 'ukbb'),
    get_ltrack_comparison_df(amppd_ltrack_haplos.copy(), amppd_ltrack_haplos.copy(), 'amppd'),
])

In [None]:
# Display the collected summary rows (one per tool and dataset).
comparison_df

In [None]:
# Save the summary table as CSV with a header row and no index column
comparison_df.to_csv(
    f"{WRKDIR}/haplo_caller_ref_comparison.csv",
    index=False,
)