# Calculate Haplogroup Frequencies in Samples
- **Author(s)** - Frank Grenn
- **Date Started** - February 2021
- **Quick Description:** Check haplogroup counts from the SNAPPY, yhaplo and Y-LineageTracker tools in the AMP-PD, NABEC, NeuroX and UKBB samples.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string

In [None]:
# Base directories.
# NOTE(review): "$PATH" is a literal placeholder here (Python does not expand
# it) — substitute the real base path before running.
WRKDIR = "$PATH/chrY"
AMPDIR = "$PATH/PD/AMP-PD"

# Per dataset: *_BFILE is the directory the .fam file is read from,
# *_OUT is the directory holding the haplogroup caller outputs and the
# summary CSVs written by this notebook.

# UK Biobank
UKBB_BFILE = f"{WRKDIR}/y_ukbb"
UKBB_OUT = f"{WRKDIR}/output_ukbb"

# AMP-PD
AMPPD_BFILE = f"{WRKDIR}/y_male_only_bfiles"
AMPPD_OUT = f"{WRKDIR}/output_male_hemizygous_only_het_filter_run"

# NABEC
NABEC_BFILE = f"{WRKDIR}/y_nabec_files"
NABEC_OUT = f"{WRKDIR}/output_nabec"

# NeuroX
NEUROX_BFILE = f"{WRKDIR}/y_neurox"
NEUROX_OUT = f"{WRKDIR}/output_neurox"

## 1) Get sample names for the various datasets

In [None]:
#AMP-PD
# The .fam file is whitespace-delimited with no header row. r"\s+" replaces the
# invalid "\s" string escape and, unlike a one-character regex, is handled by
# pandas' fast C parser.
samples = pd.read_csv(
    f"{AMPPD_BFILE}/chrY_male_hemizygous_only_het_filter_hg19_final.fam",
    sep=r"\s+",
    header=None,
)
samples.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']
print(samples.shape)
print(samples.head())

# genetic ancestry assignments, one row per sample (IID) with an InfPop label
anc = pd.read_csv("$PATH/genetic_ancestry_all_pca.csv")
print(anc.shape)
print(anc.head())

# keep only samples whose inferred population is European
# NOTE(review): the join uses fid against IID — presumably fid == iid in this
# .fam file; confirm.
eur_samples = pd.merge(left=samples, right=anc, left_on="fid", right_on="IID")
eur_samples = eur_samples[eur_samples['InfPop'] == "EUROPE"]
print(eur_samples.shape)
print(eur_samples.head())

# male samples (sex == 1) as strings
amp_males = [str(fid) for fid in eur_samples.loc[eur_samples['sex'] == 1, 'fid'].tolist()]
print(len(amp_males))

# "<id>_<id>" form of the ids, used to match the yhaplo and Y-LineageTracker outputs
amp_males_double_id = [f"{iid}_{iid}" for iid in amp_males]
print(len(amp_males_double_id))
print(amp_males_double_id[0:10])

In [None]:
#UKBB
# The .fam file is whitespace-delimited with no header row. r"\s+" replaces the
# invalid "\s" string escape and, unlike a one-character regex, is handled by
# pandas' fast C parser.
samples = pd.read_csv(f"{UKBB_BFILE}/chrY_male_only.fam", sep=r"\s+", header=None)
samples.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']
print(samples.shape)
print(samples.head())

# covariate table, one row per sample (IID) with a EUROPEAN indicator column
anc = pd.read_table("$PATH/covariates_phenome_to_use.txt")
print(anc.shape)
print(anc.head())

# keep only samples flagged as European
# NOTE(review): the join uses fid against IID — presumably fid == iid in this
# .fam file; confirm.
eur_samples = pd.merge(left=samples, right=anc, left_on="fid", right_on="IID")
eur_samples = eur_samples[eur_samples['EUROPEAN'] == 1]
print(eur_samples.shape)
print(eur_samples.head())

# male samples (sex == 1) as strings
ukbb_males = [str(fid) for fid in eur_samples.loc[eur_samples['sex'] == 1, 'fid'].tolist()]
print(len(ukbb_males))

# "<id>_<id>" form of the ids, used to match the yhaplo and Y-LineageTracker outputs
ukbb_males_double_id = [f"{iid}_{iid}" for iid in ukbb_males]
print(len(ukbb_males_double_id))
print(ukbb_males_double_id[0:10])

In [None]:
#NABEC
# The .fam file is whitespace-delimited with no header row. r"\s+" replaces the
# invalid "\s" string escape and, unlike a one-character regex, is handled by
# pandas' fast C parser.
samples = pd.read_csv(
    f"{NABEC_BFILE}/nabec_males_only_hg19_chrY.fam",
    sep=r"\s+",
    header=None,
)
samples.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']
print(samples.shape)
print(samples.head())

# male samples (sex == 1) as strings; no ancestry filter is applied for this dataset
nabec_males = [str(fid) for fid in samples.loc[samples['sex'] == 1, 'fid'].tolist()]
print(len(nabec_males))

# "<id>_<id>" form of the ids, used to match the yhaplo and Y-LineageTracker outputs
nabec_males_double_id = [f"{iid}_{iid}" for iid in nabec_males]
print(len(nabec_males_double_id))

In [None]:
#NEUROX
# The .fam file is whitespace-delimited with no header row. r"\s+" replaces the
# invalid "\s" string escape and, unlike a one-character regex, is handled by
# pandas' fast C parser.
samples = pd.read_csv(
    f"{NEUROX_BFILE}/neurox_chrY_male_only.fam",
    sep=r"\s+",
    header=None,
)
samples.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']
print(samples.shape)
print(samples.head())

# male samples (sex == 1) as strings; no ancestry filter is applied for this dataset
neurox_males = [str(fid) for fid in samples.loc[samples['sex'] == 1, 'fid'].tolist()]
print(len(neurox_males))

# "<id>_<id>" form of the ids, used to match the yhaplo and Y-LineageTracker outputs
neurox_males_double_id = [f"{iid}_{iid}" for iid in neurox_males]
print(len(neurox_males_double_id))

## 2) SNAPPY results

In [None]:
def get_snappy_frequencies(out_path, haplo_file, sample_names):
    """Tabulate SNAPPY haplogroup calls for the requested samples.

    Reads the tab-separated, header-less SNAPPY output (columns are named
    ``id``, ``haplo``, ``haplo_score``, ``info_alleles`` here), keeps the rows
    whose id is in ``sample_names`` and counts how often each full haplogroup
    and each major haplogroup (first character of the haplogroup) occurs.
    Rows labelled "no match" keep that label in both tables.

    Args:
        out_path: Unused; kept so existing callers keep working.
        haplo_file: Path of the SNAPPY output file.
        sample_names: Sample ids to keep. Ids are compared as strings, so a
            file whose ids pandas parses as integers still matches.

    Returns:
        ``(haplo_freqs, haplo_major_freqs)``. ``haplo_freqs`` has columns
        ``snappy_haplo_count``, ``haplo``, ``snappy_haplo_percent``;
        ``haplo_major_freqs`` has ``snappy_haplo_major_count``,
        ``haplo_major``, ``snappy_haplo_major_percent``. Both are indexed by
        the haplogroup label (unnamed index); percentages are relative to the
        number of kept samples.
    """
    snappy = pd.read_csv(haplo_file, sep="\t", header=None)
    snappy.columns = ['id', 'haplo', 'haplo_score', 'info_alleles']

    # some samples, like "PD-PDNZ095VCJ", have extra data in the "haplo" column,
    # like "B2a1a M109,M152/Page60,P32,P50", and we only want the "B2a1a"
    is_called = snappy['haplo'] != "no match"
    snappy.loc[is_called, 'haplo'] = snappy.loc[is_called, 'haplo'].str.split(" ").str[0]

    # compare ids as strings on both sides; .copy() so the column assignments
    # below act on an independent frame rather than a slice of `snappy`
    wanted = {str(name) for name in sample_names}
    snappy_male = snappy[snappy['id'].astype(str).isin(wanted)].copy()
    print(sample_names[0:10])

    # major haplogroup = first character of the haplogroup, except "no match"
    snappy_male['snappy_haplo_major'] = snappy_male['haplo'].str[0]
    snappy_male.loc[snappy_male['haplo'] == "no match", 'snappy_haplo_major'] = "no match"
    print(snappy_male.shape)
    print(snappy_male.head())

    n_samples = len(snappy_male.index)

    # counts and percents for full haplogroups
    snappy_haplo_freqs = snappy_male['haplo'].value_counts().to_frame()
    snappy_haplo_freqs.columns = ['snappy_haplo_count']
    # pandas >= 2.0 names the value_counts index after the source column; an
    # index called "haplo" next to a "haplo" column makes merge(on="haplo")
    # ambiguous, so drop the index name
    snappy_haplo_freqs.index.name = None
    snappy_haplo_freqs['haplo'] = snappy_haplo_freqs.index
    snappy_haplo_freqs['snappy_haplo_percent'] = snappy_haplo_freqs['snappy_haplo_count'] / n_samples * 100
    print(snappy_haplo_freqs.shape)
    print(snappy_haplo_freqs.head())

    # counts and percents for major haplogroups
    snappy_haplo_major_freqs = snappy_male['snappy_haplo_major'].value_counts().to_frame()
    snappy_haplo_major_freqs.columns = ['snappy_haplo_major_count']
    snappy_haplo_major_freqs.index.name = None
    snappy_haplo_major_freqs['haplo_major'] = snappy_haplo_major_freqs.index
    snappy_haplo_major_freqs['snappy_haplo_major_percent'] = (
        snappy_haplo_major_freqs['snappy_haplo_major_count'] / n_samples * 100
    )
    print(snappy_haplo_major_freqs.shape)
    print(snappy_haplo_major_freqs.head())

    return snappy_haplo_freqs, snappy_haplo_major_freqs

In [None]:
#NEUROX
# SNAPPY frequency tables for the NeuroX males, matched on the plain sample ids
neurox_snappy_haplo_freqs, neurox_snappy_haplo_major_freqs = get_snappy_frequencies(
    out_path=NEUROX_OUT,
    haplo_file=f"{NEUROX_OUT}/chrY_hgs_snappy.out",
    sample_names=neurox_males,
)

In [None]:
#NABEC
# SNAPPY frequency tables for the NABEC males, matched on the plain sample ids
nabec_snappy_haplo_freqs, nabec_snappy_haplo_major_freqs = get_snappy_frequencies(
    out_path=NABEC_OUT,
    haplo_file=f"{NABEC_OUT}/snappy.out",
    sample_names=nabec_males,
)

In [None]:
#UKBB
# SNAPPY frequency tables for the UKBB European males, matched on the plain sample ids
ukbb_snappy_haplo_freqs, ukbb_snappy_haplo_major_freqs = get_snappy_frequencies(
    out_path=UKBB_OUT,
    haplo_file=f"{UKBB_OUT}/chrY_hgs_snappy.out",
    sample_names=ukbb_males,
)

In [None]:
#AMPPD
# SNAPPY frequency tables for the AMP-PD European males, matched on the plain sample ids
amppd_snappy_haplo_freqs, amppd_snappy_haplo_major_freqs = get_snappy_frequencies(
    out_path=AMPPD_OUT,
    haplo_file=f"{AMPPD_OUT}/chrY_hgs_snappy.out",
    sample_names=amp_males,
)

## 3) YHaplo Tool Results

In [None]:
def get_yhaplo_frequencies(out_path, haplo_file, sample_names):
    """Tabulate yhaplo haplogroup calls for the requested samples.

    Reads the whitespace-separated, header-less yhaplo output (columns are
    named ``id``, ``haplo_short``, ``haplo_short_rep_snp``, ``haplo_long``
    here), relabels a ``haplo_long`` of exactly "A" as "no match", keeps the
    rows whose id is in ``sample_names`` and counts how often each long-form
    haplogroup and each major haplogroup (its first character) occurs.

    Args:
        out_path: Unused; kept so existing callers keep working.
        haplo_file: Path of the yhaplo haplogroups file.
        sample_names: Sample ids to keep; ids are compared as strings.

    Returns:
        ``(haplo_freqs, haplo_major_freqs)``. ``haplo_freqs`` has columns
        ``yhaplo_haplo_count``, ``haplo``, ``yhaplo_haplo_percent``;
        ``haplo_major_freqs`` has ``yhaplo_haplo_major_count``,
        ``haplo_major``, ``yhaplo_haplo_major_percent``. Both are indexed by
        the haplogroup label (unnamed index); percentages are relative to the
        number of kept samples.
    """
    # raw string: "\s+" without the r prefix is an invalid escape sequence
    yhaplo = pd.read_csv(haplo_file, sep=r"\s+", header=None)
    yhaplo.columns = ['id', 'haplo_short', 'haplo_short_rep_snp', 'haplo_long']
    print(yhaplo.shape)
    print(yhaplo.head())

    # assume samples with "A" haplogroup were not assigned one.
    yhaplo_pass = yhaplo.copy()
    yhaplo_pass.loc[yhaplo_pass['haplo_long'] == 'A', 'haplo_long'] = 'no match'

    # .copy() so the column assignments below act on an independent frame
    wanted = {str(name) for name in sample_names}
    yhaplo_male = yhaplo_pass[yhaplo_pass['id'].astype(str).isin(wanted)].copy()

    # major haplogroup = first character of the long haplogroup, except "no match"
    yhaplo_male['yhaplo_haplo_major'] = yhaplo_male['haplo_long'].str[0]
    yhaplo_male.loc[yhaplo_male['haplo_long'] == 'no match', 'yhaplo_haplo_major'] = 'no match'
    print(yhaplo_male.shape)
    print(yhaplo_male.head())

    n_samples = len(yhaplo_male.index)

    # counts and percents for full haplogroups
    yhaplo_haplo_freqs = yhaplo_male['haplo_long'].value_counts().to_frame()
    yhaplo_haplo_freqs.columns = ['yhaplo_haplo_count']
    # keep the index unnamed regardless of pandas version (>= 2.0 names it
    # after the counted column)
    yhaplo_haplo_freqs.index.name = None
    yhaplo_haplo_freqs['haplo'] = yhaplo_haplo_freqs.index
    yhaplo_haplo_freqs['yhaplo_haplo_percent'] = yhaplo_haplo_freqs['yhaplo_haplo_count'] / n_samples * 100
    print(yhaplo_haplo_freqs.shape)
    print(yhaplo_haplo_freqs.head())

    # counts and percents for major haplogroups
    yhaplo_haplo_major_freqs = yhaplo_male['yhaplo_haplo_major'].value_counts().to_frame()
    yhaplo_haplo_major_freqs.columns = ['yhaplo_haplo_major_count']
    yhaplo_haplo_major_freqs.index.name = None
    yhaplo_haplo_major_freqs['haplo_major'] = yhaplo_haplo_major_freqs.index
    yhaplo_haplo_major_freqs['yhaplo_haplo_major_percent'] = (
        yhaplo_haplo_major_freqs['yhaplo_haplo_major_count'] / n_samples * 100
    )
    print(yhaplo_haplo_major_freqs.shape)
    print(yhaplo_haplo_major_freqs.head())

    return yhaplo_haplo_freqs, yhaplo_haplo_major_freqs

In [None]:
# yhaplo frequency tables for the NeuroX males, matched on the "<id>_<id>" sample ids
neurox_yhaplo_haplo_freqs, neurox_yhaplo_haplo_major_freqs = get_yhaplo_frequencies(
    out_path=NEUROX_OUT,
    haplo_file=f"{NEUROX_OUT}/yhaplo_output/haplogroups.neurox_chrY_male_only.txt",
    sample_names=neurox_males_double_id,
)

In [None]:
# yhaplo frequency tables for the NABEC males, matched on the "<id>_<id>" sample ids
nabec_yhaplo_haplo_freqs, nabec_yhaplo_haplo_major_freqs = get_yhaplo_frequencies(
    out_path=NABEC_OUT,
    haplo_file=f"{NABEC_OUT}/yhaplo_output/haplogroups.nabec_males_only_hg19_chrY.txt",
    sample_names=nabec_males_double_id,
)

In [None]:
# yhaplo frequency tables for the AMP-PD European males, matched on the "<id>_<id>" sample ids
amppd_yhaplo_haplo_freqs, amppd_yhaplo_haplo_major_freqs = get_yhaplo_frequencies(
    out_path=AMPPD_OUT,
    haplo_file=f"{AMPPD_OUT}/output_yhaplo/haplogroups.chrY_male_hemizygous_only_het_filter_hg19_final.txt",
    sample_names=amp_males_double_id,
)

In [None]:
# yhaplo frequency tables for the UKBB European males, matched on the "<id>_<id>" sample ids
ukbb_yhaplo_haplo_freqs, ukbb_yhaplo_haplo_major_freqs = get_yhaplo_frequencies(
    out_path=UKBB_OUT,
    haplo_file=f"{UKBB_OUT}/yhaplo_output/haplogroups.chrY_male_only.txt",
    sample_names=ukbb_males_double_id,
)

## 4) Y-LineageTracker Results

In [None]:
def get_ltrack_frequencies(out_path, haplo_file, sample_names):
    """Tabulate Y-LineageTracker haplogroup calls for the requested samples.

    Reads the tab-separated result table (with a header providing at least
    ``SampleID``, ``Haplogroup`` and ``KeyHaplogroup``), keeps the rows whose
    ``SampleID`` is in ``sample_names``, relabels a ``Haplogroup`` of "." as
    "no match" and counts how often each full haplogroup and each major
    haplogroup (first character of ``Haplogroup``) occurs.

    Args:
        out_path: Unused; kept so existing callers keep working.
        haplo_file: Path of the Y-LineageTracker lineage result file.
        sample_names: Sample ids to keep; ids are compared as strings.

    Returns:
        ``(haplo_freqs, haplo_major_freqs)``. ``haplo_freqs`` has columns
        ``ltrack_haplo_count``, ``haplo``, ``ltrack_haplo_percent``;
        ``haplo_major_freqs`` has ``ltrack_haplo_major_count``,
        ``haplo_major``, ``ltrack_haplo_major_percent``. Both are indexed by
        the haplogroup label (unnamed index); percentages are relative to the
        number of kept samples.
    """
    ltrack = pd.read_table(haplo_file)

    # .copy() so the assignments below act on an independent frame rather than
    # on a slice of `ltrack`
    wanted = {str(name) for name in sample_names}
    ltrack_male = ltrack[ltrack['SampleID'].astype(str).isin(wanted)].copy()

    # "." is relabelled so all three callers share the same "no match" label
    ltrack_male.loc[ltrack_male['Haplogroup'] == ".", 'Haplogroup'] = "no match"

    # major haplogroup = first character of the haplogroup, except "no match"
    ltrack_male['ltrack_haplo_major'] = ltrack_male['Haplogroup'].str[0]
    ltrack_male.loc[ltrack_male['Haplogroup'] == "no match", 'ltrack_haplo_major'] = "no match"
    # only shown in the preview printed below; not part of the returned tables
    ltrack_male['ltrack_keyhaplo_major'] = ltrack_male['KeyHaplogroup'].str[0]
    print(ltrack_male.shape)
    print(ltrack_male.head())

    n_samples = len(ltrack_male.index)

    # counts and percents for full haplogroups
    ltrack_haplo_freqs = ltrack_male['Haplogroup'].value_counts().to_frame()
    ltrack_haplo_freqs.columns = ['ltrack_haplo_count']
    # keep the index unnamed regardless of pandas version (>= 2.0 names it
    # after the counted column)
    ltrack_haplo_freqs.index.name = None
    ltrack_haplo_freqs['haplo'] = ltrack_haplo_freqs.index
    print(n_samples)
    ltrack_haplo_freqs['ltrack_haplo_percent'] = ltrack_haplo_freqs['ltrack_haplo_count'] / n_samples * 100
    print(ltrack_haplo_freqs.shape)
    print(ltrack_haplo_freqs.head())

    # counts and percents for major haplogroups
    ltrack_haplo_major_freqs = ltrack_male['ltrack_haplo_major'].value_counts().to_frame()
    ltrack_haplo_major_freqs.columns = ['ltrack_haplo_major_count']
    ltrack_haplo_major_freqs.index.name = None
    ltrack_haplo_major_freqs['haplo_major'] = ltrack_haplo_major_freqs.index
    print(n_samples)
    ltrack_haplo_major_freqs['ltrack_haplo_major_percent'] = (
        ltrack_haplo_major_freqs['ltrack_haplo_major_count'] / n_samples * 100
    )
    print(ltrack_haplo_major_freqs.shape)
    print(ltrack_haplo_major_freqs.head())

    return ltrack_haplo_freqs, ltrack_haplo_major_freqs

In [None]:
# Y-LineageTracker frequency tables for the UKBB European males, matched on the "<id>_<id>" sample ids
ukbb_ltrack_haplo_freqs, ukbb_ltrack_haplo_major_freqs = get_ltrack_frequencies(
    out_path=UKBB_OUT,
    haplo_file=f"{UKBB_OUT}/ltrack_ukbb_hg19.lineageresult.txt",
    sample_names=ukbb_males_double_id,
)

In [None]:
# Y-LineageTracker frequency tables for the AMP-PD European males, matched on the "<id>_<id>" sample ids
amppd_ltrack_haplo_freqs, amppd_ltrack_haplo_major_freqs = get_ltrack_frequencies(
    out_path=AMPPD_OUT,
    haplo_file=f"{AMPPD_OUT}/output_ltracker/ltrack_hg19.lineageresult.txt",
    sample_names=amp_males_double_id,
)

In [None]:
# Y-LineageTracker frequency tables for the NABEC males, matched on the "<id>_<id>" sample ids
nabec_ltrack_haplo_freqs, nabec_ltrack_haplo_major_freqs = get_ltrack_frequencies(
    out_path=NABEC_OUT,
    haplo_file=f"{NABEC_OUT}/output_ltrack/ltrack_hg19.lineageresult.txt",
    sample_names=nabec_males_double_id,
)

In [None]:
# Y-LineageTracker frequency tables for the NeuroX males, matched on the "<id>_<id>" sample ids
neurox_ltrack_haplo_freqs, neurox_ltrack_haplo_major_freqs = get_ltrack_frequencies(
    out_path=NEUROX_OUT,
    haplo_file=f"{NEUROX_OUT}/ltrack_neurox_hg19.lineageresult.txt",
    sample_names=neurox_males_double_id,
)

## 5) Combine by dataset

### AMPPD

In [None]:
#AMPPD
# number of distinct full haplogroups per caller, not counting "no match"
print(len(set(amppd_snappy_haplo_freqs['haplo']) - {"no match"}))
print(len(set(amppd_yhaplo_haplo_freqs['haplo']) - {"no match"}))
print(len(set(amppd_ltrack_haplo_freqs['haplo']) - {"no match"}))

In [None]:
# Outer-join the three callers' full-haplogroup tables on the haplogroup label,
# so a haplogroup reported by only some tools is still listed.
amp_haplo_freqs = (
    amppd_snappy_haplo_freqs
    .merge(amppd_yhaplo_haplo_freqs, on='haplo', how='outer')
    .merge(amppd_ltrack_haplo_freqs, on='haplo', how='outer')
    .sort_values(by=['haplo'])
)

# label first, then the three counts, then the three percentages
amp_haplo_freqs = amp_haplo_freqs[[
    'haplo',
    'snappy_haplo_count', 'yhaplo_haplo_count', 'ltrack_haplo_count',
    'snappy_haplo_percent', 'yhaplo_haplo_percent', 'ltrack_haplo_percent',
]]
print(amp_haplo_freqs.shape)
print(amp_haplo_freqs.head())
amp_haplo_freqs.to_csv(f"{AMPPD_OUT}/haplotype_full_male_only_new.csv", index=None)

In [None]:
# number of distinct major haplogroups per caller, not counting "no match"
print(len(set(amppd_snappy_haplo_major_freqs['haplo_major']) - {"no match"}))
print(len(set(amppd_yhaplo_haplo_major_freqs['haplo_major']) - {"no match"}))
print(len(set(amppd_ltrack_haplo_major_freqs['haplo_major']) - {"no match"}))

In [None]:
# Outer-join the three callers' major-haplogroup tables on the haplogroup label
amp_haplo_major_freqs = (
    amppd_snappy_haplo_major_freqs
    .merge(amppd_yhaplo_haplo_major_freqs, on='haplo_major', how='outer')
    .merge(amppd_ltrack_haplo_major_freqs, on='haplo_major', how='outer')
    .sort_values(by=['haplo_major'])
)
print(amp_haplo_major_freqs.shape)
print(amp_haplo_major_freqs.head())

# label first, then the three counts, then the three percentages
amp_haplo_major_freqs = amp_haplo_major_freqs[[
    'haplo_major',
    'snappy_haplo_major_count', 'yhaplo_haplo_major_count', 'ltrack_haplo_major_count',
    'snappy_haplo_major_percent', 'yhaplo_haplo_major_percent', 'ltrack_haplo_major_percent',
]]
amp_haplo_major_freqs.to_csv(f"{AMPPD_OUT}/haplotype_major_male_only_new.csv", index=None)

### UKBB

In [None]:
# major haplogroups SNAPPY assigned in UKBB, without the "no match" label
set(ukbb_snappy_haplo_major_freqs['haplo_major']) - {"no match"}

In [None]:
# number of distinct full haplogroups per caller, not counting "no match"
print(len(set(ukbb_snappy_haplo_freqs['haplo']) - {"no match"}))
print(len(set(ukbb_yhaplo_haplo_freqs['haplo']) - {"no match"}))
print(len(set(ukbb_ltrack_haplo_freqs['haplo']) - {"no match"}))

# Outer-join the three callers' full-haplogroup tables on the haplogroup label,
# so a haplogroup reported by only some tools is still listed.
ukbb_haplo_freqs = (
    ukbb_snappy_haplo_freqs
    .merge(ukbb_yhaplo_haplo_freqs, on='haplo', how='outer')
    .merge(ukbb_ltrack_haplo_freqs, on='haplo', how='outer')
    .sort_values(by=['haplo'])
)

# label first, then the three counts, then the three percentages
ukbb_haplo_freqs = ukbb_haplo_freqs[[
    'haplo',
    'snappy_haplo_count', 'yhaplo_haplo_count', 'ltrack_haplo_count',
    'snappy_haplo_percent', 'yhaplo_haplo_percent', 'ltrack_haplo_percent',
]]
print(ukbb_haplo_freqs.shape)
print(ukbb_haplo_freqs.head())
ukbb_haplo_freqs.to_csv(f"{UKBB_OUT}/haplotype_full_male_only_new.csv", index=None)

# number of distinct major haplogroups per caller, not counting "no match"
print(len(set(ukbb_snappy_haplo_major_freqs['haplo_major']) - {"no match"}))
print(len(set(ukbb_yhaplo_haplo_major_freqs['haplo_major']) - {"no match"}))
print(len(set(ukbb_ltrack_haplo_major_freqs['haplo_major']) - {"no match"}))

# same join for the major-haplogroup tables
ukbb_haplo_major_freqs = (
    ukbb_snappy_haplo_major_freqs
    .merge(ukbb_yhaplo_haplo_major_freqs, on='haplo_major', how='outer')
    .merge(ukbb_ltrack_haplo_major_freqs, on='haplo_major', how='outer')
    .sort_values(by=['haplo_major'])
)
print(ukbb_haplo_major_freqs.shape)
print(ukbb_haplo_major_freqs.head())
ukbb_haplo_major_freqs = ukbb_haplo_major_freqs[[
    'haplo_major',
    'snappy_haplo_major_count', 'yhaplo_haplo_major_count', 'ltrack_haplo_major_count',
    'snappy_haplo_major_percent', 'yhaplo_haplo_major_percent', 'ltrack_haplo_major_percent',
]]
ukbb_haplo_major_freqs.to_csv(f"{UKBB_OUT}/haplotype_major_male_only_new.csv", index=None)

### NABEC

In [None]:
# number of distinct full haplogroups per caller, not counting "no match"
print(len(set(nabec_snappy_haplo_freqs['haplo']) - {"no match"}))
print(len(set(nabec_yhaplo_haplo_freqs['haplo']) - {"no match"}))
print(len(set(nabec_ltrack_haplo_freqs['haplo']) - {"no match"}))

# Outer-join the three callers' full-haplogroup tables on the haplogroup label,
# so a haplogroup reported by only some tools is still listed.
nabec_haplo_freqs = (
    nabec_snappy_haplo_freqs
    .merge(nabec_yhaplo_haplo_freqs, on='haplo', how='outer')
    .merge(nabec_ltrack_haplo_freqs, on='haplo', how='outer')
    .sort_values(by=['haplo'])
)

# label first, then the three counts, then the three percentages
nabec_haplo_freqs = nabec_haplo_freqs[[
    'haplo',
    'snappy_haplo_count', 'yhaplo_haplo_count', 'ltrack_haplo_count',
    'snappy_haplo_percent', 'yhaplo_haplo_percent', 'ltrack_haplo_percent',
]]
print(nabec_haplo_freqs.shape)
print(nabec_haplo_freqs.head())
nabec_haplo_freqs.to_csv(f"{NABEC_OUT}/haplotype_full_male_only_new.csv", index=None)

# number of distinct major haplogroups per caller, not counting "no match"
print(len(set(nabec_snappy_haplo_major_freqs['haplo_major']) - {"no match"}))
print(len(set(nabec_yhaplo_haplo_major_freqs['haplo_major']) - {"no match"}))
print(len(set(nabec_ltrack_haplo_major_freqs['haplo_major']) - {"no match"}))

# same join for the major-haplogroup tables
nabec_haplo_major_freqs = (
    nabec_snappy_haplo_major_freqs
    .merge(nabec_yhaplo_haplo_major_freqs, on='haplo_major', how='outer')
    .merge(nabec_ltrack_haplo_major_freqs, on='haplo_major', how='outer')
    .sort_values(by=['haplo_major'])
)
print(nabec_haplo_major_freqs.shape)
print(nabec_haplo_major_freqs.head())
nabec_haplo_major_freqs = nabec_haplo_major_freqs[[
    'haplo_major',
    'snappy_haplo_major_count', 'yhaplo_haplo_major_count', 'ltrack_haplo_major_count',
    'snappy_haplo_major_percent', 'yhaplo_haplo_major_percent', 'ltrack_haplo_major_percent',
]]
nabec_haplo_major_freqs.to_csv(f"{NABEC_OUT}/haplotype_major_male_only_new.csv", index=None)

### NEUROX

In [None]:
# number of distinct full haplogroups per caller, not counting "no match"
print(len(set(neurox_snappy_haplo_freqs['haplo']) - {"no match"}))
print(len(set(neurox_yhaplo_haplo_freqs['haplo']) - {"no match"}))
print(len(set(neurox_ltrack_haplo_freqs['haplo']) - {"no match"}))

# Outer-join the three callers' full-haplogroup tables on the haplogroup label,
# so a haplogroup reported by only some tools is still listed.
neurox_haplo_freqs = (
    neurox_snappy_haplo_freqs
    .merge(neurox_yhaplo_haplo_freqs, on='haplo', how='outer')
    .merge(neurox_ltrack_haplo_freqs, on='haplo', how='outer')
    .sort_values(by=['haplo'])
)

# label first, then the three counts, then the three percentages
neurox_haplo_freqs = neurox_haplo_freqs[[
    'haplo',
    'snappy_haplo_count', 'yhaplo_haplo_count', 'ltrack_haplo_count',
    'snappy_haplo_percent', 'yhaplo_haplo_percent', 'ltrack_haplo_percent',
]]
print(neurox_haplo_freqs.shape)
print(neurox_haplo_freqs.head())
neurox_haplo_freqs.to_csv(f"{NEUROX_OUT}/haplotype_full_male_only_new.csv", index=None)

# number of distinct major haplogroups per caller, not counting "no match"
print(len(set(neurox_snappy_haplo_major_freqs['haplo_major']) - {"no match"}))
print(len(set(neurox_yhaplo_haplo_major_freqs['haplo_major']) - {"no match"}))
print(len(set(neurox_ltrack_haplo_major_freqs['haplo_major']) - {"no match"}))

# same join for the major-haplogroup tables
neurox_haplo_major_freqs = (
    neurox_snappy_haplo_major_freqs
    .merge(neurox_yhaplo_haplo_major_freqs, on='haplo_major', how='outer')
    .merge(neurox_ltrack_haplo_major_freqs, on='haplo_major', how='outer')
    .sort_values(by=['haplo_major'])
)
print(neurox_haplo_major_freqs.shape)
print(neurox_haplo_major_freqs.head())
neurox_haplo_major_freqs = neurox_haplo_major_freqs[[
    'haplo_major',
    'snappy_haplo_major_count', 'yhaplo_haplo_major_count', 'ltrack_haplo_major_count',
    'snappy_haplo_major_percent', 'yhaplo_haplo_major_percent', 'ltrack_haplo_major_percent',
]]
neurox_haplo_major_freqs.to_csv(f"{NEUROX_OUT}/haplotype_major_male_only_new.csv", index=None)

## 6) Plot

In [None]:
def make_plot_df(df):
    """Reshape a combined major-haplogroup table into long form for plotting.

    Args:
        df: Frame with a ``haplo_major`` column and the three count columns
            ``snappy_haplo_major_count``, ``yhaplo_haplo_major_count`` and
            ``ltrack_haplo_major_count``. It is not modified.

    Returns:
        Frame with columns ``haplo_major``, ``count`` and
        ``haplogroup caller``: the rows of ``df`` repeated once per caller
        (Snappy, then Yhaplo, then Y-LineageTracker), keeping ``df``'s index
        labels, with the "no match" label shown as "None".
    """
    callers = (
        ('snappy_haplo_major_count', 'Snappy'),
        ('yhaplo_haplo_major_count', 'Yhaplo'),
        ('ltrack_haplo_major_count', 'Y-LineageTracker'),
    )

    # one independent frame per caller; built with rename/assign so nothing is
    # written onto a slice of `df`
    per_caller = [
        df[['haplo_major', count_col]]
        .rename(columns={count_col: 'count'})
        .assign(**{'haplogroup caller': caller})
        for count_col, caller in callers
    ]

    # DataFrame.append was removed in pandas 2.0; concat stacks the frames the
    # same way and keeps their original index labels
    plot_df = pd.concat(per_caller)

    # positional mask: the stacked frame repeats index labels
    is_unmatched = (plot_df['haplo_major'] == "no match").to_numpy()
    plot_df.loc[is_unmatched, 'haplo_major'] = "None"

    print(plot_df.head())

    return plot_df

In [None]:
# long-form (one row per haplogroup and caller) UKBB counts for the bar plot
ukbb_plot_df = make_plot_df(df=ukbb_haplo_major_freqs)

In [None]:
# long-form (one row per haplogroup and caller) AMP-PD counts for the bar plot
amp_plot_df = make_plot_df(df=amp_haplo_major_freqs)

In [None]:
# long-form (one row per haplogroup and caller) NABEC counts
nabec_plot_df = make_plot_df(df=nabec_haplo_major_freqs)

In [None]:
# long-form (one row per haplogroup and caller) NeuroX counts for the bar plot
neurox_plot_df = make_plot_df(df=neurox_haplo_major_freqs)

In [None]:

# One figure with a 2x2 grid; three panels are used (AMP-PD, UKBiobank, NeuroX),
# each a grouped bar chart of sample counts per major haplogroup and caller.
fig = plt.figure(figsize=(14, 8), dpi=80)
fig.subplots_adjust(hspace=0.5, wspace=0.2)

# apply the seaborn theme before any axes are created
sns.set()

# AMP-PD panel (grid position 1); its legend is dropped
ax1 = fig.add_subplot(2, 2, 1)
sns_plot = sns.barplot(x="haplo_major", y="count", hue="haplogroup caller", data=amp_plot_df, ax=ax1)
sns_plot.get_legend().remove()
ax1.set_xlabel("Major Haplogroup")
ax1.set_ylabel("Sample Count")
ax1.set_title("AMP-PD Sample Counts")

# UKBiobank panel (grid position 2); its legend is dropped
ax2 = fig.add_subplot(2, 2, 2)
sns_plot = sns.barplot(x="haplo_major", y="count", hue="haplogroup caller", data=ukbb_plot_df, ax=ax2)
sns_plot.get_legend().remove()
ax2.set_xlabel("Major Haplogroup")
ax2.set_ylabel("Sample Count")
ax2.set_title("UKBiobank Sample Counts")

# The NABEC panel (nabec_plot_df, title "NABEC Sample Counts") is disabled;
# NeuroX takes grid position 3 in its place.

# NeuroX panel (grid position 3); the only panel that keeps a legend, which is
# anchored to the right of the axes
ax4 = fig.add_subplot(2, 2, 3)
sns_plot = sns.barplot(x="haplo_major", y="count", hue="haplogroup caller", data=neurox_plot_df, ax=ax4)
ax4.legend(bbox_to_anchor=(1.1, 1), borderaxespad=0)
ax4.set_xlabel("Major Haplogroup")
ax4.set_ylabel("Sample Count")
ax4.set_title("NeuroX Sample Counts")

fig.suptitle("Sample Major Haplogroup Counts")
plt.show()
# sns_plot is the NeuroX axes; its figure is the whole three-panel figure
sns_plot.get_figure().savefig(f"{WRKDIR}/major_haplogroup_count_plot.png", bbox_inches='tight')