# Check Chr Y Full Haplogroups in UKBioBank PD Cases and Controls and PD Proxies and Controls
- **Author(s)** - Frank Grenn
- **Date Started** - June 2021
- **Quick Description:** Chi-squared tests and logistic regression for full Y-chromosome haplogroups with UKBB data.

In [None]:
import pandas as pd
import random
import scipy.stats as ss
import statsmodels.api as sm

In [None]:
# Root working directory for the chrY analysis.
# NOTE(review): "$PATH" looks like a placeholder prefix; Python does not expand it - confirm.
WRKDIR = "$PATH/chrY"
# Directory holding the UKBB chrY .fam file read below.
BFILEDIR = f"{WRKDIR}/y_ukbb"
# Directory holding the haplogroup-caller outputs; the result CSVs are written here too.
OUTDIR = f"{WRKDIR}/output_ukbb"
# Root directory for the PCA and covariate files read below.
CARDDIR = "$PATH/CARD"
# Minimum number of samples (controls + cases/proxies) a haplogroup needs in order to be tested.
cutoff = 50

## 1. Get Data and Subset Cases, Controls and Proxies

In [None]:
# Read the .fam file (whitespace-delimited, no header row) and name its six columns.
# The separator is a raw string: "\s" is not a valid escape in a normal string literal.
fam = pd.read_csv(f"{BFILEDIR}/chrY_eur_male_only.fam", sep=r"\s+", header=None)
fam.columns = ['fid','iid','pid','mid','sex','pheno']
print(fam.shape)
print(fam.head())

In [None]:
# Tally how many samples carry each phenotype code in the .fam file.
fam.pheno.value_counts()

In [None]:
# Load the principal components computed for the case/control sample set.
# The separator is a raw string: "\s" is not a valid escape in a normal string literal.
auto_cc_pcs = pd.read_csv(f"{CARDDIR}/projects/chromosome_y_expression/ukbb/pcs_case_control_pca.txt", sep=r"\s+")
print(auto_cc_pcs.shape)
print(auto_cc_pcs.head())

In [None]:
# Load the principal components computed for the proxy/control sample set.
# The separator is a raw string: "\s" is not a valid escape in a normal string literal.
auto_pc_pcs = pd.read_csv(f"{CARDDIR}/projects/chromosome_y_expression/ukbb/pcs_proxy_control_pca.txt", sep=r"\s+")
print(auto_pc_pcs.shape)
print(auto_pc_pcs.head())

In [None]:
# Load the tab-delimited covariate table; AGE_OF_RECRUIT is taken from it later.
meta = pd.read_csv(
    f"{CARDDIR}/UKBIOBANK/PHENOTYPE_DATA/covariates_phenome_to_use.txt",
    sep="\t",
)
print(meta.shape, meta.head(), sep="\n")

In [None]:
# Case/control table: .fam phenotypes joined to the case/control PCs on (fid, iid),
# then to the recruitment age on fid.
merge1 = fam[['fid','iid','sex','pheno']].merge(
    auto_cc_pcs,
    left_on=['fid','iid'],
    right_on=['FID','IID'],
)
print(merge1.shape)
merge2 = merge1.merge(
    meta[['FID','AGE_OF_RECRUIT']],
    left_on=['fid'],
    right_on=['FID'],
)
print(merge2.shape)
case_control_df = merge2.copy()

In [None]:
# Proxy/control table: .fam phenotypes joined to the proxy/control PCs on (fid, iid),
# then to the recruitment age on fid.
merge1 = fam[['fid','iid','sex','pheno']].merge(
    auto_pc_pcs,
    left_on=['fid','iid'],
    right_on=['FID','IID'],
)
print(merge1.shape)
merge2 = merge1.merge(
    meta[['FID','AGE_OF_RECRUIT']],
    left_on=['fid'],
    right_on=['FID'],
)
print(merge2.shape)
proxy_control_df = merge2.copy()

In [None]:
# Phenotype-code counts among the samples kept in the case/control table.
case_control_df.pheno.value_counts()

In [None]:
# Phenotype-code counts among the samples kept in the proxy/control table.
proxy_control_df.pheno.value_counts()

In [None]:
",".join(case_control_df.columns)

## Yhaplo Data

In [None]:
# Load the yhaplo calls (whitespace-delimited, no header row).
# The separator is a raw string: "\s" is not a valid escape in a normal string literal.
yhaplo = pd.read_csv(f"{OUTDIR}/yhaplo_output/haplogroups.chrY_male_only.txt", sep=r"\s+", header=None)
yhaplo.columns = ['id','haplo_short','haplo_short_rep_snp','haplo_long']
# Major haplogroup = first character of the long haplogroup label.
yhaplo['haplo_major'] = yhaplo['haplo_long'].str[0]
# Keep the first half of each id string, then cast to int so it can be merged against `fid`.
# NOTE(review): presumably ids look like "<FID>_<IID>" with identical halves - confirm.
yhaplo['id'] = [i[:len(i)//2] for i in yhaplo.id]
yhaplo['id'] = yhaplo['id'].astype('int64')
#assume samples with "A" haplogroup were not assigned one.
#yhaplo = yhaplo[yhaplo['haplo_long']!='A']
print(yhaplo.shape)
print(yhaplo.head())

## Snappy Data

In [None]:
# Load the snappy calls (tab-delimited, no header row) and name the four columns.
snappy = pd.read_csv(
    f"{OUTDIR}/chrY_hgs_snappy.out",
    sep="\t",
    header=None,
)
snappy.columns = ['id','haplo','haplo_score','info_alleles']

# Some samples (e.g. "PD-PDNZ095VCJ") carry extra text after the haplogroup, such as
# "B2a1a M109,M152/Page60,P32,P50"; keep only the leading token ("B2a1a").
snappy['haplo'] = snappy['haplo'].str.split(" ").str.get(0)

print(snappy.shape, snappy.head(), sep="\n")

In [None]:
# Major haplogroup = first character of the snappy haplogroup label.
snappy['haplo_major'] = snappy['haplo'].str.get(0)
print(snappy.shape, snappy.head(), sep="\n")

## Y-LineageTracker Data

In [None]:
#ltrack = pd.read_table(f"{AMPPD_OUT}/output_ltracker/ltrack_out.hapresult.hg",sep="\s+")
# Load the Y-LineageTracker lineage results (read_table's default delimiter).
ltrack = pd.read_table(f"{OUTDIR}/ltrack_ukbb_hg19.lineageresult.txt")
print(ltrack.shape)
print(ltrack.head())


# Major haplogroup = first character of the full haplogroup label.
ltrack['haplo_major'] = ltrack['Haplogroup'].str[0]#ltrack_male['Haplogroup'].str[0]
# Positional rename: requires the file to have exactly five columns, followed by the
# haplo_major column added above, in this order.
ltrack.columns = ['id','haplo','keyhaplo','mutations','lineagetrack','haplo_major']

#ltrack['id'] = ltrack['id'].str.split('_').str[0].astype(int)

# Keep the first half of each id string, then cast to int so it can be merged against `fid`.
# NOTE(review): presumably ids look like "<FID>_<IID>" with identical halves - confirm.
ltrack['id'] = [i[:len(i)//2] for i in ltrack.id]
ltrack['id'] = ltrack['id'].astype('int64')
print(ltrack.shape)
print(ltrack.head())

## 2. Set up some stats functions

In [None]:
#chi squared test for a specific haplogroup
def chi_square_for_haplogroup(haplo,haplo_col,df,prnt):
    """Chi-squared test of one haplogroup versus all others against phenotype.

    Parameters
    ----------
    haplo : str
        Haplogroup label to test.
    haplo_col : str
        Name of the column in ``df`` that holds the haplogroup labels.
    df : pandas.DataFrame
        Must contain ``haplo_col`` and a ``pheno`` column. It is not modified.
    prnt : bool
        When true, print the contingency table and the four test outputs.

    Returns
    -------
    tuple
        ``(statistic, p_value, dof, expected)`` as produced by
        ``scipy.stats.chi2_contingency`` on the 2-group contingency table.
    """
    other_label = 'not ' + haplo
    # Collapse every label other than `haplo` into a single "not <haplo>" group.
    collapsed = df[haplo_col].where(df[haplo_col] == haplo, other_label)

    table = pd.crosstab(collapsed, df['pheno'], margins=False)
    statistic, p_value, dof, expected = ss.chi2_contingency(table)

    if prnt:
        for item in (table, statistic, p_value, dof, expected):
            print(item)

    return statistic, p_value, dof, expected

In [None]:
#logistic regression for specific full haplogroup
def log_reg_for_haplogroup(haplo,df,prnt):
    """Fit a binomial GLM (logistic regression) of phenotype on one haplogroup indicator.

    The model has an intercept plus the terms
    ``haplo + AGE_OF_RECRUIT + PC1 + PC2 + PC3 + PC4 + PC5``.

    Parameters
    ----------
    haplo : str
        Name of the indicator column in ``df`` for the haplogroup being tested.
    df : pandas.DataFrame
        Must contain ``pheno``, the ``haplo`` column, ``AGE_OF_RECRUIT`` and
        ``PC1``..``PC5``. The callers in this file pass ``pheno`` after
        subtracting 1 from it.
    prnt : bool
        When true, print the fitted model summary.

    Returns
    -------
    tuple
        ``(p_value, coefficient, standard_error)`` for the ``haplo`` term.
    """
    predictors = [haplo, 'AGE_OF_RECRUIT', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']

    # Build the design matrix directly instead of interpolating the column name into a
    # formula string: names with formula operators (e.g. "~", "-") would not parse, and a
    # boolean indicator column would be treated as categorical and renamed, which breaks
    # the lookups by `haplo` below. Casting to float keeps the indicator numeric.
    exog = sm.add_constant(df[predictors].astype(float), has_constant='add')

    # missing='drop' discards rows with missing values, as the formula interface does.
    model = sm.GLM(df['pheno'], exog, family = sm.families.Binomial(), missing = 'drop')
    results = model.fit()
    if prnt:
        print(results.summary())

    return results.pvalues[haplo], results.params[haplo], results.bse[haplo]

In [None]:
#function to count all haplogroups, do chi squared and logistic regression for each, and return a dataframe
def run_chisq_and_log_reg(df,haplo_col_str, count_cutoff, prnt):
    """Run per-haplogroup chi-squared and logistic-regression tests for cases vs controls.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample, with ``haplo_col_str``, ``pheno`` and the covariates
        required by ``log_reg_for_haplogroup``. ``pheno`` must take exactly two
        values: the lower one is labelled control and the higher one case, and 1
        is subtracted before the regression (presumably 1/2 coding - confirm).
    haplo_col_str : str
        Name of the column holding the haplogroup labels.
    count_cutoff : int
        Minimum number of samples (controls + cases) a haplogroup needs in order
        to be tested. 0 keeps every haplogroup.
    prnt : bool
        When true, print the intermediate tables.

    Returns
    -------
    pandas.DataFrame
        One row per tested haplogroup with columns ``haplo_col_str``,
        ``controls``, ``control_freq``, ``cases``, ``case_freq``, ``p_chisq``,
        ``p_logreg``, ``beta_logreg`` and ``se_logreg``. Frequencies are relative
        to all controls / all cases in ``df``, not just the tested haplogroups.
    """
    data_crosstab = pd.crosstab(df[haplo_col_str], df['pheno'], margins = False)
    # Requires exactly two phenotype codes; crosstab sorts them, so the lower code is first.
    data_crosstab.columns = ['control','case']

    # Filter on the argument (the notebook-level `cutoff` is deliberately not consulted).
    # A cutoff of 0 keeps every haplogroup, since all totals are >= 0.
    totals = data_crosstab['control'] + data_crosstab['case']
    data_crosstab_filter = data_crosstab[totals >= count_cutoff].copy()

    haplos = data_crosstab_filter.index.tolist()

    #chi squared
    chisq_results = data_crosstab_filter.copy()
    # NaN placeholder so an unfilled row can never be mistaken for a real p-value.
    chisq_results['p_chisq'] = float('nan')
    for h in haplos:
        _, p, _, _ = chi_square_for_haplogroup(h,haplo_col_str,df,False)
        chisq_results.at[h,'p_chisq'] = p
    chisq_results.columns = ['controls','cases','p_chisq']
    chisq_results = chisq_results.reset_index()

    #logistic regression
    df_ohe = df.copy()
    df_ohe[haplo_col_str+'_orig'] = df_ohe[haplo_col_str]
    # One indicator column per haplogroup, named "<haplo_col_str>_<label>". An integer dtype
    # is requested explicitly because newer pandas versions default to bool indicators.
    df_ohe = pd.get_dummies(df_ohe, columns = [haplo_col_str], dtype = int)
    # Shift the phenotype codes down by one for the binomial model.
    df_ohe['pheno'] = df_ohe['pheno'] - 1

    logreg_results = data_crosstab_filter.copy()
    # Create all three result columns up front so the rename below also works when no
    # haplogroup passes the cutoff.
    for col in ('p_logreg','beta_logreg','se_logreg'):
        logreg_results[col] = float('nan')
    for h in haplos:
        p, beta, se = log_reg_for_haplogroup(f'{haplo_col_str}_{h}',df_ohe,False)
        logreg_results.at[h,'p_logreg'] = p
        logreg_results.at[h,'beta_logreg'] = beta
        logreg_results.at[h,'se_logreg'] = se
    logreg_results.columns = ['controls','cases','p_logreg','beta_logreg','se_logreg']
    logreg_results = logreg_results.reset_index()

    merge_results = pd.merge(left = logreg_results, right = chisq_results, on = [haplo_col_str,'controls','cases'])

    merge_results['case_freq'] = merge_results['cases'] / data_crosstab['case'].sum()
    merge_results['control_freq'] = merge_results['controls'] / data_crosstab['control'].sum()

    merge_results = merge_results[[haplo_col_str,'controls','control_freq','cases','case_freq','p_chisq','p_logreg','beta_logreg','se_logreg']]

    if prnt:
        print(data_crosstab)
        print(data_crosstab_filter.shape)
        print(data_crosstab_filter)
        print(len(haplos))
        print(chisq_results)
        print(df_ohe.shape)
        print(df_ohe.columns)
        print(logreg_results)
        print(merge_results.shape)

    return merge_results

In [None]:
#function to count all haplogroups, do chi squared and logistic regression for each (proxies vs controls), and return a dataframe
def run_chisq_and_log_reg_proxy_control(df,haplo_col_str, count_cutoff, prnt):
    """Run per-haplogroup chi-squared and logistic-regression tests for proxies vs controls.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample, with ``haplo_col_str``, ``pheno`` and the covariates
        required by ``log_reg_for_haplogroup``. ``pheno`` must take exactly two
        values: the lower one is labelled control and the higher one proxy, and 1
        is subtracted before the regression (presumably 1/2 coding - confirm).
    haplo_col_str : str
        Name of the column holding the haplogroup labels.
    count_cutoff : int
        Minimum number of samples (controls + proxies) a haplogroup needs in
        order to be tested. 0 keeps every haplogroup.
    prnt : bool
        When true, print the intermediate tables.

    Returns
    -------
    pandas.DataFrame
        One row per tested haplogroup with columns ``haplo_col_str``,
        ``controls``, ``control_freq``, ``proxies``, ``proxy_freq``, ``p_chisq``,
        ``p_logreg``, ``beta_logreg`` and ``se_logreg``. Frequencies are relative
        to all controls / all proxies in ``df``, not just the tested haplogroups.
    """
    data_crosstab = pd.crosstab(df[haplo_col_str], df['pheno'], margins = False)
    # Requires exactly two phenotype codes; crosstab sorts them, so the lower code is first.
    data_crosstab.columns = ['control','proxy']

    # Filter on the argument (the notebook-level `cutoff` is deliberately not consulted).
    # A cutoff of 0 keeps every haplogroup, since all totals are >= 0.
    totals = data_crosstab['control'] + data_crosstab['proxy']
    data_crosstab_filter = data_crosstab[totals >= count_cutoff].copy()

    haplos = data_crosstab_filter.index.tolist()

    #chi squared
    chisq_results = data_crosstab_filter.copy()
    # NaN placeholder so an unfilled row can never be mistaken for a real p-value.
    chisq_results['p_chisq'] = float('nan')
    for h in haplos:
        _, p, _, _ = chi_square_for_haplogroup(h,haplo_col_str,df,False)
        chisq_results.at[h,'p_chisq'] = p
    chisq_results.columns = ['controls','proxies','p_chisq']
    chisq_results = chisq_results.reset_index()

    #logistic regression
    df_ohe = df.copy()
    df_ohe[haplo_col_str+'_orig'] = df_ohe[haplo_col_str]
    # One indicator column per haplogroup, named "<haplo_col_str>_<label>". An integer dtype
    # is requested explicitly because newer pandas versions default to bool indicators.
    df_ohe = pd.get_dummies(df_ohe, columns = [haplo_col_str], dtype = int)
    # Shift the phenotype codes down by one for the binomial model.
    df_ohe['pheno'] = df_ohe['pheno'] - 1

    logreg_results = data_crosstab_filter.copy()
    # Create all three result columns up front so the rename below also works when no
    # haplogroup passes the cutoff.
    for col in ('p_logreg','beta_logreg','se_logreg'):
        logreg_results[col] = float('nan')
    for h in haplos:
        p, beta, se = log_reg_for_haplogroup(f'{haplo_col_str}_{h}',df_ohe,False)
        logreg_results.at[h,'p_logreg'] = p
        logreg_results.at[h,'beta_logreg'] = beta
        logreg_results.at[h,'se_logreg'] = se
    logreg_results.columns = ['controls','proxies','p_logreg','beta_logreg','se_logreg']
    logreg_results = logreg_results.reset_index()

    merge_results = pd.merge(left = logreg_results, right = chisq_results, on = [haplo_col_str,'controls','proxies'])

    merge_results['proxy_freq'] = merge_results['proxies'] / data_crosstab['proxy'].sum()
    merge_results['control_freq'] = merge_results['controls'] / data_crosstab['control'].sum()

    merge_results = merge_results[[haplo_col_str,'controls','control_freq','proxies','proxy_freq','p_chisq','p_logreg','beta_logreg','se_logreg']]

    if prnt:
        print(data_crosstab)
        print(data_crosstab_filter.shape)
        print(data_crosstab_filter)
        print(len(haplos))
        print(chisq_results)
        print(df_ohe.shape)
        print(df_ohe.columns)
        print(logreg_results)
        print(merge_results.shape)

    return merge_results

## 3. Case Control Tests

In [None]:
# yhaplo: attach the haplogroup calls to the case/control samples (`id` matched to `fid`).
yhaplo_cc = yhaplo.merge(case_control_df, left_on='id', right_on='fid')
print(yhaplo_cc.shape, yhaplo_cc.head(), sep="\n")

In [None]:
# Per-haplogroup chi-squared and logistic-regression tests on the yhaplo long haplogroup labels.
yhaplo_cc_cutoff_results = run_chisq_and_log_reg(yhaplo_cc,'haplo_long', cutoff, False)
yhaplo_cc_cutoff_results

In [None]:
# snappy: attach the haplogroup calls to the case/control samples (`id` matched to `fid`).
snappy_cc = snappy.merge(case_control_df, left_on='id', right_on='fid')
print(snappy_cc.shape, snappy_cc.head(), sep="\n")

In [None]:
# Create a copy of the haplogroup column with "~" replaced by "_": the label becomes part
# of a column name used in a model formula, where "~" is the formula separator.
snappy_cc['haplo_no_tilde'] = snappy_cc['haplo'].str.replace('~','_')

In [None]:
# Per-haplogroup tests on the tilde-free snappy labels.
snappy_cc_cutoff_results = run_chisq_and_log_reg(snappy_cc,'haplo_no_tilde', cutoff, False)
# Restore the "~" spelling for reporting.
# NOTE(review): this maps every "_" back to "~", so a label that originally contained "_"
# would not round-trip - confirm no such labels exist.
snappy_cc_cutoff_results['haplo'] = snappy_cc_cutoff_results['haplo_no_tilde'].str.replace('_','~')
snappy_cc_cutoff_results 

In [None]:
# Y-LineageTracker: attach the haplogroup calls to the case/control samples (`id` matched to `fid`).
ltrack_cc = ltrack.merge(case_control_df, left_on='id', right_on='fid')
print(ltrack_cc.shape, ltrack_cc.head(), sep="\n")

In [None]:
# Create a copy of the haplogroup column with "~" replaced by "_": the label becomes part
# of a column name used in a model formula, where "~" is the formula separator.
ltrack_cc['haplo_no_tilde'] = ltrack_cc['haplo'].str.replace('~','_')

In [None]:
# Per-haplogroup tests on the tilde-free Y-LineageTracker labels.
ltrack_cc_cutoff_results = run_chisq_and_log_reg(ltrack_cc,'haplo_no_tilde', cutoff, False)
# Restore the "~" spelling for reporting.
# NOTE(review): this maps every "_" back to "~", so a label that originally contained "_"
# would not round-trip - confirm no such labels exist.
ltrack_cc_cutoff_results['haplo'] = ltrack_cc_cutoff_results['haplo_no_tilde'].str.replace('_','~')
ltrack_cc_cutoff_results

## Combine and Compare Tool Results

In [None]:
# Prefix every column with the tool name so the three result tables can sit side by side.
# Re-running this cell prefixes the columns again.
snappy_cc_cutoff_results.columns = [
    f"snappy_{col}" for col in snappy_cc_cutoff_results.columns
]
print(snappy_cc_cutoff_results.shape, snappy_cc_cutoff_results.head(), sep="\n")

In [None]:

# Prefix every column with the tool name so the three result tables can sit side by side.
# Re-running this cell prefixes the columns again.
yhaplo_cc_cutoff_results.columns = [
    f"yhaplo_{col}" for col in yhaplo_cc_cutoff_results.columns
]
print(yhaplo_cc_cutoff_results.shape, yhaplo_cc_cutoff_results.head(), sep="\n")

In [None]:
# Prefix every column with the tool name so the three result tables can sit side by side.
# Re-running this cell prefixes the columns again.
ltrack_cc_cutoff_results.columns = [
    f"ltrack_{col}" for col in ltrack_cc_cutoff_results.columns
]
print(ltrack_cc_cutoff_results.shape, ltrack_cc_cutoff_results.head(), sep="\n")

In [None]:
# Union of the haplogroup labels tested by any of the three tools (order is arbitrary).
res_cc_haplos = list(
    set(snappy_cc_cutoff_results.snappy_haplo.tolist()).union(
        yhaplo_cc_cutoff_results.yhaplo_haplo_long.tolist(),
        ltrack_cc_cutoff_results.ltrack_haplo.tolist(),
    )
)
print(res_cc_haplos)

In [None]:
# Number of distinct haplogroup labels across the three tools' results.
len(set(res_cc_haplos))

In [None]:
# Seed the comparison table with one row per haplogroup label.
merge_cc = pd.DataFrame(data={'haplo':res_cc_haplos})

In [None]:
# Line up each tool's results against the shared haplogroup label; a tool that did not
# test a given label contributes missing values for its columns.
merge_cc = (
    merge_cc
    .merge(yhaplo_cc_cutoff_results, how='outer', left_on='haplo', right_on='yhaplo_haplo_long')
    .merge(snappy_cc_cutoff_results, how='outer', left_on='haplo', right_on='snappy_haplo')
    .merge(ltrack_cc_cutoff_results, how='outer', left_on='haplo', right_on='ltrack_haplo')
)
print(merge_cc.shape, merge_cc.head(), sep="\n")

In [None]:
# List the columns of the combined case/control comparison table.
merge_cc.columns

In [None]:
# Rows where at least one tool's logistic-regression p-value is below 0.05
# (missing p-values never satisfy the comparison).
merge_cc[(merge_cc[['snappy_p_logreg', 'ltrack_p_logreg', 'yhaplo_p_logreg']] < 0.05).any(axis=1)]

In [None]:
# Save the combined case/control results without the row index. The file name is built
# from `cutoff` so it always reflects the threshold used (identical path when cutoff is 50).
merge_cc.to_csv(f"{OUTDIR}/haplotype_full_pd_case_control_cutoff_{cutoff}_new.csv", index=False)

## 4. Proxy Control Tests

In [None]:
# Size of the proxy/control table before attaching haplogroup calls.
proxy_control_df.shape

In [None]:
# yhaplo: attach the haplogroup calls to the proxy/control samples (`id` matched to `fid`).
yhaplo_pc = yhaplo.merge(proxy_control_df, left_on='id', right_on='fid')
print(yhaplo_pc.shape, yhaplo_pc.head(), sep="\n")

In [None]:
# Per-haplogroup chi-squared and logistic-regression tests (proxies vs controls) on the
# yhaplo long haplogroup labels.
yhaplo_pc_cutoff_results = run_chisq_and_log_reg_proxy_control(yhaplo_pc,'haplo_long', cutoff, False)
yhaplo_pc_cutoff_results

In [None]:
# snappy: attach the haplogroup calls to the proxy/control samples (`id` matched to `fid`).
snappy_pc = snappy.merge(proxy_control_df, left_on='id', right_on='fid')
print(snappy_pc.shape, snappy_pc.head(), sep="\n")

In [None]:
# Create a copy of the haplogroup column with "~" replaced by "_": the label becomes part
# of a column name used in a model formula, where "~" is the formula separator.
snappy_pc['haplo_no_tilde'] = snappy_pc['haplo'].str.replace('~','_')

In [None]:
# Per-haplogroup tests (proxies vs controls) on the tilde-free snappy labels.
snappy_pc_cutoff_results = run_chisq_and_log_reg_proxy_control(snappy_pc,'haplo_no_tilde', cutoff, False)
# Restore the "~" spelling for reporting.
# NOTE(review): this maps every "_" back to "~", so a label that originally contained "_"
# would not round-trip - confirm no such labels exist.
snappy_pc_cutoff_results['haplo'] = snappy_pc_cutoff_results['haplo_no_tilde'].str.replace('_','~')
snappy_pc_cutoff_results 

In [None]:
# Y-LineageTracker: attach the haplogroup calls to the proxy/control samples (`id` matched to `fid`).
ltrack_pc = ltrack.merge(proxy_control_df, left_on='id', right_on='fid')
print(ltrack_pc.shape, ltrack_pc.head(), sep="\n")

In [None]:
# Create a copy of the haplogroup column with "~" replaced by "_": the label becomes part
# of a column name used in a model formula, where "~" is the formula separator.
ltrack_pc['haplo_no_tilde'] = ltrack_pc['haplo'].str.replace('~','_')

In [None]:
# Per-haplogroup tests (proxies vs controls) on the tilde-free Y-LineageTracker labels.
ltrack_pc_cutoff_results = run_chisq_and_log_reg_proxy_control(ltrack_pc,'haplo_no_tilde', cutoff, False)
# Restore the "~" spelling for reporting.
# NOTE(review): this maps every "_" back to "~", so a label that originally contained "_"
# would not round-trip - confirm no such labels exist.
ltrack_pc_cutoff_results['haplo'] = ltrack_pc_cutoff_results['haplo_no_tilde'].str.replace('_','~')
ltrack_pc_cutoff_results

In [None]:
# Prefix every column with the tool name so the three result tables can sit side by side.
# Re-running this cell prefixes the columns again.
snappy_pc_cutoff_results.columns = [
    f"snappy_{col}" for col in snappy_pc_cutoff_results.columns
]
print(snappy_pc_cutoff_results.shape, snappy_pc_cutoff_results.head(), sep="\n")

In [None]:

# Prefix every column with the tool name so the three result tables can sit side by side.
# Re-running this cell prefixes the columns again.
yhaplo_pc_cutoff_results.columns = [
    f"yhaplo_{col}" for col in yhaplo_pc_cutoff_results.columns
]
print(yhaplo_pc_cutoff_results.shape, yhaplo_pc_cutoff_results.head(), sep="\n")

In [None]:
# Prefix every column with the tool name so the three result tables can sit side by side.
# Re-running this cell prefixes the columns again.
ltrack_pc_cutoff_results.columns = [
    f"ltrack_{col}" for col in ltrack_pc_cutoff_results.columns
]
print(ltrack_pc_cutoff_results.shape, ltrack_pc_cutoff_results.head(), sep="\n")

In [None]:
# Union of the haplogroup labels tested by any of the three tools (order is arbitrary).
res_pc_haplos = list(
    set(snappy_pc_cutoff_results.snappy_haplo.tolist()).union(
        yhaplo_pc_cutoff_results.yhaplo_haplo_long.tolist(),
        ltrack_pc_cutoff_results.ltrack_haplo.tolist(),
    )
)
print(res_pc_haplos)

In [None]:
# Number of distinct haplogroup labels across the three tools' results.
len(set(res_pc_haplos))

In [None]:
# Seed the comparison table with one row per haplogroup label.
merge_pc = pd.DataFrame(data={'haplo':res_pc_haplos})

In [None]:
# Line up each tool's results against the shared haplogroup label; a tool that did not
# test a given label contributes missing values for its columns.
merge_pc = (
    merge_pc
    .merge(yhaplo_pc_cutoff_results, how='outer', left_on='haplo', right_on='yhaplo_haplo_long')
    .merge(snappy_pc_cutoff_results, how='outer', left_on='haplo', right_on='snappy_haplo')
    .merge(ltrack_pc_cutoff_results, how='outer', left_on='haplo', right_on='ltrack_haplo')
)
print(merge_pc.shape, merge_pc.head(), sep="\n")

In [None]:
# List the columns of the combined proxy/control comparison table.
merge_pc.columns

In [None]:
# Rows where at least one tool's logistic-regression p-value is below 0.05
# (missing p-values never satisfy the comparison).
merge_pc[(merge_pc[['snappy_p_logreg', 'ltrack_p_logreg', 'yhaplo_p_logreg']] < 0.05).any(axis=1)]

In [None]:
# Save the combined proxy/control results without the row index. The file name is built
# from `cutoff` so it always reflects the threshold used (identical path when cutoff is 50).
merge_pc.to_csv(f"{OUTDIR}/haplotype_full_pd_proxy_control_cutoff_{cutoff}_new.csv", index=False)