# Check Chr Y Full Haplogroups in LBD Cases and Controls
- **Author(s)** - Frank Grenn
- **Date Started** - March 2021
- **Quick Description** - Chi-squared tests and logistic regression for full Y-chromosome haplogroups with AMP-PD LBD case/control data.

In [None]:
import pandas as pd
import scipy.stats as ss
import statsmodels.api as sm

In [None]:
# Root working directory for the chrY analysis.
# NOTE(review): "$PATH" looks like a redacted placeholder -- Python does not
# expand shell variables inside string literals, so substitute the real path
# before running.
WRKDIR = "$PATH/chrY"
# Directory the .fam file and the autosomal-PC .eigenvec file are read from.
BFILEDIR = f"{WRKDIR}/y_male_only_bfiles"
# Directory the haplogroup-caller outputs are read from; the combined results
# CSV is written here as well.
OUTDIR = f"{WRKDIR}/output_male_hemizygous_only_het_filter_run"

#count cutoff for testing: a haplogroup is only tested when its
#controls + cases count is at least this value (0 disables the filter)
cutoff = 50

## 1. Get Data

In [None]:
# Load the whitespace-delimited, header-less .fam file.
fam = pd.read_csv(
    f"{BFILEDIR}/amppd_lbd_case_control_nogcs.fam",
    sep=r"\s+",  # raw string: "\s" is an invalid escape in a normal literal
    header=None,
)
fam.columns = ['fid','iid','pid','mid','sex','pheno']
#-9 is for lbd (cases) in this file, recode it to 2
fam.loc[fam.pheno==-9,'pheno']=2
print(fam.shape)
print(fam.head())

In [None]:
# Number of samples per phenotype code after the -9 -> 2 recode.
fam['pheno'].value_counts()

In [None]:
# Load the autosomal principal components (whitespace-delimited, no header):
# fid, iid, then pc1..pc20.
auto_pcs = pd.read_csv(
    f"{BFILEDIR}/amppd_lbd_case_control_autosome_pcs.eigenvec",
    sep=r"\s+",  # raw string: "\s" is an invalid escape in a normal literal
    header=None,
)
auto_pcs.columns = ['fid','iid'] + [f'pc{n}' for n in range(1,21)]
print(auto_pcs.shape)
print(auto_pcs.head())

In [None]:
# Covariate table; the ID, AGE_BASELINE and LATEST_DX columns are used later.
meta = pd.read_csv("$PATH/AMPPD_releasev2_covariates_Feb2021.csv")
print(meta.shape, meta.head(), sep="\n")

In [None]:
#merge
#merge step 1: sample info from the .fam file + autosomal PCs, matched on fid and iid
merge1 = pd.merge(fam[['fid','iid','sex','pheno']], auto_pcs, on = ['fid','iid'])
print(merge1.shape)
#merge step 2: add covariates, matching 'fid' to the covariate table's 'ID'
merge2 = pd.merge(merge1, meta[['ID','AGE_BASELINE','LATEST_DX']], left_on = 'fid', right_on = 'ID')
print(merge2.shape)
#independent copy used by every per-tool merge below
meta_merge = merge2.copy()

In [None]:
# Distribution of the latest diagnosis among the merged samples.
meta_merge['LATEST_DX'].value_counts()

## Yhaplo Data

In [None]:
#just get yhaplo data for now because first character of haplogroup for all samples is the same between the yhaplo and snappy tools
yhaplo = pd.read_csv(
    f"{OUTDIR}/output_yhaplo/haplogroups.chrY_male_hemizygous_only_het_filter_hg19_final.txt",
    sep=r"\s+",  # raw string: "\s" is an invalid escape in a normal literal
    header=None,
)
yhaplo.columns = ['id','haplo_short','haplo_short_rep_snp','haplo_long']
#major haplogroup = first character of the long haplogroup name
yhaplo['haplo_major'] = yhaplo['haplo_long'].str[0]
#keep only the first half of each id string
#NOTE(review): presumably ids are written as "<fid>_<iid>" with fid == iid -- confirm
yhaplo['id'] = [i[:len(i)//2] for i in yhaplo.id]
print(yhaplo.shape)
print(yhaplo.head())

In [None]:
# Attach phenotype, PCs and covariates to the yhaplo calls (yhaplo 'id' matched to 'fid').
yhaplo_case_control_no_gc = pd.merge(
    yhaplo[['id','haplo_long','haplo_major']],
    meta_merge,
    left_on = 'id',
    right_on = "fid",
)
print(yhaplo_case_control_no_gc.shape, yhaplo_case_control_no_gc.head(), sep="\n")

In [None]:
# Sample count per major haplogroup letter (yhaplo calls).
yhaplo_case_control_no_gc['haplo_major'].value_counts()

## Snappy Data

In [None]:
# Tab-delimited, header-less snappy output.
snappy = pd.read_csv(f"{OUTDIR}/chrY_hgs_snappy_matches.out", header=None, sep="\t")
snappy.columns = ['id','haplo','haplo_score','info_alleles']

#some samples, like "PD-PDNZ095VCJ" have extra data in the "haplo" column, like "B2a1a M109,M152/Page60,P32,P50", and we only want the "B2a1a"
snappy['haplo'] = snappy['haplo'].str.split(" ").str.get(0)
#major haplogroup = first character of the cleaned haplogroup name
snappy['haplo_major'] = snappy['haplo'].str.get(0)
print(snappy.shape, snappy.head(), sep="\n")

In [None]:
# Attach phenotype, PCs and covariates to the snappy calls (snappy 'id' matched to 'fid').
snappy_case_control_no_gc = pd.merge(
    snappy[['id','haplo','haplo_major']],
    meta_merge,
    left_on = 'id',
    right_on = "fid",
)
print(snappy_case_control_no_gc.shape, snappy_case_control_no_gc.head(), sep="\n")

In [None]:
# Sample count per major haplogroup letter (snappy calls).
snappy_case_control_no_gc['haplo_major'].value_counts()

## Y-LineageTracker Data

In [None]:
# Y-LineageTracker result table (first row is the header).
ltrack = pd.read_table(f"{OUTDIR}/output_ltracker/ltrack_hg19.lineageresult.txt")

# Take the major letter while the column is still called 'Haplogroup';
# the rename on the next line replaces every column name.
ltrack['haplo_major'] = ltrack['Haplogroup'].str.get(0)
ltrack.columns = ['id','haplo','keyhaplo','mutations','lineagetrack','haplo_major']
# Keep only the first half of each id string.
ltrack['id'] = [sample[:len(sample)//2] for sample in ltrack['id']]
print(ltrack.shape, ltrack.head(), sep="\n")

In [None]:
# Attach phenotype, PCs and covariates to the Y-LineageTracker calls ('id' matched to 'fid').
ltrack_case_control_no_gc = pd.merge(
    ltrack[['id','haplo','haplo_major']],
    meta_merge,
    left_on = 'id',
    right_on = 'fid',
)
print(ltrack_case_control_no_gc.shape, ltrack_case_control_no_gc.head(), sep="\n")

In [None]:
# Sample count per major haplogroup letter (Y-LineageTracker calls).
ltrack_case_control_no_gc['haplo_major'].value_counts()

## 2. Set up some stats functions

In [None]:
#chi squared test for a specific haplogroup (that haplogroup vs. all others)
def chi_square_for_haplogroup(haplo,haplo_col,df,prnt):
    """Chi-squared test of one haplogroup versus all others against phenotype.

    Every value of ``df[haplo_col]`` that differs from ``haplo`` is relabelled
    ``'not ' + haplo``; the relabelled values are cross-tabulated against
    ``df['pheno']`` and the table is handed to ``scipy.stats.chi2_contingency``.
    ``df`` itself is left unmodified.

    Parameters
    ----------
    haplo : str
        Haplogroup label to test.
    haplo_col : str
        Name of the column in ``df`` holding haplogroup labels.
    df : pandas.DataFrame
        Must contain ``haplo_col`` and a ``pheno`` column.
    prnt : bool
        When true, print the contingency table and each test output.

    Returns
    -------
    tuple
        ``(statistic, p_value, degrees_of_freedom, expected_frequencies)``.
    """
    labels = df[haplo_col]
    # keep matching rows as-is, collapse everything else into one "not X" group
    target_vs_rest = labels.where(labels == haplo, 'not ' + haplo)

    table = pd.crosstab(target_vs_rest, df['pheno'], margins=False)
    stat, p_value, deg_free, expected = ss.chi2_contingency(table)

    if prnt:
        for item in (table, stat, p_value, deg_free, expected):
            print(item)

    return stat, p_value, deg_free, expected

In [None]:
#logistic regression for specific full haplogroup
def log_reg_for_haplogroup(haplo,df,prnt):
    """Fit a binomial GLM (logistic regression) for one haplogroup indicator.

    The model is ``pheno ~ <haplo> + AGE_BASELINE + pc1 + pc2 + pc3 + pc4 + pc5``.

    Parameters
    ----------
    haplo : str
        Name of the indicator column in ``df``. It is interpolated straight
        into the model formula and is also used to look up the fitted term, so
        it has to be a name the formula parser accepts as a single term.
    df : pandas.DataFrame
        Must contain ``pheno``, the ``haplo`` column, ``AGE_BASELINE`` and
        ``pc1``..``pc5``.
    prnt : bool
        When true, print the full model summary.

    Returns
    -------
    tuple
        ``(p_value, coefficient, standard_error)`` of the ``haplo`` term.
    """
    formula = f"pheno ~ {haplo} + AGE_BASELINE + pc1 + pc2 + pc3 + pc4 + pc5"
    model = sm.GLM.from_formula(formula, family = sm.families.Binomial(), data = df)
    results = model.fit()
    # Only build the summary when it is actually going to be shown.
    if prnt:
        print(results.summary())

    return results.pvalues[haplo], results.params[haplo], results.bse[haplo]

In [None]:
#function to count all haplogroups, do chi squared and logistic regression for each, and return a dataframe
def run_chisq_and_log_reg(df,haplo_col_str, count_cutoff, prnt):
    """Count haplogroups and test each one with chi-squared and logistic regression.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample. Needs ``haplo_col_str`` and ``pheno`` (two
        phenotype codes; the lower-sorting one is labelled control, the other
        case), plus the covariate columns ``log_reg_for_haplogroup`` uses.
    haplo_col_str : str
        Name of the haplogroup column.
    count_cutoff : int
        A haplogroup is tested only if controls + cases >= ``count_cutoff``.
        ``0`` disables the filter.
    prnt : bool
        When true, print the intermediate tables.

    Returns
    -------
    pandas.DataFrame
        One row per tested haplogroup with columns ``haplo_col_str``,
        ``controls``, ``control_freq``, ``cases``, ``case_freq``, ``p_chisq``,
        ``p_logreg``, ``beta_logreg``, ``se_logreg``. Frequencies are relative
        to all controls / all cases in ``df``, not only the tested ones. The
        frame is empty (same columns) if no haplogroup passes the cutoff.
    """
    data_crosstab = pd.crosstab(df[haplo_col_str], df['pheno'], margins = False)
    data_crosstab.columns = ['control','case']

    # Filter on the *parameter*, not on a same-purpose global variable.
    if count_cutoff != 0:
        totals = data_crosstab['control'] + data_crosstab['case']
        data_crosstab_filter = data_crosstab[totals >= count_cutoff]
    else:
        data_crosstab_filter = data_crosstab.copy()

    # Index order of the filtered table; all per-haplogroup results follow it.
    haplos = data_crosstab_filter.index.tolist()
    counts = data_crosstab_filter.rename(columns = {'control':'controls','case':'cases'})

    #chi squared
    chisq_results = counts.copy()
    chisq_results['p_chisq'] = pd.Series(
        [chi_square_for_haplogroup(h,haplo_col_str,df,False)[1] for h in haplos],
        index = counts.index, dtype = float)
    chisq_results = chisq_results.reset_index()

    #logistic regression
    df_ohe = df.copy()
    df_ohe[haplo_col_str+'_orig'] = df_ohe[haplo_col_str]
    # dtype=int: pandas >= 2 defaults to bool indicators, which the formula
    # machinery treats as categorical and renames, breaking the term lookup.
    df_ohe = pd.get_dummies(df_ohe, columns = [haplo_col_str], dtype = int)
    # shift phenotype codes down by one for the binomial model (1/2 -> 0/1)
    df_ohe['pheno'] = df_ohe['pheno'] - 1

    fits = [log_reg_for_haplogroup(f'{haplo_col_str}_{h}',df_ohe,False) for h in haplos]
    logreg_results = counts.copy()
    # Create all three columns explicitly so they exist even when `haplos` is empty.
    for position, col in enumerate(['p_logreg','beta_logreg','se_logreg']):
        logreg_results[col] = pd.Series(
            [fit[position] for fit in fits], index = counts.index, dtype = float)
    logreg_results = logreg_results.reset_index()

    merge_results = pd.merge(left = logreg_results, right = chisq_results, on = [haplo_col_str,'controls','cases'])

    merge_results['case_freq'] = merge_results['cases'] / data_crosstab['case'].sum()
    merge_results['control_freq'] = merge_results['controls'] / data_crosstab['control'].sum()

    merge_results = merge_results[[haplo_col_str,'controls','control_freq','cases','case_freq','p_chisq','p_logreg','beta_logreg','se_logreg']]

    if prnt:
        print(data_crosstab)
        print(data_crosstab_filter.shape)
        print(data_crosstab_filter)
        print(len(haplos))
        print(chisq_results)
        print(df_ohe.shape)
        print(df_ohe.columns)
        print(logreg_results)
        print(merge_results.shape)

    return merge_results

## 3. Run Tests

#### Yhaplo

In [None]:
# Per-haplogroup tests on the yhaplo long haplogroup names.
yhaplo_cutoff_results = run_chisq_and_log_reg(
    df=yhaplo_case_control_no_gc,
    haplo_col_str='haplo_long',
    count_cutoff=cutoff,
    prnt=False,
)
yhaplo_cutoff_results

#### Snappy

In [None]:
# Per-haplogroup tests on the snappy haplogroup names.
snappy_cutoff_results = run_chisq_and_log_reg(
    df=snappy_case_control_no_gc,
    haplo_col_str='haplo',
    count_cutoff=cutoff,
    prnt=False,
)
snappy_cutoff_results

#### Y-LineageTracker

In [None]:
# Per-haplogroup tests on the Y-LineageTracker haplogroup names.
ltrack_cutoff_results = run_chisq_and_log_reg(
    df=ltrack_case_control_no_gc,
    haplo_col_str='haplo',
    count_cutoff=cutoff,
    prnt=False,
)
ltrack_cutoff_results

## Combine and Compare Tool Results

In [None]:
# Tag every snappy result column so the tools can sit side by side after merging.
# (Re-running this cell prefixes the names a second time.)
snappy_cutoff_results.columns = [f'snappy_{col}' for col in snappy_cutoff_results.columns]
print(snappy_cutoff_results.shape, snappy_cutoff_results.head(), sep="\n")

In [None]:
# Tag every yhaplo result column so the tools can sit side by side after merging.
# (Re-running this cell prefixes the names a second time.)
yhaplo_cutoff_results.columns = [f'yhaplo_{col}' for col in yhaplo_cutoff_results.columns]
print(yhaplo_cutoff_results.shape, yhaplo_cutoff_results.head(), sep="\n")

In [None]:
# Tag every Y-LineageTracker result column so the tools can sit side by side after merging.
# (Re-running this cell prefixes the names a second time.)
ltrack_cutoff_results.columns = [f'ltrack_{col}' for col in ltrack_cutoff_results.columns]
print(ltrack_cutoff_results.shape, ltrack_cutoff_results.head(), sep="\n")

In [None]:
# Union of every haplogroup tested by at least one tool. Sorted so the list
# (and the table built from it) has the same order on every run; a bare
# list(set(...)) of strings changes order between interpreter sessions.
res_haplos = sorted(
    set(snappy_cutoff_results.snappy_haplo.tolist())
    | set(yhaplo_cutoff_results.yhaplo_haplo_long.tolist())
    | set(ltrack_cutoff_results.ltrack_haplo.tolist())
)
print(res_haplos)

In [None]:
# One row per haplogroup; the per-tool result tables are joined onto this.
merge = pd.DataFrame({'haplo': res_haplos})

In [None]:
# Join each tool's results onto the haplogroup list: yhaplo, then snappy, then
# Y-LineageTracker. Outer joins keep haplogroups that a tool did not test
# (their columns for that tool are left missing).
merge = (
    merge
    .merge(yhaplo_cutoff_results, how = 'outer', left_on = 'haplo', right_on = 'yhaplo_haplo_long')
    .merge(snappy_cutoff_results, how = 'outer', left_on = 'haplo', right_on = 'snappy_haplo')
    .merge(ltrack_cutoff_results, how = 'outer', left_on = 'haplo', right_on = 'ltrack_haplo')
)
print(merge.shape, merge.head(), sep="\n")

In [None]:
# Haplogroups with logistic-regression p < 0.05 in at least one tool
# (a missing p-value compares as not-significant).
merge.loc[
    (merge['snappy_p_logreg'] < 0.05)
    | (merge['ltrack_p_logreg'] < 0.05)
    | (merge['yhaplo_p_logreg'] < 0.05)
]

In [None]:
# Save the combined table without the row index. The cutoff in the file name is
# taken from `cutoff` so the name cannot go stale when the cutoff is changed
# (with cutoff = 50 the path is the same as before).
merge.to_csv(f"{OUTDIR}/haplotype_full_lbd_case_control_no_gc_cutoff_{cutoff}_new.csv", index=False)