# Check Chr Y Major (First Character) Haplogroups in LBD Cases and Controls
- **Author(s)** - Frank Grenn
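- **Quick Description:** chi-squared tests and logistic regression for major Y-chromosome haplogroups (first character of the haplogroup name) in cases vs. controls, using AMPPD LBD data.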

In [None]:
import pandas as pd
import scipy.stats as ss
import statsmodels.api as sm

In [None]:
# Root working directory for the chrY analysis (placeholder path).
WRKDIR = "/PATH/chrY"
# Directory holding the .fam and .eigenvec input files read below.
BFILEDIR = f"{WRKDIR}/y_male_only_bfiles"
# Directory holding the yhaplo output read below; the final results CSV is written here too.
OUTDIR = f"{WRKDIR}/output_male_hemizygous_only_het_filter_run"

## 1. Get Data

In [None]:
# Load the sample table (.fam): whitespace-delimited, no header row.
fam = pd.read_csv(
    f"{BFILEDIR}/amppd_lbd_case_control_nogcs.fam",
    sep=r"\s+",  # raw string: "\s" is an invalid escape sequence in a normal literal
    header=None,
)
fam.columns = ['fid','iid','pid','mid','sex','pheno']
# Quick sanity checks: phenotype code counts, table dimensions and first rows.
print(fam.pheno.value_counts())

print(fam.shape)
print(fam.head())

In [None]:
# Phenotype code counts in the .fam table (same tally that was printed when it was loaded).
fam.pheno.value_counts()

In [None]:
# Load the autosomal principal components (.eigenvec): whitespace-delimited, no header row.
auto_pcs = pd.read_csv(
    f"{BFILEDIR}/amppd_lbd_case_control_autosome_pcs.eigenvec",
    sep=r"\s+",  # raw string: "\s" is an invalid escape sequence in a normal literal
    header=None,
)
# Two id columns followed by pc1..pc20.
auto_pcs.columns = ['fid','iid'] + [f"pc{n}" for n in range(1,21)]
print(auto_pcs.shape)
print(auto_pcs.head())

In [None]:
# Load the AMPPD covariates CSV (the first row supplies the column names) and
# show its dimensions followed by the first rows.
meta = pd.read_csv("/PATH/AMPPD_releasev2_covariates_Feb2021.csv")
print(meta.shape, meta.head(), sep="\n")

In [None]:
# Only the yhaplo calls are loaded for now: the first character of the
# haplogroup for all samples is the same between the yhaplo and snappy tools.
yhaplo = pd.read_csv(
    f"{OUTDIR}/output_yhaplo/haplogroups.chrY_male_hemizygous_only_het_filter_hg19_final.txt",
    sep=r"\s+",  # raw string: "\s" is an invalid escape sequence in a normal literal
    header=None,
)
yhaplo.columns = ['id','haplo_short','haplo_short_rep_snp','haplo_long']
# Major haplogroup = first character of the long haplogroup name.
yhaplo['haplo_major'] = yhaplo['haplo_long'].str[0]
# Keep only the first half of every id; the result is joined against `fid` later.
# NOTE(review): presumably ids look like "<fid>_<iid>" with fid == iid -- confirm.
yhaplo['id'] = [sample_id[:len(sample_id)//2] for sample_id in yhaplo.id]
print(yhaplo.shape)
print(yhaplo.head())

In [None]:
# Join the sample table, the PCs, the covariates and the haplogroup calls.
# Each join uses pandas' default (inner) behaviour, so the printed shapes show
# how many rows remain after every step.
merge1 = pd.merge(fam[['fid','iid','sex','pheno']], auto_pcs, on=['fid','iid'])
print(merge1.shape)
merge2 = pd.merge(
    merge1,
    meta[['ID','AGE_BASELINE','LATEST_DX']],
    left_on='fid',
    right_on='ID',
)
print(merge2.shape)
merge3 = pd.merge(
    merge2,
    yhaplo[['id','haplo_major','haplo_long']],
    left_on='fid',
    right_on='id',
)
print(merge3.shape)
# Work on an independent copy from here on.
yhaplo_meta_df = merge3.copy()

In [None]:
# Counts of each LATEST_DX value among the merged samples.
yhaplo_meta_df.LATEST_DX.value_counts()

In [None]:
# Counts of each major haplogroup among the merged samples.
yhaplo_meta_df.haplo_major.value_counts()

In [None]:
# Counts of each phenotype code among the merged samples.
yhaplo_meta_df.pheno.value_counts()

## 2. Chi Squared Test For Case Control

In [None]:
# Haplogroup-by-phenotype count table (no margin totals). It feeds the
# chi-squared section and is the base of the per-haplogroup result tables.
data_crosstab = pd.crosstab(
    index=yhaplo_meta_df['haplo_major'],
    columns=yhaplo_meta_df['pheno'],
    margins=False,
)
print(data_crosstab)

In [None]:
# Relabel the two phenotype columns by position.
# NOTE(review): assumes exactly two pheno codes, ordered controls first and
# cases second (presumably 1 and 2) -- confirm against the value counts above.
data_crosstab.columns = ['control','case']

In [None]:
# Display the haplogroup-by-phenotype count table.
data_crosstab

In [None]:
# Check the table is counting correctly: select the pheno == 1 rows in
# haplogroup R directly; the row count should match the corresponding
# cell of data_crosstab.
yhaplo_meta_df[(yhaplo_meta_df.pheno==1)& (yhaplo_meta_df.haplo_major=='R')].shape

#### chi squared for specific haplogroups

In [None]:
#chi squared test for a specific haplotype
def chi_square_for_haplogroup(haplo, prnt, data=None):
    """Chi-squared test of one major haplogroup versus all others, by phenotype.

    Samples are split into ``haplo`` and ``'not ' + haplo``, cross-tabulated
    against ``pheno``, and the resulting table is passed to
    ``scipy.stats.chi2_contingency``.

    Parameters
    ----------
    haplo : str
        Major haplogroup label to test.
    prnt : bool
        When true, print the observed and expected tables.
    data : pandas.DataFrame, optional
        Table with ``haplo_major`` and ``pheno`` columns. Defaults to the
        notebook-level ``yhaplo_meta_df``; the frame is never modified.

    Returns
    -------
    tuple
        ``(statistic, p, dof, expected)`` exactly as produced by
        ``scipy.stats.chi2_contingency``.
    """
    source = yhaplo_meta_df if data is None else data

    # Relabel everything that is not `haplo` without copying or mutating the
    # whole frame; only the one derived column is needed for the crosstab.
    is_haplo = source['haplo_major'] == haplo
    grouped = source['haplo_major'].where(is_haplo, 'not ' + haplo)

    contingency_table = pd.crosstab(grouped, source['pheno'], margins=False)

    g, p, dof, expctd = ss.chi2_contingency(contingency_table)
    if prnt:
        print("Observed:")
        print(contingency_table)
        print("Expected:")
        print(expctd)

    return g, p, dof, expctd

In [None]:
# Run the chi-squared test for haplogroup R, printing the observed and expected tables.
g, p, dof, expctd = chi_square_for_haplogroup('R',True)

In [None]:
# p-value from the haplogroup R chi-squared test run above.
p

In [None]:
# Rebuild the R vs. not-R contingency table by hand, with the phenotype
# columns labelled for display.
data = yhaplo_meta_df.copy()
data['haplo_major'] = data['haplo_major'].where(data['haplo_major'] == 'R', 'not '+'R')

contingency_table = pd.crosstab(
    index=data['haplo_major'],
    columns=data['pheno'],
    margins=False,
)
contingency_table.columns = ['control','case']
contingency_table

In [None]:
# p is still the value from the haplogroup R test; the manual table above does not reassign it.
p

In [None]:
# Distinct major haplogroup labels in the merged data; the loops below test each one.
haplos = set(yhaplo_meta_df['haplo_major'])
haplos

In [None]:
# Per-haplogroup chi-squared p-values appended to the count table.
case_control_chisq_results = data_crosstab.copy()
# Start from NaN rather than a plausible-looking placeholder (0.1), so a row
# that is never tested cannot be mistaken for a real p-value.
case_control_chisq_results['p_chisq'] = float('nan')
# Sorted iteration gives a reproducible progress log (set order is arbitrary);
# the table rows themselves are addressed by label, so results are unchanged.
for h in sorted(haplos):
    print(h)
    g, p, dof, expctd = chi_square_for_haplogroup(h,False)
    case_control_chisq_results.at[h,'p_chisq'] = p
# Positional rename: the two count columns followed by the p-value column.
case_control_chisq_results.columns = ['controls','cases','p_chisq']
# Move the haplogroup label out of the index so it becomes a regular column.
case_control_chisq_results = case_control_chisq_results.reset_index()
print(case_control_chisq_results)

## 3. Logistic Regression for Case Control

In [None]:
# One-hot encode the major haplogroup for the per-haplogroup regressions,
# keeping the original label in `haplo_major_orig`.
yhaplo_meta_df_ohe = yhaplo_meta_df.copy()
yhaplo_meta_df_ohe['haplo_major_orig'] = yhaplo_meta_df_ohe['haplo_major']
# dtype=int: pandas >= 2.0 emits boolean dummy columns by default, which the
# formula interface treats as categorical (term "haplo_major_X[T.True]"), so
# looking up results by the plain column name would raise a KeyError.
yhaplo_meta_df_ohe = pd.get_dummies(yhaplo_meta_df_ohe, columns = ['haplo_major'], dtype=int)
# Shift pheno down by one so it can be used as the binomial response.
# NOTE(review): assumes pheno is coded 1 = control, 2 = case -- confirm.
yhaplo_meta_df_ohe.pheno = yhaplo_meta_df_ohe.pheno - 1
print(yhaplo_meta_df_ohe.shape)

In [None]:
# List the columns after one-hot encoding; the regressions below refer to the haplo_major_<X> columns.
yhaplo_meta_df_ohe.columns

In [None]:
#logistic regression for specific major haplogroup
def log_reg_for_haplogroup(haplo, prnt, data=None):
    """Fit a binomial GLM of phenotype on one haplogroup indicator plus covariates.

    The model is ``pheno ~ <haplo> + AGE_BASELINE + pc1 + ... + pc5``.

    Parameters
    ----------
    haplo : str
        Name of the indicator column to test (e.g. ``'haplo_major_R'``). It
        is interpolated into the model formula, so it must be a valid
        formula term.
    prnt : bool
        When true, print the full model summary.
    data : pandas.DataFrame, optional
        Table containing ``pheno``, the ``haplo`` column, ``AGE_BASELINE``
        and ``pc1``..``pc5``. Defaults to the notebook-level
        ``yhaplo_meta_df_ohe``.

    Returns
    -------
    tuple
        ``(p_value, coefficient, standard_error)`` for the ``haplo`` term.
    """
    source = yhaplo_meta_df_ohe if data is None else data

    model = sm.GLM.from_formula(
        f"pheno ~ {haplo} + AGE_BASELINE + pc1 + pc2 + pc3 + pc4 + pc5",
        family=sm.families.Binomial(),
        data=source,
    )
    results = model.fit()
    # The summary is only built when it is actually going to be shown.
    if prnt:
        print(results.summary())

    return results.pvalues[haplo], results.params[haplo], results.bse[haplo]

In [None]:
# Per-haplogroup logistic-regression results appended to the count table.
case_control_logreg_results = data_crosstab.copy()
# Create all three result columns up front as NaN: no placeholder (0.1) can be
# mistaken for a real p-value, and the column order needed by the positional
# rename below no longer depends on what the loop happens to assign first.
case_control_logreg_results['p_logreg'] = float('nan')
case_control_logreg_results['beta_logreg'] = float('nan')
case_control_logreg_results['se_logreg'] = float('nan')
# Sorted iteration gives a reproducible progress log (set order is arbitrary).
for h in sorted(haplos):
    print(h)
    p, beta,se = log_reg_for_haplogroup(f'haplo_major_{h}',False)
    case_control_logreg_results.at[h,'p_logreg'] = p
    case_control_logreg_results.at[h,'beta_logreg'] = beta
    case_control_logreg_results.at[h,'se_logreg'] = se
# Positional rename: the two count columns followed by the three result columns.
case_control_logreg_results.columns = ['controls','cases','p_logreg','beta_logreg','se_logreg']
# Move the haplogroup label out of the index so it becomes a regular column.
case_control_logreg_results = case_control_logreg_results.reset_index()
print(case_control_logreg_results)

## 4. Combine and Output

In [None]:
# Combine the logistic-regression and chi-squared tables; the haplogroup label
# and both count columns are shared, so all three serve as the join key.
merge_results = pd.merge(
    case_control_logreg_results,
    case_control_chisq_results,
    on=['haplo_major','controls','cases'],
)
print(merge_results.shape)

In [None]:
# Share of all cases (resp. controls) that falls in each haplogroup.
merge_results['case_freq'] = merge_results.cases / sum(merge_results.cases)
merge_results['control_freq'] = merge_results.controls / sum(merge_results.controls)

In [None]:
# Put the columns in their final report order.
merge_results = merge_results.loc[
    :,
    [
        'haplo_major',
        'controls',
        'control_freq',
        'cases',
        'case_freq',
        'p_chisq',
        'p_logreg',
        'beta_logreg',
        'se_logreg',
    ],
]

In [None]:
# Display the combined per-haplogroup results table.
merge_results

In [None]:
# Write the combined results without the row index. `index` is documented as a
# bool, so pass False explicitly instead of relying on None being falsy.
merge_results.to_csv(f"{OUTDIR}/haplotype_major_lbd_case_control_new.csv", index=False)