# Check Chr Y Major (First Character) Haplogroups in PD Cases, Controls and Proxies Using UKBioBank Data
- **Author(s)** - Frank Grenn
- **Quick Description:** logistic regression for major haplogroups with UKBB data.

In [None]:
import pandas as pd
import random
import scipy.stats as ss
import statsmodels.api as sm

In [None]:
# Base locations; the two derived directories below are built from WRKDIR.
WRKDIR = "/PATH/chrY"
CARDDIR = "/PATH"
# Directory the .fam file is read from, and directory the result tables are written to.
BFILEDIR = WRKDIR + "/y_ukbb"
OUTDIR = WRKDIR + "/output_ukbb"

## 1. Get Data

In [None]:
# Load the .fam sample table (whitespace-delimited, no header row).
# The separator is a regex, so it is written as a raw string to avoid the
# invalid "\s" escape-sequence warning.
fam = pd.read_csv(f"{BFILEDIR}/chrY_eur_male_only.fam",sep=r"\s+",header=None)
fam.columns = ['fid','iid','pid','mid','sex','pheno']
print(fam.shape)
print(fam.head())

In [None]:
# Number of samples per phenotype code in the .fam table.
fam.pheno.value_counts()

In [None]:
# Load the principal components for the case/control sample set
# (whitespace-delimited, header row present). Raw string keeps "\s+" a valid regex literal.
auto_cc_pcs = pd.read_csv(f"{CARDDIR}/projects/chromosome_y_expression/ukbb/pcs_case_control_pca.txt",sep=r"\s+")
print(auto_cc_pcs.shape)
print(auto_cc_pcs.head())

In [None]:
# Load the principal components for the proxy/control sample set
# (whitespace-delimited, header row present). Raw string keeps "\s+" a valid regex literal.
auto_pc_pcs = pd.read_csv(f"{CARDDIR}/projects/chromosome_y_expression/ukbb/pcs_proxy_control_pca.txt",sep=r"\s+")
print(auto_pc_pcs.shape)
print(auto_pc_pcs.head())

In [None]:
# Covariate table (tab-delimited); AGE_OF_RECRUIT is taken from it in the merge cells.
meta = pd.read_csv(
    f"{CARDDIR}/UKBIOBANK/PHENOTYPE_DATA/covariates_phenome_to_use.txt",
    sep="\t",
)
print(meta.shape, meta.head(), sep="\n")

In [None]:
# Load the haplogroup calls (whitespace-delimited, no header row).
# Raw string keeps "\s+" a valid regex literal.
yhaplo = pd.read_csv(f"{OUTDIR}/yhaplo_output/haplogroups.chrY_male_only.txt",sep=r"\s+",header=None)
yhaplo.columns = ['id','haplo_short','haplo_short_rep_snp','haplo_long']
# Major haplogroup = first character of the long haplogroup label.
yhaplo['haplo_major'] = yhaplo['haplo_long'].str[0]
# Keep only the first half of each id string, then cast to int64 so it can be
# joined against the integer sample ids.
# NOTE(review): presumably ids look like "<FID>_<IID>" with FID == IID, so the
# first half is the FID — confirm against the yhaplo output.
yhaplo['id'] = [i[:len(i)//2] for i in yhaplo.id]
yhaplo['id'] = yhaplo['id'].astype('int64')
print(yhaplo.shape)
print(yhaplo.head())

In [None]:
# Show column dtypes; 'id' was cast to int64 when the table was loaded.
yhaplo.dtypes

In [None]:
# Build the case/control analysis table with three successive inner joins:
# sample table + PCs, then recruitment age, then haplogroup calls.
# The shape is printed after each join so dropped samples are visible.
merge1 = fam[['fid','iid','sex','pheno']].merge(
    auto_cc_pcs, left_on=['fid','iid'], right_on=['FID','IID']
)
print(merge1.shape)
merge2 = merge1.merge(meta[['FID','AGE_OF_RECRUIT']], left_on='fid', right_on='FID')
print(merge2.shape)
merge3 = merge2.merge(
    yhaplo[['id','haplo_major','haplo_long']], left_on='fid', right_on='id'
)
print(merge3.shape)
case_control_df = merge3.copy()

In [None]:
# Build the proxy/control analysis table with three successive inner joins:
# sample table + PCs, then recruitment age, then haplogroup calls.
# The shape is printed after each join so dropped samples are visible.
merge1 = fam[['fid','iid','sex','pheno']].merge(
    auto_pc_pcs, left_on=['fid','iid'], right_on=['FID','IID']
)
print(merge1.shape)
merge2 = merge1.merge(meta[['FID','AGE_OF_RECRUIT']], left_on='fid', right_on='FID')
print(merge2.shape)
merge3 = merge2.merge(
    yhaplo[['id','haplo_major','haplo_long']], left_on='fid', right_on='id'
)
print(merge3.shape)
proxy_control_df = merge3.copy()

In [None]:
# Samples per phenotype code remaining in the merged case/control table.
case_control_df.pheno.value_counts()

In [None]:
# Samples per phenotype code remaining in the merged proxy/control table.
proxy_control_df.pheno.value_counts()

In [None]:
# Comma-separated list of the merged table's column names, for a quick look.
",".join(case_control_df.columns)

#### Setup some stats functions

In [None]:
# Chi-squared test of one major haplogroup versus all others, by phenotype.
def chi_square_for_haplogroup(haplo,df,prnt):
    """Chi-squared test of ``haplo`` vs. every other major haplogroup.

    Rows whose ``haplo_major`` differs from ``haplo`` are pooled under the
    label ``'not <haplo>'``; the pooled labels are cross-tabulated against
    ``pheno`` and the table is handed to ``scipy.stats.chi2_contingency``.
    ``df`` is not modified.

    Args:
        haplo: Major haplogroup label to test.
        df: DataFrame with ``haplo_major`` and ``pheno`` columns.
        prnt: If truthy, print the contingency table and the four results.

    Returns:
        Tuple ``(statistic, p_value, dof, expected_frequencies)``.
    """
    is_target = df['haplo_major'] == haplo
    pooled_labels = df['haplo_major'].where(is_target, 'not ' + haplo)
    table = pd.crosstab(pooled_labels, df['pheno'], margins=False)

    stat, p_value, deg_free, expected = ss.chi2_contingency(table)

    if prnt:
        for item in (table, stat, p_value, deg_free, expected):
            print(item)

    return stat, p_value, deg_free, expected

In [None]:
#logistic regression for specific major haplogroup
def log_reg_for_haplogroup(haplo,df,prnt):
    """Fit a binomial GLM of ``pheno`` on one haplogroup indicator plus covariates.

    The model is ``pheno ~ <haplo> + AGE_OF_RECRUIT + PC1 + ... + PC5``.

    Args:
        haplo: Name of the indicator column in ``df`` to test
            (e.g. ``'haplo_major_R'``); it is inserted verbatim into the formula.
        df: DataFrame holding ``pheno``, the ``haplo`` column,
            ``AGE_OF_RECRUIT`` and ``PC1``..``PC5``.
        prnt: If truthy, print the fitted model summary.

    Returns:
        Tuple ``(p_value, coefficient, standard_error)`` for the ``haplo`` term.
    """
    data = df
    # A boolean predictor is treated as categorical by the formula layer, which
    # names the term "<haplo>[T.True]" and breaks the lookups by ``haplo`` below.
    # Cast it to 0/1 on a copy so the caller's frame is left untouched.
    if pd.api.types.is_bool_dtype(df[haplo]):
        data = df.assign(**{haplo: df[haplo].astype(int)})

    formula = f"pheno ~ {haplo} + AGE_OF_RECRUIT + PC1 + PC2 + PC3 + PC4 + PC5"
    model = sm.GLM.from_formula(formula, family = sm.families.Binomial(), data = data)
    results = model.fit()
    if prnt:
        print(results.summary())

    return results.pvalues[haplo], results.params[haplo], results.bse[haplo]

## 2. Case Control Tests
use case_control_df created earlier  
Then do chi squared tests and logistic regression

### Chi Squared Case Control

In [None]:
# Major-haplogroup x phenotype counts; copied later as the base of the results tables.
data_crosstab_cc = pd.crosstab(
    index=case_control_df['haplo_major'],
    columns=case_control_df['pheno'],
)
# Relabel the two phenotype columns (in sorted code order).
data_crosstab_cc.columns = ['control','case']
print(data_crosstab_cc)

In [None]:
#check the table is counting correctly
# Shape of the rows with pheno == 1 and major haplogroup 'R'; the row count
# can be compared by eye with the matching crosstab cell.
case_control_df[(case_control_df.pheno==1)& (case_control_df.haplo_major=='R')].shape

#### chi squared for specific haplotypes

In [None]:
# Worked example: haplogroup 'R' vs. all others, printing the table and results.
g, p, dof, expctd = chi_square_for_haplogroup('R',case_control_df,True)

In [None]:
# Echo the p-value currently held in `p`.
p

In [None]:
# Distinct major haplogroups present in the case/control table; the loops
# below iterate over this set.
haplos = set(case_control_df['haplo_major'])
haplos

In [None]:
# Per-haplogroup chi-squared p-values, stored next to the control/case counts.
case_control_chisq_results = data_crosstab_cc.copy()
# Start from NaN rather than a plausible-looking p-value, so any row the loop
# does not fill is obvious in the output.
case_control_chisq_results['p_chisq'] = float('nan')
for h in haplos:
    print(h)
    g, p, dof, expctd = chi_square_for_haplogroup(h,case_control_df,False)
    case_control_chisq_results.at[h,'p_chisq'] = p
case_control_chisq_results.columns = ['controls','cases','p_chisq']
# Move the haplogroup label from the index into a regular column.
case_control_chisq_results = case_control_chisq_results.reset_index()
print(case_control_chisq_results)

In [None]:
# Total of the 'cases' column across all haplogroups.
sum(case_control_chisq_results['cases'])

In [None]:
# Total of the 'controls' column across all haplogroups.
sum(case_control_chisq_results['controls'])

### Logistic Regression for Case Control

In [None]:
#one hot encode the major haplogroups
case_control_df_ohe = case_control_df.copy()
# Keep the original label; get_dummies replaces 'haplo_major' with indicator columns.
case_control_df_ohe['haplo_major_orig'] = case_control_df_ohe['haplo_major']
# Ask for integer 0/1 indicators: newer pandas defaults to bool, which the
# regression formula would treat as categorical and rename to "<col>[T.True]".
case_control_df_ohe = pd.get_dummies(case_control_df_ohe, columns = ['haplo_major'], dtype = int)
# Shift the phenotype code down by one for the binomial model.
# NOTE(review): presumably pheno is coded 1/2, giving a 0/1 outcome — confirm.
case_control_df_ohe['pheno'] = case_control_df_ohe['pheno'] - 1
print(case_control_df_ohe.shape)

In [None]:
# List the columns, including the new haplo_major_<X> indicator columns.
case_control_df_ohe.columns

In [None]:
# Per-haplogroup logistic-regression results, stored next to the control/case counts.
case_control_logreg_results = data_crosstab_cc.copy()
# Create all three result columns up front, filled with NaN, so an unfilled row
# is obvious and the positional rename below always sees the same column order.
case_control_logreg_results['p_logreg'] = float('nan')
case_control_logreg_results['beta_logreg'] = float('nan')
case_control_logreg_results['se_logreg'] = float('nan')
for h in haplos:
    print(h)
    p, beta, se = log_reg_for_haplogroup(f'haplo_major_{h}',case_control_df_ohe,False)
    case_control_logreg_results.at[h,'p_logreg'] = p
    case_control_logreg_results.at[h,'beta_logreg'] = beta
    case_control_logreg_results.at[h,'se_logreg'] = se
case_control_logreg_results.columns = ['controls','cases','p_logreg','beta_logreg','se_logreg']
# Move the haplogroup label from the index into a regular column.
case_control_logreg_results = case_control_logreg_results.reset_index()
print(case_control_logreg_results)

### Combine Case Control Data and Output

In [None]:
# Join the logistic-regression and chi-squared tables on haplogroup and counts.
merge_results = case_control_logreg_results.merge(
    case_control_chisq_results,
    on=['haplo_major','controls','cases'],
)
print(merge_results.shape)

In [None]:
# Share of all cases (resp. controls) that carry each major haplogroup.
merge_results['case_freq'] = merge_results['cases'].div(merge_results['cases'].sum())
merge_results['control_freq'] = merge_results['controls'].div(merge_results['controls'].sum())

In [None]:
# Put the columns in reporting order.
merge_results = merge_results.loc[:, [
    'haplo_major',
    'controls', 'control_freq',
    'cases', 'case_freq',
    'p_chisq', 'p_logreg', 'beta_logreg', 'se_logreg',
]]

In [None]:
# Display the combined case/control results table.
merge_results

In [None]:
# Write the case/control results without the row index.
# `index` is documented as a bool, so pass False explicitly instead of relying on None being falsy.
merge_results.to_csv(f"{OUTDIR}/haplotype_major_pd_case_control_newest.csv", index=False)

## 3. Proxy Control Tests

use the proxy_control_df created earlier

### Chi Squared Proxy Control

In [None]:
# Major-haplogroup x phenotype counts; copied later as the base of the results tables.
data_crosstab_pc = pd.crosstab(
    index=proxy_control_df['haplo_major'],
    columns=proxy_control_df['pheno'],
)
# Relabel the two phenotype columns (in sorted code order).
data_crosstab_pc.columns = ['control','proxy']
print(data_crosstab_pc)

In [None]:
# Distinct major haplogroups present in the proxy/control table; this rebinds
# `haplos`, which the loops below iterate over.
haplos = set(proxy_control_df['haplo_major'])
haplos

In [None]:
# Per-haplogroup chi-squared p-values, stored next to the control/proxy counts.
proxy_control_chisq_results = data_crosstab_pc.copy()
# Start from NaN rather than a plausible-looking p-value, so any row the loop
# does not fill is obvious in the output.
proxy_control_chisq_results['p_chisq'] = float('nan')
for h in haplos:
    print(h)
    g, p, dof, expctd = chi_square_for_haplogroup(h,proxy_control_df,False)
    proxy_control_chisq_results.at[h,'p_chisq'] = p
proxy_control_chisq_results.columns = ['controls','proxies','p_chisq']
# Move the haplogroup label from the index into a regular column.
proxy_control_chisq_results = proxy_control_chisq_results.reset_index()
print(proxy_control_chisq_results)

In [None]:
# Total of the 'proxies' column across all haplogroups.
sum(proxy_control_chisq_results['proxies'])

In [None]:
# Total of the 'controls' column across all haplogroups.
sum(proxy_control_chisq_results['controls'])

### Logistic Regression for Proxy Control

In [None]:
#one hot encode the major haplogroups
proxy_control_df_ohe = proxy_control_df.copy()
# Keep the original label; get_dummies replaces 'haplo_major' with indicator columns.
proxy_control_df_ohe['haplo_major_orig'] = proxy_control_df_ohe['haplo_major']
# Ask for integer 0/1 indicators: newer pandas defaults to bool, which the
# regression formula would treat as categorical and rename to "<col>[T.True]".
proxy_control_df_ohe = pd.get_dummies(proxy_control_df_ohe, columns = ['haplo_major'], dtype = int)
# Shift the phenotype code down by one for the binomial model.
# NOTE(review): presumably pheno is coded 1/2, giving a 0/1 outcome — confirm.
proxy_control_df_ohe['pheno'] = proxy_control_df_ohe['pheno'] - 1
print(proxy_control_df_ohe.shape)

In [None]:
# List the columns, including the new haplo_major_<X> indicator columns.
proxy_control_df_ohe.columns

In [None]:
# Per-haplogroup logistic-regression results, stored next to the control/proxy counts.
proxy_control_logreg_results = data_crosstab_pc.copy()
# Create all three result columns up front, filled with NaN, so an unfilled row
# is obvious and the positional rename below always sees the same column order.
proxy_control_logreg_results['p_logreg'] = float('nan')
proxy_control_logreg_results['beta_logreg'] = float('nan')
proxy_control_logreg_results['se_logreg'] = float('nan')
for h in haplos:
    print(h)
    p, beta, se = log_reg_for_haplogroup(f'haplo_major_{h}',proxy_control_df_ohe,False)
    proxy_control_logreg_results.at[h,'p_logreg'] = p
    proxy_control_logreg_results.at[h,'beta_logreg'] = beta
    proxy_control_logreg_results.at[h,'se_logreg'] = se
proxy_control_logreg_results.columns = ['controls','proxies','p_logreg','beta_logreg','se_logreg']
# Move the haplogroup label from the index into a regular column.
proxy_control_logreg_results = proxy_control_logreg_results.reset_index()
print(proxy_control_logreg_results)

### Combine Proxy Control Data and Output

In [None]:
# Join the logistic-regression and chi-squared tables on haplogroup and counts.
merge_results = proxy_control_logreg_results.merge(
    proxy_control_chisq_results,
    on=['haplo_major','controls','proxies'],
)
print(merge_results.shape)

In [None]:
# Share of all proxies (resp. controls) that carry each major haplogroup.
merge_results['proxy_freq'] = merge_results['proxies'].div(merge_results['proxies'].sum())
merge_results['control_freq'] = merge_results['controls'].div(merge_results['controls'].sum())

In [None]:
# Put the columns in reporting order.
merge_results = merge_results.loc[:, [
    'haplo_major',
    'controls', 'control_freq',
    'proxies', 'proxy_freq',
    'p_chisq', 'p_logreg', 'beta_logreg', 'se_logreg',
]]

In [None]:
# Display the combined proxy/control results table.
merge_results

In [None]:
# Write the proxy/control results without the row index.
# `index` is documented as a bool, so pass False explicitly instead of relying on None being falsy.
merge_results.to_csv(f"{OUTDIR}/haplotype_major_pd_proxy_control_newest.csv", index=False)