# Single Variant Testing 
- **Author(s)** - Frank Grenn
- **Date Started** - August 2021
- **Quick Description:** Y chromosome single variant tests using plink and metal.

In [None]:
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import numpy as np
import sys

In [None]:
# Base directory that every chrY input/output path below is built from.
# NOTE(review): "$PATH" is a literal placeholder inside this string; Python does
# not expand it, so replace it with a real directory before running.
WRKDIR = "$PATH/chrY"

## AMP-PD PD Case Control

### Try with data lifted over to hg19

In [None]:
# Load the PLINK .fam sample table for the AMP-PD PD case/control set.
# The separator is a regex, so it is written as a raw string: a plain "\s+"
# is an invalid escape sequence and triggers a SyntaxWarning on Python 3.12+.
fam = pd.read_csv(f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs.fam", sep=r"\s+", header=None)
fam.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']
print(fam.shape)
print(fam.head())

In [None]:
# Tally how many samples carry each phenotype code in the .fam table.
fam['pheno'].value_counts()

In [None]:
# Load the autosomal principal components (.eigenvec has no header row):
# two ID columns followed by pc1..pc20.
# Raw-string separator avoids the invalid "\s" escape warning.
auto_pcs = pd.read_csv(f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_autosome_pcs.eigenvec", sep=r"\s+", header=None)
auto_pcs.columns = ['fid', 'iid'] + [f"pc{n}" for n in range(1, 21)]
print(auto_pcs.shape)
print(auto_pcs.head())

In [None]:
# Check the yhaplo haplogroup calls.
# Raw-string separator avoids the invalid "\s" escape warning.
yhaplo = pd.read_csv(f"{WRKDIR}/output_male_hemizygous_only_het_filter_run/output_yhaplo/haplogroups.chrY_male_hemizygous_only_het_filter_hg19_final.txt", sep=r"\s+", header=None)
yhaplo.columns = ['id', 'haplo_short', 'haplo_short_rep_snp', 'haplo']
# Major haplogroup = first character of the full haplogroup label.
yhaplo['haplo_major'] = yhaplo['haplo'].str[0]
# Keep only the first half of each ID string.
# NOTE(review): presumably the IDs are written as a doubled "ID_ID" string -- confirm.
yhaplo['id'] = [sample_id[:len(sample_id) // 2] for sample_id in yhaplo['id']]
# Disabled filter kept for reference (note it refers to a 'haplo_long' column
# that is not defined in this cell):
#yhaplo = yhaplo[yhaplo['haplo_long']!='A']#assume samples with "A" haplogroup were not assigned one.
print(yhaplo.shape)
print(yhaplo.head())

In [None]:
# Use Y-LineageTracker haplogroups since that tool assigned the most unique
# haplogroups compared to other tools.
# The file is read with its own header row, then the columns are renamed.
# Raw-string separator avoids the invalid "\s" escape warning.
ltrack = pd.read_csv(f"{WRKDIR}/output_male_hemizygous_only_het_filter_run/output_ltracker/ltrack_hg19.hapresult.hg", sep=r"\s+")
ltrack.columns = ['id', 'haplo']
# Major haplogroup = first character of the full haplogroup label.
ltrack['haplo_major'] = ltrack['haplo'].str[0]
# Keep only the first half of each ID string.
# NOTE(review): presumably the IDs are written as a doubled "ID_ID" string -- confirm.
ltrack['id'] = [sample_id[:len(sample_id) // 2] for sample_id in ltrack['id']]
print(ltrack.shape)
print(ltrack.head())
# Number of distinct full haplogroup labels.
print(len(set(ltrack['haplo'])))

In [None]:
# Load the AMP-PD covariate sheet (comma-separated, header row present).
covar_csv = "$PATH/AMPPD_releasev2_covariates_Feb2021.csv"
meta = pd.read_csv(covar_csv)
print(meta.shape)
print(meta.head())

In [None]:
# Assemble one row per sample: phenotype, first five autosomal PCs, baseline
# age and major Y haplogroup. Every join is an inner join (the pandas default),
# so a sample missing from any table is dropped; the printed shapes show how
# many rows survive each step.
pc_cols = ['pc1', 'pc2', 'pc3', 'pc4', 'pc5']
merge1 = fam[['fid', 'iid', 'sex', 'pheno']].merge(auto_pcs[['fid', 'iid'] + pc_cols], on=['fid', 'iid'])
print(merge1.shape)
merge2 = merge1.merge(meta[['ID', 'AGE_BASELINE']], left_on=['fid'], right_on=['ID'])
print(merge2.shape)
merge3 = merge2.merge(ltrack[['id', 'haplo_major']], left_on=['fid'], right_on='id')
print(merge3.shape)
print(merge3.head())
# Keep only the columns needed downstream; copy so later edits do not touch merge3.
keep_cols = ['fid', 'iid', 'pheno'] + pc_cols + ['AGE_BASELINE', 'haplo_major']
meta_merge = merge3[keep_cols].copy()

In [None]:

# Keep the original haplogroup letter, then one-hot encode it into
# haplo_major_<letter> indicator columns.
meta_merge['haplo_major_orig'] = meta_merge['haplo_major']
# dtype=int forces 0/1 indicators. pandas >= 2.0 defaults to bool, which would
# be written to the covariate file as True/False and would turn the mixed
# float/bool selection passed to np.corrcoef into an object array.
meta_merge = pd.get_dummies(meta_merge, columns=['haplo_major'], dtype=int)
print(meta_merge.head())
#meta_merge.pheno = meta_merge.pheno - 1

In [None]:
# Add PHENO_PLINK = pheno - 1 and switch the ID columns to upper-case FID/IID.
shifted_pheno = meta_merge['pheno'] - 1
meta_merge = meta_merge.assign(PHENO_PLINK=shifted_pheno).rename(columns={"fid": "FID", "iid": "IID"})
print(meta_merge.head())

In [None]:
# Sample count per shifted phenotype value.
meta_merge.PHENO_PLINK.value_counts()

In [None]:
# Inspect column dtypes before writing the covariate file.
meta_merge.dtypes

In [None]:
# Check PC / haplogroup correlations.
# The column list must match the haplogroup dummy columns actually present for
# this cohort; selecting a missing column raises a KeyError.
corr_cols = ['pc1', 'pc2', 'pc3', 'pc4', 'pc5',
             'haplo_major_A', 'haplo_major_C', 'haplo_major_E', 'haplo_major_G',
             'haplo_major_H', 'haplo_major_I', 'haplo_major_J', 'haplo_major_K',
             'haplo_major_L', 'haplo_major_N', 'haplo_major_Q', 'haplo_major_R',
             'haplo_major_T']
# Transpose so each variable is a row, which is what np.corrcoef correlates.
corrs = np.corrcoef(meta_merge[corr_cols].T)
corrdf = pd.DataFrame(corrs, index=corr_cols, columns=corr_cols)
corrdf

In [None]:
",".join(meta_merge.columns)

In [None]:
# Write the covariate table as tab-separated text without the row index.
covar_out = f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_covars.txt"
meta_merge.to_csv(covar_out, sep="\t", index=False)

In [None]:
# Sample count per phenotype code after all merges.
meta_merge['pheno'].value_counts()

In [None]:
# Sample count per major haplogroup letter.
meta_merge['haplo_major_orig'].value_counts()

In [None]:
# Print (do not execute) the PLINK command for the AMP-PD PD case/control test.
# The trailing backslashes are Python line continuations inside the string
# literal, so the printed command is a single line.
# Flags as written: --maf 0.05, covariates and phenotype both read from the
# covariate file written above, --freq, --logistic hide-covar with --ci 0.95.
# The phenotype column used is the original `pheno`, not PHENO_PLINK.
# Only a subset of the haplogroup dummy columns (E, G, I, J, R) is listed as covariates.
print(f"plink --bfile {WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs \
--maf 0.05 \
--covar {WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_covars.txt \
--covar-name pc1,pc2,pc3,pc4,pc5,AGE_BASELINE,haplo_major_E,haplo_major_G,haplo_major_I,haplo_major_J,haplo_major_R \
--pheno {WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_covars.txt --pheno-name pheno \
--freq \
--logistic hide-covar --ci 0.95 --out {WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_logreg")

In [None]:
# Load the PLINK logistic-regression results, smallest p-value first.
# Raw-string separator avoids the invalid "\s" escape warning.
logistic = pd.read_csv(f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_logreg.assoc.logistic", sep=r"\s+").sort_values("P")

print(logistic.shape)
print(logistic.head())
print(logistic.tail())
# Drop every row that has a missing value in any column.
logistic = logistic.dropna()
print(logistic.shape)
# Row counts with a non-zero p-value, and with a non-zero p-value below 0.05.
nonzero_p = logistic['P'] != 0
print(logistic[nonzero_p].shape)
print(logistic[nonzero_p & (logistic['P'] < 0.05)].shape)

In [None]:
# Load the allele-frequency table written by the same PLINK run (--freq).
# Raw-string separator avoids the invalid "\s" escape warning.
maf = pd.read_csv(f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_logreg.frq", sep=r"\s+")
print(maf.shape)
print(maf.head())

In [None]:
# Annotate the association results with allele frequency and the second
# allele from the .bim, build a CHR_BP_A1_A2 key, and write the table out.
# Raw-string separator avoids the invalid "\s" escape warning.
bim = pd.read_csv(f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs.bim", sep=r"\s+", header=None)
# NOTE(review): in the PLINK .bim layout the third column is genetic distance
# and the fourth is the base-pair position; here they are labelled 'pos' and 'bp'.
bim.columns = ['chr', 'snp', 'pos', 'bp', 'a1', 'a2']
print(bim.shape)
print(bim.head())
print(logistic.shape)

# Step 1: frequency table + .bim, matched on variant ID and first allele.
logistic_merge = pd.merge(left=maf, right=bim, left_on=['SNP', 'A1'], right_on=['snp', 'a1'])
logistic_merge = logistic_merge[['CHR', 'SNP', 'bp', 'a1', 'a2', 'MAF']]
print(logistic_merge.shape)
print(logistic_merge.head())

# Step 2: attach a2 and MAF to the regression rows (inner join on ID, position, allele).
logistic_merge = pd.merge(left=logistic, right=logistic_merge[['SNP', 'bp', 'a1', 'a2', 'MAF']], left_on=['SNP', 'BP', 'A1'], right_on=['SNP', 'bp', 'a1'])
logistic_merge = logistic_merge[['CHR', 'SNP', 'BP', 'a1', 'a2', 'MAF', 'TEST', 'NMISS', 'OR', 'SE', 'L95', 'U95', 'STAT', 'P']]
print(logistic_merge.dtypes)
# Position/allele based variant key, used below to spot duplicated variants.
logistic_merge['CHR_BP_A1_A2'] = logistic_merge['CHR'].astype(str) + "_" + logistic_merge['BP'].astype(str) + "_" + logistic_merge['a1'].astype(str) + "_" + logistic_merge['a2'].astype(str)
print(logistic_merge.shape)
print(logistic_merge.head())

logistic_merge.to_csv(f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_logreg_filter.assoc.logistic", index=False, sep="\t")

In [None]:
# Check for duplicates: show every row whose CHR_BP_A1_A2 key occurs more
# than once, ordered so that repeats sit next to each other.
dup_mask = logistic_merge.duplicated(subset='CHR_BP_A1_A2', keep=False)
logistic_merge[dup_mask].sort_values('CHR_BP_A1_A2')

In [None]:
# Remove duplicates: drop every variant whose CHR_BP_A1_A2 key is not unique
# (keep=False removes all copies, not just the repeats).
logistic_merge_edit = logistic_merge.drop_duplicates(subset='CHR_BP_A1_A2', keep=False)
# Then drop rows with an infinite odds ratio. The filter is applied to the
# de-duplicated frame; the original code filtered `logistic_merge` again here,
# which silently threw away the de-duplication above.
logistic_merge_edit = logistic_merge_edit.loc[logistic_merge_edit['OR'] != float('inf')]

# Overwrites the unfiltered file written earlier under the same name.
logistic_merge_edit.to_csv(f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_logreg_filter.assoc.logistic", index=False, sep="\t")

## AMP-PD LBD Case Control

In [None]:
# Build the covariate table for the AMP-PD LBD case/control analysis: load
# samples, autosomal PCs, yhaplo haplogroups and the covariate sheet,
# inner-join them, one-hot encode the major haplogroup, check PC/haplogroup
# correlations and write the PLINK covariate file.
# All regex separators are raw strings to avoid the invalid "\s" escape warning.
fam = pd.read_csv(f"{WRKDIR}/y_male_only_bfiles/amppd_lbd_case_control_nogcs.fam", sep=r"\s+", header=None)
fam.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']
print(fam.shape)
print(fam.head())

# Recode phenotype -9 as 2.
# NOTE(review): presumably the -9 samples are the LBD cases (2 = case in PLINK coding) -- confirm.
fam.loc[fam['pheno'] == -9, 'pheno'] = 2

auto_pcs = pd.read_csv(f"{WRKDIR}/y_male_only_bfiles/amppd_lbd_case_control_autosome_pcs.eigenvec", sep=r"\s+", header=None)
auto_pcs.columns = ['fid', 'iid'] + [f"pc{n}" for n in range(1, 21)]
print(auto_pcs.shape)
print(auto_pcs.head())

#just get yhaplo data for now because first character of haplogroup for all samples is the same between the yhaplo and snappy tools
yhaplo = pd.read_csv(f"{WRKDIR}/output_male_hemizygous_only_het_filter_run/output_yhaplo/haplogroups.chrY_male_hemizygous_only_het_filter_hg19_final.txt", sep=r"\s+", header=None)
yhaplo.columns = ['id', 'haplo_short', 'haplo_short_rep_snp', 'haplo']
yhaplo['haplo_major'] = yhaplo['haplo'].str[0]
# Keep only the first half of each ID string.
# NOTE(review): presumably the IDs are written as a doubled "ID_ID" string -- confirm.
yhaplo['id'] = [sample_id[:len(sample_id) // 2] for sample_id in yhaplo['id']]
print(yhaplo.shape)
print(yhaplo.head())

meta = pd.read_csv("$PATH/AMPPD_releasev2_covariates_Feb2021.csv")
print(meta.shape)
print(meta.head())


# Merge (inner joins; the printed shapes show how many samples survive each step).
merge1 = pd.merge(left=fam[['fid', 'iid', 'sex', 'pheno']], right=auto_pcs[['fid', 'iid', 'pc1', 'pc2', 'pc3', 'pc4', 'pc5']], left_on=['fid', 'iid'], right_on=['fid', 'iid'])
print(merge1.shape)
merge2 = pd.merge(left=merge1, right=meta[['ID', 'AGE_BASELINE']], left_on=['fid'], right_on=['ID'])
print(merge2.shape)
merge3 = pd.merge(left=merge2, right=yhaplo[['id', 'haplo_major']], left_on=['fid'], right_on='id')
print(merge3.shape)
print(merge3.head())
meta_merge = merge3[['fid', 'iid', 'pheno', 'pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'AGE_BASELINE', 'haplo_major']].copy()


meta_merge['haplo_major_orig'] = meta_merge['haplo_major']
# dtype=int forces 0/1 indicators. pandas >= 2.0 defaults to bool, which would
# be written to the covariate file as True/False and would turn the mixed
# float/bool selection passed to np.corrcoef into an object array.
meta_merge = pd.get_dummies(meta_merge, columns=['haplo_major'], dtype=int)
print(meta_merge.head())

meta_merge['PHENO_PLINK'] = meta_merge['pheno'] - 1
meta_merge = meta_merge.rename(columns={"fid": "FID", "iid": "IID"})
print(meta_merge.head())

# Check PC / haplogroup correlations. The list must match the dummy columns
# present for this cohort; selecting a missing column raises a KeyError.
corr_cols = ['pc1', 'pc2', 'pc3', 'pc4', 'pc5',
             'haplo_major_C', 'haplo_major_E', 'haplo_major_G', 'haplo_major_H',
             'haplo_major_I', 'haplo_major_J', 'haplo_major_L', 'haplo_major_N',
             'haplo_major_Q', 'haplo_major_R', 'haplo_major_T']
corrs = np.corrcoef(meta_merge[corr_cols].T)
corrdf = pd.DataFrame(corrs, index=corr_cols, columns=corr_cols)

print(meta_merge.pheno.value_counts())
meta_merge.to_csv(f"{WRKDIR}/y_male_only_bfiles/amppd_lbd_case_control_nogcs_covars.txt", sep="\t", index=False)

In [None]:
# Display the PC / haplogroup correlation matrix computed in the previous cell.
corrdf

In [None]:
# Sample count per major haplogroup letter in the LBD covariate table.
meta_merge.haplo_major_orig.value_counts()

In [None]:
# Print (do not execute) the PLINK command for the AMP-PD LBD case/control test.
# The trailing backslashes are Python line continuations inside the string
# literal, so the printed command is a single line.
# Flags as written: --maf 0.05, covariates and phenotype both read from the LBD
# covariate file, --freq, --logistic hide-covar with --ci 0.95.
# The phenotype column used is the original `pheno`, not PHENO_PLINK.
print(f"plink --bfile {WRKDIR}/y_male_only_bfiles/amppd_lbd_case_control_nogcs \
--maf 0.05 \
--covar {WRKDIR}/y_male_only_bfiles/amppd_lbd_case_control_nogcs_covars.txt \
--covar-name pc1,pc2,pc3,pc4,pc5,AGE_BASELINE,haplo_major_E,haplo_major_G,haplo_major_I,haplo_major_J,haplo_major_R \
--pheno {WRKDIR}/y_male_only_bfiles/amppd_lbd_case_control_nogcs_covars.txt --pheno-name pheno \
--freq \
--logistic hide-covar --ci 0.95 --out {WRKDIR}/y_male_only_bfiles/amppd_lbd_case_control_nogcs_logreg")

In [None]:
# Load the LBD logistic-regression results, smallest p-value first.
# Raw-string separator avoids the invalid "\s" escape warning.
log_lbd = pd.read_csv(f"{WRKDIR}/y_male_only_bfiles/amppd_lbd_case_control_nogcs_logreg.assoc.logistic", sep=r"\s+").sort_values("P")

print(log_lbd.shape)
print(log_lbd.head())

# Drop every row that has a missing value in any column.
log_lbd = log_lbd.dropna()
print(log_lbd.shape)

# Row counts with a non-zero p-value, and with a non-zero p-value below 0.05.
nonzero_p = log_lbd['P'] != 0
print(log_lbd[nonzero_p].shape)
print(log_lbd[nonzero_p & (log_lbd['P'] < 0.05)].shape)

In [None]:
# Load the allele-frequency table from the LBD PLINK run (--freq).
# Raw-string separator avoids the invalid "\s" escape warning.
maf = pd.read_csv(f"{WRKDIR}/y_male_only_bfiles/amppd_lbd_case_control_nogcs_logreg.frq", sep=r"\s+")
print(maf.shape)
print(maf.head())

In [None]:
# Annotate the LBD association results with allele frequency and the second
# allele from the .bim, build a CHR_BP_A1_A2 key, and write the table out.
# Raw-string separator avoids the invalid "\s" escape warning.
bim = pd.read_csv(f"{WRKDIR}/y_male_only_bfiles/amppd_lbd_case_control_nogcs.bim", sep=r"\s+", header=None)
bim.columns = ['chr', 'snp', 'pos', 'bp', 'a1', 'a2']
print(bim.shape)
print(bim.head())

# Step 1: frequency table + .bim, matched on variant ID and first allele.
log_lbd_merge = pd.merge(left=maf, right=bim, left_on=['SNP', 'A1'], right_on=['snp', 'a1'])
log_lbd_merge = log_lbd_merge[['CHR', 'SNP', 'bp', 'a1', 'a2', 'MAF']]
print(log_lbd_merge.shape)
print(log_lbd_merge.head())

# Step 2: attach a2 and MAF to the regression rows (inner join on ID, position, allele).
log_lbd_merge = pd.merge(left=log_lbd, right=log_lbd_merge[['SNP', 'bp', 'a1', 'a2', 'MAF']], left_on=['SNP', 'BP', 'A1'], right_on=['SNP', 'bp', 'a1'])
log_lbd_merge = log_lbd_merge[['CHR', 'SNP', 'BP', 'a1', 'a2', 'MAF', 'TEST', 'NMISS', 'OR', 'SE', 'L95', 'U95', 'STAT', 'P']]
print(log_lbd_merge.shape)
print(log_lbd_merge.head())

# Position/allele based variant key.
log_lbd_merge['CHR_BP_A1_A2'] = log_lbd_merge['CHR'].astype(str) + "_" + log_lbd_merge['BP'].astype(str) + "_" + log_lbd_merge['a1'].astype(str) + "_" + log_lbd_merge['a2'].astype(str)

log_lbd_merge.to_csv(f"{WRKDIR}/y_male_only_bfiles/amppd_lbd_case_control_nogcs_logreg_filter.assoc.logistic", index=False, sep="\t")

## NeuroX

In [None]:
# Build the covariate table for the NeuroX analysis: load samples, PCs,
# Y-LineageTracker haplogroups and the IPDGC covariate sheet, inner-join them,
# one-hot encode the major haplogroup, check PC/haplogroup correlations and
# write the PLINK covariate file.
# All regex separators are raw strings to avoid the invalid "\s" escape warning.
fam = pd.read_csv(f"{WRKDIR}/y_neurox/neurox_chrY_male_only.fam", sep=r"\s+", header=None)
fam.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']
print(fam.shape)
print(fam.head())

auto_pcs = pd.read_csv(f"{WRKDIR}/y_neurox/neurox_case_control_allchr_pruned_pcs.eigenvec", sep=r"\s+", header=None)
auto_pcs.columns = ['fid', 'iid'] + [f"pc{n}" for n in range(1, 21)]
print(auto_pcs.shape)
print(auto_pcs.head())


#use Ylineagetracker haplogroups since that tool assigned the most unique haplogroups compared to other tools
# (the yhaplo calls under output_neurox/yhaplo_output are not loaded for this cohort)
ltrack = pd.read_csv(f"{WRKDIR}/output_neurox/ltrack_neurox_hg19.hapresult.hg", sep=r"\s+")
ltrack.columns = ['id', 'haplo']
ltrack['haplo_major'] = ltrack['haplo'].str[0]
# A "." haplogroup is relabelled "no_match" in both the full and the major column.
ltrack.loc[ltrack.haplo == ".", "haplo"] = "no_match"
ltrack.loc[ltrack.haplo_major == ".", "haplo_major"] = "no_match"
# Keep only the first half of each ID string.
# NOTE(review): presumably the IDs are written as a doubled "ID_ID" string -- confirm.
ltrack['id'] = [sample_id[:len(sample_id) // 2] for sample_id in ltrack['id']]
print(ltrack.shape)
print(ltrack.head())
print(len(set(ltrack.haplo)))

meta = pd.read_table("$PATH/IPDGC_all_samples_covariates.txt")
print(meta.shape)
print(meta.head())


# Merge (inner joins; the printed shapes show how many samples survive each step).
merge1 = pd.merge(left=fam[['fid', 'iid', 'sex', 'pheno']], right=auto_pcs[['fid', 'iid', 'pc1', 'pc2', 'pc3', 'pc4', 'pc5']], left_on=['fid', 'iid'], right_on=['fid', 'iid'])
print(merge1.shape)
merge2 = pd.merge(left=merge1, right=meta[['FID', 'AGE']], left_on=['fid'], right_on=['FID'])
print(merge2.shape)
merge3 = pd.merge(left=merge2, right=ltrack[['id', 'haplo_major']], left_on=['fid'], right_on='id')
print(merge3.shape)
print(merge3.head())
meta_merge = merge3[['fid', 'iid', 'pheno', 'pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'AGE', 'haplo_major']].copy()


meta_merge['haplo_major_orig'] = meta_merge['haplo_major']
# dtype=int forces 0/1 indicators. pandas >= 2.0 defaults to bool, which would
# be written to the covariate file as True/False and would turn the mixed
# float/bool selection passed to np.corrcoef into an object array.
meta_merge = pd.get_dummies(meta_merge, columns=['haplo_major'], dtype=int)
print(meta_merge.head())

meta_merge['PHENO_PLINK'] = meta_merge['pheno'] - 1
meta_merge = meta_merge.rename(columns={"fid": "FID", "iid": "IID"})
print(meta_merge.head())

print(meta_merge.haplo_major_orig.value_counts())

print(",".join(meta_merge.columns))

# Check PC / haplogroup correlations. The list must match the dummy columns
# present for this cohort; selecting a missing column raises a KeyError.
corr_cols = ['pc1', 'pc2', 'pc3', 'pc4', 'pc5',
             'haplo_major_D', 'haplo_major_E', 'haplo_major_G', 'haplo_major_I',
             'haplo_major_J', 'haplo_major_K', 'haplo_major_P', 'haplo_major_R',
             'haplo_major_T', 'haplo_major_no_match']
corrs = np.corrcoef(meta_merge[corr_cols].T)
corrdf = pd.DataFrame(corrs, index=corr_cols, columns=corr_cols)
print(corrdf)

print(meta_merge.pheno.value_counts())
meta_merge.to_csv(f"{WRKDIR}/y_neurox/neurox_chrY_male_only_covars.txt", sep="\t", index=False)

In [None]:
# Sample count per major haplogroup letter (NeuroX).
haplo_counts = meta_merge['haplo_major_orig'].value_counts()
print(haplo_counts)

In [None]:
# Same haplogroup tally as the previous cell (the cell is duplicated in the notebook).
haplo_counts = meta_merge['haplo_major_orig'].value_counts()
print(haplo_counts)

In [None]:
# Print (do not execute) the PLINK command for the NeuroX test.
# The trailing backslashes are Python line continuations inside the string
# literal, so the printed command is a single line.
# The phenotype column used is the original `pheno`, not PHENO_PLINK.
# Earlier covariate list, kept for reference:
#--covar-name pc1,pc2,pc3,pc4,pc5,AGE,haplo_major_E,haplo_major_F,haplo_major_G,haplo_major_I,haplo_major_J,haplo_major_R,haplo_major_T \
print(f"plink --bfile {WRKDIR}/y_neurox/neurox_chrY_male_only \
--maf 0.05 \
--covar {WRKDIR}/y_neurox/neurox_chrY_male_only_covars.txt \
--covar-name pc1,pc2,pc3,pc4,pc5,AGE,haplo_major_P,haplo_major_T,haplo_major_E,haplo_major_J,haplo_major_I,haplo_major_K,haplo_major_R \
--pheno {WRKDIR}/y_neurox/neurox_chrY_male_only_covars.txt --pheno-name pheno \
--freq \
--logistic hide-covar --ci 0.95 --out {WRKDIR}/y_neurox/neurox_chrY_male_only_logreg")

In [None]:
# Load the NeuroX logistic-regression results, smallest p-value first.
# Raw-string separator avoids the invalid "\s" escape warning.
log_neurox = pd.read_csv(f"{WRKDIR}/y_neurox/neurox_chrY_male_only_logreg.assoc.logistic", sep=r"\s+").sort_values("P")

print(log_neurox.shape)
print(log_neurox.head())

# Drop every row that has a missing value in any column.
log_neurox = log_neurox.dropna()
print(log_neurox.shape)

# Row counts with a non-zero p-value, and with a non-zero p-value below 0.05.
nonzero_p = log_neurox['P'] != 0
print(log_neurox[nonzero_p].shape)
print(log_neurox[nonzero_p & (log_neurox['P'] < 0.05)].shape)

In [None]:
# Load the allele-frequency table from the NeuroX PLINK run (--freq).
# Raw-string separator avoids the invalid "\s" escape warning.
maf = pd.read_csv(f"{WRKDIR}/y_neurox/neurox_chrY_male_only_logreg.frq", sep=r"\s+")
print(maf.shape)
print(maf.head())

In [None]:
# Annotate the NeuroX association results with allele frequency and the
# second allele from the .bim, build a CHR_BP_A1_A2 key, and write the table out.
# Raw-string separator avoids the invalid "\s" escape warning.
bim = pd.read_csv(f"{WRKDIR}/y_neurox/neurox_chrY_male_only.bim", sep=r"\s+", header=None)
bim.columns = ['chr', 'snp', 'pos', 'bp', 'a1', 'a2']
print(bim.shape)
print(bim.head())

# Step 1: frequency table + .bim, matched on variant ID and first allele.
log_neurox_merge = pd.merge(left=maf, right=bim, left_on=['SNP', 'A1'], right_on=['snp', 'a1'])
log_neurox_merge = log_neurox_merge[['CHR', 'SNP', 'bp', 'a1', 'a2', 'MAF']]
print(log_neurox_merge.shape)
print(log_neurox_merge.head())

# Step 2: attach a2 and MAF to the regression rows (inner join on ID, position, allele).
log_neurox_merge = pd.merge(left=log_neurox, right=log_neurox_merge[['SNP', 'bp', 'a1', 'a2', 'MAF']], left_on=['SNP', 'BP', 'A1'], right_on=['SNP', 'bp', 'a1'])
log_neurox_merge = log_neurox_merge[['CHR', 'SNP', 'BP', 'a1', 'a2', 'MAF', 'TEST', 'NMISS', 'OR', 'SE', 'L95', 'U95', 'STAT', 'P']]
print(log_neurox_merge.shape)
print(log_neurox_merge.head())

# Position/allele based variant key.
log_neurox_merge['CHR_BP_A1_A2'] = log_neurox_merge['CHR'].astype(str) + "_" + log_neurox_merge['BP'].astype(str) + "_" + log_neurox_merge['a1'].astype(str) + "_" + log_neurox_merge['a2'].astype(str)


log_neurox_merge.to_csv(f"{WRKDIR}/y_neurox/neurox_chrY_male_only_logreg_filter.assoc.logistic", index=False, sep="\t")

## UKBB Case Control

In [None]:
# Load the full UKBB chrY .fam table and show its phenotype distribution.
# Raw-string separator avoids the invalid "\s" escape warning.
fam = pd.read_csv(f"{WRKDIR}/y_ukbb/chrY_male_only.fam", sep=r"\s+", header=None)
fam.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']
print(fam.shape)
print(fam.head())
print(fam['pheno'].value_counts())

In [None]:
# Print (do not execute) the PLINK command that subsets the UKBB bfiles to the
# samples listed in ukbb_case_control_samples.txt (--keep) and writes new bfiles.
print(f"plink --bfile {WRKDIR}/y_ukbb/chrY_male_only --keep {WRKDIR}/y_ukbb/ukbb_case_control_samples.txt --make-bed --out {WRKDIR}/y_ukbb/chrY_male_only_case_control")

In [None]:
# Load the .fam of the UKBB case/control subset and show its phenotype distribution.
# Raw-string separator avoids the invalid "\s" escape warning.
fam = pd.read_csv(f"{WRKDIR}/y_ukbb/chrY_male_only_case_control.fam", sep=r"\s+", header=None)
fam.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']
print(fam.shape)
print(fam.head())
print(fam['pheno'].value_counts())

In [None]:
# Load the UKBB case/control principal components. This file is read with its
# own header row, so no column names are assigned here (unlike the .eigenvec
# files used for the other cohorts).
# NOTE(review): CARDDIR is not defined in the visible cells of this notebook;
# it must be set before this cell runs or it raises NameError.
# Raw-string separator avoids the invalid "\s" escape warning.
auto_pcs = pd.read_csv(f"{CARDDIR}/projects/chromosome_y_expression/ukbb/pcs_case_control_pca.txt", sep=r"\s+")
print(auto_pcs.shape)
print(auto_pcs.head())

In [None]:
# Load the yhaplo haplogroup calls for UKBB.
# Raw-string separator avoids the invalid "\s" escape warning.
yhaplo = pd.read_csv(f"{WRKDIR}/output_ukbb/yhaplo_output/haplogroups.chrY_male_only.txt", sep=r"\s+", header=None)
yhaplo.columns = ['id', 'haplo_short', 'haplo_short_rep_snp', 'haplo_long']
# Major haplogroup = first character of the long haplogroup label.
yhaplo['haplo_major'] = yhaplo['haplo_long'].str[0]
# Keep only the first half of each ID string, then cast to int64.
# NOTE(review): presumably the IDs are written as a doubled "ID_ID" string, and
# the cast is so they match the integer IDs in the .fam -- confirm.
yhaplo['id'] = [sample_id[:len(sample_id) // 2] for sample_id in yhaplo['id']]
yhaplo['id'] = yhaplo['id'].astype('int64')
print(yhaplo.shape)
print(yhaplo.head())

In [None]:
#use Ylineagetracker haplogroups since that tool assigned the most unique haplogroups compared to other tools
# Raw-string separator avoids the invalid "\s" escape warning.
ltrack = pd.read_csv(f"{WRKDIR}/output_ukbb/ltrack_ukbb_hg19.hapresult.hg", sep=r"\s+")
ltrack.columns = ['id', 'haplo']
# Major haplogroup = first character of the full haplogroup label.
ltrack['haplo_major'] = ltrack['haplo'].str[0]
# The "." -> "no_match" relabelling used for NeuroX is disabled for this cohort:
#ltrack.loc[ltrack.haplo==".","haplo"] = "no_match"
#ltrack.loc[ltrack.haplo_major==".","haplo_major"] = "no_match"
# Keep only the first half of each ID string, then cast to int64.
# NOTE(review): presumably the IDs are written as a doubled "ID_ID" string -- confirm.
ltrack['id'] = [sample_id[:len(sample_id) // 2] for sample_id in ltrack['id']]
ltrack['id'] = ltrack['id'].astype('int64')
print(ltrack.shape)
print(ltrack.head())
# Number of distinct full haplogroups, then the set of major haplogroup letters.
print(len(set(ltrack['haplo'])))
print(set(ltrack['haplo_major']))

In [None]:
# Load the UKBB covariate table (tab-separated, header row present).
# NOTE(review): CARDDIR is not defined in the visible cells of this notebook;
# it must be set before this cell runs.
ukbb_covar_path = f"{CARDDIR}/UKBIOBANK/PHENOTYPE_DATA/covariates_phenome_to_use.txt"
meta = pd.read_table(ukbb_covar_path)
print(meta.shape)
print(meta.head())

In [None]:
# Build the covariate table for the UKBB case/control analysis: inner-join
# samples, PCs, recruitment age and Y-LineageTracker major haplogroup, one-hot
# encode the haplogroup, check PC/haplogroup correlations and write the PLINK
# covariate file. The printed shapes show how many samples survive each join.
merge1 = pd.merge(left=fam[['fid', 'iid', 'sex', 'pheno']], right=auto_pcs[['FID', 'IID', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']], left_on=['fid', 'iid'], right_on=['FID', 'IID'])
print(merge1.shape)
merge2 = pd.merge(left=merge1, right=meta[['FID', 'AGE_OF_RECRUIT']], left_on=['fid'], right_on=['FID'])
print(merge2.shape)
merge3 = pd.merge(left=merge2, right=ltrack[['id', 'haplo_major']], left_on=['fid'], right_on='id')
print(merge3.shape)
print(merge3.head())
meta_merge = merge3[['fid', 'iid', 'pheno', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'AGE_OF_RECRUIT', 'haplo_major']].copy()


meta_merge['haplo_major_orig'] = meta_merge['haplo_major']
# dtype=int forces 0/1 indicators. pandas >= 2.0 defaults to bool, which would
# be written to the covariate file as True/False and would turn the mixed
# float/bool selection passed to np.corrcoef into an object array.
meta_merge = pd.get_dummies(meta_merge, columns=['haplo_major'], dtype=int)
print(meta_merge.head())

meta_merge['PHENO_PLINK'] = meta_merge['pheno'] - 1
meta_merge = meta_merge.rename(columns={"fid": "FID", "iid": "IID"})
print(meta_merge.head())

print(meta_merge.haplo_major_orig.value_counts())

print(",".join(meta_merge.columns))
# Check PC / haplogroup correlations. The list must match the dummy columns
# present for this cohort; selecting a missing column raises a KeyError.
corr_cols = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5',
             'haplo_major_A', 'haplo_major_E', 'haplo_major_G', 'haplo_major_H',
             'haplo_major_I', 'haplo_major_J', 'haplo_major_K', 'haplo_major_N',
             'haplo_major_O', 'haplo_major_P', 'haplo_major_R', 'haplo_major_T']
corrs = np.corrcoef(meta_merge[corr_cols].T)
corrdf = pd.DataFrame(corrs, index=corr_cols, columns=corr_cols)
print(corrdf)

print(meta_merge.pheno.value_counts())
meta_merge.to_csv(f"{WRKDIR}/y_ukbb/ukbb_chrY_case_control_covars.txt", sep="\t", index=False)

In [None]:
# Sample count per major haplogroup letter (UKBB case/control).
haplo_counts = meta_merge['haplo_major_orig'].value_counts()
print(haplo_counts)

In [None]:
# Print (do not execute) the PLINK command for the UKBB case/control test.
# The trailing backslashes are Python line continuations inside the string
# literal, so the printed command is a single line.
# Flags as written: --maf 0.05, covariates and phenotype both read from the
# UKBB case/control covariate file, --freq, --logistic hide-covar with --ci 0.95.
# The phenotype column used is the original `pheno`, not PHENO_PLINK.
print(f"plink --bfile {WRKDIR}/y_ukbb/chrY_male_only_case_control \
--maf 0.05 \
--covar {WRKDIR}/y_ukbb/ukbb_chrY_case_control_covars.txt \
--covar-name PC1,PC2,PC3,PC4,PC5,AGE_OF_RECRUIT,haplo_major_J,haplo_major_G,haplo_major_E,haplo_major_K,haplo_major_O,haplo_major_I,haplo_major_R \
--pheno {WRKDIR}/y_ukbb/ukbb_chrY_case_control_covars.txt --pheno-name pheno \
--freq \
--logistic hide-covar --ci 0.95 --out {WRKDIR}/y_ukbb/ukbb_chrY_case_control_logreg")

In [None]:
# Load the UKBB case/control logistic-regression results, smallest p-value first.
# Raw-string separator avoids the invalid "\s" escape warning.
log_ukbb_cc = pd.read_csv(f"{WRKDIR}/y_ukbb/ukbb_chrY_case_control_logreg.assoc.logistic", sep=r"\s+").sort_values("P")

print(log_ukbb_cc.shape)
print(log_ukbb_cc.head())

# Drop every row that has a missing value in any column.
log_ukbb_cc = log_ukbb_cc.dropna()
print(log_ukbb_cc.shape)

# Row counts with a non-zero p-value, and with a non-zero p-value below 0.05;
# the last line prints the latter rows themselves.
nonzero_p = log_ukbb_cc['P'] != 0
nominal = nonzero_p & (log_ukbb_cc['P'] < 0.05)
print(log_ukbb_cc[nonzero_p].shape)
print(log_ukbb_cc[nominal].shape)
print(log_ukbb_cc[nominal])

In [None]:
# Load the allele-frequency table from the UKBB case/control PLINK run (--freq).
# Raw-string separator avoids the invalid "\s" escape warning.
maf = pd.read_csv(f"{WRKDIR}/y_ukbb/ukbb_chrY_case_control_logreg.frq", sep=r"\s+")
print(maf.shape)
print(maf.head())

In [None]:
# Annotate the UKBB case/control association results with allele frequency
# and the second allele from the .bim, build a CHR_BP_A1_A2 key, and write
# the table out.
# Raw-string separator avoids the invalid "\s" escape warning.
bim = pd.read_csv(f"{WRKDIR}/y_ukbb/chrY_male_only_case_control.bim", sep=r"\s+", header=None)
bim.columns = ['chr', 'snp', 'pos', 'bp', 'a1', 'a2']
print(bim.shape)
print(bim.head())

# Step 1: frequency table + .bim, matched on variant ID and first allele.
log_ukbb_cc_merge = pd.merge(left=maf, right=bim, left_on=['SNP', 'A1'], right_on=['snp', 'a1'])
log_ukbb_cc_merge = log_ukbb_cc_merge[['CHR', 'SNP', 'bp', 'a1', 'a2', 'MAF']]
print(log_ukbb_cc_merge.shape)
print(log_ukbb_cc_merge.head())

# Step 2: attach a2 and MAF to the regression rows (inner join on ID, position, allele).
log_ukbb_cc_merge = pd.merge(left=log_ukbb_cc, right=log_ukbb_cc_merge[['SNP', 'bp', 'a1', 'a2', 'MAF']], left_on=['SNP', 'BP', 'A1'], right_on=['SNP', 'bp', 'a1'])
log_ukbb_cc_merge = log_ukbb_cc_merge[['CHR', 'SNP', 'BP', 'a1', 'a2', 'MAF', 'TEST', 'NMISS', 'OR', 'SE', 'L95', 'U95', 'STAT', 'P']]
print(log_ukbb_cc_merge.shape)
print(log_ukbb_cc_merge.head())

# Position/allele based variant key.
log_ukbb_cc_merge['CHR_BP_A1_A2'] = log_ukbb_cc_merge['CHR'].astype(str) + "_" + log_ukbb_cc_merge['BP'].astype(str) + "_" + log_ukbb_cc_merge['a1'].astype(str) + "_" + log_ukbb_cc_merge['a2'].astype(str)


log_ukbb_cc_merge.to_csv(f"{WRKDIR}/y_ukbb/ukbb_chrY_case_control_logreg_filter.assoc.logistic", index=False, sep="\t")

## UKBB Proxy Control

In [None]:
# Print (do not execute) the PLINK command that subsets the UKBB bfiles to the
# samples listed in ukbb_proxy_control_samples.txt (--keep) and writes new bfiles.
print(f"plink --bfile {WRKDIR}/y_ukbb/chrY_male_only --keep {WRKDIR}/y_ukbb/ukbb_proxy_control_samples.txt --make-bed --out {WRKDIR}/y_ukbb/chrY_male_only_proxy_control")

In [None]:
# Recode phenotype 3 as 2 in the proxy/control .fam and overwrite the file in
# place (tab-separated, no header, no index).
# NOTE(review): presumably 3 marks proxy cases, which are treated as cases (2) -- confirm.
# Raw-string separator avoids the invalid "\s" escape warning.
recode_fam = pd.read_csv(f"{WRKDIR}/y_ukbb/chrY_male_only_proxy_control.fam", sep=r"\s+", header=None)
recode_fam.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']
print(recode_fam['pheno'].value_counts())
recode_fam.loc[recode_fam['pheno'] == 3, 'pheno'] = 2
print(recode_fam['pheno'].value_counts())
recode_fam.to_csv(f"{WRKDIR}/y_ukbb/chrY_male_only_proxy_control.fam", sep="\t", header=False, index=False)

In [None]:
# Build the covariate table for the UKBB proxy/control analysis: load samples,
# PCs, Y-LineageTracker haplogroups and the covariate sheet, inner-join them,
# one-hot encode the major haplogroup, check PC/haplogroup correlations and
# write the PLINK covariate file.
# All regex separators are raw strings to avoid the invalid "\s" escape warning.
# NOTE(review): CARDDIR is not defined in the visible cells of this notebook;
# it must be set before this cell runs.
fam = pd.read_csv(f"{WRKDIR}/y_ukbb/chrY_male_only_proxy_control.fam", sep=r"\s+", header=None)
fam.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']
print(fam.shape)
print(fam.head())
print(fam.pheno.value_counts())

# This PC file is read with its own header row, so no column names are assigned.
auto_pcs = pd.read_csv(f"{CARDDIR}/projects/chromosome_y_expression/ukbb/pcs_proxy_control_pca.txt", sep=r"\s+")
print(auto_pcs.shape)
print(auto_pcs.head())

#use Ylineagetracker haplogroups since that tool assigned the most unique haplogroups compared to other tools
# (the yhaplo calls under output_ukbb/yhaplo_output are not loaded in this cell)
ltrack = pd.read_csv(f"{WRKDIR}/output_ukbb/ltrack_ukbb_hg19.hapresult.hg", sep=r"\s+")
ltrack.columns = ['id', 'haplo']
ltrack['haplo_major'] = ltrack['haplo'].str[0]
# The "." -> "no_match" relabelling used for NeuroX is disabled for this cohort:
#ltrack.loc[ltrack.haplo==".","haplo"] = "no_match"
#ltrack.loc[ltrack.haplo_major==".","haplo_major"] = "no_match"
# Keep only the first half of each ID string, then cast to int64.
# NOTE(review): presumably the IDs are written as a doubled "ID_ID" string -- confirm.
ltrack['id'] = [sample_id[:len(sample_id) // 2] for sample_id in ltrack['id']]
ltrack['id'] = ltrack['id'].astype('int64')
print(ltrack.shape)
print(ltrack.head())
print(len(set(ltrack.haplo)))
print((set(ltrack.haplo_major)))

meta = pd.read_table(f"{CARDDIR}/UKBIOBANK/PHENOTYPE_DATA/covariates_phenome_to_use.txt")
print(meta.shape)
print(meta.head())

# Merge (inner joins; the printed shapes show how many samples survive each step).
merge1 = pd.merge(left=fam[['fid', 'iid', 'sex', 'pheno']], right=auto_pcs[['FID', 'IID', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']], left_on=['fid', 'iid'], right_on=['FID', 'IID'])
print(merge1.shape)
merge2 = pd.merge(left=merge1, right=meta[['FID', 'AGE_OF_RECRUIT']], left_on=['fid'], right_on=['FID'])
print(merge2.shape)
merge3 = pd.merge(left=merge2, right=ltrack[['id', 'haplo_major']], left_on=['fid'], right_on='id')
print(merge3.shape)
print(merge3.head())
meta_merge = merge3[['fid', 'iid', 'pheno', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'AGE_OF_RECRUIT', 'haplo_major']].copy()


meta_merge['haplo_major_orig'] = meta_merge['haplo_major']
# dtype=int forces 0/1 indicators. pandas >= 2.0 defaults to bool, which would
# be written to the covariate file as True/False and would turn the mixed
# float/bool selection passed to np.corrcoef into an object array.
meta_merge = pd.get_dummies(meta_merge, columns=['haplo_major'], dtype=int)
print(meta_merge.head())

meta_merge['PHENO_PLINK'] = meta_merge['pheno'] - 1
meta_merge = meta_merge.rename(columns={"fid": "FID", "iid": "IID"})
print(meta_merge.head())

print(meta_merge.haplo_major_orig.value_counts())

print(",".join(meta_merge.columns))

# Check PC / haplogroup correlations. The list must match the dummy columns
# present for this cohort; selecting a missing column raises a KeyError.
corr_cols = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5',
             'haplo_major_A', 'haplo_major_C', 'haplo_major_E', 'haplo_major_G',
             'haplo_major_H', 'haplo_major_I', 'haplo_major_J', 'haplo_major_K',
             'haplo_major_N', 'haplo_major_O', 'haplo_major_P', 'haplo_major_R',
             'haplo_major_T']
corrs = np.corrcoef(meta_merge[corr_cols].T)
corrdf = pd.DataFrame(corrs, index=corr_cols, columns=corr_cols)
print(corrdf)

print(meta_merge.pheno.value_counts())
meta_merge.to_csv(f"{WRKDIR}/y_ukbb/ukbb_chrY_proxy_control_covars.txt", sep="\t", index=False)

In [None]:
print(meta_merge.haplo_major_orig.value_counts())

In [None]:
# Print the plink command for the UKBB proxy-vs-control logistic regression
# (the command is only printed here, not executed).
print(
    f"plink --bfile {WRKDIR}/y_ukbb/chrY_male_only_proxy_control "
    "--maf 0.05 "
    f"--covar {WRKDIR}/y_ukbb/ukbb_chrY_proxy_control_covars.txt "
    "--covar-name PC1,PC2,PC3,PC4,PC5,AGE_OF_RECRUIT,haplo_major_T,haplo_major_N,haplo_major_J,haplo_major_G,haplo_major_E,haplo_major_K,haplo_major_O,haplo_major_I,haplo_major_R "
    f"--pheno {WRKDIR}/y_ukbb/ukbb_chrY_proxy_control_covars.txt --pheno-name pheno "
    "--freq "
    f"--logistic hide-covar --ci 0.95 --out {WRKDIR}/y_ukbb/ukbb_chrY_proxy_control_logreg"
)

In [None]:
# Load the plink logistic-regression output, most significant variants first.
# The separator is a raw string so the regex "\s+" is not parsed as an (invalid) string escape.
log_ukbb_pc = pd.read_csv(f"{WRKDIR}/y_ukbb/ukbb_chrY_proxy_control_logreg.assoc.logistic", sep=r"\s+").sort_values("P")

print(log_ukbb_pc.shape)
print(log_ukbb_pc.head())

# Drop every row that has a missing value in any column.
log_ukbb_pc = log_ukbb_pc.dropna()
print(log_ukbb_pc.shape)

# Build the masks once: rows with a non-zero P, and of those the nominally significant ones.
nonzero_p = log_ukbb_pc['P'] != 0
nominal_p = nonzero_p & (log_ukbb_pc['P'] < 0.05)
print(log_ukbb_pc[nonzero_p].shape)
print(log_ukbb_pc[nominal_p].shape)
print(log_ukbb_pc[nominal_p])

In [None]:
# Load the allele-frequency table plink wrote alongside the regression (--freq).
# The separator is a raw string so the regex "\s+" is not parsed as an (invalid) string escape.
maf = pd.read_csv(f"{WRKDIR}/y_ukbb/ukbb_chrY_proxy_control_logreg.frq", sep=r"\s+")
print(maf.shape)
print(maf.head())

In [None]:
# Read the plink .bim so each variant's base-pair position and both alleles are available.
# The separator is a raw string so the regex "\s+" is not parsed as an (invalid) string escape.
bim = pd.read_csv(f"{WRKDIR}/y_ukbb/chrY_male_only_proxy_control.bim", sep=r"\s+", header=None)
bim.columns = ['chr','snp','pos','bp','a1','a2']
print(bim.shape)
print(bim.head())

# Step 1: attach position and second allele to the frequency table, matching on SNP id and A1.
log_ukbb_pc_merge = pd.merge(left = maf, right = bim, left_on = ['SNP','A1'],right_on = ['snp','a1'])
log_ukbb_pc_merge = log_ukbb_pc_merge[['CHR','SNP','bp','a1','a2','MAF']]
print(log_ukbb_pc_merge.shape)
print(log_ukbb_pc_merge.head())

# Step 2: attach alleles and MAF to the regression results, matching on SNP id, position and A1.
log_ukbb_pc_merge = pd.merge(left = log_ukbb_pc, right = log_ukbb_pc_merge[['SNP','bp','a1','a2','MAF']], left_on = ['SNP','BP','A1'],right_on = ['SNP','bp','a1'])
log_ukbb_pc_merge = log_ukbb_pc_merge[['CHR','SNP','BP','a1','a2','MAF','TEST','NMISS','OR','SE','L95','U95','STAT','P']]
print(log_ukbb_pc_merge.shape)
print(log_ukbb_pc_merge.head())

# Marker id shared across datasets: chromosome, position and both alleles joined by "_".
log_ukbb_pc_merge['CHR_BP_A1_A2'] = log_ukbb_pc_merge['CHR'].astype(str)+"_"+log_ukbb_pc_merge['BP'].astype(str)+"_"+log_ukbb_pc_merge['a1'].astype(str)+"_"+log_ukbb_pc_merge['a2'].astype(str)


log_ukbb_pc_merge.to_csv(f"{WRKDIR}/y_ukbb/ukbb_chrY_proxy_control_logreg_filter.assoc.logistic",index=None,sep="\t")

##### Check common variants


In [None]:
# Reload the filtered per-dataset association results, printing each table's shape as it loads.
_bfiles_dir = f"{WRKDIR}/y_male_only_bfiles"
_ukbb_dir = f"{WRKDIR}/y_ukbb"

log_amp_pd = pd.read_table(f"{_bfiles_dir}/amppd_case_control_nogcs_logreg_filter.assoc.logistic")
print(log_amp_pd.shape)

log_amp_lbd = pd.read_table(f"{_bfiles_dir}/amppd_lbd_case_control_nogcs_logreg_filter.assoc.logistic")
print(log_amp_lbd.shape)

log_neurox = pd.read_table(f"{WRKDIR}/y_neurox/neurox_chrY_male_only_logreg_filter.assoc.logistic")
print(log_neurox.shape)

log_ukbb_cc = pd.read_table(f"{_ukbb_dir}/ukbb_chrY_case_control_logreg_filter.assoc.logistic")
print(log_ukbb_cc.shape)

log_ukbb_pc = pd.read_table(f"{_ukbb_dir}/ukbb_chrY_proxy_control_logreg_filter.assoc.logistic")
print(log_ukbb_pc.shape)
print(log_ukbb_pc.head())

In [None]:
# Number of distinct marker ids across the AMP-PD, NeuroX and both UKBB tables
# (the LBD table is not included in this union).
_all_markers = set(log_amp_pd.CHR_BP_A1_A2).union(
    log_neurox.CHR_BP_A1_A2,
    log_ukbb_cc.CHR_BP_A1_A2,
    log_ukbb_pc.CHR_BP_A1_A2,
)
len(_all_markers)


## Meta Analyze with METAL

#### Make the METAL script file:
(May want to remove the LBD dataset.)   
```
SCHEME STDERR
AVERAGEFREQ ON
MINMAXFREQ ON
FREQLABEL MAF

MARKER CHR_BP_A1_A2
ALLELE a1 a2
EFFECT log(OR)
STDERR SE
PVALUE P 
WEIGHT NMISS
PROCESS $PATH/amppd_case_control_nogcs_logreg_filter.assoc.logistic

MARKER CHR_BP_A1_A2
ALLELE a1 a2
EFFECT log(OR)
STDERR SE
PVALUE P 
WEIGHT NMISS
PROCESS $PATH/amppd_lbd_case_control_nogcs_logreg_filter.assoc.logistic

MARKER CHR_BP_A1_A2
ALLELE a1 a2
EFFECT log(OR)
STDERR SE
PVALUE P 
WEIGHT NMISS
PROCESS $PATH/neurox_chrY_male_only_logreg_filter.assoc.logistic

MARKER CHR_BP_A1_A2
ALLELE a1 a2
EFFECT log(OR)
STDERR SE
PVALUE P 
WEIGHT NMISS
PROCESS $PATH/ukbb_chrY_case_control_logreg_filter.assoc.logistic

MARKER CHR_BP_A1_A2
ALLELE a1 a2
EFFECT log(OR)
STDERR SE
PVALUE P 
WEIGHT NMISS
PROCESS $PATH/ukbb_chrY_proxy_control_logreg_filter.assoc.logistic

OUTFILE CHRY_SINGLE_VARIANT_TEST_METAL_NEW .tbl
ANALYZE HETEROGENEITY


QUIT
```

In [None]:
# Load the METAL output table and order it by ascending meta-analysis P-value.
metal_results = pd.read_table(f"{WRKDIR}/CHRY_SINGLE_VARIANT_TEST_METAL_NEW1.tbl")
metal_results = metal_results.sort_values("P-value")
print(metal_results.shape)
print(metal_results.head())

In [None]:
# Shape of the subset whose minimum and maximum reported frequencies differ.
metal_results.loc[metal_results['MinFreq'] != metal_results['MaxFreq']].shape

In [None]:
# Shape of the subset with heterogeneity degrees of freedom above zero.
metal_results[metal_results['HetDf'] > 0].shape

### Get Variant Positions and Annotate

In [None]:
# Build an ANNOVAR input table (chr, start, end, ref, alt, snp) from the METAL marker names.
# Marker names are "_"-separated; fields 1, 2 and 3 are used as position, ref and alt.
marker_fields = metal_results['MarkerName'].str.split("_")

avinput = metal_results.copy()
avinput['chr'] = 'Y'
avinput['start'] = marker_fields.str[1].astype('int32')
avinput['ref'] = marker_fields.str[2]
avinput['alt'] = marker_fields.str[3]

# Multi-nucleotide ref alleles span more than one base, so end = start + len(ref) - 1.
avinput['end'] = avinput['start'] + avinput['ref'].str.len() - 1

# ANNOVAR syntax uses "-" where these marker names use "*".
for allele_col in ('ref', 'alt'):
    avinput[allele_col] = avinput[allele_col].replace('*', '-')

avinput = avinput[['chr','start','end','ref','alt','MarkerName']].rename(columns={'MarkerName': 'snp'})

print(avinput.shape)
print(avinput.head())
avinput.to_csv(f"{WRKDIR}/metal_results.avinput", index=None, sep="\t")

In [None]:
# Print the table_annovar.pl command for the avinput file (printed only, not executed).
# The doubled spaces between options are kept exactly as in the original command string.
print(
    f"table_annovar.pl {WRKDIR}/metal_results.avinput $ANNOVAR_DATA/hg19/"
    " -buildver hg19 --thread 16 "
    f" -out {WRKDIR}/metal_results.annovar "
    " -remove -protocol avsnp150,refGene,ensGene,gnomad211_genome "
    " -operation f,g,g,f "
    " -nastring ."
)

In [None]:
# Load the ANNOVAR multianno table, de-duplicate it, drop its first row and make Start an integer.
# NOTE(review): the first row is presumably the avinput header line that was annotated as if it
# were a variant — confirm before changing how the avinput file is written.
metal_anno = (
    pd.read_table(f"{WRKDIR}/metal_results.annovar.hg19_multianno.txt")
    .drop_duplicates()
    .iloc[1:]
    .astype({'Start': 'int64'})
)

# Switch "-" back to "*" so the alleles match the marker names used by METAL.
for allele_col in ('Ref', 'Alt'):
    metal_anno[allele_col] = metal_anno[allele_col].replace('-', '*')

# Rebuild the marker id (chromosome code 24, position, ref, alt) used as the merge key later.
metal_anno['CHR_BP_A1_A2'] = ("24_" + metal_anno['Start'].astype(str)).str.cat(
    [metal_anno['Ref'].astype(str), metal_anno['Alt'].astype(str)], sep="_"
)

print(metal_anno.shape)
print(metal_anno.head())

In [None]:
# List the annotation columns available in the ANNOVAR table.
print(metal_anno.columns)

In [None]:
# Distinct values found in the Func.refGene annotation column.
print(set(metal_anno['Func.refGene']))

In [None]:
# Show only the variants whose Func.refGene annotation is exactly "exonic".
is_exonic = metal_anno['Func.refGene'] == 'exonic'
print(metal_anno.loc[is_exonic])

In [None]:
# Display the column index of the annotation table as the cell output.
metal_anno.columns

In [None]:
# Join the METAL results to their annotations on the marker id, then drop exact duplicate rows.
metal_results_anno = metal_results.merge(
    metal_anno,
    left_on='MarkerName',
    right_on='CHR_BP_A1_A2',
).drop_duplicates()

# Compare row counts before and after the join.
print(metal_anno.shape)
print(metal_results_anno.shape)
print(metal_results_anno.head())

In [None]:
# Display the column index of the merged results/annotation table as the cell output.
metal_results_anno.columns

In [None]:
# Keep the reporting columns, order by P-value, upper-case the METAL allele columns and save to CSV.
_report_cols = [
    'Chr','Start','Allele1','Allele2','Ref','Alt',
    'Freq1','MinFreq','MaxFreq','Effect','StdErr','P-value','Direction',
    'HetISq','HetChiSq','HetDf','HetPVal',
    'avsnp150','Func.refGene','Gene.refGene','GeneDetail.refGene','ExonicFunc.refGene',
]
metal_results_anno = metal_results_anno[_report_cols].sort_values('P-value')
for allele_col in ('Allele1', 'Allele2'):
    metal_results_anno[allele_col] = metal_results_anno[allele_col].str.upper()
print(metal_results_anno.shape)
print(metal_results_anno.head())
metal_results_anno.to_csv(f"{WRKDIR}/metal_results_annotated_new.csv", index=None)

In [None]:
# Func.refGene annotations of the variants with P-value < 0.05: how many there are,
# and which distinct values occur.
nominal_funcs = metal_results_anno.loc[metal_results_anno['P-value'] < 0.05, 'Func.refGene']
print(nominal_funcs.shape)
print(set(nominal_funcs))

In [None]:
# Print every annotated variant with P-value < 0.05.
print(metal_results_anno[metal_results_anno['P-value'] < 0.05])

## Filter annotation by variants in multiple datasets

In [None]:
# Reload the annotated METAL results from the CSV written earlier.
_annotated_csv = f"{WRKDIR}/metal_results_annotated_new.csv"
metal_results_anno = pd.read_csv(_annotated_csv)
print(metal_results_anno.shape)
print(metal_results_anno.head())

In [None]:

# Shape of the subset with at least one heterogeneity degree of freedom.
print(metal_results_anno.loc[metal_results_anno['HetDf'] >= 1].shape)


In [None]:
# Display the first rows of the annotated results as the cell output.
metal_results_anno.head()

In [None]:
# Save only the rows with at least one heterogeneity degree of freedom.
multi_dataset_hits = metal_results_anno.loc[metal_results_anno['HetDf'] >= 1]
multi_dataset_hits.to_csv(f"{WRKDIR}/metal_results_annotated_new_multiple_datasets.csv", index=None)

## Check whether the top hits are present in all datasets

In [None]:
# Reload the annotated METAL results from the CSV written earlier.
_annotated_csv = f"{WRKDIR}/metal_results_annotated_new.csv"
metal_results_anno = pd.read_csv(_annotated_csv)
print(metal_results_anno.head())

In [None]:
# Row counts at the nominal threshold (0.05) and at 0.05 divided by 3387.
p_values = metal_results_anno['P-value']
print(metal_results_anno[p_values < 0.05].shape)
print(metal_results_anno[p_values < 0.05/3387].shape)

In [None]:
# Display the variants passing the 0.05/3387 threshold as the cell output.
metal_results_anno[metal_results_anno['P-value'] < 0.05/3387]

In [None]:
top_hits = metal_results_anno.loc[(metal_results_anno['P-value']<0.05/3320) ,]
for index, row in top_hits.iterrows():

    var = f"{row.Start}_{row.Ref}_{row.Alt}"
    print(index)
    print(var)
    print("amppd")
    !grep {var} {WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_logreg_filter.assoc.logistic
    #!grep {var} {WRKDIR}/y_male_only_bfiles/amppd_lbd_case_control_nogcs_logreg_filter.assoc.logistic
    print("neurox")
    !grep {var} {WRKDIR}/y_neurox/neurox_chrY_male_only_logreg_filter.assoc.logistic
    print("ukbb cc")
    !grep {var} {WRKDIR}/y_ukbb/ukbb_chrY_case_control_logreg_filter.assoc.logistic
    print("ukbb pc")
    !grep {var} {WRKDIR}/y_ukbb/ukbb_chrY_proxy_control_logreg_filter.assoc.logistic
    print("\n")
    
    

## Identify variants in multiple datasets

In [None]:
# Start from an empty frame; the following cells add one (marker id, dataset) table per dataset.
vars_df = pd.DataFrame()

In [None]:
# Collect the AMP-PD marker ids, tagged with their dataset of origin.
amp_pd = pd.read_table(f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_logreg_filter.assoc.logistic")
# .copy() so the new column is set on an independent frame rather than on a slice.
amp_pd = amp_pd[['CHR_BP_A1_A2']].copy()
amp_pd['dataset'] = 'amp_pd'
print(amp_pd.shape)
print(amp_pd.head())

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
vars_df = pd.concat([vars_df, amp_pd])
print(vars_df.shape)

In [None]:
# Collect the NeuroX marker ids, tagged with their dataset of origin.
neurox = pd.read_table(f"{WRKDIR}/y_neurox/neurox_chrY_male_only_logreg_filter.assoc.logistic")
# .copy() so the new column is set on an independent frame rather than on a slice.
neurox = neurox[['CHR_BP_A1_A2']].copy()
neurox['dataset'] = 'neurox'
print(neurox.shape)
print(neurox.head())

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
vars_df = pd.concat([vars_df, neurox])
print(vars_df.shape)

In [None]:
# Collect the UKBB case-control marker ids, tagged with their dataset of origin.
ukbb_cc = pd.read_table(f"{WRKDIR}/y_ukbb/ukbb_chrY_case_control_logreg_filter.assoc.logistic")
# .copy() so the new column is set on an independent frame rather than on a slice.
ukbb_cc = ukbb_cc[['CHR_BP_A1_A2']].copy()
ukbb_cc['dataset'] = 'ukbb_cc'
print(ukbb_cc.shape)
print(ukbb_cc.head())

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
vars_df = pd.concat([vars_df, ukbb_cc])
print(vars_df.shape)

In [None]:
# Collect the UKBB proxy-control marker ids, tagged with their dataset of origin.
ukbb_pc = pd.read_table(f"{WRKDIR}/y_ukbb/ukbb_chrY_proxy_control_logreg_filter.assoc.logistic")
# .copy() so the new column is set on an independent frame rather than on a slice.
ukbb_pc = ukbb_pc[['CHR_BP_A1_A2']].copy()
ukbb_pc['dataset'] = 'ukbb_pc'
print(ukbb_pc.shape)
print(ukbb_pc.head())

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
vars_df = pd.concat([vars_df, ukbb_pc])
print(vars_df.shape)

In [None]:
# Count how many dataset tables each marker id appears in.
marker_counts = vars_df['CHR_BP_A1_A2'].value_counts()
# Pin the names explicitly: pandas >= 2.0 names the counts "count" and moves the column name
# onto the index, which would break the freqs['CHR_BP_A1_A2'] look-ups in the following cells.
marker_counts.name = 'CHR_BP_A1_A2'
marker_counts.index.name = None
freqs = pd.DataFrame(marker_counts)
freqs

In [None]:
# Marker ids counted exactly four times.
freqs.loc[freqs['CHR_BP_A1_A2'] == 4]

In [None]:
# Marker ids counted exactly three times.
freqs.loc[freqs['CHR_BP_A1_A2'] == 3]

In [None]:
# Shape of the subset of marker ids counted exactly twice.
freqs.loc[freqs['CHR_BP_A1_A2'] == 2].shape