# Single Variant Testing 
- **Author(s)** - Frank Grenn
- **Date Started** - August 2021
- **Quick Description:** Y chromosome single-variant tests using the AMPPD data as it was before the hg38-to-hg19 liftover, so that more variants are included.

In [None]:
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import numpy as np
import sys

In [None]:
# Directory prefix used by the f-string file paths in this notebook.
# NOTE: Python does not expand shell variables inside string literals, so
# "$PATH" is used verbatim here; NOTE(review): presumably a placeholder for
# the real project directory — replace before running.
WRKDIR = "$PATH/chrY"

## Get AMPPD Covariate Data

In [None]:
# Load the .fam sample table (whitespace-delimited, no header row) and name
# its six columns.
fam = pd.read_csv(
    f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs.fam",
    # Raw string: "\s" is an invalid escape sequence in a normal literal and
    # triggers a SyntaxWarning on recent Python versions.
    sep=r"\s+",
    header=None,
)
fam.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']
print(fam.shape)
print(fam.head())

In [None]:
# Count how many samples carry each value of the pheno column.
fam['pheno'].value_counts()

In [None]:
# Load the autosomal principal components (whitespace-delimited, no header
# row): two id columns followed by pc1..pc20.
auto_pcs = pd.read_csv(
    f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_autosome_pcs.eigenvec",
    # Raw string: "\s" is an invalid escape sequence in a normal literal.
    sep=r"\s+",
    header=None,
)
auto_pcs.columns = ['fid', 'iid'] + [f'pc{n}' for n in range(1, 21)]
print(auto_pcs.shape)
print(auto_pcs.head())

In [None]:
# Check yhaplo haplogroups: load the yhaplo output (whitespace-delimited, no
# header row) and name its four columns.
yhaplo = pd.read_csv(
    f"{WRKDIR}/output_male_hemizygous_only_het_filter_run/output_yhaplo/haplogroups.chrY_male_hemizygous_only_het_filter_hg19_final.txt",
    # Raw string: "\s" is an invalid escape sequence in a normal literal.
    sep=r"\s+",
    header=None,
)
yhaplo.columns = ['id', 'yhaplo_haplo_short', 'yhaplo_haplo_short_rep_snp', 'yhaplo_haplo']
# Major haplogroup = first character of the full haplogroup string.
yhaplo['yhaplo_haplo_major'] = yhaplo['yhaplo_haplo'].str[0]
# Keep only the first half of each id string (integer division of its length).
# NOTE(review): presumably ids have the form "<ID>_<ID>", so this recovers a
# single sample ID — confirm against the yhaplo output.
yhaplo['id'] = [sample_id[:len(sample_id) // 2] for sample_id in yhaplo['id']]
print(yhaplo.shape)
print(yhaplo.head())

In [None]:
# Use Ylineagetracker haplogroups since that tool assigned the most unique
# haplogroups compared to other tools.
ltrack = pd.read_csv(
    f"{WRKDIR}/output_male_hemizygous_only_het_filter_run/output_ltracker/ltrack_hg19.hapresult.hg",
    # Raw string: "\s" is an invalid escape sequence in a normal literal.
    sep=r"\s+",
)
# The file's own header row is read and then replaced with these two names.
ltrack.columns = ['id', 'ltrack_haplo']
# Major haplogroup = first character of the haplogroup string.
ltrack['ltrack_haplo_major'] = ltrack['ltrack_haplo'].str[0]
# Keep only the first half of each id string (integer division of its length).
# NOTE(review): presumably ids have the form "<ID>_<ID>" — confirm.
ltrack['id'] = [sample_id[:len(sample_id) // 2] for sample_id in ltrack['id']]
print(ltrack.shape)
print(ltrack.head())
# Number of distinct haplogroups assigned.
print(len(set(ltrack['ltrack_haplo'])))

In [None]:
# Load the covariate table and show its dimensions and first rows.
meta = pd.read_csv("$PATH/AMPPD_releasev2_covariates_Feb2021.csv")
print(meta.shape, meta.head(), sep="\n")

In [None]:
# Assemble the per-sample covariate table through successive joins (inner,
# the pandas default). The shape is printed after each join so that any loss
# of samples is visible.

# fam + autosomal PCs, matched on both id columns.
merge1 = fam[['fid', 'iid', 'sex', 'pheno']].merge(
    auto_pcs[['fid', 'iid', 'pc1', 'pc2', 'pc3', 'pc4', 'pc5']],
    on=['fid', 'iid'],
)
print(merge1.shape)

# + baseline age from the covariate table.
merge2 = merge1.merge(
    meta[['ID', 'AGE_BASELINE']],
    left_on=['fid'],
    right_on=['ID'],
)
print(merge2.shape)

# + Ylineagetracker haplogroups.
merge3 = merge2.merge(
    ltrack[['id', 'ltrack_haplo', 'ltrack_haplo_major']],
    left_on=['fid'],
    right_on='id',
)
print(merge3.shape)

# + yhaplo haplogroups.
merge4 = merge3.merge(
    yhaplo[['id', 'yhaplo_haplo', 'yhaplo_haplo_major']],
    left_on=['fid'],
    right_on='id',
)
print(merge4.shape, merge4.head(), sep="\n")

# Keep only the columns used downstream; copy so later edits do not touch merge4.
meta_merge = merge4[[
    'fid', 'iid', 'pheno',
    'pc1', 'pc2', 'pc3', 'pc4', 'pc5',
    'AGE_BASELINE',
    'ltrack_haplo', 'ltrack_haplo_major',
    'yhaplo_haplo', 'yhaplo_haplo_major',
]].copy()

In [None]:
# Shape of the subset where both tools assign the same major haplogroup.
meta_merge[meta_merge['yhaplo_haplo_major'] == meta_merge['ltrack_haplo_major']].shape

In [None]:
# Rows where the two tools assign different major haplogroups.
meta_merge[meta_merge['yhaplo_haplo_major'] != meta_merge['ltrack_haplo_major']]

In [None]:
# Sample count per yhaplo major haplogroup.
meta_merge['yhaplo_haplo_major'].value_counts()

In [None]:
# Sample count per Ylineagetracker major haplogroup.
meta_merge['ltrack_haplo_major'].value_counts()

In [None]:

# One-hot encode the major haplogroup from each tool, keeping an untouched
# "<column>_orig" copy of the categorical value next to the indicator columns.
# dtype=int forces 0/1 indicators: newer pandas versions default to bool,
# which would be written to the covariate file as True/False rather than as
# numbers.
for haplo_col in ('yhaplo_haplo_major', 'ltrack_haplo_major'):
    meta_merge[f'{haplo_col}_orig'] = meta_merge[haplo_col]
    meta_merge = pd.get_dummies(meta_merge, columns=[haplo_col], dtype=int)
print(meta_merge.head())
# The pheno column is intentionally left as read from the .fam file.

In [None]:
# Upper-case the id column names, then add PHENO_PLINK as pheno shifted down
# by one. The logistic-regression command printed later selects the original
# pheno column via --pheno-name, not PHENO_PLINK.
meta_merge = meta_merge.rename(columns={"fid": "FID", "iid": "IID"})
meta_merge['PHENO_PLINK'] = meta_merge['pheno'] - 1
print(meta_merge.head())

In [None]:
# Sample count per PHENO_PLINK value.
meta_merge.loc[:, 'PHENO_PLINK'].value_counts()

In [None]:
# Show the dtype of every column in the covariate table.
meta_merge.dtypes

In [None]:
# Comma-separated list of all covariate column names (handy for --covar-name).
",".join(meta_merge.columns.tolist())

In [None]:
# Write the covariate table as tab-separated text, with a header row and
# without the row index.
meta_merge.to_csv(
    f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_covars.txt",
    sep="\t",
    index=False,
)

## Filter SNPs

### try using data with no liftover (hg38)

#### filter out multiallelic variants, insertions/deletions, and `*` alleles

In [None]:
# Load the pre-liftover .bim variant table (tab-delimited, no header row).
snps = pd.read_csv(
    f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_before_liftover.bim",
    sep="\t",
    header=None,
    names=['chr', 'snp', 'pos', 'bp', 'ref', 'alt'],
)
print(snps.shape)

In [None]:
# Drop every record whose ref or alt allele is "*".
snps_filter = snps[~((snps['ref'] == "*") | (snps['alt'] == "*"))]
print(snps_filter.shape)

In [None]:
# Remove insertions/deletions: keep records where both alleles are exactly
# one character long.
single_base_mask = (snps_filter['ref'].str.len() == 1) & (snps_filter['alt'].str.len() == 1)
snps_filter = snps_filter[single_base_mask]
print(snps_filter.shape)

In [None]:
# Remove multiallelic sites: discard every record whose bp value occurs more
# than once (all copies are dropped, none is kept).
# NOTE(review): duplicates are evaluated on the already-filtered table, so a
# position whose other records were removed by the earlier filters is
# retained — confirm this is intended.
shared_bp_mask = snps_filter.duplicated(subset="bp", keep=False)
snps_filter = snps_filter[~shared_bp_mask]
print(snps_filter.shape)

In [None]:
# Preview the first and last rows of the filtered variant table.
print(snps_filter.head(), snps_filter.tail(), sep="\n")

In [None]:
# Number of distinct variant IDs remaining after filtering.
len(set(snps_filter['snp']))

In [None]:
# Write the surviving variant IDs, one per line, with no header and no index,
# for use as a PLINK --extract list.
snps_filter[['snp']].to_csv(
    f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_before_liftover_snps_only.txt",
    header=False,
    index=False,
)

#### filter for only the snps we want

In [None]:
# Print (not run) the PLINK command that keeps only the listed variant IDs
# and writes a new binary fileset. The pieces are joined with single spaces
# to form one command line.
print(" ".join([
    f"plink --bfile {WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_before_liftover",
    f"--extract {WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_before_liftover_snps_only.txt",
    f"--make-bed --out {WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_before_liftover_snps_only",
]))

#### do the logistic regression


In [None]:
# Print (not run) the PLINK logistic-regression command. The pieces are
# joined with single spaces to form one command line.
# Alternative haplogroup covariate sets that can be appended to --covar-name:
#,ltrack_haplo_major_E,ltrack_haplo_major_G,ltrack_haplo_major_I,ltrack_haplo_major_J,ltrack_haplo_major_R
#,yhaplo_haplo_major_E,yhaplo_haplo_major_G,yhaplo_haplo_major_I,yhaplo_haplo_major_J,yhaplo_haplo_major_R
print(" ".join([
    f"plink --bfile {WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_before_liftover_snps_only",
    "--maf 0.05",
    f"--covar {WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_covars.txt",
    "--covar-name pc1,pc2,pc3,pc4,pc5,AGE_BASELINE,ltrack_haplo_major_G,ltrack_haplo_major_I,ltrack_haplo_major_J",
    f"--pheno {WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_covars.txt --pheno-name pheno",
    "--freq",
    f"--logistic hide-covar --ci 0.95 --out {WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_before_liftover_snps_only_logreg",
]))

In [None]:
# Load the PLINK logistic-regression output (whitespace-delimited, with a
# header row) and show the rows with the smallest P values.
logistic_results = pd.read_table(
    f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_before_liftover_snps_only_logreg.assoc.logistic",
    # Raw string: "\s" is an invalid escape sequence in a normal literal.
    sep=r"\s+",
)
print(logistic_results.shape)
print(logistic_results.sort_values("P").head())


In [None]:
# Load the allele-frequency report written alongside the logistic results
# (whitespace-delimited, with a header row).
maf = pd.read_csv(
    f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_before_liftover_snps_only_logreg.frq",
    # Raw string: "\s" is an invalid escape sequence in a normal literal.
    sep=r"\s+",
)
print(maf.shape)
print(maf.head())

In [None]:
# Load the pre-liftover .bim variant table (no header row) to supply the
# position and both alleles for each variant ID.
bim = pd.read_csv(
    f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_before_liftover.bim",
    # Raw string: "\s" is an invalid escape sequence in a normal literal.
    sep=r"\s+",
    header=None,
)
bim.columns = ['chr', 'snp', 'pos', 'bp', 'a1', 'a2']
print(bim.shape)
print(bim.head())



In [None]:
# Attach position and alleles from the .bim table to each frequency record,
# matching on variant ID and A1 allele (inner join), then keep the columns
# needed downstream.
snp_mafs = maf.merge(
    bim,
    left_on=['SNP', 'A1'],
    right_on=['snp', 'a1'],
)
snp_mafs = snp_mafs[['CHR', 'SNP', 'bp', 'a1', 'a2', 'MAF']]
print(snp_mafs.shape, snp_mafs.head(), sep="\n")

In [None]:
# Add allele and MAF information to the logistic results, matching on
# variant ID, position and A1 allele (inner join).
logistic_merge = logistic_results.merge(
    snp_mafs[['SNP', 'bp', 'a1', 'a2', 'MAF']],
    left_on=['SNP', 'BP', 'A1'],
    right_on=['SNP', 'bp', 'a1'],
)
logistic_merge = logistic_merge[[
    'CHR', 'SNP', 'BP', 'a1', 'a2', 'MAF',
    'TEST', 'NMISS', 'OR', 'SE', 'L95', 'U95', 'STAT', 'P',
]]
print(logistic_merge.dtypes)
# Variant key "<CHR>_<BP>_<a1>_<a2>", used later to join the annotations.
logistic_merge['CHR_BP_A1_A2'] = (
    logistic_merge['CHR'].astype(str)
    + "_" + logistic_merge['BP'].astype(str)
    + "_" + logistic_merge['a1'].astype(str)
    + "_" + logistic_merge['a2'].astype(str)
)
print(logistic_merge.shape, logistic_merge.head(), sep="\n")


In [None]:
# Independent copy of the merged results; no rows are filtered at this step.
logistic_merge_filter = logistic_merge.copy(deep=True)

#### annotate

In [None]:
# Build the ANNOVAR input table (chr, start, end, ref, alt, snp) from the
# "<CHR>_<BP>_<a1>_<a2>" key of every tested variant.
key_fields = logistic_merge_filter['CHR_BP_A1_A2'].str.split("_")

avinput = pd.DataFrame(index=logistic_merge_filter.index)
avinput['chr'] = 'Y'
avinput['start'] = key_fields.str[1].astype('int32')
# End position accounts for ref alleles longer than one nucleotide.
avinput['end'] = avinput['start'] + key_fields.str[2].str.len() - 1
# Replace * with - for ANNOVAR syntax (whole-value replacement).
avinput['ref'] = key_fields.str[2].replace('*', '-')
avinput['alt'] = key_fields.str[3].replace('*', '-')
# Carry the original key along so annotations can be joined back.
avinput['snp'] = logistic_merge_filter['CHR_BP_A1_A2']

print(avinput.shape, avinput.head(), sep="\n")
# NOTE(review): the column-name header row is written to the file as well —
# confirm the annotation step tolerates it.
avinput.to_csv(
    f"{WRKDIR}/amppd_case_control_before_liftover_filter_alleles.avinput",
    index=False,
    sep="\t",
)

In [None]:
# Print (not run) the table_annovar.pl command for the hg38 annotation. The
# adjacent literals concatenate into one command line with the spacing kept.
print(
    f"table_annovar.pl {WRKDIR}/amppd_case_control_before_liftover_filter_alleles.avinput "
    "$ANNOVAR_DATA/hg38/ -buildver hg38 --thread 16  "
    f"-out {WRKDIR}/amppd_case_control_before_liftover_filter_alleles.annovar  "
    "-remove -protocol avsnp150,refGene,ensGene,gnomad211_genome  "
    "-operation f,g,g,f  "
    "-nastring ."
)


In [None]:
# Load the ANNOVAR multianno table, drop exact duplicate rows, skip the first
# remaining row and make Start an integer column.
# NOTE(review): the skipped first row is presumably the header line of the
# .avinput file carried through by ANNOVAR — confirm.
anno = (
    pd.read_table(f"{WRKDIR}/amppd_case_control_before_liftover_filter_alleles.annovar.hg38_multianno.txt")
    .drop_duplicates()
    .iloc[1:]
    .astype({'Start': 'int64'})
)

# Undo the '-' placeholder so alleles match the original '*' notation.
for allele_col in ('Ref', 'Alt'):
    anno[allele_col] = anno[allele_col].replace('-', '*')

# Rebuild the variant key with the fixed chromosome code 24.
anno['CHR_BP_A1_A2'] = (
    "24_" + anno['Start'].astype(str)
    + "_" + anno['Ref'].astype(str)
    + "_" + anno['Alt'].astype(str)
)

print(anno.shape, anno.head(), sep="\n")

In [None]:
# Join the selected annotation columns onto the association results by
# variant key (inner join).
merge_filter = logistic_merge_filter.merge(
    anno[[
        'CHR_BP_A1_A2',
        'avsnp150',
        'Func.refGene',
        'Gene.refGene',
        'GeneDetail.refGene',
        'ExonicFunc.refGene',
    ]],
    on="CHR_BP_A1_A2",
)
print(merge_filter.shape, merge_filter.head(), sep="\n")

In [None]:
# Write the annotated results sorted by ascending P, tab-separated, with a
# header row and without the row index.
merge_filter.sort_values(by="P").to_csv(
    f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs_before_liftover_logreg_filter_alleles_new_annotated.assoc.logistic",
    index=False,
    sep="\t",
)