# Subset AMP-PD Data for PD Case Control and LBD Case Control and Calculate PCs
- **Author(s):** Frank Grenn
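- **Quick Description:** Subset the male-only chrY AMP-PD data into European-ancestry PD and LBD case-control datasets (excluding PPMI genetic carriers), and print the PLINK commands used to subset the binaries and compute autosomal PCs.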

In [None]:
import pandas as pd

In [None]:
# Root working directory; every chrY input/output path below is built from it.
WRKDIR = "/PATH/chrY"

In [None]:
# Load the male-only chrY .fam file (whitespace-delimited, no header row).
# The separator is a raw string so "\s" reaches the regex engine instead of
# being parsed as an (invalid) Python string escape.
samples = pd.read_csv(
    f"{WRKDIR}/y_male_only_bfiles/chrY_male_hemizygous_only_het_filter_hg19_final.fam",
    sep=r"\s+",
    header=None,
)
# The file has no header, so name the six columns explicitly.
samples.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']
print(samples.shape)
print(samples.head())

In [None]:
# Read the AMP-PD release v2 covariate table, then show its dimensions and
# first rows as a sanity check.
meta = pd.read_csv("/PATH/AMPPD_releasev2_covariates_Feb2021.csv")
for preview in (meta.shape, meta.head()):
    print(preview)

In [None]:
# Attach covariates to the .fam samples by matching the .fam 'fid' to the
# covariate 'ID' (default inner join: only samples present in both are kept).
meta_merge = samples.merge(meta, left_on='fid', right_on='ID')
print(meta_merge.shape)
print(meta_merge.head())

In [None]:
# Tally how many merged samples fall under each latest diagnosis label.
dx_counts = meta_merge['LATEST_DX'].value_counts()
print(dx_counts)

In [None]:
# Rows whose ID contains 'PP-' (presumably the PPMI cohort, per the variable name).
# .copy() gives an independent frame so the PATNO column added next is not
# written to a slice of meta_merge (avoids pandas' SettingWithCopyWarning).
ppmi_meta = meta_merge[meta_merge.ID.str.contains('PP-')].copy()

In [None]:
# PATNO is the second '-'-separated field of the sample ID.
id_fields = ppmi_meta['ID'].str.split('-')
ppmi_meta['PATNO'] = id_fields.str.get(1)

In [None]:
# Display (rows, columns) of the 'PP-' subset.
ppmi_meta.shape

In [None]:
# 5-digit PATNOs should be genetic carriers. This cell only displays how many
# such rows exist; the actual removal happens when samples_no_gcs is built.
ppmi_meta[ppmi_meta.PATNO.str.len()==5].shape

In [None]:
# IDs of the samples with a 5-character PATNO (treated as genetic carriers).
gc_ids = ppmi_meta.loc[ppmi_meta.PATNO.str.len() == 5, 'ID'].tolist()
print(len(gc_ids))
# Preview the first ten IDs; the earlier slice [1:10] silently skipped element 0.
print(gc_ids[:10])

In [None]:
# Drop every merged sample whose ID is in the genetic-carrier list.
is_genetic_carrier = meta_merge['ID'].isin(gc_ids)
samples_no_gcs = meta_merge[~is_genetic_carrier]
print(samples_no_gcs.shape)

In [None]:
# Read the genetic ancestry / PCA table, then show its dimensions and first rows.
anc = pd.read_csv("/PATH/genetic_ancestry_all_pca.csv")
for preview in (anc.shape, anc.head()):
    print(preview)

In [None]:
# Join ancestry calls onto the carrier-free samples (covariate 'ID' matched to
# ancestry 'IID'; default inner join) and report the size before filtering.
eur_samples_no_gcs = samples_no_gcs.merge(anc, left_on="ID", right_on="IID")
print(eur_samples_no_gcs.shape)

# Retain only samples whose inferred population is "EUROPE", then report again.
is_european = eur_samples_no_gcs['InfPop'] == "EUROPE"
eur_samples_no_gcs = eur_samples_no_gcs[is_european]
print(eur_samples_no_gcs.shape)
print(eur_samples_no_gcs.head())

In [None]:
# Display the column names available after the covariate and ancestry merges.
eur_samples_no_gcs.columns

## AMP-PD Case Control

In [None]:
# Tally latest-diagnosis labels among the European, carrier-free samples.
eur_dx_counts = eur_samples_no_gcs['LATEST_DX'].value_counts()
print(eur_dx_counts)

In [None]:
# Cross-check diagnosis labels against each .fam phenotype code, in the
# order 1, 2, -9.
for pheno_code in (1, 2, -9):
    with_code = eur_samples_no_gcs[eur_samples_no_gcs.pheno == pheno_code]
    print(with_code.LATEST_DX.value_counts())



In [None]:
# Keep the samples whose .fam phenotype is 1 or 2 and write their fid/iid
# pairs to the keep-list consumed by the plink --keep commands printed below.
cc = eur_samples_no_gcs[eur_samples_no_gcs.pheno.isin([1, 2])]
print(cc.head())
# to_csv documents header/index as booleans; pass False explicitly instead of
# relying on None being falsy. Output: tab-separated, no header, no index.
cc[['fid', 'iid']].to_csv(
    f"{WRKDIR}/y_male_only_bfiles/pd_eur_case_control_samples.txt",
    sep="\t",
    header=False,
    index=False,
)

In [None]:
# Print (not run) the plink command that subsets the chrY binaries to the
# PD case-control keep-list. No PCs are computed by this command.
pd_subset_cmd = (
    f"plink --bfile {WRKDIR}/y_male_only_bfiles/chrY_male_hemizygous_only_het_filter_hg19_final"
    f" --keep {WRKDIR}/y_male_only_bfiles/pd_eur_case_control_samples.txt"
    " --make-bed"
    f" --out {WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs"
)
print(pd_subset_cmd)

In [None]:
# Print (not run) the plink command that computes PCs for the PD case-control
# samples from the pruned dataset with chromosome codes 23, 24 and 25 excluded.
pd_pca_cmd = (
    "plink --bfile /PATH/pruned_data"
    f" --keep {WRKDIR}/y_male_only_bfiles/pd_eur_case_control_samples.txt"
    " --not-chr 23,24,25 --pca --make-bed"
    f" --out {WRKDIR}/y_male_only_bfiles/amppd_case_control_autosome_pcs"
)
print(pd_pca_cmd)

## AMP-PD LBD Case Control

In [None]:
# Diagnosis labels per .fam phenotype code (1, 2, -9), shown again as the
# starting point for picking LBD cases and controls.
dx_by_pheno = [
    eur_samples_no_gcs.loc[eur_samples_no_gcs['pheno'] == code, 'LATEST_DX']
    for code in (1, 2, -9)
]
for dx in dx_by_pheno:
    print(dx.value_counts())

In [None]:
# LBD cases: samples whose latest diagnosis is either of the two labels below.
lbd_labels = ['LBD', 'Dementia With Lewy Bodies']
lbd_samples = eur_samples_no_gcs[eur_samples_no_gcs['LATEST_DX'].isin(lbd_labels)]
print(lbd_samples.shape[0])

In [None]:
# Controls: samples whose latest diagnosis is the "no neurological disorder" label.
is_control_dx = eur_samples_no_gcs['LATEST_DX'].eq('No PD Nor Other Neurological Disorder')
control_samples = eur_samples_no_gcs[is_control_dx]
print(control_samples.shape[0])

In [None]:
# Build the LBD case-control table: every sample that is an LBD case or a control.
# .copy() makes lbd_cc an independent frame, so the phenotype assignments below
# are not made on a slice of eur_samples_no_gcs (avoids SettingWithCopyWarning
# and any ambiguity about whether the parent frame is modified).
lbd_fids = lbd_samples.fid.tolist()
control_fids = control_samples.fid.tolist()
lbd_cc = eur_samples_no_gcs[
    eur_samples_no_gcs.fid.isin(lbd_fids) | eur_samples_no_gcs.fid.isin(control_fids)
].copy()
# Reset to missing (-9) first, then code LBD cases as 2 and controls as 1.
# Bracket assignment is used because attribute-style column assignment is fragile.
lbd_cc['pheno'] = -9
lbd_cc.loc[lbd_cc.fid.isin(lbd_fids), 'pheno'] = 2
lbd_cc.loc[lbd_cc.fid.isin(control_fids), 'pheno'] = 1
print(lbd_cc.shape)


In [None]:
# Display how many samples carry each recoded phenotype value.
lbd_cc['pheno'].value_counts()

In [None]:
# Write the fid/iid pairs of the LBD cases and controls to the keep-list
# consumed by the plink --keep commands printed below.
# to_csv documents header/index as booleans; pass False explicitly instead of
# relying on None being falsy. Output: tab-separated, no header, no index.
lbd_cc[['fid', 'iid']].to_csv(
    f"{WRKDIR}/y_male_only_bfiles/lbd_eur_case_control_samples.txt",
    sep="\t",
    header=False,
    index=False,
)

In [None]:
# Print (not run) the plink command that subsets the chrY binaries to the
# LBD case-control keep-list. No PCs are computed by this command.
lbd_subset_cmd = (
    f"plink --bfile {WRKDIR}/y_male_only_bfiles/chrY_male_hemizygous_only_het_filter_hg19_final"
    f" --keep {WRKDIR}/y_male_only_bfiles/lbd_eur_case_control_samples.txt"
    " --make-bed"
    f" --out {WRKDIR}/y_male_only_bfiles/amppd_lbd_case_control_nogcs"
)
print(lbd_subset_cmd)

In [None]:
# Print (not run) the plink command that computes PCs for the LBD case-control
# samples from the pruned dataset with chromosome codes 23, 24 and 25 excluded.
lbd_pca_cmd = (
    "plink --bfile /PATH/pruned_data"
    f" --keep {WRKDIR}/y_male_only_bfiles/lbd_eur_case_control_samples.txt"
    " --not-chr 23,24,25 --pca --make-bed"
    f" --out {WRKDIR}/y_male_only_bfiles/amppd_lbd_case_control_autosome_pcs"
)
print(lbd_pca_cmd)

In [None]:
# Load the .fam of the LBD subset so its phenotype column can be recoded.
# read_csv is used to match the other .fam reads in this notebook, and the
# separator is a raw string so "\s" is not parsed as an invalid string escape.
fam = pd.read_csv(
    f"{WRKDIR}/y_male_only_bfiles/amppd_lbd_case_control_nogcs.fam",
    sep=r"\s+",
    header=None,
)
# The file has no header, so name the six columns explicitly.
fam.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']
print(fam.shape)
print(fam.head())
# Phenotype distribution before recoding.
print(fam.pheno.value_counts())

In [None]:
# Recode phenotypes in the LBD .fam table: LBD cases become 2, controls become 1.
# Rows matching neither list keep their existing value.
is_lbd_case = fam['fid'].isin(lbd_samples['fid'].tolist())
is_control = fam['fid'].isin(control_samples['fid'].tolist())
fam.loc[is_lbd_case, 'pheno'] = 2
fam.loc[is_control, 'pheno'] = 1
print(fam['pheno'].value_counts())

In [None]:
# Overwrite the LBD .fam in place with the recoded phenotypes
# (space-separated, no header row, no index column).
# to_csv documents header/index as booleans; pass False explicitly instead of
# relying on None being falsy.
fam.to_csv(
    f"{WRKDIR}/y_male_only_bfiles/amppd_lbd_case_control_nogcs.fam",
    header=False,
    index=False,
    sep=" ",
)

In [None]:
# Read back the PD case-control .fam as a final check of its size and
# phenotype counts. Raw-string separator avoids the invalid "\s" string escape.
pdcc = pd.read_csv(
    f"{WRKDIR}/y_male_only_bfiles/amppd_case_control_nogcs.fam",
    sep=r"\s+",
    header=None,
)
pdcc.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']

print(pdcc.shape)
print(pdcc.head())
print(pdcc.pheno.value_counts())

In [None]:
# Read back the LBD case-control .fam as a final check of its size and
# phenotype counts. Raw-string separator avoids the invalid "\s" string escape.
lbdcc = pd.read_csv(
    f"{WRKDIR}/y_male_only_bfiles/amppd_lbd_case_control_nogcs.fam",
    sep=r"\s+",
    header=None,
)
lbdcc.columns = ['fid', 'iid', 'pid', 'mid', 'sex', 'pheno']

print(lbdcc.shape)
print(lbdcc.head())
print(lbdcc.pheno.value_counts())