# Setup UKBioBank Y Chromosome Files for Y Haplogroup Tools
- **Author(s)** - Frank Grenn
- **Date Started** - March 2021
- **Quick Description:** Remove females, keep European samples, identify PD cases and proxy cases (unaffected samples whose father has PD), randomly split the controls between the case-control and proxy-control datasets, and calculate autosomal PCs for each dataset. All steps use UKBB data.

In [None]:
import pandas as pd
import random

In [None]:
# Directory layout used by every cell below.
# "$PATH" is the path prefix exactly as written in the notebook.
WRKDIR = "$PATH/chrY"
# chrY plink filesets are read from and written to the same sub-directory
BFILEDIR = WRKDIR + "/y_ukbb"
OUTDIR = WRKDIR + "/y_ukbb"
# root under which the UKBB genotype data and project folders are referenced
CARDDIR = "$PATH"

### Remove females

In [None]:
# Keep only male samples (plink --filter-males) from the UKBB chrY calls and
# write them as a new binary fileset named chrY_male_only in BFILEDIR.
!(module load plink; plink --bed {BFILEDIR}/ukb_cal_chrY_v2.bed --bim {BFILEDIR}/ukb_snp_chrY_v2.bim --fam {BFILEDIR}/ukb33601_cal_chr1_v2_s488363.fam --filter-males --make-bed --out {BFILEDIR}/chrY_male_only)

### Keep only European Samples

In [None]:
# Load the tab-separated covariate table and preview its size and first rows.
anc = pd.read_csv("$PATH/covariates_phenome_to_use.txt", sep="\t")
for preview in (anc.shape, anc.head()):
    print(preview)

In [None]:
# Rows of the covariate table whose EUROPEAN flag equals 1.
eur = anc.loc[anc["EUROPEAN"] == 1]
print(eur.shape)
# Write the FID/IID pairs as a headerless, tab-separated list of samples to keep.
eur_sample_ids = eur.loc[:, ["FID", "IID"]]
eur_sample_ids.to_csv(f"{BFILEDIR}/eur_samples.txt", sep="\t", header=None, index=None)

In [None]:
# Print (not run) the plink command that subsets the male-only fileset to the
# samples listed in eur_samples.txt, producing chrY_eur_male_only.
keep_eur_cmd = (
    f"plink --bfile {BFILEDIR}/chrY_male_only "
    f"--keep {BFILEDIR}/eur_samples.txt "
    f"--make-bed --out {BFILEDIR}/chrY_eur_male_only"
)
print(keep_eur_cmd)

### Update Phenotypes

In [None]:
# Load the European, male-only .fam file (whitespace-delimited, no header).
# The separator is a raw string: "\s" in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
fam = pd.read_table(f"{BFILEDIR}/chrY_eur_male_only.fam", sep=r"\s+", header=None)
# PLINK .fam column order: family ID, individual ID, father ID, mother ID, sex, phenotype
fam.columns = ['fid','iid','pid','mid','sex','pheno']
print(fam.shape)
print(fam.head())

In [None]:
# Load the tab-separated Parkinson's disease table and preview it.
pd_ukbb_codes = pd.read_csv("$PATH/parkinson_disease.txt", sep="\t")
for preview in (pd_ukbb_codes.shape, pd_ukbb_codes.head()):
    print(preview)
# distinct values found in column 131023-0.0
distinct_codes = set(pd_ukbb_codes['131023-0.0'])
print(distinct_codes)

In [None]:
# Load the headerless, tab-separated list of PD samples and label its two
# columns as family and individual IDs.
pd_ukbb_phenos = pd.read_csv("$PATH/parkinson_disease_plink.txt", sep="\t", header=None)
pd_ukbb_phenos.columns = ['fid','iid']
for preview in (pd_ukbb_phenos.shape, pd_ukbb_phenos.head()):
    print(preview)

In [None]:
# Load the comma-separated "disease of father" table and preview it.
father_illness_path = "$PATH/disease_of_father.txt.csv"
proxy = pd.read_csv(father_illness_path)
for preview in (proxy.shape, proxy.head()):
    print(preview)

In [None]:
#11 is for PD
# Keep every row in which at least one column equals 11.
# `axis` must be passed by keyword: pandas 2.x no longer accepts it positionally.
# NOTE(review): the comparison covers every column of `proxy`, including the ID
# column — presumably no sample ID equals 11; confirm.
pd_proxies = proxy.loc[(proxy == 11).any(axis=1)]

In [None]:
#how many PD cases also have PD fathers?
# Build the lookup set once; the previous form rebuilt and scanned the full
# list of case IDs for every proxy sample.
pd_case_fids = set(pd_ukbb_phenos.fid.tolist())
len([i for i in pd_proxies.eid.tolist() if i in pd_case_fids])

In [None]:
#get the samples with PD fathers that don't have PD themselves
# Build the lookup set once; the previous form rebuilt and scanned the full
# list of case IDs for every proxy sample. Order of the result is unchanged.
pd_case_fids = set(pd_ukbb_phenos.fid.tolist())
pd_proxy_list = [i for i in pd_proxies.eid.tolist() if i not in pd_case_fids]

In [None]:
# number of samples with a PD father that are not PD cases themselves
len(pd_proxy_list)

Set phenotypes: 1 for control, 2 for PD case, 3 for proxy case (father with PD, sample not a PD case).

In [None]:
# every sample starts as a control (1); cases and proxies are overwritten afterwards
fam["pheno"] = 1

In [None]:
# samples whose family ID appears in the PD list are marked as cases (2)
is_pd_case = fam["fid"].isin(pd_ukbb_phenos["fid"].tolist())
fam.loc[is_pd_case, "pheno"] = 2

In [None]:
# samples whose family ID appears in the proxy list are marked as proxies (3)
is_pd_proxy = fam["fid"].isin(pd_proxy_list)
fam.loc[is_pd_proxy, "pheno"] = 3

In [None]:
# shape of the PD case rows (count noted in the notebook: 2170)
fam.loc[fam["pheno"] == 2].shape

In [None]:
# shape of the control rows (count noted in the notebook: 217002)
fam.loc[fam["pheno"] == 1].shape

In [None]:
# shape of the proxy rows (count noted in the notebook: 4334)
fam.loc[fam["pheno"] == 3].shape

In [None]:
# Overwrite the .fam file with the updated phenotype column
# (space-delimited, no header row, no index column).
fam_path = f"{BFILEDIR}/chrY_eur_male_only.fam"
fam.to_csv(fam_path, sep=" ", header=None, index=None)

### Check variant frequencies

In [None]:
# Write a plink --freqx report for the male-only chrY fileset.
# NOTE(review): this runs on chrY_male_only, not on the European-only
# chrY_eur_male_only subset — confirm that is intended.
!(module load plink; plink --bfile {BFILEDIR}/chrY_male_only --freqx --out {BFILEDIR}/chrY_male_only)

### Convert to VCF

In [None]:
# Export the male-only chrY fileset as VCF (plink --recode vcf).
# NOTE(review): this runs on chrY_male_only, not on the European-only
# chrY_eur_male_only subset — confirm that is intended.
!(module load plink;plink --bfile {BFILEDIR}/chrY_male_only --recode vcf --out {BFILEDIR}/chrY_male_only)

### Get Case-Control and Proxy-Control Sample List
Randomly split the controls: one third go to the case-control dataset and the rest to the proxy-control dataset.

In [None]:
# Re-read the .fam file that now carries the 1/2/3 phenotype codes.
# The separator is a raw string: "\s" in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
fam = pd.read_csv(f"{BFILEDIR}/chrY_eur_male_only.fam", header=None, sep=r"\s+")
# PLINK .fam column order is father ID then mother ID; the previous labels
# ('mat','pat') had the two swapped. Only fid, iid and pheno are used below.
fam.columns = ['fid','iid','pid','mid','sex','pheno']
print(fam.shape)
print(fam.head())
print(fam['pheno'].value_counts())

In [None]:
# group sizes in phenotype-code order: controls (1), cases (2), proxies (3)
for pheno_code in (1, 2, 3):
    print(fam[fam.pheno == pheno_code].shape)

In [None]:
# one third of the control count, kept as a float here
control_rows = fam.index[fam.pheno == 1]
num_controls = len(control_rows) / 3
num_controls

In [None]:
#random sample the controls
# A fixed seed makes the control split reproducible across notebook runs.
# A dedicated generator is used so the global `random` state is untouched.
CONTROL_SPLIT_SEED = 2021
control_fids = fam[fam.pheno==1].fid.tolist()
random_control_ids_for_case_control = random.Random(CONTROL_SPLIT_SEED).sample(control_fids, int(num_controls))
print(len(random_control_ids_for_case_control))
print(random_control_ids_for_case_control[0:10])

In [None]:
#use the rest for the proxy-control analysis
# Set difference: every control that was NOT drawn for the case-control set.
# (The previous symmetric difference `^` gives the same result only while the
# sampled IDs are a strict subset of the current controls.)
temp = set(fam[fam.pheno==1].fid.tolist()) - set(random_control_ids_for_case_control)

random_control_ids_for_proxy_control = list(temp)
print(len(random_control_ids_for_proxy_control))
print(random_control_ids_for_proxy_control[0:10])

In [None]:
# case-control dataset: all PD cases plus the controls sampled for this analysis
is_case = fam["pheno"] == 2
is_sampled_control = fam["fid"].isin(random_control_ids_for_case_control)
case_control_df = fam.loc[is_case | is_sampled_control]
print(set(case_control_df.pheno))
for preview in (case_control_df.shape, case_control_df.head()):
    print(preview)

In [None]:
# headerless, tab-separated fid/iid list of the case-control samples
case_control_ids = case_control_df.loc[:, ["fid", "iid"]]
case_control_ids.to_csv(f"{BFILEDIR}/ukbb_case_control_samples.txt", sep="\t", index=None, header=None)


In [None]:
# proxy-control dataset: all proxy cases plus the controls reserved for this analysis
is_proxy = fam["pheno"] == 3
is_reserved_control = fam["fid"].isin(random_control_ids_for_proxy_control)
proxy_control_df = fam.loc[is_proxy | is_reserved_control]
print(set(proxy_control_df.pheno))
for preview in (proxy_control_df.shape, proxy_control_df.head()):
    print(preview)

In [None]:
# headerless, tab-separated fid/iid list of the proxy-control samples
proxy_control_ids = proxy_control_df.loc[:, ["fid", "iid"]]
proxy_control_ids.to_csv(f"{BFILEDIR}/ukbb_proxy_control_samples.txt", sep="\t", index=None, header=None)

### Calculate Autosomal PCs

#### case control

In [None]:
#subset all chromosomes and combine
for i in range(1,23):
    print(i)
    !echo plink --bed {CARDDIR}/UKBIOBANK/GENOTYPE_DATA/ukb_cal_chr{i}_v2.bed --bim {CARDDIR}/UKBIOBANK/GENOTYPE_DATA/ukb_snp_chr{i}_v2.bim --fam {CARDDIR}/UKBIOBANK/GENOTYPE_DATA/ukb33601_cal_chr1_v2_s488363.fam --keep-fam {BFILEDIR}/ukbb_case_control_samples.txt --make-bed --out {CARDDIR}/projects/chromosome_y_expression/ukbb/chr{i}_case_control >> {CARDDIR}/projects/chromosome_y_expression/ukbb/subset_chr.swarm
    !echo {CARDDIR}/projects/chromosome_y_expression/ukbb/chr{i}_case_control >> {CARDDIR}/projects/chromosome_y_expression/ukbb/merge_list.txt

In [None]:
# line count of the case-control keep list (one fid/iid pair per line, no header)
!wc -l {BFILEDIR}/ukbb_case_control_samples.txt

In [None]:
# Print (not run) the plink command that merges the per-chromosome filesets
# listed in merge_list.txt into one case-control fileset.
ukbb_dir = f"{CARDDIR}/projects/chromosome_y_expression/ukbb"
merge_cmd = f"plink --merge-list {ukbb_dir}/merge_list.txt --make-bed --out {ukbb_dir}/ukbb_case_control_allchr"
print(merge_cmd)

In [None]:
# Print (not run) the two pruning commands: compute the --indep-pairwise prune
# list, then extract the retained variants into the *_pruned fileset.
cc_prefix = f"{CARDDIR}/projects/chromosome_y_expression/ukbb/ukbb_case_control_allchr"
prune_cmds = [
    f"plink --bfile {cc_prefix} --indep-pairwise 1000 10 0.02 --out {cc_prefix}_pruning",
    f"plink --bfile {cc_prefix} --extract {cc_prefix}_pruning.prune.in --make-bed --out {cc_prefix}_pruned",
]
for cmd in prune_cmds:
    print(cmd)


In [None]:
# Print (not run) the two-line flashpca invocation for the pruned case-control fileset.
flashpca_lines = [
    "module load flashpca",
    f"flashpca --bfile {CARDDIR}/projects/chromosome_y_expression/ukbb/ukbb_case_control_allchr_pruned --suffix _case_control_pca.txt --numthreads 28",
]
print("\n".join(flashpca_lines))

In [None]:
# Alternative to flashpca (slow): print the plink --pca command requesting
# 5 PCs on the pruned case-control fileset.
cc_pruned_prefix = f"{CARDDIR}/projects/chromosome_y_expression/ukbb/ukbb_case_control_allchr_pruned"
print(f"plink --bfile {cc_pruned_prefix} --not-chr 23,24,25,26 --pca 5 --out {cc_pruned_prefix}_pcs")

#### proxy control

In [None]:
#subset all chromosomes and combine
for i in range(1,23):
    print(i)
    !echo plink --bed {CARDDIR}/UKBIOBANK/GENOTYPE_DATA/ukb_cal_chr{i}_v2.bed --bim {CARDDIR}/UKBIOBANK/GENOTYPE_DATA/ukb_snp_chr{i}_v2.bim --fam {CARDDIR}/UKBIOBANK/GENOTYPE_DATA/ukb33601_cal_chr1_v2_s488363.fam --keep-fam {BFILEDIR}/ukbb_proxy_control_samples.txt --make-bed --out {CARDDIR}/projects/chromosome_y_expression/ukbb/chr{i}_proxy_control >> {CARDDIR}/projects/chromosome_y_expression/ukbb/subset_chr.swarm
    !echo {CARDDIR}/projects/chromosome_y_expression/ukbb/chr{i}_proxy_control >> {CARDDIR}/projects/chromosome_y_expression/ukbb/merge_list.txt

In [None]:
#merge
print(f"plink --merge-list {CARDDIR}/projects/chromosome_y_expression/ukbb/merge_list.txt --make-bed --out {CARDDIR}/projects/chromosome_y_expression/ukbb/ukbb_proxy_control_allchr")

In [None]:
# Print (not run) the two pruning commands: compute the --indep-pairwise prune
# list, then extract the retained variants into the *_pruned fileset.
pc_prefix = f"{CARDDIR}/projects/chromosome_y_expression/ukbb/ukbb_proxy_control_allchr"
prune_cmds = [
    f"plink --bfile {pc_prefix} --indep-pairwise 1000 10 0.02 --out {pc_prefix}_pruning",
    f"plink --bfile {pc_prefix} --extract {pc_prefix}_pruning.prune.in --make-bed --out {pc_prefix}_pruned",
]
for cmd in prune_cmds:
    print(cmd)


In [None]:
# Print (not run) the two-line flashpca invocation for the pruned proxy-control fileset.
flashpca_lines = [
    "module load flashpca",
    f"flashpca --bfile {CARDDIR}/projects/chromosome_y_expression/ukbb/ukbb_proxy_control_allchr_pruned --suffix _proxy_control_pca.txt --numthreads 28",
]
print("\n".join(flashpca_lines))

In [None]:
# Alternative to flashpca (slow): print the plink --pca command for the pruned
# proxy-control fileset.
# NOTE(review): the case-control command passes `--pca 5`, while this one gives
# no count and so uses plink's default — confirm the difference is intended.
pc_pruned_prefix = f"{CARDDIR}/projects/chromosome_y_expression/ukbb/ukbb_proxy_control_allchr_pruned"
print(f"plink --bfile {pc_pruned_prefix} --not-chr 23,24,25,26 --pca --out {pc_pruned_prefix}_pcs")