# Setup NEUROX Y Chromosome Files for Y Haplogroup Tools
- **Author(s)** - Frank Grenn
- **Quick Description:** Remove females from the chrY data and calculate autosomal PCs for the case-control NeuroX dataset.

In [None]:
import pandas as pd

In [None]:
# Base directory for the chrY work; the two directories below are derived from it.
WRKDIR = "/PATH/chrY"
# BFILEDIR and OUTDIR resolve to the same NeuroX subdirectory of WRKDIR.
BFILEDIR = OUTDIR = WRKDIR + "/y_neurox"

### Get samples to use from the covariate file

In [None]:
# Load the tab-delimited covariate table, then show its dimensions and first rows.
covs = pd.read_csv("/PATH/IPDGC_all_samples_covariates.txt", sep="\t")
for summary in (covs.shape, covs.head()):
    print(summary)

In [None]:
# Display the covariate table's column labels (rendered as the cell's output).
covs.columns

In [None]:
# Distinct values present in the PHENO_PLINK column.
set(covs['PHENO_PLINK'])

In [None]:
# Distinct values present in the PHENO column.
set(covs['PHENO'])

In [None]:
# A PLINK .fam file has no header row and is whitespace-delimited, so read it
# with header=None (otherwise the first sample is consumed as column names and
# silently dropped) and split on any run of whitespace. This matches how the
# male-only .fam is read later in this notebook.
neurox_fam = pd.read_table("/PATH/RawData_BinaryFormat.fam", sep=r"\s+", header=None)
neurox_fam.columns = ['fid','iid','pid','mid','sex','pheno']
print(neurox_fam.shape)
print(neurox_fam.head())
# Tally of each phenotype code found in the 'pheno' column.
print(neurox_fam['pheno'].value_counts())

In [None]:
# Restrict the covariates to rows whose DATASET is NEUROX_DBGAP and report the size.
is_neurox = covs["DATASET"] == "NEUROX_DBGAP"
covs_neurox = covs[is_neurox]
print(covs_neurox.shape)

In [None]:
# Preview the first rows of the NeuroX covariate subset.
neurox_preview = covs_neurox.head()
print(neurox_preview)

In [None]:
# Write the FID column alone (no index, no header row) to the sample-list file.
covs_neurox[["FID"]].to_csv(
    f"{OUTDIR}/samples_to_use.txt",
    index=False,
    header=False,
)

In [None]:
# Print (rather than run) the PLINK commands that keep only the listed families
# from the raw binaries and write them out as the "neurox" fileset.
print("module load plink")
print(
    "plink --bfile /PATH/RawData_BinaryFormat"
    f" --keep-fam {OUTDIR}/samples_to_use.txt"
    f" --make-bed --out {OUTDIR}/neurox"
)

### Remove females and non-chrY variants

In [None]:
# IPython shell escape: load PLINK, then keep males only (--filter-males) and
# chromosome code 24 only (--chr 24), writing a new binary fileset to OUTDIR.
!(module load plink; plink --bfile {OUTDIR}/neurox --filter-males --chr 24 --make-bed --out {OUTDIR}/neurox_chrY_male_only)

### Check Phenotypes

In [None]:
# Read the male-only .fam written by the preceding PLINK step (no header row,
# whitespace-delimited). The separator is a raw string so that "\s" is passed
# through as a regex token instead of being an invalid string escape, which
# raises a warning on recent Python versions.
fam = pd.read_table(f"{OUTDIR}/neurox_chrY_male_only.fam", sep=r"\s+", header=None)
fam.columns = ['fid','iid','pid','mid','sex','pheno']
print(fam.shape)
print(fam.head())

In [None]:
# Show the distinct sex and phenotype codes present in the filtered .fam.
for fam_column in ("sex", "pheno"):
    print(set(fam[fam_column]))

In [None]:
# Left-join the covariates onto the .fam rows by family ID, then compare the
# joined row count with the number of rows whose .fam phenotype equals PHENO_PLINK.
merged = fam.merge(covs, how="left", left_on="fid", right_on="FID")
print(merged.shape)
pheno_agrees = merged["pheno"] == merged["PHENO_PLINK"]
print(merged[pheno_agrees].shape)

In [None]:
# Display how many merged rows carry each .fam phenotype code.
merged['pheno'].value_counts()

### Convert to VCF

In [None]:
# IPython shell escape: load PLINK and export the male-only chrY fileset as VCF
# (--recode vcf), using the same output prefix as the binary fileset.
!(module load plink;plink --bfile {OUTDIR}/neurox_chrY_male_only --recode vcf --out {OUTDIR}/neurox_chrY_male_only)

### Calculate Autosomal PCs

In [None]:
# Print the command that extracts the first two columns (FID, IID) of the
# male-only .fam into a sample list. The list is written into OUTDIR because the
# PLINK commands printed later in this notebook read {OUTDIR}/male_samples.txt;
# a bare relative path would only work when the command is run from OUTDIR.
# NOTE(review): cut splits on tabs by default - confirm the .fam is tab-delimited,
# otherwise whole lines are copied through unchanged.
print(f"cut -f 1,2 {OUTDIR}/neurox_chrY_male_only.fam > {OUTDIR}/male_samples.txt")

In [None]:
# Print the two PLINK commands for LD pruning on the male samples: first compute
# the prune-in variant list, then extract those variants into a new binary fileset.
raw_bfile = "/PATH/RawData_BinaryFormat"
males = f"{OUTDIR}/male_samples.txt"
prune_prefix = f"{OUTDIR}/neurox_case_control_allchr_pruning"
print(f"plink --bfile {raw_bfile} --keep-fam {males} --indep-pairwise 1000 10 0.02 --out {prune_prefix}")
print(f"plink --bfile {raw_bfile} --keep-fam {males} --extract {prune_prefix}.prune.in --make-bed --out {OUTDIR}/neurox_case_control_allchr_pruned")

In [None]:
# Print the PLINK commands that run --pca on the pruned male data with
# chromosome codes 23-26 excluded, writing results under the "_pcs" prefix.
print("module load plink")
pruned_prefix = f"{OUTDIR}/neurox_case_control_allchr_pruned"
print(
    f"plink --bfile {pruned_prefix}"
    f" --keep-fam {OUTDIR}/male_samples.txt"
    " --must-have-sex --not-chr 23,24,25,26 --pca --make-bed"
    f" --out {pruned_prefix}_pcs"
)