# Filter for Hemizygous Variants Only
- **Author(s)** - Frank Grenn
- **Date Started** - March 2021
- **Quick Description:** Filter chrY AMP-PD data to include only male samples, hemizygous genotype calls, and variants with low heterozygosity.

In [None]:
import pandas as pd

In [None]:
# Working directory that holds every intermediate and final PLINK fileset written by this notebook.
# NOTE(review): "$PATH" looks like a placeholder for the real project root — confirm before running.
WRKDIR = "$PATH/chrY/y_male_only_bfiles"

In [None]:
#get the plink files
# --make-bed with no filters rewrites the source chrY fileset under WRKDIR as temp.bed/.bim/.fam,
# giving a scratch copy whose .fam and .bim are edited by the cells below
!(module load plink; plink --bfile $PATH/bfile/chrY --make-bed --out {WRKDIR}/temp)

### update sex and pheno in fam file

In [None]:
# Load the sample table of the scratch fileset.
# Read temp.fam (written by the plink copy step above) rather than a separate chrY.fam:
# the rewritten temp.fam has to line up row-for-row with temp.bed, and temp.fam is the
# file that is guaranteed to share its sample order.
# The separator is a raw string because "\s" is not a valid escape in a normal string literal.
temp = pd.read_table(f"{WRKDIR}/temp.fam", sep=r"\s+", header=None)
temp.columns = ['fid','iid','pid','mid','sex','pheno']
print(temp.shape)
print(temp.head())

In [None]:
# Load the AMP-PD release covariates; later cells use its ID, SEX, PHENO and LATEST_DX columns.
meta = pd.read_csv("$PATH/AMPPD_releasev2_covariates_Feb2021.csv")
print(meta.shape)
print(meta.head())

In [None]:
# Attach covariates to every sample in the .fam.
# A left join keeps exactly one output row per .fam row, in .fam order, which the rewritten
# temp.fam needs in order to stay aligned with temp.bed. validate= makes pandas raise if an ID
# occurs more than once in the covariate file (that would duplicate samples).
tempm = pd.merge(left=temp, right=meta, how="left", left_on='fid', right_on='ID',
                 validate="many_to_one", indicator=True)
missing_ids = tempm.loc[tempm['_merge'] == 'left_only', 'fid'].tolist()
if missing_ids:
    # an inner join would silently drop these samples and shorten the .fam
    raise ValueError(
        f"{len(missing_ids)} samples in the .fam have no covariate row, e.g. {missing_ids[:5]}"
    )
tempm = tempm.drop(columns='_merge')
print(tempm.shape)

In [None]:
# Keep the four .fam ID columns plus the covariate-derived sex, phenotype and latest diagnosis.
# .copy() makes the subset an independent frame so that assigning to its columns in later
# cells does not trigger pandas' SettingWithCopyWarning.
tempm = tempm[['fid','iid','pid','mid','SEX','PHENO','LATEST_DX']].copy()
print(tempm.head())

In [None]:
# replace missing phenotypes with -9 and store the codes as integers, in one chained step
tempm['PHENO'] = tempm['PHENO'].fillna(-9).astype(int)
# distinct phenotype codes present after the fill
set(tempm['PHENO'])

In [None]:
# number of samples (rows, columns) for each phenotype code: 1, 2 and the -9 missing code
for pheno_code in (1, 2, -9):
    print(tempm[tempm.PHENO==pheno_code].shape)

In [None]:
# samples whose phenotype is the -9 missing code
tempm.loc[tempm['PHENO'] == -9]

In [None]:
# latest-diagnosis breakdown of the samples with phenotype code 1
tempm.loc[tempm['PHENO'] == 1, 'LATEST_DX'].value_counts()

In [None]:
# latest-diagnosis breakdown of the samples with phenotype code 2
tempm.loc[tempm['PHENO'] == 2, 'LATEST_DX'].value_counts()

In [None]:
# latest-diagnosis breakdown of the samples with the -9 missing phenotype code
tempm.loc[tempm['PHENO'] == -9, 'LATEST_DX'].value_counts()

In [None]:
# overwrite the scratch .fam with the covariate-derived sex and phenotype:
# tab-separated, no header row and no index column
fam_columns = ['fid','iid','pid','mid','SEX','PHENO']
tempm[fam_columns].to_csv(f"{WRKDIR}/temp.fam", sep="\t", header=False, index=False)

### give variants unique ids
or at least rename the variants that have an rsID to `rsid_a1_a2`

In [None]:
# Load the scratch .bim (tab-separated, no header) and label its six columns.
# NOTE(review): in the PLINK .bim layout the third column is the genetic distance and the
# fourth is the base-pair position; 'pos'/'bp' are just the labels used here — confirm
# before relying on 'pos' as a coordinate.
tempbim = pd.read_table(f"{WRKDIR}/temp.bim",sep="\t",header=None)
tempbim.columns = ['chr','snp','pos','bp','a1','a2']
print(tempbim.shape)
print(tempbim.head())

In [None]:
# list every row whose variant ID occurs more than once, grouped by ID
tempbim[tempbim['snp'].duplicated(keep=False)].sort_values('snp')

In [None]:
# number of distinct variant IDs (compare with the row count printed above)
len(set(tempbim['snp']))

In [None]:
# preview the variants whose ID contains 'rs'
tempbim.loc[tempbim['snp'].str.contains('rs')].head()

In [None]:
#set snp col values with rsids to rsid_a1_a2 to make them unique
# build the row mask once and reuse it for both the read side and the write side
has_rsid = tempbim['snp'].str.contains('rs')
rs_rows = tempbim.loc[has_rsid]
tempbim.loc[has_rsid, 'snp'] = rs_rows['snp'] + '_' + rs_rows['a1'] + '_' + rs_rows['a2']

In [None]:
# after the rename this should be empty if every variant ID is now unique
tempbim[tempbim['snp'].duplicated(keep=False)].sort_values('snp')

In [None]:
# spot-check the first rows after the rename
tempbim.head()

In [None]:
# spot-check the last rows after the rename
tempbim.tail()

In [None]:
# row count versus number of distinct IDs; the two match when every ID is unique
print(tempbim.shape)
print(len(set(tempbim['snp'].tolist())))

In [None]:
# overwrite the scratch .bim with the renamed variant IDs:
# tab-separated, no header row and no index column
tempbim.to_csv(
    f"{WRKDIR}/temp.bim",
    sep="\t",
    header=False,
    index=False,
)


### remove females

In [None]:
# --filter-males keeps only the samples coded male in the (rewritten) .fam; output fileset: temp_males
!(module load plink; plink --bfile {WRKDIR}/temp --filter-males --make-bed --out {WRKDIR}/temp_males)

### check frequencies

In [None]:
# --freqx writes a per-variant genotype-count report to temp_males.frqx
!(module load plink; plink --bfile {WRKDIR}/temp_males --freqx --out {WRKDIR}/temp_males)

In [None]:
# load the genotype-count report for the males and preview both ends of it
count = pd.read_table(f"{WRKDIR}/temp_males.frqx")
print(count.shape)
print(count.head())
print(count.tail())

### try --set-hh-missing

In [None]:
# --set-hh-missing sets heterozygous haploid calls to missing; output fileset: temp_males_miss
!(module load plink; plink --bfile {WRKDIR}/temp_males --set-hh-missing --make-bed --out {WRKDIR}/temp_males_miss)

In [None]:
# recompute the genotype-count report after --set-hh-missing (written to temp_males_miss.frqx)
!(module load plink; plink --bfile {WRKDIR}/temp_males_miss --freqx --out {WRKDIR}/temp_males_miss)

In [None]:
# load the post --set-hh-missing genotype counts and preview both ends
miss_count = pd.read_table(f"{WRKDIR}/temp_males_miss.frqx")
print(miss_count.shape)
print(miss_count.head())
print(miss_count.tail())

### Do the same, but relabel the chromosome as an autosome (chr22) to compare genotype counts

In [None]:
# make a second, unfiltered copy of the scratch fileset (temp22) whose .bim is relabelled below
!(module load plink; plink --bfile {WRKDIR}/temp --make-bed --out {WRKDIR}/temp22)

In [None]:
# Load the .bim of the temp22 copy and label its six columns.
# The separator is a raw string because "\s" is not a valid escape in a normal string literal.
temp22bim = pd.read_table(f"{WRKDIR}/temp22.bim", sep=r"\s+", header=None)
temp22bim.columns = ['chr','snp','pos','bp','a1','a2']
print(temp22bim.shape)
print(temp22bim.head())

In [None]:
# relabel every variant as chromosome 22; the 'chr' column keeps its position in the frame
temp22bim = temp22bim.assign(chr=22)

In [None]:
# overwrite temp22.bim with the chr22-relabelled variants:
# tab-separated, no header row and no index column
temp22bim.to_csv(
    f"{WRKDIR}/temp22.bim",
    sep="\t",
    header=False,
    index=False,
)

In [None]:
# same male-only filter as before, applied to the chr22-relabelled copy; output fileset: temp22_males
!(module load plink; plink --bfile {WRKDIR}/temp22 --filter-males --make-bed --out {WRKDIR}/temp22_males)

In [None]:
# genotype-count report for the chr22-relabelled males (written to temp22_males.frqx)
!(module load plink; plink --bfile {WRKDIR}/temp22_males --freqx --out {WRKDIR}/temp22_males)

In [None]:
# load the genotype counts of the chr22-relabelled males; its C(HET) column is what the
# heterozygosity filter further down is based on
recode_count = pd.read_table(f"{WRKDIR}/temp22_males.frqx")
print(recode_count.shape)
print(recode_count.head())
#print(recode_count.tail())

In [None]:
#compare with the --set-hh-missing results
# print the chrY counts again so they can be read next to the chr22-relabelled counts above
print(miss_count.shape)
print(miss_count.head())

#### So it looks like the heterozygous calls were correctly set to missing

### Identify Variants with High Heterozygosity 

In [None]:
#how many samples
# the .fam has one line per sample, so its line count is the number of samples in temp22_males
!wc -l {WRKDIR}/temp22_males.fam

In [None]:
#how many variants are heterozygous in 10% or more of the samples
recode_count[recode_count['C(HET)']/5470>=0.1].shape

In [None]:
recode_count[recode_count['C(HET)']/5470>=0.1]

In [None]:
bad_variants = list(set(recode_count[recode_count['C(HET)']/5470>=0.1]['SNP']))

In [None]:
# number of variants flagged for exclusion
len(bad_variants)

In [None]:
# sanity check: same number as above when the list holds no repeated IDs
len(set(bad_variants))

In [None]:
# write the flagged variant IDs one per line, with no header row and no index column
exclusion_ids = pd.Series(bad_variants, name='bad_variants', dtype=object)
exclusion_ids.to_csv(f"{WRKDIR}/high_heterozygous_variants.txt", index=False, header=False)

In [None]:
#drop the high heterozygous variants
# --exclude removes the variants whose IDs are listed in the file; input is the males-only,
# het-haploid-set-to-missing fileset and the result is written under its final name
!(module load plink; plink --bfile {WRKDIR}/temp_males_miss --exclude {WRKDIR}/high_heterozygous_variants.txt --make-bed --out {WRKDIR}/chrY_male_hemizygous_only_het_filter)

### cleanup

In [None]:
# save the males-only, het-haploid-set-to-missing fileset (without the heterozygosity filter)
# under its final name; this has to run before the temp* files are removed below
!(module load plink; plink --bfile {WRKDIR}/temp_males_miss --make-bed --out {WRKDIR}/chrY_male_hemizygous_only)

In [None]:
# delete every scratch file in WRKDIR whose name starts with "temp"; the chrY_male_* outputs
# and high_heterozygous_variants.txt do not match the pattern and are kept
!rm {WRKDIR}/temp*