# Collect Variant Population Frequencies Using Annovar
- **Author** - Frank Grenn
- **Date Started** - April 2020
- **Quick Description:** Use ANNOVAR to get population frequencies for all GWAS risk variants shown in the app.

In [None]:
import pandas as pd

In [None]:
# Root directory of the app's processed data, and the working directory
# (a subfolder of it) where this notebook reads and writes its files.
DATADIR = "$PATH/AppDataProcessing"
WRKDIR = DATADIR + "/othersummarystats"

## Get Population Frequencies From gnomAD using ANNOVAR

In [None]:
# Load the GWAS risk variant table, then print its dimensions and first rows.
variants_path = f"{DATADIR}/gwas_risk_variants.csv"
variants = pd.read_csv(variants_path)
for preview in (variants.shape, variants.head()):
    print(preview)

In [None]:


# Build the ANNOVAR input table with five columns: chromosome, start, end,
# reference allele, alternate allele. BP is selected twice so that it is
# used as both the start and the end position.
avinput = variants[['CHR', 'BP', 'BP', 'REF', 'ALT']]
# header=False: the avinput format has no header row, so the pandas column
# names must not be written (ANNOVAR would read them as a malformed variant).
avinput.to_csv(
    f"{WRKDIR}/gwas_risk_variants.avinput",
    index=False,
    header=False,
    sep=' ',
)

In [None]:
# Write the bash script that loads the annovar module and runs
# annotate_variation.pl in --filter mode against the gnomad211_genome
# database (hg19) on the avinput file.
script_lines = [
    "#!/bin/bash",
    "module load annovar",
    "annotate_variation.pl --filter --build hg19 --dbtype gnomad211_genome --buildver hg19 "
    f"--otherinfo {WRKDIR}/gwas_risk_variants.avinput $ANNOVAR_DATA/hg19",
]
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(f"{WRKDIR}/get_frequencies_annovar.sh", "w") as bash_file:
    bash_file.write("\n".join(script_lines))

In [None]:
# Print the sbatch command for the ANNOVAR job script.
# The command is only printed here, not executed.
submit_command = f"sbatch {WRKDIR}/get_frequencies_annovar.sh"
print(submit_command)

gnomad211_genome:
 Chr    Start   End     Ref     Alt     AF      AF_popmax       AF_male AF_female       AF_raw  AF_afr  AF_sas  AF_amr  AF_eas  AF_nfe  AF_fin  AF_asj  AF_oth   non_topmed_AF_popmax    non_neuro_AF_popmax     non_cancer_AF_popmax    controls_AF_popmax

gnomad_genome:
#Chr    Start   End     Ref     Alt     gnomAD_genome_ALL       gnomAD_genome_AFR       gnomAD_genome_AMR       gnomAD_genome_ASJ       gnomAD_genome_EAS       gnomAD_genome_FIN        gnomAD_genome_NFE       gnomAD_genome_OTH

In [None]:
# Read the tab-separated, header-less "_dropped" file and label its three
# columns: database name, comma-joined frequency string, original input line.
dropped_path = f"{WRKDIR}/gwas_risk_variants.avinput.hg19_gnomad211_genome_dropped"
freqs = pd.read_csv(dropped_path, sep="\t", header=None)
freqs.columns = ['db', 'freqs', 'snp']
for preview in (freqs.shape, freqs.head()):
    print(preview)

In [None]:
# Split the space-delimited `snp` string into one column per variant field.
variant_fields = ['CHR', 'START', 'END', 'REF', 'ALT']
snp_parts = freqs['snp'].str.split(" ", expand=True)
freqs[variant_fields] = snp_parts
print(freqs.head())

In [None]:
# Split the comma-delimited `freqs` string into one column per gnomAD
# frequency field, in the order the fields appear in the string.
frequency_fields = [
    'AF', 'AF_popmax', 'AF_male', 'AF_female', 'AF_raw',
    'AF_afr', 'AF_sas', 'AF_amr', 'AF_eas', 'AF_nfe', 'AF_fin', 'AF_asj', 'AF_oth',
    'non_topmed_AF_popmax', 'non_neuro_AF_popmax',
    'non_cancer_AF_popmax', 'controls_AF_popmax',
]
frequency_parts = freqs['freqs'].str.split(",", expand=True)
freqs[frequency_fields] = frequency_parts
print(freqs.head())

In [None]:
# Build the "CHR:START" key used to join against the variant table, then
# drop the raw and intermediate columns that are no longer needed.
chrom = freqs['CHR'].astype(str)
start = freqs['START'].astype(str)
freqs['CHR_BP'] = chrom + ":" + start
freqs = freqs.drop(columns=['freqs', 'snp', 'db', 'CHR', 'REF', 'ALT'])
print(freqs.head())

In [None]:
# Inner-join the variant table with the gnomAD frequencies on the CHR_BP key.
merged = variants.merge(freqs, how='inner', on='CHR_BP')
for preview in (merged.shape, merged.head()):
    print(preview)

In [None]:
# Print the column labels of the merged frame.
print(merged.columns)

In [None]:
# Keep only the listed columns of the merged frame, in this order.
ordered_columns = [
    'BP', 'CHR', 'CHR_BP', 'GWAS', 'LOC_NUM', 'NEAR_GENE', 'RSID',
    'REF', 'ALT', 'START', 'END',
    'AF', 'AF_popmax', 'AF_male', 'AF_female', 'AF_raw',
    'AF_afr', 'AF_sas', 'AF_amr', 'AF_eas', 'AF_nfe', 'AF_fin', 'AF_asj', 'AF_oth',
    'non_topmed_AF_popmax', 'non_neuro_AF_popmax',
    'non_cancer_AF_popmax', 'controls_AF_popmax',
]
merged = merged[ordered_columns]

In [None]:
# Preview the first rows of `merged`. As a bare expression this is only
# rendered when it is the last statement of a notebook cell.
merged.head()

## Get Frequencies from our Plink Files 
#### first get case and control frequency

In [None]:
# Path prefix of the PLINK binary fileset passed to `plink --bfile`.
plink_bin = (
    "$PATH/PD_FINAL_PLINK_2018"
    "/HARDCALLS_PD_september_2018_no_cousins"
)

In [None]:
# Print the shell commands that run a PLINK --assoc analysis and write its
# output under the "pd_freq" prefix. The commands are printed, not executed.
plink_commands = (
    "module load plink",
    f"plink --bfile {plink_bin} --assoc --out pd_freq",
)
for command in plink_commands:
    print(command)

In [None]:
# Print the awk command that extracts fields 2, 5 and 6 of pd_freq.assoc
# into pd_freq.txt. The command is printed, not executed.
awk_command = "awk '{print $2,$5,$6}' pd_freq.assoc > pd_freq.txt"
print(awk_command)

In [None]:
# Load the space-separated table extracted from pd_freq.assoc.
pd_freq_path = "$PATH/pd_freq.txt"
data = pd.read_csv(pd_freq_path, sep=' ')
for preview in (data.shape, data.head()):
    print(preview)

In [None]:
# Inner-join the gnomAD-annotated variants with the association table
# (CHR_BP on the left, SNP on the right), then drop the redundant SNP key.
freqs_assoc = (
    merged
    .merge(data, how='inner', left_on='CHR_BP', right_on='SNP')
    .drop(columns='SNP')
)
for preview in (freqs_assoc.shape, freqs_assoc.head()):
    print(preview)

#### second, get AFF and UNAFF values

In [None]:
# Print the PLINK --model command for the same fileset (printed, not executed).
model_command = f"plink --bfile {plink_bin} --model"
print(model_command)

In [None]:
# Print the grep commands that build model_geno.txt from plink.model:
# first the lines matching "SNP", then the appended lines matching "GENO".
# The commands are printed, not executed.
grep_commands = (
    "grep SNP $PATH/plink.model > model_geno.txt",
    "grep GENO $PATH/plink.model >> model_geno.txt",
)
for command in grep_commands:
    print(command)

In [None]:
# Load the whitespace-delimited model_geno.txt table.
# The separator is a raw string so that `\s` reaches the regex engine intact
# instead of being an invalid string escape (a SyntaxWarning on newer Pythons).
model = pd.read_csv("$PATH/model_geno.txt", sep=r'\s+')
print(model.shape)
print(model.head())

In [None]:
# Keep only the variant ID and the AFF / UNAFF columns.
model_columns = ['SNP', 'AFF', 'UNAFF']
model_aff = model[model_columns]

In [None]:
# Inner-join the AFF / UNAFF columns onto the frequency table
# (CHR_BP on the left, SNP on the right), then drop the redundant SNP key.
freqs_model = (
    freqs_assoc
    .merge(model_aff, how='inner', left_on='CHR_BP', right_on='SNP')
    .drop(columns='SNP')
)
for preview in (freqs_model.shape, freqs_model.head()):
    print(preview)

In [None]:
# Rename the F_A / F_U columns to the labels used in the final output.
frequency_labels = {
    "F_A": "Frequency_PD",
    "F_U": "Frequency_control",
}
freqs_model = freqs_model.rename(columns=frequency_labels)

In [None]:
# Show the column labels of `freqs_model`. As a bare expression this is only
# rendered when it is the last statement of a notebook cell.
freqs_model.columns

## Check That Frequencies Match the Minor Allele, Which Will Be the Effect Allele in the Browser
This currently applies only to the META5 GWAS risk variants. Frequencies for the Progression and Asian GWAS variants are already reported for the minor allele.

In [None]:
# This file should have the effect allele assigned as the minor allele,
# which is what the gnomAD ALT allele is compared against below.
meta5_path = f"{DATADIR}/META5Loci.csv"
meta5_columns = ['RSID', 'EFFECT_FREQ', 'EFFECT_ALLELE', 'OTHER_ALLELE']
meta5_loci = pd.read_csv(meta5_path)
meta5_loci = meta5_loci[meta5_columns]
for preview in (meta5_loci.shape, meta5_loci.head()):
    print(preview)

In [None]:
# Select the rows whose GWAS label is META5.
is_meta5 = freqs_model['GWAS'] == 'META5'
freqs_model_meta5 = freqs_model[is_meta5]
print(freqs_model_meta5.shape)

In [None]:
# Inner-join the META5 loci with the META5 frequency rows on RSID.
# Note: this rebinds the name `merged` used earlier in the notebook.
merged = meta5_loci.merge(
    freqs_model_meta5,
    how='inner',
    left_on='RSID',
    right_on='RSID',
)
print(merged.shape)


In [None]:
# Rows where the effect allele equals the upper-cased ALT allele.
effect_is_alt = merged['EFFECT_ALLELE'] == merged['ALT'].str.upper()
meta5_match = merged[effect_is_alt]
for preview in (meta5_match.shape, meta5_match.head()):
    print(preview)

In [None]:
# Rows where the effect allele differs from the upper-cased ALT allele.
effect_differs = merged['EFFECT_ALLELE'] != merged['ALT'].str.upper()
meta5_mismatch = merged[effect_differs]
for preview in (meta5_mismatch.shape, meta5_mismatch.head()):
    print(preview)

In [None]:
# Show the column labels of `meta5_mismatch`. As a bare expression this is
# only rendered when it is the last statement of a notebook cell.
meta5_mismatch.columns

In [None]:
# For variants whose effect allele is not the gnomAD ALT allele, replace each
# listed frequency with its complement (1 - frequency).
#
# Work on an explicit copy: `meta5_mismatch` was produced by filtering another
# frame, and assigning into such a slice raises SettingWithCopyWarning.
meta5_mismatch = meta5_mismatch.copy()

# AF_sas and non_cancer_AF_popmax are deliberately not in this list, matching
# the original cell, which left them untouched.
# NOTE(review): presumably these hold non-numeric placeholders for the genome
# dataset and would fail the float conversion; confirm before adding them.
flipped_columns = [
    'AF', 'AF_popmax', 'AF_male', 'AF_female', 'AF_raw',
    'AF_afr', 'AF_amr', 'AF_eas', 'AF_nfe', 'AF_fin', 'AF_asj', 'AF_oth',
    'non_topmed_AF_popmax', 'non_neuro_AF_popmax', 'controls_AF_popmax',
]
for column in flipped_columns:
    # Values were parsed from a split string, so convert to float first.
    meta5_mismatch[column] = 1 - meta5_mismatch[column].astype(float)


In [None]:
# Stack the matching and the mismatching META5 rows back into one frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; with
# default arguments it keeps the original row labels and column order.
final_meta5 = pd.concat([meta5_match, meta5_mismatch])
# Keep only the listed columns, in this order.
final_meta5 = final_meta5[[
    'BP', 'CHR', 'CHR_BP', 'GWAS', 'LOC_NUM', 'NEAR_GENE', 'RSID', 'REF',
    'ALT', 'START', 'END', 'AF', 'AF_popmax', 'AF_male', 'AF_female',
    'AF_raw', 'AF_afr', 'AF_sas', 'AF_amr', 'AF_eas', 'AF_nfe', 'AF_fin',
    'AF_asj', 'AF_oth', 'non_topmed_AF_popmax', 'non_neuro_AF_popmax',
    'non_cancer_AF_popmax', 'controls_AF_popmax', 'Frequency_PD',
    'Frequency_control', 'AFF', 'UNAFF',
]]
print(final_meta5.shape)
print(final_meta5.head())

In [None]:
# Select the rows whose GWAS label is anything other than META5.
not_meta5 = freqs_model['GWAS'] != 'META5'
nonmeta5_freqs = freqs_model[not_meta5]
print(nonmeta5_freqs.shape)


In [None]:
# Combine the META5 rows with the non-META5 rows into one frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
final_freqs_corrected = pd.concat([final_meta5, nonmeta5_freqs])
print(final_freqs_corrected.shape)

In [None]:
# Select the output columns and give the gnomAD population columns
# human-readable labels. The identifier and case/control columns keep their names.
identifier_columns = [
    'GWAS', 'LOC_NUM', 'RSID',
    'Frequency_PD', 'Frequency_control', 'AFF', 'UNAFF',
]
population_labels = {
    'AF_afr': 'African',
    'AF_asj': 'Ashkenazi Jewish',
    'AF_eas': 'East Asian',
    'AF_fin': 'European (Finnish)',
    'AF_nfe': 'European (non-Finnish)',
    'AF_amr': 'Latino',
    'AF_oth': 'Other',
}
selected_columns = identifier_columns + list(population_labels)
final_freqs = final_freqs_corrected[selected_columns].rename(columns=population_labels)

In [None]:
# Write the final population-frequency table to the working directory,
# without the row index.
output_path = f"{WRKDIR}/risk_variant_pop_freqs.csv"
final_freqs.to_csv(output_path, index=None)