# Generating Coding Variant Files
- **Author** - Frank Grenn
- **Date Started** - April 2020
- **Quick Description:** Annotate the risk variants from all GWAS used by the app with ANNOVAR data and get their tagging SNPs. Then extract the coding variants among these and collect their LD values, CADD scores, and frequencies.


In [None]:
import pandas as pd
import os
import numpy as np
import re

In [None]:
# Base data directory; the working and scratch directories are nested under it.
DATADIR = '/path/to/AppDataProcessing'
WRKDIR = DATADIR + "/codingvars"
TMPDIR = WRKDIR + "/temp"

# Directory containing the PLINK hard-call fileset passed to --bfile.
HARDCALLS_DIR = "/path/to/PD_FINAL_PLINK_2018"

In [None]:
# Create the scratch directory together with any missing parents. Unlike a
# bare `mkdir`, this succeeds quietly when the directory already exists, so the
# cell can be re-run.
os.makedirs(TMPDIR, exist_ok=True)

## 1) Tag GWAS Variants

#### Load GWAS

In [None]:
# Read the GWAS risk-variant table, then show its dimensions and first rows.
gwas_csv = f"{DATADIR}/gwas_risk_variants.csv"
gwas = pd.read_csv(gwas_csv)
print(gwas.shape, gwas.head(), sep="\n")

#### Generate chr:bp files
Update the column indices below (chr:bp, chr, locus number, GWAS name) to match the layout of the GWAS CSV being used.

In [None]:
# Zero-based column positions in the GWAS table, read with `.iloc` further
# down: chromosome, chr:bp identifier, GWAS name, and locus number.
chr_index, chrbp_index, gwas_index, loc_index = 2, 3, 4, 5

In [None]:
# Write one PLINK tagging command per GWAS variant into a swarm file, and for
# each command a one-line variant list (the chr:bp ID) that --show-tags reads.
with open(f"{WRKDIR}/get_05_tags.swarm", "w") as swarm_file:
    for row in gwas.itertuples(index=False, name=None):
        chrbp = row[chrbp_index]
        gwas_str = row[gwas_index]
        prefix = f"{TMPDIR}/{chrbp}_{gwas_str}_gwas"

        # Written from Python instead of `!echo`: no shell is spawned per row
        # and the values are never re-interpreted by the shell.
        with open(f"{prefix}.txt", "w") as variant_file:
            variant_file.write(f"{chrbp}\n")

        swarm_file.write(f"plink --bfile {HARDCALLS_DIR}/HARDCALLS_PD_september_2018_no_cousins --tag-r2 0.5 --memory 135000 --threads 19 --show-tags {prefix}.txt --chr {row[chr_index]} --out {prefix}\n")
# No explicit close(): leaving the `with` block has already closed the file.

In [None]:
# Print (not run) the swarm submission command for the tagging jobs.
tag_swarm_cmd = f"swarm -f {WRKDIR}/get_05_tags.swarm -g 10 -t 20 --module=plink"
print(tag_swarm_cmd)

#### Now Annotate the Tags
Prefix every line of each `.tags` file with the GWAS name and the locus number.

In [None]:
for i in range(0,len(gwas.index)):
    chrbp = gwas.iloc[i,chrbp_index]

    locus = gwas.iloc[i, loc_index]
    
    gwas_str = gwas.iloc[i, gwas_index]
    !sed -i.bkp 's/^/{gwas_str}\t{locus}\t/' {TMPDIR}/{chrbp}_{gwas_str}_gwas.tags
    
    

In [None]:
# Concatenate every *.tags file in TMPDIR into a single ALL_TAGS.txt in WRKDIR.
!cat {TMPDIR}/*.tags > {WRKDIR}/ALL_TAGS.txt

Now add the ANNOVAR annotation and the rsIDs.

In [None]:
# Load the concatenated tag lines (tab-separated, no header row) and name the
# three columns.
tags = pd.read_csv(f"{WRKDIR}/ALL_TAGS.txt", sep='\t', header=None)
tags.columns = ["GWAS", "locnum", "chrbp"]
print(tags.shape, tags.head(), sep="\n")

In [None]:
# Load the tab-separated ANNOVAR annotation table and preview it.
annovar_path = "/path/to/HRC_ouput_annovar_ALL.txt"
annovar = pd.read_csv(annovar_path, sep='\t')
print(annovar.head())

In [None]:

# Load the tab-separated position-to-rsID conversion table and preview it.
rs_path = "/path/to/HRC_RS_conversion_final.txt"
RS = pd.read_csv(rs_path, sep='\t')
print(RS.head())

In [None]:
# Join tags.chrbp to RS.POS to pick up IDs, then join those IDs to the ANNOVAR
# avsnp142 column. Both are left joins, so unmatched tags are kept.
MM = tags.merge(RS, how="left", left_on="chrbp", right_on="POS")
MM2 = MM.merge(annovar, how="left", left_on="ID", right_on="avsnp142")

In [None]:
# Show the size and first rows of the annotated tag table.
print(MM2.shape, MM2.head(), sep="\n")

Add the associated disease/trait from the GWAS catalog.


In [None]:
# Load the GWAS catalog associations (tab-separated, latin1-encoded) and show
# their dimensions and first rows.
catalog_path = f"{DATADIR}/phenovars/GWAS_catalogv1.0.2-associations.txt"
catalog = pd.read_csv(catalog_path, sep='\t', encoding='latin1')
print(catalog.shape, catalog.head(), sep="\n")


In [None]:
# Left-join the catalog on ID == SNPS to attach DISEASE/TRAIT, then keep only
# the columns used downstream.
annotated_columns = [
    'GWAS', 'ID', 'Chr', 'Start', 'End', 'locnum', 'Ref', 'Alt',
    'DISEASE/TRAIT', 'Func.refGene', 'Gene.refGene', 'GeneDetail.refGene',
    'ExonicFunc.refGene', 'AAChange.refGene',
]
MM2_disease = MM2.merge(catalog, how='left', left_on='ID', right_on='SNPS')
MM2_disease = MM2_disease[annotated_columns]

In [None]:
# Save the annotated tags as tab-separated text without the row index.
annotated_tags_path = f"{WRKDIR}/annotated_R05_tags_all_gwas.txt"
MM2_disease.to_csv(annotated_tags_path, sep='\t', index=None)

## 2) Get Coding Variants

In [None]:
# Keep exonic variants whose exonic function is nonsynonymous.
# na=False is set on BOTH filters: a missing annotation would otherwise leave
# NaN in the boolean mask, and pandas refuses to index with such a mask.
is_exonic = MM2_disease['Func.refGene'].str.contains('exonic', na=False)
is_nonsynonymous = MM2_disease['ExonicFunc.refGene'].str.contains('nonsynonymous', na=False)
coding = MM2_disease[is_exonic & is_nonsynonymous]
print(coding.shape)
print(coding.head())

In [None]:
# Fix the output column order and drop rows that are exact duplicates.
coding_columns = [
    'GWAS', 'ID', 'Chr', 'Start', 'End', 'locnum', 'Ref', 'Alt',
    'DISEASE/TRAIT', 'Func.refGene', 'Gene.refGene', 'GeneDetail.refGene',
    'ExonicFunc.refGene', 'AAChange.refGene',
]
coding = coding[coding_columns].drop_duplicates()

In [None]:
# Save the coding variants as CSV without the row index.
coding_csv = f"{WRKDIR}/CodingVariant.csv"
coding.to_csv(coding_csv, index=None)

## 3) Get CADD Scores

In [None]:
# Write a batch script that turns CodingVariant.csv into ANNOVAR input and runs
# the CADD filter annotation on it. The script text has no trailing newline.
cadd_script_lines = [
    "#!/bin/bash",
    "module load annovar",
    # CSV columns 3,4,5,7,8 are Chr, Start, End, Ref, Alt.
    f"awk -F',' '{{print $3,$4,$5,$7,$8}}' '{WRKDIR}/CodingVariant.csv' > {WRKDIR}/codingvars.avinput",
    f"annotate_variation.pl --filter --build hg19 --dbtype cadd --buildver hg19 --otherinfo {WRKDIR}/codingvars.avinput $ANNOVAR_DATA/hg19",
]
with open(f"{WRKDIR}/get_CADD_annovar.sh", "w") as bash_file:
    bash_file.write("\n".join(cadd_script_lines))
# No explicit close(): leaving the `with` block has already closed the file.

In [None]:
# Print (not run) the command that submits the CADD script.
cadd_submit_cmd = f"sbatch {WRKDIR}/get_CADD_annovar.sh"
print(cadd_submit_cmd)

## 4) Get Frequencies

In [None]:
# Write a batch script that turns CodingVariant.csv into ANNOVAR input and runs
# the gnomAD genome frequency filter on it. The script text has no trailing
# newline.
frequency_script_lines = [
    "#!/bin/bash",
    "module load annovar",
    # The .avinput is written under WRKDIR, the exact path annotate_variation.pl
    # reads on the next line, rather than into whatever directory the job
    # happens to start in. CSV columns 3,4,5,7,8 are Chr, Start, End, Ref, Alt.
    f"awk -F',' '{{print $3,$4,$5,$7,$8}}' '{WRKDIR}/CodingVariant.csv' > {WRKDIR}/codingvars.avinput",
    f"annotate_variation.pl --filter --build hg19 --dbtype gnomad211_genome --buildver hg19 --otherinfo {WRKDIR}/codingvars.avinput $ANNOVAR_DATA/hg19",
]
with open(f"{WRKDIR}/get_frequencies_annovar.sh", "w") as bash_file:
    bash_file.write("\n".join(frequency_script_lines))
# No explicit close(): leaving the `with` block has already closed the file.

In [None]:
# Print (not run) the command that submits the frequency script.
frequency_submit_cmd = f"sbatch {WRKDIR}/get_frequencies_annovar.sh"
print(frequency_submit_cmd)

## 5) Get LD Data

In [None]:
# Read the GWAS risk-variant table again under the name used by the LD cells.
risk_variants_csv = f"{DATADIR}/gwas_risk_variants.csv"
variants = pd.read_csv(risk_variants_csv)
print(variants.head())

#### make a ranges file to speed up plink

In [None]:
# Build a +/-1,000,000 bp window around every GWAS variant for PLINK's
# `--extract range`. The explicit copy makes `ranges` an independent frame, so
# the column assignments below do not raise SettingWithCopyWarning.
ranges = variants[['CHR', 'BP', 'BP']].copy()
ranges.columns = ['CHR', 'Start', 'End']
ranges['ID'] = "r" + ranges.index.astype(str)
# Clamp the window start at 0 so it can never go negative.
ranges['Start'] = (ranges['Start'] - 1000000).clip(lower=0)
ranges['End'] = ranges['End'] + 1000000
print(ranges.shape)
print(ranges.head())

In [None]:
# Make sure the LD directory exists before writing into it; no earlier cell in
# the visible notebook creates it.
os.makedirs(f"{WRKDIR}/LD", exist_ok=True)
# Range file for PLINK: tab-separated, no header row, no row index.
ranges.to_csv(f"{WRKDIR}/LD/ranges.txt", sep='\t', index=None, header=None)

#### Run LD Commands
There is probably a better way to do this, but it works for now.

In [None]:

# Create the LD output directory together with any missing parents. Unlike a
# bare `mkdir`, this succeeds quietly when the directory already exists, so the
# cell can be re-run.
os.makedirs(f"{WRKDIR}/LD/out", exist_ok=True)

In [None]:
# Write one PLINK --ld command for every (GWAS variant, coding variant in the
# same locus) pair.
written_pairs = set()
with open(f"{WRKDIR}/LD/calculateLD.swarm", 'w') as outfile:
    for _, variant in variants.iterrows():
        snp1 = str(variant['CHR']) + ":" + str(variant['BP'])
        locus_codingvars = coding.loc[coding['locnum'] == variant['Locus Number']]

        # Separate loop variables for the two levels: the inner loop no longer
        # reuses the outer loop's index name.
        for _, coding_var in locus_codingvars.iterrows():
            snp2 = str(coding_var['Chr']) + ":" + str(coding_var['Start'])

            # Identical pairs would produce identical commands that all write
            # the same --out prefix, so each pair is emitted only once.
            if (snp1, snp2) in written_pairs:
                continue
            written_pairs.add((snp1, snp2))

            outfile.write(f"plink --bfile /path/to/HARDCALLS_PD_september_2018_no_cousins --ld {snp1} {snp2} --extract range {WRKDIR}/LD/ranges.txt --out {snp1}_{snp2}\n")
# No explicit close(): leaving the `with` block has already closed the file.


In [None]:
# Print (not run) the commands for launching the LD swarm from the output
# directory.
ld_commands = (
    f"cd {WRKDIR}/LD/out",
    f"swarm -f {WRKDIR}/LD/calculateLD.swarm --partition quick --module plink",
)
for ld_command in ld_commands:
    print(ld_command)

In [None]:
# Collect the R-sq and D' values PLINK wrote to each pair's .log file.

# Matches the result line of a PLINK --ld log, e.g. "R-sq = 0.99   D' = 1".
_LD_LINE = re.compile(r"R-sq\s*=\s*(\S+)\s+D'\s*=\s*(\S+)")


def _read_ld_values(log_path):
    """Return (r2, dprime) as strings from the first R-sq line of a log.

    Returns None when the log contains no such line. Raises
    FileNotFoundError when the log does not exist. The file is always closed.
    """
    with open(log_path, "r") as log_file:
        for line in log_file:
            found = _LD_LINE.search(line)
            if found:
                return found.group(1), found.group(2)
    return None


# Rows are gathered in a list and turned into a frame once at the end;
# appending to a DataFrame inside the loop copies the whole frame every time.
ld_rows = []
for _, variant in variants.iterrows():
    snp1 = str(variant['CHR']) + ":" + str(variant['BP'])
    rsid1 = variant['RSID']
    locus_codingvars = coding.loc[coding['locnum'] == variant['Locus Number']]

    for _, coding_var in locus_codingvars.iterrows():
        snp2 = str(coding_var['Chr']) + ":" + str(coding_var['Start'])
        rsid2 = coding_var['ID']

        ld_values = _read_ld_values(f"{WRKDIR}/LD/out/{snp1}_{snp2}.log")
        # Only add a row when the log actually contained an R-sq line.
        if ld_values is None:
            continue
        r2_value, dprime_value = ld_values
        ld_rows.append({'rsid1': rsid1, 'snp1': snp1, 'rsid2': rsid2, 'snp2': snp2, 'r2': r2_value, 'dprime': dprime_value})

df = pd.DataFrame(ld_rows, columns=['rsid1', 'snp1', 'rsid2', 'snp2', 'r2', 'dprime'])

print(len(df.index))
print(df.head())
print(df.tail())

In [None]:
# Keep the first occurrence of every fully identical row.
df = df[~df.duplicated()]

In [None]:
# Save the LD table as CSV without the row index.
ld_csv = f"{DATADIR}/results/CodingVariantLD.csv"
df.to_csv(ld_csv, index=False)

## 6) Combine Everything

#### some formatting functions

In [None]:

def formatCHRBPREFALT(chr,bp,ref,alt):
    """Return the four fields joined as "chr:bp:ref:alt".

    Each argument is converted with str() before joining.
    """
    return ":".join(str(part) for part in (chr, bp, ref, alt))
	
def getNFE(frequencies):
    """Return the tenth comma-separated field (index 9) of *frequencies*.

    The field is returned as a string, unconverted. Raises IndexError when the
    input has fewer than ten fields.

    NOTE(review): index 9 is presumably the non-Finnish-European frequency in
    the gnomAD annotation string; confirm against the database field order.
    """
    # A plain str.split is enough for a fixed one-character delimiter.
    return frequencies.split(",")[9]
	
def getFirstAAChange(AAchanges):
    """Return the text before the first comma of *AAchanges*.

    The whole string is returned when it contains no comma.
    """
    # maxsplit=1: only the first field is needed, so stop at the first comma.
    return AAchanges.split(",", 1)[0]

def getFirstGene(genes):
    """Return the text before the first semicolon of *genes*.

    The whole string is returned when it contains no semicolon.
    """
    # maxsplit=1: only the first field is needed, so stop at the first ";".
    return genes.split(";", 1)[0]

# Separate the raw and phred CADD scores, and return the phred score.
def getCADDPhred(cadd):
    """Return the second comma-separated field (index 1) of *cadd*.

    The field is returned as a string, unconverted. Raises IndexError when the
    input contains no comma.
    """
    # A plain str.split is enough for a fixed one-character delimiter.
    return cadd.split(",")[1]

#### merge with frequency data

In [None]:

# Re-read the coding variants and give each a chr:bp:ref:alt merge key.
variants = pd.read_csv(f"{WRKDIR}/CodingVariant.csv")
variants['CHRBP_REFALT'] = [
    formatCHRBPREFALT(chrom, start, ref, alt)
    for chrom, start, ref, alt in zip(variants['Chr'], variants['Start'], variants['Ref'], variants['Alt'])
]

# Column names for the frequency file, since ANNOVAR does not write a header.
names = ["db", "freq", "chr", "start", "end", "ref", "alt"]
# Raw string so that \s is a regex class rather than an invalid string escape.
# A regex separator needs the python parser engine, so it is requested
# explicitly instead of relying on the fallback warning.
frequencies = pd.read_csv(
    f"{WRKDIR}/codingvars.avinput.hg19_gnomad211_genome_dropped",
    sep=r"\s",
    names=names,
    engine="python",
)

# Column-wise apply: each call receives just the frequency string, and the
# result is still a Series when the frame has no rows.
frequencies['freq_nfe'] = frequencies['freq'].apply(getNFE)

# Give the frequencies frame the same CHRBP_REFALT key to merge on later.
frequencies['CHRBP_REFALT'] = [
    formatCHRBPREFALT(chrom, start, ref, alt)
    for chrom, start, ref, alt in zip(frequencies['chr'], frequencies['start'], frequencies['ref'], frequencies['alt'])
]

In [None]:
# Left-join the frequencies onto the coding variants by CHRBP_REFALT, keep the
# columns needed later, and preview the result.
merge = variants.merge(frequencies, on="CHRBP_REFALT", how='left')
kept_columns = ['GWAS', 'ID', 'CHRBP_REFALT', 'locnum', 'Gene.refGene', 'freq_nfe', 'AAChange.refGene']
merge = merge[kept_columns]
print(merge.shape, merge.head(), merge.tail(), sep="\n")

#### merge with CADD data

In [None]:
# Column names for the CADD file, since ANNOVAR does not write a header.
names = ["db", "cadd", "chr", "start", "end", "ref", "alt"]
# Raw string so that \s is a regex class rather than an invalid string escape.
# A regex separator needs the python parser engine, so it is requested
# explicitly instead of relying on the fallback warning.
cadd = pd.read_csv(
    f"{WRKDIR}/codingvars.avinput.hg19_cadd_dropped",
    sep=r"\s",
    names=names,
    engine="python",
)

# Column-wise apply: each call receives just the score string, and the result
# is still a Series when the frame has no rows.
cadd['cadd_phred'] = cadd['cadd'].apply(getCADDPhred)

# Give the CADD frame a CHRBP_REFALT key to merge on later.
cadd['CHRBP_REFALT'] = [
    formatCHRBPREFALT(chrom, start, ref, alt)
    for chrom, start, ref, alt in zip(cadd['chr'], cadd['start'], cadd['ref'], cadd['alt'])
]



In [None]:
# Left-join the CADD scores onto the frequency-annotated variants.
merge_final = merge.merge(cadd, on="CHRBP_REFALT", how='left')

#### just get the first AA change
Some of these may also need to be edited manually.

In [None]:
# Keep only the first amino-acid change and the first gene for each variant.
# Column-wise apply (rather than row-wise with axis=1) passes each helper just
# the one value it needs and still yields a Series when the frame has no rows.
merge_final['AA Change'] = merge_final['AAChange.refGene'].apply(getFirstAAChange)

merge_final['Gene.refGene'] = merge_final['Gene.refGene'].apply(getFirstGene)

In [None]:
# Select the final columns, drop exact duplicate rows, and save the result as
# CSV without the row index.
final_columns = ['GWAS', 'ID', 'CHRBP_REFALT', 'locnum', 'Gene.refGene', 'AA Change', 'freq_nfe', 'cadd_phred']
coding_data = merge_final[final_columns].drop_duplicates()
coding_data.to_csv(f"{DATADIR}/results/CodingVariants.csv", index=False)