# Generating Coding Variants With LDLink
- **Author** - Frank Grenn
- **Date Started** - May 2020
- **Quick Description:** Use LDLink to get proxy coding variants for all risk variants in the browser, then use ANNOVAR to get the CADD scores and amino acid (aa) changes.


In [None]:
import pandas as pd
import os
import numpy as np
import re

In [None]:
# Directory layout: the working directory lives under the data directory,
# and the temp directory lives under the working directory.
DATADIR = '$PATH/AppDataProcessing'
WRKDIR = DATADIR + "/codingvars"
TMPDIR = WRKDIR + "/temp"


In [None]:
!mkdir {TMPDIR}

## 1) Get List of Variants and Populations

#### Load GWAS

In [None]:
# Load the GWAS risk variant table and show its dimensions and first rows.
gwas = pd.read_csv(f"{DATADIR}/gwas_risk_variants.csv")
print(gwas.shape, gwas.head(), sep="\n")

In [None]:
# Assign a population code per variant: 'EAS' when the GWAS column is 'Asian',
# 'EUR' for everything else.
is_asian = gwas['GWAS'] == 'Asian'
gwas['POP'] = np.where(is_asian, 'EAS', 'EUR')
for preview in (gwas.head(), gwas.tail()):
    print(preview)

In [None]:
# Save the variants together with their population code, without the index column.
gwas.to_csv(f"{WRKDIR}/variants_with_population.csv", index=False)

## 2) Proxy Variants
Call the R script.  
It needs the path to the variant file and an optional r2 cutoff. The cell below prints the command to run.

In [None]:
# r2 cutoff appended to the Rscript command line; the empty string appends no
# cutoff argument. Set it to a value such as 0.5 to pass one.
r2 = ""#0.5

In [None]:
# Build the command line for the proxy-lookup R script and print it
# (the command is only printed here, not executed).
proxy_cmd = f"Rscript getRiskSNPProxies.R {WRKDIR}/variants_with_population.csv {r2}"
print(proxy_cmd)

## 3) Read Proxy Files

In [None]:
# Empty frame that fixes the leading column order of the combined proxy table.
proxy_columns = [
    "REF_SNP",
    "RS_Number",
    "Coord",
    "R2",
    "Dprime",
    "Distance",
    "MAF",
    "Alleles",
    "Correlated_Alleles",
    "RegulomeDB",
    "Function",
]
proxy_df = pd.DataFrame(columns=proxy_columns)
print(proxy_df)

In [None]:

# Read the per-variant proxy file for every GWAS risk variant and tag each row
# with the risk variant (REF_SNP) whose file it came from.
proxy_frames = []
for risk_var in gwas['RSID']:
    proxies = pd.read_csv(f"{WRKDIR}/proxy_snps/{risk_var}_proxies.csv")
    proxies['REF_SNP'] = risk_var
    proxy_frames.append(proxies)

# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the growing frame on every iteration. A single concat gives the
# same rows; keeping proxy_df first preserves its column order in the result.
proxy_df = pd.concat([proxy_df, *proxy_frames])
print(proxy_df.shape)
print(proxy_df.head())

In [None]:
# Show the distinct values of the 'X.' column of the combined proxy table.
# NOTE(review): 'X.' is not one of the columns declared for proxy_df above, so it
# presumably comes from the proxy CSV files — confirm what it holds.
set(proxy_df['X.'])

In [None]:
# Keep only proxies whose Function column is populated, i.e. neither missing
# nor the "." value.
has_function = proxy_df['Function'].notna() & (proxy_df['Function'] != ".")
# Take an explicit copy: columns are assigned on coding_proxies later, and doing
# that on a slice of proxy_df triggers pandas' SettingWithCopyWarning.
coding_proxies = proxy_df[has_function].copy()
print(coding_proxies.shape)
print(coding_proxies.head())

In [None]:
# Build the chr:bp:ref:alt merge key from Coord (with the "chr" prefix removed)
# and Alleles (parentheses removed, "/" turned into ":").
# regex=False makes every replacement literal: "(" and ")" are regex
# metacharacters, and the default of `regex` differs between pandas versions.
coord_no_prefix = coding_proxies['Coord'].str.replace("chr", "", regex=False)
allele_key = (
    coding_proxies['Alleles']
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
    .str.replace("/", ":", regex=False)
)
coding_proxies['chrbprefalt'] = coord_no_prefix + ":" + allele_key
print(coding_proxies.head())

In [None]:
# Order the coding proxies by rs number.
coding_proxies = coding_proxies.sort_values('RS_Number')


In [None]:
# Display the (rows, columns) of the coding proxy table.
coding_proxies.shape

In [None]:

# Save the coding proxies as a tab-separated file without the index column.
coding_proxies.to_csv(f"{WRKDIR}/coding_proxies.txt", sep="\t", index=False)

## 3) Make avinput for ANNOVAR

In [None]:
# Columns needed to build the ANNOVAR input. The explicit copy makes this an
# independent frame, so the column assignments in the following cells do not
# act on a slice of coding_proxies (SettingWithCopyWarning).
annovar_proxies = coding_proxies[['Alleles', 'Coord', 'RS_Number']].copy()

In [None]:
# Split Alleles of the form "(ref/alt)" into Ref and Alt columns.
# regex=False keeps "(" and ")" literal regardless of the pandas version's
# default for `regex` (both characters are regex metacharacters).
temp = (
    annovar_proxies['Alleles']
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
    .str.split("/", n=1, expand=True)
)
annovar_proxies['Ref'] = temp[0]
annovar_proxies['Alt'] = temp[1]
print(annovar_proxies.head())

In [None]:
# Drop the "chr" prefix from Coord, then split "chrom:position" into Chr plus
# Start/End, which both start out as the same position.
annovar_proxies['Coord'] = annovar_proxies['Coord'].str.replace("chr","")
chrom_pos = annovar_proxies['Coord'].str.split(":", n=1, expand=True)
annovar_proxies['Chr'] = chrom_pos[0]
annovar_proxies['Start'] = annovar_proxies['End'] = chrom_pos[1]

print(annovar_proxies.head())

#### The avinput End position must be adjusted for deletions (Alt is `-`): End = Start + len(Ref) - 1

In [None]:
# Positions were split out of a string column; make them integers.
for pos_col in ('Start', 'End'):
    annovar_proxies[pos_col] = annovar_proxies[pos_col].astype(int)

# For deletions (Alt == '-') End becomes Start + len(Ref) - 1.
is_deletion = annovar_proxies.Alt == '-'
deleted = annovar_proxies.loc[is_deletion]
annovar_proxies.loc[is_deletion, 'End'] = deleted['Start'] + deleted['Ref'].str.len() - 1


In [None]:
# Remove rows that are exact duplicates across all columns.
annovar_proxies = annovar_proxies.drop_duplicates()

In [None]:
# Dimensions and first rows after de-duplication.
print(annovar_proxies.shape, annovar_proxies.head(), sep="\n")

In [None]:
# End positions of the deletion rows (Alt == '-').
annovar_proxies.loc[annovar_proxies['Alt'] == '-', 'End']

In [None]:
# All columns of the deletion rows (Alt == '-').
annovar_proxies.loc[annovar_proxies['Alt'] == '-']

In [None]:
# Write the five avinput columns space-separated, without the index and without
# a header row: the avinput holds variant lines only, so a
# "Chr Start End Ref Alt" header would be read by ANNOVAR as a variant record.
avinput_columns = ['Chr', 'Start', 'End', 'Ref', 'Alt']
annovar_proxies[avinput_columns].to_csv(
    f"{WRKDIR}/coding_proxies.avinput", sep=" ", index=False, header=False
)

## 4) Annotate with ANNOVAR

In [None]:
# Load the annovar module and run table_annovar.pl on coding_proxies.avinput
# against $ANNOVAR_DATA/hg19 with protocols refGene,avsnp142 (operations g,f),
# output prefix "coding_proxies", and "." as the NA string.
# NOTE(review): the input and output paths are relative, while the avinput is
# written to and the results are read from WRKDIR — presumably this must run
# with WRKDIR as the working directory; confirm.
!(module load annovar;\
table_annovar.pl coding_proxies.avinput $ANNOVAR_DATA/hg19 -buildver hg19 -out coding_proxies -remove -protocol refGene,avsnp142 -operation g,f -nastring . )

## 5) Get CADD Scores from ANNOVAR

In [None]:
# Load the annovar module and run annotate_variation.pl in --filter mode with
# the cadd database (hg19) on coding_proxies.avinput.
# NOTE(review): the avinput path is relative, and a later cell reads
# coding_proxies.avinput.hg19_cadd_dropped from WRKDIR — presumably this must
# run with WRKDIR as the working directory; confirm.
!(module load annovar;\
annotate_variation.pl --filter --build hg19 --dbtype cadd --buildver hg19 --otherinfo coding_proxies.avinput $ANNOVAR_DATA/hg19)
#table_annovar.pl coding_proxies.avinput $ANNOVAR_DATA/hg19 -buildver hg19 -out coding_proxies -remove -protocol refGene,avsnp142 -operation g,f -nastring . )




## 6) Combine LDLink data, CADD scores and ANNOVAR data

#### read the annovar data

In [None]:
# Load the tab-separated ANNOVAR multianno table and preview it.
annovar = pd.read_csv(f"{WRKDIR}/coding_proxies.hg19_multianno.txt", sep="\t")
print(annovar.shape, annovar.head(), sep="\n")

In [None]:
# Build the chr:bp:ref:alt merge key for the ANNOVAR rows.
# Chr and Start are cast to str first: read_csv parses Start (and an
# all-numeric Chr) as integers, and integer + ":" raises a TypeError. This
# matches how the CADD key is built further down in this notebook.
annovar['chrbprefalt'] = (
    annovar['Chr'].astype(str)
    + ":" + annovar['Start'].astype(str)
    + ":" + annovar['Ref']
    + ":" + annovar['Alt']
)

In [None]:
# Check the first rows, now including the chrbprefalt key column.
print(annovar.head())

In [None]:
# ANNOVAR rows that are deletions (Alt == '-').
annovar.loc[annovar['Alt'] == '-']

In [None]:
# Number of distinct chrbprefalt keys in the ANNOVAR table.
len(set(annovar['chrbprefalt']))

In [None]:
# Number of distinct chrbprefalt keys in the LDLink coding proxies, for
# comparison with the ANNOVAR count above.
len(set(coding_proxies['chrbprefalt']))

#### merge annovar data with LDLink data

In [None]:
# Inner-join the ANNOVAR annotations with the LDLink proxies on the
# chr:bp:ref:alt key, keeping only variants present in both tables.
ldlink_annovar = annovar.merge(coding_proxies, on='chrbprefalt', how='inner')
print(ldlink_annovar.shape, ldlink_annovar.head(), sep="\n")

#### merge that with the CADD data

In [None]:
# Read the headerless CADD output into named columns.
names = ["db", "cadd", "chr", "start", "end", "ref", "alt"]
# The separator is a raw-string regex: "\s" in a normal string is an invalid
# escape sequence, and r"\s+" also tolerates runs of whitespace between fields.
cadd = pd.read_csv(
    f"{WRKDIR}/coding_proxies.avinput.hg19_cadd_dropped", sep=r"\s+", names=names
)


print(cadd.shape)
print(cadd.head())

In [None]:
# The cadd field holds comma-separated values; keep everything after the first
# comma as cadd_phred.
score_parts = cadd['cadd'].str.split(",", n=1, expand=True)
cadd['cadd_phred'] = score_parts[1]
print(cadd.head())

In [None]:
# Build the chr:bp:ref:alt merge key for the CADD rows.
cadd_position = cadd['chr'].astype(str) + ":" + cadd['start'].astype(str)
cadd['chrbprefalt'] = cadd_position + ":" + cadd['ref'] + ":" + cadd['alt']

In [None]:
# Display the (rows, columns) of the CADD table.
cadd.shape

In [None]:
# Display the first CADD rows, including the cadd_phred and chrbprefalt columns.
cadd.head()

In [None]:
# Left-join the CADD scores onto the LDLink/ANNOVAR table so that variants
# without a CADD row are kept.
cadd_ldlink_annovar = ldlink_annovar.merge(cadd, on='chrbprefalt', how='left')
print(cadd_ldlink_annovar.shape, cadd_ldlink_annovar.head(), sep="\n")

In [None]:
# List the columns available after the merges.
cadd_ldlink_annovar.columns

#### Now merge with the GWAS risk variants to get the locus numbers

In [None]:
# Reload the GWAS table, keeping only the columns needed to attach locus numbers.
locus_columns = ['GWAS', 'RSID', 'LOC_NUM']
gwas = pd.read_csv(f"{DATADIR}/gwas_risk_variants.csv")[locus_columns]
print(gwas.shape, gwas.head(), sep="\n")

In [None]:
# Attach the GWAS columns to each proxy by matching its reference risk variant
# (REF_SNP) to the GWAS RSID; proxies without a match are kept.
final = cadd_ldlink_annovar.merge(gwas, left_on='REF_SNP', right_on='RSID', how='left')
print(final.shape, final.head(), sep="\n")

In [None]:
#final.to_csv(f"{WRKDIR}/temp.csv", index=None)

In [None]:
# Columns of the output table, in output order.
output_columns = [
    'GWAS',
    'RS_Number',
    'chrbprefalt',
    'LOC_NUM',
    'Gene.refGene',
    'AAChange.refGene',
    'MAF',
    'cadd_phred',
    'R2',
    'Dprime',
    'REF_SNP',
]
final_write = final[output_columns]

In [None]:
# Dimensions and first rows of the selected output columns.
print(final_write.shape, final_write.head(), sep="\n")

In [None]:
# Drop rows where ANNOVAR could not determine the amino acid change
# (AAChange.refGene == '.'), printing the shape before and after.
shape_before = final_write.shape
print(shape_before)
final_write = final_write.loc[final_write['AAChange.refGene'] != '.']
print(final_write.shape)

In [None]:
# Order the output rows by rs number and preview them.
final_write = final_write.sort_values('RS_Number')
print(final_write.head())

In [None]:
# Write the final table without the index column. The results directory is
# created first because to_csv does not create missing parent directories.
results_dir = f"{DATADIR}/results"
os.makedirs(results_dir, exist_ok=True)
final_write.to_csv(f"{results_dir}/CodingVariantsLDLink.csv", index=False)