# GWAS Locus Browser Generate Phenotype Variant File
- **Author** - Frank Grenn
- **Date Started** - April 2020
- **Quick Description:** Code to generate a list of phenotype variants from other GWASes. Relies on files made by the coding variants scripts.
- **Data:** [GWAS Catalog](https://www.ebi.ac.uk/gwas/docs/file-download)

In [None]:
import pandas as pd
import os
import numpy as np
import re

In [None]:
# Root folder that holds the inputs and outputs of the app's data processing.
DATADIR = '/path/to/AppDataProcessing'
# Folder for the phenotype-variant files this notebook reads and writes.
WRKDIR = DATADIR + "/phenovars"




## 1) Get the Variants with Associated Disease
Get this from the file made by the coding variants scripts.

In [None]:
# Column names used for the tag variants throughout this notebook; they
# replace whatever header the input file carries.
tag_columns = [
    'GWAS', 'ID', 'Chr', 'Start', 'End', 'locnum', 'Ref', 'Alt',
    'DISEASE', 'Func.refGene', 'Gene.refGene', 'GeneDetail.refGene',
    'ExonicFunc.refGene', 'AAChange.refGene',
]

# Load the tab-separated annotated tag variants, then rename the columns.
tags = pd.read_csv(f"{DATADIR}/codingvars/annotated_R05_tags_all_gwas.txt", sep='\t')
tags.columns = tag_columns

# Quick sanity check of what was loaded.
print(tags.shape)
print(tags.head())

#### Filter for Variants with Disease

In [None]:
# Keep only the variants that have an associated disease and drop rows that
# are exact duplicates. The explicit .copy() makes disease_tags an independent
# frame, so the dtype conversion below cannot raise SettingWithCopyWarning.
disease_tags = tags[tags['DISEASE'].notnull()].drop_duplicates().copy()

# Chromosome and positions are stored as integers; later cells build
# "chr:bp" strings from them with str().
disease_tags = disease_tags.astype({'Chr': np.int64, 'Start': np.int64, 'End': np.int64})

print(disease_tags.shape)
print(disease_tags.head())

In [None]:
# Save the disease-associated variants (no index column); the frequency
# lookup script generated below reads this CSV.
disease_tags.to_csv(f"{WRKDIR}/PhenoVariant.csv",index=None)

## 2) Get Frequencies

In [None]:
# Build the batch script that looks up gnomAD frequencies with ANNOVAR.
annovar_script = "\n".join([
    "#!/bin/bash",
    "module load annovar",
    # Columns 3,4,5,7,8 of PhenoVariant.csv are Chr, Start, End, Ref, Alt.
    # NR>1 skips the CSV header row so it is not passed to ANNOVAR as a variant.
    # The .avinput is written into WRKDIR (not the job's current directory),
    # because that is where annotate_variation.pl is told to read it from.
    f"awk -F',' 'NR>1 {{print $3,$4,$5,$7,$8}}' '{WRKDIR}/PhenoVariant.csv' > '{WRKDIR}/phenovars.avinput'",
    f"annotate_variation.pl --filter --build hg19 --dbtype gnomad211_genome --buildver hg19 --otherinfo {WRKDIR}/phenovars.avinput $ANNOVAR_DATA/hg19",
]) + "\n"

# The with-block closes the file on exit, so no explicit close() is needed.
with open(f"{WRKDIR}/get_frequencies_annovar.sh", "w") as bash_file:
    bash_file.write(annovar_script)

In [None]:
# Print the command that submits the frequency script; the notebook itself
# does not run it.
print(f"sbatch {WRKDIR}/get_frequencies_annovar.sh")

## 3) Get LD Data

In [None]:
# Load the GWAS risk variants. The LD cells below read their 'CHR', 'BP',
# 'RSID' and 'Locus Number' columns.
variants = pd.read_csv(f"{DATADIR}/gwas_risk_variants.csv")
print(variants.head())

In [None]:
# phenovars is a second name for the same disease_tags frame (no copy is made).
phenovars = disease_tags

#### make a ranges file to speed up plink

In [None]:
# Build one window per risk variant: 1 Mb either side of its position, with
# the start clamped at 0. The frame is constructed fresh rather than by adding
# columns to a slice of `variants`, which avoids SettingWithCopyWarning and the
# temporarily duplicated 'BP' column name.
LD_WINDOW_BP = 1000000
ranges = pd.DataFrame({
    'CHR': variants['CHR'],
    'Start': (variants['BP'] - LD_WINDOW_BP).clip(lower=0),
    'End': variants['BP'] + LD_WINDOW_BP,
})
# Range IDs are "r" followed by the row's index label in `variants`.
ranges['ID'] = "r" + ranges.index.astype(str)

print(ranges.shape)
print(ranges.head())

In [None]:
# Make sure the LD folder exists so the write below does not fail on a fresh
# working directory.
os.makedirs(f"{WRKDIR}/LD", exist_ok=True)
# Tab-separated, no header and no index: this file is passed to plink's
# "--extract range" option in the LD commands.
ranges.to_csv(f"{WRKDIR}/LD/ranges.txt",sep='\t',index=None,header=None)

#### Run LD Commands
There is probably a better way to do this, but it works for now.

In [None]:

!mkdir {WRKDIR}/LD/out

In [None]:
# Write one plink --ld command per (risk variant, phenotype variant) pair that
# shares a locus number. plink's --out prefix is relative, so the swarm file
# has to be run from the folder where the logs should land.
# Rows are iterated directly: the original inner loop reused the index name
# `i`, shadowing the outer loop's index.
with open(f"{WRKDIR}/LD/calculateLD.swarm", 'w') as outfile:
    for _, risk_variant in variants.iterrows():
        locus = risk_variant['Locus Number']
        locus_phenovars = phenovars.loc[phenovars['locnum'] == locus]

        snp1 = str(risk_variant['CHR']) + ":" + str(risk_variant['BP'])

        # An empty selection simply yields no commands for this risk variant.
        for _, pheno_variant in locus_phenovars.iterrows():
            snp2 = str(pheno_variant['Chr']) + ":" + str(pheno_variant['Start'])

            outfile.write(f"plink --bfile /path/to/PD_FINAL_PLINK_2018/HARDCALLS_PD_september_2018_no_cousins --ld {snp1} {snp2} --extract range {WRKDIR}/LD/ranges.txt --out {snp1}_{snp2}\n")
# The with-block closes the file on exit, so no explicit close() is needed.


In [None]:
#run the swarm file
print(f"cd {WRKDIR}/LD/out")
print(f"swarm -f {WRKDIR}/LD/calculateLD.swarm --partition quick --module plink")

In [None]:
#read stuff
df = pd.DataFrame(columns=['rsid1','snp1','rsid2','snp2','r2','dprime'])
for i in range(len(variants.index)):
    locus = variants.iloc[i]['Locus Number']
    #print(locus)
    
    locus_phenovars = phenovars.loc[phenovars['locnum'] == locus]
    #print(locus_phenovars)
    
    snp1 = str(variants.iloc[i]['CHR']) + ":" + str(variants.iloc[i]['BP'])
    rsid1 = variants.iloc[i]['RSID']
    if(len(locus_phenovars.index)!=0):
        for i in range(len(locus_phenovars.index)):
            #reset the read string to null
            dataline='null'
            snp2 = str(locus_phenovars.iloc[i]['Chr']) + ":" + str(locus_phenovars.iloc[i]['Start'])
            rsid2 = locus_phenovars.iloc[i]['ID']
            #print("{} {}".format(snp1, snp2))


            
            file = open(f"{WRKDIR}/LD/out/"+str(snp1)+"_"+str(snp2)+".log","r")
            
            for line in file:
                if re.search("R-sq", line):
                    dataline = line
                    break
            
            #only add new data if 'R-sq' was found (meaning there was data in the log file and the 'null' value assigned earlier was overwritten)
            if(dataline!='null'):
                #mess with the strings
                dataline = dataline.strip('R-sq = ')
                dataline = dataline.strip(' ')
                splitdata = dataline.split("D' =")
                Rsq = splitdata[0]
                dprime = splitdata[1]
            
                df = df.append({'rsid1': rsid1,'snp1': snp1,'rsid2':rsid2, 'snp2': snp2, 'r2':Rsq.strip(' '), 'dprime':dprime.strip('\n')}, ignore_index = True)
        
print(len(df.index))
print(df.head())
print(df.tail())

In [None]:
# Drop LD rows that are identical in every column.
df=df.drop_duplicates()

In [None]:
# Write the LD table to the results folder, without the index column.
df.to_csv(f"{DATADIR}/results/PhenotypeVariantLD.csv", index = False)

## 4) Combine Everything

#### some formatting functions

In [None]:
def generatePMIDLink(link):
    """Build an HTML anchor for a publication link.

    The link is expected to have no scheme and at least three
    ``/``-separated parts (``host/path/id``). The third part becomes the
    anchor text and ``https://`` is prepended for the ``href``.

    Args:
        link: the link text, or a missing value (``None``/``NaN``).

    Returns:
        The ``<a>`` tag as a string. A non-string ``link`` (e.g. ``NaN`` for
        a row with no catalog match after a left merge) is returned
        unchanged instead of raising ``TypeError``.

    Raises:
        IndexError: if ``link`` has fewer than three ``/``-separated parts.
    """
    if not isinstance(link, str):
        return link
    # Named pmid rather than `id` so the builtin is not shadowed.
    pmid = re.split("/", link)[2]
    return "<a href='https://" + link + "' target='_blank'>" + pmid + "</a>"
	
def formatCHRBPREFALT(chr,bp,ref,alt):
    """Return the variant key ``chr:bp:ref:alt``.

    Each argument is converted with ``str()`` and the four pieces are joined
    with colons.
    """
    pieces = (chr, bp, ref, alt)
    return ":".join(str(piece) for piece in pieces)
	
def getNFE(frequencies):
    """Return the tenth comma-separated field of ``frequencies``.

    NOTE(review): presumably this field is the non-Finnish European
    frequency in ANNOVAR's gnomAD output; confirm against the column order
    of the database in use.

    Raises:
        IndexError: if there are fewer than ten comma-separated fields.
    """
    fields = re.split(",", frequencies)
    nfe_field = fields[9]
    return nfe_field

#### merge with gwas catalog to get pmids and p values

In [None]:
# Load the GWAS catalog associations file (tab-separated, read as latin1)
# and list its columns.
catalog = pd.read_csv(f"{WRKDIR}/GWAS_catalogv1.0.2-associations.txt", sep="\t",encoding='latin1')
print(catalog.columns)

In [None]:
# Left merge keeps every phenotype variant. A variant whose ID has no matching
# SNPS entry gets NaN in the catalog columns, and one with several matches is
# repeated once per match.
merge = pd.merge(phenovars, catalog, how='left', left_on='ID',right_on='SNPS')

print(merge.head())


In [None]:
# Quick look at the catalog's link and PubMed ID columns.
print(merge[['LINK','PUBMEDID']].head())

In [None]:
# Build both columns directly from the source columns instead of a row-wise
# DataFrame.apply(axis=1), which creates a Series for every row and returns a
# DataFrame (not a column) when the frame is empty.

# HTML link to the publication, built from the catalog LINK column.
merge['PMID'] = [generatePMIDLink(link) for link in merge['LINK']]

# chr:bp:ref:alt key, used later to join with the frequency data.
merge['CHRBP_REFALT'] = [
    formatCHRBPREFALT(chrom, start, ref, alt)
    for chrom, start, ref, alt in zip(merge['Chr'], merge['Start'], merge['Ref'], merge['Alt'])
]

#### Get the frequency data from before

In [None]:
#array for frequency df column names since annovar doesn't generate column names
names = ["db", "freq", "chr", "start", "end", "ref", "alt"]
# The separator is a regex matching one whitespace character. It is written as
# a raw string because "\s" in a normal string is an invalid escape sequence,
# and engine="python" is stated explicitly because pandas falls back to it
# (with a ParserWarning) for regex separators anyway.
frequencies = pd.read_csv(
    f"{WRKDIR}/phenovars.avinput.hg19_gnomad211_genome_dropped",
    sep=r"\s",
    names=names,
    engine="python",
)



In [None]:
# Build both columns directly from the source columns instead of a row-wise
# DataFrame.apply(axis=1), which creates a Series for every row and returns a
# DataFrame (not a column) when the frame is empty.

# Pull the single frequency value of interest out of the comma-separated freq field.
frequencies['freq_nfe'] = [getNFE(freq) for freq in frequencies['freq']]

#give frequencies df a CHRBP_REFALT to give it a unique key to merge with later
frequencies['CHRBP_REFALT'] = [
    formatCHRBPREFALT(chrom, start, ref, alt)
    for chrom, start, ref, alt in zip(
        frequencies['chr'], frequencies['start'], frequencies['ref'], frequencies['alt']
    )
]



In [None]:
# Attach the frequency data to the catalog-annotated variants via the shared
# CHRBP_REFALT key; variants with no frequency record are kept.
merge_freq = pd.merge(merge, frequencies, on="CHRBP_REFALT", how='left')

# Keep only the columns needed for the output table and give the catalog's
# trait column its output name.
output_columns = [
    'GWAS',
    'ID',
    'CHRBP_REFALT',
    'locnum',
    'freq_nfe',
    'DISEASE/TRAIT',
    'P-VALUE',
    'PMID',
]
pheno_data = merge_freq[output_columns].rename(
    columns={"DISEASE/TRAIT": "other associated disease"}
)

In [None]:


# Drop rows that are identical in every selected column.
pheno_data = pheno_data.drop_duplicates()

# Write the final phenotype variant table to the results folder, without the index column.
pheno_data.to_csv(f"{DATADIR}/results/PhenotypeVariantData.csv", index = False)