# Calculate Coding Variant LD Values
Frank Grenn  
January 2020

### Input
* a `Variants.csv` file containing a `SNP`, `CHR`, `BP` and `Locus Number` column for each variant of interest (meta5 + progression)
* a `ranges.txt` file containing chr, bp position - 1MB, bp position + 1MB, and an id for each variant of interest. 
* a `CodingVariant.csv` file containing the rsid, chr, bp, locus number, associated disease, amino acid change, and gene, for each coding variant of interest
* plink binary files for the gwas data

### Output
* a `calculateLD.swarm` file that will use plink to get LD values for all the variants
* many `.log` files in the `out` directory (one per variant pair), written by plink when the swarm file is run, each containing the LD values for that pair
* a `CodingVariantLD.csv` file in the `results` directory containing all the coding variants' LD values in one file

In [None]:
import pandas as pd
import subprocess
import re

In [10]:
# Load the variants of interest. 'Locus Number' is cast to str so it can be
# compared against the (also str) 'locus number' column of the coding variants.
variants = pd.read_csv("$PATH1/codingvars/LD/Variants.csv").astype({'Locus Number': 'str'})
variants

Unnamed: 0,SNP,CHR,BP,Locus Number
0,rs114138760,1,154898185,1
1,rs35749011,1,155135036,1
2,rs76763715,1,155205634,1
3,rs6658353,1,161469054,2
4,rs11578699,1,171719769,3
...,...,...,...,...
87,rs55818311,19,2341047,76
88,rs77351827,20,6006041,77
89,rs2248244,21,38852361,78
90,rs61863020,10,112956055,prog1


In [11]:
# Load the coding variants, casting 'locus number' to str so it is comparable
# with the 'Locus Number' labels of the variants of interest.
codingvars = pd.read_csv("$PATH1/codingvars/CodingVariant.csv").astype({'locus number': 'str'})
print(len(codingvars))
print(codingvars.head())

48
           ID  Chr      Start        End locus number REF ALT  \
0   rs2230288    1  155206167  155206167            1   C   T   
1  rs76763715    1  155205634  155205634            1   T   C   
2   rs1801274    1  161479745  161479745            2   A   G   
3  rs12992066    2   95947085   95947085            8   A   G   
4  rs28365795    3  122259640  122259640           15   T   C   

  other associated disease Func.refGene Gene.refGene GeneDetail.refGene  \
0      Parkinson's disease       exonic          GBA                  .   
1                      NaN       exonic          GBA                  .   
2       Ulcerative colitis       exonic       FCGR2A                  .   
3                      NaN       exonic        PROM2                  .   
4                      NaN       exonic        PARP9                  .   

  ExonicFunc.refGene                                   AAChange.refGene  
0  nonsynonymous SNV  GBA:NM_001171811:exon7:c.G832A:p.E278K (E326K)...  
1  nons

In [12]:
# Remove rows that are exact duplicates of an earlier row, then report the
# remaining row count and a preview.
codingvars = codingvars.drop_duplicates()
print(len(codingvars))
print(codingvars.head())

48
           ID  Chr      Start        End locus number REF ALT  \
0   rs2230288    1  155206167  155206167            1   C   T   
1  rs76763715    1  155205634  155205634            1   T   C   
2   rs1801274    1  161479745  161479745            2   A   G   
3  rs12992066    2   95947085   95947085            8   A   G   
4  rs28365795    3  122259640  122259640           15   T   C   

  other associated disease Func.refGene Gene.refGene GeneDetail.refGene  \
0      Parkinson's disease       exonic          GBA                  .   
1                      NaN       exonic          GBA                  .   
2       Ulcerative colitis       exonic       FCGR2A                  .   
3                      NaN       exonic        PROM2                  .   
4                      NaN       exonic        PARP9                  .   

  ExonicFunc.refGene                                   AAChange.refGene  
0  nonsynonymous SNV  GBA:NM_001171811:exon7:c.G832A:p.E278K (E326K)...  
1  nons

In [8]:
%%bash
# Create the directory that will hold the plink .log files.
# -p makes the cell safe to re-run (no error if the directory already exists);
# the path is quoted so it survives spaces in $PATH1.
mkdir -p "$PATH1/codingvars/LD/out"

In [14]:
# Build calculateLD.swarm: one plink `--ld` command for every
# (variant of interest, coding variant) pair that shares a locus.
# Variants are identified to plink as "chr:bp", and the same pair of ids is
# used for the `--out` prefix (so each pair gets its own <snp1>_<snp2>.log).
swarm_path = "$PATH1/codingvars/LD/calculateLD.swarm"
plink_template = "plink --bfile $PATH2/HARDCALLS_PD_september_2018_no_cousins --ld {} {} --extract range $PATH1/codingvars/LD/ranges.txt --out {}_{}\n"

swarm_commands = []
for chrom, bp, locus in zip(variants['CHR'], variants['BP'], variants['Locus Number']):
    snp1 = "{}:{}".format(chrom, bp)

    # coding variants assigned to the same locus as this variant (may be empty)
    locus_codingvars = codingvars.loc[codingvars['locus number'] == locus]

    for coding_chr, coding_start in zip(locus_codingvars['Chr'], locus_codingvars['Start']):
        snp2 = "{}:{}".format(coding_chr, coding_start)
        swarm_commands.append(plink_template.format(snp1, snp2, snp1, snp2))

# Open the file once and in 'w' mode: the previous per-pair append ('a') meant
# that re-running this cell duplicated every command in the swarm file.
with open(swarm_path, 'w') as outfile:
    outfile.writelines(swarm_commands)


In [None]:
#run the swarm file
#cd $PATH1/codingvars/LD/out
#swarm -f $PATH1/codingvars/LD/calculateLD.swarm --partition quick --module plink

In [15]:
# Collect the r2 and D' values from the plink .log file of every
# (variant of interest, coding variant) pair that shares a locus, into `df`.
#
# The values are taken from the first log line of the form
#   R-sq = <r2>   D' = <dprime>
# and are kept as the text plink printed (no float conversion). Logs without
# such a line contribute no row.

# Anchored pattern instead of str.strip('R-sq = '): strip() removes a *set of
# characters* from both ends rather than a prefix, and the old code left the
# whitespace that follows "D' =" at the front of the dprime value.
ld_line_pattern = re.compile(r"R-sq\s*=\s*(\S+)\s+D'\s*=\s*(\S+)")
ld_columns = ['rsid1', 'snp1', 'rsid2', 'snp2', 'r2', 'dprime']
log_dir = "$PATH1/codingvars/LD/out/"

ld_rows = []
for rsid1, chrom, bp, locus in zip(variants['SNP'], variants['CHR'], variants['BP'], variants['Locus Number']):
    snp1 = "{}:{}".format(chrom, bp)

    # coding variants assigned to the same locus as this variant (may be empty)
    locus_codingvars = codingvars.loc[codingvars['locus number'] == locus]

    for rsid2, coding_chr, coding_start in zip(locus_codingvars['ID'], locus_codingvars['Chr'], locus_codingvars['Start']):
        snp2 = "{}:{}".format(coding_chr, coding_start)

        # `with` closes the log even if parsing fails; a missing log still
        # raises FileNotFoundError, as before.
        ld_match = None
        with open(log_dir + snp1 + "_" + snp2 + ".log", "r") as log_file:
            for line in log_file:
                ld_match = ld_line_pattern.search(line)
                if ld_match:
                    break

        # no LD line in this log -> nothing to record for this pair
        if ld_match is None:
            continue

        r2, dprime = ld_match.groups()
        ld_rows.append({'rsid1': rsid1, 'snp1': snp1, 'rsid2': rsid2, 'snp2': snp2, 'r2': r2, 'dprime': dprime})

# Build the frame once: DataFrame.append copied the whole frame on every call
# and no longer exists in pandas >= 2.0.
df = pd.DataFrame(ld_rows, columns=ld_columns)

print(len(df.index))
print(df.head())
print(df.tail())

61
         rsid1         snp1       rsid2         snp2           r2     dprime
0  rs114138760  1:154898185   rs2230288  1:155206167  0.000183319          1
1  rs114138760  1:154898185  rs76763715  1:155205634  5.83634e-05          1
2   rs35749011  1:155135036   rs2230288  1:155206167     0.937428   0.973508
3   rs35749011  1:155135036  rs76763715  1:155205634  0.000100534          1
4   rs76763715  1:155205634   rs2230288  1:155206167  9.99615e-05          1
         rsid1         snp1       rsid2         snp2        r2     dprime
56   rs1941685  18:31304318   rs2282632  18:31320229  0.890114   0.958323
57   rs1941685  18:31304318   rs7232237  18:31324934  0.854411   0.936107
58  rs77351827   20:6006041  rs41282950   20:6011934  0.983308   0.999704
59  rs77351827   20:6006041   rs6085343   20:5986950  0.973083   0.995845
60  rs77351827   20:6006041   rs6107751   20:6033033   0.68409   0.981664


In [16]:
# Write the combined LD table to the results directory; index=False leaves the
# DataFrame's row index out of the CSV.
df.to_csv("$PATH1/results/CodingVariantLD.csv", index = False)