# Generate QTL Plot Lead SNP Information
- **Author(s)** - Frank Grenn
- **Date Started** - March 2020
- **Quick Description:** Create a list of all the qtl plots we want to generate (one per gene and feature) and get proxy snps for each plot that doesn't have the risk snp available in the data
- **Data:** 

In [150]:
library(data.table)
library(dplyr)

In [151]:
# directory holding the per-plot data TSVs produced in an earlier step
tsv_dir <- file.path("$PATH1", "qtl", "tsv")

## (1) Create a Dataframe Containing Information for all the Plots

we want a plot for each gene, each feature (blood, brain) on each locus


In [152]:
# gene list per locus (provides the Locusnumber and Gene columns used below)
evidence <- fread("$PATH1/genes_by_locus.csv")

# variants from GWAS_loci_overview.csv, reduced to locus number, rsid, chromosome
meta5_data <- select(
  fread("$PATH1/GWAS_loci_overview.csv"),
  "Locus Number", "SNP", "CHR"
)

# variants from ProgressionLoci.csv, reduced to the same three fields; the
# rsid column is called RSID there, so the names are aligned with meta5_data
prog_data <- select(
  fread("$PATH1/ProgressionLoci.csv"),
  "Locus Number", "RSID", "CHR"
)
colnames(prog_data) <- c("Locus Number", "SNP", "CHR")

In [153]:
#combine all the rsids to one df
#stacks the GWAS and progression variant tables row-wise; both were reduced to
#the columns "Locus Number", "SNP" and "CHR" when they were loaded
variant_data <- rbind(meta5_data, prog_data)


In [154]:
# Join the gene table to the variant table on the locus number. A locus can
# carry several risk variants (ex: locus 1 has three), so every gene on such a
# locus is repeated once per variant; allow.cartesian permits that expansion.
# all.x keeps genes whose locus has no matching variant.
plot_df <- merge(
  x = evidence,
  y = variant_data,
  by.x = "Locusnumber",
  by.y = "Locus Number",
  all.x = TRUE,
  allow.cartesian = TRUE
)

# one full copy of the table per feature, tagged with the feature name
em_blood <- plot_df
em_blood$feature <- "blood"

em_brain <- plot_df
em_brain$feature <- "brain"

In [155]:
# stack the blood and brain copies into one table
plot_df <- rbind(em_blood, em_brain)

# sort by locus, then gene, then variant; the row order is computed with base
# order() before subsetting
row_order <- order(plot_df$Locusnumber, plot_df$Gene, plot_df$SNP)
plot_df <- plot_df[row_order, ]

# placeholder values for the manually assigned lead snp: many plots likely
# won't have data for the actual lead snp held in the SNP column, so these
# columns are filled in later
plot_df$forced_lead_variant <- "NA"
plot_df$forced_lead_variant_r2 <- 0
plot_df$reason <- "NA"

In [156]:
#inspect the assembled plot table: its dimensions, then the first and last rows
dim(plot_df)
head(plot_df)
tail(plot_df)

Locusnumber,Gene,SNP,CHR,feature,forced_lead_variant,forced_lead_variant_r2,reason
<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<dbl>,<chr>
1,ADAM15,rs114138760,1,blood,,0,
1,ADAM15,rs114138760,1,brain,,0,
1,ADAM15,rs35749011,1,blood,,0,
1,ADAM15,rs35749011,1,brain,,0,
1,ADAM15,rs76763715,1,blood,,0,
1,ADAM15,rs76763715,1,brain,,0,


Locusnumber,Gene,SNP,CHR,feature,forced_lead_variant,forced_lead_variant_r2,reason
<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<dbl>,<chr>
prog2,SLC44A1,rs382940,9,blood,,0,
prog2,SLC44A1,rs382940,9,brain,,0,
prog2,TAL2,rs382940,9,blood,,0,
prog2,TAL2,rs382940,9,brain,,0,
prog2,TMEM38B,rs382940,9,blood,,0,
prog2,TMEM38B,rs382940,9,brain,,0,


## (2) Filter the plot_df

`plots_with_risk_snp` will contain the plots that contain the risk snp in the data. So we should have no issue plotting these  
`plots_with_no_data` will contain plots that have no snps in their tsvs to plot. So we won't plot these  
`plots_to_check` will contain plots that don't have the risk snp but do have data. So we will need to find a proxy snp for these to determine if we can plot them or not


In [157]:
# all three result tables start out as empty data frames
plots_with_risk_snp <- plots_with_no_data <- plots_to_check <- data.frame()

do the filtering

In [158]:
# Classify every row of plot_df by what its GWAS/eQTL TSVs in tsv_dir contain:
#   plots_with_risk_snp - the risk variant is present in the merged data
#   plots_to_check      - there is data but the risk variant is absent, so a
#                         proxy snp has to be looked up later (gets num_snps)
#   plots_with_no_data  - a TSV is missing or has no rows
# plot_df itself is updated with the forced lead variant / reason of each row.
#
# Rows are collected in preallocated lists and bound once after the loop
# instead of growing three tables with rbind() on every iteration. The three
# result tables are (re)assigned from scratch, so re-running this cell does not
# append duplicate rows.
n_plots <- nrow(plot_df)
risk_snp_rows <- vector("list", n_plots)
no_data_rows <- vector("list", n_plots)
to_check_rows <- vector("list", n_plots)

# seq_len() gives an empty sequence for an empty plot_df (1:0 would not)
for (rownum in seq_len(n_plots)) {
  print(rownum)

  row <- plot_df[rownum, ]
  snp <- row$SNP
  gene <- row$Gene
  feature <- row$feature

  print(paste0("checking ", snp, " ", gene, " ", feature))

  # paths of the GWAS and eQTL TSVs for this gene/feature
  gwas_fn <- paste0(tsv_dir, "/", gene, "_", feature, "_gwas.tsv")
  eqtl_fn <- paste0(tsv_dir, "/", gene, "_", feature, "_eqtl.tsv")

  # stays FALSE when a file is missing or either table has no rows
  has_data <- FALSE

  if (file.exists(gwas_fn) && file.exists(eqtl_fn)) {
    print("files exist")
    g <- read.table(gwas_fn, header = TRUE)
    e <- read.table(eqtl_fn, header = TRUE)

    if (nrow(g) != 0 && nrow(e) != 0) {
      has_data <- TRUE

      # keep only variants present in both the GWAS and the eQTL data
      df <- merge(g, e, by = "rsid")
      print("df:")
      print(head(df))

      if (snp %in% df$rsid) {
        # the risk variant itself can serve as lead snp for this plot
        print("we have the risk snp!!!")
        row$forced_lead_variant <- snp
        row$forced_lead_variant_r2 <- 1
        row$reason <- "has risk variant"
        risk_snp_rows[[rownum]] <- row
      } else {
        # no risk variant in the data: remember the row (plus the number of
        # shared snps) so a proxy can be searched for later; plot_df keeps the
        # placeholder values for this row
        tmp <- row
        tmp$num_snps <- nrow(df)
        to_check_rows[[rownum]] <- tmp
      }
    }
  }

  if (!has_data) {
    row$reason <- "no data for plot"
    no_data_rows[[rownum]] <- row
  }

  plot_df[rownum, ] <- row
}

# bind the collected rows once; slots that were never filled are dropped first
plots_with_risk_snp <- rbindlist(Filter(Negate(is.null), risk_snp_rows))
plots_with_no_data <- rbindlist(Filter(Negate(is.null), no_data_rows))
plots_to_check <- rbindlist(Filter(Negate(is.null), to_check_rows))

[1] 1
[1] "checkingrs114138760 ADAM15 blood"
[1] "files exist"
[1] "df:"
        rsid    pval.x      pval.y
1  rs1001848 3.518e-07  3.2436e-66
2  rs1007170 1.006e-01  9.9655e-40
3  rs1010033 7.085e-01  2.5882e-07
4  rs1010225 1.703e-01  8.3265e-18
5 rs10157801 1.502e-05 3.0782e-163
6  rs1018730 7.476e-01  3.1109e-07
[1] 2
[1] "checkingrs114138760 ADAM15 brain"
[1] "files exist"
[1] "df:"
        rsid    pval.x      pval.y
1  rs1001848 3.518e-07 1.59618e-03
2  rs1007170 1.006e-01 1.28004e-05
3  rs1046188 9.360e-01 4.89495e-06
4 rs10494301 1.956e-01 1.60838e-03
5  rs1052176 9.503e-01 7.53418e-06
6  rs1052177 9.618e-01 7.56439e-06
[1] 3
[1] "checkingrs35749011 ADAM15 blood"
[1] "files exist"
[1] "df:"
        rsid    pval.x      pval.y
1  rs1001848 3.518e-07  3.2436e-66
2  rs1007170 1.006e-01  9.9655e-40
3  rs1010033 7.085e-01  2.5882e-07
4  rs1010225 1.703e-01  8.3265e-18
5 rs10157801 1.502e-05 3.0782e-163
6  rs1018730 7.476e-01  3.1109e-07
[1] "we have the risk snp!!!"
[1] 4
[1] "checki

In [1]:
# size and first rows of the plots whose data already contains the risk snp
dim(plots_with_risk_snp)
head(plots_with_risk_snp)

# write.csv() always writes the header row and ignores col.names (with a
# warning), so that argument is not passed. The header is needed: this file is
# read back with fread() when the final plot list is built.
write.csv(
  plots_with_risk_snp,
  file = "$PATH1/qtl/plots_with_risk_snp.csv",
  row.names = FALSE
)

SyntaxError: keyword can't be an expression (<ipython-input-1-57bf1ef3d918>, line 4)

In [2]:
# size and first rows of the plots that have no data to plot
dim(plots_with_no_data)
head(plots_with_no_data)

# write.csv() always writes the header row and ignores col.names (with a
# warning), so that argument is not passed
write.csv(
  plots_with_no_data,
  file = "$PATH1/qtl/plots_with_no_data.csv",
  row.names = FALSE
)

SyntaxError: keyword can't be an expression (<ipython-input-2-e56009d91474>, line 4)

In [3]:
# size and first rows of the plots that have data but lack the risk snp
dim(plots_to_check)
head(plots_to_check)

# write.csv() always writes the header row and ignores col.names (with a
# warning), so that argument is not passed. The header is needed: this file is
# read back with fread() before the proxy snp check.
write.csv(
  plots_to_check,
  file = "$PATH1/qtl/plots_to_check.csv",
  row.names = FALSE
)

SyntaxError: keyword can't be an expression (<ipython-input-3-1a7f5263e812>, line 4)

## (3) Obtain Proxy SNPs Per Locus

use LDlinkR to generate a list of snps with a LD R2 > 0.7 for each risk variant

run this in terminal. Queries must be made sequentially for LDlinkR to work (so don't bother changing the script to run in parallel)


In [10]:
#reminder only: this cell just prints the command, it does not execute the script
print("Rscript getRiskSNPProxies.R")

[1] "Rscript getRiskSNPProxies.R"


## (4) Check Plots for Proxy SNPs

go through all the plots we know need checking and see if they have a proxy.

write the plots that have a good proxy snp to a file, and then combine that file with the list of plots that already have the risk snp to create the final plot list file



In [162]:
# directory with one <rsid>_proxies.csv file per risk variant
proxy_dir <- file.path("$PATH1", "qtl", "proxy_snps")

In [163]:
# reload the plots that still need a proxy snp from disk
plots_to_check <- fread("$PATH1/qtl/plots_to_check.csv")

# both proxy result tables start out as empty data frames
plots_with_proxy_snp <- plots_with_no_proxy_snp <- data.frame()

In [164]:
#number of plots (rows) and columns that still need a proxy snp check
print(dim(plots_to_check))

[1] 3055    9


In [165]:
# For every plot in plots_to_check, look for a proxy of the risk variant among
# the snps shared by the plot's GWAS and eQTL TSVs:
#   plots_with_proxy_snp    - at least one proxy is in the data; the one with
#                             the highest R2 becomes the forced lead variant
#   plots_with_no_proxy_snp - no proxy in the data, or no proxies at all
# NOTE(review): no R2 threshold is applied here; this presumably relies on the
# proxy files already being filtered - confirm against getRiskSNPProxies.R.
#
# Rows are collected in preallocated lists and bound once after the loop
# instead of growing two tables with rbind() on every iteration.
n_to_check <- nrow(plots_to_check)
proxy_rows <- vector("list", n_to_check)
no_proxy_rows <- vector("list", n_to_check)

# seq_len() gives an empty sequence when there is nothing to check
for (rownum in seq_len(n_to_check)) {
  row <- plots_to_check[rownum, ]
  snp <- row$SNP
  gene <- row$Gene
  feature <- row$feature

  # read the GWAS and eQTL TSVs for this gene/feature and merge them by rsid
  gwas_fn <- paste0(tsv_dir, "/", gene, "_", feature, "_gwas.tsv")
  eqtl_fn <- paste0(tsv_dir, "/", gene, "_", feature, "_eqtl.tsv")
  g <- read.table(gwas_fn, header = TRUE)
  e <- read.table(eqtl_fn, header = TRUE)
  df <- merge(g, e, by = "rsid")

  # load the proxy snps of the risk variant; a missing proxy file is reported
  # and treated like an empty one instead of aborting the whole loop
  proxy_fn <- paste0(proxy_dir, "/", snp, "_proxies.csv")
  if (file.exists(proxy_fn)) {
    proxies <- fread(proxy_fn)
  } else {
    warning("no proxy file found for ", snp, ": ", proxy_fn, call. = FALSE)
    proxies <- data.table()
  }

  if (nrow(proxies) > 0) {
    # keep only the proxies that are present in the data we want to plot
    merged <- merge(df, proxies, by.x = "rsid", by.y = "RS_Number")

    if (nrow(merged) > 0) {
      best_ld_snp <- merged[which.max(merged$R2), ]
      # as.character() guards against rsid having been read as a factor
      row$forced_lead_variant <- as.character(best_ld_snp$rsid)
      row$forced_lead_variant_r2 <- best_ld_snp$R2
      row$reason <- "plot has a good proxy snp"
      proxy_rows[[rownum]] <- row
    } else {
      row$reason <- "no good proxy snp exists in the data we want to plot"
      no_proxy_rows[[rownum]] <- row
    }
  } else {
    row$reason <- "there was no proxy snp for the risk variant"
    no_proxy_rows[[rownum]] <- row
  }
}

# bind the collected rows once; slots that were never filled are dropped first
plots_with_proxy_snp <- rbindlist(Filter(Negate(is.null), proxy_rows))
plots_with_no_proxy_snp <- rbindlist(Filter(Negate(is.null), no_proxy_rows))

In [4]:
# size and first rows of the plots for which a proxy snp was found in the data
dim(plots_with_proxy_snp)
head(plots_with_proxy_snp)

# write.csv() always writes the header row and ignores col.names (with a
# warning), so that argument is not passed
write.csv(
  plots_with_proxy_snp,
  file = "$PATH1/qtl/plots_with_proxy_snp.csv",
  row.names = FALSE
)

SyntaxError: keyword can't be an expression (<ipython-input-4-cc62dbcb2f6a>, line 4)

In [5]:
# size and first rows of the plots for which no usable proxy snp was found
dim(plots_with_no_proxy_snp)
head(plots_with_no_proxy_snp)

# write.csv() always writes the header row and ignores col.names (with a
# warning), so that argument is not passed
write.csv(
  plots_with_no_proxy_snp,
  file = "$PATH1/qtl/plots_with_no_proxy_snp.csv",
  row.names = FALSE
)

SyntaxError: keyword can't be an expression (<ipython-input-5-9f4ac524fc97>, line 4)

make the final plot list file

In [6]:
# reload the plots that already contain the risk snp; they have no num_snps
# column, so add an empty one to line up with plots_with_proxy_snp
plots_with_risk_snp <- fread("$PATH1/qtl/plots_with_risk_snp.csv")
plots_with_risk_snp$num_snps <- NA

# final plot list: plots with the risk snp plus plots with a proxy snp
plots_to_plot <- rbind(plots_with_risk_snp, plots_with_proxy_snp)

# write.csv() always writes the header row and ignores col.names (with a
# warning), so that argument is not passed
write.csv(
  plots_to_plot,
  file = "$PATH1/qtl/plots_to_plot.csv",
  row.names = FALSE
)

SyntaxError: invalid syntax (<ipython-input-6-ca3f70174a64>, line 2)

finally, create a summary table for all the plots

In [7]:
# drop num_snps so all three tables share the same columns. Assigning NULL
# works for both data.table and plain data.frame objects, whereas the
# [, -c("num_snps")] form is only valid on a data.table.
plots_to_plot$num_snps <- NULL
plots_with_no_proxy_snp$num_snps <- NULL

# overview of every plot: the ones to plot, the ones without a usable proxy
# snp, and the ones without data
final_summary <- rbind(plots_to_plot, plots_with_no_proxy_snp, plots_with_no_data)
dim(final_summary)
head(final_summary)

# write.csv() always writes the header row and ignores col.names (with a
# warning), so that argument is not passed
write.csv(
  final_summary,
  file = "$PATH1/qtl/plots_final_overview.csv",
  row.names = FALSE
)

SyntaxError: invalid syntax (<ipython-input-7-9e354a901e70>, line 1)

## (5) Next Steps

next we want to (finally!) create the plots

so copy the `tsv` folder, and the `plots_to_plot.csv` file locally

we need to run the plot generating code locally because `locuscompareR` uses code to query a sql database for LD values and that doesn't work very well on biowulf

run the script `blood_brain_eQTL_plot.R` locally