# Generate QTL Plot Lead SNP Information for Psychencode eQTL Data
- **Author(s)** - Frank Grenn
- **Date Started** - March 2020
- **Quick Description:** Create a list of all the QTL plots we want to generate, and find a proxy SNP for each plot whose data does not contain the risk SNP.
- **Data:** 

In [1]:
library(data.table)
library(dplyr)


Attaching package: 'dplyr'


The following objects are masked from 'package:data.table':

    between, first, last


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




In [2]:
# Directory holding the plot-data TSVs created in an earlier step
eqtl_tsv_dir <- file.path("$PATH1", "qtl", "psychencode", "eqtl_tsv")

In [3]:
# Show the entries of the TSV directory (the loops below expect one
# sub-folder per GWAS: meta5, prog1, prog2)
gwas_folders <- list.files(path = eqtl_tsv_dir)
gwas_folders

## (1) Create a Dataframe Containing Information for all the Plots

We want one plot for each gene and each feature on each locus.


In [4]:
# Genes listed per locus
evidence <- fread("$PATH1/genes_by_locus.csv")

# Both variant tables are reduced to the same three columns so that they
# can be stacked afterwards
locus_cols <- c("Locus Number", "SNP", "CHR")

meta5_data <- fread("$PATH1/GWAS_loci_overview.csv") %>%
  select(all_of(locus_cols))

# The progression table names its rsid column "RSID"; rename it to "SNP"
prog_data <- fread("$PATH1/ProgressionLoci.csv") %>%
  select("Locus Number", "RSID", "CHR")
colnames(prog_data) <- locus_cols

In [5]:
# Stack the meta5 and progression variants into a single table
# (columns matched by name, as rbind does for data.tables)
variant_data <- rbindlist(
  list(meta5_data, prog_data),
  use.names = TRUE,
  fill = FALSE
)


In [6]:
# Join every gene on a locus with every risk variant of that locus.
# A locus can carry several risk SNPs (e.g. locus 1 has three), in which
# case each gene on it gets one row -- and later one plot -- per SNP.
plot_df <- merge(
  x = evidence,
  y = variant_data,
  by.x = "Locusnumber",
  by.y = "Locus Number",
  all.x = TRUE,
  allow.cartesian = TRUE
)

plot_df$feature <- "eqtl"

# Label each row with the GWAS it belongs to; everything that is not a
# progression locus is treated as meta5
is_prog1 <- plot_df$Locusnumber == "prog1"
is_prog2 <- plot_df$Locusnumber == "prog2"

meta5_plots <- plot_df[which(!is_prog1 & !is_prog2), ]
meta5_plots$gwas <- "meta5"

prog1_plots <- plot_df[which(is_prog1), ]
prog1_plots$gwas <- "prog1"

prog2_plots <- plot_df[which(is_prog2), ]
prog2_plots$gwas <- "prog2"


In [7]:
# Re-assemble the per-GWAS pieces and sort by locus, gene and SNP
plot_df <- rbind(meta5_plots, prog1_plots, prog2_plots)
row_order <- order(plot_df$Locusnumber, plot_df$Gene, plot_df$SNP)
plot_df <- plot_df[row_order, ]

# Placeholder values for the manually assigned lead SNP: many plots will
# likely lack data for the actual lead SNP in the 'SNP' column, so these
# are filled in by the later steps
plot_df$forced_lead_variant <- "NA"
plot_df$forced_lead_variant_r2 <- 0
plot_df$reason <- "NA"

In [8]:
# Quick look at the size and the first/last rows of the plot table
dim(plot_df)
head(plot_df, n = 6L)
tail(plot_df, n = 6L)

Locusnumber,Gene,SNP,CHR,feature,gwas,forced_lead_variant,forced_lead_variant_r2,reason
<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<chr>
1,ADAM15,rs114138760,1,eqtl,meta5,,0,
1,ADAM15,rs35749011,1,eqtl,meta5,,0,
1,ADAM15,rs76763715,1,eqtl,meta5,,0,
1,ADAR,rs114138760,1,eqtl,meta5,,0,
1,ADAR,rs35749011,1,eqtl,meta5,,0,
1,ADAR,rs76763715,1,eqtl,meta5,,0,


Locusnumber,Gene,SNP,CHR,feature,gwas,forced_lead_variant,forced_lead_variant_r2,reason
<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<chr>
prog2,OR13D1,rs382940,9,eqtl,prog2,,0,
prog2,OR13F1,rs382940,9,eqtl,prog2,,0,
prog2,RALGAPA1P1,rs382940,9,eqtl,prog2,,0,
prog2,SLC44A1,rs382940,9,eqtl,prog2,,0,
prog2,TAL2,rs382940,9,eqtl,prog2,,0,
prog2,TMEM38B,rs382940,9,eqtl,prog2,,0,


## (2) Filter the plot_df

`plots_with_risk_snp` will contain the plots that contain the risk snp in the data. So we should have no issue plotting these  
`plots_with_no_data` will contain plots that have no snps in their tsvs to plot. So we won't plot these  
`plots_to_check` will contain plots that don't have the risk snp but do have data. So we will need to find a proxy snp for these to determine if we can plot them or not


In [9]:
# Start all three result tables as empty data frames; the filtering loop
# appends rows to them
plots_with_risk_snp <- plots_with_no_data <- plots_to_check <- data.frame()

do the filtering

In [10]:
# Sort every requested plot into one of three result tables:
#   plots_with_risk_snp - the merged GWAS/eQTL data contains the risk SNP
#   plots_to_check      - data exists but lacks the risk SNP (proxy needed)
#   plots_with_no_data  - TSVs are missing or empty
# The (possibly updated) row is also written back into plot_df.
# seq_len() keeps the loop from running when plot_df has no rows
# (1:nrow() would iterate over 1 and 0).
for (rownum in seq_len(nrow(plot_df))) {
  print(rownum)

  row <- plot_df[rownum, ]
  snp <- row$SNP
  gene <- row$Gene
  gwas <- row$gwas

  print(paste0("checking", snp, " ", gene, " ", gwas))

  # The TSVs live in a sub-folder named after the GWAS (meta5/prog1/prog2).
  # Deriving the paths from `gwas` on every iteration means an unexpected
  # value can never silently reuse the previous row's file names.
  tsv_dir <- file.path(eqtl_tsv_dir, gwas)
  gwas_fn <- file.path(tsv_dir, paste0(gene, "_gwas.tsv"))
  qtl_fn <- file.path(tsv_dir, paste0(gene, "_eqtl.tsv"))

  # Assume there is nothing to plot until both TSVs prove otherwise
  has_data <- FALSE

  if (file.exists(gwas_fn) && file.exists(qtl_fn)) {
    print("files exist")
    g <- read.table(gwas_fn, header = TRUE)
    e <- read.table(qtl_fn, header = TRUE)
    has_data <- nrow(g) != 0 && nrow(e) != 0
  }

  if (!has_data) {
    # Files missing, or at least one of them is empty
    row$reason <- "no data for plot"
    plots_with_no_data <- rbind(plots_with_no_data, row)
  } else {
    # Keep only the SNPs present in both the GWAS and the eQTL data
    df <- merge(g, e, by = "rsid")
    print("df:")
    print(head(df))

    if (snp %in% df$rsid) {
      # The risk variant itself can serve as the lead SNP
      print("we have the risk snp!!!")
      row$forced_lead_variant <- snp
      row$forced_lead_variant_r2 <- 1
      row$reason <- "has risk variant"
      plots_with_risk_snp <- rbind(plots_with_risk_snp, row)
    } else {
      # No risk variant in the data: queue the plot for the proxy search.
      # num_snps goes on a copy so `row` keeps the same columns as plot_df.
      tmp <- row
      tmp$num_snps <- nrow(df)
      plots_to_check <- rbind(plots_to_check, tmp)
    }
  }

  plot_df[rownum, ] <- row
}

[1] 1
[1] "checkingrs114138760 ADAM15 meta5"
[1] "files exist"
[1] "df:"
        rsid        var_id.x  pval.x        var_id.y      pval.y
1 rs10737174 1:154158882:C:G 0.13770 1:154158882:C:G 1.15465e-04
2 rs10796935 1:154891363:G:T 0.04777 1:154891363:G:T 2.04448e-04
3 rs10796936 1:154999514:T:A 0.61790 1:154999514:T:A 5.84282e-04
4 rs10796944 1:155416328:G:A 0.22990 1:155416328:G:A 4.68168e-04
5 rs10796946 1:155429490:A:T 0.60480 1:155429490:A:T 3.44108e-04
6 rs10908444 1:154840287:G:A 0.05325 1:154840287:G:A 2.17834e-05
[1] 2
[1] "checkingrs35749011 ADAM15 meta5"
[1] "files exist"
[1] "df:"
        rsid        var_id.x  pval.x        var_id.y      pval.y
1 rs10737174 1:154158882:C:G 0.13770 1:154158882:C:G 1.15465e-04
2 rs10796935 1:154891363:G:T 0.04777 1:154891363:G:T 2.04448e-04
3 rs10796936 1:154999514:T:A 0.61790 1:154999514:T:A 5.84282e-04
4 rs10796944 1:155416328:G:A 0.22990 1:155416328:G:A 4.68168e-04
5 rs10796946 1:155429490:A:T 0.60480 1:155429490:A:T 3.44108e-04
6 rs109084

In [1]:
dim(plots_with_risk_snp)
head(plots_with_risk_snp)

# write.csv() always writes the header row and ignores `col.names` with a
# warning, so the argument is dropped. The header is needed: this file is
# read back with fread() when the final plot list is built.
write.csv(
  plots_with_risk_snp,
  file = "$PATH1/qtl/psychencode/eqtl_plots_with_risk_snp.csv",
  row.names = FALSE
)

SyntaxError: keyword can't be an expression (<ipython-input-1-a86cd515ba44>, line 4)

In [2]:
dim(plots_with_no_data)
head(plots_with_no_data)

# write.csv() always writes the header row and ignores `col.names` with a
# warning, so the argument is dropped.
write.csv(
  plots_with_no_data,
  file = "$PATH1/qtl/psychencode/eqtl_plots_with_no_data.csv",
  row.names = FALSE
)

SyntaxError: keyword can't be an expression (<ipython-input-2-c79c6cfcad48>, line 4)

In [3]:
dim(plots_to_check)
head(plots_to_check)

# write.csv() always writes the header row and ignores `col.names` with a
# warning, so the argument is dropped. The header is needed: this file is
# read back with fread() and accessed by column name in the proxy check.
write.csv(
  plots_to_check,
  file = "$PATH1/qtl/psychencode/eqtl_plots_to_check.csv",
  row.names = FALSE
)

SyntaxError: keyword can't be an expression (<ipython-input-3-9bd7226199b4>, line 4)

## (3) Obtain Proxy SNPS Per Locus

Use LDlinkR to generate, for each risk variant, a list of SNPs in LD with it (R2 > 0.7).

Run the script below in a terminal. LDlinkR queries must be made sequentially, so there is no point in changing the script to run in parallel.


In [14]:
# Reminder of the command to run in a terminal (not executed from here)
print(x = "Rscript getRiskSNPProxies.R")

[1] "Rscript getRiskSNPProxies.R"


## (4) Check Plots for Proxy SNPs

Go through all the plots that need checking and see whether each one has a proxy SNP in its data.

Write the plots with a good proxy SNP to a file, then combine that file with the list of plots that already have the risk SNP to create the final plot list file.



In [15]:
# Directory with the per-SNP proxy files (<rsid>_proxies.csv)
proxy_dir <- file.path("$PATH1", "qtl", "proxy_snps")

In [16]:
# Reload the plots that lack the risk SNP, and start the two result
# tables of the proxy check as empty data frames
plots_to_check <- fread(
  "$PATH1/qtl/psychencode/eqtl_plots_to_check.csv"
)
plots_with_proxy_snp <- plots_with_no_proxy_snp <- data.frame()

In [17]:
# Number of rows and columns of the plots still to be checked
print(c(nrow(plots_to_check), ncol(plots_to_check)))

[1] 1693   10


In [18]:
# For every plot whose data lacks the risk SNP, look for a proxy SNP that
# is present in the merged GWAS/eQTL data:
#   plots_with_proxy_snp    - best proxy (highest R2) found in the data
#   plots_with_no_proxy_snp - no proxies at all, or none present in the data
# seq_len() keeps the loop from running when plots_to_check has no rows
# (1:nrow() would iterate over 1 and 0).
for (rownum in seq_len(nrow(plots_to_check))) {
  row <- plots_to_check[rownum, ]
  snp <- row$SNP
  gene <- row$Gene
  gwas <- row$gwas

  print(paste0("checking", snp, " ", gene, " ", gwas))

  # The TSVs live in a sub-folder named after the GWAS (meta5/prog1/prog2).
  # Deriving the paths from `gwas` on every iteration means an unexpected
  # value can never silently reuse the previous row's file names.
  tsv_dir <- file.path(eqtl_tsv_dir, gwas)
  gwas_fn <- file.path(tsv_dir, paste0(gene, "_gwas.tsv"))
  qtl_fn <- file.path(tsv_dir, paste0(gene, "_eqtl.tsv"))

  g <- read.table(gwas_fn, header = TRUE)
  e <- read.table(qtl_fn, header = TRUE)

  # Keep only the SNPs present in both the GWAS and the eQTL data
  df <- merge(g, e, by = "rsid")

  # Proxy SNPs for this risk variant
  proxies <- fread(file.path(proxy_dir, paste0(snp, "_proxies.csv")))

  if (nrow(proxies) == 0) {
    row$reason <- "there was no proxy snp for the risk variant"
    plots_with_no_proxy_snp <- rbind(plots_with_no_proxy_snp, row)
    next
  }

  # Proxies that actually occur in the data we want to plot
  merged <- merge(df, proxies, by.x = "rsid", by.y = "RS_Number")

  if (nrow(merged) > 0) {
    # Use the available proxy with the highest R2 as the lead SNP
    best_ld_snp <- merged[which.max(merged$R2), ]
    row$forced_lead_variant <- best_ld_snp$rsid
    row$forced_lead_variant_r2 <- best_ld_snp$R2
    row$reason <- "plot has a good proxy snp"
    plots_with_proxy_snp <- rbind(plots_with_proxy_snp, row)
  } else {
    row$reason <- "no good proxy snp exists in the data we want to plot"
    plots_with_no_proxy_snp <- rbind(plots_with_no_proxy_snp, row)
  }
}

[1] "checkingrs114138760 ADAM15 meta5"
[1] "checkingrs35749011 ADAM15 meta5"
[1] "checkingrs76763715 ADAM15 meta5"
[1] "checkingrs114138760 ADAR meta5"
[1] "checkingrs35749011 ADAR meta5"
[1] "checkingrs76763715 ADAR meta5"
[1] "checkingrs114138760 AQP10 meta5"
[1] "checkingrs35749011 AQP10 meta5"
[1] "checkingrs76763715 AQP10 meta5"
[1] "checkingrs114138760 ARHGEF2 meta5"
[1] "checkingrs35749011 ARHGEF2 meta5"
[1] "checkingrs76763715 ARHGEF2 meta5"
[1] "checkingrs114138760 ASH1L-AS1 meta5"
[1] "checkingrs35749011 ASH1L-AS1 meta5"
[1] "checkingrs76763715 ASH1L-AS1 meta5"
[1] "checkingrs114138760 ATP8B2 meta5"
[1] "checkingrs35749011 ATP8B2 meta5"
[1] "checkingrs76763715 ATP8B2 meta5"
[1] "checkingrs114138760 C1orf189 meta5"
[1] "checkingrs35749011 C1orf189 meta5"
[1] "checkingrs76763715 C1orf189 meta5"
[1] "checkingrs114138760 C1orf43 meta5"
[1] "checkingrs35749011 C1orf43 meta5"
[1] "checkingrs76763715 C1orf43 meta5"
[1] "checkingrs114138760 CHRNB2 meta5"
[1] "checkingrs35749011 CHRNB

In [4]:
dim(plots_with_proxy_snp)
head(plots_with_proxy_snp)

# write.csv() always writes the header row and ignores `col.names` with a
# warning, so the argument is dropped.
write.csv(
  plots_with_proxy_snp,
  file = "$PATH1/qtl/psychencode/eqtl_plots_with_proxy_snp.csv",
  row.names = FALSE
)

SyntaxError: keyword can't be an expression (<ipython-input-4-bf1e393aabc5>, line 4)

In [5]:
dim(plots_with_no_proxy_snp)
head(plots_with_no_proxy_snp)

# write.csv() always writes the header row and ignores `col.names` with a
# warning, so the argument is dropped.
write.csv(
  plots_with_no_proxy_snp,
  file = "$PATH1/qtl/psychencode/eqtl_plots_with_no_proxy_snp.csv",
  row.names = FALSE
)

SyntaxError: keyword can't be an expression (<ipython-input-5-dd25b2e57a30>, line 4)

make the final plot list file

In [6]:
# Plots that already contain the risk SNP were saved without a num_snps
# column; add it (as NA) so the table lines up with plots_with_proxy_snp
plots_with_risk_snp <- fread(
  "$PATH1/qtl/psychencode/eqtl_plots_with_risk_snp.csv"
)
plots_with_risk_snp$num_snps <- NA

# Final plot list: plots with the risk SNP plus plots with a usable proxy
plots_to_plot <- rbind(plots_with_risk_snp, plots_with_proxy_snp)

# write.csv() always writes the header row and ignores `col.names` with a
# warning, so the argument is dropped.
write.csv(
  plots_to_plot,
  file = "$PATH1/qtl/psychencode/eqtl_plots_to_plot.csv",
  row.names = FALSE
)

SyntaxError: invalid syntax (<ipython-input-6-813f38dd4050>, line 2)

Finally, build a summary table for all the plots.

In [7]:
# Drop num_snps so that all three tables share the same columns as
# plots_with_no_data before they are stacked
plots_to_plot <- plots_to_plot[, -c("num_snps")]
plots_with_no_proxy_snp <- plots_with_no_proxy_snp[, -c("num_snps")]

# One overview of every plot: plottable, no usable proxy, or no data
final_summary <- rbind(
  plots_to_plot,
  plots_with_no_proxy_snp,
  plots_with_no_data
)
dim(final_summary)
head(final_summary)

# write.csv() always writes the header row and ignores `col.names` with a
# warning, so the argument is dropped.
write.csv(
  final_summary,
  file = "$PATH1/qtl/psychencode/eqtl_plots_final_overview.csv",
  row.names = FALSE
)

SyntaxError: invalid syntax (<ipython-input-7-a9293f8124ec>, line 1)