# Generate QTL Plot Lead SNP Information for Psychencode isoQTL Data
- **Author(s)** - Frank Grenn
- **Date Started** - March 2020
- **Quick Description:** Create a list of all the QTL plots we want to generate, and find proxy SNPs for each plot whose data does not contain the risk SNP.
- **Data:** 

In [1]:
library(data.table)
library(dplyr)


Attaching package: 'dplyr'


The following objects are masked from 'package:data.table':

    between, first, last


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




In [2]:
# Location of the plot data tsvs created previously.
# Every entry directly inside this directory is later listed and treated as one GWAS folder.
isoqtl_tsv_dir <- "$PATH1/qtl/psychencode/isoqtl_tsv"

In [3]:
# List the entries of isoqtl_tsv_dir; each one is used below as a GWAS folder name
# (the later cells build paths for "meta5", "prog1" and "prog2").
gwas_folders <- list.files(isoqtl_tsv_dir)
gwas_folders

## (1) Create a Dataframe Containing Information for all the Plots

We want a plot for each gene and each feature at each locus.


In [4]:
# Build file_df: one (gene, transcript) row for every "*gwas*" file found in
# the GWAS sub-directories of isoqtl_tsv_dir. File names are expected to look
# like "<gene>_<transcript>_gwas.tsv", so the first two "_"-separated fields
# are the gene and the transcript.

# Collect the matching file names from every GWAS folder, in folder order.
gwas_file_names <- as.character(unlist(lapply(gwas_folders, function(gwas) {
  folder_files <- list.files(paste0(isoqtl_tsv_dir, "/", gwas))
  folder_files[grepl("gwas", folder_files, fixed = TRUE)]
})))

# One single-row data frame per file name.
file_rows <- lapply(
  strsplit(gwas_file_names, split = "_", fixed = TRUE),
  function(name_parts) {
    data.frame("gene" = name_parts[1], "transcript" = name_parts[2])
  }
)

# Bind once at the end instead of growing the data frame inside the loop.
# As before, file_df stays NULL when no matching file exists.
file_df <- do.call(rbind, file_rows)

print(dim(file_df))
print(head(file_df))


[1] 731   2
    gene      transcript
1  ABCC5 ENST00000443497
2  ACAD8 ENST00000524502
3  ACAD8 ENST00000526026
4  ACAD8 ENST00000531338
5  ACAD8 ENST00000534240
6 ACADVL ENST00000322910


In [5]:
# Table linking genes to loci (merged on its Locusnumber / Gene columns below).
evidence <- fread("$PATH1/genes_by_locus.csv")

# Risk variants from the GWAS loci overview: keep locus, rsid and chromosome.
meta5_data <- fread("$PATH1/GWAS_loci_overview.csv") %>%
  select("Locus Number", "SNP", "CHR")

# Risk variants from the progression loci: same three columns, with RSID
# renamed to SNP so both tables can be stacked.
prog_data <- fread("$PATH1/ProgressionLoci.csv") %>%
  select("Locus Number", "RSID", "CHR")
colnames(prog_data) <- c("Locus Number", "SNP", "CHR")

In [6]:
# Combine all the rsids into one table by stacking the meta5 and progression
# variants; both tables carry the columns "Locus Number", "SNP" and "CHR".
variant_data <- rbind(meta5_data, prog_data)
print(head(variant_data))

   Locus Number         SNP CHR
1:            1 rs114138760   1
2:            1  rs35749011   1
3:            1  rs76763715   1
4:            2   rs6658353   1
5:            3  rs11578699   1
6:            4    rs823118   1


In [7]:
# Join every gene in the evidence table to the risk variants of its locus.
# A locus can have several risk variants (e.g. locus 1 has three), so a gene
# on such a locus yields one row, and later one plot, per variant.
# all.x = TRUE keeps genes whose locus has no variant entry.
plot_df <- merge(
  x = evidence,
  y = variant_data,
  by.x = "Locusnumber",
  by.y = "Locus Number",
  all.x = TRUE,
  allow.cartesian = TRUE
)
print(head(plot_df))

   Locusnumber   Gene         SNP CHR
1:           1 ADAM15 rs114138760   1
2:           1 ADAM15  rs35749011   1
3:           1 ADAM15  rs76763715   1
4:           1   ADAR rs114138760   1
5:           1   ADAR  rs35749011   1
6:           1   ADAR  rs76763715   1


In [8]:
# Attach locus and risk-variant information to every gene/transcript pair
# parsed from the tsv file names. all.x = TRUE keeps pairs whose gene is not
# present in plot_df; their locus columns are NA.
plot_df_transcript <- merge(
  x = file_df,
  y = plot_df,
  by.x = "gene",
  by.y = "Gene",
  all.x = TRUE,
  allow.cartesian = TRUE
)
print(dim(plot_df_transcript))
print(head(plot_df_transcript))
print(tail(plot_df_transcript))

[1] 952   5
    gene      transcript Locusnumber        SNP CHR
1  ABCC5 ENST00000443497          18 rs10513789   3
2  ACAD8 ENST00000524502          48  rs3802920  11
3  ACAD8 ENST00000526026          48  rs3802920  11
4  ACAD8 ENST00000531338          48  rs3802920  11
5  ACAD8 ENST00000534240          48  rs3802920  11
6 ACADVL ENST00000322910          66 rs12600861  17
      gene      transcript Locusnumber        SNP CHR
947 ZNF721 ENST00000338977          19 rs34311866   4
948 ZNF721 ENST00000338977          19   rs873786   4
949  ZNF84 ENST00000441040          52 rs11610045  12
950  ZNF84 ENST00000536123          52 rs11610045  12
951  ZNF84 ENST00000542358          52 rs11610045  12
952  ZNF84 ENST00000543758          52 rs11610045  12


In [9]:

# Every plot in this notebook is an isoQTL plot.
plot_df_transcript$feature <- "isoqtl"

# Split the rows by GWAS: loci named "prog1" / "prog2" belong to those GWAS,
# every other locus belongs to meta5. which() drops rows whose Locusnumber
# is NA, so such rows end up in none of the three subsets.
is_prog1 <- plot_df_transcript$Locusnumber == "prog1"
is_prog2 <- plot_df_transcript$Locusnumber == "prog2"

meta5_plots <- plot_df_transcript[which(!is_prog1 & !is_prog2), ]
meta5_plots$gwas <- "meta5"

prog1_plots <- plot_df_transcript[which(is_prog1), ]
prog1_plots$gwas <- "prog1"

prog2_plots <- plot_df_transcript[which(is_prog2), ]
prog2_plots$gwas <- "prog2"


In [10]:
# Stack the three per-GWAS subsets back together and sort by locus, gene, SNP.
plot_df_transcript <- rbind(meta5_plots, prog1_plots, prog2_plots)
sort_idx <- order(
  plot_df_transcript$Locusnumber,
  plot_df_transcript$gene,
  plot_df_transcript$SNP
)
plot_df_transcript <- plot_df_transcript[sort_idx, ]

# Placeholder values for the manually assigned lead SNP. They are filled in
# later because many plots likely have no data for the actual lead SNP held
# in the SNP column.
plot_df_transcript$forced_lead_variant <- "NA"
plot_df_transcript$forced_lead_variant_r2 <- 0
plot_df_transcript$reason <- "NA"

In [11]:
# Inspect the size and the first/last rows of the full plot list.
dim(plot_df_transcript)
head(plot_df_transcript)
tail(plot_df_transcript)

Unnamed: 0_level_0,gene,transcript,Locusnumber,SNP,CHR,feature,gwas,forced_lead_variant,forced_lead_variant_r2,reason
Unnamed: 0_level_1,<fct>,<fct>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<chr>
16,ADAR,ENST00000368471,1,rs114138760,1,isoqtl,meta5,,0,
19,ADAR,ENST00000463920,1,rs114138760,1,isoqtl,meta5,,0,
22,ADAR,ENST00000529168,1,rs114138760,1,isoqtl,meta5,,0,
17,ADAR,ENST00000368471,1,rs35749011,1,isoqtl,meta5,,0,
20,ADAR,ENST00000463920,1,rs35749011,1,isoqtl,meta5,,0,
23,ADAR,ENST00000529168,1,rs35749011,1,isoqtl,meta5,,0,


Unnamed: 0_level_0,gene,transcript,Locusnumber,SNP,CHR,feature,gwas,forced_lead_variant,forced_lead_variant_r2,reason
Unnamed: 0_level_1,<fct>,<fct>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<chr>
754,SHOC2,ENST00000480155,prog1,rs61863020,10,isoqtl,prog1,,0,
572,NIPSNAP3A,ENST00000374767,prog2,rs382940,9,isoqtl,prog2,,0,
573,NIPSNAP3A,ENST00000471001,prog2,rs382940,9,isoqtl,prog2,,0,
760,SLC44A1,ENST00000374720,prog2,rs382940,9,isoqtl,prog2,,0,
761,SLC44A1,ENST00000374724,prog2,rs382940,9,isoqtl,prog2,,0,
762,SLC44A1,ENST00000436716,prog2,rs382940,9,isoqtl,prog2,,0,


## (2) Filter the plot_df

`plots_with_risk_snp` will contain the plots that contain the risk snp in the data. So we should have no issue plotting these  
`plots_with_no_data` will contain plots that have no snps in their tsvs to plot. So we won't plot these  
`plots_to_check` will contain plots that don't have the risk snp but do have data. So we will need to find a proxy snp for these to determine if we can plot them or not


In [12]:
# Start all three result tables empty; the filtering loop appends to them.
plots_with_risk_snp <- plots_with_no_data <- plots_to_check <- data.frame()

do the filtering

In [13]:
# Sort every candidate plot into one of three tables based on its tsv data:
#   plots_with_risk_snp - the merged GWAS/isoQTL data contains the risk SNP
#   plots_to_check      - there is data but not the risk SNP (needs a proxy);
#                         gets an extra num_snps column
#   plots_with_no_data  - the tsv files are missing or have no rows
# The row is also written back to plot_df_transcript with its updated
# forced_lead_variant / forced_lead_variant_r2 / reason values.
#
# Rows are collected in a preallocated list and bound once after the loop, so
# re-running this cell rebuilds the three tables instead of appending to them.
n_plots <- nrow(plot_df_transcript)
checked_rows <- vector("list", n_plots)
row_group <- character(n_plots)

# seq_len() keeps the loop from running when there are no rows at all.
for (rownum in seq_len(n_plots)) {
  print(rownum)

  plot_row <- plot_df_transcript[rownum, ]
  snp <- plot_row$SNP
  gene <- plot_row$gene
  transcript <- plot_row$transcript
  gwas <- plot_row$gwas

  print(paste0("checking", snp, " ", gene, " ", gwas))

  # The tsvs live in a sub-directory named after the GWAS (meta5, prog1 or
  # prog2). Building the path from the value itself means a stale tsv_dir
  # from an earlier iteration can never be reused.
  tsv_dir <- paste0(isoqtl_tsv_dir, "/", gwas)
  gwas_fn <- paste0(tsv_dir, "/", gene, "_", transcript, "_gwas.tsv")
  qtl_fn <- paste0(tsv_dir, "/", gene, "_", transcript, "_isoqtl.tsv")

  # Default: missing files or empty tables both count as "no data".
  group <- "no_data"
  n_merged <- NA

  if (file.exists(gwas_fn) && file.exists(qtl_fn)) {
    print("files exist")
    g <- read.table(gwas_fn, header = TRUE)
    e <- read.table(qtl_fn, header = TRUE)

    if (nrow(g) != 0 && nrow(e) != 0) {
      # Keep only the variants present in both the GWAS and the isoQTL data.
      merged <- merge(g, e, by = "rsid")
      print("df:")
      print(head(merged))

      n_merged <- nrow(merged)
      if (snp %in% merged$rsid) {
        print("we have the risk snp!!!")
        group <- "risk"
      } else {
        group <- "check"
      }
    }
  }

  if (group == "risk") {
    # The risk SNP itself can be the lead variant of the plot.
    plot_row$forced_lead_variant <- snp
    plot_row$forced_lead_variant_r2 <- 1
    plot_row$reason <- "has risk variant"
  } else if (group == "no_data") {
    plot_row$reason <- "no data for plot"
  }

  # Write back before num_snps is added, because plot_df_transcript does not
  # have that column.
  plot_df_transcript[rownum, ] <- plot_row

  if (group == "check") {
    plot_row$num_snps <- n_merged
  }

  checked_rows[[rownum]] <- plot_row
  row_group[rownum] <- group
}

# Bind a list of single-row data frames; an empty list gives an empty data
# frame, matching the initial state of the three result tables.
bind_plot_rows <- function(rows) {
  if (length(rows) == 0) {
    data.frame()
  } else {
    do.call(rbind, rows)
  }
}

plots_with_risk_snp <- bind_plot_rows(checked_rows[row_group == "risk"])
plots_to_check <- bind_plot_rows(checked_rows[row_group == "check"])
plots_with_no_data <- bind_plot_rows(checked_rows[row_group == "no_data"])

[1] 1
[1] "checkingrs114138760 ADAR meta5"
[1] "files exist"
[1] "df:"
        rsid        var_id.x  pval.x        var_id.y      pval.y
1 rs11264228 1:154598437:A:G 0.02882 1:154598437:A:G 8.64327e-07
2 rs12117101 1:154578391:T:C 0.04316 1:154578391:T:C 3.18150e-07
3 rs12128435 1:154579634:C:G 0.04366 1:154579634:C:G 5.00353e-07
4 rs12141385 1:154565582:G:C 0.05439 1:154565582:G:C 1.32127e-07
5 rs34126801 1:154586012:C:T 0.03063 1:154586012:C:T 9.89123e-07
6 rs36121466 1:154589648:C:A 0.03225 1:154589648:C:A 1.08303e-06
[1] 2
[1] "checkingrs114138760 ADAR meta5"
[1] "files exist"
[1] "df:"
        rsid        var_id.x  pval.x        var_id.y      pval.y
1 rs11264228 1:154598437:A:G 0.02882 1:154598437:A:G 1.09999e-09
2 rs11264229 1:154602588:C:A 0.02792 1:154602588:C:A 1.83146e-09
3 rs12117101 1:154578391:T:C 0.04316 1:154578391:T:C 8.10022e-10
4 rs12128435 1:154579634:C:G 0.04366 1:154579634:C:G 9.75936e-10
5 rs12139887 1:154573139:A:C 0.70840 1:154573139:A:C 2.34981e-07
6 rs12141385 

In [1]:
# Inspect and save the plots whose data already contains the risk SNP.
dim(plots_with_risk_snp)
head(plots_with_risk_snp)

# write.csv() always writes the header row and ignores col.names with a
# warning, so that argument is dropped. The header is needed because the file
# is read back by column name later on.
write.csv(
  plots_with_risk_snp,
  file = "$PATH1/qtl/psychencode/isoqtl_plots_with_risk_snp.csv",
  row.names = FALSE
)

SyntaxError: keyword can't be an expression (<ipython-input-1-a1fd8b778055>, line 4)

In [2]:
# Inspect and save the plots that have no data to plot.
dim(plots_with_no_data)
head(plots_with_no_data)

# write.csv() always writes the header row and ignores col.names with a
# warning, so that argument is dropped.
write.csv(
  plots_with_no_data,
  file = "$PATH1/qtl/psychencode/isoqtl_plots_with_no_data.csv",
  row.names = FALSE
)

SyntaxError: keyword can't be an expression (<ipython-input-2-b0c93fd9b597>, line 4)

In [3]:
# Inspect and save the plots that have data but lack the risk SNP.
dim(plots_to_check)
head(plots_to_check)

# write.csv() always writes the header row and ignores col.names with a
# warning, so that argument is dropped. The header is needed because the
# proxy check reads this file back and accesses its columns by name.
write.csv(
  plots_to_check,
  file = "$PATH1/qtl/psychencode/isoqtl_plots_to_check.csv",
  row.names = FALSE
)

SyntaxError: keyword can't be an expression (<ipython-input-3-5257dadc66e4>, line 4)

## (3) Obtain Proxy SNPS Per Locus

Use LDlinkR to generate, for each risk variant, a list of SNPs with an LD R2 > 0.7.

Run the script below in a terminal. Queries must be made sequentially for LDlinkR to work, so do not bother changing the script to run in parallel.


In [17]:
# This cell only prints the command; the script itself is meant to be run in a terminal.
print("Rscript getRiskSNPProxies.R")

[1] "Rscript getRiskSNPProxies.R"


## (4) Check Plots for Proxy SNPs

Go through all the plots that need checking and see whether each one has a proxy SNP in its data.

Write the plots with a good proxy SNP to a file, then combine that file with the list of plots that already have the risk SNP to create the final plot list file.



In [18]:
# Directory holding the proxy SNP files; they are read below as "<risk SNP>_proxies.csv".
proxy_dir <- "$PATH1/qtl/proxy_snps"

In [19]:
# Reload the plots that need a proxy SNP from disk and start both result
# tables empty; the proxy check loop appends to them.
plots_to_check <- fread("$PATH1/qtl/psychencode/isoqtl_plots_to_check.csv")
plots_with_proxy_snp <- plots_with_no_proxy_snp <- data.frame()

In [20]:
# Number of plots (rows) that need a proxy SNP, and the number of columns read back.
print(dim(plots_to_check))

[1] 878  11


In [21]:
# For every plot that lacks the risk SNP, look for a proxy SNP in its data:
#   plots_with_proxy_snp    - at least one proxy of the risk SNP is present;
#                             the proxy with the highest R2 becomes the
#                             forced lead variant
#   plots_with_no_proxy_snp - the proxy file is empty, or none of the proxies
#                             is present in the merged GWAS/isoQTL data
#
# Rows are collected in a preallocated list and bound once after the loop, so
# re-running this cell rebuilds both tables instead of appending to them.
n_to_check <- nrow(plots_to_check)
checked_rows <- vector("list", n_to_check)
has_proxy <- logical(n_to_check)

# seq_len() keeps the loop from running when there is nothing to check.
for (rownum in seq_len(n_to_check)) {
  plot_row <- plots_to_check[rownum, ]
  snp <- plot_row$SNP
  gene <- plot_row$gene
  transcript <- plot_row$transcript
  gwas <- plot_row$gwas

  print(paste0("checking", snp, " ", gene, " ", gwas))

  # The tsvs live in a sub-directory named after the GWAS (meta5, prog1 or
  # prog2). Building the path from the value itself means a stale tsv_dir
  # from an earlier iteration can never be reused.
  tsv_dir <- paste0(isoqtl_tsv_dir, "/", gwas)
  gwas_fn <- paste0(tsv_dir, "/", gene, "_", transcript, "_gwas.tsv")
  qtl_fn <- paste0(tsv_dir, "/", gene, "_", transcript, "_isoqtl.tsv")

  g <- read.table(gwas_fn, header = TRUE)
  e <- read.table(qtl_fn, header = TRUE)

  # Keep only the variants present in both the GWAS and the isoQTL data.
  plot_data <- merge(g, e, by = "rsid")

  # Load the proxy SNPs of the risk variant.
  proxies <- fread(paste0(proxy_dir, "/", snp, "_proxies.csv"))

  if (nrow(proxies) > 0) {
    # Proxies that are actually available in the data we want to plot.
    merged <- merge(plot_data, proxies, by.x = "rsid", by.y = "RS_Number")

    if (nrow(merged) > 0) {
      best_ld_snp <- merged[which.max(merged$R2), ]
      # as.character() guards against rsid having been read as a factor.
      plot_row$forced_lead_variant <- as.character(best_ld_snp$rsid)
      plot_row$forced_lead_variant_r2 <- best_ld_snp$R2
      plot_row$reason <- "plot has a good proxy snp"
      has_proxy[rownum] <- TRUE
    } else {
      plot_row$reason <- "no good proxy snp exists in the data we want to plot"
    }
  } else {
    plot_row$reason <- "there was no proxy snp for the risk variant"
  }

  checked_rows[[rownum]] <- plot_row
}

# Bind a list of single-row tables into one data.table; an empty list gives
# an empty data frame, matching the initial state of both result tables.
bind_plot_rows <- function(rows) {
  if (length(rows) == 0) {
    data.frame()
  } else {
    rbindlist(rows, use.names = TRUE)
  }
}

plots_with_proxy_snp <- bind_plot_rows(checked_rows[has_proxy])
plots_with_no_proxy_snp <- bind_plot_rows(checked_rows[!has_proxy])

[1] "checkingrs114138760 ADAR meta5"
[1] "checkingrs114138760 ADAR meta5"
[1] "checkingrs114138760 ADAR meta5"
[1] "checkingrs35749011 ADAR meta5"
[1] "checkingrs35749011 ADAR meta5"
[1] "checkingrs35749011 ADAR meta5"
[1] "checkingrs76763715 ADAR meta5"
[1] "checkingrs76763715 ADAR meta5"
[1] "checkingrs76763715 ADAR meta5"
[1] "checkingrs114138760 ARHGEF2 meta5"
[1] "checkingrs35749011 ARHGEF2 meta5"
[1] "checkingrs76763715 ARHGEF2 meta5"
[1] "checkingrs114138760 C1orf43 meta5"
[1] "checkingrs114138760 C1orf43 meta5"
[1] "checkingrs114138760 C1orf43 meta5"
[1] "checkingrs114138760 C1orf43 meta5"
[1] "checkingrs35749011 C1orf43 meta5"
[1] "checkingrs35749011 C1orf43 meta5"
[1] "checkingrs35749011 C1orf43 meta5"
[1] "checkingrs35749011 C1orf43 meta5"
[1] "checkingrs76763715 C1orf43 meta5"
[1] "checkingrs76763715 C1orf43 meta5"
[1] "checkingrs76763715 C1orf43 meta5"
[1] "checkingrs76763715 C1orf43 meta5"
[1] "checkingrs114138760 DAP3 meta5"
[1] "checkingrs114138760 DAP3 meta5"
[1] "chec

In [4]:
# Inspect and save the plots for which a proxy SNP was found.
dim(plots_with_proxy_snp)
head(plots_with_proxy_snp)

# write.csv() always writes the header row and ignores col.names with a
# warning, so that argument is dropped.
write.csv(
  plots_with_proxy_snp,
  file = "$PATH1/qtl/psychencode/isoqtl_plots_with_proxy_snp.csv",
  row.names = FALSE
)

SyntaxError: keyword can't be an expression (<ipython-input-4-7903972fbf95>, line 4)

In [5]:
# Inspect and save the plots for which no usable proxy SNP was found.
dim(plots_with_no_proxy_snp)
head(plots_with_no_proxy_snp)

# write.csv() always writes the header row and ignores col.names with a
# warning, so that argument is dropped.
write.csv(
  plots_with_no_proxy_snp,
  file = "$PATH1/qtl/psychencode/isoqtl_plots_with_no_proxy_snp.csv",
  row.names = FALSE
)

SyntaxError: keyword can't be an expression (<ipython-input-5-856c209aa6e5>, line 4)

Make the final plot list file.

In [7]:
# Final plot list: plots that have the risk SNP plus plots that have a proxy.
# plots_with_proxy_snp is taken from the session, so the proxy check must
# have been run before this cell.
plots_with_risk_snp <- fread("$PATH1/qtl/psychencode/isoqtl_plots_with_risk_snp.csv")

# The proxy table carries a num_snps column; add it here (as NA) so the two
# tables have matching columns and can be stacked.
plots_with_risk_snp$num_snps <- NA

plots_to_plot <- rbind(plots_with_risk_snp, plots_with_proxy_snp)

# write.csv() always writes the header row and ignores col.names with a
# warning, so that argument is dropped.
write.csv(
  plots_to_plot,
  file = "$PATH1/qtl/psychencode/isoqtl_plots_to_plot.csv",
  row.names = FALSE
)

SyntaxError: invalid syntax (<ipython-input-7-3258b8dc225e>, line 2)

Finally, create a summary table for all the plots.

In [8]:
# Overview of every candidate plot: those that will be plotted, those with
# no usable proxy SNP, and those with no data at all.

# Drop num_snps so the columns match plots_with_no_data, which never had it.
# NOTE(review): selecting with a negated character vector is data.table
# syntax, so both tables are assumed to be data.tables here - confirm.
plots_to_plot <- plots_to_plot[, -c("num_snps")]
plots_with_no_proxy_snp <- plots_with_no_proxy_snp[, -c("num_snps")]

final_summary <- rbind(plots_to_plot, plots_with_no_proxy_snp, plots_with_no_data)
dim(final_summary)
head(final_summary)

# write.csv() always writes the header row and ignores col.names with a
# warning, so that argument is dropped.
write.csv(
  final_summary,
  file = "$PATH1/qtl/psychencode/isoqtl_plots_final_overview.csv",
  row.names = FALSE
)

SyntaxError: invalid syntax (<ipython-input-8-d9b18e444230>, line 1)