# Generate QTL Plot Info From Data
- **Author(s)** - Frank Grenn
- **Date Started** - May 2020
- **Quick Description:** Identify which genes we can create different qtl Locus Compare plots for. See if there is enough data to create a plot and if the plot has the risk variant in its data. Output a file per plot type (brain eqtl, blood eqtl, psychencode eqtl, psychencode isoqtl) describing which genes we can plot

The complete file at the end of this notebook can only be generated once TSVs exist for all genes across all loci and all GWASes. This may require multiple runs of `QTL_Generate_Data.ipynb` and `QTL_Generate_Data_Psychencode.ipynb` (one run for each GWAS summary stats file).

In [None]:
library(data.table)
library(dplyr)

In [None]:
# Root directory of the plot data TSVs created previously.
# The cells below look for the files in one subfolder per GWAS: <tsv_dir>/<GWAS>/...
tsv_dir <- "/path/to/AppDataProcessing/qtl/tsv"

## (1) Create a Dataframe Containing Information for all the Plots

we want a plot for each gene, each feature (blood, brain) on each locus


In [None]:
# Genes-by-locus table; it is merged with the risk variants on GWAS and LOC_NUM further down
evidence <- fread("/path/to/AppDataProcessing/genes_by_locus.csv")
head(evidence)

In [None]:
# Risk variant table; it is joined to the genes on GWAS and LOC_NUM further down
gwas_risk_variants <- fread("/path/to/AppDataProcessing/gwas_risk_variants.csv")
dim(gwas_risk_variants)
head(gwas_risk_variants)

In [None]:
# Pair every gene with every risk variant found on the same locus of the same GWAS.
# A locus with several risk variants yields several rows per gene
# (ex: locus 1 has three risk variants, so each gene on that locus gets three rows).
plot_df <- merge(
  x = evidence,
  y = gwas_risk_variants,
  by = c("GWAS", "LOC_NUM"),
  all.x = TRUE,
  allow.cartesian = TRUE
)

# Sort by GWAS, locus, gene and variant
plot_df <- plot_df[with(plot_df, order(GWAS, LOC_NUM, GENE, RSID)), ]

# Placeholder columns that the per-feature loops fill in later
plot_df$forced_lead_variant <- "NA"
plot_df$forced_lead_variant_r2 <- 0
plot_df$reason <- "NA"
plot_df$num_snps <- 0
plot_df$can_plot <- NA
plot_df$has_plot <- NA
plot_df$'has_plot' <- NA


In [None]:
# Sanity check: size of the merged table plus its first and last rows
dim(plot_df)
head(plot_df)
tail(plot_df)

## (2) Obtain Proxy SNPS Per Locus

Use LDlinkR to generate a list of SNPs with an LD R2 > 0.7 for each risk variant.

Run this in a terminal. Queries must be made sequentially for LDlinkR to work (so don't bother changing the script to run in parallel).


In [None]:
# Reminder only: this prints the command, it does not execute it
print("Rscript getRiskSNPProxies.R")

##### Setup function to lookup proxies

In [None]:
# Directory holding one "<RSID>_proxies.csv" file per risk variant
proxy_dir <- "/path/to/AppDataProcessing/qtl/proxy_snps"

# Pick a proxy SNP for a risk variant that is absent from the plot data.
#
# Arguments:
#   row - single-row table describing one candidate plot; its RSID is the risk
#         variant. The columns forced_lead_variant, forced_lead_variant_r2,
#         reason and can_plot are updated.
#   df  - plot data (GWAS and QTL tables merged on RSID) for that candidate plot.
#
# Returns `row` with `reason` and `can_plot` set. When at least one proxy is
# present in `df`, the proxy with the highest R2 is stored in
# `forced_lead_variant` / `forced_lead_variant_r2`.
#
# Reads the global `proxy_dir`. A missing proxy file is reported with a warning
# and handled like a variant without proxies, so one absent file does not abort
# a whole run over all plots.
check_for_proxy <- function(row, df) {
  proxy_fn <- paste0(proxy_dir, "/", row$RSID, "_proxies.csv")

  # load the proxy snps, if a proxy file was produced for this variant
  if (file.exists(proxy_fn)) {
    proxies <- fread(proxy_fn)
  } else {
    warning("proxy file not found: ", proxy_fn, call. = FALSE)
    proxies <- NULL
  }

  # no proxy snps at all for the risk variant
  if (is.null(proxies) || nrow(proxies) == 0) {
    row$'reason' <- "risk variant has no proxies"
    row$can_plot <- TRUE # FALSE
    return(row)
  }

  # keep only the proxies that are present in the plot data
  merged <- merge(df, proxies, by.x = "RSID", by.y = "RS_Number")

  if (nrow(merged) == 0) {
    row$'reason' <- "risk variants proxies not in plot data"
    row$can_plot <- TRUE # FALSE
    return(row)
  }

  # the proxy in strongest LD with the risk variant becomes the lead variant
  best_ld_snp <- merged[which.max(merged$R2), ]
  row$'forced_lead_variant' <- best_ld_snp$RSID
  row$'forced_lead_variant_r2' <- best_ld_snp$R2
  row$'reason' <- "plot has a good proxy snp"
  row$can_plot <- TRUE
  row
}

## (3) Process the eQTL Brain data



In [None]:
# Start the brain eQTL table from plot_df; the feature label is used to build the TSV file names
brain_df <- plot_df
brain_df$feature <- 'brain'


In [None]:
# Inspect the brain table before it is filled in
dim(brain_df)
head(brain_df)


In [None]:
# Decide, for every gene / risk variant row, whether a brain eQTL plot can be drawn.
# Every row gets `reason` and `can_plot`; rows whose GWAS and QTL data share SNPs
# also get `num_snps`, and `forced_lead_variant` is set when the risk variant
# (or a proxy, via check_for_proxy) is present in the shared data.
# seq_len() keeps the loop from running when the table is empty
# (1:nrow() would iterate over 1 and 0).
for (rownum in seq_len(nrow(brain_df))) {
  row <- brain_df[rownum, ]

  # the plot data lives in <tsv_dir>/<GWAS>/<GENE>_<feature>_gwas.tsv and ..._eqtl.tsv
  file_prefix <- paste0(tsv_dir, '/', row$GWAS, '/', row$GENE, '_', row$feature)
  gwas_fn <- paste0(file_prefix, '_gwas.tsv')
  eqtl_fn <- paste0(file_prefix, '_eqtl.tsv')

  if (!(file.exists(gwas_fn) && file.exists(eqtl_fn))) {
    # the files don't even exist
    row$reason <- "gwas and/or qtl data empty"
    row$can_plot <- FALSE
  } else {
    g <- read.table(gwas_fn, header = TRUE)
    e <- read.table(eqtl_fn, header = TRUE)

    if (nrow(g) == 0 || nrow(e) == 0) {
      # at least one of the files has no data rows
      row$reason <- "gwas and/or qtl data empty"
      row$can_plot <- FALSE
    } else {
      # only SNPs present in both tables can be plotted
      df <- merge(g, e, by = 'RSID')

      if (nrow(df) == 0) {
        row$reason <- "no common snps between qtl and gwas data"
        row$can_plot <- FALSE
      } else {
        row$num_snps <- nrow(df)

        if (row$RSID %in% df$RSID) {
          # the risk variant itself is in the data, so it is the lead variant
          row$forced_lead_variant <- row$RSID
          row$forced_lead_variant_r2 <- 1
          row$reason <- "has risk variant"
          row$can_plot <- TRUE
        } else {
          # otherwise look for an available proxy snp
          row <- check_for_proxy(row, df)
        }
      }
    }
  }

  brain_df[rownum, ] <- row
}

In [None]:
dim(brain_df)
head(brain_df)

# write.csv() always writes the header row; passing col.names is ignored with a
# warning, so the argument is left out
write.csv(brain_df, file = "/path/to/AppDataProcessing/qtl/brain_qtl_info.csv", row.names = FALSE)

## (4) Process the eQTL Blood data



In [None]:
# Start the blood eQTL table from plot_df; the feature label is used to build the TSV file names
blood_df <- plot_df
blood_df$feature <- 'blood'

# Inspect the blood table before it is filled in
(dim(blood_df))
(head(blood_df))

In [None]:
# Decide, for every gene / risk variant row, whether a blood eQTL plot can be drawn.
# Every row gets `reason` and `can_plot`; rows whose GWAS and QTL data share SNPs
# also get `num_snps`, and `forced_lead_variant` is set when the risk variant
# (or a proxy, via check_for_proxy) is present in the shared data.
# seq_len() keeps the loop from running when the table is empty
# (1:nrow() would iterate over 1 and 0).
for (rownum in seq_len(nrow(blood_df))) {
  row <- blood_df[rownum, ]

  # the plot data lives in <tsv_dir>/<GWAS>/<GENE>_<feature>_gwas.tsv and ..._eqtl.tsv
  file_prefix <- paste0(tsv_dir, '/', row$GWAS, '/', row$GENE, '_', row$feature)
  gwas_fn <- paste0(file_prefix, '_gwas.tsv')
  eqtl_fn <- paste0(file_prefix, '_eqtl.tsv')

  if (!(file.exists(gwas_fn) && file.exists(eqtl_fn))) {
    # the files don't even exist
    row$reason <- "gwas and/or qtl data empty"
    row$can_plot <- FALSE
  } else {
    g <- read.table(gwas_fn, header = TRUE)
    e <- read.table(eqtl_fn, header = TRUE)

    if (nrow(g) == 0 || nrow(e) == 0) {
      # at least one of the files has no data rows
      row$reason <- "gwas and/or qtl data empty"
      row$can_plot <- FALSE
    } else {
      # only SNPs present in both tables can be plotted
      df <- merge(g, e, by = 'RSID')

      if (nrow(df) == 0) {
        row$reason <- "no common snps between qtl and gwas data"
        row$can_plot <- FALSE
      } else {
        row$num_snps <- nrow(df)

        if (row$RSID %in% df$RSID) {
          # the risk variant itself is in the data, so it is the lead variant
          row$forced_lead_variant <- row$RSID
          row$forced_lead_variant_r2 <- 1
          row$reason <- "has risk variant"
          row$can_plot <- TRUE
        } else {
          # otherwise look for an available proxy snp
          row <- check_for_proxy(row, df)
        }
      }
    }
  }

  blood_df[rownum, ] <- row
}

In [None]:
dim(blood_df)
head(blood_df)

# write.csv() always writes the header row; passing col.names is ignored with a
# warning, so the argument is left out
write.csv(blood_df, file = "/path/to/AppDataProcessing/qtl/blood_qtl_info.csv", row.names = FALSE)

## (5) Process the PsychENCODE eQTL data



In [None]:
# Start the PsychENCODE eQTL table from plot_df; the feature label is used to build the TSV file names
pe_eqtl_df <- plot_df
pe_eqtl_df$feature <- 'e_pe'

# Inspect the table before it is filled in
(dim(pe_eqtl_df))
(head(pe_eqtl_df))

In [None]:
# Decide, for every gene / risk variant row, whether a PsychENCODE eQTL plot can be drawn.
# Every row gets `reason` and `can_plot`; rows whose GWAS and QTL data share SNPs
# also get `num_snps`, and `forced_lead_variant` is set when the risk variant
# (or a proxy, via check_for_proxy) is present in the shared data.
# seq_len() keeps the loop from running when the table is empty
# (1:nrow() would iterate over 1 and 0).
for (rownum in seq_len(nrow(pe_eqtl_df))) {
  row <- pe_eqtl_df[rownum, ]

  # the plot data lives in <tsv_dir>/<GWAS>/<GENE>_<feature>_gwas.tsv and ..._eqtl.tsv
  file_prefix <- paste0(tsv_dir, '/', row$GWAS, '/', row$GENE, '_', row$feature)
  gwas_fn <- paste0(file_prefix, '_gwas.tsv')
  eqtl_fn <- paste0(file_prefix, '_eqtl.tsv')

  if (!(file.exists(gwas_fn) && file.exists(eqtl_fn))) {
    # the files don't even exist
    row$reason <- "gwas and/or qtl data empty"
    row$can_plot <- FALSE
  } else {
    g <- read.table(gwas_fn, header = TRUE)
    e <- read.table(eqtl_fn, header = TRUE)

    if (nrow(g) == 0 || nrow(e) == 0) {
      # at least one of the files has no data rows
      row$reason <- "gwas and/or qtl data empty"
      row$can_plot <- FALSE
    } else {
      # only SNPs present in both tables can be plotted
      df <- merge(g, e, by = 'RSID')

      if (nrow(df) == 0) {
        row$reason <- "no common snps between qtl and gwas data"
        row$can_plot <- FALSE
      } else {
        row$num_snps <- nrow(df)

        if (row$RSID %in% df$RSID) {
          # the risk variant itself is in the data, so it is the lead variant
          row$forced_lead_variant <- row$RSID
          row$forced_lead_variant_r2 <- 1
          row$reason <- "has risk variant"
          row$can_plot <- TRUE
        } else {
          # otherwise look for an available proxy snp
          row <- check_for_proxy(row, df)
        }
      }
    }
  }

  pe_eqtl_df[rownum, ] <- row
}

In [None]:
dim(pe_eqtl_df)
head(pe_eqtl_df)

# write.csv() always writes the header row; passing col.names is ignored with a
# warning, so the argument is left out
write.csv(pe_eqtl_df, file = "/path/to/AppDataProcessing/qtl/pe_eqtl_info.csv", row.names = FALSE)

In [None]:
# Keep only the PsychENCODE eQTL rows flagged as plottable (rows with an NA flag are dropped)
has_pe_eqtl <- pe_eqtl_df[pe_eqtl_df$can_plot %in% TRUE, ]
dim(has_pe_eqtl)
head(has_pe_eqtl)

## (6) Process the PsychENCODE isoQTL data
handle this differently because one gene may have multiple transcripts


In [None]:
# Start the PsychENCODE isoQTL table from plot_df; the feature label is used to build the TSV file names
pe_isoqtl_df <- plot_df
pe_isoqtl_df$feature <- 'i_pe'

# Inspect the table before the transcripts are attached
(dim(pe_isoqtl_df))
(head(pe_isoqtl_df))

In [None]:
# Show the TSV root directory that is scanned for isoQTL files below
tsv_dir

In [None]:
# Every entry directly under tsv_dir is treated as a GWAS folder
gwas_folders <- list.files(tsv_dir)
gwas_folders

In [None]:

# Build the table of (gene, transcript) pairs that have isoQTL data.
# isoQTL files are named <gene>_<transcript>_..., so the first two "_"-separated
# fields of each file name are the gene and the transcript.
file_df <- NULL
for (gwas in gwas_folders) {
  gwas_files <- list.files(paste0(tsv_dir, "/", gwas))

  # only the isoQTL files carry a transcript in their name
  iso_files <- gwas_files[grepl("isoqtl", gwas_files, fixed = TRUE)]

  if (length(iso_files) > 0) {
    name_parts <- strsplit(iso_files, split = "_", fixed = TRUE)
    # one rbind per folder instead of one per file
    file_df <- rbind(
      file_df,
      data.frame(
        "gene" = vapply(name_parts, function(p) p[1], character(1)),
        "transcript" = vapply(name_parts, function(p) p[2], character(1))
      )
    )
  }
}

# The same gene/transcript pair can occur in several GWAS folders. file_df has no
# GWAS column and is later merged on gene only, so repeated pairs would duplicate
# every plot row of that gene.
if (!is.null(file_df)) {
  file_df <- unique(file_df)
}

print(dim(file_df))
print(head(file_df))

In [None]:
# Attach every isoQTL (gene, transcript) pair to the plot rows of that gene.
# all.x = TRUE keeps pairs whose gene has no row in pe_isoqtl_df; their plot columns are NA.
pe_isoqtl_only_df <- merge(x = file_df, y = pe_isoqtl_df, by.x = "gene", by.y = "GENE", all.x = TRUE, allow.cartesian = TRUE)
print(dim(pe_isoqtl_only_df))
print(head(pe_isoqtl_only_df))

In [None]:
# Switch the two columns that came from file_df to the upper-case names used by the other tables
names(pe_isoqtl_only_df)[names(pe_isoqtl_only_df) == 'gene'] <- 'GENE'
names(pe_isoqtl_only_df)[names(pe_isoqtl_only_df) == 'transcript'] <- 'TRANSCRIPT'
print(names(pe_isoqtl_only_df))

In [None]:
# Decide, for every gene / transcript / risk variant row, whether a PsychENCODE
# isoQTL plot can be drawn, and collect the processed rows in pe_isoqtl_df.
#
# The rows are gathered in a pre-sized list and bound once at the end. Testing
# `is.na()` on a growing data frame yields a multi-element condition, which `if`
# rejects in current R versions, and rbind() per iteration copies the table each time.
iso_rows <- vector("list", nrow(pe_isoqtl_only_df))
for (rownum in seq_len(nrow(pe_isoqtl_only_df))) {
  row <- pe_isoqtl_only_df[rownum, ]

  # the plot data lives in <tsv_dir>/<GWAS>/<GENE>_<TRANSCRIPT>_<feature>_gwas.tsv and ..._isoqtl.tsv
  file_prefix <- paste0(tsv_dir, '/', row$GWAS, '/', row$GENE, '_', row$TRANSCRIPT, '_', row$feature)
  gwas_fn <- paste0(file_prefix, '_gwas.tsv')
  eqtl_fn <- paste0(file_prefix, '_isoqtl.tsv')

  if (!(file.exists(gwas_fn) && file.exists(eqtl_fn))) {
    # the files don't even exist
    row$reason <- "gwas and/or qtl data empty"
    row$can_plot <- FALSE
  } else {
    g <- read.table(gwas_fn, header = TRUE)
    e <- read.table(eqtl_fn, header = TRUE)

    if (nrow(g) == 0 || nrow(e) == 0) {
      # at least one of the files has no data rows
      row$reason <- "gwas and/or qtl data empty"
      row$can_plot <- FALSE
    } else {
      # only SNPs present in both tables can be plotted
      df <- merge(g, e, by = 'RSID')

      if (nrow(df) == 0) {
        row$reason <- "no common snps between qtl and gwas data"
        row$can_plot <- FALSE
      } else {
        row$num_snps <- nrow(df)

        if (row$RSID %in% df$RSID) {
          # the risk variant itself is in the data, so it is the lead variant
          row$forced_lead_variant <- row$RSID
          row$forced_lead_variant_r2 <- 1
          row$reason <- "has risk variant"
          row$can_plot <- TRUE
        } else {
          # otherwise look for an available proxy snp
          row <- check_for_proxy(row, df)
        }
      }
    }
  }

  # show the rows that received a lead variant; isTRUE() guards against an NA
  # value (row without a match in the plot table), which `if` cannot evaluate
  if (isTRUE(row$forced_lead_variant != 'NA')) {
    print(row)
  }

  iso_rows[[rownum]] <- row
}

# an empty input keeps its (empty) table structure
pe_isoqtl_df <- if (length(iso_rows) > 0) do.call(rbind, iso_rows) else pe_isoqtl_only_df

In [None]:
dim(pe_isoqtl_df)
head(pe_isoqtl_df)

# write.csv() always writes the header row; passing col.names is ignored with a
# warning, so the argument is left out
write.csv(pe_isoqtl_df, file = "/path/to/AppDataProcessing/qtl/pe_isoqtl_info.csv", row.names = FALSE)

In [None]:
# Rows that were assigned a lead variant; "NA" is the string placeholder set when plot_df was built
has_isoqtl <- (pe_isoqtl_df[which(pe_isoqtl_df$forced_lead_variant!='NA'),])
dim(has_isoqtl)
head(has_isoqtl)

## (7) Combine all the `_info.csv` files into one

In [None]:
# Compare the dimensions of the four per-feature tables before combining them
dim(pe_isoqtl_df)
dim(pe_eqtl_df)
dim(brain_df)
dim(blood_df)

In [None]:
# Only the isoQTL table has a TRANSCRIPT column; add an empty one to the other three tables
pe_eqtl_df$TRANSCRIPT <- NA
brain_df$TRANSCRIPT <- NA
blood_df$TRANSCRIPT <- NA

In [None]:
# Stack the four per-feature tables into a single table
all_plot_info <- rbind(pe_isoqtl_df,pe_eqtl_df,brain_df,blood_df)
dim(all_plot_info)
head(all_plot_info)

In [None]:
# write.csv() always writes the header row; passing col.names is ignored with a
# warning, so the argument is left out
write.csv(all_plot_info, file = "/path/to/AppDataProcessing/qtl/all_qtl_info.csv", row.names = FALSE)

## (8) Archive the tsvs to copy locally

In [None]:
# Reminder only: this prints the archive command, it does not execute it
print("tar -zcvf tsv.tar.gz tsv/")

## (9) Next Steps

next we want to create the plots

so copy the `tsv` folder, and the `all_qtl_info.csv` file locally

we need to run the plot generating code locally because `locuscompareR` uses code to query a sql database for LD values and that doesn't work very well on biowulf

run the plot generation script with the `nohup` option to run in background locally