# GWAS Locus Browser QTL and GWAS Gene Data for (1) all Genes or (2) one Gene
- **Author(s)** - Frank Grenn and Hirotaka Iwaki
- **Date Started** - October 2019
- **Quick Description:** collect eQTL and GWAS data for genes
- **Data:** [brain eQTL](https://cnsgenomics.com/software/smr/#DataResource), [blood eQTL](https://www.eqtlgen.org/cis-eqtls.html)

In [1]:
library(data.table)
library(dplyr)


Attaching package: 'dplyr'


The following objects are masked from 'package:data.table':

    between, first, last


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




## (1) All Genes

* #### (a) get the genes we want data for
* #### (b) get the gwas data
* #### (c) get the eqtl data
* #### (d) generate blood eqtl and gwas tsvs for each gene
* #### (e) generate brain eqtl tsvs for each gene using a swarm job
* #### (f) generate brain GWAS TSVs for each gene
* #### (g) next steps

### (a) get the genes we want data for
from the `genes_by_locus.csv`

In [2]:
# Read the gene-by-locus table; its Gene column is the list of genes that the
# rest of section (1) generates QTL/GWAS files for.
evidence <- fread("$PATH1/genes_by_locus.csv")
evidence_genes <- evidence$Gene

# Notebook display: number of genes, then a preview of the first few.
length(evidence_genes)
head(evidence_genes)


### (b) get the gwas data for (i) meta5 or (ii) progression

#### (i) meta5

In [3]:
# META5 summary statistics.
gwas_in <- '$PATH2/resultsForSmr_filtered.tab.gz'
gwas <- fread(gwas_in)

# Notebook display: table dimensions, then the first rows.
dim(gwas)
head(gwas)



SNP,A1,A2,freq,b,se,p,N
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
rs7899632,A,G,0.5665,0.011,0.0095,0.2476,1474097
rs61875309,A,C,0.7953,-0.0091,0.0116,0.4295,1474097
rs150203744,T,C,0.014,-0.0152,0.0649,0.8147,1351069
rs111551711,T,C,0.9868,0.0347,0.0742,0.6396,777210
rs12258651,T,G,0.8819,-0.0011,0.0149,0.9423,1474097
rs72828461,A,G,0.9605,-0.0018,0.0325,0.9569,1365107


#### (ii) progression

Read in the progression GWAS data. Each progression locus has its own summary-statistics file, so `gwas_in` and the `Locusnumber` filter in the cell below must be edited to match the locus being processed.

In [33]:
# Progression loci: every locus has its own summary-statistics file, so keep
# exactly one (gwas_in, evidence_genes) pair below active at a time.

## rs382940, 9:108058562, prog2
gwas_in <- '$PATH1/locuszoom/surv_HY3.txt'
evidence_genes <- evidence[which(evidence$Locusnumber == "prog2"), ]$Gene

## rs61863020, 10:112956055, prog1
#gwas_in <- '$PATH1/locuszoom/base_INS.txt'
#evidence_genes <- evidence[which(evidence$Locusnumber == "prog1"), ]$Gene

print(length(evidence_genes))
print(evidence_genes)

gwas <- fread(gwas_in)

# Harmonise the headers to the names the later cells filter and select on
# (SNP and p). The two renames are checked independently: previously the `ID`
# branch also ran mutate(p = P), which errors on a file that has `ID` but no
# `P` column, and the `P` branch then repeated the same rename.
if ('ID' %in% names(gwas)) {
  gwas <- gwas %>% mutate(SNP = ID)
}

if ('P' %in% names(gwas)) {
  gwas <- gwas %>% mutate(p = P)
}

# Notebook display: first rows, then table dimensions.
head(gwas)
dim(gwas)

[1] 17
 [1] "ABCA1"      "FKTN"       "FSD1L"      "NIPSNAP3A"  "NIPSNAP3B" 
 [6] "OR13C2"     "OR13C3"     "OR13C4"     "OR13C5"     "OR13C8"    
[11] "OR13C9"     "OR13D1"     "OR13F1"     "RALGAPA1P1" "SLC44A1"   
[16] "TAL2"       "TMEM38B"   


Unnamed: 0_level_0,SNP,BETA,SE,P,N,NSTUDY,Isq,p
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>
1,5:29439275,-0.0322,0.0657,0.6241,2582,9,14.3,0.6241
2,5:85928892,0.2634,0.1526,0.08427,1299,5,0.0,0.08427
3,2:170966953,0.4025,0.287,0.1608,2265,8,0.0,0.1608
4,10:128341232,-0.1408,0.0783,0.07199,1299,5,46.5,0.07199
5,3:62707519,-0.1344,0.1723,0.4355,1299,5,0.0,0.4355
6,2:80464120,0.2888,0.2789,0.3004,1299,5,0.0,0.3004


Merge the progression summary stats with `reference.txt` to replace the `chr:pos` SNP identifiers with rsIDs.

In [34]:
# SNP reference table; merged with the GWAS table on its SNP column in the
# next cell.
reference <- fread("$PATH1/locuszoom/reference.txt")

# Notebook display: table dimensions, then the first rows.
dim(reference)
head(reference)


SNP,RSID,CHR,START,REF,ALT,MAF,FUNC,NearGENE
<chr>,<chr>,<int>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<chr>
1:14470,,1,14470,G,A,0.0263,ncRNA_exonic,WASH7P
1:14671,rs201055865,1,14671,G,C,0.0156,ncRNA_exonic,WASH7P
1:14773,rs878915777,1,14773,C,T,0.0178,ncRNA_exonic,WASH7P
1:16841,rs62636368,1,16841,G,T,0.0725,ncRNA_intronic,WASH7P
1:16856,rs3891260,1,16856,A,G,0.0199,ncRNA_splicing,WASH7P
1:17147,rs867691030,1,17147,G,A,0.0448,ncRNA_intronic,WASH7P


In [35]:
# Left join on SNP: every GWAS row is kept whether or not the reference table
# has a matching entry.
gwas <- merge(gwas, reference, by = "SNP", all.x = TRUE)

# Drop the upper-case P column and the original SNP identifier, then let the
# reference's RSID column take over the SNP name.
gwas$P <- NULL
gwas$SNP <- NULL
names(gwas)[names(gwas) == "RSID"] <- "SNP"

# Notebook display: first rows, then table dimensions.
head(gwas)
dim(gwas)

Unnamed: 0_level_0,BETA,SE,N,NSTUDY,Isq,p,SNP,CHR,START,REF,ALT,MAF,FUNC,NearGENE
Unnamed: 0_level_1,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>,<int>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<chr>
1,-0.1558,0.1759,363,1,0.0,0.3759,rs6602381,10,10000018,A,G,0.4459,intergenic,LOC101928272;LOC101928298
2,0.0032,0.0655,2582,9,13.2,0.9606,rs7899632,10,100000625,A,G,0.4289,intronic,R3HCC1L
3,0.015,0.0765,2582,9,0.0,0.8447,rs61875309,10,100000645,A,C,0.1999,intronic,R3HCC1L
4,0.2703,0.3834,936,4,26.6,0.4809,rs150203744,10,100001867,C,T,0.0163,intronic,R3HCC1L
5,0.8956,0.3042,1352,5,15.5,0.003241,rs111551711,10,100002464,T,C,0.0166,intronic,R3HCC1L
6,0.0426,0.1108,2582,9,14.1,0.7008,rs12258651,10,100003242,T,G,0.1278,intronic,R3HCC1L


### (c) get the eqtl data

In [36]:
# Blood cis-eQTL table (significant associations file).
eqtl <- fread('$PATH3/cis-eQTL_significant_20181017.txt.gz')

# Notebook display: table dimensions, then the first rows.
dim(eqtl)
head(eqtl)


Pvalue,SNP,SNPChr,SNPPos,AssessedAllele,OtherAllele,Zscore,Gene,GeneSymbol,GeneChr,GenePos,NrCohorts,NrSamples,FDR
<dbl>,<chr>,<int>,<int>,<chr>,<chr>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<int>,<dbl>
2.1391e-15,rs1010944,20,49525434,G,A,-7.933,ENSG00000000419,DPM1,20,49563248,33,30396,0.0
1.9943e-06,rs1062651,20,49508683,A,G,-4.7539,ENSG00000000419,DPM1,20,49563248,29,28403,0.00584088
1.054e-08,rs12479470,20,49514589,C,T,-5.7216,ENSG00000000419,DPM1,20,49563248,35,30895,5.784433e-05
8.9327e-10,rs12479721,20,49529785,T,C,-6.1274,ENSG00000000419,DPM1,20,49563248,23,20044,1.314752e-05
1.4714e-14,rs12479950,20,49512803,T,C,-7.6899,ENSG00000000419,DPM1,20,49563248,31,29729,0.0
4.5549e-12,rs12480447,20,49496178,G,C,-6.9187,ENSG00000000419,DPM1,20,49563248,34,30772,0.0


In [10]:
# Pull the SNP column out of the GWAS loci overview table.
loci_overview <- fread("$PATH1/GWAS_loci_overview.csv")
rsids <- loci_overview$SNP

# Notebook display: preview of the first SNPs.
head(rsids)

### (d) generate blood eqtl and gwas tsvs for each gene

In [37]:
# For every gene of interest, write two two-column (rsid, pval) TSVs:
#   <gene>_blood_eqtl.tsv - blood eQTL p-values for the gene
#   <gene>_blood_gwas.tsv - GWAS p-values for the same SNPs
# Uses `eqtl`, `gwas` and `evidence_genes` from the earlier cells.
proj <- '$PATH1/qtl/'
gwasoutfold <- paste0(proj, 'tsv/')
print(length(evidence_genes))

for (gene in evidence_genes) {
  # Both files go to the same absolute folder. The eQTL file used to be
  # written to a 'tsv/' path relative to the working directory, so the two
  # outputs only ended up together when the notebook ran from `proj`.
  eqtl_fn <- paste0(gwasoutfold, gene, '_blood_eqtl.tsv')
  gwasout_fn <- paste0(gwasoutfold, gene, '_blood_gwas.tsv')

  # Blood eQTL hits for this gene symbol.
  eqtl_gene <- eqtl %>%
    filter(GeneSymbol == gene) %>%
    mutate(rsid = SNP, pval = Pvalue) %>%
    select(rsid, pval)
  fwrite(eqtl_gene, eqtl_fn, sep = '\t')

  # GWAS p-values restricted to the SNPs that appear in the eQTL subset.
  gwas %>%
    filter(SNP %in% eqtl_gene$rsid) %>%
    mutate(rsid = SNP, pval = p) %>%
    select(rsid, pval) %>%
    fwrite(gwasout_fn, sep = '\t')
}
 

[1] 17


### (e) generate brain eQTL TSVs for each gene using a swarm job

generate the swarm file

In [38]:


# Build one smr query command per gene and write them to the swarm file.
# The file is written in a single call that replaces any previous content:
# appending line by line meant that re-running this cell (or running it for a
# second gene set) duplicated commands and kept stale ones in the file.
swarm_file <- "script/generate_brain_eqtl.swarm"

# vapply returns character(0) for an empty gene list, so no command is
# emitted when there are no genes.
swarm_lines <- vapply(
  evidence_genes,
  function(gene) {
    paste0('$PATH4/smr_Linux',
           ' --beqtl-summary $PATH5/Brain-eMeta',
           ' --thread-num 1',
           ' --query 5.0e-2',
           ' --gene ', gene,
           ' --cis-wind 2000',
           ' --out $PATH1/qtl/tsv/', gene, '_brain_eqtl')
  },
  character(1),
  USE.NAMES = FALSE
)

writeLines(swarm_lines, swarm_file)
 

print the command to run the swarm file

In [39]:
# Show the command used to submit the swarm file.
# NOTE(review): the generating cell writes the file under script/; this
# command has no such prefix, so it presumably has to be run from inside that
# folder - confirm.
swarm_cmd <- "swarm -f generate_brain_eqtl.swarm -g 10 --partition quick --time 00:02:00"
print(swarm_cmd)

[1] "swarm -f generate_brain_eqtl.swarm -g 10 --partition quick --time 00:02:00"


### (f) generate brain GWAS TSVs for each gene

In [40]:
# For every gene of interest, convert the smr query output
# (<gene>_brain_eqtl.txt) into two two-column (rsid, pval) TSVs:
#   <gene>_brain_eqtl.tsv - brain eQTL p-values
#   <gene>_brain_gwas.tsv - GWAS p-values for the same SNPs
# Uses `gwasoutfold`, `gwas` and `evidence_genes` from the earlier cells.
for (gene in evidence_genes) {
  # Read and write in `gwasoutfold`, the folder the swarm commands' --out
  # prefix points to, rather than a 'tsv/' path relative to the working
  # directory.
  smr_out_fn <- paste0(gwasoutfold, gene, '_brain_eqtl.txt')
  eqtl_fn <- paste0(gwasoutfold, gene, '_brain_eqtl.tsv')
  gwasout_fn <- paste0(gwasoutfold, gene, '_brain_gwas.tsv')

  # Genes without a query result are skipped, but say so instead of silently
  # producing nothing.
  if (!file.exists(smr_out_fn)) {
    message('no brain eQTL query output for ', gene, ', skipping')
    next
  }

  qtls <- fread(smr_out_fn)

  # Count rows per (Chr, Gene), sort the most frequent pairs first, and keep
  # the first row seen for each Chr.
  # NOTE(review): distinct(Chr) keeps one row per chromosome, not per gene.
  # If the intent was "the chromosome with the most hits for each gene", this
  # should probably be distinct(Gene, ...) - confirm before changing.
  cispos <- qtls %>%
    group_by(Chr, Gene) %>%
    mutate(n = n()) %>%
    arrange(desc(n)) %>%
    ungroup() %>%
    distinct(Chr, .keep_all = TRUE) %>%
    rename(GeneChr = Chr) %>%
    select(Gene, GeneChr)

  # Keep rows whose chromosome matches the joined GeneChr and whose SNP id
  # contains 'rs'.
  qtls_cis <- inner_join(qtls, cispos, by = 'Gene') %>%
    filter(Chr == GeneChr) %>%
    data.frame() %>%
    filter(grepl('rs', SNP)) %>%
    mutate(rsid = SNP, pval = p) %>%
    select(rsid, pval)
  fwrite(qtls_cis, eqtl_fn, sep = '\t')

  # GWAS p-values restricted to the SNPs kept above.
  gwas %>%
    filter(SNP %in% qtls_cis$rsid) %>%
    mutate(rsid = SNP, pval = p) %>%
    select(rsid, pval) %>%
    fwrite(gwasout_fn, sep = '\t')
}

### (g) next steps

next we need to see which genes we can create plots for. Now start running `QTL_Proxy_SNPS.ipynb`

## (2) For One Gene  (WIP/need to test again)
useful with genes that have alternate names (looking at you _RAB7L1/RAB29_!)

* #### (a) specify the data gene names
* #### (b) get the gwas data
* #### (c) get the eqtl data
* #### (d) generate blood eqtl and gwas tsvs for each gene
* #### (e) generate brain eqtl tsvs for each gene using a swarm job
* #### (f) generate brain GWAS TSVs for each gene
* #### (g) next steps

### (a) specify the data gene names

In [23]:
# The same gene can go by different symbols in different resources.

# Symbol used for the gene in the eQTL data.
eqtl_gene_name <- "RAB7L1"

# Symbol used for the gene in the GWAS data.
gwas_gene_name <- "RAB29"

# Symbol to use in the names of the generated files.
final_gene_file_name <- "RAB29"

### (b) get the gwas data

In [24]:
# META5 summary statistics.
gwas_in <- '$PATH2/resultsForSmr_filtered.tab.gz'
gwas <- fread(gwas_in)

# Notebook display: table dimensions, then the first rows.
dim(gwas)
head(gwas)



SNP,A1,A2,freq,b,se,p,N
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
rs7899632,A,G,0.5665,0.011,0.0095,0.2476,1474097
rs61875309,A,C,0.7953,-0.0091,0.0116,0.4295,1474097
rs150203744,T,C,0.014,-0.0152,0.0649,0.8147,1351069
rs111551711,T,C,0.9868,0.0347,0.0742,0.6396,777210
rs12258651,T,G,0.8819,-0.0011,0.0149,0.9423,1474097
rs72828461,A,G,0.9605,-0.0018,0.0325,0.9569,1365107


### (c) get the eqtl data

In [25]:
# Blood cis-eQTL table (significant associations file).
eqtl <- fread('$PATH3/cis-eQTL_significant_20181017.txt.gz')

# Notebook display: table dimensions, then the first rows.
dim(eqtl)
head(eqtl)

Pvalue,SNP,SNPChr,SNPPos,AssessedAllele,OtherAllele,Zscore,Gene,GeneSymbol,GeneChr,GenePos,NrCohorts,NrSamples,FDR
<dbl>,<chr>,<int>,<int>,<chr>,<chr>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<int>,<dbl>
2.1391e-15,rs1010944,20,49525434,G,A,-7.933,ENSG00000000419,DPM1,20,49563248,33,30396,0.0
1.9943e-06,rs1062651,20,49508683,A,G,-4.7539,ENSG00000000419,DPM1,20,49563248,29,28403,0.00584088
1.054e-08,rs12479470,20,49514589,C,T,-5.7216,ENSG00000000419,DPM1,20,49563248,35,30895,5.784433e-05
8.9327e-10,rs12479721,20,49529785,T,C,-6.1274,ENSG00000000419,DPM1,20,49563248,23,20044,1.314752e-05
1.4714e-14,rs12479950,20,49512803,T,C,-7.6899,ENSG00000000419,DPM1,20,49563248,31,29729,0.0
4.5549e-12,rs12480447,20,49496178,G,C,-6.9187,ENSG00000000419,DPM1,20,49563248,34,30772,0.0


### (d) generate blood eqtl and gwas tsvs for each gene

In [29]:
out_dir <- '$PATH1/qtl/tsv/'

# Output files are named after final_gene_file_name, whichever alias each
# data source uses for the gene.
eqtl_fn <- paste0(out_dir, final_gene_file_name, '_blood_eqtl.tsv')
gwasout_fn <- paste0(out_dir, final_gene_file_name, '_blood_gwas.tsv')

# Blood eQTL rows are looked up under the eQTL alias (eqtl_gene_name, e.g.
# RAB7L1) and reduced to (rsid, pval).
eqtl_gene <- eqtl %>%
  filter(GeneSymbol == eqtl_gene_name) %>%
  select(rsid = SNP, pval = Pvalue)
fwrite(eqtl_gene, eqtl_fn, sep = '\t')

# GWAS p-values for the SNPs found above. The GWAS table is matched by SNP id
# only, so gwas_gene_name is not needed here.
gwas_gene <- gwas %>%
  filter(SNP %in% eqtl_gene$rsid) %>%
  select(rsid = SNP, pval = p)
fwrite(gwas_gene, gwasout_fn, sep = '\t')


 

### (e) generate brain eqtl tsvs for each gene using a swarm job

In [None]:


# Assemble the smr query command for the single gene: the gene is queried by
# its eQTL alias, while the output prefix uses the final file name.
smr_cmd <- paste0(
  '$PATH4/smr_Linux',
  ' --beqtl-summary $PATH5/Brain-eMeta',
  ' --thread-num 1',
  ' --query 5.0e-2',
  ' --gene ', eqtl_gene_name,
  ' --cis-wind 2000',
  ' --out $PATH1/qtl/tsv/', final_gene_file_name, '_brain_eqtl'
)
print(smr_cmd)

 

### (f) generate brain GWAS TSVs for each gene

In [32]:

# Convert the smr query output for the single gene
# (<final_gene_file_name>_brain_eqtl.txt in out_dir) into two two-column
# (rsid, pval) TSVs: brain eQTL p-values and the GWAS p-values of the same
# SNPs. Nothing is written when the query output does not exist.
smr_out_fn <- paste0(out_dir, final_gene_file_name, '_brain_eqtl.txt')
eqtl_fn <- paste0(out_dir, final_gene_file_name, '_brain_eqtl.tsv')
gwasout_fn <- paste0(out_dir, final_gene_file_name, '_brain_gwas.tsv')

if (file.exists(smr_out_fn)) {
  qtls <- fread(smr_out_fn)

  # Count rows per (Chr, Gene), sort the most frequent pairs first, and keep
  # the first row seen for each Chr. TRUE is spelled out because `T` is an
  # ordinary variable that can be reassigned.
  # NOTE(review): distinct(Chr) keeps one row per chromosome, not per gene.
  # If the intent was "the chromosome with the most hits for each gene", this
  # should probably be distinct(Gene, ...) - confirm before changing.
  cispos <- qtls %>%
    group_by(Chr, Gene) %>%
    mutate(n = n()) %>%
    arrange(desc(n)) %>%
    ungroup() %>%
    distinct(Chr, .keep_all = TRUE) %>%
    rename(GeneChr = Chr) %>%
    select(Gene, GeneChr)

  # Keep rows whose chromosome matches the joined GeneChr and whose SNP id
  # contains 'rs'.
  qtls_cis <- inner_join(qtls, cispos, by = 'Gene') %>%
    filter(Chr == GeneChr) %>%
    data.frame() %>%
    filter(grepl('rs', SNP)) %>%
    mutate(rsid = SNP, pval = p) %>%
    select(rsid, pval)
  fwrite(qtls_cis, eqtl_fn, sep = '\t')

  # GWAS p-values restricted to the SNPs kept above.
  gwas %>%
    filter(SNP %in% qtls_cis$rsid) %>%
    mutate(rsid = SNP, pval = p) %>%
    select(rsid, pval) %>%
    fwrite(gwasout_fn, sep = '\t')
}



### (g) next steps

next we need to see which genes we can create plots for. Now start running `QTL_Proxy_SNPS.ipynb`