# GWAS Locus Browser QTL and GWAS Gene Data for (1) all Genes or (2) one Gene
- **Author(s)** - Frank Grenn and Hirotaka Iwaki
- **Date Started** - October 2019
- **Quick Description:** collect eQTL and GWAS data for genes
- **Data:** 

In [None]:
library(data.table)
library(dplyr)

In [None]:
# Root working directory and the sub-directories that receive QTL output.
# file.path() joins its arguments with "/", so the resulting strings are the
# same as the ones previously built with paste0().
WRKDIR <- '$PATH/AppDataProcessing'
QTLDIR <- file.path(WRKDIR, 'qtl')
TSVDIR <- file.path(QTLDIR, 'tsv')

# Create the TSV directory together with any missing parent directories.
# The existence check keeps a re-run of this cell from raising an
# "already exists" warning while still surfacing genuine creation failures.
if (!dir.exists(TSVDIR)) {
  dir.create(TSVDIR, recursive = TRUE)
}

## (1) All Genes

* #### (a) get the genes we want data for
* #### (b) get the gwas data
* #### (c) get the eqtl data
* #### (d) generate blood eqtl and gwas tsvs for each gene
* #### (e) generate brain eqtl tsvs for each gene using a swarm job
* #### (f) generate brain GWAS TSVs for each gene
* #### (g) next steps

### (a) get the genes we want data for
Read the list of genes from `genes_by_locus.csv`.

In [None]:
# Load the per-locus gene table and pull out its GENE column, which is the
# list of genes processed by the rest of this section.
evidence <- fread("$PATH/AppDataProcessing/genes_by_locus.csv")
evidence_genes <- evidence[["GENE"]]

# Quick sanity check: number of genes read and the first few entries.
length(evidence_genes)
head(evidence_genes)


### (b) get the gwas data for (i) meta5 or (ii) progression or (iii) asian gwas

#### (i) meta5

In [None]:
# META5 GWAS: set the GWAS id, make sure its output folder exists, pick the
# genes listed for this GWAS and load the harmonized summary statistics.
# Earlier input file, kept for reference:
#   $PATH/summary_stats/resultsForSmr_filtered.tab.gz
gwas_id <- "META5"

# Guarded, recursive creation: re-running the cell does not warn about an
# existing folder, and missing parent folders are created too.
if (!dir.exists(file.path(TSVDIR, gwas_id))) {
  dir.create(file.path(TSVDIR, gwas_id), recursive = TRUE)
}

gwas_in <- "$PATH/AppDataProcessing/meta5_sumstats_harmonized.csv"

# which() keeps only rows where the comparison is TRUE, so rows with a missing
# GWAS label are dropped.
evidence_genes <- evidence[which(evidence$GWAS == gwas_id), ]$GENE
print(length(evidence_genes))

gwas <- fread(gwas_in)
dim(gwas)
head(gwas)



#### (ii) progression (NOTE: need to run notebook twice for this because each locus has its own summary stats file)



##### Using HY3 Sum Stats File OR...

In [None]:
# Progression GWAS, HY3 summary statistics: genes of locus number 2.
gwas_id <- "Progression"

# Guarded, recursive creation: this section is run once per locus, so the
# folder usually exists already on the second pass; the check avoids the
# resulting warning and also creates missing parent folders.
if (!dir.exists(file.path(TSVDIR, gwas_id))) {
  dir.create(file.path(TSVDIR, gwas_id), recursive = TRUE)
}

# Original note for this locus: rs382940, 9:108058562, 2
# Earlier input file, kept for reference:
#   $PATH/AppDataProcessing/locuszoom/surv_HY3.txt
gwas_in <- paste0('$PATH/AppDataProcessing/prog_hy_sumstats_harmonized.csv')

# Genes that belong to locus 2 of the progression GWAS; which() drops rows
# where either comparison is NA.
evidence_genes <- evidence[
  which((evidence$LOC_NUM == 2) & (evidence$GWAS == gwas_id)),
]$GENE

print(length(evidence_genes))
print(evidence_genes)

gwas <- fread(gwas_in)

head(gwas)
dim(gwas)

##### ... Using INS Sum Stats File

In [None]:
# Progression GWAS, INS summary statistics: genes of locus number 1.
gwas_id <- "Progression"

# Guarded, recursive creation: the folder is shared with the HY3 variant of
# this section, so it often exists already; the check avoids the resulting
# warning and also creates missing parent folders.
if (!dir.exists(file.path(TSVDIR, gwas_id))) {
  dir.create(file.path(TSVDIR, gwas_id), recursive = TRUE)
}

# Original note for this locus: rs61863020, 10:112956055, 1
# Earlier input file, kept for reference:
#   $PATH/AppDataProcessing/locuszoom/base_INS.txt
gwas_in <- paste0('$PATH/AppDataProcessing/prog_ins_sumstats_harmonized.csv')

# Genes that belong to locus 1 of the progression GWAS; which() drops rows
# where either comparison is NA.
evidence_genes <- evidence[
  which(evidence$LOC_NUM == 1 & evidence$GWAS == gwas_id),
]$GENE

print(length(evidence_genes))
print(evidence_genes)

gwas <- fread(gwas_in)

head(gwas)
dim(gwas)

#### (iii) Asian GWAS

In [None]:
# Asian GWAS: set the GWAS id, make sure its output folder exists and pick
# the genes listed for this GWAS. The summary statistics are read in the
# next cell from `gwas_in`.
gwas_id <- "Asian"

# Guarded, recursive creation: re-running the cell does not warn about an
# existing folder, and missing parent folders are created too.
if (!dir.exists(file.path(TSVDIR, gwas_id))) {
  dir.create(file.path(TSVDIR, gwas_id), recursive = TRUE)
}

# Earlier input file, kept for reference:
#   $PATH/summary_stats/asian_GWAS/6724PDcases-24851controls-5843213snps-summary-stats-metaP-SE.txt.gz
gwas_in <- "$PATH/AppDataProcessing/asiangwas_sumstats_harmonized.csv"

# which() keeps only rows where the comparison is TRUE, so rows with a missing
# GWAS label are dropped.
evidence_genes <- evidence[which(evidence$GWAS == gwas_id), ]$GENE
print(length(evidence_genes))

In [None]:


# Read the summary statistics file selected by `gwas_in`, then show the first
# rows and the table dimensions as a quick check.
gwas <- data.table::fread(gwas_in)

head(gwas)
dim(gwas)

### (c) get the eqtl data

In [None]:
# Read the cis-eQTL table and show its dimensions and first rows.
eqtl <- data.table::fread(
  '$PATH/tool/eQTL/cis-eQTL_significant_20181017.txt.gz'
)
dim(eqtl)
head(eqtl)


### (d) generate blood eqtl and gwas tsvs for each gene

In [None]:
# For every gene, write two tab-separated files into the folder of the
# current GWAS:
#   <gene>_blood_eqtl.tsv  RSID and P of the gene's rows in `eqtl`
#   <gene>_blood_gwas.tsv  RSID and P of the `gwas` rows for those same SNPs
# The unused `count` variable and commented-out legacy paths were removed.
GWASTSVDIR <- paste0(TSVDIR, '/', gwas_id)
print(length(evidence_genes))

for (gene in evidence_genes) {
  eqtl_fn <- paste0(GWASTSVDIR, '/', gene, '_blood_eqtl.tsv')
  gwasout_fn <- paste0(GWASTSVDIR, '/', gene, '_blood_gwas.tsv')

  # eQTL rows of this gene, reduced to the two columns that are written out.
  eqtl_gene <- eqtl %>%
    filter(GeneSymbol == gene) %>%
    mutate(RSID = SNP, P = Pvalue) %>%
    select(RSID, P)

  # GWAS rows restricted to SNPs that also appear in the gene's eQTL set.
  gwas %>%
    filter(RSID %in% eqtl_gene$RSID) %>%
    select(RSID, P) %>%
    fwrite(gwasout_fn, sep = '\t')

  fwrite(eqtl_gene, eqtl_fn, sep = '\t')
}
 

### (e) generate brain eQTL TSVs for each gene using a swarm job

generate the swarm file

In [None]:


# Build one SMR query command per gene and append the commands to the swarm
# file, one command per line.
swarm_file <- "script/generate_brain_eqtl.swarm"

# write() does not create folders, so make sure the target folder exists;
# without this the first write fails when `script/` is missing.
if (!dir.exists(dirname(swarm_file))) {
  dir.create(dirname(swarm_file), recursive = TRUE)
}

# One command string per gene; vapply() returns character(0) for an empty
# gene list, in which case nothing is written (same as the former loop).
swarm_lines <- vapply(
  evidence_genes,
  function(gene) {
    paste0('$PATH/GBA_age_of_onset/EQTL/smr_Linux',
           ' --beqtl-summary $PATH/QTL/Brain-eMeta/Brain-eMeta',
           ' --thread-num 1',
           ' --query 5.0e-2',
           ' --gene ', gene,
           ' --cis-wind 2000',
           ' --out ', GWASTSVDIR, '/', gene, '_brain_eqtl')
  },
  character(1),
  USE.NAMES = FALSE
)

# The file is appended to, not overwritten, exactly as before: commands from
# earlier runs stay in the file. Delete it first when a fresh file is wanted.
if (length(swarm_lines) > 0) {
  write(swarm_lines, file = swarm_file, append = TRUE)
}
 

print the command to run the swarm file

In [None]:
# Show the swarm submission command so it can be copied into a terminal.
# The pieces are joined with single spaces.
print(paste(
  "swarm",
  "-f generate_brain_eqtl.swarm",
  "-g 10",
  "--partition quick",
  "--time 00:02:00"
))

### (f) generate brain GWAS TSVs for each gene

In [None]:
# For every gene that has an SMR query result (<gene>_brain_eqtl.txt), write
#   <gene>_brain_eqtl.tsv  RSID and P of the brain eQTL rows that are kept
#   <gene>_brain_gwas.tsv  RSID and P of the `gwas` rows for those same SNPs
# Genes without an SMR result file are skipped.
for (gene in evidence_genes) {
  smr_out_fn <- paste0(GWASTSVDIR, '/', gene, '_brain_eqtl.txt')
  eqtl_fn <- paste0(GWASTSVDIR, '/', gene, '_brain_eqtl.tsv')
  gwasout_fn <- paste0(GWASTSVDIR, '/', gene, '_brain_gwas.tsv')

  if (file.exists(smr_out_fn)) {
    qtls <- fread(smr_out_fn)

    # Count rows per (Chr, Gene), order by that count and keep the first row
    # of every chromosome as a Gene -> GeneChr lookup.
    # NOTE(review): distinct() is keyed on Chr, so one row per chromosome
    # survives; if the intent was one chromosome per gene this would need
    # distinct(Gene, ...) - confirm before changing.
    cispos <- qtls %>%
      group_by(Chr, Gene) %>%
      mutate(n = n()) %>%
      arrange(desc(n)) %>%
      ungroup() %>%
      distinct(Chr, .keep_all = TRUE) %>%
      rename(GeneChr = Chr) %>%
      select(Gene, GeneChr)

    # Keep rows whose chromosome matches the lookup and whose SNP id contains
    # "rs", then reduce to the two output columns.
    qtls_cis <- inner_join(qtls, cispos, by = 'Gene') %>%
      filter(Chr == GeneChr) %>%
      data.frame() %>%
      .[grep('rs', .$SNP), ] %>%
      mutate(RSID = SNP, P = p) %>%
      select(RSID, P)
    fwrite(qtls_cis, eqtl_fn, sep = '\t')

    # GWAS rows restricted to the SNPs kept above.
    gwas %>%
      filter(RSID %in% qtls_cis$RSID) %>%
      select(RSID, P) %>%
      fwrite(gwasout_fn, sep = '\t')
  }
}

### (g) next steps

Next, we need to determine which genes we can create plots for. Continue with the `QTL_Proxy_SNPS.ipynb` notebook.

## (2) For One Gene
useful with genes that have alternate names (looking at you _RAB7L1/RAB29_!)

* #### (a) specify the data gene names
* #### (b) get the gwas data
* #### (c) get the eqtl data
* #### (d) generate blood eqtl and gwas tsvs for each gene
* #### (e) generate brain eqtl tsvs for each gene using a swarm job
* #### (f) generate brain GWAS TSVs for each gene
* #### (g) next steps

### (a) specify the data gene names

In [None]:
# Gene symbol as it appears in the eQTL data.
eqtl_gene_name <- "RAB7L1"

# Gene symbol as it appears in the GWAS data.
# NOTE(review): not referenced by the cells visible in this notebook section.
gwas_gene_name <- "RAB29"

# Gene symbol used in the names of the generated files.
final_gene_file_name <- "RAB29"

### (b) get the gwas data

In [None]:
# META5 GWAS for the single-gene workflow: set the GWAS id and its output
# folder, make sure the folder exists and load the summary statistics.
gwas_id <- "META5"
GWASTSVDIR <- paste0(TSVDIR, '/', gwas_id)

# Guarded, recursive creation: re-running the cell does not warn about an
# existing folder, and missing parent folders are created too.
if (!dir.exists(GWASTSVDIR)) {
  dir.create(GWASTSVDIR, recursive = TRUE)
}

gwas_in <- "$PATH/AppDataProcessing/meta5_sumstats_harmonized.csv"
gwas <- fread(gwas_in)
dim(gwas)
head(gwas)

### (c) get the eqtl data

In [None]:
# Read the cis-eQTL table and show its dimensions and first rows.
eqtl <- data.table::fread(
  '$PATH/tool/eQTL/cis-eQTL_significant_20181017.txt.gz'
)
dim(eqtl)
head(eqtl)

### (d) generate blood eqtl and gwas tsvs for each gene

In [None]:

# Blood eQTL and matching GWAS p-values for the single gene.
# Output files are named after `final_gene_file_name`; the eQTL rows are
# looked up under `eqtl_gene_name`.
eqtl_fn <- file.path(
  GWASTSVDIR, paste0(final_gene_file_name, '_blood_eqtl.tsv')
)
gwasout_fn <- file.path(
  GWASTSVDIR, paste0(final_gene_file_name, '_blood_gwas.tsv')
)

# eQTL side (gene named RAB7L1 there): keep only SNP id and p-value.
eqtl_gene <- eqtl %>%
  filter(GeneSymbol == eqtl_gene_name) %>%
  transmute(RSID = SNP, P = Pvalue)

fwrite(eqtl_gene, eqtl_fn, sep = '\t')

# GWAS side (gene named RAB29 there): rows for the SNPs found above.
gwas_gene <- gwas %>%
  filter(RSID %in% eqtl_gene$RSID) %>%
  select(RSID, P)

fwrite(gwas_gene, gwasout_fn, sep = '\t')


 

### (e) generate brain eqtl tsvs for each gene using a swarm job

In [None]:


# Print the SMR query command for the single gene so it can be run by hand.
# The gene is queried under its eQTL name; output is named after
# `final_gene_file_name`. Arguments are joined with single spaces.
smr_args <- c(
  '$PATH/GBA_age_of_onset/EQTL/smr_Linux',
  '--beqtl-summary $PATH/GENERAL/QTL/Brain-eMeta/Brain-eMeta',
  '--thread-num 1',
  '--query 5.0e-2',
  paste0('--gene ', eqtl_gene_name),
  '--cis-wind 2000',
  paste0('--out ', GWASTSVDIR, '/', final_gene_file_name, '_brain_eqtl')
)
print(paste(smr_args, collapse = ' '))

 

### (f) generate brain GWAS TSVs for each gene

In [None]:

# Brain eQTL and matching GWAS p-values for the single gene. Runs only when
# the SMR query result (<name>_brain_eqtl.txt) exists; otherwise nothing is
# written.
smr_out_fn <- paste0(GWASTSVDIR, '/', final_gene_file_name, '_brain_eqtl.txt')
eqtl_fn <- paste0(GWASTSVDIR, '/', final_gene_file_name, '_brain_eqtl.tsv')
gwasout_fn <- paste0(GWASTSVDIR, '/', final_gene_file_name, '_brain_gwas.tsv')

if (file.exists(smr_out_fn)) {
  qtls <- fread(smr_out_fn)

  # Count rows per (Chr, Gene), order by that count and keep the first row of
  # every chromosome as a Gene -> GeneChr lookup.
  # NOTE(review): distinct() is keyed on Chr, so one row per chromosome
  # survives; if the intent was one chromosome per gene this would need
  # distinct(Gene, ...) - confirm before changing.
  cispos <- qtls %>%
    group_by(Chr, Gene) %>%
    mutate(n = n()) %>%
    arrange(desc(n)) %>%
    ungroup() %>%
    distinct(Chr, .keep_all = TRUE) %>%
    rename(GeneChr = Chr) %>%
    select(Gene, GeneChr)

  # Keep rows whose chromosome matches the lookup and whose SNP id contains
  # "rs", then reduce to the two output columns.
  qtls_cis <- inner_join(qtls, cispos, by = 'Gene') %>%
    filter(Chr == GeneChr) %>%
    data.frame() %>%
    .[grep('rs', .$SNP), ] %>%
    mutate(RSID = SNP, P = p) %>%
    select(RSID, P)
  fwrite(qtls_cis, eqtl_fn, sep = '\t')

  # GWAS rows restricted to the SNPs kept above.
  gwas %>%
    filter(RSID %in% qtls_cis$RSID) %>%
    select(RSID, P) %>%
    fwrite(gwasout_fn, sep = '\t')
}



### (g) next steps

Next, we need to determine which genes we can create plots for. Continue with the `QTL_Proxy_SNPS.ipynb` notebook.