# Generate GWAS Locus Browser Psychencode QTL and GWAS Gene Data
- **Author(s)** - Frank Grenn
- **Date Started** - March 2020
- **Quick Description:** collect Psychencode QTL and GWAS data for genes
- **Data:** 

In [None]:
library(data.table)
library(dplyr)
library("EnsDb.Hsapiens.v86")

In [None]:
# Root working directory and the derived output directories for QTL tables.
WRKDIR <- '/path/to/AppDataProcessing'
QTLDIR <- paste0(WRKDIR, '/qtl')
TSVDIR <- paste0(QTLDIR, '/tsv')
# TSVDIR sits two levels below WRKDIR, so create missing parents as well, and
# skip the call on re-runs so no "already exists" warning is raised.
if (!dir.exists(TSVDIR)) {
  dir.create(TSVDIR, recursive = TRUE)
}

## 1) Gene List

In [None]:
# Load the genes-by-locus table from the working directory.
evidence <- fread(input = paste0(WRKDIR, "/genes_by_locus.csv"))

# Start with every entry of the GENE column; the GWAS-specific cells below
# overwrite evidence_genes with a narrower selection.
evidence_genes <- evidence[["GENE"]]

length(evidence_genes)
head(evidence_genes)


## 2) Get the GWAS data for (a) META5, (b) Progression (HY3 or INS sum stats), or (c) Asian GWAS

#### (a) meta5

In [None]:
#META5
gwas_id <- "META5"
# Create the per-GWAS output directory only when it is missing, so that
# re-running the cell does not warn, and build missing parents too.
if (!dir.exists(paste0(TSVDIR, '/', gwas_id))) {
  dir.create(paste0(TSVDIR, '/', gwas_id), recursive = TRUE)
}
gwas_in <- "/path/to/AppDataProcessing/meta5_sumstats_harmonized.csv"
# Keep only the genes whose GWAS column matches this GWAS id.
evidence_genes <- evidence[which(evidence$GWAS == gwas_id), ]$GENE
print(length(evidence_genes))


gwas <- fread(gwas_in)
dim(gwas)
head(gwas)


In [None]:
# Drop SNPs whose RSID is the "." placeholder (no rsid).
gwas_smr_rsid <- dplyr::filter(gwas, RSID != ".")
dim(gwas_smr_rsid)
head(gwas_smr_rsid)

In [None]:
# The file contains fully duplicated rows; keep the first copy of each.
gwas_smr_unique <- gwas_smr_rsid[!duplicated(gwas_smr_rsid), ]
dim(gwas_smr_unique)
head(gwas_smr_unique)

In [None]:
# From here on `gwas` refers to the cleaned table.
gwas <- gwas_smr_unique

#### (b) progression (NOTE: need to run notebook twice for this because each locus has its own summary stats file)

##### Using HY3 Sum Stats File OR...

In [None]:
#Progression Loci
gwas_id <- "Progression"
# Both progression runs (HY3 and INS) write into this same directory, so it
# normally exists already on the second run; create it only when missing.
if (!dir.exists(paste0(TSVDIR, '/', gwas_id))) {
  dir.create(paste0(TSVDIR, '/', gwas_id), recursive = TRUE)
}

##rs382940, 9:108058562, 2
#gwas_in = paste0('/path/to/AppDataProcessing/locuszoom/surv_HY3.txt')
gwas_in <- '/path/to/AppDataProcessing/prog_hy_sumstats_harmonized.csv'
# Genes for locus number 2 of the Progression GWAS.
evidence_genes <- evidence[which((evidence$LOC_NUM == 2) & (evidence$GWAS == gwas_id)), ]$GENE



print(length(evidence_genes))
print(evidence_genes)

gwas <- fread(gwas_in)


head(gwas)
dim(gwas)




In [None]:
# Drop SNPs whose RSID is an empty string (no rsid).
gwas_smr_rsid <- dplyr::filter(gwas, RSID != "")
dim(gwas_smr_rsid)
head(gwas_smr_rsid)



In [None]:
# The file contains fully duplicated rows; keep the first copy of each.
gwas_smr_unique <- gwas_smr_rsid[!duplicated(gwas_smr_rsid), ]
dim(gwas_smr_unique)
head(gwas_smr_unique)



In [None]:
# From here on `gwas` refers to the cleaned table.
gwas <- gwas_smr_unique

##### ... Using INS Sum Stats File

In [None]:
#Progression Loci
gwas_id <- "Progression"
# Both progression runs (HY3 and INS) write into this same directory, so it
# normally exists already on the second run; create it only when missing.
if (!dir.exists(paste0(TSVDIR, '/', gwas_id))) {
  dir.create(paste0(TSVDIR, '/', gwas_id), recursive = TRUE)
}


##rs61863020, 10:112956055, 1
##gwas_in = paste0('/path/to/AppDataProcessing/locuszoom/base_INS.txt')
gwas_in <- '/path/to/AppDataProcessing/prog_ins_sumstats_harmonized.csv'
# Genes for locus number 1 of the Progression GWAS.
evidence_genes <- evidence[which(evidence$LOC_NUM == 1 & evidence$GWAS == gwas_id), ]$GENE

print(length(evidence_genes))
print(evidence_genes)

gwas <- fread(gwas_in)


head(gwas)
dim(gwas)

In [None]:
# Drop SNPs whose RSID is an empty string (no rsid).
gwas_smr_rsid <- dplyr::filter(gwas, RSID != "")
dim(gwas_smr_rsid)
head(gwas_smr_rsid)



In [None]:
# The file contains fully duplicated rows; keep the first copy of each.
gwas_smr_unique <- gwas_smr_rsid[!duplicated(gwas_smr_rsid), ]
dim(gwas_smr_unique)
head(gwas_smr_unique)

In [None]:
# From here on `gwas` refers to the cleaned table.
gwas <- gwas_smr_unique

#### (c) Asian GWAS

In [None]:
gwas_id <- "Asian"
# Create the per-GWAS output directory only when it is missing, so that
# re-running the cell does not warn, and build missing parents too.
if (!dir.exists(paste0(TSVDIR, '/', gwas_id))) {
  dir.create(paste0(TSVDIR, '/', gwas_id), recursive = TRUE)
}
#gwas_in = paste0("/path/to/asian_GWAS/6724PDcases-24851controls-5843213snps-summary-stats-metaP-SE.txt.gz")
gwas_in <- "/path/to/AppDataProcessing/asiangwas_sumstats_harmonized.csv"
# Keep only the genes whose GWAS column matches this GWAS id.
evidence_genes <- evidence[which(evidence$GWAS == gwas_id), ]$GENE
print(length(evidence_genes))



gwas <- fread(gwas_in)


head(gwas)
dim(gwas)

In [None]:
# Drop SNPs whose RSID is an empty string (no rsid).
gwas_smr_rsid <- dplyr::filter(gwas, RSID != "")
dim(gwas_smr_rsid)
head(gwas_smr_rsid)


In [None]:
# The file contains fully duplicated rows; keep the first copy of each.
gwas_smr_unique <- gwas_smr_rsid[!duplicated(gwas_smr_rsid), ]
dim(gwas_smr_unique)
head(gwas_smr_unique)

In [None]:
# From here on `gwas` refers to the cleaned table.
gwas <- gwas_smr_unique

## 3) Get Risk Variant Data

In [None]:
# Risk-variant table used later for the cQTL and fQTL look-ups.
gwas_risk_variants <- fread(input = "/path/to/AppDataProcessing/gwas_risk_variants.csv")
dim(gwas_risk_variants)
head(gwas_risk_variants)


In [None]:
# Output directory for whichever GWAS was selected in section 2.
GWASTSVDIR <- paste(TSVDIR, gwas_id, sep = '/')
GWASTSVDIR

## 4) eQTL
read the psychencode eQTL data  
should be similar to the blood and brain eQTL from before

In [None]:
# Significant eQTL table (hg19) from the psychencode directory.
eqtl <- fread(input = '/path/to/AppDataProcessing/qtl/psychencode/DER-08a_hg19_eQTL.significant.txt')

dim(eqtl)
head(eqtl)


#### we need to get the gene names that match the gene_id

In [None]:
# Strip everything from the first "." onward in gene_id (the version suffix)
# so the ids can be joined against the EnsDb gene ids later.
eqtl$gene_id_no_version <- sub("\\..*", "", eqtl$gene_id)
dim(eqtl)
head(eqtl)

In [None]:
# Pull tx_id, gene_id and gene_name for the transcripts in the EnsDb v86
# annotation package.
edb <- EnsDb.Hsapiens.v86
tx <- transcripts(
    edb,
    columns = c("tx_id", "gene_id", "gene_name")
)

In [None]:
# Two-column lookup: Ensembl gene id -> gene name (one row per transcript).
mapping <- cbind(
    gene_id = tx$gene_id,
    name = tx$gene_name
)
dim(mapping)
head(mapping)

In [None]:
# Collapse the per-transcript rows down to unique (gene_id, name) pairs.
mapping <- mapping[!duplicated(mapping, MARGIN = 1), ]
dim(mapping)
head(mapping)

In [None]:
# Left join: keep every eQTL row and attach the gene name where the
# version-less gene id is found in the mapping.
qtl <- merge(
    eqtl, mapping,
    by.x = 'gene_id_no_version',
    by.y = 'gene_id',
    all.x = TRUE
)
dim(qtl)
head(qtl)

Now we need to get the rsIDs for each SNP_id.

In [None]:
# SNP information table; joined below through its PEC_id column.
psych_snps <- fread(input = "/path/to/AppDataProcessing/qtl/psychencode/SNP_Information_Table_with_Alleles.txt")
dim(psych_snps)
head(psych_snps)

In [None]:

# Left join on SNP_id == PEC_id so every QTL row is kept even when the SNP
# has no entry in the information table.
qtl <- merge(
    qtl, psych_snps,
    by.x = "SNP_id",
    by.y = "PEC_id",
    all.x = TRUE
)
class(qtl)
typeof(qtl)
dim(qtl)
head(qtl)


In [None]:
# Build the variant key as "<SNP_id>_<REF>/<ALT>"; the export loop matches it
# against the CHR_BP_REF_ALT column of the GWAS table.
qtl$CHR_BP_REF_ALT <- with(qtl, paste0(SNP_id, "_", REF, "/", ALT))
dim(qtl)
head(qtl)

In [None]:


# For every evidence gene write two TSVs into GWASTSVDIR:
#   <gene>_e_pe_eqtl.tsv  - eQTL rows for that gene (RSID, CHR_BP_REF_ALT, P)
#   <gene>_e_pe_gwas.tsv  - GWAS rows for the same variants (same columns)
# Genes without any eQTL row still get (empty) files.
print(length(evidence_genes))
# A gene listed more than once would only rewrite identical files, so iterate
# over the distinct names.
for (gene in unique(evidence_genes)) {
    print(gene)
    eqtl_fn <- paste0(GWASTSVDIR, '/', gene, '_e_pe_eqtl.tsv')
    gwasout_fn <- paste0(GWASTSVDIR, '/', gene, '_e_pe_gwas.tsv')

    # .env$ forces `gene` to be the loop variable even if the table ever
    # gains a column with that name.
    eqtl_gene <- qtl %>%
        dplyr::filter(name == .env$gene) %>%
        dplyr::mutate(RSID = Rsid, P = nominal_pval) %>%
        dplyr::select(RSID, CHR_BP_REF_ALT, P)

    # GWAS statistics restricted to the variants present in this gene's eQTLs.
    eqtl_variants <- eqtl_gene$CHR_BP_REF_ALT
    gwas_gene <- gwas %>%
        dplyr::filter(CHR_BP_REF_ALT %in% .env$eqtl_variants) %>%
        dplyr::select(RSID, CHR_BP_REF_ALT, P)

    fwrite(gwas_gene, gwasout_fn, sep = '\t')
    fwrite(eqtl_gene, eqtl_fn, sep = '\t')
}
 

## 5) cQTL (chromatin QTL)
no gene or transcript id available in the data so we will just check for risk snps

In [None]:
# Significant chromatin-QTL table (hg19) from the psychencode directory.
cqtl <- fread(input = '/path/to/AppDataProcessing/qtl/psychencode/DER-09_hg19_cQTL.significant.txt')

dim(cqtl)
head(cqtl)


In [None]:

# Keep columns 1, 2 and 5 through 15 by position (columns 3 and 4 are dropped).
cqtl <- cqtl[, c(1, 2, 5:15), with = FALSE]
head(cqtl)

In [None]:
colnames(gwas_risk_variants)

In [None]:
# Show the first ten join keys from each side. R vectors are 1-based, so the
# former x[0:10] silently ignored index 0 and padded short vectors with NA;
# head() returns at most ten existing values.
print(head(gwas_risk_variants$"CHR_BP", 10))
print(head(cqtl$"SNP_id", 10))

In [None]:
# Inner join of the risk variants with the cQTL rows (CHR_BP == SNP_id);
# any row in the result is a risk variant that is also a cQTL.
risk_cQTL_snps <- merge(
    gwas_risk_variants, cqtl,
    by.x = "CHR_BP",
    by.y = "SNP_id"
)
print(dim(risk_cQTL_snps))
print(head(risk_cQTL_snps))

## 6) isoQTL (isoform QTL)
need to separate isoforms into separate plots

In [None]:
# Isoform-QTL table (hg19) from the psychencode directory.
iqtl <- fread(input = '/path/to/AppDataProcessing/qtl/psychencode/DER-10b_hg19_isoQTL.FPKM5.all.txt')

dim(iqtl)
head(iqtl)


In [None]:
# Strip everything from the first "." onward in transcript_id (the version
# suffix) so the ids can be joined against the EnsDb transcript ids later.
iqtl$tx_id_no_version <- sub("\\..*", "", iqtl$transcript_id)
dim(iqtl)
head(iqtl)

In [None]:
# Pull tx_id, gene_id and gene_name for the transcripts in the EnsDb v86
# annotation package.
edb <- EnsDb.Hsapiens.v86
tx <- transcripts(
    edb,
    columns = c("tx_id", "gene_id", "gene_name")
)

In [None]:
# Two-column lookup: Ensembl transcript id -> gene name.
mapping <- cbind(
    tx_id = tx$tx_id,
    name = tx$gene_name
)
dim(mapping)
head(mapping)

In [None]:
# Remove any repeated (tx_id, name) rows.
mapping <- mapping[!duplicated(mapping, MARGIN = 1), ]
dim(mapping)
head(mapping)

In [None]:
# Left join: keep every isoQTL row and attach the gene name where the
# version-less transcript id is found in the mapping.
qtl <- merge(
    iqtl, mapping,
    by.x = 'tx_id_no_version',
    by.y = 'tx_id',
    all.x = TRUE
)
dim(qtl)
head(qtl)

In [None]:
# SNP information table; joined below through its PEC_id column.
psych_snps <- fread(input = "/path/to/AppDataProcessing/qtl/psychencode/SNP_Information_Table_with_Alleles.txt")
dim(psych_snps)
head(psych_snps)

In [None]:

# Left join on SNP_id == PEC_id so every isoQTL row is kept even when the SNP
# has no entry in the information table.
qtl <- merge(
    qtl, psych_snps,
    by.x = "SNP_id",
    by.y = "PEC_id",
    all.x = TRUE
)
class(qtl)
typeof(qtl)
dim(qtl)
head(qtl)


In [None]:
# Sanity counts: distinct version-less transcript ids, distinct versioned
# transcript ids, total rows, and distinct gene names.
print(uniqueN(qtl$tx_id_no_version))
print(uniqueN(qtl$transcript_id))
print(length(qtl$transcript_id))
print(uniqueN(qtl$name))

In [None]:
# One row per distinct (transcript, gene) pair found in the isoQTL table.
gene_transcript_df <- qtl %>%
    dplyr::select(transcript = tx_id_no_version, gene = name) %>%
    dplyr::distinct()
print(head(gene_transcript_df))
# Both counts should agree, since the table was already de-duplicated.
print(nrow(unique(gene_transcript_df)))
print(nrow(gene_transcript_df))

In [None]:
length(evidence_genes)

In [None]:
# Restrict the transcript/gene pairs to genes on the evidence list.
filtered_gene_transcript_df <- dplyr::filter(
    gene_transcript_df,
    gene %in% evidence_genes
)
print(dim(filtered_gene_transcript_df))
print(head(filtered_gene_transcript_df))

In [None]:
# Build the variant key as "<SNP_id>_<REF>/<ALT>"; the export loop matches it
# against the CHR_BP_REF_ALT column of the GWAS table.
qtl$CHR_BP_REF_ALT <- with(qtl, paste0(SNP_id, "_", REF, "/", ALT))
dim(qtl)
head(qtl)

In [None]:
# For every (gene, transcript) pair write two TSVs into GWASTSVDIR:
#   <gene>_<transcript>_i_pe_isoqtl.tsv - isoQTL rows (RSID, CHR_BP_REF_ALT, P)
#   <gene>_<transcript>_i_pe_gwas.tsv   - GWAS rows for the same variants
nrow(filtered_gene_transcript_df)
# seq_len() gives an empty sequence when no pair survived the filter;
# 1:nrow(...) would instead run the body for i = 1 and i = 0 on rows that do
# not exist.
for (i in seq_len(nrow(filtered_gene_transcript_df))) {
    iso_gene <- filtered_gene_transcript_df$gene[i]
    iso_tx <- filtered_gene_transcript_df$transcript[i]
    print(paste0(iso_gene, " ", iso_tx))
    isoqtl_fn <- paste0(GWASTSVDIR, '/', iso_gene, "_", iso_tx, '_i_pe_isoqtl.tsv')
    gwasout_fn <- paste0(GWASTSVDIR, '/', iso_gene, "_", iso_tx, '_i_pe_gwas.tsv')

    # .env$ pins the comparison values to the loop variables rather than to
    # any same-named column of the table.
    isoqtl_gene <- qtl %>%
        dplyr::filter(
            name == .env$iso_gene,
            tx_id_no_version == .env$iso_tx
        ) %>%
        dplyr::mutate(RSID = Rsid, P = nominal_pval) %>%
        dplyr::select(RSID, CHR_BP_REF_ALT, P)

    # GWAS statistics restricted to the variants present for this isoform.
    isoqtl_variants <- isoqtl_gene$CHR_BP_REF_ALT
    gwas_gene <- gwas %>%
        dplyr::filter(CHR_BP_REF_ALT %in% .env$isoqtl_variants) %>%
        dplyr::select(RSID, CHR_BP_REF_ALT, P)

    fwrite(gwas_gene, gwasout_fn, sep = '\t')
    fwrite(isoqtl_gene, isoqtl_fn, sep = '\t')
}
 

## 7) fQTL (cell fraction QTL)
no gene or transcript id available in the data so we will just check for risk variants

In [None]:
# Significant cell-fraction-QTL table (hg19) from the psychencode directory.
fqtl <- fread(input = '/path/to/AppDataProcessing/qtl/psychencode/DER-11_hg19_fQTL.significant.txt')

dim(fqtl)
head(fqtl)


In [None]:
# Derive a "CHR:BP" key from the chromosome (with any "chr" removed) and the
# variant locus, so the table can be joined to the risk variants on CHR_BP.
fqtl$CHR_BP <- paste(
    gsub("chr", "", fqtl$Chromosome_of_variant),
    fqtl$Locus_of_variant,
    sep = ":"
)
print(head(fqtl))

In [None]:
# Hand-made single-row record with one value per fqtl column. The names given
# here are placeholders: they are overwritten with fqtl's own column names on
# the next statement.
testrow <- data.frame(
    "#Cell_Type" = "Ex3",
    "Chromosome_of_variant" = "chr1",
    "Locus_of_variant" = 154898185,
    "Nominal_p_val_of_association" = 0.05,
    "Bonferroni_corrected_p_val" = 0.005,
    "Regression_slope" = 0.5,
    "CHR:BP" = "1:154898185"
)
colnames(testrow) <- colnames(fqtl)
print(testrow)

In [None]:
# `testrow` is a hand-made record, not a row from the fQTL file. Appending it
# unconditionally puts a fabricated association into fqtl, and any later join
# on CHR_BP would treat it like real data. Keep the injection opt-in: set the
# flag to TRUE only when deliberately smoke-testing the merge.
inject_fqtl_test_row <- FALSE
if (inject_fqtl_test_row) {
    fqtl <- rbind(fqtl, testrow)
}

In [None]:
# Inner join of the risk variants with the fQTL rows on the shared CHR_BP key;
# any row in the result is a risk variant that is also present in fqtl.
risk_fQTL_snps <- merge(
    gwas_risk_variants, fqtl,
    by.x = "CHR_BP",
    by.y = "CHR_BP"
)
print(dim(risk_fQTL_snps))
print(head(risk_fQTL_snps))