-
Notifications
You must be signed in to change notification settings - Fork 2
/
psychimmgenB11.Rmd
243 lines (195 loc) · 10 KB
/
psychimmgenB11.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
---
title: "Find nearest genes and perform overrepresentation (pathway) analysis"
output: html_document
---
```{r}
library(ggplot2)
library(tidyverse)
#set_here("/Users/mary/non_dropbox/exps/exp059_snp_to_sc/")
setwd("/Users/mary/non_dropbox/exps/exp059_snp_to_sc/")
library(here)
library(ChIPseeker)
# Loads cytoimmgen CHEERS results, including the snps overlapping peaks
load(here("res/cheers_cytoimmgen_processed.RData"))
# T cells
t_celllist <- c('naive_16H_UNS', 'naive_D5_UNS','memory_16H_UNS', 'memory_D5_UNS', 'naive_16H_TH0', 'naive_16H_TH1', 'naive_16H_TH2', 'naive_16H_TH17', 'naive_16H_ITREG', 'naive_16H_IL10', 'naive_16H_IL21', 'naive_16H_IL27', 'naive_16H_IFNB', 'naive_16H_TNFA', 'naive_D5_TH0', 'naive_D5_TH1', 'naive_D5_TH2', 'naive_D5_TH17', 'naive_D5_ITREG', 'naive_D5_IL10', 'naive_D5_IL21', 'naive_D5_IL27', 'naive_D5_IFNB', 'naive_D5_TNFA', 'memory_16H_TH0', 'memory_16H_TH1', 'memory_16H_TH2', 'memory_16H_TH17', 'memory_16H_ITREG', 'memory_16H_IL10', 'memory_16H_IL21', 'memory_16H_IL27', 'memory_16H_IFNB', 'memory_16H_TNFA', 'memory_D5_TH0', 'memory_D5_TH1', 'memory_D5_TH2', 'memory_D5_TH17', 'memory_D5_ITREG', 'memory_D5_IL10', 'memory_D5_IL21', 'memory_D5_IL27', 'memory_D5_IFNB', 'memory_D5_TNFA')
lynall_get_top_peaks <- function(pValsFile=NULL, uniquePeaksFile=NULL, celllist=NULL){
# Function gets top 10% of peaks active in the significantly enriched cell subsets (Bonferroni p<0.05)
# Output is vector of peaks
pvals <- read.table(pValsFile, sep="\t")
library(magrittr)
pvals %<>% dplyr::rename(cell_condition=V1,value=V2)
print(sprintf("There are %d used cell types as follows:", length(celllist)))
print(celllist)
if (length(celllist)==0){
return(NA)
} else {
peaks <- read.table(uniquePeaksFile, sep="\t", header=T)[,c("chr","start","end",celllist)]
peaks$peak <- paste(peaks$chr, peaks$start, peaks$end, sep = '_')
# For each cell type, convert to percetile ranks (scales 0 --> 1)
peaks_rank <- dplyr::mutate(peaks, across(.cols=all_of(celllist), percent_rank))
# Keep any peaks with percentile rank >=0.9 in any of the sig cell types
peaks_top <- peaks_rank %>% filter_at(vars(all_of(celllist)), .vars_predicate = any_vars(. >= 0.9)) %>% dplyr::arrange(peak)
}
}
top_peaks <- list()
# Include those disorders showing T cell enrichment in cytoimmgen
for (i in c("bmi","mdd","crossdis","szs","ra")){
print(i)
top_peaks[[i]] <- lynall_get_top_peaks(pValsFile=here(paste0("res/cheersout_500/",i,"_cytoimmgen_H3K27ac_disease_enrichment_pValues.txt")), uniquePeaksFile=here(paste0("res/cheersout_500/",i,"_cytoimmgen_H3K27ac_uniquePeaks.txt")), celllist=t_celllist)
}
length(top_peaks)
```
Prepare peaks as granges for ChIPseeker
```{r}
library(GenomicRanges)
peaks_top_granges <- lapply(top_peaks, function(x) {GenomicRanges::makeGRangesFromDataFrame(x,
keep.extra.columns=FALSE,
ignore.strand=TRUE,
seqnames.field="chr",
start.field="start",
end.field="end",
starts.in.df.are.0based=FALSE)})
```
Load organism databases required for seq2gene
```{r}
library(org.Hs.eg.db)
library(EnsDb.Hsapiens.v86)
edb <- EnsDb.Hsapiens.v86
genomeStyles()$Homo_sapiens
seqlevelsStyle(edb) <- "UCSC"
```
Get union of exons / introns overlapping peaks and nearest genes. Main modification of lynall_seq2gene compared to CHiPseeker::seq2gene is to retain information about what the peak is overlapping in the function output. A single peak can overlap multiple genomic features (an exon, an intron, a promoter of another gene etc.)
```{r}
source(here("workflow/scripts/functions/lynall_seq2gene.R"))
out <- lapply(peaks_top_granges, function(x) {lynall_seq2gene(x, tssRegion = c(-3000, 3000), flankDistance = 10000, TxDb=edb, sameStrand = FALSE)})
```
Over-representation analysis using pooled reactome and GO (Biological Processes) pathways.
```{r}
# Make term2gene for reactome
reactome <- read.csv("/Users/mary/non_dropbox/exps/exp042_meninges_stress_dropseq/data/raw/reactome_download/Ensembl2Reactome_All_Levels.txt", sep="\t", header=F)
react <- reactome %>% dplyr::filter(grepl("R-HSA-",V2)) %>% dplyr::filter(grepl("ENSG",V1))
human_t2g_reactome <- data.frame(gs_name=react$V4, ens=react$V1)
# Get GO term2gene
#library(remotes)
#remotes::install_version("msigdbr","7.4.1")
library(msigdbr) # Requires version 7.4.1
tmp = msigdbr(species = "human", category = "C5", subcategory = "GO:BP")
human_t2g_go <- data.frame(gs_name=tmp$gs_name, ens=tmp$ensembl_gene, symbol=tmp$human_gene_symbol)
# Prettify
human_t2g_go$gs_name <- gsub("GOBP_","",human_t2g_go$gs_name)
human_t2g_go$gs_name <- gsub("_"," ",human_t2g_go$gs_name)
human_t2g_go$gs_name <- paste0(human_t2g_go$gs_name," (GO)")
library(magrittr)
human_t2g_reactome$gs_name %<>% toupper
human_t2g_reactome$gs_name <- paste0(human_t2g_reactome$gs_name," (Reactome)")
human_t2g_pooled <- bind_rows(human_t2g_go, human_t2g_reactome)
sample_n(human_t2g_pooled,20)
# Do pooled reactome and GO over-representation analysis
#remotes::install_url("https://bioconductor.org/packages/3.14/bioc/src/contrib/clusterProfiler_4.2.2.tar.gz")
# NB. enricher uses phyper(...,lower.tail = FALSE) i.e. one-sided hypergeometric test
library(clusterProfiler)
enrichout_pooled <- lapply(out, function(x){enricher(gene=x$unique_genes,minGSSize = 10, maxGSSize = 500, TERM2GENE = human_t2g_pooled)})
# And show the sig results
enrichout_pooled_readable <- lapply(enrichout_pooled, function(x) {setReadable(x, 'org.Hs.eg.db', 'ENSEMBL')})
result_pooled <- lapply(enrichout_pooled_readable, function(x){x@result[x@result$p.adjust<0.05,c("p.adjust", "geneID","Count")]})
```
Print results to file
Unique gene lists for each disorder
```{r}
nearestgenes <- lapply(out, function(x){x$full_dataframe %>% select(peak_location, gene, symbol, overlap_type)})
uniquenearestgenes <- lapply(nearestgenes, function(x){x %>% select(gene,symbol) %>% distinct()})
# FOR SUPPLEMENTARY TABLE 5
for (i in c("bmi","mdd","crossdis","szs", "ra")){
write.table(uniquenearestgenes[[i]], file=here(paste0("res/overrepresentation_tcells_uniquenearestgenes_",i,".tsv")), row.names = F, quote=F, sep="\t")
}
# Number of unique nearest genes
lapply(uniquenearestgenes,nrow)
#bmi 592
#mdd 65
#crossdis 99
#szs 100
#ra 53
# The unique nearest genes found in significantly enriched pathways
uniquegenesinpathway <- lapply(enrichout_pooled_readable, function(x){unique(strsplit(paste(x[,"geneID"],collapse="/"),"/")[[1]])})
# Proportion in enriched pathways
for (i in c("bmi","mdd","crossdis","szs", "ra")){
print(i)
print(sprintf("Total input genes: %d", nrow(uniquenearestgenes[[i]])))
print(sprintf("Total genes in significantly enriched pathways: %d", length(uniquegenesinpathway[[i]])))
print(sprintf("Percent in pathway = %f", length(uniquegenesinpathway[[i]])/nrow(uniquenearestgenes[[i]])*100))
}
# [1] "bmi"
# [1] "Total input genes: 592"
# [1] "Total genes in significantly enriched pathways: 0"
# [1] "Percent in pathway = 0.000000"
# [1] "mdd"
# [1] "Total input genes: 65"
# [1] "Total genes in significantly enriched pathways: 8"
# [1] "Percent in pathway = 12.307692"
# [1] "crossdis"
# [1] "Total input genes: 99"
# [1] "Total genes in significantly enriched pathways: 21"
# [1] "Percent in pathway = 21.212121"
# [1] "szs"
# [1] "Total input genes: 100"
# [1] "Total genes in significantly enriched pathways: 32"
# [1] "Percent in pathway = 32.000000"
# [1] "ra"
# [1] "Total input genes: 53"
# [1] "Total genes in significantly enriched pathways: 23"
# [1] "Percent in pathway = 43.396226"
```
The below are the easiest way to see the genes driving the enrichment results - have transferred these onto the figure S8
```{r}
for (i in c("bmi","mdd","crossdis","szs", "ra")){
write.table(result_pooled[[i]], file=here(paste0("res/overrepresentation_tcells_",i,".tsv")), row.names = T, quote=F, sep="\t")
}
for (i in c("bmi","mdd","crossdis","szs", "ra")){
write.table(out[[i]]$unique_genes, file=here(paste0("res/overrepresentation_tcells_uniquegenes_",i,".tsv")), row.names = F, col.names=T, quote=F, sep="\t")
}
# Proportion of genes picked out in overrepresentation analysis
```
Make GSEA plots for supplementary, showing top 10 significant pathways
Show -log10 p-value on x-axis and colour by gene ratio, filtering on p(adj)<0.05
```{r}
# What is the range of geneRatios across all?
grmax <- NULL
grmin <- NULL
for (i in 1:length(enrichout_pooled_readable)){
gr <- enrichout_pooled_readable[[i]]@result %>% dplyr::filter(p.adjust<0.05)
gr <- gr %>% rowwise() %>% dplyr::mutate(geneRatio = eval(parse(text = GeneRatio)))
grmax[i] <- max(gr$geneRatio)
grmin[i] <- min(gr$geneRatio)
}
grmax
grmin
# Choose visualization range 0 --> 0.4 as this will cover the range for all disorders
lynall_ora_plot <- function(oraout, name="Unnamed", padjthh=0.05, picwidth=15, topn=10){
df <- oraout@result %>% dplyr::arrange(p.adjust)
df <- df[1:topn,]
df %<>% dplyr::filter(p.adjust<padjthh)
df$ID %<>% factor(levels=rev(unique(df$ID)), ordered = T)
df$geneRatio <- sapply(df$GeneRatio, function(x){eval(parse(text = x))})
p <- ggplot(df, aes(x = -log10(pvalue), y = ID, color=geneRatio)) +
geom_point(size=4) +
theme_bw(base_size = 14) +
scale_color_gradient(low="yellow",high="red", limits=c(0,0.32), breaks=c(0,0.1,0.2,0.3)) +
ylab(NULL) +
expand_limits(x=0) +
theme(legend.title = element_blank()) +
xlab("-log10(p-value)") +
ggtitle(name)
print(p)
ggsave(p,path=here::here("pics/"),filename=paste0("ora_out_",name,".pdf"),width=picwidth,height=(2+nrow(df)*0.11))
p
}
# Plot results for T cell enriched disorders i.e. Xdiss, SZS, MDD, RA, BMI. Nil sig for BMI so not shown
plots <- list()
for (i in c("bmi","mdd","crossdis","szs", "ra")){
plots[[i]] <- lynall_ora_plot(oraout=enrichout_pooled_readable[[i]], name=i, padjthh=0.05, picwidth=10, topn=10)
print(plots[[i]])
}
g <- cowplot::plot_grid(plotlist=list(plots$crossdis, plots$szs, plots$mdd, plots$ra), ncol=1, align="v", rel_heights=c(1,1,0.45,1))
ggsave(g, filename=here("pics/ora_for_supp_select.pdf"), width=11, height=12)
```