# Pathway enrichment in LvsHC Reynolds markers
To interpret the genes found in the differential expression analysis (DEA) and understand their biological context:

Here I perform pathway enrichment against different databases (KEGG, Reactome, ...) on the common markers of the datasets.

GO enrichment is also performed.

In [0]:
%sh
apt-get -y install libglpk-dev # Fixes igraph; needs to run before the R libraries are loaded; compatible with 14.3 LTS

In [0]:
# Load libraries
## Append the library folder
.libPaths(c("/dbfs/home/boriol@almirall.com/my_r_packages/bulkRNASeq_PBMCs_R4.3", .libPaths()))

# Load libraries
library(clusterProfiler)
library(ReactomePA) 
library(msigdbr)
library(DOSE)
library(tidyverse)
library(org.Hs.eg.db)
library(biomaRt)

.libPaths(c("/dbfs/home/jtrincado@almirall.com/my_r_packages/Seurat", .libPaths()))
library(openxlsx)
library(enrichplot)
library(ggplot2)

##Functions, G_list and HALLMARKS databases needed

In [0]:
# Build a ranked, Entrez-named gene vector from a differential-expression table.
#
# Arguments:
#   res    - data frame with the columns `gene`, `p_val_adj` and `avg_log2FC`.
#   G_list - data frame mapping `hgnc_symbol` to `entrezgene_id`.
#
# Returns:
#   Named numeric vector holding rank = -log10(p_val_adj) * sign(avg_log2FC),
#   named by Entrez ID, restricted to finite values and sorted in decreasing
#   order.
prepare_gene_list <- function(res, G_list) {
  missing_res <- setdiff(c("gene", "p_val_adj", "avg_log2FC"), colnames(res))
  if (length(missing_res) > 0) {
    stop(
      "`res` is missing required column(s): ",
      paste(missing_res, collapse = ", "),
      call. = FALSE
    )
  }
  missing_map <- setdiff(c("hgnc_symbol", "entrezgene_id"), colnames(G_list))
  if (length(missing_map) > 0) {
    stop(
      "`G_list` is missing required column(s): ",
      paste(missing_map, collapse = ", "),
      call. = FALSE
    )
  }

  res_rankedlist <- res %>%
    mutate(
      # An adjusted p-value of exactly 0 would give an infinite -log10; clamp it
      p_val_adj = ifelse(p_val_adj == 0, 1e-300, p_val_adj),
      rank = -log10(p_val_adj) * sign(avg_log2FC),
      # rank2 only appears in the displayed table; the returned vector uses `rank`
      rank2 = (1 + avg_log2FC) * -log10(p_val_adj),
      # Join key: the gene symbol under the column name used by G_list
      hgnc_symbol = gene
    )

  # `display()` is a notebook helper; skip it when it is not defined so the
  # function also runs in a plain R session
  if (exists("display", mode = "function")) {
    display(res_rankedlist)
  }

  # Only the symbol -> Entrez pairs are needed; de-duplicating them first keeps
  # the join from fanning out over every row of G_list
  id_map <- G_list %>%
    dplyr::select(hgnc_symbol, entrezgene_id) %>%
    distinct()

  # Drop incomplete rows BEFORE de-duplicating on the Entrez ID, so an ID whose
  # first mapped row has a missing rank can still be represented by a later
  # row with a valid rank
  gene_list_entrez_id <- res_rankedlist %>%
    left_join(id_map, by = "hgnc_symbol") %>%
    dplyr::select(entrezgene_id, rank) %>%
    drop_na() %>%
    distinct(entrezgene_id, .keep_all = TRUE)

  geneList_entrez <- gene_list_entrez_id$rank
  names(geneList_entrez) <- gene_list_entrez_id$entrezgene_id

  geneList_entrez <- geneList_entrez[is.finite(geneList_entrez)]
  sort(geneList_entrez, decreasing = TRUE)
}

In [0]:
# Run the KEGG, Reactome and custom gene-set enrichment calls on one ranked gene list.
#
# Arguments:
#   gene_list           - ranked gene vector passed unchanged to every call
#   pval_cutoff         - value forwarded as `pvalueCutoff` to every call
#   term2gene_hallmark  - TERM2GENE table for the first custom GSEA run
#   term2gene_hallmark2 - optional TERM2GENE table for a second custom GSEA run
#
# Returns:
#   A list of result tables (the `result` slot of each returned object):
#   KEGG, Reactome, Hallmark when `term2gene_hallmark2` is NULL, otherwise
#   KEGG, Reactome, Hallmark1, Hallmark2.
get_enrichments <- function(gene_list, pval_cutoff, term2gene_hallmark, term2gene_hallmark2 = NULL) {
  # Seed the RNG before the enrichment calls
  set.seed(123)

  # GSEA against a caller-supplied TERM2GENE table, returning only the result table
  run_custom_gsea <- function(term2gene) {
    fit <- GSEA(gene_list, TERM2GENE = term2gene, pvalueCutoff = pval_cutoff,  eps=0)
    fit@result
  }

  # KEGG
  kegg_fit <- gseKEGG(gene_list, pvalueCutoff = pval_cutoff, organism = "hsa", verbose = FALSE, eps=0)
  kegg_tbl <- kegg_fit@result

  # Reactome
  reactome_fit <- gsePathway(gene_list, pAdjustMethod = "BH", pvalueCutoff = pval_cutoff, organism = "human", verbose = FALSE,  eps=0)
  reactome_tbl <- reactome_fit@result

  # First custom collection
  first_custom_tbl <- run_custom_gsea(term2gene_hallmark)

  # Without a second collection the third element is simply called "Hallmark"
  if (is.null(term2gene_hallmark2)) {
    return(list(KEGG = kegg_tbl, Reactome = reactome_tbl, Hallmark = first_custom_tbl))
  }

  second_custom_tbl <- run_custom_gsea(term2gene_hallmark2)
  list(
    KEGG = kegg_tbl,
    Reactome = reactome_tbl,
    Hallmark1 = first_custom_tbl,
    Hallmark2 = second_custom_tbl
  )
}

In [0]:
# Dot plot of the strongest enrichment results in both directions.
# Arguments:
#   results - result table; the columns NES, Description, setSize and p.adjust are used
#   N       - number of pathways kept per direction (N with NES > 0 and N with NES < 0)
#   title   - title of the plot
dotplot_pathway_generator <- function(results, N, title) {
  nes <- results$NES
  enriched_up <- results[which(nes > 0), ]
  enriched_down <- results[which(nes < 0), ]

  # Both halves are sorted by decreasing NES, so the most negative rows sit at the tail
  strongest_up <- head(arrange(enriched_up, -NES), n = N)
  strongest_down <- tail(arrange(enriched_down, -NES), n = N)
  top_results <- unique(rbind(strongest_up, strongest_down))

  point_mapping <- aes(
    x = NES,
    y = reorder(Description, NES),
    size = setSize,
    color = p.adjust
  )

  dot_plot <- ggplot(top_results, point_mapping) +
    geom_point(alpha = 0.7) +
    scale_color_gradient(low = "red", high = "darkblue") +
    labs(
      title = title,
      x = "NES",
      y = "Pathway",
      size = "SetSize",
      color = "P-value adjust"
    ) +
    theme_minimal() +
    theme(axis.text.y = element_text(size = 8))

  dot_plot
}

In [0]:
# Horizontal bar plot of the strongest enrichment results in both directions.
# Arguments:
#   res   - result table; the columns NES, Description and p.adjust are used
#   N     - number of pathways kept per direction (N with NES > 0 and N with NES < 0)
#   title - title of the plot
# Side effect: sets the session options repr.plot.width, repr.plot.height and echo;
# they are not restored afterwards.
barplot_gsea <- function(res, N, title) {
  nes <- res$NES
  up_sorted <- arrange(res[which(nes > 0), ], -NES)
  down_sorted <- arrange(res[which(nes < 0), ], -NES)

  # Sorted by decreasing NES, so the head holds the largest positive scores and
  # the tail the most negative ones
  GSEA_f <- unique(rbind(head(up_sorted, n = N), tail(down_sorted, n = N)))

  options(repr.plot.width = 1000, repr.plot.height = 1000, echo = FALSE)

  bar_mapping <- aes(x = NES, y = fct_reorder(Description, NES), fill = p.adjust)
  fill_scale <- scale_fill_continuous(
    low = 'red',
    high = 'blue',
    guide = guide_colorbar(reverse = TRUE)
  )

  ggplot(GSEA_f, bar_mapping) +
    geom_col(orientation = 'y') +
    fill_scale +
    theme_minimal() +
    labs(title = title, x = "NES", y = NULL)
}

In [0]:
# Table with the translations between gene ID types, restricted to the rows
# whose transcript biotype is protein-coding
G_list <- dplyr::filter(
  readRDS("/dbfs/mnt/sandbox/RNASeq/PBMCs_IL4/pathways/G_list20240710.rds"),
  transcript_biotype == "protein_coding"
)

In [0]:
# HALLMARKS and other MSigDB collections, as (gs_name, entrez_gene) tables

library(msigdbr)
msigdbr_species()

# Complete human table; within the visible part of the notebook it is only previewed here
m_df <- msigdbr(species = "Homo sapiens")
as.data.frame(head(m_df, 2))
msigdbr_collections()

# C2: gene sets curated from various sources such as online pathway databases
# and the biomedical literature
C2_t2g <- dplyr::select(
  msigdbr(species = "Homo sapiens", category = "C2"),
  gs_name, entrez_gene
)
head(C2_t2g)

# H: gene sets that summarize and represent specific well-defined biological
# states or processes, designed to reduce noise and redundancy
H_t2g <- dplyr::select(
  msigdbr(species = "Homo sapiens", category = "H"),
  gs_name, entrez_gene
)
head(H_t2g)


# CP (Canonical Pathways): gene sets from pathway databases representing
# canonical biological processes
CP_t2g <- dplyr::select(
  msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP"),
  gs_name, entrez_gene
)
head(CP_t2g)

# C7 (Immunologic Signatures): gene sets representing expression signatures of
# immune cell states, cell types, and perturbations
C7_t2g <- dplyr::select(
  msigdbr(species = "Homo sapiens", category = "C7"),
  gs_name, entrez_gene
)
head(C7_t2g)

##Tcell

In [0]:
# L vs HC result table for T cells
res_tcell <- read.xlsx(
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/pseudobulk/reynolds_tcell_LvsHC_bulk_v2.xlsx"
)

In [0]:
# Ranked, Entrez-named gene vector used by the enrichment calls below
geneList_entrez_Tcell <- prepare_gene_list(res_tcell, G_list)

###ENRICHMENT

In [0]:
# KEGG, Reactome, H and C2 enrichment with a cutoff of 0.05
res_tcell_0.05 <- get_enrichments(geneList_entrez_Tcell, 0.05, H_t2g, C2_t2g)

In [0]:
# Same enrichment with the cutoff relaxed to 0.1
# NOTE(review): the object is named `_1` but the cutoff passed is 0.1 — confirm which was intended
res_tcell_1 <- get_enrichments(geneList_entrez_Tcell, 0.1, H_t2g, C2_t2g)

In [0]:
set.seed(123)
# GO (Biological Process) enrichment
# NOTE(review): `gene` receives every Entrez ID of the ranked list and no
# `universe` is given — confirm the input table is already restricted to the
# genes of interest
tcell_GO <- enrichGO(
  gene = names(geneList_entrez_Tcell),
  OrgDb = org.Hs.eg.db,
  keyType = "ENTREZID",
  ont = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.2
)

# Result table stored in the object's `result` slot
res_tcell_GO <- tcell_GO@result

# Keep only the terms with p.adjust < 0.05
filtered_res_tcell_GO <- filter(res_tcell_GO, p.adjust < 0.05)
display(filtered_res_tcell_GO)

In [0]:
# Plot size for the notebook output; FALSE is spelled out because `F` can be reassigned
options(repr.plot.width = 1000, repr.plot.height = 1000, echo = FALSE)
barplot(tcell_GO, showCategory = 30, label_format = 50, font.size = 9) +
  ggtitle("Reynolds- Tcell - L vs HC - GO")

In [0]:
#Save results
# write.xlsx(res_tcell_0.05, "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/GSEA_individually/res_tcell_0.05_reynolds.xlsx")
write.xlsx(
  tcell_GO,
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/GSEA_individually/pseudobulk/res_tcell_reynolds_GO_v2.xlsx"
)

In [0]:
# res_tcell_1$Reactome[grepl("R-HSA-6785807", res_tcell_1$Reactome$ID), ]

##Macrophages

In [0]:
# L vs HC result table for macrophages
res_macro <- read.xlsx(
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/pseudobulk/reynolds_macro_LvsHC_bulk_v2.xlsx"
)

In [0]:
# Ranked, Entrez-named gene vector used by the enrichment calls below
geneList_entrez_macro <- prepare_gene_list(res_macro, G_list)

In [0]:
# KEGG, Reactome, H and C2 enrichment with a cutoff of 0.05
res_macro_0.05 <- get_enrichments(geneList_entrez_macro, 0.05, H_t2g, C2_t2g)

In [0]:
# res_macro_1 <- get_enrichments(geneList_entrez_macro, 1, H_t2g)

In [0]:
set.seed(123)

# GO (Biological Process) enrichment
# NOTE(review): `gene` receives every Entrez ID of the ranked list and no
# `universe` is given — confirm the input table is already restricted to the
# genes of interest
macro_GO <- enrichGO(
  gene = names(geneList_entrez_macro),
  OrgDb = org.Hs.eg.db,
  keyType = "ENTREZID",
  ont = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.2
)

# Result table stored in the object's `result` slot
res_macro_GO <- macro_GO@result

# Keep only the terms with p.adjust < 0.05
filtered_res_macro_GO <- filter(res_macro_GO, p.adjust < 0.05)

# Show the filtered table
display(filtered_res_macro_GO)

In [0]:
# Plot size for the notebook output; FALSE is spelled out because `F` can be reassigned
options(repr.plot.width = 1000, repr.plot.height = 1000, echo = FALSE)
barplot(macro_GO, showCategory = 30, label_format = 50, font.size = 9) +
  ggtitle("Reynolds- Macro - L vs HC - GO")

###Save results

In [0]:
#Save results
# write.xlsx(res_macro_0.05, "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/GSEA_individually/res_macro_0.05_reynolds.xlsx")
write.xlsx(
  macro_GO,
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/GSEA_individually/pseudobulk/res_macro_reynolds_GO_v2.xlsx"
)

##Treg

In [0]:
# L vs HC result table for Tregs
res_treg <- read.xlsx(
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/pseudobulk/reynolds_treg_LvsHC_bulk_v2.xlsx"
)

In [0]:
# Ranked, Entrez-named gene vector used by the enrichment calls below
geneList_entrez_treg <- prepare_gene_list(res_treg, G_list)

In [0]:
# KEGG, Reactome, H and C2 enrichment with a cutoff of 0.05
res_treg_0.05 <- get_enrichments(geneList_entrez_treg, 0.05, H_t2g, C2_t2g)

In [0]:
# res_treg_1 <- get_enrichments(geneList_entrez_treg, 1, H_t2g)

###GO

In [0]:
set.seed(123)

# GO (Biological Process) enrichment
# NOTE(review): `gene` receives every Entrez ID of the ranked list and no
# `universe` is given — confirm the input table is already restricted to the
# genes of interest
treg_GO <- enrichGO(
  gene = names(geneList_entrez_treg),
  OrgDb = org.Hs.eg.db,
  keyType = "ENTREZID",
  ont = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.2
)

# Result table stored in the object's `result` slot
res_treg_GO <- treg_GO@result

# Keep only the terms with p.adjust < 0.05
filtered_res_treg_GO <- filter(res_treg_GO, p.adjust < 0.05)

# Show the filtered table
display(filtered_res_treg_GO)

In [0]:
# Plot size for the notebook output; FALSE is spelled out because `F` can be reassigned
options(repr.plot.width = 1000, repr.plot.height = 1000, echo = FALSE)
barplot(treg_GO, showCategory = 30, label_format = 50, font.size = 9) +
  ggtitle("Reynolds- Treg - L vs HC - GO")

###Save results

In [0]:
# write.xlsx(res_treg_0.05, "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/GSEA_individually/res_treg_0.05_reynolds.xlsx")
write.xlsx(
  treg_GO,
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/GSEA_individually/pseudobulk/res_treg_reynolds_GO_v2.xlsx"
)

##Fibroblasts

In [0]:
# L vs HC result table for fibroblasts
res_fb <- read.xlsx(
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/pseudobulk/reynolds_fb_LvsHC_bulk_v2.xlsx"
)

In [0]:
# Inspect the loaded table
display(res_fb)

In [0]:
# Ranked, Entrez-named gene vector used by the enrichment calls below
geneList_entrez_fb <- prepare_gene_list(res_fb, G_list)

In [0]:
# KEGG, Reactome, H and C2 enrichment with a cutoff of 0.05
res_fb_0.05 <- get_enrichments(geneList_entrez_fb, 0.05, H_t2g, C2_t2g)

###GO

In [0]:
set.seed(123)

# GO (Biological Process) enrichment
# NOTE(review): `gene` receives every Entrez ID of the ranked list and no
# `universe` is given — confirm the input table is already restricted to the
# genes of interest
fb_GO <- enrichGO(
  gene = names(geneList_entrez_fb),
  OrgDb = org.Hs.eg.db,
  keyType = "ENTREZID",
  ont = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.2
)

# Result table stored in the object's `result` slot
res_fb_GO <- fb_GO@result

# Keep only the terms with p.adjust < 0.05
filtered_res_fb_GO <- filter(res_fb_GO, p.adjust < 0.05)

# Show the filtered table
display(filtered_res_fb_GO)

In [0]:
# Plot size for the notebook output; FALSE is spelled out because `F` can be reassigned
options(repr.plot.width = 1200, repr.plot.height = 1200, echo = FALSE)
barplot(fb_GO, showCategory = 25, label_format = 100, font.size = 9) +
  ggtitle("Reynolds- FB - L vs HC - GO")

###Save results

In [0]:
#Save results
# write.xlsx(res_fb_0.05, "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/GSEA_individually/res_fb_0.05_reynolds.xlsx")
write.xlsx(
  fb_GO,
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/GSEA_individually/pseudobulk/res_fb_reynolds_GO_v2.xlsx"
)

##Keratinocytes

In [0]:
# L vs HC result table for keratinocytes
res_kc <- read.xlsx(
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/pseudobulk/reynolds_kc_LvsHC_bulk_v2.xlsx"
)

In [0]:
# Ranked, Entrez-named gene vector used by the enrichment calls below
geneList_entrez_kc <- prepare_gene_list(res_kc, G_list)

In [0]:
# KEGG, Reactome, H and C2 enrichment with a cutoff of 0.05
res_kc_0.05 <- get_enrichments(geneList_entrez_kc, 0.05, H_t2g, C2_t2g)

In [0]:
# Result table of the second custom collection (C2_t2g was passed for it above)
res_kc_0.05$Hallmark2

In [0]:
# Top 20 positive and top 20 negative NES pathways, Reactome
barplot_gsea(res_kc_0.05$Reactome, 20, "Keratinocytes - Reynolds- Reactome")

In [0]:
# Top 20 positive and top 20 negative NES gene sets, C2 collection
barplot_gsea(res_kc_0.05$Hallmark2, 20, "Keratinocytes - Reynolds- Hallmarks C2")

In [0]:
# res_kc_1 <- get_enrichments(geneList_entrez_kc, 1, H_t2g, C2_t2g)

In [0]:
set.seed(123)

# GO (Biological Process) enrichment
# NOTE(review): `gene` receives every Entrez ID of the ranked list and no
# `universe` is given — confirm the input table is already restricted to the
# genes of interest
kc_GO <- enrichGO(
  gene = names(geneList_entrez_kc),
  OrgDb = org.Hs.eg.db,
  keyType = "ENTREZID",
  ont = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.2
)

# Result table stored in the object's `result` slot
res_kc_GO <- kc_GO@result

# Keep only the terms with p.adjust < 0.05
filtered_res_kc_GO <- filter(res_kc_GO, p.adjust < 0.05)

# Show the filtered table
display(filtered_res_kc_GO)

In [0]:
# Plot size for the notebook output; FALSE is spelled out because `F` can be reassigned
options(repr.plot.width = 1200, repr.plot.height = 1200, echo = FALSE)
barplot(kc_GO, showCategory = 25, label_format = 100, font.size = 9) +
  ggtitle("Reynolds KC - L vs HC - GO")

###Save results

In [0]:
#Save results
# NOTE(review): this GSEA workbook is written outside the `pseudobulk/` folder
# used by the GO workbook below — confirm the target folder
write.xlsx(
  res_kc_0.05,
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/GSEA_individually/res_kc_0.05_reynolds_v2.xlsx"
)
write.xlsx(
  kc_GO,
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/GSEA_individually/pseudobulk/res_kc_reynolds_GO_v2.xlsx"
)

##ILC

In [0]:
# L vs HC result table for ILCs
res_ilc <- read.xlsx(
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/pseudobulk/reynolds_ilc_LvsHC_bulk_v2.xlsx"
)

In [0]:
# Ranked, Entrez-named gene vector used by the enrichment calls
geneList_entrez_ilc <- prepare_gene_list(res_ilc, G_list)

In [0]:
# KEGG, Reactome, H and C2 enrichment with a cutoff of 0.05
res_ilc_0.05 <- get_enrichments(
  geneList_entrez_ilc, 0.05, H_t2g, C2_t2g
)

No results

In [0]:
set.seed(123)

# GO (Biological Process) enrichment on the Entrez IDs of the ranked ILC list
ilc_GO <- enrichGO(
  gene = names(geneList_entrez_ilc),
  OrgDb = org.Hs.eg.db,
  keyType = "ENTREZID",
  ont = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.2
)

# Result table stored in the object's `result` slot
res_ilc_GO <- ilc_GO@result

# Keep only the terms with p.adjust < 0.05
filtered_res_ilc_GO <- filter(res_ilc_GO, p.adjust < 0.05)

# Show the filtered table
display(filtered_res_ilc_GO)

In [0]:
# Write the GO enrichment object to an Excel workbook
write.xlsx(
  ilc_GO,
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/GSEA_individually/pseudobulk/res_ilc_reynolds_GO_v2.xlsx"
)

##MastC

In [0]:
# L vs HC result table for mast cells
res_mastc <- read.xlsx(
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/pseudobulk/reynolds_mast_LvsHC_bulk_v2.xlsx"
)

In [0]:
# Ranked, Entrez-named gene vector used by the enrichment calls
geneList_entrez_mast <- prepare_gene_list(res_mastc, G_list)

In [0]:
# KEGG, Reactome, H and C2 enrichment with a cutoff of 0.05
res_mast_0.05 <- get_enrichments(
  geneList_entrez_mast, 0.05, H_t2g, C2_t2g
)

No results

In [0]:
set.seed(123)

# GO (Biological Process) enrichment on the Entrez IDs of the ranked mast-cell list
mast_GO <- enrichGO(
  gene = names(geneList_entrez_mast),
  OrgDb = org.Hs.eg.db,
  keyType = "ENTREZID",
  ont = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.2
)

# Result table stored in the object's `result` slot
res_mast_GO <- mast_GO@result

# Keep only the terms with p.adjust < 0.05
filtered_res_mast_GO <- filter(res_mast_GO, p.adjust < 0.05)

# Show the filtered table
display(filtered_res_mast_GO)

In [0]:
# Write the GO enrichment object to an Excel workbook
write.xlsx(
  mast_GO,
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/GSEA_individually/pseudobulk/res_mast_reynolds_GO_v2.xlsx"
)

##Monocytes

In [0]:
# L vs HC result table for monocytes
res_mono <- read.xlsx(
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/pseudobulk/reynolds_mono_LvsHC_bulk_v2.xlsx"
)

In [0]:
# Ranked, Entrez-named gene vector used by the enrichment calls below
geneList_entrez_mono <- prepare_gene_list(res_mono, G_list)

In [0]:
# KEGG, Reactome, H and C2 enrichment with a cutoff of 0.05
res_mono_0.05 <- get_enrichments(
  geneList_entrez_mono, 0.05, H_t2g, C2_t2g
)

In [0]:
set.seed(123)

# GO (Biological Process) enrichment on the Entrez IDs of the ranked monocyte list
mono_GO <- enrichGO(
  gene = names(geneList_entrez_mono),
  OrgDb = org.Hs.eg.db,
  keyType = "ENTREZID",
  ont = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.2
)

# Result table stored in the object's `result` slot
res_mono_GO <- mono_GO@result

# Keep only the terms with p.adjust < 0.05
filtered_res_mono_GO <- filter(res_mono_GO, p.adjust < 0.05)

# Show the filtered table
display(filtered_res_mono_GO)

In [0]:
# Write the GO enrichment object to an Excel workbook
# write.xlsx(res_mono_0.05, "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/GSEA_individually/res_mono_0.05_reynolds.xlsx")
write.xlsx(
  mono_GO,
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/GSEA_individually/pseudobulk/res_mono_reynolds_GO_v2.xlsx"
)

##DC

In [0]:
# L vs HC result table for DCs
res_dc <- read.xlsx(
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/pseudobulk/reynolds_dc_LvsHC_bulk_v2.xlsx"
)

In [0]:
# Ranked, Entrez-named gene vector used by the enrichment calls below
geneList_entrez_dc <- prepare_gene_list(res_dc, G_list)

In [0]:
# KEGG, Reactome, H and C2 enrichment with a cutoff of 0.05
res_dc_0.05 <- get_enrichments(
  geneList_entrez_dc, 0.05, H_t2g, C2_t2g
)

In [0]:
# Plot size for the notebook output, then the 20 + 20 strongest C2 gene sets
options(repr.plot.width = 1400, repr.plot.height = 1000, echo = FALSE)
dotplot_pathway_generator(
  res_dc_0.05$Hallmark2,
  20,
  "Top 40 DC Reynolds - Hallmark c2"
)

In [0]:
set.seed(123)

# GO (Biological Process) enrichment on the Entrez IDs of the ranked DC list
dc_GO <- enrichGO(
  gene = names(geneList_entrez_dc),
  OrgDb = org.Hs.eg.db,
  keyType = "ENTREZID",
  ont = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.2
)

# Result table stored in the object's `result` slot
res_dc_GO <- dc_GO@result

# Keep only the terms with p.adjust < 0.05
filtered_res_dc_GO <- filter(res_dc_GO, p.adjust < 0.05)

# Show the filtered table
display(filtered_res_dc_GO)

In [0]:
# Write the GSEA result list and the GO enrichment object to Excel workbooks
write.xlsx(
  res_dc_0.05,
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/GSEA_individually/pseudobulk/res_dc_0.05_reynolds_v2.xlsx"
)
write.xlsx(
  dc_GO,
  "/dbfs/mnt/sandbox/TFM_PAULA/Reynolds/LvsHC/GSEA_individually/pseudobulk/res_dc_reynolds_GO_v2.xlsx"
)

##NK