#Alternative method for functional enrichment using GSVA and limma

This analysis is run on the new annotation.

In [0]:
#Load required libraries
.libPaths(c("/dbfs/home/jtrincado@almirall.com/my_r_packages/Seurat", .libPaths()))
library(openxlsx)

.libPaths(c("/dbfs/home/boriol@almirall.com/my_r_packages/bulkRNASeq_PBMCs_R4.3", .libPaths()))
library(msigdbr)
library(DESeq2)


.libPaths(c("/dbfs/home/jtrincado@almirall.com/my_r_packages/Seurat_v2", .libPaths()))
library(Seurat)

In [0]:
# Personal package library for this notebook: created if it does not exist
# yet and placed first on the library search path.
my_library <- "/dbfs/home/pdelgadom@almirall.com/my_r_packages/tfm_paula_4"
dir.create(my_library, showWarnings = FALSE, recursive = TRUE)
.libPaths(c(my_library, .libPaths()))

# 'remotes' is required by the installer helper defined below.
if (!requireNamespace("remotes")) {
  install.packages("remotes")
}
 
#' Install a package into a library via a temporary staging library.
#'
#' Despite its name, the active installer is `remotes::install_bioc()`
#' (Bioconductor); the CRAN and GitHub variants are kept as commented-out
#' alternatives. The package (and whatever else gets installed alongside it)
#' is first installed into a temporary directory, and every top-level entry
#' of that directory is then copied into `my_library`.
#' NOTE(review): staging is presumably needed because installing directly
#' onto the DBFS mount is unreliable - confirm.
#'
#' @param pkg_name Package specification passed to `remotes::install_bioc()`.
#' @param my_library Destination library. When `NULL`, the first entry of
#'   `.libPaths()` is used and a message with the destination is emitted.
#' @return `NULL`, invisibly. Called for its side effects.
install_from_github <- function(pkg_name, my_library=NULL) {
  if (is.null(my_library)) {
    my_library <- .libPaths()[1]
    message("Installing ", pkg_name, " to ", my_library)
  }

  temp_library <- tempfile()
  dir.create(temp_library)
  # Always remove the staging directory, even when the installation fails.
  on.exit(unlink(temp_library, recursive = TRUE), add = TRUE)

  #remotes::install_cran(pkg_name, lib = temp_library, upgrade=FALSE)
  remotes::install_bioc(pkg_name, lib = temp_library, upgrade = FALSE)
  #remotes::install_github(pkg_name, lib = temp_library, upgrade=FALSE)

  # Copy every installed package folder into the destination library.
  staged <- list.files(temp_library)
  copied <- file.copy(
    file.path(temp_library, staged),
    my_library,
    recursive = TRUE
  )
  if (!all(copied)) {
    warning(
      "Could not copy into ", my_library, ": ",
      paste(staged[!copied], collapse = ", "),
      call. = FALSE
    )
  }

  invisible(NULL)
}

In [0]:
# Install GSVA on demand; with no explicit target the helper uses the first
# library on the search path.
if (!requireNamespace("GSVA")) {
  install_from_github("GSVA")
}

In [0]:
library(GSVA)

##Prepare expression matrix

In [0]:
# Load the saved object from disk. It is accessed below with GetAssayData()
# and @meta.data (presumably a pseudo-bulk Seurat object, going by the file
# name - confirm).
pseudo_AR <- readRDS(file="/dbfs/mnt/sandbox/TFM_PAULA/AR_MERGED_celltypist_aggregated_expression_TFM.rds")

In [0]:
# Metadata table stored on the object
metadata_AR <- slot(pseudo_AR, "meta.data")

# "counts" layer of the object's assay data
counts_AR <- GetAssayData(object = pseudo_AR, layer = "counts")

In [0]:
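# Filter: within the "reynolds" dataset keep only HC and Lesional samples (this drops its non-lesional samples); all other datasets are kept unchanged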

In [0]:
# Reynolds rows restricted to the two conditions of interest
metadata_AR_reynolds <- metadata_AR[
  metadata_AR$dataset == "reynolds" &
    metadata_AR$Condition_AR %in% c("HC", "Lesional"),
]

# Rows of every other dataset first, followed by the filtered reynolds rows
metadata_AR <- rbind(
  metadata_AR[metadata_AR$dataset != "reynolds", ],
  metadata_AR_reynolds
)

# Drop the count columns whose sample is no longer in the metadata
counts_AR <- counts_AR[, is.element(colnames(counts_AR), rownames(metadata_AR))]

In [0]:
# Grouping variables are stored as factors (they feed the design matrix later)
factor_columns <- c("celltype.cond", "dataset")
metadata_AR[factor_columns] <- lapply(metadata_AR[factor_columns], as.factor)

In [0]:
# Cross-tabulate condition against dataset to check the sample counts left
# after filtering
table(metadata_AR$Condition_AR, metadata_AR$dataset)

In [0]:
# Sort the metadata rows by sample name ...
row_order <- order(rownames(metadata_AR))
metadata_AR <- metadata_AR[row_order, ]

# ... and select the count columns by those names, so that column i of
# counts_AR corresponds to row i of metadata_AR
counts_AR <- counts_AR[, rownames(metadata_AR)]

In [0]:
# List the distinct condition labels that remain
unique(metadata_AR$Condition_AR)

In [0]:
# Sample label of the form <Condition_AR>_<Sample_id>_<celltype_AR>; it is
# matched against the GSVA column names in the limma cell
metadata_AR$samples <- paste(metadata_AR$Condition_AR, metadata_AR$Sample_id, metadata_AR$celltype_AR, sep = "_")

In [0]:
# Print the count matrix for a quick visual check
counts_AR

In [0]:
# Build a DESeq2 dataset from the counts and the sample metadata. The design
# is intercept-only (~ 1): this cell only normalises and transforms the data.
dds <- DESeqDataSetFromMatrix(countData = counts_AR, colData = metadata_AR, design = ~ 1)
# Per-sample size factors
dds <- estimateSizeFactors(dds)
# Variance-stabilising transformation; blind = TRUE requests a transformation
# that does not use the experimental design
vsd <- vst(dds, blind = TRUE)
# Transformed expression values, passed to gsva() in the next section
expr_matrix <- assay(vsd)

##GSVA

In [0]:
# Reactome gene sets from MSigDB (category C2, subcategory CP:REACTOME) as a
# named list: one character vector of gene symbols per gene-set name.
# The list is built with a plain split() call on an intermediate table, so
# this cell does not depend on `%>%` being available (dplyr is only attached
# further down) and split() no longer receives the whole table as a stray
# positional argument.
reactome_table <- msigdbr(
  species = "Homo sapiens",
  category = "C2",
  subcategory = "CP:REACTOME"
)
gene_sets <- split(x = reactome_table$gene_symbol, f = reactome_table$gs_name)

In [0]:
# Keep only gene sets with 10 to 500 entries (both bounds inclusive)
set_sizes <- lengths(gene_sets)
gene_sets <- gene_sets[set_sizes >= 10 & set_sizes <= 500]

In [0]:
# Open the help page for gsva() (interactive reference only)
?gsva

In [0]:
# GSVA enrichment scores for the Reactome gene sets, computed on the
# transformed expression matrix. The limma cell below treats the result as
# pathways (rows) x samples (columns).
# NOTE(review): this is the gsva(expr, gene_sets, method = ...) calling style;
# newer GSVA releases expect a parameter object (e.g. gsvaParam()) instead -
# confirm against the installed GSVA version.
gsva_scores <- gsva(as.matrix(expr_matrix), gene_sets, method = "gsva")


In [0]:
# sample_names <- colnames(counts_AR)
# sample_metadata <- data.frame(
#   sample = sample_names,
#   condition = sapply(strsplit(sample_names, "_"), `[`, 1),
#   patient = sapply(strsplit(sample_names, "_"), `[`, 2),
#   cell_type = sapply(strsplit(sample_names, "_"), `[`, 3)
# )

In [0]:
library(limma)
library(dplyr)

# gsva_scores: rows = pathways, columns = samples.
# metadata_AR must contain the columns `samples`, `dataset` and
# `celltype.cond` (levels such as "Tc_HC" / "Tc_Lesional").

wanted_celltypes <- c("Tc", "Th", "Undifferentiated_KC", "Differentiated_KC")

# Sanity check: use "_" as the only separator on both sides, then require
# every GSVA column to have a metadata row
colnames(gsva_scores) <- gsub("-", "_", colnames(gsva_scores))
metadata_AR$samples <- gsub("-", "_", metadata_AR$samples)
stopifnot(all(colnames(gsva_scores) %in% metadata_AR$samples))

# Reorder metadata to match GSVA matrix columns
metadata_AR <- metadata_AR[match(colnames(gsva_scores), metadata_AR$samples), ]

# Subset GSVA matrix to relevant samples and force a plain numeric matrix
expr_ct <- gsva_scores[, metadata_AR$samples, drop = FALSE]
expr_ct <- as.matrix(expr_ct)
mode(expr_ct) <- "numeric"

# Design matrix: one coefficient per celltype.cond group plus a dataset
# (batch) adjustment.
# In a no-intercept formula only the FIRST factor keeps all of its levels;
# later factors lose their first level to the reference coding. celltype.cond
# therefore has to come first: otherwise its first level has no column named
# "celltype.cond<level>" and the Lesional-vs-HC contrast for that group is
# skipped as "missing group". The fitted model is the same either way; only
# the parametrisation changes.
design_ct <- model.matrix(~ 0 + celltype.cond + dataset, data = metadata_AR)

# Fit linear model
fit_ct <- lmFit(expr_ct, design_ct)
fit_ct <- eBayes(fit_ct)

In [0]:

# Lesional vs HC contrast for each wanted cell type.
# results_list keeps one topTable() result per cell type (named by cell type);
# all_pathway_results stacks them into one table.
results_list <- list()

for (ct in wanted_celltypes) {
  group_HC <- paste0("celltype.cond", ct, "_HC")
  group_LES <- paste0("celltype.cond", ct, "_Lesional")

  # A contrast needs both coefficients. A coefficient can be absent because
  # the group has no samples or because the level has no column of its own
  # in design_ct.
  if (!all(c(group_HC, group_LES) %in% colnames(design_ct))) {
    cat("Skipping", ct, "- missing group\n")
    next
  }

  # Contrast vector over all design columns: +1 Lesional, -1 HC, 0 elsewhere
  contrast_vec <- setNames(numeric(ncol(design_ct)), colnames(design_ct))
  contrast_vec[group_LES] <- 1
  contrast_vec[group_HC] <- -1

  # Re-express the fit in terms of the contrast and moderate the statistics
  fit2 <- contrasts.fit(fit_ct, contrast_vec)
  fit2 <- eBayes(fit2)

  # Full result table (all pathways)
  res <- topTable(fit2, coef = 1, number = Inf)
  res$celltype <- ct
  # Keep the pathway identifier as a real column: the row names cannot be
  # carried into the combined table, where each pathway occurs once per cell
  # type and row names must be unique.
  res$pathway_name <- rownames(res)
  results_list[[ct]] <- res
  cat("Finished:", ct, "\n")
}

# Combine results
all_pathway_results <- bind_rows(results_list)
head(all_pathway_results)

In [0]:
# Take the Tc table from the per-cell-type list: its row names are still the
# pathway identifiers returned by topTable(). The row names of the combined
# all_pathway_results are not usable, because every pathway occurs there once
# per cell type and row names must be unique.
TC_GSVA_res <- results_list[["Tc"]]
if (is.null(TC_GSVA_res)) {
  # Tc was skipped by the contrast loop: keep the (empty) filtered table
  TC_GSVA_res <- all_pathway_results %>% filter(celltype == "Tc")
}
TC_GSVA_res$pathway_name <- rownames(TC_GSVA_res)

# Significant Tc pathways: |logFC| > 0.2 and adjusted p-value < 0.05
TC_GSVA_res.sig <- TC_GSVA_res %>%
  filter(abs(logFC) > 0.2 & adj.P.Val < 0.05)
display(TC_GSVA_res.sig)

In [0]:
# Show the full (unfiltered) Tc result table
display(TC_GSVA_res)

In [0]:
# Take the Th table from the per-cell-type list: its row names are still the
# pathway identifiers returned by topTable(). The row names of the combined
# all_pathway_results are not usable, because every pathway occurs there once
# per cell type and row names must be unique.
Th_GSVA_res <- results_list[["Th"]]
if (is.null(Th_GSVA_res)) {
  # Th was skipped by the contrast loop: keep the (empty) filtered table
  Th_GSVA_res <- all_pathway_results %>% filter(celltype == "Th")
}
# Same pathway_name column as the other cell-type tables
Th_GSVA_res$pathway_name <- rownames(Th_GSVA_res)

# Significant Th pathways: adjusted p-value < 0.05
Th_GSVA_res.sig <- Th_GSVA_res %>%
  filter(adj.P.Val < 0.05)
display(Th_GSVA_res.sig)

In [0]:
# Take the Differentiated_KC table from the per-cell-type list: its row names
# are still the pathway identifiers returned by topTable(). The row names of
# the combined all_pathway_results are not usable, because every pathway
# occurs there once per cell type and row names must be unique.
Differentiated_KC_GSVA_res <- results_list[["Differentiated_KC"]]
if (is.null(Differentiated_KC_GSVA_res)) {
  # Cell type was skipped by the contrast loop: keep the (empty) filtered table
  Differentiated_KC_GSVA_res <- all_pathway_results %>%
    filter(celltype == "Differentiated_KC")
}
Differentiated_KC_GSVA_res$pathway_name <- rownames(Differentiated_KC_GSVA_res)

# Significant pathways: adjusted p-value < 0.05
Differentiated_KC_GSVA_res.sig <- Differentiated_KC_GSVA_res %>%
  filter(adj.P.Val < 0.05)
display(Differentiated_KC_GSVA_res.sig)

In [0]:
# Show the full (unfiltered) Differentiated_KC result table
display(Differentiated_KC_GSVA_res)

In [0]:
# Take the Undifferentiated_KC table from the per-cell-type list: its row
# names are still the pathway identifiers returned by topTable(). The row
# names of the combined all_pathway_results are not usable, because every
# pathway occurs there once per cell type and row names must be unique.
Undifferentiated_KC_GSVA_res <- results_list[["Undifferentiated_KC"]]
if (is.null(Undifferentiated_KC_GSVA_res)) {
  # Cell type was skipped by the contrast loop: keep the (empty) filtered table
  Undifferentiated_KC_GSVA_res <- all_pathway_results %>%
    filter(celltype == "Undifferentiated_KC")
}
Undifferentiated_KC_GSVA_res$pathway_name <- rownames(Undifferentiated_KC_GSVA_res)

# Significant pathways: adjusted p-value < 0.05
Undifferentiated_KC_GSVA_res.sig <- Undifferentiated_KC_GSVA_res %>%
  filter(adj.P.Val < 0.05)
display(Undifferentiated_KC_GSVA_res.sig)

In [0]:
# Show the full (unfiltered) Undifferentiated_KC result table
display(Undifferentiated_KC_GSVA_res)

##Save

In [0]:
%sh
# -p: create missing parent folders and do not fail if the folder already
# exists (e.g. when the notebook is re-run)
mkdir -p /dbfs/mnt/sandbox/TFM_PAULA/merged_AR_celltypist_results/GSVA

In [0]:
# Delete the workbooks left by a previous run before they are written again.
# The three targets share one folder, so the paths are built once and handled
# in a single vectorised step.
gsva_out_dir <- "/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_celltypist_results/GSVA"
previous_exports <- file.path(
  gsva_out_dir,
  c(
    "Differentiated_KC_GSVA_res.sig.xlsx",
    "Undifferentiated_KC_GSVA_res.sig.xlsx",
    "Tc_GSVA_res.sig.xlsx"
  )
)
previous_exports <- previous_exports[file.exists(previous_exports)]
if (length(previous_exports) > 0) {
  file.remove(previous_exports)
}

In [0]:
# Write the significant-pathway tables, one workbook per cell type
# (destination path -> table)
gsva_exports <- list(
  "/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_celltypist_results/GSVA/Differentiated_KC_GSVA_res.sig.xlsx" =
    Differentiated_KC_GSVA_res.sig,
  "/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_celltypist_results/GSVA/Undifferentiated_KC_GSVA_res.sig.xlsx" =
    Undifferentiated_KC_GSVA_res.sig,
  "/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_celltypist_results/GSVA/Tc_GSVA_res.sig.xlsx" =
    TC_GSVA_res.sig
)
for (out_path in names(gsva_exports)) {
  write.xlsx(gsva_exports[[out_path]], out_path)
}

#Plot

In [0]:
# Set the repr plot width/height options for the figures that follow, and
# set the `echo` option to FALSE
options(repr.plot.width = 1600, repr.plot.height = 1000, echo = FALSE)

In [0]:
library(dplyr)
library(tibble)

# Tc results prepared for plotting: row names moved into `gs_name`, plus the
# absolute logFC and a label for the sign of the change
plot_df_tc <- rownames_to_column(TC_GSVA_res, var = "gs_name")
plot_df_tc <- mutate(
  plot_df_tc,
  logFC_size = abs(logFC),
  direction = ifelse(logFC > 0, "Up in Lesional", "Down in Lesional")
)

In [0]:
 
# ggplot2 has to be attached in this cell: it is the first one that calls
# ggplot(), and the only other library(ggplot2) calls are in the later KC
# plotting cells.
library(ggplot2)

# Significant pathways only (adj.P.Val < 0.05), keeping the 15 largest |logFC|
# per direction, i.e. up to 30 bars (ties can add rows). The grouping is
# removed again so it does not leak into later steps.
plot_top_tc <- plot_df_tc %>%
  filter(adj.P.Val < 0.05) %>%
  group_by(direction) %>%
  slice_max(order_by = logFC_size, n = 15) %>%
  ungroup()

# Horizontal bars ordered by logFC, coloured by adjusted p-value
ggplot(plot_top_tc, aes(x = logFC, y = reorder(gs_name, logFC))) +
  geom_col(aes(fill = adj.P.Val)) +
  scale_fill_gradient(low = "blue", high = "red") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(hjust = 0.9)) +
  labs(
    x = "logFC",
    y = "Pathway",
    fill = "adj.P.Val",
    title = "Top 30 Pathway enrichment in Tcells by GSVA + limma"
  )

In [0]:
library(dplyr)
library(tibble)

# Undifferentiated_KC results prepared for plotting: row names moved into
# `gs_name`, plus the absolute logFC and a label for the sign of the change
plot_df_kc <- rownames_to_column(Undifferentiated_KC_GSVA_res, var = "gs_name")
plot_df_kc <- mutate(
  plot_df_kc,
  logFC_size = abs(logFC),
  direction = ifelse(logFC > 0, "Up in Lesional", "Down in Lesional")
)

In [0]:
# Set the repr plot width/height options for the figure that follows (wider
# than the other plots), and set the `echo` option to FALSE
options(repr.plot.width = 1900, repr.plot.height = 1000, echo = FALSE)

In [0]:
library(ggplot2)

# Significant pathways only (adj.P.Val < 0.05), keeping the 15 largest |logFC|
# per direction (ties can add rows)
kc_significant <- filter(plot_df_kc, adj.P.Val < 0.05)
plot_top_kc <- slice_max(
  group_by(kc_significant, direction),
  order_by = logFC_size,
  n = 15
)

# Horizontal bars ordered by logFC, coloured by adjusted p-value
ggplot(
  plot_top_kc,
  aes(x = logFC, y = reorder(gs_name, logFC), fill = adj.P.Val)
) +
  geom_col() +
  scale_fill_gradient(low = "blue", high = "red") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(hjust = 0.6)) +
  labs(
    title = "Top 30 Pathway enrichment in Undiff.KC by GSVA + limma",
    x = "logFC",
    y = "Pathway",
    fill = "adj.P.Val"
  )

In [0]:
library(dplyr)
library(tibble)

# Differentiated_KC results prepared for plotting: row names moved into
# `gs_name`, plus the absolute logFC and a label for the sign of the change.
# This reuses (overwrites) the plot_df_kc name.
plot_df_kc <- rownames_to_column(Differentiated_KC_GSVA_res, var = "gs_name")
plot_df_kc <- mutate(
  plot_df_kc,
  logFC_size = abs(logFC),
  direction = ifelse(logFC > 0, "Up in Lesional", "Down in Lesional")
)

In [0]:
# Set the repr plot width/height options for the figure that follows, and
# set the `echo` option to FALSE
options(repr.plot.width = 1600, repr.plot.height = 1000, echo = FALSE)

In [0]:
library(ggplot2)

# Significant pathways only (adj.P.Val < 0.05), keeping the 15 largest |logFC|
# per direction (ties can add rows)
kc_significant <- filter(plot_df_kc, adj.P.Val < 0.05)
plot_top_kc <- slice_max(
  group_by(kc_significant, direction),
  order_by = logFC_size,
  n = 15
)

# Horizontal bars ordered by logFC, coloured by adjusted p-value
ggplot(
  plot_top_kc,
  aes(x = logFC, y = reorder(gs_name, logFC), fill = adj.P.Val)
) +
  geom_col() +
  scale_fill_gradient(low = "blue", high = "red") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(hjust = 0.6)) +
  labs(
    title = "Top 30 Pathway enrichment in Diff.KC by GSVA + limma",
    x = "logFC",
    y = "Pathway",
    fill = "adj.P.Val"
  )