#Find DEGs with pseudobulk in the merge object Alkon + Reynolds
Done with and without adjusting dataset as a covariable

###Most relevant cell types: 
T-cells (TC), Fibroblasts, Keratinocytes (KC), Macrophages, Treg 

###Constrast: 
  - Lesional vs Healthy control (LvsHC)

In [0]:
#Load required libraries
.libPaths(c("/dbfs/home/jtrincado@almirall.com/my_r_packages/Seurat", .libPaths()))
library(openxlsx)

.libPaths(c("/dbfs/home/boriol@almirall.com/my_r_packages/bulkRNASeq_PBMCs_R4.3", .libPaths()))
library(VennDiagram)
library(EnhancedVolcano)

.libPaths(c("/dbfs/home/jtrincado@almirall.com/my_r_packages/Seurat_v2", .libPaths()))
library(Seurat)

In [0]:
volcano_generator <- function(resultsDE, given_title) {
  library(dplyr)

  resultsDE <- as.data.frame(resultsDE)
  
  # Create annotations for volcano plot
  resultsDE0 <- resultsDE
  resultsDE0$gene_id <- rownames(resultsDE0)

  # Ensure unique row names and remove rows with missing gene id
  resultsDE0 <- resultsDE0 %>%
    distinct(gene_id, .keep_all = TRUE)
  rownames(resultsDE0) <- resultsDE0$gene_id
  
  # Determine column names for p-value and log2 fold change
  p_val_col <- if ("p_val_adj" %in% colnames(resultsDE0)) "p_val_adj" else "padj"
  log2fc_col <- if ("avg_log2FC" %in% colnames(resultsDE0)) "avg_log2FC" else "log2FoldChange"
  
  top10_genes <- resultsDE0 %>%
    filter(!!sym(log2fc_col) > 1 & !!sym(p_val_col) < 0.05) %>%
    arrange(!!sym(p_val_col)) %>% top_n(10, -!!sym(p_val_col))
  
  bottom10_genes <- resultsDE0 %>%
    filter(!!sym(log2fc_col) < -1 & !!sym(p_val_col) < 0.05) %>%
    arrange(!!sym(p_val_col)) %>% top_n(10, -!!sym(p_val_col))
  
  # Plot Volcano
  volcano <- EnhancedVolcano(resultsDE0,
    lab = rownames(resultsDE0),
    x = log2fc_col,
    y = p_val_col,
    pCutoff = 0.05,
    selectLab = c(top10_genes$gene_id, bottom10_genes$gene_id),
    labSize = 5,
    drawConnectors = TRUE,
    widthConnectors = 0.5,
    colConnectors = 'black',
    title = given_title)
  volcano
}

##Process data

In [0]:
AR <- readRDS(file="/dbfs/mnt/sandbox/TFM_PAULA/MERGED_ARdatasets_TFM.rds")

In [0]:
unique(AR$dataset)

In [0]:
unique(AR$Condition_AR)

In [0]:
table(AR$celltype_AR, AR$dataset)

##Filtering variables that have at least 3 counts

In [0]:
counts_matrix <- AR[["RNA"]]$counts
dim(counts_matrix) #38248 391832

In [0]:
# Keep only rows that have a count of at least 3 counts in 3 samples
smallestGroupSize <- 3
keep <- rowSums(counts_matrix >= 3) >= smallestGroupSize
counts_keep <- counts_matrix[keep,]

# Subset the Seurat object to keep only the features in counts_keep
AR_f <- subset(AR, features = rownames(counts_keep))

# Assign the filtered counts to the new Seurat object
AR_f[["RNA"]]$counts <- counts_keep

# Check dimensions
dim(AR_f[["RNA"]]$counts) # 16417 391832

##Pseudobulk the counts based on the donor id

In [0]:
unique(AR_f$celltype_AR)

In [0]:
table(AR_f$Condition_AR, AR_f$Sample_id)

In [0]:
unique(AR_f$dataset)

In [0]:
# pseudobulk the counts based on donor-condition-celltype
pseudo_AR <- AggregateExpression(AR_f, assays = "RNA", return.seurat = T, group.by = c("Condition_AR", "Sample_id", "celltype_AR"))

# each 'cell' is a donor-condition-celltype pseudobulk profile
head(Cells(pseudo_AR))

In [0]:
pseudo_AR$celltype.cond <- paste(pseudo_AR$celltype_AR, pseudo_AR$Condition_AR, sep = "_")

In [0]:
Idents(pseudo_AR) <- "celltype.cond"

In [0]:
dataset_info <- AR_f@meta.data$dataset

# Create a mapping of Sample_id to dataset
sample_to_dataset <- setNames(dataset_info, AR_f@meta.data$Sample_id)

# Add the dataset information to the pseudo_AR metadata
pseudo_AR@meta.data$dataset <- sample_to_dataset[pseudo_AR@meta.data$Sample_id]

In [0]:
unique(pseudo_AR@meta.data$dataset)

In [0]:
saveRDS(pseudo_AR, file="/dbfs/mnt/sandbox/TFM_PAULA/AR_MERGED_aggregated_expression_TFM.rds")

In [0]:
# Extract count data
counts_AR <- GetAssayData(pseudo_AR, layer = "counts")

# Extract metadata
metadata_AR <- pseudo_AR@meta.data

In [0]:
# Filter to remove non lesional samples 

In [0]:
metadata_AR_reynolds <- metadata_AR[metadata_AR$Condition_AR %in% c("HC", "Lesional") & metadata_AR$dataset == "reynolds", ]
metadata_AR <- rbind(metadata_AR[metadata_AR$dataset != "reynolds", ], metadata_AR_reynolds)

# Filter counts too
counts_AR <- counts_AR[, colnames(counts_AR) %in% rownames(metadata_AR)]

In [0]:
metadata_AR$celltype.cond <- as.factor(metadata_AR$celltype.cond)
metadata_AR$dataset <- as.factor(metadata_AR$dataset)

In [0]:
table(metadata_AR$Condition_AR, metadata_AR$dataset)

In [0]:
# Ensure the same order for rows in metadata_AR and columns in counts_AR
metadata_AR <- metadata_AR[order(rownames(metadata_AR)), ]
counts_AR <- counts_AR[, order(colnames(counts_AR))]

# Reorder counts_AR columns to match the order of metadata_AR rows
counts_AR <- counts_AR[, rownames(metadata_AR)]

###DESEQ design to adjust the covariable dataset

In [0]:
.libPaths(c("/dbfs/home/jtrincado@almirall.com/my_r_packages/Seurat", .libPaths()))
library(DESeq2)
library(tidyverse)

In [0]:
dds <- DESeqDataSetFromMatrix(countData = round(counts_AR), #already in matrix format,
                              colData = metadata_AR,
                              design = ~ dataset + celltype.cond)

In [0]:
colData(dds)

In [0]:
#Make sure we have all data there
display(data.frame(colData(dds)) %>% group_by(Sample_id, celltype.cond) %>% summarise(n=n()))

In [0]:
#Keep only rows that have a count of at least 10 counts in 5 samples

keep <- rowSums(counts(dds) > 10) >= 5
dds <- dds[keep,]

counts_keep_dds <- counts(dds) 
dim(counts_keep_dds) #15972   316

In [0]:
#Run differential expression analysis
dds2 <- DESeq(dds)
res <- results(dds2)
res

In [0]:
# saveRDS(dds2, file = "/dbfs/mnt/sandbox/TFM_PAULA/dds2_object_pseudobulk_covariable_dataset.rds")

##PCA

In [0]:
pseudo_AR <- readRDS(file="/dbfs/mnt/sandbox/TFM_PAULA/AR_MERGED_aggregated_expression_TFM.rds")

In [0]:
# Step 2: Normalize the data
pseudo_AR <- NormalizeData(pseudo_AR)

# Step 3: Find variable features
pseudo_AR <- FindVariableFeatures(pseudo_AR)

# Step 4: Scale the data
pseudo_AR <- ScaleData(pseudo_AR)

# Step 5: Run PCA
pseudo_AR <- RunPCA(pseudo_AR, features = VariableFeatures(object = pseudo_AR))

In [0]:
table(pseudo_AR$celltype_AR, pseudo_AR$dataset)

In [0]:
# Filter pseudo_AR according to celltype
filtered_pseudo_AR <- subset(pseudo_AR, subset = celltype_AR %in% c("KC", "TC") & dataset %in% c("reynolds", "alkon"))

In [0]:
# Visualize PCA results with sample ID labels and color by celltype
plot1 <- DimPlot(pseudo_AR, reduction = "pca", group.by = "celltype_AR", label = TRUE, repel = TRUE)

# Visualize PCA results with sample ID labels and color by sample id
plot2 <- DimPlot(pseudo_AR, reduction = "pca", group.by = "Condition_AR", label = TRUE, repel = TRUE)


In [0]:
plot4 <- DimPlot(filtered_pseudo_AR, reduction = "pca", group.by = "celltype_AR", label = TRUE, repel = TRUE)
plot5 <- DimPlot(filtered_pseudo_AR, reduction = "pca", group.by = "dataset", label = TRUE, repel = TRUE)
plot6 <- DimPlot(filtered_pseudo_AR, reduction = "pca", group.by = "Condition_AR", label = TRUE, repel = TRUE)


In [0]:
options(repr.plot.width=1500, repr.plot.height=1200)

In [0]:
plot6

In [0]:
plot4

In [0]:
plot5

In [0]:
plot1

In [0]:
plot2

In [0]:
plot3 <- DimPlot(pseudo_AR, reduction = "pca", group.by = "dataset", label = TRUE, repel = TRUE)

In [0]:
plot3

##Tcells

In [0]:
Idents(pseudo_AR)

In [0]:
bulk.tcell.de <- FindMarkers(object = pseudo_AR, 
                         ident.1 = "TC_Lesional", 
                         ident.2 = "TC_HC",
                         min.pct = 0.01,
                         test.use = "DESeq2")
head(bulk.tcell.de, n = 15)

In [0]:
volcano_generator(bulk.tcell.de, "Pseudobulk - Tcells  Merge")

###Covariable dataset adjusted

In [0]:
bulk.tcell.de.cov <- results(dds2, contrast=c("celltype.cond","TC_Lesional", "TC_HC"))
bulk.tcell.de.cov

In [0]:
volcano_generator(bulk.tcell.de.cov, "Pseudobulk - Tcells - AR + dataset adjusted")

##Fibroblasts

In [0]:
bulk.fb.de <- FindMarkers(object = pseudo_AR, 
                         ident.1 = "Fibroblasts_Lesional", 
                         ident.2 = "Fibroblasts_HC",
                         test.use = "DESeq2")
head(bulk.fb.de, n = 15)

In [0]:
volcano_generator(bulk.fb.de, "Pseudobulk - Fibroblasts - AR merged")

###Covariable adjusted

In [0]:
bulk.fb.de.cov <- results(dds2, contrast=c("celltype.cond","Fibroblasts_Lesional", "Fibroblasts_HC"))
bulk.fb.de.cov

In [0]:
volcano_generator(bulk.fb.de.cov, "Pseudobulk - Fibroblasts - AR + dataset adjusted")

##Keratinocytes

In [0]:
bulk.kc.de <- FindMarkers(object = pseudo_AR, 
                         ident.1 = "KC_Lesional", 
                         ident.2 = "KC_HC",
                         test.use = "DESeq2")
head(bulk.kc.de, n = 15)

In [0]:
library(ggplot2)
bulk.kc.de <- as.data.frame(bulk.kc.de)
ggplot(bulk.kc.de, aes(x = p_val)) +
  geom_histogram(binwidth = 0.01, fill = "blue", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of p-value", x = "p-value", y = "Frequency")

In [0]:
options(repr.plot.width=1000, repr.plot.height=1000)

In [0]:
volcano_generator(bulk.kc.de, "Pseudobulk - KC merged")

###Covariable adjusted

In [0]:
bulk.kc.de.cov <- results(dds2, contrast=c("celltype.cond","KC_Lesional", "KC_HC"))
bulk.kc.de.cov

In [0]:
# bulk.kc.de.cov <- read.xlsx("/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_results/adjusting_cov_dataset_DEGs/bulk.kc.de.cov.xlsx")

In [0]:
library(ggplot2)
bulk.kc.de.cov <- as.data.frame(bulk.kc.de.cov)
ggplot(bulk.kc.de.cov, aes(x = pvalue)) +
  geom_histogram(binwidth = 0.01, fill = "blue", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of p-value", x = "p-value", y = "Frequency")

In [0]:
volcano_generator(bulk.kc.de.cov, "Pseudobulk - KC Merge Cov. Adjusted")

##Treg

In [0]:
bulk.treg.de <- FindMarkers(object = pseudo_AR, 
                            ident.1 = "Treg_Lesional", 
                            ident.2 = "Treg_HC",
                            test.use = "DESeq2")
head(bulk.treg.de, n = 15)

In [0]:
volcano_generator(bulk.treg.de, "Pseudobulk - Treg - AR merged")

###Covariable adjusted

In [0]:
bulk.treg.de.cov <- results(dds2, contrast=c("celltype.cond","Treg_Lesional", "Treg_HC"))
bulk.treg.de.cov

In [0]:
volcano_generator(bulk.treg.de.cov, "Pseudobulk - Treg - AR + dataset adjusted")

##Macrophages

In [0]:
bulk.macro.de <- FindMarkers(object = pseudo_AR, 
                            ident.1 = "Macro_Lesional", 
                            ident.2 = "Macro_HC",
                            test.use = "DESeq2")
head(bulk.macro.de, n = 15)

In [0]:
volcano_generator(bulk.macro.de, "Pseudobulk - Macrophages - AR merged")

###Covariable adjusted

In [0]:
bulk.macro.de.cov <- results(dds2, contrast=c("celltype.cond","Macro_Lesional", "Macro_HC"))
bulk.macro.de.cov

In [0]:
volcano_generator(bulk.macro.de.cov, "Pseudobulk - Macrophages - AR + dataset adjusted")

#Number of DEGs per comparison

In [0]:
res_list <- list(
  bulk.macro.de.cov = bulk.macro.de.cov,
  bulk.macro.de = bulk.macro.de,
  bulk.kc.de.cov = bulk.kc.de.cov,
  bulk.kc.de = bulk.kc.de,
  bulk.fb.de.cov = bulk.fb.de.cov,
  bulk.fb.de = bulk.fb.de,
  bulk.tcell.de.cov = bulk.tcell.de.cov,
  bulk.tcell.de = bulk.tcell.de,
  bulk.treg.de.cov = bulk.treg.de.cov,
  bulk.treg.de = bulk.treg.de
)

for (res_name in names(res_list)) {
  res <- res_list[[res_name]]
  pos_DEGs <- 0
  neg_DEGs <- 0
  p_val_col <- if ("p_val_adj" %in% colnames(res)) "p_val_adj" else "padj"
  log2fc_col <- if ("avg_log2FC" %in% colnames(res)) "avg_log2FC" else "log2FoldChange"
  for (i in 1:nrow(res)) {
    if (!is.na(res[[p_val_col]][i]) && res[[p_val_col]][i] < 0.05) {
      if (res[[log2fc_col]][i] > 1) {
        pos_DEGs <- pos_DEGs + 1
      } else if (res[[log2fc_col]][i] < -1) {
        neg_DEGs <- neg_DEGs + 1
      }
    }
  }
  print(paste("Number of up DEGs in", res_name, ":", pos_DEGs, "and down DEGs:", neg_DEGs))
}

In [0]:
# [1] "Number of up DEGs in bulk.macro.de.cov : 97 and down DEGs: 807"
# [1] "Number of up DEGs in bulk.macro.de : 23 and down DEGs: 3"
# [1] "Number of up DEGs in bulk.kc.de.cov : 1082 and down DEGs: 681"
# [1] "Number of up DEGs in bulk.kc.de : 99 and down DEGs: 36"
# [1] "Number of up DEGs in bulk.fb.de.cov : 283 and down DEGs: 587"
# [1] "Number of up DEGs in bulk.fb.de : 67 and down DEGs: 10"
# [1] "Number of up DEGs in bulk.tcell.de.cov : 215 and down DEGs: 309"
# [1] "Number of up DEGs in bulk.tcell.de : 26 and down DEGs: 9"
# [1] "Number of up DEGs in bulk.treg.de.cov : 79 and down DEGs: 1085"
# [1] "Number of up DEGs in bulk.treg.de : 8 and down DEGs: 7"

#Save

In [0]:
%sh
mkdir /dbfs/mnt/sandbox/TFM_PAULA/merged_AR_results/DEGs
mkdir /dbfs/mnt/sandbox/TFM_PAULA/merged_AR_results/adjusting_cov_dataset_DEGs

In [0]:
bulk.fb.de$gene <- rownames(bulk.fb.de)
bulk.treg.de$gene <- rownames(bulk.treg.de)
bulk.kc.de$gene <- rownames(bulk.kc.de)
bulk.tcell.de$gene <- rownames(bulk.tcell.de)
bulk.macro.de$gene <- rownames(bulk.macro.de)

# write.xlsx(bulk.fb.de, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_results/DEGs/bulk.fb.de.xlsx")
# write.xlsx(bulk.treg.de, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_results/DEGs/bulk.treg.de.xlsx")
# write.xlsx(bulk.kc.de, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_results/DEGs/bulk.kc.de.xlsx")
# write.xlsx(bulk.tcell.de, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_results/DEGs/bulk.tcell.de.xlsx")
# write.xlsx(bulk.macro.de, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_results/DEGs/bulk.macro.de.xlsx")

In [0]:
bulk.fb.de.cov$gene <- rownames(bulk.fb.de.cov)
bulk.treg.de.cov$gene <- rownames(bulk.treg.de.cov)
bulk.kc.de.cov$gene <- rownames(bulk.kc.de.cov)
bulk.tcell.de.cov$gene <- rownames(bulk.tcell.de.cov)
bulk.macro.de.cov$gene <- rownames(bulk.macro.de.cov)

# write.xlsx(bulk.fb.de.cov, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_results/adjusting_cov_dataset_DEGs/bulk.fb.de.cov.xlsx")
# write.xlsx(bulk.treg.de.cov, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_results/adjusting_cov_dataset_DEGs/bulk.treg.de.cov.xlsx")
# write.xlsx(bulk.kc.de.cov, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_results/adjusting_cov_dataset_DEGs/bulk.kc.de.cov.xlsx")
# write.xlsx(bulk.tcell.de.cov, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_results/adjusting_cov_dataset_DEGs/bulk.tcell.de.cov.xlsx")
# write.xlsx(bulk.macro.de.cov, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_results/adjusting_cov_dataset_DEGs/bulk.macro.de.cov.xlsx")