#Find DEGs with pseudobulk in the merged object Alkon + Reynolds
Done with and without adjusting for dataset as a covariate

###Relevant cell types (celltypist): 
T-cells (Tc, Th and Treg), Keratinocytes (Undifferentiated and Differentiated), Macrophages

###Contrast: 
  - Lesional vs Healthy control (LvsHC)

In [0]:
#Load required libraries
.libPaths(c("/dbfs/home/jtrincado@almirall.com/my_r_packages/Seurat_v2", .libPaths()))
library(Seurat)
.libPaths(c("/dbfs/home/jtrincado@almirall.com/my_r_packages/Seurat", .libPaths()))
library(openxlsx)
library(EnhancedVolcano)
.libPaths(c("/dbfs/home/boriol@almirall.com/my_r_packages/bulkRNASeq_PBMCs_R4.3", .libPaths()))
library(EnhancedVolcano)

In [0]:
#' Draw a volcano plot for one differential-expression result table.
#'
#' Works with tables that carry either the column pair `p_val_adj` /
#' `avg_log2FC` or the pair `padj` / `log2FoldChange`. Gene identifiers are
#' taken from the row names of the table.
#'
#' @param resultsDE Result table; anything `as.data.frame()` can convert, with
#'   gene identifiers as row names.
#' @param given_title Title passed to the plot.
#' @param n_labels Maximum number of genes labelled per direction (up/down).
#' @param p_cutoff Adjusted p-value threshold used for label selection and
#'   passed to `EnhancedVolcano()` as `pCutoff`.
#' @param fc_cutoff Absolute log2 fold-change threshold used for label
#'   selection and passed to `EnhancedVolcano()` as `FCcutoff`.
#' @return The object returned by `EnhancedVolcano()`.
volcano_generator <- function(resultsDE, given_title, n_labels = 10,
                              p_cutoff = 0.05, fc_cutoff = 1) {
  resultsDE0 <- as.data.frame(resultsDE)
  resultsDE0$gene_id <- rownames(resultsDE0)

  # Remove rows with a missing gene id and keep the first row per gene id
  has_gene_id <- !is.na(resultsDE0$gene_id) & nzchar(resultsDE0$gene_id)
  resultsDE0 <- resultsDE0[has_gene_id, , drop = FALSE]
  resultsDE0 <- resultsDE0[!duplicated(resultsDE0$gene_id), , drop = FALSE]
  rownames(resultsDE0) <- resultsDE0$gene_id

  # Resolve the column names, failing early with a readable message when the
  # table has neither of the supported spellings
  pick_col <- function(candidates) {
    found <- candidates[candidates %in% colnames(resultsDE0)]
    if (length(found) == 0) {
      stop(
        "resultsDE must contain one of these columns: ",
        paste(candidates, collapse = ", "),
        call. = FALSE
      )
    }
    found[1]
  }
  p_val_col <- pick_col(c("p_val_adj", "padj"))
  log2fc_col <- pick_col(c("avg_log2FC", "log2FoldChange"))

  # Genes to label in one direction (+1 = up, -1 = down): pass both cutoffs,
  # order by adjusted p-value, break ties by larger |log2FC|, and keep at most
  # n_labels, so tied p-values cannot inflate the number of labels.
  # which() drops rows whose p-value or fold change is NA.
  pick_labels <- function(direction) {
    lfc <- resultsDE0[[log2fc_col]]
    p_adj <- resultsDE0[[p_val_col]]
    hits <- which(direction * lfc > fc_cutoff & p_adj < p_cutoff)
    hits <- hits[order(p_adj[hits], -abs(lfc[hits]))]
    resultsDE0$gene_id[head(hits, n_labels)]
  }

  # Plot Volcano
  volcano <- EnhancedVolcano(resultsDE0,
    lab = rownames(resultsDE0),
    x = log2fc_col,
    y = p_val_col,
    pCutoff = p_cutoff,
    FCcutoff = fc_cutoff,
    selectLab = c(pick_labels(1), pick_labels(-1)),
    labSize = 5,
    drawConnectors = TRUE,
    widthConnectors = 0.5,
    colConnectors = 'black',
    title = given_title)
  volcano
}

##Process data

In [0]:
# Load the merged Alkon + Reynolds object from disk
AR <- readRDS(file="/dbfs/mnt/sandbox/TFM_PAULA/MERGED_ARdatasets_celltypist_TFM.rds")

In [0]:
# Distinct dataset labels stored in the object metadata
unique(AR$dataset)

In [0]:
# Distinct condition labels stored in the object metadata
unique(AR$Condition_AR)

In [0]:
# Cross-tabulate cell-type label against dataset
table(AR$celltype_AR, AR$dataset)

##Filtering genes that have at least 3 counts in at least 3 cells

In [0]:
# Count matrix of the RNA assay (one row per feature, one column per cell)
counts_matrix <- AR[["RNA"]]$counts
dim(counts_matrix) #38248 391832

In [0]:
# Keep only rows (genes) that have a count of at least 3 in at least 3 columns.
# The columns are not yet aggregated here, so the threshold counts individual
# cells rather than samples.
smallestGroupSize <- 3
keep <- rowSums(counts_matrix >= 3) >= smallestGroupSize
counts_keep <- counts_matrix[keep,]

# Subset the Seurat object to keep only the features in counts_keep
AR_f <- subset(AR, features = rownames(counts_keep))

# Assign the filtered counts to the new Seurat object
AR_f[["RNA"]]$counts <- counts_keep

# Check dimensions
dim(AR_f[["RNA"]]$counts) # 16417 391832

##Pseudobulk the counts based on the donor id

In [0]:
# Cell-type labels available for grouping
unique(AR_f$celltype_AR)

In [0]:
# Cross-tabulate condition against sample id, to see which samples belong to
# which condition before aggregating
table(AR_f$Condition_AR, AR_f$Sample_id)

In [0]:
# Dataset labels present after gene filtering
unique(AR_f$dataset)

In [0]:
# Pseudobulk the counts based on donor-condition-celltype: the RNA assay is
# aggregated over every combination of Condition_AR, Sample_id and celltype_AR
# and returned as a new Seurat object.
pseudo_AR <- AggregateExpression(
  AR_f,
  assays = "RNA",
  return.seurat = TRUE,  # spelled out: `T` is an ordinary, reassignable variable
  group.by = c("Condition_AR", "Sample_id", "celltype_AR")
)

# each 'cell' is a donor-condition-celltype pseudobulk profile
head(Cells(pseudo_AR))

In [0]:
# Put the underscore back into the two keratinocyte labels, which carry a dash
# in the aggregated object (presumably introduced by AggregateExpression --
# confirm against the Seurat version in use). Other labels are left unchanged.
pseudo_AR$celltype_AR <- ifelse(pseudo_AR$celltype_AR == "Differentiated-KC", "Differentiated_KC", pseudo_AR$celltype_AR)
pseudo_AR$celltype_AR <- ifelse(pseudo_AR$celltype_AR == "Undifferentiated-KC", "Undifferentiated_KC", pseudo_AR$celltype_AR)
unique(pseudo_AR$celltype_AR)

In [0]:
# Combined label "<celltype>_<condition>", used below as identity class and as
# the DESeq2 design variable
pseudo_AR$celltype.cond <- paste(pseudo_AR$celltype_AR, pseudo_AR$Condition_AR, sep = "_")

In [0]:
# Make the combined label the active identity so FindMarkers can compare
# e.g. "Tc_Lesional" against "Tc_HC"
Idents(pseudo_AR) <- "celltype.cond"

In [0]:
# Attach the dataset of origin to every pseudobulk profile.
dataset_info <- AR_f@meta.data$dataset

# Create a mapping of Sample_id to dataset with one entry per distinct
# (Sample_id, dataset) pair instead of one entry per cell. Both columns are
# converted to character so that a factor is never used as an index (indexing
# a named vector with a factor uses its integer codes, not its labels).
sample_dataset <- unique(data.frame(
  Sample_id = as.character(AR_f@meta.data$Sample_id),
  dataset = as.character(dataset_info),
  stringsAsFactors = FALSE
))
sample_to_dataset <- setNames(sample_dataset$dataset, sample_dataset$Sample_id)

# Add the dataset information to the pseudo_AR metadata
pseudo_sample_ids <- as.character(pseudo_AR@meta.data$Sample_id)
mapped_dataset <- unname(sample_to_dataset[pseudo_sample_ids])

# Fallback for ids whose underscores were turned into dashes in the aggregated
# object (the keratinocyte labels show that spelling change; whether Sample_id
# is affected depends on the ids -- hence only applied to unmatched entries).
unmatched <- is.na(mapped_dataset)
if (any(unmatched)) {
  dashed_lookup <- setNames(
    sample_dataset$dataset,
    gsub("_", "-", sample_dataset$Sample_id, fixed = TRUE)
  )
  mapped_dataset[unmatched] <- unname(dashed_lookup[pseudo_sample_ids[unmatched]])
}

# Surface mapping failures instead of silently carrying NA into the design
if (anyNA(mapped_dataset)) {
  warning(
    sum(is.na(mapped_dataset)),
    " pseudobulk profile(s) could not be assigned a dataset",
    call. = FALSE
  )
}
pseudo_AR@meta.data$dataset <- mapped_dataset

In [0]:
# Check which datasets ended up in the pseudobulk metadata
unique(pseudo_AR@meta.data$dataset)

In [0]:
# Persist the aggregated object so later cells can start from it
saveRDS(pseudo_AR, file="/dbfs/mnt/sandbox/TFM_PAULA/AR_MERGED_celltypist_aggregated_expression_TFM.rds")

###Filter to remove what is not relevant
- non lesional samples 
- all cell types except Tc, Th, Treg, Undifferentiated_KC, Differentiated_KC


Filter the cell types first, so that DESeq2 runs faster

In [0]:
# Reload the aggregated pseudobulk object saved above
pseudo_AR <- readRDS(file="/dbfs/mnt/sandbox/TFM_PAULA/AR_MERGED_celltypist_aggregated_expression_TFM.rds")

In [0]:
# Cell-type labels present before filtering
unique(pseudo_AR$celltype_AR)

In [0]:
# Keep only the pseudobulk profiles of the cell types analysed below
desired_cell_types <- c("Tc", "Th", "Treg", "Undifferentiated_KC", "Differentiated_KC")  # Relevant celltypes
pseudo_AR_filtered <- subset(pseudo_AR, celltype_AR %in% desired_cell_types)

In [0]:
# Confirm that only the desired cell types remain
unique(pseudo_AR_filtered$celltype_AR)

Filter non lesional

In [0]:
# Pull the count matrix and the per-profile metadata out of the
# cell-type-filtered pseudobulk object
counts_AR <- GetAssayData(pseudo_AR_filtered, layer = "counts")
metadata_AR <- pseudo_AR_filtered@meta.data

In [0]:
# Profiles from the "reynolds" dataset are kept only when their condition is
# HC or Lesional; profiles from any other dataset are kept as they are.
from_reynolds <- metadata_AR$dataset == "reynolds"
hc_or_lesional <- metadata_AR$Condition_AR %in% c("HC", "Lesional")
metadata_AR_reynolds <- metadata_AR[hc_or_lesional & from_reynolds, ]
metadata_AR <- rbind(metadata_AR[!from_reynolds, ], metadata_AR_reynolds)

# Drop the count columns whose profile was removed from the metadata
counts_AR <- counts_AR[, colnames(counts_AR) %in% rownames(metadata_AR)]

In [0]:
# Both design variables must be factors
factor_columns <- c("celltype.cond", "dataset")
metadata_AR[factor_columns] <- lapply(metadata_AR[factor_columns], as.factor)

In [0]:
table(metadata_AR$Condition_AR, metadata_AR$dataset)

In [0]:
# Sort the metadata rows by profile name, then put the count columns in
# exactly that order so row i of metadata_AR describes column i of counts_AR
row_order <- order(rownames(metadata_AR))
metadata_AR <- metadata_AR[row_order, ]
counts_AR <- counts_AR[, rownames(metadata_AR)]

###DESEQ design to adjust the covariable dataset

In [0]:
.libPaths(c("/dbfs/home/jtrincado@almirall.com/my_r_packages/Seurat", .libPaths()))
library(DESeq2)

In [0]:
# Build the DESeq2 dataset from the pseudobulk counts (rounded to whole
# numbers) and the aligned metadata. The design has `dataset` as an additive
# covariate and `celltype.cond` as the variable the contrasts below are taken on.
dds <- DESeqDataSetFromMatrix(countData = round(counts_AR), #already in matrix format,
                              colData = metadata_AR,
                              design = ~ dataset + celltype.cond)

In [0]:
# Inspect the sample-level metadata stored in the dataset
colData(dds)

In [0]:
# Load the dplyr package
library(dplyr)

# Make sure we have all data there: number of pseudobulk profiles per
# Sample_id / celltype.cond combination.
# `display()` is not defined in this notebook -- presumably the notebook
# environment's table viewer.
display(data.frame(colData(dds)) %>% 
  group_by(Sample_id, celltype.cond) %>% 
  summarise(n = n()))

In [0]:
# Keep only rows (genes) that have more than 10 counts in at least 5
# pseudobulk profiles (note the strict `> 10`)

keep <- rowSums(counts(dds) > 10) >= 5
dds <- dds[keep,]

# Dimensions of the count matrix after pre-filtering
counts_keep_dds <- counts(dds) 
dim(counts_keep_dds) #14421    90

In [0]:
#Run differential expression analysis
# The fitted object dds2 is reused for every dataset-adjusted contrast below.
dds2 <- DESeq(dds)
# results() without a contrast argument; shown here only as a quick look.
# The per-cell-type comparisons below pass an explicit contrast.
res <- results(dds2)
res

In [0]:
# Save the fitted object so the contrasts can be extracted without refitting
saveRDS(dds2, file = "/dbfs/mnt/sandbox/TFM_PAULA/dds2_object_pseudobulk_covariable_dataset_celltypist.rds")

##Tcells

In [0]:
# Show the identity class of every pseudobulk profile
Idents(pseudo_AR)

In [0]:
#DEA without adjusting covariable dataset to compare
# Identity "Tc_Lesional" vs "Tc_HC", using FindMarkers with the DESeq2 test

bulk.tcell.de <- FindMarkers(object = pseudo_AR, 
                         ident.1 = "Tc_Lesional", 
                         ident.2 = "Tc_HC",
                         min.pct = 0.01,
                         test.use = "DESeq2")
head(bulk.tcell.de, n = 15)

In [0]:
volcano_generator(bulk.tcell.de, "Pseudobulk - Tcells - AR merged")

###Covariable dataset adjusted

In [0]:
# Same comparison taken from the fitted DESeq2 model, whose design includes
# dataset: contrast Tc_Lesional vs Tc_HC on celltype.cond
bulk.tcell.de.cov <- results(dds2, contrast=c("celltype.cond","Tc_Lesional", "Tc_HC"))
bulk.tcell.de.cov

In [0]:
volcano_generator(bulk.tcell.de.cov, "Pseudobulk - Tcells - AR + dataset adjusted")

##Th

In [0]:
# Show the identity class of every pseudobulk profile
Idents(pseudo_AR)

In [0]:
#DEA without adjusting covariable dataset to compare
# Identity "Th_Lesional" vs "Th_HC", using FindMarkers with the DESeq2 test.
# NOTE(review): this call runs on pseudo_AR_filtered, whereas the other
# cell-type sections run FindMarkers on pseudo_AR -- confirm this is intended.

bulk.th.de <- FindMarkers(object = pseudo_AR_filtered, 
                         ident.1 = "Th_Lesional", 
                         ident.2 = "Th_HC",
                         min.pct = 0.01,
                         test.use = "DESeq2")
head(bulk.th.de, n = 15)

In [0]:
volcano_generator(bulk.th.de, "Pseudobulk - Th - AR merged")

###Covariable dataset adjusted

In [0]:
# Levels of the design variable available for contrasts
dds2$celltype.cond

In [0]:
# Contrast Th_Lesional vs Th_HC from the fitted DESeq2 model, whose design
# includes dataset
bulk.th.de.cov <- results(dds2, contrast=c("celltype.cond","Th_Lesional", "Th_HC"))
bulk.th.de.cov

In [0]:
volcano_generator(bulk.th.de.cov, "Pseudobulk - Th - AR + dataset adjusted")

##Keratinocytes (Undifferentiated)

In [0]:
# Undifferentiated keratinocytes, Lesional vs HC, without dataset adjustment
# (FindMarkers with the DESeq2 test)
bulk.kc.de <- FindMarkers(object = pseudo_AR, 
                         ident.1 = "Undifferentiated_KC_Lesional", 
                         ident.2 = "Undifferentiated_KC_HC",
                         test.use = "DESeq2")
head(bulk.kc.de, n = 15)

In [0]:
# Histogram of the raw p-values (column p_val) as a diagnostic of the test
library(ggplot2)
bulk.kc.de <- as.data.frame(bulk.kc.de)
ggplot(bulk.kc.de, aes(x = p_val)) +
  geom_histogram(binwidth = 0.01, fill = "blue", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of p-value", x = "p-value", y = "Frequency")

In [0]:
volcano_generator(bulk.kc.de, "Pseudobulk - Undifferentiated KC - AR merged")

###Covariable adjusted

In [0]:
# Same comparison as a contrast on celltype.cond from the fitted DESeq2 model,
# whose design includes dataset
bulk.kc.de.cov <- results(dds2, contrast=c("celltype.cond","Undifferentiated_KC_Lesional", "Undifferentiated_KC_HC"))
bulk.kc.de.cov

In [0]:
# Histogram of the raw p-values (column pvalue) of the adjusted comparison;
# the result is converted to a plain data.frame first
library(ggplot2)
bulk.kc.de.cov <- as.data.frame(bulk.kc.de.cov)
ggplot(bulk.kc.de.cov, aes(x = pvalue)) +
  geom_histogram(binwidth = 0.01, fill = "blue", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of p-value", x = "p-value", y = "Frequency")

In [0]:
volcano_generator(bulk.kc.de.cov, "Pseudobulk - Undifferentiated KC - AR + dataset adjusted")

##Keratinocytes (Differentiated)

In [0]:
# Differentiated keratinocytes, Lesional vs HC, without dataset adjustment
# (FindMarkers with the DESeq2 test)
bulk.dif.kc.de <- FindMarkers(object = pseudo_AR,
                              ident.1 = "Differentiated_KC_Lesional",
                              ident.2 = "Differentiated_KC_HC",
                              test.use = "DESeq2")
head(bulk.dif.kc.de, n = 15)

In [0]:
# Histogram of the raw p-values (column p_val) as a diagnostic of the test.
# The data.frame copy is stored under the differentiated-KC name so that
# bulk.kc.de (the undifferentiated-KC table) is not overwritten.
library(ggplot2)
bulk.dif.kc.de <- as.data.frame(bulk.dif.kc.de)
ggplot(bulk.dif.kc.de, aes(x = p_val)) +
  geom_histogram(binwidth = 0.01, fill = "blue", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of p-value", x = "p-value", y = "Frequency")

In [0]:
volcano_generator(bulk.dif.kc.de, "Pseudobulk - Differentiated KC - AR merged")

###Covariable adjusted

In [0]:
# Same comparison as a contrast on celltype.cond from the fitted DESeq2 model,
# whose design includes dataset
bulk.dif.kc.de.cov <- results(dds2, contrast=c("celltype.cond","Differentiated_KC_Lesional", "Differentiated_KC_HC"))
bulk.dif.kc.de.cov

In [0]:
# Histogram of the raw p-values (column pvalue) of the adjusted comparison;
# the result is converted to a plain data.frame first
library(ggplot2)
bulk.dif.kc.de.cov <- as.data.frame(bulk.dif.kc.de.cov)
ggplot(bulk.dif.kc.de.cov, aes(x = pvalue)) +
  geom_histogram(binwidth = 0.01, fill = "blue", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of p-value", x = "p-value", y = "Frequency")

In [0]:
volcano_generator(bulk.dif.kc.de.cov, "Pseudobulk - Differentiated KC - AR + dataset adjusted")

##Treg

In [0]:
# Treg, Lesional vs HC, without dataset adjustment
# (FindMarkers with the DESeq2 test)
bulk.treg.de <- FindMarkers(object = pseudo_AR, 
                            ident.1 = "Treg_Lesional", 
                            ident.2 = "Treg_HC",
                            test.use = "DESeq2")
head(bulk.treg.de, n = 15)

In [0]:
volcano_generator(bulk.treg.de, "Pseudobulk - Treg - AR merged")

###Covariable adjusted

In [0]:
# Same comparison as a contrast on celltype.cond from the fitted DESeq2 model,
# whose design includes dataset
bulk.treg.de.cov <- results(dds2, contrast=c("celltype.cond","Treg_Lesional", "Treg_HC"))
bulk.treg.de.cov

In [0]:
volcano_generator(bulk.treg.de.cov, "Pseudobulk - Treg - AR + dataset adjusted")

#Number of DEGs per comparison

In [0]:
# All result tables, dataset-adjusted (".cov") first, then unadjusted
res_list <- list(
  bulk.tcell.de.cov = bulk.tcell.de.cov,
  bulk.th.de.cov = bulk.th.de.cov,
  bulk.treg.de.cov = bulk.treg.de.cov,
  bulk.kc.de.cov = bulk.kc.de.cov,
  bulk.dif.kc.de.cov = bulk.dif.kc.de.cov,
  bulk.tcell.de = bulk.tcell.de,
  bulk.th.de = bulk.th.de,
  bulk.treg.de = bulk.treg.de,
  bulk.kc.de = bulk.kc.de,
  bulk.dif.kc.de = bulk.dif.kc.de
)

# Count the significant up- and down-regulated genes of one result table.
# A gene counts when its adjusted p-value is below p_cutoff and its log2 fold
# change is above lfc_cutoff (up) or below -lfc_cutoff (down). Works on tables
# with either the p_val_adj / avg_log2FC or the padj / log2FoldChange columns.
# Rows with a missing adjusted p-value or fold change are not counted.
count_degs <- function(res, p_cutoff = 0.05, lfc_cutoff = 1) {
  p_val_col <- if ("p_val_adj" %in% colnames(res)) "p_val_adj" else "padj"
  log2fc_col <- if ("avg_log2FC" %in% colnames(res)) "avg_log2FC" else "log2FoldChange"
  p_adj <- res[[p_val_col]]
  lfc <- res[[log2fc_col]]
  significant <- !is.na(p_adj) & p_adj < p_cutoff
  c(
    up = sum(significant & lfc > lfc_cutoff, na.rm = TRUE),
    down = sum(significant & lfc < -lfc_cutoff, na.rm = TRUE)
  )
}

for (res_name in names(res_list)) {
  deg_counts <- count_degs(res_list[[res_name]])
  print(paste("Number of up DEGs in", res_name, ":", deg_counts[["up"]], "and down DEGs:", deg_counts[["down"]]))
}

In [0]:
# BEFORE CELLTYPIST REANNOTATION
# [1] "Number of up DEGs in bulk.kc.de.cov : 1082 and down DEGs: 681"

# [1] "Number of up DEGs in bulk.tcell.de.cov : 215 and down DEGs: 309"

# [1] "Number of up DEGs in bulk.treg.de.cov : 79 and down DEGs: 1085"

# AFTER CELLTYPIST REANNOTATION
# [1] "Number of up DEGs in bulk.kc.de.cov : 777 and down DEGs: 919"
# [1] "Number of up DEGs in bulk.dif.kc.de.cov : 1268 and down DEGs: 864"

# [1] "Number of up DEGs in bulk.tcell.de.cov : 243 and down DEGs: 547"
# [1] "Number of up DEGs in bulk.th.de.cov : 285 and down DEGs: 708"

# [1] "Number of up DEGs in bulk.treg.de.cov : 83 and down DEGs: 227"

#Save

In [0]:
%sh
# Create the output folders. -p creates missing parents and does not fail when
# a folder already exists, so the cell can be re-run.
mkdir -p /dbfs/mnt/sandbox/TFM_PAULA/merged_AR_celltypist_results
mkdir -p /dbfs/mnt/sandbox/TFM_PAULA/merged_AR_celltypist_results/DEGs
mkdir -p /dbfs/mnt/sandbox/TFM_PAULA/merged_AR_celltypist_results/adjusting_cov_dataset_DEGs

In [0]:
# Export the unadjusted (FindMarkers) tables.
# The gene identifiers live in the row names, so they are copied into an
# explicit `gene` column before writing.
bulk.th.de$gene <- rownames(bulk.th.de)
bulk.treg.de$gene <- rownames(bulk.treg.de)
bulk.kc.de$gene <- rownames(bulk.kc.de)
bulk.dif.kc.de$gene <- rownames(bulk.dif.kc.de)
bulk.tcell.de$gene <- rownames(bulk.tcell.de)

# One workbook per comparison.
# NOTE(review): bulk.kc.de is written as the undifferentiated-KC file --
# confirm it holds the undifferentiated-KC table at this point.
write.xlsx(bulk.th.de, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_celltypist_results/DEGs/bulk.th.de.xlsx")
write.xlsx(bulk.treg.de, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_celltypist_results/DEGs/bulk.treg.de.xlsx")
write.xlsx(bulk.kc.de, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_celltypist_results/DEGs/bulk.undif.kc.de.xlsx")
write.xlsx(bulk.tcell.de, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_celltypist_results/DEGs/bulk.tcell.de.xlsx")
write.xlsx(bulk.dif.kc.de, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_celltypist_results/DEGs/bulk.dif.kc.de.xlsx")

In [0]:
# Export the dataset-adjusted (DESeq2 contrast) tables.
# Convert every table to a plain data.frame first, so all five are written
# from the same type; this is a no-op for tables that were already converted.
bulk.th.de.cov <- as.data.frame(bulk.th.de.cov)
bulk.treg.de.cov <- as.data.frame(bulk.treg.de.cov)
bulk.kc.de.cov <- as.data.frame(bulk.kc.de.cov)
bulk.dif.kc.de.cov <- as.data.frame(bulk.dif.kc.de.cov)
bulk.tcell.de.cov <- as.data.frame(bulk.tcell.de.cov)

# The gene identifiers live in the row names; copy them into an explicit
# `gene` column before writing.
bulk.th.de.cov$gene <- rownames(bulk.th.de.cov)
bulk.treg.de.cov$gene <- rownames(bulk.treg.de.cov)
bulk.kc.de.cov$gene <- rownames(bulk.kc.de.cov)
bulk.dif.kc.de.cov$gene <- rownames(bulk.dif.kc.de.cov)
bulk.tcell.de.cov$gene <- rownames(bulk.tcell.de.cov)

# One workbook per comparison.
# NOTE(review): these files go to the DEGs folder, next to the unadjusted
# tables; the adjusting_cov_dataset_DEGs folder is not written to here --
# confirm which location is intended before moving anything.
write.xlsx(bulk.th.de.cov, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_celltypist_results/DEGs/bulk.th.de.cov.xlsx")
write.xlsx(bulk.treg.de.cov, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_celltypist_results/DEGs/bulk.treg.de.cov.xlsx")
write.xlsx(bulk.kc.de.cov, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_celltypist_results/DEGs/bulk.undif.kc.de.cov.xlsx")
write.xlsx(bulk.tcell.de.cov, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_celltypist_results/DEGs/bulk.tcell.de.cov.xlsx")
write.xlsx(bulk.dif.kc.de.cov, file="/dbfs/mnt/sandbox/TFM_PAULA/merged_AR_celltypist_results/DEGs/bulk.dif.kc.de.cov.xlsx")