In [1]:
options(warn=-1)

In [2]:
library(edgeR)
library(limma)

assay_to_use <- 3
assayname_to_use <- "scalelogcounts"

run_limma <- function(spe, assay_to_use, verbose = FALSE, p.value = 0.05, lfc = 0.0){

  design <- model.matrix(~0 + Response + batch, data = colData(spe))

  if(verbose) print(colnames(design))
  
  #To simplify the factor name, here we edit the column name of the design matrix by removing the prefix "DEvar" and replacing spaces with underscores.

  colnames(design) <- gsub("^Response","",colnames(design))
  colnames(design) <- gsub(" ","_",colnames(design))
  
  if(verbose) print(colnames(design))
    
  # The contrast for pairwise comparisons between different groups are set up in using the `makeContrasts` function from `Limma`.
  contr.matrix <- makeContrasts(
   Yes_vs_NO = yes - no,
   levels = colnames(design))

  ## Differential expression

  # In the `limma-voom` pipeline, linear modelling is carried out on the log-CPM values by using the `voom`, `lmFit`, `contrasts.fit` and `eBayes` functions. In specific cases where users like to take more considerations of the log fold changes in the statistical analysis, the `treat` function is applied. The `treat` function, t-tests relative to a threshold, allows testing formally the hypothesis (with associated p-values) that the differential expression is greater than a given threshold, fold-change in this case. But be aware of avoiding using `eBayes` and `treat` for different contrasts for the same analysis.

  # **Notes: Here we have microarray data, so the limma without voom pipeline is used**
  v <- assay(spe, assay_to_use)

  fit <- lmFit(v, design = design)
  
  fit_contrast <- contrasts.fit(fit, contrasts = contr.matrix)

  efit <- eBayes(fit_contrast, robust = TRUE)
  
  results_efit<- decideTests(efit, adjust.method = "none", p.value = p.value, lfc = lfc)
  summary_efit <- summary(results_efit)

  if(verbose) print(summary_efit)

  
  my_de_results <- topTable(efit, coef = 1, sort.by = "none", n = Inf) # sort by none

  sig_genes <- rownames(my_de_results[my_de_results$P.Value < p.value,])

  # We can obtain the DE results by using the `TopTable` function.

  return(sig_genes)
}

Loading required package: limma



In [4]:
file_path = '/tmp/work/RCproject/GEO_singlecellexperiment.rds'
sce <- readRDS(file_path)
sce <- sce[, colData(sce)$Response != "partial"]
ncol(sce)

In [5]:
train_samples <- readLines("train_samples.txt")
head(train_samples)
sce <- sce[,train_samples]
sce

class: SingleCellExperiment 
dim: 12165 369 
metadata(0):
assays(3): counts logcounts scalelogcounts
rownames(12165): A1CF A2M ... ZZEF1 ZZZ3
rowData names(5): SYMBOL ENTREZID ENSEMBL gene_id gene_name
colnames(369): GSM5732588_GSE190826 GSM4523147_GSE150082 ...
  GSM5732574_GSE190826 GSM4304768_GSE145037
colData names(6): Response TRG ... Platform batch
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

In [6]:
all_data_sets <- sce
# de4_data_sets <- sce[, colData(sce)$Platform %in% c("GPL13497","GPL14951","GPL15207","GPL6102")]
de4_data_sets <- sce[, colData(sce)$batch %in% c("GSE133057","GSE145037","GSE150082","GSE45404-1","GSE45404-2","GSE93375","GSE94104")]
rna_seq_data_sets <- sce[, colData(sce)$batch %in% c("GSE190826","GSE209746")]

print('All')
table(colData(all_data_sets)$Response)
print('de4')
table(colData(de4_data_sets)$Response)
print('RNA_seq')
table(colData(rna_seq_data_sets)$Response)

[1] "All"



 no yes 
241 128 

[1] "de4"



 no yes 
 67  49 

[1] "RNA_seq"



 no yes 
131  46 

In [7]:
DE_alldatasets <- run_limma(all_data_sets, assay_to_use = assay_to_use, verbose = TRUE, p.value = 0.05, lfc = 0.0)
DE_4datasets <- run_limma(de4_data_sets, assay_to_use = assay_to_use, verbose = TRUE, p.value = 0.05, lfc = 0.0)
DE_bulkDatasets <- run_limma(rna_seq_data_sets, assay_to_use = assay_to_use, verbose = TRUE, p.value = 0.05, lfc = 0.0)

 [1] "Responseno"         "Responseyes"        "batchGSE145037"    
 [4] "batchGSE150082"     "batchGSE190826"     "batchGSE209746"    
 [7] "batchGSE45404_GPL1" "batchGSE45404_GPL2" "batchGSE93375"     
[10] "batchGSE94104"     
 [1] "no"                 "yes"                "batchGSE145037"    
 [4] "batchGSE150082"     "batchGSE190826"     "batchGSE209746"    
 [7] "batchGSE45404_GPL1" "batchGSE45404_GPL2" "batchGSE93375"     
[10] "batchGSE94104"     
       Yes_vs_NO
Down         563
NotSig     11036
Up           566
[1] "Responseno"     "Responseyes"    "batchGSE145037" "batchGSE150082"
[5] "batchGSE93375"  "batchGSE94104" 
[1] "no"             "yes"            "batchGSE145037" "batchGSE150082"
[5] "batchGSE93375"  "batchGSE94104" 
       Yes_vs_NO
Down         413
NotSig     11369
Up           383
[1] "Responseno"     "Responseyes"    "batchGSE209746"
[1] "no"             "yes"            "batchGSE209746"
       Yes_vs_NO
Down         214
NotSig     11619
Up           332


In [8]:
rds_path = '/tmp/work/RCproject/gene_lists.rds'
gene_lists<- readRDS(rds_path)
model_gene_list <- gene_lists$'de_intersect_plus_bulk_genes'

In [9]:
de_intersect_plus_bulk_genes <- intersect(union(DE_bulkDatasets,DE_4datasets),DE_alldatasets)
saveRDS(de_intersect_plus_bulk_genes,'ann_gene_set.rds')
# saveRDS(DE_alldatasets,'ann_gene_set.rds')

In [10]:
length(de_intersect_plus_bulk_genes)

In [11]:
length(de_intersect_plus_bulk_genes)
length(model_gene_list)
length(intersect(model_gene_list,de_intersect_plus_bulk_genes))

In [12]:
length(de_intersect_plus_bulk_genes)
length(model_gene_list)
length(intersect(model_gene_list,de_intersect_plus_bulk_genes))