# Differential gene expression analysis - code example for edgeR (pseudobulk)
Using the data set Misharin.

In [22]:
library(edgeR)
library(anndata)
library(zellkonverter)
library(SingleCellExperiment)

Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges

In [7]:
file_name <- '/home/sch/schonner/MaPra/misharin_for_diffEx_edgeR_test2.h5ad'
adata_pb <- read_h5ad(file_name)
adata_pb

AnnData object with n_obs × n_vars = 31 × 17824
    obs: 'condition', 'manual_celltype_annotation', 'batch', 'sample', 'author_annotation', 'lib_size', 'log_lib_size'

In [8]:
adata_pb$obs[1:5, ]

“index contains duplicated values: row names not set”


Unnamed: 0_level_0,condition,manual_celltype_annotation,batch,sample,author_annotation,lib_size,log_lib_size
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>,<dbl>
1,control,AT1,0,0_control,AT1 cells,174455,12.06942
2,asbestos,AT1,1,1_asbestos,AT1 cells,162594,11.99901
3,control,AT2,0,0_control,AT2,7514408,15.83233
4,asbestos,AT2,1,1_asbestos,AT2,12162263,16.31385
5,control,Alveolar_macrophages,0,0_control,AM,3936244,15.18574


In [12]:
adata_pb$X

Unnamed: 0,Gm37381,Rp1,Sox17,Mrpl15,Lypla1,Gm37988,Tcea1,Rgs20,Atp6v1h,Rb1cc1,⋯,CR974586.5,Csprs,AC132444.6,AC125149.3,AC125149.2,AC168977.2,AC168977.1,PISD,DHRSX,CAAA01147332.1
donor_0_control_0,0,0,0,12,19,0,14,0,10,30,⋯,0,0,0,1,0,0,0,14,9,0
donor_1_asbestos_0,0,0,0,14,10,0,10,0,7,19,⋯,0,0,0,0,0,0,0,10,9,0
donor_0_control_0,0,0,4,173,259,1,377,0,264,380,⋯,0,1,0,0,0,1,1,972,185,10
donor_1_asbestos_0,0,1,10,307,446,0,636,0,492,696,⋯,0,0,1,2,0,0,4,1519,378,8
donor_0_control_0,0,0,5,262,202,0,399,0,225,238,⋯,0,1,0,1,0,0,0,367,130,5
donor_1_asbestos_0,1,1,11,342,326,1,665,0,525,319,⋯,0,0,0,2,0,0,3,607,288,4
donor_0_control_0,0,0,0,121,57,0,190,0,67,120,⋯,0,16,1,19,0,0,4,406,57,1
donor_1_asbestos_0,0,0,0,54,39,0,90,0,36,51,⋯,0,10,2,5,1,0,0,182,35,1
donor_0_control_0,0,0,5,485,451,1,612,0,252,455,⋯,0,1,0,8,1,0,5,493,305,3
donor_1_asbestos_0,0,0,2,86,94,0,115,0,64,97,⋯,0,0,0,2,0,0,1,91,81,1


In [16]:
typeof(assay(adata_AT1))

ERROR: Error in assay(adata_AT1): could not find function "assay"


## 1. Preparing some functions

### Separate function to fit an edgeR GLM:

In [44]:
fit_model <- function(adata_){
    # create an edgeR object with counts and grouping factor
    y <- DGEList(assay(adata_, "X"), group = colData(adata_)$condition)
    # filter out genes with low counts
    print("Dimensions before subsetting:")
    print(dim(y))
    print("")
    keep <- filterByExpr(y)
    y <- y[keep, , keep.lib.sizes=FALSE]
    print("Dimensions after subsetting:")
    print(dim(y))
    print("")
    # normalize
    y <- calcNormFactors(y)
    # create a vector that is concatentation of condition and cell type that we will later use with contrasts
    group <- paste0(colData(adata_)$condition, ".", colData(adata_)$manual_celltype_annotation)
    batch <- colData(adata_)$batch
    # create a design matrix: here we have multiple donors so also consider that in the design matrix
    design <- model.matrix(~ 0 + group + batch)
    # estimate dispersion
    y <- estimateDisp(y, design = design)
    # fit the model
    fit <- glmQLFit(y, design)
    return(list("fit"=fit, "design"=design, "y"=y))
}

## 2. Pseudobulk

### 2.1 One group

In [38]:
adata_AT1 <- adata_pb[adata_pb$obs["manual_celltype_annotation"] == "AT1"]
adata_AT1

“index contains duplicated values: row names not set”


View of AnnData object with n_obs × n_vars = 2 × 17824
    obs: 'condition', 'manual_celltype_annotation', 'batch', 'sample', 'author_annotation', 'lib_size', 'log_lib_size'

In [39]:
sce_AT1 <- AnnData2SCE(
  adata_AT1,
  X_name = NULL, layers = TRUE, uns = TRUE, var = TRUE, obs = TRUE, varm = TRUE, obsm = TRUE, varp = TRUE, obsp = TRUE,
  raw = FALSE, skip_assays = FALSE, hdf5_backed = TRUE, verbose = NULL
)

“[1m[22mThe passed object is a 'AnnDataR6' object, conversion is likely to be less reliable”


In [36]:
rds_pb <- readRDS("/home/sch/schonner/MaPra/schiller_pseudobulk_sce.rds", refhook = NULL)

In [25]:
rds_pb$obs

NULL

In [45]:
outs <- fit_model(rds_pb)

Repeated column names found in count matrix



[1] "Dimensions before subsetting:"
[1] 18031   134
[1] ""
[1] "Dimensions after subsetting:"
[1] 1321  134
[1] ""


ERROR: Error in glmFit.default(sely, design, offset = seloffset, dispersion = 0.05, : Design matrix not of full rank.  The following coefficients not estimable:
 batchmuc4656


In [None]:
fit <- outs$fit
y <- outs$y

In [None]:
plotMDS(y, col=ifelse(y$samples$group == "stim", "red", "blue"))

In [None]:
plotBCV(y)

In [None]:
myContrast <- makeContrasts('groupstim.AT1-groupctrl.AT1', levels = y$design)
qlf <- glmQLFTest(fit, contrast=myContrast)
# get all of the DE genes and calculate Benjamini-Hochberg adjusted FDR
tt <- topTags(qlf, n = Inf)
tt <- tt$table

In [None]:
tr <- glmTreat(fit, contrast=myContrast, lfc=1.5)
print(head(topTags(tr)))

In [None]:
plotSmear(qlf, de.tags = rownames(tt)[which(tt$FDR<0.01)])

### 2.2 Multiple groups

In [11]:
outs <-fit_model(adata_pb)

ERROR: Error in assay(adata_, "X"): could not find function "assay"
