# Differential gene expression analysis - code example for edgeR (pseudobulk)
Using the data set Misharin.

In [1]:
library(edgeR)
library(anndata)

Loading required package: limma



In [2]:
file_name <- '/home/sch/schonner/MaPra/misharin_for_diffEx_edgeR_test.h5ad'
adata_pb <- read_h5ad(file_name)
adata_pb

AnnData object with n_obs × n_vars = 31 × 17824
    obs: 'condition', 'manual_celltype_annotation', 'batch', 'sample', 'author_annotation', 'lib_size', 'log_lib_size'
    uns: 'author_annotation_colors', 'batch_colors', 'condition_colors', 'log1p', 'manual_celltype_annotation_colors', 'pca', 'sample_colors'
    obsm: 'X_pca'
    varm: 'PCs'
    layers: 'counts'

In [3]:
adata_pb$obs[1:5, ]

“index contains duplicated values: row names not set”


Unnamed: 0_level_0,condition,manual_celltype_annotation,batch,sample,author_annotation,lib_size,log_lib_size
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>,<dbl>
1,control,AT1,0,0_control,AT1 cells,174455,12.06942
2,asbestos,AT1,1,1_asbestos,AT1 cells,162594,11.99901
3,control,AT2,0,0_control,AT2,7514408,15.83233
4,asbestos,AT2,1,1_asbestos,AT2,12162263,16.31385
5,control,Alveolar_macrophages,0,0_control,AM,3936244,15.18574


## 1. Preparing some functions

### Separate function to fit an edgeR GLM:

In [4]:
fit_model <- function(adata_){
    # create an edgeR object with counts and grouping factor
    y <- DGEList(assay(adata_, "X"), group = colData(adata_)$condition)
    # filter out genes with low counts
    print("Dimensions before subsetting:")
    print(dim(y))
    print("")
    keep <- filterByExpr(y)
    y <- y[keep, , keep.lib.sizes=FALSE]
    print("Dimensions after subsetting:")
    print(dim(y))
    print("")
    # normalize
    y <- calcNormFactors(y)
    # create a vector that is concatentation of condition and cell type that we will later use with contrasts
    group <- paste0(colData(adata_)$condition, ".", colData(adata_)$manual_celltype_annotation)
    replicate <- colData(adata_)$batch
    # create a design matrix: here we have multiple donors so also consider that in the design matrix
    design <- model.matrix(~ 0 + group + batch)
    # estimate dispersion
    y <- estimateDisp(y, design = design)
    # fit the model
    fit <- glmQLFit(y, design)
    return(list("fit"=fit, "design"=design, "y"=y))
}

## 2. Pseudobulk

### 2.1 One group

In [5]:
adata_AT1 <- adata_pb[adata_pb$obs["manual_celltype_annotation"] == "AT1"]
adata_AT1

“index contains duplicated values: row names not set”


View of AnnData object with n_obs × n_vars = 2 × 17824
    obs: 'condition', 'manual_celltype_annotation', 'batch', 'sample', 'author_annotation', 'lib_size', 'log_lib_size'
    uns: 'author_annotation_colors', 'batch_colors', 'condition_colors', 'log1p', 'manual_celltype_annotation_colors', 'pca', 'sample_colors'
    obsm: 'X_pca'
    varm: 'PCs'
    layers: 'counts'

In [6]:
outs <- fit_model(adata_AT1)

ERROR: Error in assay(adata_, "X"): could not find function "assay"


In [None]:
fit <- outs$fit
y <- outs$y

In [None]:
plotMDS(y, col=ifelse(y$samples$group == "stim", "red", "blue"))

In [None]:
plotBCV(y)

In [None]:
myContrast <- makeContrasts('groupstim.AT1-groupctrl.AT1', levels = y$design)
qlf <- glmQLFTest(fit, contrast=myContrast)
# get all of the DE genes and calculate Benjamini-Hochberg adjusted FDR
tt <- topTags(qlf, n = Inf)
tt <- tt$table

In [None]:
tr <- glmTreat(fit, contrast=myContrast, lfc=1.5)
print(head(topTags(tr)))

In [None]:
plotSmear(qlf, de.tags = rownames(tt)[which(tt$FDR<0.01)])

### 2.2 Multiple groups

In [44]:
outs <-fit_model(adata_pb)

ERROR: Error in (function (classes, fdef, mtable) : unable to find an inherited method for function ‘assay’ for signature ‘"AnnDataR6", "character"’
