# Differential gene expression analysis - code example for edgeR (pseudobulk)
Using the data set Misharin.

In [1]:
library(edgeR)
library(anndata)
library(zellkonverter)
library(SingleCellExperiment)

Loading required package: limma

Registered S3 method overwritten by 'zellkonverter':
  method                from      
  py_to_r.numpy.ndarray reticulate

Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs,

In [4]:
file_name <- '/home/sch/schonner/MaPra/merged_data_old_for_diffEx_edgeR.h5ad'
adata_pb <- read_h5ad(file_name)
adata_pb

AnnData object with n_obs × n_vars = 644 × 32317
    obs: 'condition', 'manual_celltype_annotation', 'batch', 'sample', 'author_annotation', 'lib_size', 'log_lib_size'
    uns: 'author_annotation_colors', 'batch_colors', 'condition_colors', 'log1p', 'manual_celltype_annotation_colors', 'pca', 'sample_colors'
    obsm: 'X_pca'
    varm: 'PCs'
    layers: 'counts'

In [5]:
adata_pb$obs[1:5, ]

“index contains duplicated values: row names not set”


Unnamed: 0_level_0,condition,manual_celltype_annotation,batch,sample,author_annotation,lib_size,log_lib_size
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>,<dbl>
1,untreated,AT1,0_misharin,0_misharin_untreated,AT1 cells,64648.696,11.07672
2,asbestos,AT1,1_misharin,1_misharin_asbestos,AT1 cells,63999.997,11.06664
3,untreated,AT1,1_xie,1_xie_untreated,Methothelial,28761.913,10.26681
4,untreated,AT1,2_xie,2_xie_untreated,Endothelial,30400.624,10.32222
5,untreated,AT1,3_xie,3_xie_untreated,Methothelial,6664.563,8.80456


In [9]:
adata_pb$X[1:5, ]

Unnamed: 0,0610005C13Rik,0610007N19Rik,0610007P14Rik,0610008F07Rik,0610009B14Rik,0610009B22Rik,0610009D07Rik,0610009E02Rik,0610009L18Rik,0610009O20Rik,⋯,n-R5s74,n-R5s8,n-R5s80,n-R5s85,n-R5s87,n-R5s88,n-R5s89,n-R5s92,n-R5s96,n-R5s98
donor_0_misharin_untreated_0,0,0,10.7381343,0,0,5.9506426,0,0,0.6931472,3.465736,⋯,0,0,0,0,0,0,0,0,0,0
donor_1_misharin_asbestos_0,0,0,7.742402,0,0,6.9314718,0,0,2.0794415,2.484907,⋯,0,0,0,0,0,0,0,0,0,0
donor_1_xie_untreated_0,0,0,1.7917595,0,0,0.6931472,0,0,0.0,1.386294,⋯,0,0,0,0,0,0,0,0,0,0
donor_2_xie_untreated_0,0,0,0.6931472,0,0,0.6931472,0,0,0.6931472,3.178054,⋯,0,0,0,0,0,0,0,0,0,0
donor_3_xie_untreated_0,0,0,1.0986123,0,0,0.6931472,0,0,1.3862944,0.0,⋯,0,0,0,0,0,0,0,0,0,0


In [16]:
typeof(assay(adata_AT1))

ERROR: Error in assay(adata_AT1): could not find function "assay"


## 1. Preparing some functions

### Separate function to fit an edgeR GLM:

In [7]:
fit_model <- function(adata_){
    # create an edgeR object with counts and grouping factor
    y <- DGEList(assay(adata_, "X"), group = colData(adata_)$condition)
    # filter out genes with low counts
    print("Dimensions before subsetting:")
    print(dim(y))
    print("")
    keep <- filterByExpr(y)
    y <- y[keep, , keep.lib.sizes=FALSE]
    print("Dimensions after subsetting:")
    print(dim(y))
    print("")
    # normalize
    y <- calcNormFactors(y)
    # create a vector that is concatentation of condition and cell type that we will later use with contrasts
    group <- paste0(colData(adata_)$condition, ".", colData(adata_)$manual_celltype_annotation)
    batch <- colData(adata_)$batch
    # create a design matrix: here we have multiple donors so also consider that in the design matrix
    design <- model.matrix(~ 0 + group + batch)
    # estimate dispersion
    y <- estimateDisp(y, design = design)
    # fit the model
    fit <- glmQLFit(y, design)
    return(list("fit"=fit, "design"=design, "y"=y))
}

## 2. Pseudobulk

### 2.1 One group

In [11]:
adata_AT1 <- adata_pb[adata_pb$obs["manual_celltype_annotation"] == "AT1"]
adata_AT1

“index contains duplicated values: row names not set”


View of AnnData object with n_obs × n_vars = 36 × 32317
    obs: 'condition', 'manual_celltype_annotation', 'batch', 'sample', 'author_annotation', 'lib_size', 'log_lib_size'
    uns: 'author_annotation_colors', 'batch_colors', 'condition_colors', 'log1p', 'manual_celltype_annotation_colors', 'pca', 'sample_colors'
    obsm: 'X_pca'
    varm: 'PCs'
    layers: 'counts'

In [12]:
sce_AT1 <- AnnData2SCE(
  adata_AT1,
  X_name = NULL, layers = TRUE, uns = TRUE, var = TRUE, obs = TRUE, varm = TRUE, obsm = TRUE, varp = TRUE, obsp = TRUE,
  raw = FALSE, skip_assays = FALSE, hdf5_backed = TRUE, verbose = NULL
)

“[1m[22mThe passed object is a 'AnnDataR6' object, conversion is likely to be less reliable”
“[1m[22mUnable to access items in [32mvarm[39m, attempting to convert the whole list. Access error message: [34m"'match'[39m
[34mrequires vector arguments"[39m”
“[1m[22mUnable to access items in [32mobsm[39m, attempting to convert the whole list. Access error message: [34m"'match'[39m
[34mrequires vector arguments"[39m”


In [14]:
rds_pb <- readRDS("/home/sch/schonner/MaPra/merged_data_old_pseudobulk_sce.rds", refhook = NULL)

“cannot open compressed file '/home/sch/schonner/MaPra/merged_data_old_pseudobulk_sce.rds', probable reason 'No such file or directory'”


ERROR: Error in gzfile(file, "rb"): cannot open the connection


In [25]:
rds_pb$obs

NULL

In [45]:
outs <- fit_model(rds_pb)

Repeated column names found in count matrix



[1] "Dimensions before subsetting:"
[1] 18031   134
[1] ""
[1] "Dimensions after subsetting:"
[1] 1321  134
[1] ""


ERROR: Error in glmFit.default(sely, design, offset = seloffset, dispersion = 0.05, : Design matrix not of full rank.  The following coefficients not estimable:
 batchmuc4656


In [10]:
outs <- fit_model(adata_pb)

ERROR: Error in (function (classes, fdef, mtable) : unable to find an inherited method for function ‘assay’ for signature ‘"AnnDataR6", "character"’


In [None]:
fit <- outs$fit
y <- outs$y

In [None]:
plotMDS(y, col=ifelse(y$samples$group == "stim", "red", "blue"))

In [None]:
plotBCV(y)

In [None]:
myContrast <- makeContrasts('groupstim.AT1-groupctrl.AT1', levels = y$design)
qlf <- glmQLFTest(fit, contrast=myContrast)
# get all of the DE genes and calculate Benjamini-Hochberg adjusted FDR
tt <- topTags(qlf, n = Inf)
tt <- tt$table

In [None]:
tr <- glmTreat(fit, contrast=myContrast, lfc=1.5)
print(head(topTags(tr)))

In [None]:
plotSmear(qlf, de.tags = rownames(tt)[which(tt$FDR<0.01)])

### 2.2 Multiple groups

In [11]:
outs <-fit_model(adata_pb)

ERROR: Error in assay(adata_, "X"): could not find function "assay"
