In [1]:
library(anndata)
library(edgeR)
library(stringr)
library(EnhancedVolcano)

Loading required package: limma

Loading required package: ggplot2

Loading required package: ggrepel



In [2]:
fit_model <- function(adata_){
  # create an edgeR object with counts and grouping factor
  y <- DGEList(assay(adata_, "X"), group = colData(adata_)$condition)
  # filter out genes with low counts
  print("Dimensions before subsetting:")
  print(dim(y))
  print("")
  keep <- filterByExpr(y)   # keeps genes that have at least min.count (default: 10) reads in a worthwhile number samples
  y <- y[keep, , keep.lib.sizes=FALSE]
  print("Dimensions after subsetting:")
  print(dim(y))
  print("")
  # normalize
  y <- calcNormFactors(y)
  # create factors for the design matrix
  condition <- colData(adata_)$condition   # saline, asbestos, ...
  project <- colData(adata_)$dataset       # schiller, misharin, peyser, ...
  design <- model.matrix(~ 0 + condition + project)
  print(colnames(design))   # printing columns from the design matrix
  # estimate dispersion
  y <- estimateDisp(y, design = design)
  # fit the model
  fit <- glmQLFit(y, design)
  return(list("fit"=fit, "design"=design, "y"=y))
}

In [3]:
get_tt <- function(y, fit){
  # create contrast matrix
  contrasts <- list()
  for (colname in colnames(y$design)){
    if (startsWith(colname, "condition")){
      contrasts <- append(contrasts, colname)
    }
  }
  contrast_tmp <- paste(contrasts,collapse = '-')
  myContrast <- makeContrasts(contrast_tmp, levels = y$design)
  qlf <- glmQLFTest(fit, contrast=myContrast)
  # get all of the DE genes and calculate Benjamini-Hochberg adjusted FDR
  tt <- topTags(qlf, n = Inf)
  tt <- tt$table
  # 'normalize' logFc
  mean_FC <- mean(tt$logFC)
  tt$logFC_shifted <- (tt$logFC - mean_FC)
  # TODO: save tt as dataframe ?
  return(tt)
}

In [4]:
# define paths
path_adata <- "/home/sch/schonner/MaPra/merged_data_for_diffEx_edgeR.h5ad"
path_out <- "/home/sch/schonner/MaPra/test"

In [7]:
# read in the pseudobulk object
adata_pb <- read_h5ad(path_adata)
print(paste0("Dimensions of the annData object: ", dim(adata_pb)[1], " ", dim(adata_pb)[2]))

[1] "Dimensions of the annData object: 508 23767"


In [12]:
adata_pb

AnnData object with n_obs × n_vars = 508 × 23767
    obs: 'dataset', 'batch', 'sample', 'condition', 'fibrotic/control', 'coarse_harmonized_anno', 'harmonized_anno', 'author_annotation_coarse', 'author_annotation', 'lib_size', 'log_lib_size'
    uns: 'author_annotation_coarse_colors', 'author_annotation_colors', 'batch_colors', 'coarse_harmonized_anno_colors', 'condition_colors', 'dataset_colors', 'fibrotic', 'harmonized_anno_colors', 'log1p', 'pca', 'sample_colors'
    obsm: 'X_pca'
    varm: 'PCs'
    layers: 'counts'

In [21]:
cell_types <- unique(adata_pb$obs$coarse_harmonized_anno)
for (ct in cell_types){
    print(ct)
  ct_print <- str_replace(ct, "_", " ")   # will be used for plot titles
  # 1. subset adata_pb to one cell type
  #adata_ct <- subset(adata_pb, coarse_harmonized_anno==ct)
  adata_ct <- adata_pb[adata_pb$obs["coarse_harmonized_anno"] == ct]
  # 2. fit the model
  outs <- fit_model(adata_=adata_ct)
  fit <- outs$fit
  y <- outs$y
  # 3. plot overviews
  plot_overview(cell_type=ct_print, y=y, path_out=path_out)
  # 4. create table with differentially expressed genes
  tt <- get_tt(y=y, git=fit)
  # 5. Vulcano plot - of original and 'normalized' logFC

  # 6. heatmap
}
  


“index contains duplicated values: row names not set”


[1] "Aerocytes"


“index contains duplicated values: row names not set”


ERROR: Error in assay(adata_, "X"): could not find function "assay"
