In [1]:
import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import sc_toolbox
import anndata

import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging

from rpy2.robjects import pandas2ri
from rpy2.robjects import r

sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

pandas2ri.activate()
anndata2ri.activate()

%load_ext rpy2.ipython

In [2]:
%%R
library(edgeR)
library(stringr)
library(EnhancedVolcano)

In [68]:
adata_pb = sc.read("/home/sch/schonner/MaPra/merged_data_for_diffEx_edgeR.h5ad")
adata_pb

AnnData object with n_obs × n_vars = 508 × 23767
    obs: 'dataset', 'batch', 'sample', 'condition', 'fibrotic/control', 'coarse_harmonized_anno', 'harmonized_anno', 'author_annotation_coarse', 'author_annotation', 'lib_size', 'log_lib_size'
    uns: 'author_annotation_coarse_colors', 'author_annotation_colors', 'batch_colors', 'coarse_harmonized_anno_colors', 'condition_colors', 'dataset_colors', 'fibrotic', 'harmonized_anno_colors', 'log1p', 'pca', 'sample_colors'
    obsm: 'X_pca'
    varm: 'PCs'
    layers: 'counts'

In [69]:
# Push the variable to the R kernel
%Rpush adata_pb  

In [4]:
%%R
fit_model <- function(adata_){
    # create an edgeR object with counts and grouping factor
    y <- DGEList(assay(adata_, "X"), group = colData(adata_)$condition)
    # filter out genes with low counts
    print("Dimensions before subsetting:")
    print(dim(y))
    print("")
    keep <- filterByExpr(y)   # keeps genes that have at least min.count (default: 10) reads in a worthwhile number samples
    y <- y[keep, , keep.lib.sizes=FALSE]
    print("Dimensions after subsetting:")
    print(dim(y))
    print("")
    # normalize
    y <- calcNormFactors(y)
    # create factors for the design matrix
    condition <- colData(adata_)$condition   # saline, asbestos, ...
    project <- colData(adata_)$dataset       # schiller, misharin, peyser, ...
    design <- model.matrix(~ 0 + condition + project)
    print(colnames(design))
    # estimate dispersion
    y <- estimateDisp(y, design = design)
    # fit the model
    fit <- glmQLFit(y, design)
    return(list("fit"=fit, "design"=design, "y"=y))
}

In [56]:
cell_types = adata_pb.obs["coarse_harmonized_anno"].cat.categories

# Push the variable to the R kernel
%Rpush cell_types   

In [54]:
adata_cts = list()
for ct in cell_types:
    adata_cts.append(adata_pb[adata_pb.obs["coarse_harmonized_anno"] == ct])

# Push the variable to the R kernel
%Rpush adata_cts   

IndexError: index 0 is out of bounds for axis 0 with size 0

In [66]:
%%R 
for (i in 1:length(cell_types)) {
    ct <- cell_types[[i]]
    print(ct)
    ct_print <- str_replace(ct, "_", " ")   # will be used for plot titles
    # 1. subset adata_pb to one cell type
    adata_ct <- adata_cts[[i]]
    print(adata_ct)
    # 2. fit the model
    outs <- fit_model(adata_=adata_ct)
    fit <- outs$fit
    y <- outs$y
}

[1] "Aerocytes"

Error in withVisible({ : object 'adata_cts' not found


RInterpreterError: Failed to parse and evaluate line 'for (i in 1:length(cell_types)) {\n    ct <- cell_types[[i]]\n    print(ct)\n    ct_print <- str_replace(ct, "_", " ")   # will be used for plot titles\n    # 1. subset adata_pb to one cell type\n    adata_ct <- adata_cts[[i]]\n    print(adata_ct)\n    # 2. fit the model\n    outs <- fit_model(adata_=adata_ct)\n    fit <- outs$fit\n    y <- outs$y\n}\n'.
R error message: "Error in withVisible({ : object 'adata_cts' not found"

In [71]:
%%R
for (ct in cell_types){
    ct_print <- str_replace(ct, "_", " ")   # will be used for plot titles
    # 1. subset adata_pb to one cell type
    #adata_ct <- adata_pb[adata_pb$obs["coarse_harmonized_anno"] == ct]
    #adata_ct <- adata_pb[, adata_pb$coarse_harmonized_anno == ct]
    adata_ct2 <- subset(adata_pb, , coarse_harmonized_anno==ct)
    # 2. fit the model
    print(adata_ct2)
    outs <- fit_model(adata_ct2)
    fit <- outs$fit
    y <- outs$y
}

class: SingleCellExperiment 
dim: 23767 16 
metadata(11): author_annotation_coarse_colors author_annotation_colors
  ... pca sample_colors
assays(2): X counts
rownames(23767): 0610005C13Rik 0610007N19Rik ... n-R5s2 n-R5s89
rowData names(0):
colnames(16): donor_0_misharin_untreated_0 donor_0_tsukui_untreated_0
  ... donor_6_xie_bleomycin_0 donor_7_tsukui_untreated_0
colData names(11): dataset batch ... lib_size log_lib_size
reducedDimNames(1): PCA
mainExpName: NULL
altExpNames(0):
[1] "Dimensions before subsetting:"
[1] 23767    16
[1] ""
[1] "Dimensions after subsetting:"
[1] 8009   16
[1] ""
[1] "conditionasbestos"  "conditionbleomycin" "conditionsaline"   
[4] "conditionuntreated" "projectpeyser"      "projectschiller"   
[7] "projecttsukui"      "projectxie"        

Error in glmFit.default(sely, design, offset = seloffset, dispersion = 0.05,  : 
  Design matrix not of full rank.  The following coefficients not estimable:
 conditionsaline projectpeyser projectschiller


RInterpreterError: Failed to parse and evaluate line 'for (ct in cell_types){\n    ct_print <- str_replace(ct, "_", " ")   # will be used for plot titles\n    # 1. subset adata_pb to one cell type\n    #adata_ct <- adata_pb[adata_pb$obs["coarse_harmonized_anno"] == ct]\n    #adata_ct <- adata_pb[, adata_pb$coarse_harmonized_anno == ct]\n    adata_ct2 <- subset(adata_pb, , coarse_harmonized_anno==ct)\n    # 2. fit the model\n    print(adata_ct2)\n    outs <- fit_model(adata_ct2)\n    fit <- outs$fit\n    y <- outs$y\n}\n'.
R error message: 'Error in glmFit.default(sely, design, offset = seloffset, dispersion = 0.05,  : \n  Design matrix not of full rank.  The following coefficients not estimable:\n conditionsaline projectpeyser projectschiller'