In [7]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import tempfile
import os

sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()

sc.settings.set_figure_params(dpi=80)
%matplotlib inline

-----
anndata     0.10.5.post1
scanpy      1.9.8
-----
PIL                         10.2.0
anyio                       NA
argcomplete                 NA
arrow                       1.3.0
asttokens                   NA
attr                        23.2.0
attrs                       23.2.0
babel                       2.14.0
certifi                     2024.02.02
cffi                        1.16.0
charset_normalizer          3.3.2
comm                        0.2.2
cycler                      0.12.1
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.8.1
decorator                   5.1.1
defusedxml                  0.7.1
executing                   2.0.1
fastjsonschema              NA
fqdn                        NA
h5py                        3.10.0
idna                        3.6
ipykernel                   6.29.3
isoduration                 NA
jedi                        0.19.1
jinja2                      3.1.3
joblib                      1.3.2
js

In [8]:
save_dir = tempfile.TemporaryDirectory()
adata_path = os.path.join(save_dir.name, "mouse-human_pancreas_subset10000.h5ad")
adata = sc.read(
    adata_path,
    backup_url="https://github.com/theislab/cross_system_integration/raw/main/tutorials/data/mouse-human_pancreas_subset10000.h5ad",
)
adata

try downloading from url
https://github.com/theislab/cross_system_integration/raw/main/tutorials/data/mouse-human_pancreas_subset10000.h5ad
... this may take a while but only happens once


  0%|          | 0.00/38.2M [00:00<?, ?B/s]



AnnData object with n_obs × n_vars = 10000 × 1768
    obs: 'batch', 'mm_study', 'mm_sex', 'mm_age', 'mm_study_sample_design', 'mm_hc_gene_programs_parsed', 'mm_leiden_r1.5_parsed', 'cell_type_eval', 'system', 'hs_Sex', 'hs_Diabetes Status', 'leiden_system'
    var: 'gs_mm', 'gs_hs'
    obsm: 'X_pca_system'
    layers: 'counts'

In [9]:
adata2 = sc.AnnData(X=adata.raw.X, var=adata.raw.var, obs = adata.obs)

sc.pp.normalize_per_cell(adata2, counts_per_cell_after=1e4)
sc.pp.log1p(adata2)

AttributeError: 'NoneType' object has no attribute 'X'

In [None]:
#variable genes for the full dataset
sc.pp.highly_variable_genes(adata2, min_mean=0.0125, max_mean=3, min_disp=0.5)

sc.pl.highly_variable_genes(adata2)

print("Highly variable genes: %d"%sum(adata2.var.highly_variable))

var_genes_all = adata2.var.highly_variable

In [None]:
sc.pp.highly_variable_genes(adata2, min_mean=0.0125, max_mean=3, min_disp=0.5, batch_key = 'library')

print("Highly variable genes intersection: %d"%sum(adata2.var.highly_variable_intersection))

print("Number of batches where gene is variable:")
print(adata2.var.highly_variable_nbatches.value_counts())

var_genes_batch = adata2.var.highly_variable_nbatches > 0

In [None]:
print("Any batch var genes: %d"%sum(var_genes_batch))
print("All data var genes: %d"%sum(var_genes_all))
print("Overlap: %d"%sum(var_genes_batch & var_genes_all))
print("Variable genes in all batches: %d"%sum(adata2.var.highly_variable_nbatches ==3))
print("Overlap batch instersection and all: %d"%sum(var_genes_all & adata2.var.highly_variable_intersection))

In [None]:
var_select = adata2.var.highly_variable_nbatches > 1
var_genes = var_select.index[var_select]
len(var_genes)

In [None]:
# split per batch into new objects.
batches = adata2.obs['library'].unique()
alldata = {}
for batch in batches:
    alldata[batch] = adata2[adata2.obs['library'] == batch,]

alldata    

In [None]:
batches

In [None]:
cdata = sc.external.pp.mnn_correct(alldata['LX049_LX050_an_127'],alldata['LX051_LX052_an_128'],alldata['LX053_LX054_an_100'],
                                   alldata['LX065_LX066_an_155'], alldata['LX067_LX068_an_156'], alldata['LX069_LX070_an_157'],
                                   alldata['LX071_LX072_an_132'], alldata['LX074_LX075_an_159'], alldata['LX078_LX079_an_161'],
                                   alldata['LX080_LX081_an_162'], alldata['LX093_LX094_an_163'], alldata['LX095_LX096_an_164'],
                                   alldata['LX097_LX098_an_165'], alldata['LX099_LX100_an_166'], alldata['LX101_LX102_an_167'],
                                   alldata['LX103_LX104_an_168'], alldata['LX183_LX184_an_322'], alldata['LX185_LX186_an_323'],
                                   alldata['LX187_LX188_an_324'], alldata['LX189_LX190_an_325'], alldata['LX290_LX291_an_423'],
                                   alldata['LX347_LX348_an_595'], alldata['LX379_LX380_an_596'], alldata['LX381_LX382_an_597'],
                                   alldata['LX383_LX384_an_598'], svd_dim = 50, batch_key = 'library', 
                                   batch_categories = batches, save_raw = True, var_subset = var_genes)