In [1]:
import scanpy as sc
import pandas as pd

# Read the two Tabula Muris AnnData files from disk: first the droplet
# dataset, then the FACS dataset (same order as the tuple below).
tm_droplet_data, tm_facs_data = (
    sc.read(h5ad_path)
    for h5ad_path in (
        r'./src/data/tabula_muris/TM_droplet.h5ad',
        r'./src/data/tabula_muris/TM_facs.h5ad',
    )
)

In [2]:
# Echo the AnnData summary (shape plus the obs/var column names) of the droplet dataset.
tm_droplet_data

AnnData object with n_obs × n_vars = 245389 × 20138
    obs: 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation'
    var: 'n_cells'

In [3]:
# A notebook cell only echoes the value of its LAST expression, so the two
# tissue listings are printed explicitly; otherwise their results would be
# computed and silently discarded.

# List all tissue types in tm_droplet_data
print("Tissues in tm_droplet_data:")
print(tm_droplet_data.obs["tissue"].unique())

# List all tissue types in tm_facs_data
print("Tissues in tm_facs_data:")
print(tm_facs_data.obs["tissue"].unique())

# List all cell types in tm_droplet_data (left as the cell's final expression,
# so it is still shown as the cell output)
tm_droplet_data.obs["cell_ontology_class"].unique()


['keratinocyte', 'basal cell of epidermis', 'Langerhans cell', 'fibroblast of cardiac tissue', 'leukocyte', ..., 'basal epithelial cell of tracheobronchial tree', 'chondrocyte', 'neuroendocrine cell', 'blood cell', 'smooth muscle cell of trachea']
Length: 123
Categories (123, object): ['B cell', 'CD4-positive, alpha-beta T cell', 'CD8-positive, alpha-beta T cell', 'DN3 thymocyte', ..., 'stromal cell', 'thymocyte', 'type II pneumocyte', 'vein endothelial cell']

In [4]:
# Drop every cell whose `cell_ontology_class` annotation is missing.
# `.copy()` turns the sliced view into an independent AnnData object.
droplet_is_annotated = tm_droplet_data.obs["cell_ontology_class"].notna()
tm_droplet_data = tm_droplet_data[droplet_is_annotated].copy()

facs_is_annotated = tm_facs_data.obs["cell_ontology_class"].notna()
tm_facs_data = tm_facs_data[facs_is_annotated].copy()

# Tag each dataset with a constant `tech` column so the technology of origin
# can still be told apart once the two objects are combined.
for adata, tech_label in ((tm_droplet_data, "10x"), (tm_facs_data, "SS2")):
    adata.obs["tech"] = tech_label

In [11]:
# Download the gene-length table: a space-separated text file without a header
# row. The first column becomes the index, so the remaining values end up in
# a column labelled `1`.
gene_len_url = "https://raw.githubusercontent.com/chenlingantelope/HarmonizationSCANVI/master/data/gene_len.txt"
gene_len = pd.read_csv(gene_len_url, sep=" ", header=None, index_col=0)

# Preview the first rows
gene_len.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
0610007C21Rik,94.571429
0610007L01Rik,156.0
0610007P08Rik,202.272727
0610007P14Rik,104.0
0610007P22Rik,158.75


In [13]:
import numpy as np
from scipy import sparse

# Rescale the FACS ("SS2") counts by gene length: every gene (column) is divided
# by its own length, multiplied by the median gene length, and rounded back to
# whole numbers.
# NOTE(review): presumably this compensates for a gene-length bias of the FACS
# protocol so that its counts are comparable to the droplet data -- confirm.

# Align the length table with the FACS genes and drop genes of unknown length.
gene_len = gene_len.reindex(tm_facs_data.var.index).dropna()
if gene_len.empty:
    raise ValueError("No gene in tm_facs_data has an entry in gene_len.")

tm_facs_data = tm_facs_data[:, gene_len.index].copy()  # break the view

gene_len_vec = gene_len[1].values.astype(np.float32)
if not np.all(gene_len_vec > 0):
    # A zero/negative length would silently turn counts into inf or flip signs.
    raise ValueError("gene_len contains non-positive gene lengths.")
median_len = np.median(gene_len_vec)

# The rescaling below is NOT idempotent: running this cell a second time would
# divide the already rescaled counts again. A flag stored in `.uns` records
# that the rescaling has been applied, so a re-run leaves X untouched.
if tm_facs_data.uns.get("gene_len_normalized", False):
    print("tm_facs_data.X is already gene-length normalized; skipping rescaling.")
else:
    # Accepts both sparse and dense X; the result is always sparse.
    X = sparse.csr_matrix(tm_facs_data.X)  # (n_cells x n_genes)
    X = X.multiply(1.0 / gene_len_vec)  # divide each column by its gene length
    X = X.multiply(median_len)  # multiply by the median length
    X.data = np.rint(X.data)  # round only the stored entries

    X = X.tocsr()  # store back as CSR
    # Entries that rounded down to 0 are still stored explicitly; remove them
    # so the number of stored entries matches the number of non-zero counts.
    X.eliminate_zeros()

    tm_facs_data.X = X
    tm_facs_data.uns["gene_len_normalized"] = True

In [14]:
import scanpy as sc
import pandas as pd

# Stack the droplet and FACS cells into a single AnnData object.
# NOTE(review): recent anndata releases flag `AnnData.concatenate` as deprecated
# in favour of `anndata.concat`. The call is kept as-is because `concat` has
# different defaults for merging obs/var columns -- migrate deliberately.
tm_adata = tm_droplet_data.concatenate(tm_facs_data)

# Stash an untouched copy of the count matrix before X is transformed.
tm_adata.layers["counts"] = tm_adata.X.copy()

# Library-size normalisation (every cell scaled to 1e4 total) followed by log1p.
sc.pp.normalize_total(tm_adata, target_sum=1e4)
sc.pp.log1p(tm_adata)

# Snapshot the object with all genes before the gene subset is applied below.
tm_adata.raw = tm_adata

# Pick the top 2000 highly variable genes from the stored counts layer,
# computed per technology, and restrict the object to those genes.
sc.pp.highly_variable_genes(
    tm_adata,
    layer="counts",
    flavor="seurat_v3",
    batch_key="tech",
    n_top_genes=2000,
    subset=True,
)

  tm_adata = tm_droplet_data.concatenate(tm_facs_data)


In [9]:
# Echo the summary of the combined AnnData object (shape, obs/var columns, uns keys, layers).
tm_adata

AnnData object with n_obs × n_vars = 356213 × 2000
    obs: 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation', 'tech', 'FACS.selection', 'n_counts', 'batch'
    var: 'n_cells-0', 'n_cells-1', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'log1p', 'hvg'
    layers: 'counts'

In [15]:
import os
import pickle

# Make sure the output folder is present (no error if it already exists).
os.makedirs(r'./src/data/dann', exist_ok=True)

# Serialise the combined AnnData object to disk with pickle.
# NOTE(review): only load this file from a trusted location; unpickling
# executes arbitrary code.
with open(r'./src/data/dann/all_cell_data.pkl', 'wb') as pkl_file:
    pickle.dump(tm_adata, pkl_file)