In [15]:
import scanpy as sc

tm_droplet_data = sc.read(
    r'../data/tabula_muris/TM_droplet.h5ad',
)
tm_facs_data = sc.read(
    r'../data/tabula_muris/TM_facs.h5ad',
)

In [16]:
# Filter only for cells with valid cell ontology class
tm_droplet_data = tm_droplet_data[
    (~tm_droplet_data.obs.cell_ontology_class.isna())
].copy()
tm_facs_data = tm_facs_data[
    (~tm_facs_data.obs.cell_ontology_class.isna())
].copy()

# Add technology labels
tm_droplet_data.obs["tech"] = "10x"
tm_facs_data.obs["tech"] = "SS2"

In [17]:
import pandas as pd

gene_len = pd.read_csv(
    "https://raw.githubusercontent.com/chenlingantelope/HarmonizationSCANVI/master/data/gene_len.txt",
    delimiter=" ",
    header=None,
    index_col=0,
)
gene_len.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
0610007C21Rik,94.571429
0610007L01Rik,156.0
0610007P08Rik,202.272727
0610007P14Rik,104.0
0610007P22Rik,158.75


In [18]:
import numpy as np
from scipy import sparse

gene_len = gene_len.reindex(tm_facs_data.var.index).dropna()

tm_facs_data = tm_facs_data[:, gene_len.index].copy() # break the view

gene_len_vec = gene_len[1].values.astype(np.float32)
median_len = np.median(gene_len_vec)

# column‑wise scaling in CSC format
X = tm_facs_data.X.tocsc(copy=True) # -> (n_cells × n_genes)
X = X.multiply(1.0 / gene_len_vec) # divide each column by its length
X = X.multiply(median_len) # multiply by the median length
X.data = np.rint(X.data) # round only the non‑zero entries

tm_facs_data.X = X.tocsr() # store back as CSR (Scanpy’s default)

In [19]:
import anndata as ad
tm_adata = ad.concat([tm_droplet_data, tm_facs_data])

In [27]:
print(tm_adata.X.shape)

(356213, 18244)


In [22]:
# Print the number of occurrences of each cell_ontology_class
cell_ontology_class_counts = tm_adata.obs['cell_ontology_class'].value_counts()
print(cell_ontology_class_counts)


cell_ontology_class
B cell                      36349
basal cell of epidermis     15645
granulocyte                 15562
endothelial cell            14822
mesenchymal stem cell       14466
                            ...  
lymphoid progenitor cell       13
lung neuroendocrine cell        8
type I pneumocyte               4
kidney cell                     3
duct epithelial cell            2
Name: count, Length: 155, dtype: int64


In [5]:
import anndata as ad

tm_adata = ad.concat([tm_droplet_data, tm_facs_data])[0:2]
print(tm_adata.shape)
tm_adata.layers["counts"] = tm_adata.X.copy()
sc.pp.normalize_total(tm_adata, target_sum=1e4)
sc.pp.log1p(tm_adata)
tm_adata.raw = tm_adata  # keep full dimension safe
sc.pp.highly_variable_genes(
    tm_adata,
    flavor="seurat_v3",
    n_top_genes=2000,
    layer="counts",
    batch_key="tech",
    subset=True,
)

(2, 18244)


  tm_adata.layers["counts"] = tm_adata.X.copy()


: 

In [None]:
tm_adata.obs['cell_ontology_class'].replace(
    to_replace='pancreatic ductal cel',
    value='pancreatic ductal cell',
    inplace=True
)

In [None]:
tm_droplet_data_tissues = set(tm_droplet_data.obs.tissue)
tm_facs_data_tissues = set(tm_facs_data.obs.tissue)
tm_all_tissues = tm_droplet_data_tissues | tm_facs_data_tissues

In [None]:
test_tissues={'Skin', 'Liver', 'Limb_Muscle', 'Pancreas'}
train_tissues = tm_all_tissues.difference(test_tissues)

In [None]:
tm_adata_train = tm_adata[
    tm_adata.obs['tissue'].isin(train_tissues)
]
tm_adata_test = tm_adata[
    tm_adata.obs['tissue'].isin(test_tissues)
]

In [None]:
import pickle
import os

os.makedirs(r'./src/data/tabula_muris/preprocessed', exist_ok=True)

with open(r'./src/data/tabula_muris/preprocessed/tm_adata_train.pkl', 'wb') as f: # NOTE: be careful of where this actually is LOL
    pickle.dump(tm_adata_train, f)

with open(r'./src/data/tabula_muris/preprocessed/tm_adata_test.pkl', 'wb') as f:
    pickle.dump(tm_adata_test, f)