In [None]:
import scanpy as sc
import pandas as pd

# Load the two Tabula Muris datasets (droplet and FACS) from local .h5ad files.
droplet_h5ad = r'./data/tabula_muris/TM_droplet.h5ad'
facs_h5ad = r'./data/tabula_muris/TM_facs.h5ad'

tm_droplet_data = sc.read(droplet_h5ad)
tm_facs_data = sc.read(facs_h5ad)

In [None]:
# Show the AnnData summary (dimensions plus obs/var field names) of the droplet dataset.
tm_droplet_data

AnnData object with n_obs × n_vars = 245389 × 20138
    obs: 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation'
    var: 'n_cells'

In [None]:
# Inspect the annotation vocabularies of both datasets.
# A notebook cell only renders its LAST bare expression, so every listing is
# passed to display() explicitly; otherwise the first two results would be
# computed and silently thrown away.
# (display is made available by the IPython/Jupyter kernel.)

# List all tissue types in tm_droplet_data
display(tm_droplet_data.obs["tissue"].unique())
# List all tissue types in tm_facs_data
display(tm_facs_data.obs["tissue"].unique())
# List all cell types in tm_droplet_data
display(tm_droplet_data.obs["cell_ontology_class"].unique())


In [3]:
def _female_marrow_annotated(adata):
    """Return a copy of `adata` restricted to annotated female marrow cells.

    A cell is kept when its `tissue` is "Marrow", its `cell_ontology_class`
    is not missing, and its `sex` is "female".
    """
    obs = adata.obs
    keep = (
        (obs["tissue"] == "Marrow")
        & obs["cell_ontology_class"].notna()
        & (obs["sex"] == "female")
    )
    return adata[keep].copy()


# Apply the same selection to both datasets.
tm_droplet_data = _female_marrow_annotated(tm_droplet_data)
tm_facs_data = _female_marrow_annotated(tm_facs_data)

# Tag every cell with the technology label of the dataset it came from.
tm_droplet_data.obs["tech"] = "10x"
tm_facs_data.obs["tech"] = "SS2"

In [4]:
# Gene-length table: space-separated text without a header row; the first
# column (gene name) becomes the index, leaving the lengths in column 1.
gene_len_url = "https://raw.githubusercontent.com/chenlingantelope/HarmonizationSCANVI/master/data/gene_len.txt"
gene_len = pd.read_csv(gene_len_url, sep=" ", header=None, index_col=0)
gene_len.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
0610007C21Rik,94.571429
0610007L01Rik,156.0
0610007P08Rik,202.272727
0610007P14Rik,104.0
0610007P22Rik,158.75


In [None]:
import numpy as np

# Rescale the FACS ("SS2") matrix by gene length: every gene's values are
# divided by that gene's length, multiplied by the median length, and then
# rounded to the nearest integer.
# NOTE(review): this cell rebinds gene_len and tm_facs_data, so running it a
# second time rescales the already-rescaled matrix; run it once per kernel.

# Align the length table to the FACS gene order and drop genes without a length.
gene_len = gene_len.reindex(tm_facs_data.var.index).dropna()

# .copy() turns the column subset into an independent AnnData object, so the
# assignment to .X below does not write into a view (same pattern as the
# tissue/sex filtering cell, which also copies after subsetting).
tm_facs_data = tm_facs_data[:, gene_len.index].copy()
assert (tm_facs_data.var.index == gene_len.index).all(), (
    "FACS genes and the gene-length table are not aligned"
)

# Column 1 holds the per-gene lengths; broadcast them across the gene axis.
gene_lengths = gene_len[1].values
tm_facs_data.X = np.rint(tm_facs_data.X / gene_lengths * np.median(gene_lengths))

In [6]:
# Stack the droplet and FACS cells into a single AnnData object.
# NOTE(review): the recorded output of this cell echoes this line as a warning;
# check whether AnnData.concatenate is deprecated in the installed anndata
# version before upgrading.
tm_adata = tm_droplet_data.concatenate(tm_facs_data)

# Keep an untouched copy of the matrix in a layer before X is transformed.
tm_adata.layers["counts"] = tm_adata.X.copy()

# Normalise each cell to a total of 1e4, then log1p-transform X.
sc.pp.normalize_total(tm_adata, target_sum=1e4)
sc.pp.log1p(tm_adata)

# Snapshot the full-gene object in .raw before the gene subset below.
tm_adata.raw = tm_adata

# Select the top 2000 highly variable genes from the "counts" layer, using
# "tech" as the batch key, and subset the object to those genes.
hvg_settings = dict(
    flavor="seurat_v3",
    n_top_genes=2000,
    layer="counts",
    batch_key="tech",
    subset=True,
)
sc.pp.highly_variable_genes(tm_adata, **hvg_settings)

  tm_adata = tm_droplet_data.concatenate(tm_facs_data)


In [7]:
# Show the AnnData summary of the combined, gene-subset object.
tm_adata

AnnData object with n_obs × n_vars = 15907 × 2000
    obs: 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation', 'tech', 'FACS.selection', 'n_counts', 'batch'
    var: 'n_cells-0', 'n_cells-1', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'log1p', 'hvg'
    layers: 'counts'

In [10]:
import pickle
from pathlib import Path

# Persist the preprocessed AnnData object as a pickle file.
# NOTE(review): pickle files should only be loaded from trusted sources, and
# may not load under different library versions; verify against the consumers.
output_path = Path(r'./data/tabula_muris/preprocessed/tm_data_marrow_female.pkl')

# Create the target folder first so a fresh checkout (where "preprocessed/"
# does not exist yet) does not fail with FileNotFoundError.
output_path.parent.mkdir(parents=True, exist_ok=True)

with output_path.open('wb') as f:
    pickle.dump(tm_adata, f)