In [2]:
import scanpy as sc
import numpy as np
import pandas as pd

In [2]:
# Read the heart dataset from its local .h5ad file.
heart_h5ad_path = "datasets/combined/9d5eb472-3657-4035-8aea-d3053934e120.h5ad"
adata_heart = sc.read_h5ad(heart_h5ad_path)

In [5]:
# Shape of the expression matrix stored in .X.
adata_heart.X.shape

(704296, 31832)

In [7]:
# Display the per-feature annotation table (feature name, biotype, length, type, ...).
adata_heart.var

Unnamed: 0,feature_is_filtered,feature_name,feature_reference,feature_biotype,feature_length,feature_type
ENSG00000243485,False,MIR1302-2HG,NCBITaxon:9606,gene,517,lncRNA
ENSG00000237613,False,FAM138A,NCBITaxon:9606,gene,1015,lncRNA
ENSG00000186092,False,OR4F5,NCBITaxon:9606,gene,2618,protein_coding
ENSG00000239945,False,ENSG00000239945,NCBITaxon:9606,gene,1319,lncRNA
ENSG00000239906,False,ENSG00000239906,NCBITaxon:9606,gene,323,lncRNA
...,...,...,...,...,...,...
ENSG00000277856,False,ENSG00000277856,NCBITaxon:9606,gene,294,protein_coding
ENSG00000275063,False,ENSG00000275063,NCBITaxon:9606,gene,351,protein_coding
ENSG00000271254,False,ENSG00000271254,NCBITaxon:9606,gene,2922,protein_coding
ENSG00000277475,False,ENSG00000277475,NCBITaxon:9606,gene,831,protein_coding


In [6]:
# Count observations by whether their assay label is "10x multiome".
adata_heart.obs["assay"].eq("10x multiome").value_counts()

assay
False    493236
True     211060
Name: count, dtype: int64

In [7]:
# Count observations by whether their suspension type is "cell".
adata_heart.obs["suspension_type"].eq("cell").value_counts()

suspension_type
False    568598
True     135698
Name: count, dtype: int64

In [10]:
# List the categories of the categorical cell_type annotation.
adata_heart.obs["cell_type"].cat.categories

Index(['regular atrial cardiac myocyte', 'regular ventricular cardiac myocyte',
       'fibroblast', 'endothelial cell',
       'endothelial cell of lymphatic vessel', 'mural cell',
       'mesothelial cell', 'neural cell', 'adipocyte', 'myeloid cell',
       'lymphocyte', 'mast cell'],
      dtype='object')

In [None]:
# Per-cell-type tallies restricted to observations whose assay is "10x multiome".
mask_multiome = adata_heart.obs["assay"].astype(str) == "10x multiome"
celltype_counts_multiome = adata_heart.obs["cell_type"][mask_multiome].value_counts()

celltype_counts_multiome

cell_type
fibroblast                              63789
regular ventricular cardiac myocyte     47782
endothelial cell                        26288
myeloid cell                            24857
regular atrial cardiac myocyte          19509
mural cell                              14794
lymphocyte                               7403
neural cell                              2680
adipocyte                                2502
mast cell                                 625
endothelial cell of lymphatic vessel      618
mesothelial cell                          213
Name: count, dtype: int64

In [14]:
# Per-cell-type tallies restricted to observations whose suspension type is "cell".
mask_cell = adata_heart.obs["suspension_type"].astype(str) == "cell"
celltype_counts_cell = adata_heart.obs["cell_type"][mask_cell].value_counts()

celltype_counts_cell

cell_type
endothelial cell                        80398
mural cell                              22641
myeloid cell                            14702
lymphocyte                              11945
fibroblast                               3163
regular ventricular cardiac myocyte      1098
regular atrial cardiac myocyte            826
neural cell                               497
endothelial cell of lymphatic vessel      231
mesothelial cell                          195
adipocyte                                   1
mast cell                                   1
Name: count, dtype: int64

In [None]:
# Side-by-side cell-type counts: "10x multiome" assay vs. "cell" suspension ("sc").
# A cell type missing from one side is filled with 0 so both columns stay integer.
df = pd.concat(
    [celltype_counts_multiome, celltype_counts_cell],
    axis=1,
    keys=["10x multiome", "sc"],
)
df = df.fillna(0).astype(int)

df

Unnamed: 0_level_0,10x multiome,sc
cell_type,Unnamed: 1_level_1,Unnamed: 2_level_1
fibroblast,63789,3163
regular ventricular cardiac myocyte,47782,1098
endothelial cell,26288,80398
myeloid cell,24857,14702
regular atrial cardiac myocyte,19509,826
mural cell,14794,22641
lymphocyte,7403,11945
neural cell,2680,497
adipocyte,2502,1
mast cell,625,1


## Healthy human liver: integrated
Link: https://datasets.cellxgene.cziscience.com/16f4799d-fe7d-4394-b41b-14e7a93d10a5.h5ad  

In [3]:
# Read the integrated liver dataset from its local .h5ad file.
liver_h5ad_path = "datasets/combined/16f4799d-fe7d-4394-b41b-14e7a93d10a5.h5ad"
adata_liver = sc.read_h5ad(liver_h5ad_path)

In [6]:
# Number of observations per suspension type in the liver dataset.
adata_liver.obs["suspension_type"].value_counts()

suspension_type
nucleus    43863
cell       29432
Name: count, dtype: int64

In [5]:
# Number of observations per cell_type label, before any relabeling.
adata_liver.obs["cell_type"].value_counts()

cell_type
periportal region hepatocyte                        23509
centrilobular region hepatocyte                     15819
midzonal region hepatocyte                          11059
endothelial cell of pericentral hepatic sinusoid     6376
natural killer cell                                  3283
hepatic stellate cell                                3240
Kupffer cell                                         3044
cholangiocyte                                        1934
B cell                                               1780
inflammatory macrophage                              1627
blood vessel endothelial cell                        1558
erythroid lineage cell                                 66
Name: count, dtype: int64

In [None]:
# Collapse the fine-grained liver cell_type labels into broader macro groups.
# Keys are the labels present in obs["cell_type"]; values are the macro group
# each one is folded into.
macro_map = {
    "periportal region hepatocyte": "hepatocyte",
    "midzonal region hepatocyte": "hepatocyte",
    "centrilobular region hepatocyte": "hepatocyte",

    "endothelial cell of pericentral hepatic sinusoid": "endothelial",
    "blood vessel endothelial cell": "endothelial",

    "hepatic stellate cell": "hepatic stellate cell",

    "Kupffer cell": "myeloid/macrophage",
    "inflammatory macrophage": "myeloid/macrophage",

    "natural killer cell": "lymphoid",
    "B cell": "lymphoid",

    "cholangiocyte": "epithelial",
    "erythroid lineage cell": "erythroid",
}

ct_col = "cell_type"

# This cell overwrites obs["cell_type"] in place, so on a re-run its own output
# becomes its input. With macro_map alone, every already-collapsed label (except
# the self-mapped "hepatic stellate cell") would miss the lookup and be turned
# into "other". Identity entries for the macro labels and for the "other"
# fallback make a second execution a no-op; the first execution is unchanged.
relabel_map = {macro_label: macro_label for macro_label in macro_map.values()}
relabel_map["other"] = "other"
relabel_map.update(macro_map)  # fine-grained entries take precedence

# Labels found in neither mapping fall back to "other".
adata_liver.obs["cell_type"] = adata_liver.obs[ct_col].map(relabel_map).fillna("other")

In [20]:
# Label distribution after collapsing cell_type into macro groups.
adata_liver.obs["cell_type"].value_counts()

cell_type
hepatocyte               50387
endothelial               7934
lymphoid                  5063
myeloid/macrophage        4671
hepatic stellate cell     3240
epithelial                1934
erythroid                   66
Name: count, dtype: int64

In [23]:
# Macro groups to drop from the liver dataset.
to_remove = ["epithelial", "erythroid"]

# The macro labels live in obs["cell_type"] (written by the relabeling cell
# above). No cell in this notebook creates a "macro_cell_type" column, so
# reading it only worked through stale kernel state and raises KeyError on a
# fresh Restart & Run All.
mask = ~adata_liver.obs["cell_type"].isin(to_remove)

# .copy() materializes the subset instead of keeping a view of adata_liver.
adata_liver_filt = adata_liver[mask].copy()

In [24]:
# Label distribution after removing the groups listed in to_remove.
adata_liver_filt.obs["cell_type"].value_counts()

cell_type
hepatocyte               50387
endothelial               7934
lymphoid                  5063
myeloid/macrophage        4671
hepatic stellate cell     3240
Name: count, dtype: int64

In [29]:
# Cell-type counts within the "cell" suspension
mask_cell = adata_liver_filt.obs["suspension_type"].astype(str) == "cell"
celltype_counts_cell = adata_liver_filt.obs["cell_type"][mask_cell].value_counts()

# Cell-type counts within the "nucleus" suspension
mask_nucleus = adata_liver_filt.obs["suspension_type"].astype(str) == "nucleus"
celltype_counts_nucleus = adata_liver_filt.obs["cell_type"][mask_nucleus].value_counts()

# One column per suspension type; a cell type missing from one side becomes 0.
df = pd.concat(
    [celltype_counts_cell, celltype_counts_nucleus],
    axis=1,
    keys=["cell", "nucleus"],
)
df = df.fillna(0).astype(int)

# Per cell type, the smaller of the two counts.
df["min(cell,nucleus)"] = df.loc[:, ["cell", "nucleus"]].min(axis="columns")

df

Unnamed: 0_level_0,cell,nucleus,"min(cell,nucleus)"
cell_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
hepatocyte,19811,30576,19811
lymphoid,3240,1823,1823
endothelial,2615,5319,2615
myeloid/macrophage,2447,2224,2224
hepatic stellate cell,795,2445,795


In [37]:
# Downsample so that, within every cell type, the "cell" and "nucleus"
# suspensions contribute the same number of observations (the smaller of the
# two group sizes). numpy/pandas come from the import cell at the top.
rng = np.random.default_rng(0)  # fixed seed so the subsample is reproducible

obs = adata_liver_filt.obs
susp = obs["suspension_type"].astype(str)
ct = obs["cell_type"].astype(str)

# One array of sampled observation names per (cell type, suspension) group.
sampled_names = []

for c in ct.unique():
    idx_cell = obs.index[(ct == c) & (susp == "cell")]
    idx_nuc = obs.index[(ct == c) & (susp == "nucleus")]

    m = min(len(idx_cell), len(idx_nuc))
    if m == 0:
        continue  # skip cell types missing one suspension type

    # Sample without replacement so no observation is selected twice.
    sampled_names.append(rng.choice(idx_cell, size=m, replace=False))
    sampled_names.append(rng.choice(idx_nuc, size=m, replace=False))

if not sampled_names:
    # np.concatenate([]) would fail with an unhelpful message; say what happened.
    raise ValueError("no cell type has both 'cell' and 'nucleus' observations to balance")

keep = np.concatenate(sampled_names)

# Rows follow the order of `keep` (grouped by cell type, "cell" before "nucleus").
adata_balanced = adata_liver_filt[keep].copy()
adata_balanced.obs["suspension_type"].value_counts(), adata_balanced

(suspension_type
 cell       27268
 nucleus    27268
 Name: count, dtype: int64,
 AnnData object with n_obs × n_vars = 54536 × 32596
     obs: 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'sample_uuid', 'sample_preservation_method', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'suspension_uuid', 'suspension_type', 'library_uuid', 'assay_ontology_term_id', 'mapped_reference_annotation', 'is_primary_data', 'cell_type_ontology_term_id', 'author_cell_type', 'disease_ontology_term_id', 'sex_ontology_term_id', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'Phase', 'sample', 'tissue_type', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'macro_cell_type'
     var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
     uns: 'citation', 'default_embedding', 'organism', 'organism_ontology_term_id', 'schema_reference', 'schema_version',

In [39]:
# Cell-type counts within the "cell" suspension of the balanced data
mask_cell = adata_balanced.obs["suspension_type"].astype(str) == "cell"
celltype_counts_cell = adata_balanced.obs["cell_type"][mask_cell].value_counts()

# Cell-type counts within the "nucleus" suspension of the balanced data
mask_nucleus = adata_balanced.obs["suspension_type"].astype(str) == "nucleus"
celltype_counts_nucleus = adata_balanced.obs["cell_type"][mask_nucleus].value_counts()

# One column per suspension type; a cell type missing from one side becomes 0.
df_balanced = pd.concat(
    [celltype_counts_cell, celltype_counts_nucleus],
    axis=1,
    keys=["cell", "nucleus"],
)
df_balanced = df_balanced.fillna(0).astype(int)

df_balanced

Unnamed: 0_level_0,cell,nucleus
cell_type,Unnamed: 1_level_1,Unnamed: 2_level_1
hepatocyte,19811,19811
endothelial,2615,2615
myeloid/macrophage,2224,2224
lymphoid,1823,1823
hepatic stellate cell,795,795


In [47]:
# Boolean selectors for each suspension type in the balanced data
mask_cell = adata_balanced.obs["suspension_type"].astype(str) == "cell"
mask_nucleus = adata_balanced.obs["suspension_type"].astype(str) == "nucleus"

# Materialize an independent copy of each suspension-specific subset
adata_filt_cell = adata_balanced[mask_cell].copy()
adata_filt_nucleus = adata_balanced[mask_nucleus].copy()

In [48]:
# Persist each suspension-specific subset as a gzip-compressed .h5ad file.
for subset, out_path in (
    (adata_filt_cell, "datasets/combined/liver_cell_filtered.h5ad"),
    (adata_filt_nucleus, "datasets/combined/liver_nucleus_filtered.h5ad"),
):
    subset.write_h5ad(out_path, compression="gzip")