In [1]:
from io import StringIO
import os

import scanpy
import pandas as pd

# Make minimal obs

## Read adata with no duplicates

In [2]:
# Input: droplet raw object with duplicate barcodes per sequencing run already removed.
# The path is split over two adjacent literals only for readability; the value is unchanged.
h5ad = (
    "/home/olga/data_lg/data_sm_copy/czb-tabula-muris-senis/Data-objects/"
    "tabula-muris-senis-droplet-official-raw-obj--no-duplicate-barcodes-per-seq-run.h5ad"
)
mouse_no_duplicates = scanpy.read(h5ad)
mouse_no_duplicates

AnnData object with n_obs × n_vars = 238915 × 20138 
    obs: 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation', 'species', 'species_latin', 'channel', 'channel_tissue', 'sequencing_run', 'cell_barcode', 'n_barcodes'
    var: 'n_cells'

## Add "individual" prefixed with "mouse"

In [3]:
# Map every mouse.id to a label "mouse_<n>_<mouse.id>", where <n> is the 1-based
# position of its (age, mouse.id) group in groupby iteration order.
id_to_individual = {
    mouse_id: f'mouse_{number}_{mouse_id}'
    for number, ((_age, mouse_id), _cells) in enumerate(
        mouse_no_duplicates.obs.groupby(['age', 'mouse.id']), start=1
    )
}
id_to_individual

{'1-M-62': 'mouse_1_1-M-62',
 '1-M-63': 'mouse_2_1-M-63',
 '3-F-56': 'mouse_3_3-F-56',
 '3-F-57': 'mouse_4_3-F-57',
 '3-M-5/6': 'mouse_5_3-M-5/6',
 '3-M-7/8': 'mouse_6_3-M-7/8',
 '3-M-8': 'mouse_7_3-M-8',
 '3-M-8/9': 'mouse_8_3-M-8/9',
 '3-M-9': 'mouse_9_3-M-9',
 '18-F-50': 'mouse_10_18-F-50',
 '18-F-51': 'mouse_11_18-F-51',
 '18-M-52': 'mouse_12_18-M-52',
 '18-M-53': 'mouse_13_18-M-53',
 '21-F-54': 'mouse_14_21-F-54',
 '21-F-55': 'mouse_15_21-F-55',
 '24-M-58': 'mouse_16_24-M-58',
 '24-M-59': 'mouse_17_24-M-59',
 '24-M-60': 'mouse_18_24-M-60',
 '24-M-61': 'mouse_19_24-M-61',
 '30-M-2': 'mouse_20_30-M-2',
 '30-M-3': 'mouse_21_30-M-3',
 '30-M-4': 'mouse_22_30-M-4',
 '30-M-5': 'mouse_23_30-M-5'}

In [4]:
# All generated individual labels, in dictionary insertion order.
[*id_to_individual.values()]

['mouse_1_1-M-62',
 'mouse_2_1-M-63',
 'mouse_3_3-F-56',
 'mouse_4_3-F-57',
 'mouse_5_3-M-5/6',
 'mouse_6_3-M-7/8',
 'mouse_7_3-M-8',
 'mouse_8_3-M-8/9',
 'mouse_9_3-M-9',
 'mouse_10_18-F-50',
 'mouse_11_18-F-51',
 'mouse_12_18-M-52',
 'mouse_13_18-M-53',
 'mouse_14_21-F-54',
 'mouse_15_21-F-55',
 'mouse_16_24-M-58',
 'mouse_17_24-M-59',
 'mouse_18_24-M-60',
 'mouse_19_24-M-61',
 'mouse_20_30-M-2',
 'mouse_21_30-M-3',
 'mouse_22_30-M-4',
 'mouse_23_30-M-5']

In [5]:
# Translate each cell's mouse.id into its "mouse_<n>_<id>" label and store it in obs.
mouse_no_duplicates.obs['individual'] = (
    mouse_no_duplicates.obs['mouse.id'].map(id_to_individual)
)
# Number of cells per individual.
mouse_no_duplicates.obs['individual'].value_counts()

mouse_20_30-M-2     24369
mouse_15_21-F-55    18359
mouse_12_18-M-52    17441
mouse_14_21-F-54    16285
mouse_2_1-M-63      15653
mouse_3_3-F-56      14787
mouse_23_30-M-5     12484
mouse_16_24-M-58    11883
mouse_17_24-M-59    11838
mouse_10_18-F-50    11808
mouse_11_18-F-51     9571
mouse_1_1-M-62       9500
mouse_18_24-M-60     9497
mouse_21_30-M-3      9286
mouse_4_3-F-57       8233
mouse_22_30-M-4      7768
mouse_7_3-M-8        6189
mouse_6_3-M-7/8      5235
mouse_9_3-M-9        4897
mouse_5_3-M-5/6      4880
mouse_13_18-M-53     4837
mouse_19_24-M-61     3665
mouse_8_3-M-8/9       450
Name: individual, dtype: int64

## Subset to only minimal obs

In [6]:
# Build the minimal obs table: drop the columns that are not needed, then rename
# the two columns whose names change. All remaining columns keep their names and
# their original order.
#
# Dropping explicitly replaces the earlier trick of renaming unwanted columns to
# None and filtering on `columns.notnull()`, which temporarily created several
# columns sharing the label None.
# errors="ignore" keeps the previous tolerance for a column that is already absent.
obs_minimal = mouse_no_duplicates.obs.drop(
    columns=["channel_tissue", "method", "mouse.id", "tissue_free_annotation"],
    errors="ignore",
).rename(
    columns={
        "cell": "cell_id",
        # NOTE(review): n_barcodes is exposed as n_counts; the displayed values are
        # all 1, so confirm this really is the intended "counts" column.
        "n_barcodes": "n_counts",
    }
)
obs_minimal.head()

Unnamed: 0_level_0,age,cell_id,cell_ontology_class,cell_ontology_id,free_annotation,n_genes,sex,subtissue,tissue,species,species_latin,channel,sequencing_run,cell_barcode,n_counts,individual
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
AAACCTGCAGTAAGCG-1-0-0-0,24m,MACA_24m_M_TONGUE_60_AAACCTGCAGTAAGCG,keratinocyte,,suprabasal,3481.0,male,,Tongue,Mouse,Mus musculus,MACA_24m_M_TONGUE_60,171103_A00111_0082_BH523JDMXX,AAACCTGCAGTAAGCG,1,mouse_18_24-M-60
AAACCTGTCATTATCC-1-0-0-0,24m,MACA_24m_M_TONGUE_60_AAACCTGTCATTATCC,keratinocyte,,suprabasal,2599.0,male,,Tongue,Mouse,Mus musculus,MACA_24m_M_TONGUE_60,171103_A00111_0082_BH523JDMXX,AAACCTGTCATTATCC,1,mouse_18_24-M-60
AAACGGGGTACAGTGG-1-0-0-0,24m,MACA_24m_M_TONGUE_60_AAACGGGGTACAGTGG,keratinocyte,,suprabasal differentiating,3468.0,male,,Tongue,Mouse,Mus musculus,MACA_24m_M_TONGUE_60,171103_A00111_0082_BH523JDMXX,AAACGGGGTACAGTGG,1,mouse_18_24-M-60
AAACGGGGTCTTCTCG-1-0-0-0,24m,MACA_24m_M_TONGUE_60_AAACGGGGTCTTCTCG,keratinocyte,,suprabasal differentiating,3189.0,male,,Tongue,Mouse,Mus musculus,MACA_24m_M_TONGUE_60,171103_A00111_0082_BH523JDMXX,AAACGGGGTCTTCTCG,1,mouse_18_24-M-60
AAAGATGAGCTATGCT-1-0-0-0,24m,MACA_24m_M_TONGUE_60_AAAGATGAGCTATGCT,keratinocyte,,suprabasal,3419.0,male,,Tongue,Mouse,Mus musculus,MACA_24m_M_TONGUE_60,171103_A00111_0082_BH523JDMXX,AAAGATGAGCTATGCT,1,mouse_18_24-M-60


## Add unified cell type groups

In [7]:
# Project-local helper that provides the unified cell type tables.
import unified_annotations

# Table indexed by the Tabula Muris Senis cell_ontology_class labels for Muscle,
# giving the unified narrow / broad / compartment group of each label.
muscle_grouping = unified_annotations.get_celltype_converter(
    'Muscle',
    ("Mouse", "Tabula Muris Senis", "cell_ontology_class"),
)
print(muscle_grouping.shape)
muscle_grouping

(9, 4)


Unnamed: 0_level_0,narrow_group,broad_group,compartment_group,tissue
"(Mouse, Tabula Muris Senis, cell_ontology_class)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B cell,B cell,B cell,lymphoid,Muscle
T cell,T cell,T cell,lymphoid,Muscle
macrophage,macrophage,macrophage,myeloid,Muscle
endothelial cell,endothelial cell,endothelial cell,endothelial,Muscle
skeletal muscle satellite cell,skeletal muscle satellite stem cell,skeletal muscle satellite stem cell,stromal,Muscle
skeletal muscle cell,skeletal muscle cell,skeletal muscle cell,stromal,Muscle
smooth muscle cell,pericyte cell_smooth muscle cell,pericyte cell_smooth muscle cell,stromal,Muscle
mesenchymal stem cell,mesenchymal cell,mesenchymal cell,stromal,Muscle
Schwann cell,Schwann cell,Schwann cell,neural,Muscle



### Join minimal obs with muscle grouping

In [8]:
# Left-join the muscle group columns onto every cell via its cell_ontology_class.
# The converter's own 'tissue' column is dropped so it does not collide with obs.
# NOTE(review): the join is not restricted to muscle cells, so a cell from any
# tissue whose cell_ontology_class appears in the muscle table receives the
# muscle groups — confirm this is intended.
obs_muscle_joined = obs_minimal.merge(
    muscle_grouping.drop(columns='tissue'),
    left_on="cell_ontology_class",
    right_index=True,
    how="left",
)
print(obs_muscle_joined.shape)
obs_muscle_joined.head()

(238915, 19)


Unnamed: 0_level_0,age,cell_id,cell_ontology_class,cell_ontology_id,free_annotation,n_genes,sex,subtissue,tissue,species,species_latin,channel,sequencing_run,cell_barcode,n_counts,individual,narrow_group,broad_group,compartment_group
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
AAACCTGCAGTAAGCG-1-0-0-0,24m,MACA_24m_M_TONGUE_60_AAACCTGCAGTAAGCG,keratinocyte,,suprabasal,3481.0,male,,Tongue,Mouse,Mus musculus,MACA_24m_M_TONGUE_60,171103_A00111_0082_BH523JDMXX,AAACCTGCAGTAAGCG,1,mouse_18_24-M-60,,,
AAACCTGTCATTATCC-1-0-0-0,24m,MACA_24m_M_TONGUE_60_AAACCTGTCATTATCC,keratinocyte,,suprabasal,2599.0,male,,Tongue,Mouse,Mus musculus,MACA_24m_M_TONGUE_60,171103_A00111_0082_BH523JDMXX,AAACCTGTCATTATCC,1,mouse_18_24-M-60,,,
AAACGGGGTACAGTGG-1-0-0-0,24m,MACA_24m_M_TONGUE_60_AAACGGGGTACAGTGG,keratinocyte,,suprabasal differentiating,3468.0,male,,Tongue,Mouse,Mus musculus,MACA_24m_M_TONGUE_60,171103_A00111_0082_BH523JDMXX,AAACGGGGTACAGTGG,1,mouse_18_24-M-60,,,
AAACGGGGTCTTCTCG-1-0-0-0,24m,MACA_24m_M_TONGUE_60_AAACGGGGTCTTCTCG,keratinocyte,,suprabasal differentiating,3189.0,male,,Tongue,Mouse,Mus musculus,MACA_24m_M_TONGUE_60,171103_A00111_0082_BH523JDMXX,AAACGGGGTCTTCTCG,1,mouse_18_24-M-60,,,
AAAGATGAGCTATGCT-1-0-0-0,24m,MACA_24m_M_TONGUE_60_AAAGATGAGCTATGCT,keratinocyte,,suprabasal,3419.0,male,,Tongue,Mouse,Mus musculus,MACA_24m_M_TONGUE_60,171103_A00111_0082_BH523JDMXX,AAAGATGAGCTATGCT,1,mouse_18_24-M-60,,,


## Get lung cell type grouping

In [9]:
# Lung table is keyed by free_annotation (not cell_ontology_class); the 'tissue'
# column is removed right away because only the three group columns are used.
lung_grouping = unified_annotations.get_celltype_converter(
    'Lung',
    ("Mouse", "Tabula Muris Senis", "free_annotation"),
).drop(columns='tissue')
# The data labels B cells as plain 'B', so add a 'B' row duplicating 'B cell'.
lung_grouping.loc['B', :] = lung_grouping.loc['B cell']
print(lung_grouping.shape)
lung_grouping.tail(20)

(40, 3)


Unnamed: 0_level_0,narrow_group,broad_group,compartment_group
"(Mouse, Tabula Muris Senis, free_annotation)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ly6g5b+ T,LY6G5B+ T cell,T cell,lymphoid
Proliferating NK,proliferating natural killer cell_T cell,proliferating natural killer cell_T cell,lymphoid
Proliferating T,proliferating natural killer cell_T cell,proliferating natural killer cell_T cell,lymphoid
Natural Killer,natural killer cell,natural killer cell,lymphoid
Natural Killer T,natural killer T cell,natural killer T cell,lymphoid
Myeloid Dendritic Type 1,conventional dendritic cell,dendritic cell,myeloid
Myeloid Dendritic Type 2,conventional dendritic cell,dendritic cell,myeloid
Plasmacytoid Dendritic,plasmacytoid dendritic cell,dendritic cell,myeloid
Ccr7+ Dendritic,CCR7+ dendritic cell,dendritic cell,myeloid
Proliferating Dendritic,proliferating dendritic cell,dendritic cell,myeloid


## Apply lung grouping to lung cells

In [10]:
# Lung cells only. An explicit copy is taken so the column assignment below
# modifies this subset and never a view of obs_muscle_joined.
obs_muscle_joined_lung = obs_muscle_joined.query('tissue == "Lung"').copy()
# Drop annotation categories that do not occur in Lung, so that a later groupby
# on free_annotation only yields annotations that are really present.
# The result is assigned back instead of using `inplace=True`, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
obs_muscle_joined_lung['free_annotation'] = (
    obs_muscle_joined_lung['free_annotation'].cat.remove_unused_categories()
)
obs_muscle_joined_lung.free_annotation.value_counts()

Classical Monocyte                   5269
Proliferating Classical Monocyte     2393
Capillary                            1743
Intermediate Monocyte                1686
Alveolar Fibroblast                  1499
Alveolar Macrophage                  1217
Natural Killer                       1141
Interstitial Macrophage              1115
B                                    1052
Nonclassical Monocyte                 971
CD8+ T                                854
Neutrophil                            543
CD4+ T                                534
Adventitial Fibroblast                517
Capillary Aerocyte                    512
Natural Killer T                      406
Zbtb32+ B                             405
Vein                                  306
Myofibroblast                         217
Myeloid Dendritic Type 1              161
Ly6g5b+ T                             154
Basophil                              128
Alveolar Epithelial Type 2            122
Regulatory T                      

In [11]:
# Alphabetical list of the free_annotation labels present in Lung.
sorted(obs_muscle_joined_lung['free_annotation'].unique())

['Adventitial Fibroblast',
 'Airway Smooth Muscle',
 'Alveolar Epithelial Type 2',
 'Alveolar Fibroblast',
 'Alveolar Macrophage',
 'Artery',
 'B',
 'Basophil',
 'CD4+ T',
 'CD8+ T',
 'Capillary',
 'Capillary Aerocyte',
 'Ccr7+ Dendritic',
 'Ciliated',
 'Classical Monocyte',
 'Club',
 'Intermediate Monocyte',
 'Interstitial Macrophage',
 'Ly6g5b+ T',
 'Lympatic',
 'Myeloid Dendritic Type 1',
 'Myeloid Dendritic Type 2',
 'Myofibroblast',
 'Natural Killer',
 'Natural Killer T',
 'Neuroendocrine',
 'Neutrophil',
 'Nonclassical Monocyte',
 'Pericyte',
 'Plasma',
 'Plasmacytoid Dendritic',
 'Proliferating Alveolar Macrophage',
 'Proliferating Classical Monocyte',
 'Proliferating Dendritic',
 'Proliferating NK',
 'Proliferating T',
 'Regulatory T',
 'Vein',
 'Zbtb32+ B']

In [12]:
# Display the lung free_annotation -> group table.
lung_grouping

Unnamed: 0_level_0,narrow_group,broad_group,compartment_group
"(Mouse, Tabula Muris Senis, free_annotation)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adventitial Fibroblast,adventitial fibroblast,fibroblast,stromal
Alveolar Fibroblast,alveolar fibroblast,fibroblast,stromal
Airway Smooth Muscle,airway associated smooth muscle cell,airway associated smooth muscle cell,stromal
Myofibroblast,myofibroblast cell,myofibroblast cell,stromal
Pericyte,pericyte cell,pericyte cell,stromal
Capillary,capillary cell,capillary cell,endothelial
Capillary Aerocyte,capillary aerocyte cell,capillary cell,endothelial
Vein,vein cell,vein cell,endothelial
Artery,artery cell,artery cell,endothelial
Lympatic,lymphatic cell,lymphatic cell,endothelial


### Set narrow group, broad group as strings

In [27]:
# Cast the three group columns to str. Missing values become the literal text
# "nan", which the fill loop in the next cell uses as its placeholder.
# NOTE(review): rows that are never filled afterwards keep the string "nan"
# rather than a real missing value — confirm that is acceptable downstream.
obs_muscle_joined[lung_grouping.columns] = (
    obs_muscle_joined.loc[:, lung_grouping.columns].astype(str)
)

In [28]:
# For every lung annotation, fill the still-empty ("nan") group values of its
# cells with the groups from lung_grouping. Values that are already set are kept.
for annotation, annotation_cells in obs_muscle_joined_lung.groupby("free_annotation"):
    cell_ids = annotation_cells.index
    # One (column name, group value) pair per group column for this annotation.
    for column, replacement in lung_grouping.loc[annotation].items():
        current_values = obs_muscle_joined.loc[cell_ids, column]
        filled_values = current_values.replace({"nan": replacement})
        obs_muscle_joined.loc[cell_ids, column] = filled_values.astype(str)

### Make sure narrow group and broad group actually got replaced

In [29]:
# Cells per tissue, for reference when reading the per-tissue checks below.
obs_muscle_joined['tissue'].value_counts()

Marrow             39125
Spleen             34853
Limb_Muscle        28176
Lung               23802
Kidney             21038
Tongue             20271
Mammary_Gland      11954
Thymus              9082
Bladder             8752
Heart_and_Aorta     8253
Trachea             7901
Liver               7052
Fat                 6534
Pancreas            5969
Skin                4308
Large_Intestine     1845
Name: tissue, dtype: int64

In [30]:
# Narrow groups assigned to Limb_Muscle cells.
obs_muscle_joined.query('tissue == "Limb_Muscle"')['narrow_group'].value_counts()

mesenchymal cell                       12712
endothelial cell                        6738
macrophage                              2415
skeletal muscle satellite stem cell     2018
B cell                                  1472
T cell                                  1234
pericyte cell_smooth muscle cell        1129
Schwann cell                             270
skeletal muscle cell                     188
Name: narrow_group, dtype: int64

In [31]:
# Broad groups assigned to Lung cells.
obs_muscle_joined.query('tissue == "Lung"')['broad_group'].value_counts()

monocyte                                    10319
macrophage                                   2434
capillary cell                               2255
fibroblast                                   2016
T cell                                       1752
B cell                                       1457
natural killer cell                          1141
neutrophil                                    543
natural killer T cell                         406
dendritic cell                                354
vein cell                                     306
myofibroblast cell                            217
basophil                                      128
alveolar epithelial cell type 2               122
artery cell                                    95
pericyte cell                                  59
ciliated cell                                  55
plasma cell                                    47
lymphatic cell                                 40
proliferating natural killer cell_T cell       28


In [32]:
# Compartment groups assigned to Lung cells.
obs_muscle_joined.query('tissue == "Lung"')['compartment_group'].value_counts()

myeloid        13778
lymphoid        4831
endothelial     2696
stromal         2305
epithelial       188
neural             4
Name: compartment_group, dtype: int64

## Add compartment group for Bladder

In [33]:
# Compartment for each Bladder cell_ontology_class.
# NOTE(review): no visible cell applies this mapping to obs_muscle_joined, and
# 'immune' does not appear among the compartment values shown for Muscle or
# Lung (lymphoid / myeloid are used there) — confirm both points.
bladder_compartment = {
    'bladder urothelial cell': 'epithelial',
    'bladder cell': 'stromal',
    'endothelial cell': 'endothelial',
    'leukocyte': 'immune',
}

# Cell ontology classes present in Bladder, to compare against the keys above.
obs_muscle_joined.query("tissue == 'Bladder'")['cell_ontology_class'].unique()

[bladder urothelial cell, bladder cell, endothelial cell, leukocyte]
Categories (4, object): [bladder urothelial cell, bladder cell, endothelial cell, leukocyte]

In [34]:
# Scratch template: a dict with the four Bladder cell_ontology_class labels as
# keys and None as every value. The result is only displayed, not assigned.
dict.fromkeys(['bladder urothelial cell', 'bladder cell', 'endothelial cell', 'leukocyte'])

{'bladder urothelial cell': None,
 'bladder cell': None,
 'endothelial cell': None,
 'leukocyte': None}

## Add new obs

In [21]:
# Work on a copy so mouse_no_duplicates keeps its original obs table, then
# replace the copy's obs with the minimal table that carries the group columns.
# NOTE(review): this cell's execution count (21) is lower than that of the
# grouping cells above (27-34); re-run the notebook top to bottom before
# trusting the written file.
mouse_no_duplicates_new_obs = mouse_no_duplicates.copy()
mouse_no_duplicates_new_obs.obs = obs_muscle_joined

# Write mouse adata with minimal obs and new grouping

In [22]:
# Output location: same Data-objects folder as the input, with a suffix that
# records the minimal obs and the unified cell type columns.
output_folder = "/home/olga/data_lg/data_sm_copy/czb-tabula-muris-senis/Data-objects/"
output_basename = "tabula-muris-senis-droplet-official-raw-obj--no-duplicate-barcodes-per-seq-run--minimal-obs-unified-celltypes.h5ad"
h5ad = os.path.join(output_folder, output_basename)
mouse_no_duplicates_new_obs.write(h5ad)

... storing 'cell_ontology_class' as categorical
... storing 'narrow_group' as categorical
... storing 'broad_group' as categorical
... storing 'compartment_group' as categorical


In [23]:
# Cells per individual in the object that was written.
mouse_no_duplicates_new_obs.obs['individual'].value_counts()

mouse_20_30-M-2     24369
mouse_15_21-F-55    18359
mouse_12_18-M-52    17441
mouse_14_21-F-54    16285
mouse_2_1-M-63      15653
mouse_3_3-F-56      14787
mouse_23_30-M-5     12484
mouse_16_24-M-58    11883
mouse_17_24-M-59    11838
mouse_10_18-F-50    11808
mouse_11_18-F-51     9571
mouse_1_1-M-62       9500
mouse_18_24-M-60     9497
mouse_21_30-M-3      9286
mouse_4_3-F-57       8233
mouse_22_30-M-4      7768
mouse_7_3-M-8        6189
mouse_6_3-M-7/8      5235
mouse_9_3-M-9        4897
mouse_5_3-M-5/6      4880
mouse_13_18-M-53     4837
mouse_19_24-M-61     3665
mouse_8_3-M-8/9       450
Name: individual, dtype: int64

In [38]:
def check_celltype_presence(adata, celltypes=('alveolar fibroblast', 'skeletal muscle cell'), celltype_col='narrow_group'):
    """Verify that every expected cell type label occurs in an obs column.

    Parameters
    ----------
    adata
        Object whose ``obs`` supports ``obs[celltype_col]`` and yields the
        per-cell labels when iterated.
    celltypes
        Labels that must each occur at least once.
    celltype_col
        Name of the ``obs`` column to look in.

    Raises
    ------
    AssertionError
        If one or more labels are absent. The message lists all missing
        labels and the column that was checked.
    """
    # Build the lookup set once instead of once per expected label.
    observed = set(adata.obs[celltype_col])
    missing = [celltype for celltype in celltypes if celltype not in observed]
    # Raised explicitly (not via `assert`) so the check still runs under `python -O`.
    if missing:
        raise AssertionError(
            f"Cell types missing from obs[{celltype_col!r}]: {missing}"
        )

# Sanity check with the function defaults: one lung label and one muscle label
# must both be present in narrow_group.
check_celltype_presence(mouse_no_duplicates_new_obs)