In [1]:
from io import StringIO
import os

import numpy as np
import scanpy as sc
import pandas as pd


outdir_gdrive = "/home/olga/googledrive/TabulaMicrocebus/data/cross-species"
outdir_local = "/home/olga/data_lg/data_sm_copy/tabula-microcebus/data-objects/cross-species"
outdirs = outdir_gdrive, outdir_local

# Make minimal obs

Goal of this notebook:

1. Read the original Tabula Muris Senis object
2. Replace the muscle data with newly annotated cell types from Camille and Antoine in `free_annotation`
3. Unify annotations to `narrow_group`, `broad_group`, `compartment_group`

## Read adata with no duplicates

In [44]:
h5ad = os.path.join(
    "/home/olga/data_lg/data_sm_copy/czb-tabula-muris-senis/Data-objects/",
    "tabula-muris-senis-droplet-official-raw-obj--no-duplicate-barcodes-per-seq-run--minimal-obs-unified-celltypes.h5ad",
) 
adata = sc.read(h5ad)
adata

AnnData object with n_obs × n_vars = 238915 × 20138
    obs: 'age', 'cell_id', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'n_genes', 'sex', 'subtissue', 'tissue', 'species', 'species_latin', 'channel', 'sequencing_run', 'cell_barcode', 'n_counts', 'individual', 'narrow_group', 'broad_group', 'compartment_group'
    var: 'n_cells'

In [46]:
adata.raw

In [49]:
set(adata.obs.tissue)

{'Bladder',
 'Fat',
 'Heart_and_Aorta',
 'Kidney',
 'Large_Intestine',
 'Limb_Muscle',
 'Liver',
 'Lung',
 'Mammary_Gland',
 'Marrow',
 'Pancreas',
 'Skin',
 'Spleen',
 'Thymus',
 'Tongue',
 'Trachea'}

In [51]:
adata_no_muscle = adata[adata.obs.tissue != 'Limb_Muscle']
adata_no_muscle

View of AnnData object with n_obs × n_vars = 210739 × 20138
    obs: 'age', 'cell_id', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'n_genes', 'sex', 'subtissue', 'tissue', 'species', 'species_latin', 'channel', 'sequencing_run', 'cell_barcode', 'n_counts', 'individual', 'narrow_group', 'broad_group', 'compartment_group'
    var: 'n_cells'

In [47]:
adata[:5, :5].to_df()

index,Xkr4,Rp1,Sox17,Mrpl15,Lypla1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AAACCTGCAGTAAGCG-1-0-0-0,0.0,0.0,0.0,0.0,1.0
AAACCTGTCATTATCC-1-0-0-0,0.0,0.0,0.0,1.0,0.0
AAACGGGGTACAGTGG-1-0-0-0,0.0,0.0,0.0,2.0,0.0
AAACGGGGTCTTCTCG-1-0-0-0,0.0,0.0,0.0,2.0,1.0
AAAGATGAGCTATGCT-1-0-0-0,0.0,0.0,0.0,2.0,0.0


## Read in re-annotated mouse data

In [3]:
%%time

# h5ad = "/home/olga/googledrive/TabulaMicrocebus/data/cross-species/TMS_limb_muscle_droplet_updated_annotations/tabula-muris-senis-droplet-processed-official-annotations-Limb_Muscle_updated-annotations.h5ad"
h5ad = os.path.join(
    "/home/olga/googledrive/TabulaMicrocebus/data/cross-species/unified_annotations/",
    "tabula-muris-senis_10X_Limb_Muscle_updated-annotations.h5ad",
)

muscle = sc.read(h5ad)
muscle

AnnData object with n_obs × n_vars = 28867 × 20138
    obs: 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation', 'n_counts', 'louvain', 'leiden', 'TMS_muscle_ADM_free_annotation', 'TMS_muscle_CE_compartment', 'TMS_muscle_CE_free_annotation'
    var: 'n_cells', 'means', 'dispersions', 'dispersions_norm', 'highly_variable'
    uns: 'TMS_muscle_CE_compartment_colors', 'TMS_muscle_CE_free_annotation_colors', 'age_colors', 'cell_ontology_class_colors', 'leiden', 'louvain', 'neighbors', 'pca'
    obsm: 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [48]:
adata.obs.columns.intersection(muscle.obs.columns)

Index(['age', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation',
       'n_genes', 'sex', 'subtissue', 'tissue', 'n_counts'],
      dtype='object')

In [37]:
muscle.raw

<anndata._core.raw.Raw at 0x7f04ecad8278>

In [19]:
muscle.obs.head()

Unnamed: 0_level_0,age,cell,cell_ontology_class,cell_ontology_id,free_annotation,method,mouse.id,n_genes,sex,subtissue,tissue,tissue_free_annotation,n_counts,louvain,leiden,TMS_muscle_ADM_free_annotation,TMS_muscle_CE_compartment,TMS_muscle_CE_free_annotation
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
AAACCTGAGAAACCGC-1-14-0-0,18m,MACA_18m_F_MUSCLE_50_pre_sort_AAACCTGAGAAACCGC,macrophage,,,droplet,18-F-50,2506.0,female,Pre-Sort,Limb_Muscle,Limb_Muscle,7596.0,10,19,macrophage,myeloid,classical monocyte
AAACCTGCAATAACGA-1-14-0-0,18m,MACA_18m_F_MUSCLE_50_pre_sort_AAACCTGCAATAACGA,endothelial cell,,,droplet,18-F-50,1361.0,female,Pre-Sort,Limb_Muscle,Limb_Muscle,3351.0,3,0,capillary endothelial cell,endothelial,capillary cell
AAACCTGCAGCGTAAG-1-14-0-0,18m,MACA_18m_F_MUSCLE_50_pre_sort_AAACCTGCAGCGTAAG,mesenchymal stem cell,,chondrocyte-like,droplet,18-F-50,1482.0,female,Pre-Sort,Limb_Muscle,Limb_Muscle,4630.0,6,7,tendon cell,stromal,tendon cell
AAACCTGGTAGCTTGT-1-14-0-0,18m,MACA_18m_F_MUSCLE_50_pre_sort_AAACCTGGTAGCTTGT,mesenchymal stem cell,,chondrocyte-like,droplet,18-F-50,1652.0,female,Pre-Sort,Limb_Muscle,Limb_Muscle,3838.0,16,17,mesenchymal stem cell,stromal,fibroadipogenic progenitor cell
AAAGCAAGTCTGGTCG-1-14-0-0,18m,MACA_18m_F_MUSCLE_50_pre_sort_AAAGCAAGTCTGGTCG,mesenchymal stem cell,,,droplet,18-F-50,1069.0,female,Pre-Sort,Limb_Muscle,Limb_Muscle,2576.0,1,4,mesenchymal stem cell,stromal,fibroadipogenic progenitor cell


In [20]:
muscle.obs.TMS_muscle_CE_free_annotation.value_counts()

fibroadipogenic progenitor cell        8858
capillary cell                         3688
tendon cell                            2529
skeletal muscle satellite stem cell    1958
B cell                                 1251
                                       ... 
plasmacytoid dendritic cell              12
B cell (PF ZBTB32+)                       9
unknown_epithelial_stromal (KRT5+)        7
T cell (PF CD4- CD8+)                     6
slow muscle cell                          5
Name: TMS_muscle_CE_free_annotation, Length: 61, dtype: int64

## Set the raw expression matrix as the `.X`

In [21]:
muscle.X = muscle.raw.X

In [22]:
muscle.obs.head()

Unnamed: 0_level_0,age,cell,cell_ontology_class,cell_ontology_id,free_annotation,method,mouse.id,n_genes,sex,subtissue,tissue,tissue_free_annotation,n_counts,louvain,leiden,TMS_muscle_ADM_free_annotation,TMS_muscle_CE_compartment,TMS_muscle_CE_free_annotation
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
AAACCTGAGAAACCGC-1-14-0-0,18m,MACA_18m_F_MUSCLE_50_pre_sort_AAACCTGAGAAACCGC,macrophage,,,droplet,18-F-50,2506.0,female,Pre-Sort,Limb_Muscle,Limb_Muscle,7596.0,10,19,macrophage,myeloid,classical monocyte
AAACCTGCAATAACGA-1-14-0-0,18m,MACA_18m_F_MUSCLE_50_pre_sort_AAACCTGCAATAACGA,endothelial cell,,,droplet,18-F-50,1361.0,female,Pre-Sort,Limb_Muscle,Limb_Muscle,3351.0,3,0,capillary endothelial cell,endothelial,capillary cell
AAACCTGCAGCGTAAG-1-14-0-0,18m,MACA_18m_F_MUSCLE_50_pre_sort_AAACCTGCAGCGTAAG,mesenchymal stem cell,,chondrocyte-like,droplet,18-F-50,1482.0,female,Pre-Sort,Limb_Muscle,Limb_Muscle,4630.0,6,7,tendon cell,stromal,tendon cell
AAACCTGGTAGCTTGT-1-14-0-0,18m,MACA_18m_F_MUSCLE_50_pre_sort_AAACCTGGTAGCTTGT,mesenchymal stem cell,,chondrocyte-like,droplet,18-F-50,1652.0,female,Pre-Sort,Limb_Muscle,Limb_Muscle,3838.0,16,17,mesenchymal stem cell,stromal,fibroadipogenic progenitor cell
AAAGCAAGTCTGGTCG-1-14-0-0,18m,MACA_18m_F_MUSCLE_50_pre_sort_AAAGCAAGTCTGGTCG,mesenchymal stem cell,,,droplet,18-F-50,1069.0,female,Pre-Sort,Limb_Muscle,Limb_Muscle,2576.0,1,4,mesenchymal stem cell,stromal,fibroadipogenic progenitor cell


In [23]:
muscle[:5, :5].to_df()

index,Xkr4,Rp1,Sox17,Mrpl15,Lypla1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AAACCTGAGAAACCGC-1-14-0-0,0.0,0.0,0.0,0.0,2.0
AAACCTGCAATAACGA-1-14-0-0,0.0,0.0,5.0,0.0,0.0
AAACCTGCAGCGTAAG-1-14-0-0,0.0,0.0,0.0,0.0,0.0
AAACCTGGTAGCTTGT-1-14-0-0,0.0,0.0,0.0,0.0,0.0
AAAGCAAGTCTGGTCG-1-14-0-0,0.0,0.0,0.0,0.0,0.0


In [27]:
muscle.obs.TMS_muscle_CE_free_annotation.value_counts()

fibroadipogenic progenitor cell        8858
capillary cell                         3688
tendon cell                            2529
skeletal muscle satellite stem cell    1958
B cell                                 1251
                                       ... 
plasmacytoid dendritic cell              12
B cell (PF ZBTB32+)                       9
unknown_epithelial_stromal (KRT5+)        7
T cell (PF CD4- CD8+)                     6
slow muscle cell                          5
Name: TMS_muscle_CE_free_annotation, Length: 61, dtype: int64

In [28]:
muscle.X.shape

(28867, 20138)

In [29]:
muscle.raw.X.shape

(28867, 20138)

In [30]:
muscle.raw.X[:5, :5].todense()

matrix([[0., 0., 0., 0., 2.],
        [0., 0., 5., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]], dtype=float32)

In [31]:
muscle.obs.age.value_counts()

24m    7178
1m     5757
18m    5569
3m     3550
21m    3528
30m    3285
Name: age, dtype: int64

In [32]:
muscle.obs.query('TMS_muscle_CE_free_annotation == "slow muscle cell"').TMS_muscle_CE_free_annotation.value_counts().head(1)

slow muscle cell    5
Name: TMS_muscle_CE_free_annotation, dtype: int64

In [33]:
muscle.obs.query('TMS_muscle_CE_free_annotation == "slow muscle cell"').age.value_counts()

3m     2
24m    1
21m    1
1m     1
30m    0
18m    0
Name: age, dtype: int64

### Remove 1m and 3m data

In [34]:
muscle_no1m_no3m = muscle[~muscle.obs.age.isin(('1m', '3m'))]
muscle_no1m_no3m

View of AnnData object with n_obs × n_vars = 19560 × 20138
    obs: 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation', 'n_counts', 'louvain', 'leiden', 'TMS_muscle_ADM_free_annotation', 'TMS_muscle_CE_compartment', 'TMS_muscle_CE_free_annotation'
    var: 'n_cells', 'means', 'dispersions', 'dispersions_norm', 'highly_variable'
    uns: 'TMS_muscle_CE_compartment_colors', 'TMS_muscle_CE_free_annotation_colors', 'age_colors', 'cell_ontology_class_colors', 'leiden', 'louvain', 'neighbors', 'pca'
    obsm: 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

### Set `free_annotation` as the updated one to be consistent with lung

In [53]:
new_annotation_col = 'TMS_muscle_CE_free_annotation'

muscle_no1m_no3m.obs['free_annotation'] = muscle_no1m_no3m.obs[new_annotation_col]


In [54]:
muscle_no1m_no3m.obs.TMS_muscle_CE_compartment.value_counts()

stromal                    10973
endothelial                 4640
myeloid                     1651
lymphoid                    1500
unassigned                   608
neural                       170
megakaryocyte-erythroid       13
epithelial                     5
Name: TMS_muscle_CE_compartment, dtype: int64

In [55]:
muscle_no1m_no3m.obs.free_annotation.value_counts()

fibroadipogenic progenitor cell        6483
capillary cell                         2499
tendon cell                            1944
skeletal muscle satellite stem cell    1176
macrophage                              793
                                       ... 
T cell (PF CD4- CD8+)                     5
unknown_epithelial_stromal (KRT5+)        5
unknown_lymphoid (FCER1G+)                5
slow muscle cell                          2
tendon cell_chondrocyte (COL2A1+)         2
Name: free_annotation, Length: 61, dtype: int64

## Concatenate adata without muscle, with the new muscle data

In [62]:
adata_no_muscle.shape

(210739, 20138)

In [63]:
muscle_no1m_no3m.shape

(19560, 20138)

In [57]:
%%time

adata_updated_muscle = adata_no_muscle.concatenate(muscle_no1m_no3m)
adata_updated_muscle

CPU times: user 7.03 s, sys: 22min 23s, total: 22min 30s
Wall time: 23min 29s


AnnData object with n_obs × n_vars = 230299 × 20138
    obs: 'age', 'cell_id', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'n_genes', 'sex', 'subtissue', 'tissue', 'species', 'species_latin', 'channel', 'sequencing_run', 'cell_barcode', 'n_counts', 'individual', 'narrow_group', 'broad_group', 'compartment_group', 'cell', 'method', 'mouse.id', 'tissue_free_annotation', 'louvain', 'leiden', 'TMS_muscle_ADM_free_annotation', 'TMS_muscle_CE_compartment', 'TMS_muscle_CE_free_annotation', 'batch'
    var: 'n_cells', 'means-1', 'dispersions-1', 'dispersions_norm-1', 'highly_variable-1'

## Add unified cell type groups: Muscle first

In [60]:
import unified_annotations

muscle_grouping = unified_annotations.get_celltype_converter(
    "Muscle",
    (
        "Mouse",
        "Tabula Muris Senis",
        f"{new_annotation_col}",
    ),
)
print(muscle_grouping.shape)
muscle_grouping

(61, 4)


Unnamed: 0_level_0,narrow_group,broad_group,compartment_group,tissue
"(Mouse, Tabula Muris Senis, TMS_muscle_CE_free_annotation)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B cell,B cell,B cell,lymphoid,Muscle
B cell (ZBTB32+),B cell,B cell,lymphoid,Muscle
B cell (PF ZBTB32+),B cell (PF),B cell,lymphoid,Muscle
plasma cell,plasma cell,plasma cell,lymphoid,Muscle
T cell (CD4- CD8- TMEM176A+),T cell,T cell,lymphoid,Muscle
...,...,...,...,...
unknown_epithelial_stromal (KRT5+),unknown_epithelial_stromal (KRT5+),unknown_epithelial_stromal (KRT5+),epithelial,Muscle
doublets_endothelial_stromal,doublet,doublet,,Muscle
doublets_lymphoid_endothelial,doublet,doublet,,Muscle
doublets_myeloid_endothelial,doublet,doublet,,Muscle


## Concatenate grouping

In [65]:
obs_muscle = adata_updated_muscle.obs.query('tissue == "Limb_Muscle"')
# obs_muscle.free_annotation.cat.remove_unused_categories(inplace=True)
obs_muscle.free_annotation.value_counts().sort_index()

B cell                                             626
B cell (PF ZBTB32+)                                  6
B cell (ZBTB32+)                                   198
Schwann cell (NGFR+)                                52
T cell (CD4+ CD8-)                                  97
                                                  ... 
unknown_stromal (CLDN1+ COL9A2-)                   136
unknown_stromal (CLDN1- COL9A2+)                   185
vascular associated smooth muscle cell (ACTG2+)    107
vascular associated smooth muscle cell (PLN+)      130
vein cell                                          777
Name: free_annotation, Length: 61, dtype: int64

In [68]:
obs_muscle.free_annotation.value_counts()

fibroadipogenic progenitor cell        6483
capillary cell                         2499
tendon cell                            1944
skeletal muscle satellite stem cell    1176
macrophage                              793
                                       ... 
T cell (PF CD4- CD8+)                     5
unknown_lymphoid (FCER1G+)                5
unknown_epithelial_stromal (KRT5+)        5
slow muscle cell                          2
tendon cell_chondrocyte (COL2A1+)         2
Name: free_annotation, Length: 61, dtype: int64

### Make sure all muscle cell subtypes are there

In [66]:
obs_muscle.free_annotation[obs_muscle.free_annotation.str.contains('muscle cell')].unique()

array(['fast muscle cell',
       'vascular associated smooth muscle cell (PLN+)',
       'pericyte_vascular associated smooth muscle cell',
       'vascular associated smooth muscle cell (ACTG2+)',
       'slow muscle cell'], dtype=object)

In [73]:
obs_muscle.free_annotation[obs_muscle.free_annotation.str.contains('T cell')].value_counts()

T cell (CD4- CD8- TMEM176A+)     182
T cell (CD4- CD8+ CCL5-)         129
T cell (CD4- CD8+ CCL5+)         101
T cell (CD4+ CD8-)                97
T cell (regulatory CD4+ CD8-)     54
T cell (PF CD4+ CD8-)              6
T cell (PF CD4- CD8+)              5
Name: free_annotation, dtype: int64

### How many of each muscle cell?

In [67]:
obs_muscle.free_annotation[obs_muscle.free_annotation.str.contains('muscle cell')].value_counts()

pericyte_vascular associated smooth muscle cell    187
fast muscle cell                                   147
vascular associated smooth muscle cell (PLN+)      130
vascular associated smooth muscle cell (ACTG2+)    107
slow muscle cell                                     2
Name: free_annotation, dtype: int64

## Replace only the Muscle's `narrow_group`, `broad_group`, `comaprtment_group`

In [69]:
obs_muscle[muscle_grouping.columns] = 'nan'
obs_muscle.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0_level_0,age,cell_id,cell_ontology_class,cell_ontology_id,free_annotation,n_genes,sex,subtissue,tissue,species,...,cell,method,mouse.id,tissue_free_annotation,louvain,leiden,TMS_muscle_ADM_free_annotation,TMS_muscle_CE_compartment,TMS_muscle_CE_free_annotation,batch
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCTGAGAAACCGC-1-14-0-0-1,18m,,macrophage,,classical monocyte,2506.0,female,Pre-Sort,,,...,MACA_18m_F_MUSCLE_50_pre_sort_AAACCTGAGAAACCGC,droplet,18-F-50,Limb_Muscle,10,19,macrophage,myeloid,classical monocyte,1
AAACCTGCAATAACGA-1-14-0-0-1,18m,,endothelial cell,,capillary cell,1361.0,female,Pre-Sort,,,...,MACA_18m_F_MUSCLE_50_pre_sort_AAACCTGCAATAACGA,droplet,18-F-50,Limb_Muscle,3,0,capillary endothelial cell,endothelial,capillary cell,1
AAACCTGCAGCGTAAG-1-14-0-0-1,18m,,mesenchymal stem cell,,tendon cell,1482.0,female,Pre-Sort,,,...,MACA_18m_F_MUSCLE_50_pre_sort_AAACCTGCAGCGTAAG,droplet,18-F-50,Limb_Muscle,6,7,tendon cell,stromal,tendon cell,1
AAACCTGGTAGCTTGT-1-14-0-0-1,18m,,mesenchymal stem cell,,fibroadipogenic progenitor cell,1652.0,female,Pre-Sort,,,...,MACA_18m_F_MUSCLE_50_pre_sort_AAACCTGGTAGCTTGT,droplet,18-F-50,Limb_Muscle,16,17,mesenchymal stem cell,stromal,fibroadipogenic progenitor cell,1
AAAGCAAGTCTGGTCG-1-14-0-0-1,18m,,mesenchymal stem cell,,fibroadipogenic progenitor cell,1069.0,female,Pre-Sort,,,...,MACA_18m_F_MUSCLE_50_pre_sort_AAAGCAAGTCTGGTCG,droplet,18-F-50,Limb_Muscle,1,4,mesenchymal stem cell,stromal,fibroadipogenic progenitor cell,1


In [70]:
for x in sorted(muscle_grouping.index.symmetric_difference(obs_muscle.free_annotation.unique())):
    print(x)

In [71]:
for free_annotation, df in obs_muscle.groupby("free_annotation"):
    for group_name, group_value in muscle_grouping.loc[free_annotation].items():
#         obs_muscle.loc[df.index, group_name] = obs_muscle.loc[
#             df.index, group_name
#         ].replace({"nan": group_value}).astype(str)
        obs_muscle.loc[df.index, group_name] = group_value
        
obs_muscle.groupby(['compartment_group', 'broad_group', 'narrow_group'], observed=True).size().to_frame()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
compartment_group,broad_group,narrow_group,Unnamed: 3_level_1
endothelial,artery cell,artery cell,763
endothelial,artery cell_capillary cell (RBP7+),artery cell_capillary cell (RBP7+),286
endothelial,capillary cell,capillary cell,2499
endothelial,lymphatic cell,lymphatic cell,265
endothelial,unknown_endothelial (COL13A1+),unknown_endothelial (COL13A1+),50
endothelial,vein cell,vein cell,777
epithelial,unknown_epithelial_stromal (KRT5+),unknown_epithelial_stromal (KRT5+),5
lymphoid,B cell,B cell,824
lymphoid,B cell,B cell (PF),6
lymphoid,T cell,T cell,182


## Get lung to cell grouping

In [74]:
lung_grouping = unified_annotations.get_celltype_converter('Lung', ("Mouse", "Tabula Muris Senis", "free_annotation"))
lung_grouping = lung_grouping.drop('tissue', axis=1)
lung_grouping.loc['B', :] = lung_grouping.loc['B cell']
print(lung_grouping.shape)
lung_grouping

(40, 3)


Unnamed: 0_level_0,narrow_group,broad_group,compartment_group
"(Mouse, Tabula Muris Senis, free_annotation)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adventitial Fibroblast,adventitial fibroblast,fibroblast,stromal
Alveolar Fibroblast,alveolar fibroblast,fibroblast,stromal
Airway Smooth Muscle,airway associated smooth muscle cell,airway associated smooth muscle cell,stromal
Myofibroblast,myofibroblast,myofibroblast,stromal
Pericyte,pericyte,pericyte,stromal
Capillary,capillary cell,capillary cell,endothelial
Capillary Aerocyte,capillary aerocyte cell,capillary cell,endothelial
Vein,vein cell,vein cell,endothelial
Artery,artery cell,artery cell,endothelial
Lympatic,lymphatic cell,lymphatic cell,endothelial


## Concatenate grouping

In [83]:
obs_lung = adata_updated_muscle.obs.query('tissue == "Lung"')
# obs_lung.free_annotation.cat.remove_unused_categories(inplace=True)
obs_lung.free_annotation.value_counts()

Classical Monocyte                   5269
Proliferating Classical Monocyte     2393
Capillary                            1743
Intermediate Monocyte                1686
Alveolar Fibroblast                  1499
Alveolar Macrophage                  1217
Natural Killer                       1141
Interstitial Macrophage              1115
B                                    1052
Nonclassical Monocyte                 971
CD8+ T                                854
Neutrophil                            543
CD4+ T                                534
Adventitial Fibroblast                517
Capillary Aerocyte                    512
Natural Killer T                      406
Zbtb32+ B                             405
Vein                                  306
Myofibroblast                         217
Myeloid Dendritic Type 1              161
Ly6g5b+ T                             154
Basophil                              128
Alveolar Epithelial Type 2            122
Regulatory T                      

### Set narrow group, broad group as strings

In [84]:
obs_lung[lung_grouping.columns] = obs_lung[lung_grouping.columns].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [85]:
lung_grouping

Unnamed: 0_level_0,narrow_group,broad_group,compartment_group
"(Mouse, Tabula Muris Senis, free_annotation)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adventitial Fibroblast,adventitial fibroblast,fibroblast,stromal
Alveolar Fibroblast,alveolar fibroblast,fibroblast,stromal
Airway Smooth Muscle,airway associated smooth muscle cell,airway associated smooth muscle cell,stromal
Myofibroblast,myofibroblast,myofibroblast,stromal
Pericyte,pericyte,pericyte,stromal
Capillary,capillary cell,capillary cell,endothelial
Capillary Aerocyte,capillary aerocyte cell,capillary cell,endothelial
Vein,vein cell,vein cell,endothelial
Artery,artery cell,artery cell,endothelial
Lympatic,lymphatic cell,lymphatic cell,endothelial


In [86]:
for free_annotation, df in obs_lung.groupby("free_annotation"):
#     print(f'free_annotation: {free_annotation}')
    for group_name, group_value in lung_grouping.loc[free_annotation].items():
#         print(f'\t{group_name}: {group_value}')
#         if group_value == 'CD8+ T':
#             break
#         obs_lung.loc[df.index, group_name] = obs_lung.loc[
#             df.index, group_name
#         ].replace({"nan": group_value}).astype(str)
        obs_lung.loc[df.index, group_name] = group_value

### Make sure narrow group and broad group actually got replaced

obs_lung.groupby(['compartment_group', 'broad_group', 'narrow_group'], observed=True).size().to_frame()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
compartment_group,broad_group,narrow_group,Unnamed: 3_level_1
endothelial,artery cell,artery cell,95
endothelial,capillary cell,capillary aerocyte cell,512
endothelial,capillary cell,capillary cell,1743
endothelial,lymphatic cell,lymphatic cell,40
endothelial,vein cell,vein cell,306
epithelial,alveolar epithelial cell type 2,alveolar epithelial cell type 2,122
epithelial,ciliated cell,ciliated cell,55
epithelial,club cell,club cell,11
lymphoid,B cell,B cell,1457
lymphoid,T cell,Regulatory T cell,121


In [87]:
obs_lung.query('tissue == "Limb_Muscle"').narrow_group.value_counts()

Series([], Name: narrow_group, dtype: int64)

In [88]:
obs_lung.query('tissue == "Lung"').broad_group.value_counts()

monocyte                                10319
macrophage                               2434
capillary cell                           2255
fibroblast                               2016
T cell                                   1663
B cell                                   1457
natural killer cell                      1141
neutrophil                                543
natural killer T cell                     406
dendritic cell                            354
vein cell                                 306
myofibroblast                             217
basophil                                  128
alveolar epithelial cell type 2           122
natural killer cell_T cell (PF)           117
artery cell                                95
pericyte                                   59
ciliated cell                              55
plasma cell                                47
lymphatic cell                             40
airway associated smooth muscle cell       13
club cell                         

## Add new obs

In [89]:
adata_updated_muscle_new_obs = adata_updated_muscle.copy()
adata_updated_muscle_new_obs.obs.loc[obs_muscle.index] = obs_muscle
adata_updated_muscle_new_obs.obs.loc[obs_lung.index] = obs_lung

## Add species

In [102]:
adata_updated_muscle_new_obs.obs['species'] = "Mouse"
adata_updated_muscle_new_obs.obs['species_latin'] = 'Mus musculus'
adata_updated_muscle_new_obs.obs.head()

Unnamed: 0_level_0,age,cell_id,cell_ontology_class,cell_ontology_id,free_annotation,n_genes,sex,subtissue,tissue,species,...,cell,method,mouse.id,tissue_free_annotation,louvain,leiden,TMS_muscle_ADM_free_annotation,TMS_muscle_CE_compartment,TMS_muscle_CE_free_annotation,batch
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCTGCAGTAAGCG-1-0-0-0-0,24m,MACA_24m_M_TONGUE_60_AAACCTGCAGTAAGCG,keratinocyte,,suprabasal,3481.0,male,,Tongue,Mouse,...,,,,,,,,,,0
AAACCTGTCATTATCC-1-0-0-0-0,24m,MACA_24m_M_TONGUE_60_AAACCTGTCATTATCC,keratinocyte,,suprabasal,2599.0,male,,Tongue,Mouse,...,,,,,,,,,,0
AAACGGGGTACAGTGG-1-0-0-0-0,24m,MACA_24m_M_TONGUE_60_AAACGGGGTACAGTGG,keratinocyte,,suprabasal differentiating,3468.0,male,,Tongue,Mouse,...,,,,,,,,,,0
AAACGGGGTCTTCTCG-1-0-0-0-0,24m,MACA_24m_M_TONGUE_60_AAACGGGGTCTTCTCG,keratinocyte,,suprabasal differentiating,3189.0,male,,Tongue,Mouse,...,,,,,,,,,,0
AAAGATGAGCTATGCT-1-0-0-0-0,24m,MACA_24m_M_TONGUE_60_AAAGATGAGCTATGCT,keratinocyte,,suprabasal,3419.0,male,,Tongue,Mouse,...,,,,,,,,,,0


# Write mouse adata with minimal adata and new grouping

In [103]:
h5ad = os.path.join(
    "/home/olga/data_lg/data_sm_copy/czb-tabula-muris-senis/Data-objects/",
    "tabula-muris-senis-droplet-official-raw-obj--no-duplicate-barcodes-per-seq-run--minimal-obs-unified-celltypes--august2021.h5ad",
) 
adata_updated_muscle_new_obs.write(h5ad)

... storing 'species' as categorical
... storing 'species_latin' as categorical


In [104]:
adata_updated_muscle_new_obs.obs.individual.value_counts()

mouse_20_30-M-2     24369
nan                 19560
mouse_15_21-F-55    16569
mouse_12_18-M-52    15163
mouse_14_21-F-54    14647
mouse_3_3-F-56      13089
mouse_2_1-M-63      12977
mouse_10_18-F-50    11023
mouse_23_30-M-5     10845
mouse_16_24-M-58    10128
mouse_17_24-M-59     9964
mouse_21_30-M-3      9286
mouse_11_18-F-51     9066
mouse_18_24-M-60     7810
mouse_1_1-M-62       6598
mouse_4_3-F-57       6497
mouse_7_3-M-8        6189
mouse_22_30-M-4      6157
mouse_6_3-M-7/8      5235
mouse_9_3-M-9        4897
mouse_5_3-M-5/6      4880
mouse_13_18-M-53     2972
mouse_19_24-M-61     1928
mouse_8_3-M-8/9       450
Name: individual, dtype: int64

In [105]:
def check_celltype_presence(
    adata,
    celltypes=("fast muscle cell", "slow muscle cell"),
    celltype_col="narrow_group",
):
    for celltype in celltypes:
        assert celltype in set(adata.obs[celltype_col])


check_celltype_presence(adata_updated_muscle_new_obs)

# Subset on 1:1 orthologs

## Read orthologs

In [106]:
csv = os.path.join(
    "/home/olga/googledrive/TabulaMicrocebus/data/orthologous-genes",
    "ncbi_mgi_ensembl__mouse-lemur_human_mouse__orthologs__gene_names__one2one.csv",
)

gene_orthologies = pd.read_csv(csv, index_col=0)
print(gene_orthologies.shape)
gene_orthologies.head()

(15514, 3)


Unnamed: 0,mouse_lemur__gene_name,human__gene_name,mouse__gene_name
0,ADAT3,ADAT3,Adat3
1,CDK7,CDK7,Cdk7
2,CNMD,CNMD,Cnmd
3,TMEM229B,TMEM229B,Tmem229b
4,DIO2,DIO2,Dio2


## Get only gene names present in 1:1 ortholog dataframe

In [107]:
gene_mask = adata_updated_muscle_new_obs.var.index.isin(
    gene_orthologies["mouse__gene_name"].values
)
print(gene_mask.sum())
adata_updated_muscle_new_obs_one2one = adata_updated_muscle_new_obs[:, gene_mask]
adata_updated_muscle_new_obs_one2one

13694


View of AnnData object with n_obs × n_vars = 230299 × 13694
    obs: 'age', 'cell_id', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'n_genes', 'sex', 'subtissue', 'tissue', 'species', 'species_latin', 'channel', 'sequencing_run', 'cell_barcode', 'n_counts', 'individual', 'narrow_group', 'broad_group', 'compartment_group', 'cell', 'method', 'mouse.id', 'tissue_free_annotation', 'louvain', 'leiden', 'TMS_muscle_ADM_free_annotation', 'TMS_muscle_CE_compartment', 'TMS_muscle_CE_free_annotation', 'batch'
    var: 'n_cells', 'means-1', 'dispersions-1', 'dispersions_norm-1', 'highly_variable-1'

In [108]:
adata_updated_muscle_new_obs_one2one.var.head()

Unnamed: 0_level_0,n_cells,means-1,dispersions-1,dispersions_norm-1,highly_variable-1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Xkr4,147,0.000721,0.804374,-0.298597,False
Sox17,27289,1.117266,3.091175,1.529381,True
Mrpl15,95312,0.263145,0.856202,-0.185976,False
Lypla1,86096,0.298199,0.858446,-0.513725,False
Tcea1,120238,0.581056,0.862771,-0.662509,False


## Use human gene names

In [109]:
adata_updated_muscle_new_obs_one2one_new_var = (
    adata_updated_muscle_new_obs_one2one.var.merge(
        gene_orthologies, left_index=True, right_on="mouse__gene_name"
    )
)
print(adata_updated_muscle_new_obs_one2one_new_var.shape)
adata_updated_muscle_new_obs_one2one_new_var.head()

(13694, 8)


Unnamed: 0,n_cells,means-1,dispersions-1,dispersions_norm-1,highly_variable-1,mouse_lemur__gene_name,human__gene_name,mouse__gene_name
8454,147,0.000721,0.804374,-0.298597,False,XKR4,XKR4,Xkr4
736,27289,1.117266,3.091175,1.529381,True,SOX17,SOX17,Sox17
7571,95312,0.263145,0.856202,-0.185976,False,MRPL15,MRPL15,Mrpl15
7486,86096,0.298199,0.858446,-0.513725,False,LYPLA1,LYPLA1,Lypla1
14726,120238,0.581056,0.862771,-0.662509,False,TCEA1,TCEA1,Tcea1


### Make new object with human gene names

In [110]:
adata_updated_muscle_new_obs_one2one_human_genes = adata_updated_muscle_new_obs_one2one.copy()
adata_updated_muscle_new_obs_one2one_human_genes

AnnData object with n_obs × n_vars = 230299 × 13694
    obs: 'age', 'cell_id', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'n_genes', 'sex', 'subtissue', 'tissue', 'species', 'species_latin', 'channel', 'sequencing_run', 'cell_barcode', 'n_counts', 'individual', 'narrow_group', 'broad_group', 'compartment_group', 'cell', 'method', 'mouse.id', 'tissue_free_annotation', 'louvain', 'leiden', 'TMS_muscle_ADM_free_annotation', 'TMS_muscle_CE_compartment', 'TMS_muscle_CE_free_annotation', 'batch'
    var: 'n_cells', 'means-1', 'dispersions-1', 'dispersions_norm-1', 'highly_variable-1'

In [111]:
adata_updated_muscle_new_obs_one2one_human_genes.var = (
    adata_updated_muscle_new_obs_one2one_new_var.set_index("human__gene_name")
)
adata_updated_muscle_new_obs_one2one_human_genes.var.head()

Unnamed: 0_level_0,n_cells,means-1,dispersions-1,dispersions_norm-1,highly_variable-1,mouse_lemur__gene_name,mouse__gene_name
human__gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
XKR4,147,0.000721,0.804374,-0.298597,False,XKR4,Xkr4
SOX17,27289,1.117266,3.091175,1.529381,True,SOX17,Sox17
MRPL15,95312,0.263145,0.856202,-0.185976,False,MRPL15,Mrpl15
LYPLA1,86096,0.298199,0.858446,-0.513725,False,LYPLA1,Lypla1
TCEA1,120238,0.581056,0.862771,-0.662509,False,TCEA1,Tcea1


## Write mouse all tissues, 1:1 orthologs, human gene names to file

### Write h5ad

In [112]:
%%time

for d in outdirs:
    h5ad = f"{d}/alltissues__10x__tabula-muris-senis__one2one_orthologs--august2021.h5ad"
    %time adata_updated_muscle_new_obs_one2one_human_genes.write(h5ad)

CPU times: user 1.87 s, sys: 4.86 s, total: 6.73 s
Wall time: 2min 10s
CPU times: user 1.03 s, sys: 1.94 s, total: 2.97 s
Wall time: 5.49 s
CPU times: user 2.9 s, sys: 6.81 s, total: 9.71 s
Wall time: 2min 15s


In [113]:
outdirs

('/home/olga/googledrive/TabulaMicrocebus/data/cross-species',
 '/home/olga/data_lg/data_sm_copy/tabula-microcebus/data-objects/cross-species')

In [116]:
adata_updated_muscle_new_obs_one2one_human_genes.obs.tissue.value_counts(dropna=False).sort_index()

Bladder             8752
Fat                 6534
Heart_and_Aorta     8253
Kidney             21038
Large_Intestine     1845
Liver               7052
Lung               23802
Mammary_Gland      11954
Marrow             39125
Muscle             19560
Pancreas            5969
Skin                4308
Spleen             34853
Thymus              9082
Tongue             20271
Trachea             7901
Name: tissue, dtype: int64

In [118]:
sorted(set(adata_updated_muscle_new_obs_one2one_human_genes.obs.tissue.values))

['Bladder',
 'Fat',
 'Heart_and_Aorta',
 'Kidney',
 'Large_Intestine',
 'Liver',
 'Lung',
 'Mammary_Gland',
 'Marrow',
 'Muscle',
 'Pancreas',
 'Skin',
 'Spleen',
 'Thymus',
 'Tongue',
 'Trachea']