### Notebook for formatting the `scNym`-annotated TB PBMC objects as input for `cellchat`

- **Developed by**: Carlos Talavera-López Ph.D
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- v221017

### Import required modules

In [1]:
from pathlib import Path

import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import scipy as sp

### Set up working environment

In [2]:
# Raise scanpy's logging verbosity and record the package versions used for this run.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Figure defaults: on-screen and saved resolution, colour map and SVG output.
sc.settings.set_figure_params(
    dpi = 140,
    dpi_save = 300,
    color_map = 'magma_r',
    vector_friendly = True,
    format = 'svg',
)

-----
anndata     0.8.0
scanpy      1.9.1
-----
PIL                         9.1.0
anyio                       NA
appnope                     0.1.2
attr                        21.2.0
babel                       2.9.1
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
bottleneck                  1.3.2
brotli                      NA
certifi                     2021.10.08
cffi                        1.14.6
chardet                     4.0.0
charset_normalizer          2.0.4
cloudpickle                 2.0.0
colorama                    0.4.4
cycler                      0.10.0
cython_runtime              NA
cytoolz                     0.11.0
dask                        2021.10.0
dateutil                    2.8.2
debugpy                     1.4.1
decorator                   5.1.0
defusedxml                  0.7.1
entrypoints                 0.3
fsspec                      2021.08.1
google                      NA
h5py                        3.6.0
hyp

### Read in `scNym`-annotated object

In [3]:
# Load the scNym-annotated object from the current user's Downloads folder.
# The path is built from the home directory rather than hard-coding one user's
# absolute path, so the notebook also runs on other machines/accounts.
tb_pbmc = sc.read_h5ad(Path.home() / 'Downloads' / 'CaiY_TB-PBMC_scnym_annotated_ctl221017.h5ad')
tb_pbmc

AnnData object with n_obs × n_vars = 145381 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'object', 'protocol', 'dataset', 'scNym', 'scNym_confidence', 'cell_type'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts-caiy2020', 'mean_counts-caiy2020', 'pct_dropout_by_counts-caiy2020', 'total_counts-caiy2020', 'n_cells_by_counts-caiy2022', 'mean_counts-caiy2022', 'pct_dropout_by_counts-caiy2022', 'total_counts-caiy2022'
    layers: 'counts', 'sqrt_norm'

### Normalise and log-transform counts into the form expected by `cellchat`

In [4]:
# Scale every cell to 10,000 total counts and apply log1p.
# `sc.pp.normalize_per_cell` is deprecated; `sc.pp.normalize_total` is its replacement.
# `key_added = 'n_counts'` keeps storing the pre-normalisation totals in `obs['n_counts']`,
# as the deprecated call did. Unlike the deprecated call, `normalize_total` does not
# drop zero-count cells.
# `sc.pp.log1p` records itself in `uns['log1p']`; the guard uses that so re-running
# this cell does not normalise and log-transform `X` a second time.
if 'log1p' not in tb_pbmc.uns:
    sc.pp.normalize_total(tb_pbmc, target_sum = 1e4, key_added = 'n_counts')
    sc.pp.log1p(tb_pbmc)

# Store X in CSC sparse format (presumably required by the downstream reader -- TODO confirm).
tb_pbmc.X = tb_pbmc.X.tocsc() ### Thanks to `kp9` for help with this!

normalizing by total count per cell
    finished (0:00:03): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)


### Split dataset by tissue

In [5]:
# Which tissue labels are present in the dataset?
tb_pbmc.obs['tissue'].dtype.categories

Index(['PBMC', 'PFMC'], dtype='object')

In [6]:
# Number of cells per disease status across the whole dataset.
tb_pbmc.obs.loc[:, 'status'].value_counts()

active_TB    105339
Healthy       22049
latent_TB     17993
Name: status, dtype: int64

In [7]:
# Number of cells per tissue across the whole dataset.
tb_pbmc.obs.loc[:, 'tissue'].value_counts()

PBMC    100600
PFMC     44781
Name: tissue, dtype: int64

In [8]:
# Keep only the cells whose tissue label is PBMC.
is_pbmc = tb_pbmc.obs['tissue'] == 'PBMC'
pbmc = tb_pbmc[is_pbmc]
pbmc

View of AnnData object with n_obs × n_vars = 100600 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'object', 'protocol', 'dataset', 'scNym', 'scNym_confidence', 'cell_type'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts-caiy2020', 'mean_counts-caiy2020', 'pct_dropout_by_counts-caiy2020', 'total_counts-caiy2020', 'n_cells_by_counts-caiy2022', 'mean_counts-caiy2022', 'pct_dropout_by_counts-caiy2022', 'total_counts-caiy2022'
    uns: 'log1p'
    layers: 'counts', 'sqrt_norm'

In [9]:
# Number of PBMC cells per disease status.
pbmc.obs.loc[:, 'status'].value_counts()

active_TB    60558
Healthy      22049
latent_TB    17993
Name: status, dtype: int64

In [10]:
# Keep only the cells whose tissue label is PFMC.
is_pfmc = tb_pbmc.obs['tissue'] == 'PFMC'
pfmc = tb_pbmc[is_pfmc]
pfmc

View of AnnData object with n_obs × n_vars = 44781 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'object', 'protocol', 'dataset', 'scNym', 'scNym_confidence', 'cell_type'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts-caiy2020', 'mean_counts-caiy2020', 'pct_dropout_by_counts-caiy2020', 'total_counts-caiy2020', 'n_cells_by_counts-caiy2022', 'mean_counts-caiy2022', 'pct_dropout_by_counts-caiy2022', 'total_counts-caiy2022'
    uns: 'log1p'
    layers: 'counts', 'sqrt_norm'

### Split PBMCs by status

In [11]:
# PBMC cells whose status is Healthy.
is_healthy = pbmc.obs['status'] == 'Healthy'
healthy_pbmc = pbmc[is_healthy]
healthy_pbmc

View of AnnData object with n_obs × n_vars = 22049 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'object', 'protocol', 'dataset', 'scNym', 'scNym_confidence', 'cell_type'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts-caiy2020', 'mean_counts-caiy2020', 'pct_dropout_by_counts-caiy2020', 'total_counts-caiy2020', 'n_cells_by_counts-caiy2022', 'mean_counts-caiy2022', 'pct_dropout_by_counts-caiy2022', 'total_counts-caiy2022'
    uns: 'log1p'
    layers: 'counts', 'sqrt_norm'

In [12]:
# PBMC cells whose status is active_TB.
is_active = pbmc.obs['status'] == 'active_TB'
active_pbmc = pbmc[is_active]
active_pbmc

View of AnnData object with n_obs × n_vars = 60558 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'object', 'protocol', 'dataset', 'scNym', 'scNym_confidence', 'cell_type'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts-caiy2020', 'mean_counts-caiy2020', 'pct_dropout_by_counts-caiy2020', 'total_counts-caiy2020', 'n_cells_by_counts-caiy2022', 'mean_counts-caiy2022', 'pct_dropout_by_counts-caiy2022', 'total_counts-caiy2022'
    uns: 'log1p'
    layers: 'counts', 'sqrt_norm'

In [13]:
# PBMC cells whose status is latent_TB.
is_latent = pbmc.obs['status'] == 'latent_TB'
latent_pbmc = pbmc[is_latent]
latent_pbmc

View of AnnData object with n_obs × n_vars = 17993 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'object', 'protocol', 'dataset', 'scNym', 'scNym_confidence', 'cell_type'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts-caiy2020', 'mean_counts-caiy2020', 'pct_dropout_by_counts-caiy2020', 'total_counts-caiy2020', 'n_cells_by_counts-caiy2022', 'mean_counts-caiy2022', 'pct_dropout_by_counts-caiy2022', 'total_counts-caiy2022'
    uns: 'log1p'
    layers: 'counts', 'sqrt_norm'

### Export object

In [14]:
def _minimal_anndata(adata):
    """Build a new AnnData from `adata` holding only its X, var and obs.

    Nothing else (e.g. layers, uns) is passed to the constructor, so the
    exported object carries just the expression matrix and its annotations.
    """
    return anndata.AnnData(X = adata.X, var = adata.var, obs = adata.obs)


# Pleural-fluid cells are exported as a single object ...
pfmc_export = _minimal_anndata(pfmc)

# ... while PBMCs are exported separately for each disease status.
healthy_export = _minimal_anndata(healthy_pbmc)
active_export = _minimal_anndata(active_pbmc)
latent_export = _minimal_anndata(latent_pbmc)

In [15]:
# Write the healthy PBMC object to the current user's Downloads folder
# (home-relative instead of a hard-coded, user-specific absolute path).
healthy_export.write(Path.home() / 'Downloads' / 'CaiY_Healthy-PBMC_cellchat-ready.log.h5ad')

In [16]:
# Write the active-TB PBMC object to the current user's Downloads folder
# (home-relative instead of a hard-coded, user-specific absolute path).
active_export.write(Path.home() / 'Downloads' / 'CaiY_activeTB-PBMC_cellchat-ready.log.h5ad')

In [17]:
# Write the latent-TB PBMC object to the current user's Downloads folder
# (home-relative instead of a hard-coded, user-specific absolute path).
latent_export.write(Path.home() / 'Downloads' / 'CaiY_latentTB-PBMC_cellchat-ready.log.h5ad')

In [18]:
# Write the PFMC object to the current user's Downloads folder
# (home-relative instead of a hard-coded, user-specific absolute path).
pfmc_export.write(Path.home() / 'Downloads' / 'CaiY_TB-PFMC_cellchat-ready.log.h5ad')