### Notebook for formatting the `scNym`-annotated TB PBMC object into a log-normalised, CellChat-ready export

- **Developed by**: Carlos Talavera-López Ph.D
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- v221017

### Import required modules

In [1]:
import anndata
import scipy as sp
import numpy as np
import pandas as pd
import scanpy as sc

### Set up working environment

In [2]:
# scanpy logging level used for the rest of the notebook.
sc.settings.verbosity = 3

# Print the versions of the loaded packages so the run can be reproduced.
sc.logging.print_versions()

# Global figure defaults: on-screen and saved resolution, colour map and
# output format.
sc.settings.set_figure_params(
    dpi = 140,
    color_map = 'magma_r',
    dpi_save = 300,
    vector_friendly = True,
    format = 'svg',
)

-----
anndata     0.8.0
scanpy      1.9.1
-----
PIL                         9.2.0
asttokens                   NA
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
cffi                        1.15.1
colorama                    0.4.5
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   5.1.1
entrypoints                 0.4
executing                   0.8.3
h5py                        3.7.0
hypergeom_ufunc             NA
ipykernel                   6.9.1
jedi                        0.18.1
joblib                      1.2.0
kiwisolver                  1.4.4
llvmlite                    0.39.1
matplotlib                  3.6.1
mpl_toolkits                NA
natsort                     8.2.0
nbinom_ufunc                NA
ncf_ufunc                   NA
numba                       0.56.2
numpy                       1.23.3
packaging           

### Read in `scNym`-annotated object

In [3]:
# Location of the `scNym`-annotated AnnData file that this notebook reformats.
annotated_h5ad = '/home/cartalop/data/single_cell/lung/tb/working_objects/CaiY_TB-PBMC_scnym_annotated_ctl221017.h5ad'

# Load the object and show its summary as the cell output.
tb_pbmc = sc.read_h5ad(annotated_h5ad)
tb_pbmc

AnnData object with n_obs × n_vars = 145381 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'object', 'protocol', 'dataset', 'scNym', 'scNym_confidence', 'cell_type'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts-caiy2020', 'mean_counts-caiy2020', 'pct_dropout_by_counts-caiy2020', 'total_counts-caiy2020', 'n_cells_by_counts-caiy2022', 'mean_counts-caiy2022', 'pct_dropout_by_counts-caiy2022', 'total_counts-caiy2022'
    layers: 'counts', 'sqrt_norm'

### Fix labels for downstream analyses

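- Add disease status label (**TODO**: no cell in this section adds this label yet; implement it or remove this item)
- Inspect the `scNym` label categories, then normalise and log-transform the expression matrix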

In [4]:
# List the distinct labels stored in the categorical `scNym` column.
scnym_labels = tb_pbmc.obs['scNym']
scnym_labels.cat.categories

Index(['B invar', 'B n-sw mem', 'B n-sw mem IFN stim', 'B naive',
       'B naive IFN stim', 'B sw mem', 'Cycling', 'HPC', 'ILC', 'MAIT',
       'Monocyte CD14', 'Monocyte CD14 IFN stim', 'Monocyte CD16',
       'Monocyte CD16 IFN stim', 'Monocyte CD16+C1', 'NK', 'NK CD56',
       'NK IFN stim', 'Plasma cells', 'Plasmablasts', 'Platelets', 'RBC',
       'T CD4 CTL', 'T CD4 helper', 'T CD4 naive', 'T CD4 naive IFN stim',
       'T CD8 CM', 'T CD8 CTL', 'T CD8 EM', 'T CD8 EMRA', 'T CD8 naive',
       'T g/d', 'T reg', 'cDC2', 'pDC'],
      dtype='object')

In [5]:
# NOTE(review): this cell rewrites `tb_pbmc.X` in place, so running it twice
# normalises and log-transforms the matrix a second time; re-run the loading
# cell first. It also presumes `X` still holds raw counts (the object carries
# a separate `counts` layer) -- confirm before relying on the output.
#
# `sc.pp.normalize_per_cell` is deprecated in favour of `sc.pp.normalize_total`.
# `key_added = 'n_counts'` keeps writing the per-cell totals measured before
# scaling into `obs['n_counts']`, as the deprecated call did. Unlike the
# deprecated call, `normalize_total` does not drop cells with zero counts.
sc.pp.normalize_total(tb_pbmc, target_sum = 1e4, key_added = 'n_counts')

# Natural-log transform of (X + 1), applied in place.
sc.pp.log1p(tb_pbmc)

# Store the matrix in compressed sparse column layout.
tb_pbmc.X = tb_pbmc.X.tocsc() ### Thanks to `kp9` for help with this!

normalizing by total count per cell
    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)


### Export object

In [6]:
# Build a slim export object holding only the expression matrix plus the gene
# (`var`) and cell (`obs`) annotations; the layers are not carried over.
#
# `.toarray()` yields a plain `numpy.ndarray`. The previous `.todense()`
# returned a `numpy.matrix`, a class NumPy no longer recommends.
#
# NOTE(review): the object loaded above is 145381 x 61533, so the dense copy
# holds ~8.9e9 values (~36 GB at 4 bytes per value) -- confirm that a dense
# layout is really required downstream before running this on a small machine.
adata_export = anndata.AnnData(X = tb_pbmc.X.toarray(), var = tb_pbmc.var, obs = tb_pbmc.obs)
adata_export

AnnData object with n_obs × n_vars = 145381 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'object', 'protocol', 'dataset', 'scNym', 'scNym_confidence', 'cell_type'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts-caiy2020', 'mean_counts-caiy2020', 'pct_dropout_by_counts-caiy2020', 'total_counts-caiy2020', 'n_cells_by_counts-caiy2022', 'mean_counts-caiy2022', 'pct_dropout_by_counts-caiy2022', 'total_counts-caiy2022'

In [7]:
# Destination of the exported, log-transformed object.
export_h5ad = '/home/cartalop/data/single_cell/lung/tb/working_objects/CaiY_TB-PBMC_cellchat-ready.log.h5ad'

# Write the export object to disk.
adata_export.write(export_h5ad)