### Notebook to add `scNym` labels and scores to query TB PBMC object

- **Developed by**: Carlos Talavera-López Ph.D
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- v221017

### Import required modules

In [1]:
import warnings

import anndata
import scipy as sp
import pandas as pd
import scanpy as sc

### Set up working environment

In [2]:
# Logging level for scanpy, followed by a record of the package versions in use.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Default figure parameters applied to every plot produced in this notebook.
sc.settings.set_figure_params(
    dpi = 180,
    color_map = 'magma_r',
    dpi_save = 300,
    vector_friendly = True,
    format = 'svg',
)

-----
anndata     0.8.0
scanpy      1.9.1
-----
PIL                         9.2.0
asttokens                   NA
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
cffi                        1.15.1
colorama                    0.4.5
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   5.1.1
entrypoints                 0.4
executing                   0.8.3
h5py                        3.7.0
hypergeom_ufunc             NA
ipykernel                   6.9.1
jedi                        0.18.1
joblib                      1.2.0
kiwisolver                  1.4.4
llvmlite                    0.39.1
matplotlib                  3.6.1
mpl_toolkits                NA
natsort                     8.2.0
nbinom_ufunc                NA
ncf_ufunc                   NA
numba                       0.56.2
numpy                       1.23.3
packaging           

## Format TB query object

- Read in `scNym` annotated object to extract labels

In [4]:
# Load the scNym-annotated query object; its `scNym` and `scNym_confidence`
# columns are transferred to the raw object further down.
query_scnym = sc.read(
    '/home/cartalop/data/single_cell/lung/tb/working_objects/CaiY_PBMC_TB_post-scnym_ctl220717.h5ad'
)
query_scnym

AnnData object with n_obs × n_vars = 145381 × 20199
    obs: 'object', 'domain_label', 'cell_states', 'scNym', 'scNym_confidence'
    var: 'gene_id-query', 'n_cells', 'n_counts'
    uns: 'cell_states_colors', 'log1p', 'neighbors', 'object_colors', 'scNym_colors', 'scNym_probabilities', 'umap'
    obsm: 'X_scnym', 'X_umap'
    obsp: 'connectivities', 'distances'

- Read in raw object

In [5]:
# Load the QC-ed, pre-processed object that will receive the scNym annotations.
query_raw = sc.read_h5ad(
    '/home/cartalop/data/single_cell/lung/tb/merged/CaiY_PBMC-TB_QCed_pre-processed_ctl221017.h5ad'
)
query_raw

AnnData object with n_obs × n_vars = 145381 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'object', 'protocol', 'dataset'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts-caiy2020', 'mean_counts-caiy2020', 'pct_dropout_by_counts-caiy2020', 'total_counts-caiy2020', 'n_cells_by_counts-caiy2022', 'mean_counts-caiy2022', 'pct_dropout_by_counts-caiy2022', 'total_counts-caiy2022'
    layers: 'counts', 'sqrt_norm'

### Add annotations to raw object

In [9]:
# Work on a copy so that `query_raw` is left unchanged when annotation columns are added.
query_annotated = query_raw.copy()
query_annotated

AnnData object with n_obs × n_vars = 145381 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'object', 'protocol', 'dataset'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts-caiy2020', 'mean_counts-caiy2020', 'pct_dropout_by_counts-caiy2020', 'total_counts-caiy2020', 'n_cells_by_counts-caiy2022', 'mean_counts-caiy2022', 'pct_dropout_by_counts-caiy2022', 'total_counts-caiy2022'
    layers: 'counts', 'sqrt_norm'

### Copy `scNym` labels and confidence scores from the `scNym` object to the annotated object

In [10]:
# Preview the first rows of the per-cell metadata before the scNym columns are added.
query_annotated.obs.head()

Unnamed: 0_level_0,study,individual,sample,tissue,donor,age,gender,status,data_type,centre,...,n_counts,percent_chrY,XIST-counts,S_score,G2M_score,doublet_scores,predicted_doublets,object,protocol,dataset
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCTGAGAACAATC-acTB3-caiy2020,CaiY_2021,SAMN14048025,PBMC_TB_3,PBMC,SAMN14048025,34,male,active_TB,scRNAseq,Shenzhen University,...,5634.0,0.053248,0.0,-0.352188,-0.193287,0.028007,False,,,caiy2020
AAACCTGAGAAGGTGA-acTB3-caiy2020,CaiY_2021,SAMN14048025,PBMC_TB_3,PBMC,SAMN14048025,34,male,active_TB,scRNAseq,Shenzhen University,...,3536.0,0.113122,0.0,-0.064944,-0.071169,0.058026,False,,,caiy2020
AAACCTGAGATCTGCT-acTB3-caiy2020,CaiY_2021,SAMN14048025,PBMC_TB_3,PBMC,SAMN14048025,34,male,active_TB,scRNAseq,Shenzhen University,...,1846.0,0.054171,0.0,-0.231399,-0.080643,0.093938,False,,,caiy2020
AAACCTGAGCACAGGT-acTB3-caiy2020,CaiY_2021,SAMN14048025,PBMC_TB_3,PBMC,SAMN14048025,34,male,active_TB,scRNAseq,Shenzhen University,...,3993.0,0.075131,0.0,-0.227884,-0.25877,0.071884,False,,,caiy2020
AAACCTGAGCGTGAAC-acTB3-caiy2020,CaiY_2021,SAMN14048025,PBMC_TB_3,PBMC,SAMN14048025,34,male,active_TB,scRNAseq,Shenzhen University,...,2035.0,0.14742,0.0,0.020959,0.02503,0.02244,False,,,caiy2020


In [11]:
# Transfer the scNym labels and confidence scores onto the annotated object.
# Cells are matched by barcode whenever possible, so that a different row
# order between the two objects cannot silently attach labels to the wrong cells.
scnym_obs = query_scnym.obs[['scNym', 'scNym_confidence']]

# Fail early, with a clear message, if the objects do not hold the same number of cells.
if scnym_obs.shape[0] != query_annotated.n_obs:
    raise ValueError(
        f"Cannot transfer scNym annotations: query_scnym has {scnym_obs.shape[0]} cells "
        f"but query_annotated has {query_annotated.n_obs}."
    )

if scnym_obs.index.is_unique and query_annotated.obs_names.isin(scnym_obs.index).all():
    # Every barcode exists in both objects: reorder to the annotated object's cell order.
    scnym_obs = scnym_obs.loc[query_annotated.obs_names]
else:
    # Barcodes do not match (for example if they were renamed upstream):
    # keep the positional assignment, but make the assumption visible.
    warnings.warn(
        "Cell barcodes differ between query_scnym and query_annotated; "
        "scNym annotations are assigned by row position. "
        "Verify that both objects list the cells in the same order."
    )

query_annotated.obs = query_annotated.obs.assign(
    scNym = scnym_obs['scNym'].values,
    scNym_confidence = scnym_obs['scNym_confidence'].values,
)
query_annotated.obs['scNym'].value_counts()

T CD4 naive               41886
Monocyte CD14             26295
NK                        19083
T CD4 helper              12651
T CD8 CTL                  9015
T CD8 naive                6255
B naive                    6236
B n-sw mem                 3504
T reg                      2998
Monocyte CD14 IFN stim     2898
T CD8 CM                   1646
T CD8 EMRA                 1627
Monocyte CD16              1275
MAIT                       1227
B sw mem                   1110
T g/d                       984
T CD8 EM                    919
Platelets                   892
NK CD56                     687
cDC2                        679
T CD4 naive IFN stim        599
T CD4 CTL                   566
Plasma cells                537
Cycling                     441
Monocyte CD16 IFN stim      358
Monocyte CD16+C1            236
B invar                     217
pDC                         217
B naive IFN stim            172
HPC                          55
NK IFN stim                  48
RBC     

In [12]:
# List the categories of the categorical `scNym` column.
# NOTE(review): this may include labels with no cells assigned — compare with the counts above.
query_annotated.obs['scNym'].cat.categories

Index(['B invar', 'B n-sw mem', 'B n-sw mem IFN stim', 'B naive',
       'B naive IFN stim', 'B sw mem', 'Cycling', 'HPC', 'ILC', 'MAIT',
       'Monocyte CD14', 'Monocyte CD14 IFN stim', 'Monocyte CD16',
       'Monocyte CD16 IFN stim', 'Monocyte CD16+C1', 'NK', 'NK CD56',
       'NK IFN stim', 'Plasma cells', 'Plasmablasts', 'Platelets', 'RBC',
       'T CD4 CTL', 'T CD4 helper', 'T CD4 naive', 'T CD4 naive IFN stim',
       'T CD8 CM', 'T CD8 CTL', 'T CD8 EM', 'T CD8 EMRA', 'T CD8 naive',
       'T g/d', 'T reg', 'cDC2', 'pDC'],
      dtype='object')

### Make matrix sparse

In [13]:
# Store the main data matrix as a SciPy CSR sparse matrix before the object is written out.
query_annotated.X = sp.sparse.csr_matrix(query_annotated.X)

### Save object

In [15]:
# Persist the annotated object (raw data plus scNym columns) as an .h5ad file.
query_annotated.write(
    '/home/cartalop/data/single_cell/lung/tb/working_objects/CaiY_TB-PBMC_scnym_annotated_ctl221017.h5ad'
)