### Notebook to format healthy PBMCS [Cai 2020 + 2022] for label transfer with `scNym`
> Healthy reference taken from Yoshida et al, 2021

- **Developed by**: Carlos Talavera-López Ph.D

- **Modified by**: Mairi McClean

- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**

- original v221017; modified v230320

### Import required modules

In [1]:
import anndata
import numpy as np
import pandas as pd
import scanpy as sc

### Read in query and reference objects

In [2]:
query = sc.read_h5ad('/Volumes/Lacie/data_lake/Mairi_example/processed_files/scvi/post_sccaf/CaiY_healthy_scRNA_PBMC_mm230316_scVI-clustered.raw.h5ad') 
query

AnnData object with n_obs × n_vars = 145381 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'object', 'protocol', 'dataset'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts-caiy2020', 'mean_counts-caiy2020', 'pct_dropout_by_counts-caiy2020', 'total_counts-caiy2020', 'n_cells_by_counts-caiy2022', 'mean_counts-caiy2022', 'pct_dropout_by_counts-caiy2022', 'total_counts-caiy2022'
    layers: 'counts', 'sqrt_norm'

In [3]:
query.var.head()

Unnamed: 0,gene_id,mt,ribo,n_cells_by_counts-caiy2020,mean_counts-caiy2020,pct_dropout_by_counts-caiy2020,total_counts-caiy2020,n_cells_by_counts-caiy2022,mean_counts-caiy2022,pct_dropout_by_counts-caiy2022,total_counts-caiy2022
DDX11L1,ENSG00000223972.5,False,False,6,8.2e-05,99.991797,6.0,9,0.000138,99.987541,10.0
WASH7P,ENSG00000227232.5,False,False,1,1.4e-05,99.998633,1.0,41,0.000568,99.943241,41.0
MIR6859-1,ENSG00000278267.1,False,False,0,0.0,100.0,0.0,0,0.0,100.0,0.0
MIR1302-2HG,ENSG00000243485.5,False,False,0,0.0,100.0,0.0,0,0.0,100.0,0.0
MIR1302-2,ENSG00000284332.1,False,False,0,0.0,100.0,0.0,0,0.0,100.0,0.0


In [4]:
query.obs['status'].cat.categories

Index(['Healthy', 'active_TB', 'latent_TB'], dtype='object')

In [5]:
yoshida = sc.read_h5ad('/Volumes/LaCie/data_lake/Mairi_example/INBOX/sc_downloads/yoshida_2021/meyer_nikolic_covid_pbmc.cellxgene.20210813.h5ad') 
yoshida

AnnData object with n_obs × n_vars = 422220 × 33559
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT', 'nFeature_ADT', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'Smoker', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'patient_id', 'sample_id', 'sequencing_library', 'Protein_modality_weight'
    var: 'name'

In [6]:
yoshida_pbmc = yoshida[yoshida.obs['COVID_status'].isin(['Healthy'])]
yoshida_pbmc

View of AnnData object with n_obs × n_vars = 173684 × 33559
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT', 'nFeature_ADT', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'Smoker', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'patient_id', 'sample_id', 'sequencing_library', 'Protein_modality_weight'
    var: 'name'

### Format data as reference for `scNym`

In [7]:
query.obs['domain_label'] = query.obs['sample'].copy()
query.obs['domain_label'] = 'target_' + query.obs['domain_label'].astype(str)
query.obs['domain_label'] = query.obs['domain_label'].astype('category')
query.obs['domain_label'].cat.categories

Index(['target_HRS100507', 'target_HRS100508', 'target_HRS100509',
       'target_HRS100510', 'target_HRS100511', 'target_HRS100512',
       'target_HRS100513', 'target_HRS100514', 'target_HRS180101',
       'target_HRS180102', 'target_HRS180103', 'target_HRS180104',
       'target_PBMC_HC_1', 'target_PBMC_HC_2', 'target_PBMC_LTBI_1',
       'target_PBMC_LTBI_2', 'target_PBMC_TB_1', 'target_PBMC_TB_2',
       'target_PBMC_TB_3'],
      dtype='object')

In [8]:
query.obs['cell_states'] = 'Unlabeled'

### Format data as query for `scNym`

In [9]:
yoshida_pbmc.obs['annotation_detailed'].value_counts()

T CD4 naive               32672
Monocyte CD14             20464
B naive                   19295
NK                        19085
T CD8 naive               16140
T CD4 helper              13552
T CD8 CTL                  9541
T CD8 CM                   5544
Monocyte CD16              4457
T reg                      3251
T g/d                      3183
B n-sw mem                 2993
Monocyte CD14 IFN stim     2559
NK CD56                    2353
MAIT                       2213
B sw mem                   2068
T CD4 naive IFN stim       1860
T CD8 EMRA                 1834
cDC2                       1371
T CD4 CTL                  1331
Cycling                    1012
B invar                     869
T CD8 EM                    795
B naive IFN stim            745
pDC                         706
Platelets                   626
Monocyte CD16 IFN stim      616
Monocyte CD16+C1            464
NK IFN stim                 433
HPC                         414
Plasma cells                305
NKT     

In [10]:
yoshida_pbmc.obs['cell_states'] = yoshida_pbmc.obs['annotation_detailed'].copy()
yoshida_pbmc.obs['status'] = 'Healthy'

  meyer_pbmc.obs['cell_states'] = meyer_pbmc.obs['annotation_detailed'].copy()


In [11]:
yoshida_pbmc.obs['domain_label'] = yoshida_pbmc.obs['sample_id'].copy()
yoshida_pbmc.obs['domain_label'] = 'train_' + yoshida_pbmc.obs['domain_label'].astype(str)
yoshida_pbmc.obs['domain_label'] = yoshida_pbmc.obs['domain_label'].astype('category')
yoshida_pbmc.obs['domain_label'].cat.categories

Index(['train_AN1', 'train_AN11', 'train_AN12', 'train_AN13', 'train_AN14',
       'train_AN2', 'train_AN3', 'train_AN5', 'train_AN6', 'train_AN7',
       'train_AN9', 'train_NP13', 'train_NP15', 'train_NP16', 'train_NP17',
       'train_NP18', 'train_NP19', 'train_NP20', 'train_NP21', 'train_NP22',
       'train_NP23', 'train_NP24', 'train_NP26', 'train_NP27', 'train_NP28',
       'train_NP30', 'train_NP31', 'train_NP32', 'train_NP35', 'train_NP36',
       'train_NP37', 'train_NP38', 'train_NP39', 'train_NP41', 'train_NP44'],
      dtype='object')

### Merge two objects 

In [12]:
healthy_pbmc = query.concatenate(yoshida_pbmc, batch_key = 'object', batch_categories = ['query', 'reference'], join = 'inner')
healthy_pbmc

  warn(
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


AnnData object with n_obs × n_vars = 319065 × 22792
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'object', 'protocol', 'dataset', 'domain_label', 'cell_states', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT', 'nFeature_ADT', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'Smoker', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'patient_id', 'sample_id', 'sequencing_library', 'Protein_modality_weight'
    var: 'gene_id-query', 'mt-query', 'ribo-query', 'n_cells_by_counts-caiy2020-query', 'mean_counts-caiy2020-query', 'pct_dropout_by_counts-caiy2020-query', 'total_counts-caiy2020-query', 

### Clean up object

- Clean up `adata.obs`

In [13]:
healthy_pbmc.obs.drop(healthy_pbmc.obs.columns.difference(['domain_label','cell_states', 'object']), 1, inplace = True)
healthy_pbmc

  tb_pbmc.obs.drop(tb_pbmc.obs.columns.difference(['domain_label','cell_states', 'object']), 1, inplace = True)


AnnData object with n_obs × n_vars = 319065 × 22792
    obs: 'object', 'domain_label', 'cell_states'
    var: 'gene_id-query', 'mt-query', 'ribo-query', 'n_cells_by_counts-caiy2020-query', 'mean_counts-caiy2020-query', 'pct_dropout_by_counts-caiy2020-query', 'total_counts-caiy2020-query', 'n_cells_by_counts-caiy2022-query', 'mean_counts-caiy2022-query', 'pct_dropout_by_counts-caiy2022-query', 'total_counts-caiy2022-query', 'name-reference'

- Clean up `adata.var`

In [14]:
healthy_pbmc.var.drop(healthy_pbmc.var.columns.difference(['gene_id-query']), 1, inplace = True)
healthy_pbmc

  tb_pbmc.var.drop(tb_pbmc.var.columns.difference(['gene_id-query']), 1, inplace = True)


AnnData object with n_obs × n_vars = 319065 × 22792
    obs: 'object', 'domain_label', 'cell_states'
    var: 'gene_id-query'

### Save object for `scNym`

In [15]:
healthy_pbmc.write('/Volumes/LaCie/data_lake/Mairi_example/processed_files/label_transfer/CaiY_PBMC_healthy_pre-scnym_mm230320.h5ad')