#### scVI model benchmarking using scIB and SCCAF clustering analysis of Cai helathy PBMC data


**Objective**: Build and run two different scVI models for healthy Cai PBMC data and then run SCCAF analysis to assess Leiden clustering parameters

**Developed by**: Mairi McClean

**Affiliation**: Institute of Computational Biology, Helmholtz Zentrum Munich

-v230328

### Load modules and set figure params

In [1]:
%pip install --quiet scvi-colab
%pip install --quiet scib-metrics
from scvi_colab import install

install()

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


                Not currently in Google Colab environment.

                Please run with `run_outside_colab=True` to override.

                Returning with no further action.
                
  warn(


In [2]:
import scanpy as sc
import scvi
import matplotlib.pyplot as plt
from rich import print
from scib_metrics.benchmark import Benchmarker
from scvi.model.utils import mde
from scvi_colab import install


Global seed set to 0
  jax.tree_util.register_keypaths(data_clz, keypaths)
  jax.tree_util.register_keypaths(data_clz, keypaths)
  jax.tree_util.register_keypaths(data_clz, keypaths)


In [3]:
sc.set_figure_params(figsize=(4, 4))

%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

### Read in data for integration

In [4]:
# file path from local machine
# caiy2020 = sc.read_h5ad('/Users/mairi.mcclean/data/abridged_qc/human/Cai2020_scRNA_PBMC_mm230315_qcd.h5ad')

caiy2020 = sc.read_h5ad('/Volumes/Lacie/data_lake/Mairi_example/processed_files/abridged_qc/human/Cai2020_scRNA_PBMC_mm230315_qcd.h5ad')
caiy2020

AnnData object with n_obs × n_vars = 73146 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'predicted_doublets'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'donor_colors'
    layers: 'counts', 'sqrt_norm'

In [5]:
caiy2020.obs['status'].value_counts()

active_TB    33104
Healthy      22049
latent_TB    17993
Name: status, dtype: int64

In [6]:
caiy2022 = sc.read_h5ad('/Volumes/Lacie/data_lake/Mairi_example/processed_files/abridged_qc/human/Cai2022_scRNA_PBMC_mm230315_qcd.h5ad')

# file path for local machine
# caiy2022 = sc.read_h5ad('/Users/mairi.mcclean/data/abridged_qc/human/Cai2022_scRNA_PBMC_mm230315_qcd.h5ad')

caiy2022.obs['status'] = 'active_TB'
caiy2022

AnnData object with n_obs × n_vars = 25417 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'data_type', 'centre', 'version', 'object', 'protocol', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'predicted_doublets', 'status'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'donor_colors', 'sample_colors'
    layers: 'counts', 'sqrt_norm'

In [7]:
# So Cai2022 has no healthy controls? Publication suggests that all samples were taken from either TPE, pneumonia, or lung cancer

caiy2022.obs['status'].value_counts()

active_TB    25417
Name: status, dtype: int64

In [8]:
caiy_tb = caiy2020.concatenate(caiy2022, batch_key = 'dataset', batch_categories = ['caiy2020', 'caiy2022'], join = 'inner')
caiy_tb

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


AnnData object with n_obs × n_vars = 98563 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'predicted_doublets', 'object', 'protocol', 'dataset'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts-caiy2020', 'mean_counts-caiy2020', 'pct_dropout_by_counts-caiy2020', 'total_counts-caiy2020', 'n_cells_by_counts-caiy2022', 'mean_counts-caiy2022', 'pct_dropout_by_counts-caiy2022', 'total_counts-caiy2022'
    layers: 'counts', 'sqrt_norm'

In [9]:
caiy_tb.obs['data_type'].value_counts()

scRNAseq    98563
Name: data_type, dtype: int64

In [10]:
caiy_tb.obs['tissue'].value_counts()

PBMC    98563
Name: tissue, dtype: int64

In [11]:
caiy_tb.obs['status'].value_counts()

active_TB    58521
Healthy      22049
latent_TB    17993
Name: status, dtype: int64

In [12]:
caiy_healthy = caiy_tb[~caiy_tb.obs['status'].isin(['active_TB', 'latent_TB']),:]

In [13]:
caiy_healthy.obs['status'].value_counts()

Healthy    22049
Name: status, dtype: int64

In [14]:
caiy_healthy

View of AnnData object with n_obs × n_vars = 22049 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'predicted_doublets', 'object', 'protocol', 'dataset'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts-caiy2020', 'mean_counts-caiy2020', 'pct_dropout_by_counts-caiy2020', 'total_counts-caiy2020', 'n_cells_by_counts-caiy2022', 'mean_counts-caiy2022', 'pct_dropout_by_counts-caiy2022', 'total_counts-caiy2022'
    layers: 'counts', 'sqrt_norm'

In [15]:
caiy_healthy.obs['batch']

barcode
AAACCTGAGATGGCGT-H2-caiy2020    H2
AAACCTGAGATGTGTA-H2-caiy2020    H2
AAACCTGAGCAACGGT-H2-caiy2020    H2
AAACCTGAGCCATCGC-H2-caiy2020    H2
AAACCTGAGGTGATTA-H2-caiy2020    H2
                                ..
TTTGTCAGTCGCGGTT-H1-caiy2020    H1
TTTGTCAGTCGCTTCT-H1-caiy2020    H1
TTTGTCAGTTCGAATC-H1-caiy2020    H1
TTTGTCAGTTTGGGCC-H1-caiy2020    H1
TTTGTCATCCCATTAT-H1-caiy2020    H1
Name: batch, Length: 22049, dtype: category
Categories (2, object): ['H2', 'H1']

In [16]:
caiy_healthy.obs['donor']

barcode
AAACCTGAGATGGCGT-H2-caiy2020    SAMN14048020
AAACCTGAGATGTGTA-H2-caiy2020    SAMN14048020
AAACCTGAGCAACGGT-H2-caiy2020    SAMN14048020
AAACCTGAGCCATCGC-H2-caiy2020    SAMN14048020
AAACCTGAGGTGATTA-H2-caiy2020    SAMN14048020
                                    ...     
TTTGTCAGTCGCGGTT-H1-caiy2020    SAMN14048019
TTTGTCAGTCGCTTCT-H1-caiy2020    SAMN14048019
TTTGTCAGTTCGAATC-H1-caiy2020    SAMN14048019
TTTGTCAGTTTGGGCC-H1-caiy2020    SAMN14048019
TTTGTCATCCCATTAT-H1-caiy2020    SAMN14048019
Name: donor, Length: 22049, dtype: object

In [17]:
caiy_tb.obs['study'].value_counts()

CaiY_2021    73146
CaiY_2022    25417
Name: study, dtype: int64

In [18]:
caiy_healthy.obs['study'].value_counts

<bound method IndexOpsMixin.value_counts of barcode
AAACCTGAGATGGCGT-H2-caiy2020    CaiY_2021
AAACCTGAGATGTGTA-H2-caiy2020    CaiY_2021
AAACCTGAGCAACGGT-H2-caiy2020    CaiY_2021
AAACCTGAGCCATCGC-H2-caiy2020    CaiY_2021
AAACCTGAGGTGATTA-H2-caiy2020    CaiY_2021
                                  ...    
TTTGTCAGTCGCGGTT-H1-caiy2020    CaiY_2021
TTTGTCAGTCGCTTCT-H1-caiy2020    CaiY_2021
TTTGTCAGTTCGAATC-H1-caiy2020    CaiY_2021
TTTGTCAGTTTGGGCC-H1-caiy2020    CaiY_2021
TTTGTCATCCCATTAT-H1-caiy2020    CaiY_2021
Name: study, Length: 22049, dtype: object>

In [19]:
caiy_tb.obs['sample'].value_counts()

PBMC_TB_1      11521
PBMC_HC_1      11275
PBMC_TB_3      11174
PBMC_HC_2      10774
PBMC_TB_2      10409
PBMC_LTBI_2     9846
HRS100507       8573
PBMC_LTBI_1     8147
HRS100513       6828
HRS100511       5250
HRS180101       2578
HRS180103       2188
Name: sample, dtype: int64

In [20]:
caiy_healthy.obs['sample'].value_counts()

PBMC_HC_1    11275
PBMC_HC_2    10774
Name: sample, dtype: int64

In [21]:
# There appears to be no healthy controls in the Cai 2022 data; check this

### Calculate HVGs

In [22]:
adata = caiy_healthy.copy()
adata.layers['counts'] = adata.X.copy()

In [23]:
sc.pp.highly_variable_genes(
    adata,
    flavor = "seurat_v3",
    n_top_genes = 3000,
    layer = "counts",
    batch_key = "sample",
    subset = True
)



#### Remove unrequired variables prior to model run

In [24]:
del caiy2020
del caiy2022
del caiy_tb

### Integration with scVI


#### Model 1

In [25]:
scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="batch")



In [26]:
vae_1 = scvi.model.SCVI(adata, n_layers=2, n_latent=50, gene_likelihood="nb", dispersion="gene-batch")

In [27]:
scvi.model.SCVI.view_anndata_setup(vae_1)

In [28]:
vae_1.train()

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


Epoch 17/363:   4%|▍         | 16/363 [01:49<39:06,  6.76s/it, loss=1.77e+03, v_num=1]

In [None]:
adata.obsm["X_scVI_1"] = vae_1.get_latent_representation()

In [None]:
sc.pp.neighbors(adata, n_neighbors=50, use_rep="X_scVI_1")
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata, min_dist=0.5, spread=8, random_state=0)

In [None]:
plt.figure(figsize = (5, 5))

In [None]:
sc.pl.umap(
    adata,
    color=["batch", "leiden", "tissue", "dataset", "donor"],
    frameon=False,
    ncols=4,
    size=4
)


#### Model 2

In [None]:
scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="batch")

In [None]:
vae_2 = scvi.model.SCVI(adata, n_layers=3, n_latent=50, gene_likelihood="nb", dispersion="gene-batch")

In [None]:
scvi.model.SCVI.view_anndata_setup(vae_2)

In [None]:
vae_2.train()

In [None]:
adata.obsm["X_scVI_2"] = vae_2.get_latent_representation()

In [None]:
sc.pp.neighbors(adata, n_neighbors=50, use_rep="X_scVI_2")
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata, min_dist=0.5, spread=8, random_state=0)

In [None]:
plt.figure(figsize = (5, 5))

In [None]:
sc.pl.umap(
    adata,
    color=["batch", "leiden", "tissue", "dataset", "donor"],
    frameon=False,
    ncols=4,
    size=4
)


#### Model 3

In [None]:
scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="batch")

In [None]:
vae_3 = scvi.model.SCVI(adata, n_layers=2, n_latent=50, gene_likelihood="nb", dispersion="gene-batch")

In [None]:
scvi.model.SCVI.view_anndata_setup(vae_3)

In [None]:
vae_3.train()

In [None]:
adata.obsm["X_scVI_3"] = vae_3.get_latent_representation()

In [None]:
sc.pp.neighbors(adata, n_neighbors=50, use_rep="X_scVI_3")
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata, min_dist=0.5, spread=8, random_state=0)

In [None]:
plt.figure(figsize = (5, 5))

In [None]:
sc.pl.umap(
    adata,
    color=["batch", "leiden", "tissue", "dataset", "donor"],
    frameon=False,
    ncols=4,
    size=4
)


#### Model 4

In [None]:
scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="batch")

In [None]:
vae_4 = scvi.model.SCVI(adata, n_layers=3, n_latent=50, gene_likelihood="zinb", dispersion="gene-batch")

In [None]:
scvi.model.SCVI.view_anndata_setup(vae_4)

In [None]:
vae_4.train()

In [None]:
adata.obsm["X_scVI_4"] = vae_4.get_latent_representation()

In [None]:
sc.pp.neighbors(adata, n_neighbors=50, use_rep="X_scVI_4")
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata, min_dist=0.5, spread=8, random_state=0)

In [None]:
plt.figure(figsize = (5, 5))

In [None]:
sc.pl.umap(
    adata,
    color=["batch", "leiden", "tissue", "dataset", "donor"],
    frameon=False,
    ncols=4,
    size=4
)
