#### scVI model benchmarking using scIB and SCCAF clustering analysis of Cai healthy PBMC data


**Objective**: Build and run three different scVI models for healthy Cai PBMC data and then run SCCAF analysis to assess Leiden clustering parameters

**Developed by**: Mairi McClean

**Affiliation**: Institute of Computational Biology, Helmholtz Zentrum Munich

-v230328

### Load modules and set figure params

In [None]:
# Install the scvi-colab helper and scib-metrics into the running kernel
# (--quiet suppresses most of pip's output).
%pip install --quiet scvi-colab
%pip install --quiet scib-metrics
from scvi_colab import install

# Run the scvi-colab installer.
# NOTE(review): presumably this installs scvi-tools and its dependencies
# for Colab-style environments; confirm against the scvi-colab documentation.
install()

In [None]:
import scanpy as sc
import scvi
import matplotlib.pyplot as plt
from rich import print
from scib_metrics.benchmark import Benchmarker
from scvi.model.utils import mde
from scvi_colab import install


In [None]:
# Set the default scanpy figure size to 4 x 4.
sc.set_figure_params(figsize=(4, 4))

# Inline backend: draw figures on a white background and at retina resolution.
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

### Read in data for integration

In [None]:
# Load the QC'd Cai 2020 PBMC object (file path on the local machine)
# and display its summary.
caiy2020 = sc.read_h5ad('/Users/mairi.mcclean/data/abridged_qc/human/Cai2020_scRNA_PBMC_mm230315_qcd.h5ad')
caiy2020

In [None]:
# Number of cells per disease status in the 2020 dataset.
caiy2020.obs['status'].value_counts()

In [None]:
# Alternative location of the same file on an external drive (kept for reference):
# caiy2022 = sc.read_h5ad('/Volumes/Lacie/data_lake/Mairi_example/processed_files/abridged_qc/human/Cai2022_scRNA_PBMC_mm230315_qcd.h5ad')

# Load the QC'd Cai 2022 PBMC object (file path on the local machine).
caiy2022 = sc.read_h5ad('/Users/mairi.mcclean/data/abridged_qc/human/Cai2022_scRNA_PBMC_mm230315_qcd.h5ad')

# Label every cell of the 2022 dataset as 'active_TB' (overwrites any existing 'status' column).
caiy2022.obs['status'] = 'active_TB'
caiy2022

In [None]:
# Open question: does Cai2022 have no healthy controls? The publication suggests
# that all samples were taken from TPE, pneumonia, or lung cancer patients.

# After the assignment above this shows a single category, 'active_TB'.
caiy2022.obs['status'].value_counts()

In [None]:
# Merge both datasets on their shared genes (join='inner'); the source of each
# cell is recorded in obs['dataset'] as 'caiy2020' or 'caiy2022'.
# NOTE(review): AnnData.concatenate is deprecated in recent anndata releases in
# favour of anndata.concat; consider migrating.
caiy_tb = caiy2020.concatenate(caiy2022, batch_key = 'dataset', batch_categories = ['caiy2020', 'caiy2022'], join = 'inner')
caiy_tb

In [None]:
# Cell counts per data type in the merged object.
caiy_tb.obs['data_type'].value_counts()

In [None]:
# Cell counts per tissue in the merged object.
caiy_tb.obs['tissue'].value_counts()

In [None]:
# Cell counts per disease status in the merged object.
caiy_tb.obs['status'].value_counts()

In [None]:
# Keep only cells whose status is neither 'active_TB' nor 'latent_TB';
# the remainder is treated as the healthy subset in the rest of the notebook.
caiy_healthy = caiy_tb[~caiy_tb.obs['status'].isin(['active_TB', 'latent_TB']),:]

In [None]:
# Sanity check: statuses remaining after the filter.
caiy_healthy.obs['status'].value_counts()

In [None]:
# Summary of the healthy subset.
caiy_healthy

In [None]:
# Per-cell 'batch' annotation (used later as the scVI batch key).
caiy_healthy.obs['batch']

In [None]:
# Per-cell 'donor' annotation.
caiy_healthy.obs['donor']

In [None]:
# Cell counts per study in the merged object.
caiy_tb.obs['study'].value_counts()

In [None]:
# Cell counts per study in the healthy subset.
# value_counts must be called; without the parentheses the cell only
# displays the bound method instead of the counts.
caiy_healthy.obs['study'].value_counts()

In [None]:
# Cell counts per sample in the merged object.
caiy_tb.obs['sample'].value_counts()

In [None]:
# Cell counts per sample in the healthy subset.
caiy_healthy.obs['sample'].value_counts()

In [None]:
# TODO: there appear to be no healthy controls in the Cai 2022 data; check this.

### Calculate HVGs

In [None]:
adata = caiy_healthy.copy()
adata.layers['counts'] = adata.X.copy()

In [None]:
sc.pp.highly_variable_genes(
    adata,
    flavor = "seurat_v3",
    n_top_genes = 3000,
    layer = "counts",
    batch_key = "sample",
    subset = True
)

#### Remove unrequired variables prior to model run

In [None]:
# Drop the names of the input objects that are not used from here on.
# NOTE(review): caiy_healthy (created by slicing caiy_tb) is still bound; if it
# is a view it may keep caiy_tb's data in memory. Confirm if memory matters.
del caiy2020, caiy2022, caiy_tb

### Integration with scVI


#### Model 1

In [None]:
scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="batch")

In [None]:
vae_1 = scvi.model.SCVI(adata, n_layers=3, n_latent=50, gene_likelihood="nb", dispersion="gene-batch")

In [None]:
scvi.model.SCVI.view_anndata_setup(vae_1)

In [None]:
vae_1.train()

In [None]:
adata.obsm["X_scVI"] = vae_1.get_latent_representation()

In [None]:
sc.pp.neighbors(adata, n_neighbors=50, use_rep="X_scVI")
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata, min_dist=0.5, spread=8, random_state=0)

In [None]:
plt.figure(figsize = (5, 5))

In [None]:
sc.pl.umap(
    adata,
    color=["batch", "leiden", "tissue", "dataset", "donor"],
    frameon=False,
    ncols=4,
    size=4
)


#### Model 2

In [None]:
scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="batch")

In [None]:
vae_2 = scvi.model.SCVI(adata, n_layers=2, n_latent=50, gene_likelihood="zinb", dispersion="gene-batch")

In [None]:
scvi.model.SCVI.view_anndata_setup(vae_2)

In [None]:
vae_2.train()

In [None]:
adata.obsm["X_scVI_2"] = vae_2.get_latent_representation()

In [None]:
sc.pp.neighbors(adata, n_neighbors=50, use_rep="X_scVI")
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata, min_dist=0.5, spread=8, random_state=0)

In [None]:
plt.figure(figsize = (5, 5))

In [None]:
sc.pl.umap(
    adata,
    color=["batch", "leiden", "tissue", "dataset", "donor"],
    frameon=False,
    ncols=4,
    size=4
)


#### Model 3

In [None]:
scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="batch")

In [None]:
vae_3 = scvi.model.SCVI(adata, n_layers=2, n_latent=50, gene_likelihood="poisson", dispersion="gene-batch")

In [None]:
scvi.model.SCVI.view_anndata_setup(vae_2)

In [None]:
vae_2.train()

In [None]:
adata.obsm["X_scVI_3"] = vae_2.get_latent_representation()

In [None]:
sc.pp.neighbors(adata, n_neighbors=50, use_rep="X_scVI")
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata, min_dist=0.5, spread=8, random_state=0)

In [None]:
plt.figure(figsize = (5, 5))

In [None]:
sc.pl.umap(
    adata,
    color=["batch", "leiden", "tissue", "dataset", "donor"],
    frameon=False,
    ncols=4,
    size=4
)
