### scVI and SCCAF data integration and clustering analysis of healthy PBMC [Cai20_22]

**Objective**: Run complete integration and clustering analysis after solving scanpy/scVI dependency issues and incorporate Yoshida et al 2021 data

**Developed by**: Mairi McClean

**Institute of Computational Biology - Helmholtz Zentrum Munich**

**v230425**


In [None]:
# Colab bootstrap: install the scvi-colab helper and scib-metrics, then call
# scvi_colab.install() (presumably installs scvi-tools into the kernel).
# NOTE(review): the data paths used below are local /Volumes mounts, so this
# cell presumably only matters when the notebook runs on Colab — confirm.
%pip install --quiet scvi-colab
%pip install --quiet scib-metrics
from scvi_colab import install

install()

In [None]:
import sys
import scvi
import torch
import anndata
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns

import numpy as np
import pandas as pd
import scanpy as sc
import numpy.random as random


from umap import UMAP
import warnings; warnings.simplefilter('ignore')

In [None]:
# Default scanpy figure size for the plots drawn in this notebook.
sc.set_figure_params(figsize=(4, 4))

# Inline backend: white figure background and 'retina' figure format.
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

## Read in datasets for integration
> Cai 2020 + Cai 2022 + Yoshida 2021

### Read in datasets

- Read in _Cai Y et al 2020_

In [None]:
# Load the Cai et al. 2020 PBMC object (file name suggests it is post-QC)
# from the external data lake and echo its summary as the cell output.
cai2020_h5ad = '/Volumes/Lacie/data_lake/Mairi_example/processed_files/abridged_qc/human/Cai2020_scRNA_PBMC_mm230315_qcd.h5ad'
caiy2020 = sc.read_h5ad(cai2020_h5ad)
caiy2020

In [None]:
# file path from local machine
# caiy2020 = sc.read_h5ad('/Users/mairi.mcclean/data/abridged_qc/human/Cai2020_scRNA_PBMC_mm230315_qcd.h5ad')
# caiy2020

In [None]:
# Number of observations per `status` label in the 2020 dataset.
caiy2020.obs['status'].value_counts()

- Read in _Cai Y et al 2022_

In [None]:
# Load the Cai et al. 2022 PBMC object from the external data lake.
# Local-machine alternative path:
#   /Users/mairi.mcclean/data/abridged_qc/human/Cai2022_scRNA_PBMC_mm230315_qcd.h5ad
cai2022_h5ad = '/Volumes/Lacie/data_lake/Mairi_example/processed_files/abridged_qc/human/Cai2022_scRNA_PBMC_mm230315_qcd.h5ad'
caiy2022 = sc.read_h5ad(cai2022_h5ad)

# Label every observation in this dataset as active TB; this overwrites any
# `status` column already present in the file.
caiy2022.obs['status'] = 'active_TB'
caiy2022

In [None]:
# NOTE(review): does Cai 2022 contain any healthy controls? The publication
# suggests all samples were taken from TPE, pneumonia, or lung cancer patients.
# The tally below can only show 'active_TB', because `status` was overwritten
# with that single value when the object was loaded.

caiy2022.obs['status'].value_counts()

- Merge objects


In [None]:
# Stack the two Cai datasets, keeping only the variables they share (inner
# join). The new `dataset` obs column records the source object of each
# observation.
caiy_tb = caiy2020.concatenate(
    caiy2022,
    join='inner',
    batch_key='dataset',
    batch_categories=['caiy2020', 'caiy2022'],
)
caiy_tb

- Read in _Yoshida et al 2021_

In [None]:
# Load the Yoshida et al. 2021 object downloaded into the data-lake INBOX and
# echo its summary as the cell output.
yoshida_h5ad = '/Volumes/Lacie/data_lake/Mairi_example/INBOX/sc_downloads/yoshida_2021/meyer_nikolic_covid_pbmc.cellxgene.20210813.h5ad'
adata_yoshida = sc.read_h5ad(yoshida_h5ad)
adata_yoshida

In [None]:
# Show the per-observation metadata table to see which columns are available.
adata_yoshida.obs

In [None]:
# The original lookup used an empty column name (`obs['']`), which raises
# KeyError unless a column is literally named ''. Until the intended column is
# chosen, tabulate every categorical/string obs column so the label that
# identifies healthy donors can be found.
yoshida_label_columns = adata_yoshida.obs.select_dtypes(include=['category', 'object']).columns
for obs_column in yoshida_label_columns:
    print(adata_yoshida.obs[obs_column].value_counts(), end='\n\n')

### Check that Cai anndata object only contains PBMC scRNA from healthy donors

In [None]:
# Show the merged Cai per-observation metadata table.
caiy_tb.obs

In [None]:
# Observations per `data_type` label (check that only scRNA is present).
caiy_tb.obs['data_type'].value_counts()

In [None]:
# Observations per `tissue` label (check that only PBMC is present).
caiy_tb.obs['tissue'].value_counts()

In [None]:
# Observations per `status` label before filtering out the TB groups.
caiy_tb.obs['status'].value_counts()

In [None]:
# Keep every observation whose status is neither active nor latent TB.
# `.copy()` materialises the subset as an independent AnnData instead of a
# view of `caiy_tb`: a view keeps a reference to its parent, so deleting
# `caiy_tb` later would not release it, and writes to a view trigger an
# implicit copy.
# NOTE(review): this is an exclusion filter — any other status label (or a
# missing value) is retained as "healthy"; confirm against the status counts.
tb_statuses = ['active_TB', 'latent_TB']
caiy_healthy = caiy_tb[~caiy_tb.obs['status'].isin(tb_statuses), :].copy()

In [None]:
# Confirm which `status` labels remain after removing the TB groups.
caiy_healthy.obs['status'].value_counts()

- Merge _Cai_ and _Yoshida_ data

In [None]:
# Combine the healthy Cai observations with the Yoshida object, keeping only
# shared variables (inner join).
# NOTE(review): `batch_key='dataset'` re-uses the column name written by the
# earlier Cai 2020/2022 merge, so those values are presumably replaced here,
# and the Cai side is labelled 'caiy2022' — confirm both are intended.
# NOTE(review): `adata_healthy` is not referenced again in the visible part of
# this notebook; the modelling object is built from `caiy_healthy`.
adata_healthy = caiy_healthy.concatenate(
    adata_yoshida,
    join='inner',
    batch_key='dataset',
    batch_categories=['caiy2022', 'yoshida2021'],
)
adata_healthy

### Calculate HVGs

In [None]:
# Working object for HVG selection and scVI.
# NOTE(review): this copies `caiy_healthy`, not the merged `adata_healthy`,
# so the Yoshida observations are not part of the model input — confirm this
# is intended given the notebook objective.
adata = caiy_healthy.copy()
# Keep a copy of the current X matrix in a `counts` layer; the HVG and scVI
# steps read from this layer.
adata.layers['counts'] = adata.X.copy()

In [None]:
# Select the 3000 most variable genes from the `counts` layer, computed per
# `sample` batch with the seurat_v3 flavour. `subset=True` drops every other
# gene from `adata` in place.
sc.pp.highly_variable_genes(
    adata,
    layer="counts",
    flavor="seurat_v3",
    batch_key="sample",
    n_top_genes=3000,
    subset=True,
)

#### Remove variables that are no longer required before the model run

In [None]:
# Drop the names of the input objects that are no longer used directly.
del caiy2020, caiy2022, caiy_tb

### Integration with scVI


In [None]:
# Register `adata` with scVI: model the `counts` layer and treat the `batch`
# obs column as the batch covariate.
# NOTE(review): the merges in this notebook write a `dataset` column, not
# `batch`; `batch` presumably comes from the input files — confirm it exists.
scvi.model.SCVI.setup_anndata(
    adata,
    layer="counts",
    batch_key="batch",
)

In [None]:
# Build the scVI model: 3 layers, a 50-dimensional latent space, negative
# binomial gene likelihood and one dispersion parameter per gene and batch.
vae = scvi.model.SCVI(
    adata,
    n_layers=3,
    n_latent=50,
    gene_likelihood="nb",
    dispersion="gene-batch",
)

In [None]:
# Print the scVI data registration summary for the model just built.
vae.view_anndata_setup()

In [None]:
# Train the model with the library's default training settings (no arguments
# are overridden here).
vae.train()

In [None]:
# Store the trained model's latent representation on the AnnData object so
# the neighbour graph can be built from it.
scvi_latent = vae.get_latent_representation()
adata.obsm["X_scVI"] = scvi_latent

In [None]:
# Neighbour graph on the scVI latent space (50 neighbours per observation).
sc.pp.neighbors(
    adata,
    use_rep="X_scVI",
    n_neighbors=50,
)

# First-pass Leiden clustering on that graph.
sc.tl.leiden(adata, resolution=1)

# 2-D UMAP embedding with a fixed seed.
sc.tl.umap(
    adata,
    min_dist=0.5,
    spread=8,
    random_state=0,
)

In [None]:
# adata.obsm["X_mde"] = mde(adata.obsm["X_scVI"])

In [None]:
# A bare `plt.figure(...)` in its own cell only creates an empty figure;
# scanpy's plotting functions open their own figure, so the UMAP drawn next
# never used this size. Setting the rcParam is what scanpy reads for its
# panel size.
# NOTE: this setting persists for later plots in the session.
plt.rcParams['figure.figsize'] = (5, 5)

In [None]:
# UMAP coloured by batch/technical covariates and the Leiden clusters, to
# judge by eye how well the integration mixed the batches.
integration_colour_keys = ["batch", "leiden", "tissue", "dataset", "donor"]
sc.pl.umap(
    adata,
    color=integration_colour_keys,
    ncols=4,
    size=4,
    frameon=False,
)


### SCCAF clustering analysis

In [None]:
import SCCAF as sccaf
from SCCAF import SCCAF_assessment, plot_roc

In [None]:
# Coarse Leiden clustering (fixed seed) whose labels are passed to SCCAF
# below; this replaces the `leiden` labels from the earlier resolution=1 run.
# NOTE(review): the previous comment described resolution as a 0-to-1 scale;
# scanpy documents it as a coarseness parameter where higher values give more
# clusters — confirm before relying on an upper bound.
sc.tl.leiden(
    adata,
    resolution=0.1,
    random_state=1786,
)

In [None]:
# Self-projection assessment of the Leiden clusters on the expression matrix.
# Returned values, as named by the unpacking below:
#   y_prob, y_pred, y_test - predicted probabilities, predicted labels and
#                            held-out true labels
#   clf  - presumably the fitted classifier (TODO confirm in the SCCAF docs)
#   cvsm - presumably the mean cross-validation score (TODO confirm)
#   acc  - accuracy on the held-out set
# NOTE(review): `n = 100` presumably caps the training cells drawn per
# cluster — confirm.
(
    y_prob,
    y_pred,
    y_test,
    clf,
    cvsm,
    acc,
) = SCCAF_assessment(adata.X, adata.obs['leiden'], n=100)



In [None]:
# ROC curves for the SCCAF assessment, annotated with the cross-validation
# score and test accuracy computed above.
plot_roc(
    y_prob,
    y_test,
    clf,
    cvsm=cvsm,
    acc=acc,
)
plt.show()

In [None]:
# UMAP of the final clusters next to metadata and marker genes; the markers
# chosen here are of interest in healthy blood cells.
marker_colour_keys = ['leiden', 'status', 'CD74', 'tissue', 'FOXI1', 'CDH1', 'CD3E', 'DUSP4']
sc.pl.umap(
    adata,
    color=marker_colour_keys,
    frameon=False,
    size=0.8,
    legend_loc='on data',
    legend_fontsize=5,
)

### Export clustered object

In [None]:
# Only the last bare expression of a notebook cell is rendered, so the first
# summary was silently dropped; print it explicitly so both objects are shown.
print(adata)
caiy_healthy

In [None]:
# Making a hybrid anndata object using sections from both original anndata object and the cai_tb_gex object
adata_export = anndata.AnnData(X = caiy_healthy.X, var = caiy_healthy.var, obs = adata.obs, uns = adata.uns, obsm = adata.obsm, layers = caiy_healthy.layers, obsp = adata.obsp)
adata_export

In [None]:
# Write the clustered object to the data lake.
# NOTE(review): the file name mentions Yoshida, but the exported object is
# assembled from `caiy_healthy` and `adata` only — confirm the name.
export_h5ad = '/Volumes/Lacie/data_lake/Mairi_example/processed_files/scvi/post_sccaf/CaiY_Yoshida_healthy_scRNA_PBMC_mm230425_scVI-clustered.raw.h5ad'
adata_export.write(export_h5ad)
