### Notebook for the integration using ```scVI``` and clustering evaluation with ```SCCAF``` for healthy PBMCs

- **Objective**: Integration of healthy PBMCs from [Cai 2020](https://pubmed.ncbi.nlm.nih.gov/32114394/) and [Yoshida 2021](https://www.nature.com/articles/s41591-021-01329-2), with SCCAF clustering analysis and scIB metric calculation
- **Developed by**: Mairi McClean
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- **v230628**


### Import required modules

In [None]:
import sys
import scvi
import torch
import anndata
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
import ipywidgets as ipw

import numpy as np
import pandas as pd
import scanpy as sc
import numpy.random as random


from umap import UMAP
import warnings; warnings.simplefilter('ignore')

In [None]:
# Use scanpy verbosity level 3 and print the versions of the loaded packages
# so the run can be reproduced later.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Shared figure defaults for every scanpy plot produced in this notebook.
sc.settings.set_figure_params(
    dpi=160,
    dpi_save=180,
    color_map='RdPu',
    vector_friendly=True,
    format='svg',
)

### Read in datasets for integration


- Read in _Cai Y et al 2020_

In [None]:
caiy2020 = sc.read_h5ad('/Users/mairi.mcclean/example_data_lake/processed_files/final_qc_mm230627/human/cai_2020/CaiY2020_PBMC_mm230627.h5ad')
caiy2020

In [None]:
caiy2020.obs['status'].value_counts()

In [None]:
caiy_healthy = caiy2020[caiy2020.obs['status'].isin(['Healthy'])]
caiy_healthy

In [None]:
caiy_healthy.obs['status'].value_counts()

- Read in _Yoshida et al 2021_

In [None]:
adata_yoshida = sc.read_h5ad('/Users/mairi.mcclean/example_data_lake/processed_files/final_qc_mm230627/human/yoshida_2021/Yoshida2021_PBMC_mm230628.h5ad')
adata_yoshida

In [None]:
adata_yoshida.obs['COVID_status'].value_counts()

- Subset healthy Yoshida data 

In [None]:
yoshida_healthy = adata_yoshida[adata_yoshida.obs['COVID_status'].isin(['Healthy'])]
yoshida_healthy

In [None]:
yoshida_healthy.obs['COVID_status'].value_counts()

In [None]:
yoshida_healthy.obs

In [None]:
yoshida_healthy.obs['patient_id'].value_counts()

In [None]:
yoshida_healthy.obs['sample_id'].value_counts()

- Rename shared column headers 
> so that subsequent object concatenation is easier

In [None]:
# Map the Yoshida column names onto the names shared with the Cai object.
shared_column_names = {
    "patient_id": "donor",
    "COVID_status": "status",
    "Sex": "gender",
    "Age_group": "age",
}
yoshida_healthy.obs.rename(columns=shared_column_names, inplace=True)

# Every cell in this object gets the same tissue label.
yoshida_healthy.obs['tissue'] = 'PBMC'
yoshida_healthy.obs

- Merge _Cai_ and _Yoshida_ data

In [None]:
# Stack the two healthy subsets cell-wise. Each cell's origin is recorded in
# obs['dataset'] as 'caiy2020' or 'yoshida2021'.
# NOTE(review): `join='inner'` is expected to keep only genes present in both
# objects -- confirm against the resulting `n_vars`.
adata_healthy = caiy_healthy.concatenate(
    yoshida_healthy,
    join='inner',
    batch_key='dataset',
    batch_categories=['caiy2020', 'yoshida2021'],
)
adata_healthy

- Remove unrequired columns

In [None]:
adata_healthy.obs = adata_healthy.obs.drop(columns=['age', 'gender', 'data_type', 'centre', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score'])

In [None]:
adata_healthy.obs = adata_healthy.obs.drop(columns= ['version', 'batch', 'Ethnicity', 'Group', 'sequencing_library', 'Protein_modality_weight', 
'individual', 'sample', 'COVID_severity', 'Smoker', 'BMI', 'nFeature_ADT', 'nCount_ADT', 'nFeature_RNA', 'nCount_RNA', 'orig.ident', 
'G2M_score', 'predicted_doublets', 'total_counts_mt', 'total_counts', 'n_genes_by_counts', 'n_genes'])

In [None]:
adata_healthy.obs['study'] = adata_healthy.obs['study'].cat.add_categories("Yoshida_2021").fillna("Yoshida_2021")

In [None]:
adata_healthy.obs

### Harmonized dataset labels

- Check that the 'tissue' label is consistent across both datasets

In [None]:
# Cells per tissue label in the merged object.
adata_healthy.obs['tissue'].value_counts()

In [None]:
# Check donor: cells per donor in the merged object.
adata_healthy.obs['donor'].value_counts()

### Calculate HVGs

In [None]:
adata_raw = adata_healthy.copy()
adata_healthy.layers['counts'] = adata_healthy.X.copy()

In [None]:
sc.pp.highly_variable_genes(adata_healthy,
    flavor = "seurat_v3", 
    n_top_genes = 3000, 
    layer = "counts", 
    batch_key = "donor", 
    subset = True
)

adata_healthy

#### Remove unrequired variables prior to model run
- To reduce the memory footprint of the notebook

In [None]:
# Drop the names of the full (unsubset) source objects; they are not used again.
del caiy2020, adata_yoshida

### Integration with scVI


- Batch effect UMAP diagnostics

In [None]:
sc.pp.neighbors(adata_healthy, n_neighbors=50, use_rep='X', random_state=30, metric = 'minkowski')
sc.tl.umap(adata_healthy, min_dist = 0.3, spread = 1, random_state = 30)
sc.pl.umap(
    adata_healthy,
    color=["dataset", "donor"],
    frameon=False,
    ncols=4,
    size=4
)


In [None]:
scvi.model.SCVI.setup_anndata(adata_healthy, layer="counts", batch_key="donor")

#### Model 1

In [None]:
scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="batch")

In [None]:
vae_1 = scvi.model.SCVI(adata, n_layers=2, n_latent=15, gene_likelihood="nb", dispersion="gene-batch")

In [None]:
scvi.model.SCVI.view_anndata_setup(vae_1)

In [None]:
vae_1.train()

In [None]:
adata.obsm["X_scVI_1"] = vae_1.get_latent_representation()

In [None]:
sc.pp.neighbors(adata, n_neighbors=50, use_rep="X_scVI_1")
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata, min_dist=0.5, spread=8, random_state=30)

In [None]:
plt.figure(figsize = (5, 5))

In [None]:
sc.pl.umap(
    adata,
    color=['batch', 'leiden', 'dataset', 'donor', 'CD3E', 'CD74', 'CD8A', 'NKG7', 'CD14', 'FCGR3A', 'CD19', 'CD24', 'TOP2A', 'CD1C', 'CCR7', 'CLDN5'],
    frameon=False,
    ncols=4,
    size=4
)

#### Model 2

In [None]:
scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="batch")

In [None]:
vae_2 = scvi.model.SCVI(adata, n_layers=3, n_latent=25, gene_likelihood="nb", dispersion="gene-batch")

In [None]:
scvi.model.SCVI.view_anndata_setup(vae_2)

In [None]:
vae_2.train()

In [None]:
adata.obsm["X_scVI_2"] = vae_2.get_latent_representation()

In [None]:
sc.pp.neighbors(adata, n_neighbors=50, use_rep="X_scVI_2")
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata, min_dist=0.5, spread=8, random_state=30)

In [None]:
plt.figure(figsize = (5, 5))

In [None]:
sc.pl.umap(
    adata,
    color=['batch', 'leiden', 'dataset', 'donor', 'CD3E', 'CD74', 'CD8A', 'NKG7', 'CD14', 'FCGR3A', 'CD19', 'CD24', 'TOP2A', 'CD1C', 'CCR7', 'CLDN5'],
    frameon=False,
    ncols=4,
    size=4
)

#### Model 3

In [None]:
scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="batch")

In [None]:
vae_3 = scvi.model.SCVI(adata, n_layers=3, n_latent=50, gene_likelihood="nb", dispersion="gene-batch")

In [None]:
scvi.model.SCVI.view_anndata_setup(vae_3)

In [None]:
vae_3.train()

In [None]:
adata.obsm["X_scVI_3"] = vae_3.get_latent_representation()

In [None]:
sc.pp.neighbors(adata, n_neighbors=50, use_rep="X_scVI_3")
sc.tl.leiden(adata, resolution=1)
sc.tl.umap(adata, min_dist=0.5, spread=8, random_state=30)

In [None]:
plt.figure(figsize = (5, 5))

In [None]:
sc.pl.umap(
    adata,
    color=['batch', 'leiden', 'dataset', 'donor', 'CD3E', 'CD74', 'CD8A', 'NKG7', 'CD14', 'FCGR3A', 'CD19', 'CD24', 'TOP2A', 'CD1C', 'CCR7', 'CLDN5'],
    frameon=False,
    ncols=4,
    size=4
)


### SCCAF clustering analysis

In [None]:
import SCCAF as sccaf
from SCCAF import SCCAF_assessment, plot_roc

- Model 1

In [None]:
y_prob, y_pred, y_test, clf, cvsm, acc = SCCAF_assessment(adata.obsm["X_scVI_1"], adata_healthy.obs['leiden'], n = 100)

- Model 2

In [None]:
y_prob, y_pred, y_test, clf, cvsm, acc = SCCAF_assessment(adata.obsm["X_scVI_2"], adata_healthy.obs['leiden'], n = 100)

- Model 3

In [None]:
y_prob, y_pred, y_test, clf, cvsm, acc = SCCAF_assessment(adata.obsm["X_scVI_3"], adata_healthy.obs['leiden'], n = 100)

In [None]:
plot_roc(y_prob, y_test, clf, cvsm = cvsm, acc = acc)

plt.show()

### Benchmarking the scVI embeddings with scIB metrics

In [None]:
from scib_metrics.benchmark import Benchmarker

%matplotlib inline

In [None]:
adata.obs

In [None]:
bm = Benchmarker(adata, 
                 batch_key="batch", 
                 label_key="leiden",
                 embedding_obsm_keys=["X_scVI_1", "X_scVI_2", "X_scVI_3"],
                 n_jobs=1,
                 )



bm.benchmark()

In [None]:
# Benchmark vis: summary table of the benchmark results.

bm.plot_results_table()

In [None]:
# Optional: uncomment to print with `rich` instead of the built-in print.
# from rich import print

# Fetch the results without min-max scaling and print them.
df = bm.get_results(min_max_scale=False)
print(df)