## Notebook for exploratory analysis of _Cai Y et al 2020_ and _Cai Y et al 2022_ scRNA-Seq data using `scVI`

- **Developed by**: Carlos Talavera-López Ph.D
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- v221015

### Load required modules

In [None]:
import sys
import scvi
import torch
import anndata
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns

import numpy as np
import pandas as pd
import scanpy as sc
import numpy.random as random


from umap import UMAP
import warnings; warnings.simplefilter('ignore')

import matplotlib.pyplot as plt

In [None]:
# Output directory for the exported object. The trailing slash matters:
# the file name is appended by plain string concatenation when writing.
save_path = "/home/cartalop/data/single_cell/lung/influenza/batch_corrected/"

In [None]:
%matplotlib inline
matplotlib.rcParams["pdf.fonttype"] = 42
matplotlib.rcParams["ps.fonttype"] = 42

In [None]:
# Report whether PyTorch can see a CUDA device in this session.
torch.cuda.is_available()

In [None]:
# Logging level and package versions for the record.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Global figure defaults used by every scanpy plot in this notebook.
sc.settings.set_figure_params(
    dpi = 180,
    color_map = 'magma_r',
    dpi_save = 300,
    vector_friendly = True,
    format = 'svg',
)

### Read in datasets

- Read in formatted object

In [None]:
# Load the AnnData object stored for sample group V1 and display its summary.
Versuch_1 = sc.read_h5ad('../data/COPD_influenza_V1_QCed_pre-process_light_ctl220719.h5ad')
Versuch_1

In [None]:
# Load the AnnData object stored for sample group V2 and display its summary.
Versuch_2 = sc.read_h5ad('../data/COPD_influenza_V2_QCed_pre-process_light_ctl220719.h5ad')
Versuch_2

In [None]:
# Load the AnnData object stored for sample group V3 and display its summary.
Versuch_3 = sc.read_h5ad('../data/COPD_influenza_V3_QCed_pre-process_light_ctl220719.h5ad')
Versuch_3

In [None]:
# Load the AnnData object stored for sample group V4 and display its summary.
Versuch_4 = sc.read_h5ad('../data/COPD_influenza_V4_QCed_pre-process_light_ctl220719.h5ad')
Versuch_4

In [None]:
# Load the AnnData object stored for sample group V5 and display its summary.
Versuch_5 = sc.read_h5ad('../data/COPD_influenza_V5_QCed_pre-process_light_ctl220719.h5ad')
Versuch_5

In [None]:
# Load the AnnData object stored for sample group V6 and display its summary.
Versuch_6 = sc.read_h5ad('../data/COPD_influenza_V6_QCed_pre-process_light_ctl220719.h5ad')
Versuch_6

### Merge objects into a single processed one

In [None]:
# Stack the six objects into one. Each cell's origin is recorded in
# obs['sample_group'] (V1..V6), and only genes shared by all objects are kept.
# NOTE(review): newer anndata releases recommend `anndata.concat` over this
# method; confirm obs/var merging is equivalent before switching.
copd_influenza = Versuch_1.concatenate(
    Versuch_2,
    Versuch_3,
    Versuch_4,
    Versuch_5,
    Versuch_6,
    batch_key = 'sample_group',
    batch_categories = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6'],
    join = 'inner',
)
copd_influenza

In [None]:
# Number of cells per condition in the merged object.
copd_influenza.obs['condition'].value_counts()

### Select condition to study

In [None]:
# Keep only the cells whose condition is 'CTRL'.
# `.copy()` turns the boolean-indexed subset into an independent AnnData
# object instead of a view of `copd_influenza`. The cells below modify
# `adata` in place (obs column, new layer, HVG subsetting), which on a view
# would only work through an implicit copy whose warning is silenced by the
# global warnings filter.
adata = copd_influenza[copd_influenza.obs['condition'].isin(['CTRL'])].copy()
adata

In [None]:
# Number of cells per batch in the selected subset.
adata.obs['batch'].value_counts()

In [None]:
# Store the batch label as a pandas categorical and list its categories;
# it is used as the batch key for HVG selection and as a scVI covariate below.
adata.obs['batch'] = adata.obs['batch'].astype('category')
adata.obs['batch'].cat.categories

### Calculate HVGs

In [None]:
# Keep a copy of the current expression matrix in the 'counts' layer; the HVG
# selection and the scVI setup below both read from this layer.
# NOTE(review): assumes adata.X still holds raw counts at this point — confirm.
adata.layers['counts'] = adata.X.copy()

In [None]:
# Select 8000 highly variable genes from the 'counts' layer, computed per
# batch, and subset `adata` to those genes in place (subset = True).
# NOTE(review): the 'seurat_v3' flavor is documented to expect raw counts —
# confirm the 'counts' layer is unnormalised.
sc.pp.highly_variable_genes(
    adata,
    flavor = "seurat_v3",
    n_top_genes = 8000,
    layer = "counts",
    batch_key = "batch",
    subset = True
)

### Data integration with `scVI`

In [None]:
# Register `adata` with scVI: the model reads the 'counts' layer, with 'batch'
# as a categorical covariate and 'n_genes' / 'n_counts' as continuous
# covariates (all three must exist in adata.obs).
scvi.model.SCVI.setup_anndata(
    adata,
    layer = "counts",
    categorical_covariate_keys = ["batch"],
    continuous_covariate_keys = ["n_genes", "n_counts"]
)

In [None]:
# Build the scVI model on the registered object and show its configuration.
model = scvi.model.SCVI(
    adata,
    n_layers = 3,
    n_latent = 50,
    gene_likelihood = "nb",
    dispersion = 'gene-batch',
)
model

In [None]:
# Train the model; no arguments are overridden, so the library defaults apply.
model.train()

In [None]:
# Extract the latent representation from the trained model and store it in
# adata.obsm under 'X_scVI', where the neighbour graph below picks it up.
latent = model.get_latent_representation()
adata.obsm["X_scVI"] = latent

In [None]:
# Neighbour graph on the scVI latent space.
sc.pp.neighbors(
    adata,
    use_rep = "X_scVI",
    n_neighbors = 50,
    metric = 'minkowski',
)

# UMAP embedding with a fixed seed.
sc.tl.umap(
    adata,
    min_dist = 0.2,
    spread = 8,
    random_state = 1712,
)

# Overview of the embedding coloured by metadata and QC covariates.
sc.pl.umap(
    adata,
    frameon = False,
    color = [
        'age', 'condition', 'PaCO2', 'donor', 'infection',
        'disease', 'SMK', 'batch', 'n_genes', 'n_counts',
    ],
    size = 1,
    legend_fontsize = 5,
    ncols = 4,
)

In [None]:
# UMAP coloured by a panel of genes.
sc.pl.umap(
    adata,
    frameon = False,
    color = [
        'ADH7', 'CDH1', 'CD74', 'CD3E', 'MUC20',
        'DUSP4', 'FOXJ1', 'MUC1', 'FOXI1',
    ],
    size = 1,
    legend_fontsize = 5,
    legend_loc = 'on data',
    ncols = 4,
)

### Use `SCCAF` to select `leiden` resolution

In [None]:
# Leiden clustering on the neighbour graph at resolution 0.7 with a fixed
# seed; labels are written to adata.obs['leiden'].
sc.tl.leiden(adata, resolution = 0.7, random_state = 1786)

In [None]:
# Assess the leiden clustering with SCCAF. The outputs are passed to plot_roc
# in the next cell.
# Note: the input is adata.X, which has already been subset to the highly
# variable genes; it is not the scVI latent space.
# NOTE(review): `n = 100` is presumably the number of cells sampled per
# cluster for training — confirm against the SCCAF documentation.
import matplotlib.pyplot as plt
from SCCAF import SCCAF_assessment, plot_roc
y_prob, y_pred, y_test, clf, cvsm, acc = SCCAF_assessment(adata.X, adata.obs['leiden'], n = 100)

In [None]:
# Plot the ROC curves from the SCCAF assessment above, passing the
# `cvsm` and `acc` values returned by that call.
plot_roc(y_prob, y_test, clf, cvsm = cvsm, acc = acc)
plt.show()

In [None]:
# UMAP with cluster labels drawn on the embedding.
sc.pl.umap(
    adata,
    frameon = False,
    color = ['leiden', 'disease', 'CD74'],
    size = 0.8,
    legend_fontsize = 5,
    legend_loc = 'on data',
)

In [None]:
# UMAP coloured by clusters, metadata, and a panel of genes.
sc.pl.umap(
    adata,
    frameon = False,
    color = [
        'leiden', 'disease', 'infection',
        'ADH7', 'CDH1', 'CD74', 'CD3E', 'MUC20',
        'DUSP4', 'FOXJ1', 'MUC1', 'FOXI1',
    ],
    size = 1,
    legend_fontsize = 5,
)

### Export clustered object

In [None]:
# Write the clustered object to disk. The path is built by string
# concatenation, so `save_path` must end with a path separator.
adata.write(save_path + 'COPD_influenza_CTRL_scVI-clustered_220813_v1.h5ad')