## Notebook for exploratory analysis of _Cai Y et al 2020_ and _Cai Y et al 2022_ scRNA-Seq data using `scVI`

- **Developed by**: Carlos Talavera-López Ph.D
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- v221015

### Load required modules

In [1]:
import sys
import scvi
import torch
import anndata
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns

import numpy as np
import pandas as pd
import scanpy as sc
import numpy.random as random


from umap import UMAP
import warnings; warnings.simplefilter('ignore')

import matplotlib.pyplot as plt

Global seed set to 0
  new_rank_zero_deprecation(
  return new_rank_zero_deprecation(*args, **kwargs)


In [2]:
# Output directory for this notebook's results. The trailing slash matters:
# the export cell builds the file name by plain string concatenation.
save_path = '/home/cartalop/github/TB_cellular_circuits/2-data_integration/'

In [3]:
%matplotlib inline
matplotlib.rcParams["pdf.fonttype"] = 42
matplotlib.rcParams["ps.fonttype"] = 42

In [4]:
# Report whether PyTorch can see a CUDA device before any model is built.
torch.cuda.is_available()

True

In [5]:
# Verbose scanpy logging plus a record of the package versions in use.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Figure defaults shared by every scanpy plot in this notebook.
sc.settings.set_figure_params(
    dpi = 180,
    dpi_save = 300,
    color_map = 'magma_r',
    format = 'svg',
    vector_friendly = True,
)

-----
anndata     0.8.0
scanpy      1.9.1
-----
PIL                 9.2.0
absl                NA
asttokens           NA
attr                22.1.0
backcall            0.2.0
beta_ufunc          NA
binom_ufunc         NA
cffi                1.15.1
chex                0.1.4
colorama            0.4.5
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.2
debugpy             1.6.2
decorator           5.1.1
defusedxml          0.7.1
deprecate           0.3.2
docrep              0.3.2
entrypoints         0.4
etils               0.7.1
executing           0.9.1
flax                0.5.3
fsspec              2022.7.1
google              NA
h5py                3.7.0
hypergeom_ufunc     NA
igraph              0.9.11
ipykernel           6.15.1
ipython_genutils    0.2.0
ipywidgets          7.7.1
jax                 0.3.15
jaxlib              0.3.15
jedi                0.18.1
joblib              1.1.0
kiwisolver          1.4.4
leidenalg           0.8.10
llvmlite            0.39.0


### Read in datasets

- Read in _Cai Y et al 2020_

In [6]:
# Load the QC'ed, pre-processed Cai Y et al 2020 object and show its summary.
caiy2020 = sc.read_h5ad(
    '/home/cartalop/data/single_cell/lung/tb/caiy2020/CaiY2020_PBMC_TB_QCed_pre-process_ctl221014.h5ad'
)
caiy2020

AnnData object with n_obs × n_vars = 73146 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'donor_colors', 'sample_colors'
    layers: 'counts', 'sqrt_norm'

- Read in _Cai Y et al 2022_

In [7]:
# Load the QC'ed, pre-processed Cai Y et al 2022 object.
caiy2022 = sc.read_h5ad(
    '/home/cartalop/data/single_cell/lung/tb/caiy2022/CaiY2022_TB_QCed_pre-process_ctl221015.h5ad'
)

# Label every cell of this dataset with the same status value so the column
# exists in both objects before they are merged.
caiy2022.obs['status'] = 'active_TB'
caiy2022

AnnData object with n_obs × n_vars = 72235 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'data_type', 'centre', 'version', 'object', 'protocol', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'status'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'donor_colors', 'sample_colors'
    layers: 'counts', 'sqrt_norm'

### Verify sample origin

In [8]:
# Number of cells per tissue label in the 2020 dataset.
caiy2020.obs['tissue'].value_counts()

PBMC    73146
Name: tissue, dtype: int64

In [9]:
# Number of cells per data type (scRNAseq vs scTCRseq) in the 2022 dataset.
caiy2022.obs['data_type'].value_counts()

scRNAseq    66560
scTCRseq     5675
Name: data_type, dtype: int64

### Merge datasets

In [10]:
# Stack both studies along the cell axis with an inner join, and record the
# study of origin of every cell in `obs['dataset']`.
caiy_tb = caiy2020.concatenate(
    caiy2022,
    join = 'inner',
    batch_key = 'dataset',
    batch_categories = ['caiy2020', 'caiy2022'],
)
caiy_tb

AnnData object with n_obs × n_vars = 145381 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'object', 'protocol', 'dataset'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts-caiy2020', 'mean_counts-caiy2020', 'pct_dropout_by_counts-caiy2020', 'total_counts-caiy2020', 'n_cells_by_counts-caiy2022', 'mean_counts-caiy2022', 'pct_dropout_by_counts-caiy2022', 'total_counts-caiy2022'
    layers: 'counts', 'sqrt_norm'

### Split datasets into PBMC and PFMC 

In [11]:
# Select the cells whose tissue label is PBMC (the result is a view of caiy_tb).
is_pbmc = caiy_tb.obs['tissue'].isin(['PBMC'])
caiy_pbmc = caiy_tb[is_pbmc]
caiy_pbmc

View of AnnData object with n_obs × n_vars = 100600 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'object', 'protocol', 'dataset'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts-caiy2020', 'mean_counts-caiy2020', 'pct_dropout_by_counts-caiy2020', 'total_counts-caiy2020', 'n_cells_by_counts-caiy2022', 'mean_counts-caiy2022', 'pct_dropout_by_counts-caiy2022', 'total_counts-caiy2022'
    layers: 'counts', 'sqrt_norm'

In [12]:
# Select the cells whose tissue label is PFMC (the result is a view of caiy_tb).
is_pfmc = caiy_tb.obs['tissue'].isin(['PFMC'])
caiy_pfmc = caiy_tb[is_pfmc]
caiy_pfmc

View of AnnData object with n_obs × n_vars = 44781 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'object', 'protocol', 'dataset'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts-caiy2020', 'mean_counts-caiy2020', 'pct_dropout_by_counts-caiy2020', 'total_counts-caiy2020', 'n_cells_by_counts-caiy2022', 'mean_counts-caiy2022', 'pct_dropout_by_counts-caiy2022', 'total_counts-caiy2022'
    layers: 'counts', 'sqrt_norm'

In [13]:
# Number of cells contributed by each sample in the merged object.
caiy_tb.obs['sample'].value_counts()

HRS100512      12629
HRS100508      11605
PBMC_TB_1      11521
PBMC_HC_1      11275
PBMC_TB_3      11174
PBMC_HC_2      10774
PBMC_TB_2      10409
PBMC_LTBI_2     9846
HRS100507       9689
HRS100514       9359
PBMC_LTBI_1     8147
HRS100513       7401
HRS100510       6312
HRS100511       5476
HRS180104       2871
HRS180101       2588
HRS180103       2242
HRS180102       2005
HRS100509         58
Name: sample, dtype: int64

### Process all datasets together for first exploratory analysis

In [14]:
# Drop the cells flagged as scTCRseq so only the other data types remain.
is_tcr = caiy_tb.obs['data_type'].isin(['scTCRseq'])
caiy_tb_gex = caiy_tb[~is_tcr]
caiy_tb_gex

View of AnnData object with n_obs × n_vars = 139706 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'doublet_scores', 'predicted_doublets', 'object', 'protocol', 'dataset'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts-caiy2020', 'mean_counts-caiy2020', 'pct_dropout_by_counts-caiy2020', 'total_counts-caiy2020', 'n_cells_by_counts-caiy2022', 'mean_counts-caiy2022', 'pct_dropout_by_counts-caiy2022', 'total_counts-caiy2022'
    layers: 'counts', 'sqrt_norm'

### Calculate HVGs

In [15]:
# Work on an independent copy so the merged view stays untouched.
adata = caiy_tb_gex.copy()

# NOTE(review): this overwrites the 'counts' layer that came with the input
# objects with the current X. Presumably X still holds raw counts here — confirm,
# since both HVG selection and scVI read from this layer.
adata.layers['counts'] = adata.X.copy()

In [16]:
# Select the 4000 top-ranked genes from the 'counts' layer, computed per sample,
# and subset `adata` to those genes in place (subset = True).
hvg_options = dict(
    flavor = "seurat_v3",
    n_top_genes = 4000,
    layer = "counts",
    batch_key = "sample",
    subset = True,
)
sc.pp.highly_variable_genes(adata, **hvg_options)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
--> added
    'highly_variable', boolean vector (adata.var)
    'highly_variable_rank', float vector (adata.var)
    'means', float vector (adata.var)
    'variances', float vector (adata.var)
    'variances_norm', float vector (adata.var)


### Data integration with `scVI`

In [17]:
# Register the data for scVI.
# `sample` is registered as the batch key (the same key used for HVG selection)
# rather than as an extra categorical covariate: without a batch key scVI sees
# a single batch, and the model's `dispersion = 'gene-batch'` setting would then
# collapse to one dispersion per gene.
# NOTE(review): `n_genes` and `n_counts` are passed unscaled as continuous
# covariates — consider standardising them first; confirm against scvi-tools docs.
scvi.model.SCVI.setup_anndata(
    adata,
    layer = "counts",
    batch_key = "sample",
    continuous_covariate_keys = ["n_genes", "n_counts"]
)

In [18]:
# Build the scVI model: 3 hidden layers, a 50-dimensional latent space,
# negative-binomial likelihood and per-gene, per-batch dispersion.
model = scvi.model.SCVI(
    adata,
    n_layers = 3,
    n_latent = 50,
    gene_likelihood = "nb",
    dispersion = 'gene-batch',
)
model



In [19]:
# Train with the library defaults (no explicit epoch count is passed).
model.train()

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 57/57: 100%|██████████| 57/57 [10:01<00:00, 10.55s/it, loss=1.45e+03, v_num=1]

`Trainer.fit` stopped: `max_epochs=57` reached.


Epoch 57/57: 100%|██████████| 57/57 [10:01<00:00, 10.55s/it, loss=1.45e+03, v_num=1]


In [20]:
# Pull the learned latent representation out of the trained model and store it
# on the AnnData object, where the neighbour graph will read it from.
latent = model.get_latent_representation()
adata.obsm["X_scVI"] = latent

In [21]:
# Neighbour graph on the scVI latent space, then a 2D UMAP embedding.
sc.pp.neighbors(adata, use_rep = "X_scVI", n_neighbors = 50, metric = 'minkowski')
sc.tl.umap(adata, min_dist = 0.2, spread = 8, random_state = 1712)

# Metadata and QC covariates to overlay on the embedding.
umap_covariates = [
    'study', 'individual', 'tissue', 'status',
    'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score',
    'doublet_scores', 'object', 'dataset',
]
sc.pl.umap(
    adata,
    color = umap_covariates,
    frameon = False,
    size = 1,
    legend_fontsize = 5,
    ncols = 4,
)

computing neighbors
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:49)
computing UMAP


In [None]:
# Marker genes to overlay on the UMAP.
marker_genes = ['ADH7', 'CDH1', 'CD74', 'CD3E', 'MUC20', 'DUSP4', 'FOXJ1', 'MUC1', 'FOXI1']

# `adata` was subset to the 4000 highly variable genes, so a hard-coded marker
# may no longer be in `var_names`; plotting a missing key raises a KeyError.
# Plot only the markers that are present and report the ones that are skipped.
markers_present = [gene for gene in marker_genes if gene in adata.var_names]
markers_missing = [gene for gene in marker_genes if gene not in adata.var_names]
if markers_missing:
    print('Skipping markers not present in adata.var_names:', markers_missing)

if markers_present:
    sc.pl.umap(adata, frameon = False, color = markers_present, size = 1, legend_fontsize = 5, legend_loc = 'on data', ncols = 4)

### Use `SCCAF` to select `leiden` resolution

In [None]:
# Leiden clustering on the neighbour graph; result is stored in obs['leiden'].
sc.tl.leiden(
    adata,
    resolution = 0.7,
    random_state = 1786,
)

In [None]:
import matplotlib.pyplot as plt
from SCCAF import SCCAF_assessment, plot_roc

# Run the SCCAF assessment of the current Leiden labels against the expression
# matrix, then unpack the six values it returns for the ROC plot.
sccaf_result = SCCAF_assessment(adata.X, adata.obs['leiden'], n = 100)
y_prob, y_pred, y_test, clf, cvsm, acc = sccaf_result

In [None]:
# Draw the ROC curves from the SCCAF assessment results and render the figure.
plot_roc(
    y_prob,
    y_test,
    clf,
    cvsm = cvsm,
    acc = acc,
)
plt.show()

In [None]:
# Overlay the Leiden clusters, disease status and CD74 expression on the UMAP.
# The merged object has no 'disease' column; disease state is stored in
# obs['status'], so that key is used instead.
requested_keys = ['leiden', 'status', 'CD74']

# Keep only keys that exist as an obs column or a gene: genes may have been
# dropped when `adata` was subset to highly variable genes, and a missing key
# makes sc.pl.umap raise a KeyError.
plot_keys = [key for key in requested_keys if key in adata.obs.columns or key in adata.var_names]
skipped_keys = [key for key in requested_keys if key not in plot_keys]
if skipped_keys:
    print('Skipping keys not found in adata.obs or adata.var_names:', skipped_keys)

if plot_keys:
    sc.pl.umap(adata, frameon = False, color = plot_keys, size = 0.8, legend_fontsize = 5, legend_loc = 'on data')

In [None]:
# Overlay clusters, disease status and marker genes on the UMAP.
# The merged object has neither a 'disease' nor an 'infection' column: disease
# state is stored in obs['status'], and there is no infection annotation, so
# 'status' replaces 'disease' and 'infection' is dropped.
requested_keys = ['leiden', 'status', 'ADH7', 'CDH1', 'CD74', 'CD3E', 'MUC20', 'DUSP4', 'FOXJ1', 'MUC1', 'FOXI1']

# Keep only keys that exist as an obs column or a gene: genes may have been
# dropped when `adata` was subset to highly variable genes, and a missing key
# makes sc.pl.umap raise a KeyError.
plot_keys = [key for key in requested_keys if key in adata.obs.columns or key in adata.var_names]
skipped_keys = [key for key in requested_keys if key not in plot_keys]
if skipped_keys:
    print('Skipping keys not found in adata.obs or adata.var_names:', skipped_keys)

if plot_keys:
    sc.pl.umap(adata, frameon = False, color = plot_keys, size = 1, legend_fontsize = 5)

### Export clustered object

In [None]:
# Write the clustered object to the output directory.
# The previous file name ('COPD_influenza_CTRL_..._220813') came from another
# project and mislabelled this Cai Y 2020/2022 TB dataset; the name now follows
# the 'ctl<date>' convention of the input files.
# `save_path` must end with a path separator because the name is appended directly.
adata.write(save_path + 'CaiY_TB_scVI-clustered_ctl221015.h5ad')