## Notebook for the exploratory analysis of healthy PBMCs from Meyer21 using `scVI`

- **Developed by**: Carlos Talavera-López Ph.D
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- v220607

### Load required modules

In [1]:
import sys
import scvi
import anndata
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

import numpy as np
import pandas as pd
import scanpy as sc
import numpy.random as random


from umap import UMAP
import warnings; warnings.simplefilter('ignore')

import matplotlib.pyplot as plt
#from SCCAF import SCCAF_assessment, plot_roc

Global seed set to 0


In [2]:
# Absolute directory that holds the PBMC data files for this analysis (note the trailing slash).
save_path = "/home/cartalop/data/lung/covid/COVID_km16/pbmc/"

In [3]:
%matplotlib inline
matplotlib.rcParams["pdf.fonttype"] = 42
matplotlib.rcParams["ps.fonttype"] = 42

In [4]:
# Verbose scanpy logging, a record of the package versions in use, and shared figure defaults.
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(
    dpi = 180,
    dpi_save = 300,
    color_map = 'magma_r',
    vector_friendly = True,
    format = 'svg',
)

-----
anndata     0.8.0
scanpy      1.9.1
-----
PIL                         9.1.1
absl                        NA
asttokens                   NA
attr                        21.4.0
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
cffi                        1.15.0
chex                        0.1.3
colorama                    0.4.5
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   5.1.1
defusedxml                  0.7.1
deprecate                   0.3.1
docrep                      0.3.2
entrypoints                 0.4
etils                       0.6.0
executing                   0.8.3
flatbuffers                 2.0
flax                        0.5.2
fsspec                      2022.5.0
google                      NA
h5py                        3.7.0
hypergeom_ufunc             NA
importlib_resources         NA
ipykernel             

### Read in datasets

- Read in formatted object

In [5]:
# Load the healthy-PBMC AnnData object from the data directory configured in `save_path`
# (which already ends with a slash), instead of repeating the absolute path here.
pbmc_healthy = sc.read_h5ad(save_path + 'meyer_nikolic_healthy_pbmc_raw.h5ad')
pbmc_healthy

AnnData object with n_obs × n_vars = 173684 × 33559
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT', 'nFeature_ADT', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'Smoker', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'patient_id', 'sample_id', 'sequencing_library', 'Protein_modality_weight'
    var: 'name'

In [6]:
# List the distinct sample identifiers; `sample_id` is the batch key for HVG selection and the scVI covariate below.
pbmc_healthy.obs['sample_id'].cat.categories

Index(['AN1', 'AN2', 'AN3', 'AN5', 'AN6', 'AN7', 'AN9', 'AN11', 'AN12', 'AN13',
       'AN14', 'NP13', 'NP15', 'NP16', 'NP17', 'NP18', 'NP19', 'NP20', 'NP21',
       'NP22', 'NP23', 'NP24', 'NP26', 'NP27', 'NP28', 'NP30', 'NP31', 'NP32',
       'NP35', 'NP36', 'NP37', 'NP38', 'NP39', 'NP41', 'NP44'],
      dtype='object')

In [7]:
# List the distinct patient identifiers, for comparison against the `sample_id` categories shown above.
pbmc_healthy.obs['patient_id'].cat.categories

Index(['AN1', 'AN2', 'AN3', 'AN5', 'AN6', 'AN7', 'AN9', 'AN11', 'AN12', 'AN13',
       'AN14', 'NP13', 'NP15', 'NP16', 'NP17', 'NP18', 'NP19', 'NP20', 'NP21',
       'NP22', 'NP23', 'NP24', 'NP26', 'NP27', 'NP28', 'NP30', 'NP31', 'NP32',
       'NP35', 'NP36', 'NP37', 'NP38', 'NP39', 'NP41', 'NP44'],
      dtype='object')

### Calculate HVGs

In [8]:
# Snapshot the full-gene object in `.raw` before the HVG step below subsets the genes.
pbmc_healthy.raw = pbmc_healthy.copy()  # keep full dimension safe
# Keep a separate copy of the current X matrix in a named layer; HVG selection and scVI both read from 'counts'.
pbmc_healthy.layers['counts'] = pbmc_healthy.X.copy()

In [9]:
# Rank genes by variability on the 'counts' layer, treating each sample as a batch,
# and keep only the top 7000 (subset = True drops every other gene from the object).
sc.pp.highly_variable_genes(
    pbmc_healthy,
    layer = "counts",
    batch_key = "sample_id",
    flavor = "seurat_v3",
    n_top_genes = 7000,
    subset = True,
)
pbmc_healthy

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
--> added
    'highly_variable', boolean vector (adata.var)
    'highly_variable_rank', float vector (adata.var)
    'means', float vector (adata.var)
    'variances', float vector (adata.var)
    'variances_norm', float vector (adata.var)


AnnData object with n_obs × n_vars = 173684 × 7000
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT', 'nFeature_ADT', 'Age_group', 'BMI', 'COVID_severity', 'COVID_status', 'Ethnicity', 'Group', 'Sex', 'Smoker', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'patient_id', 'sample_id', 'sequencing_library', 'Protein_modality_weight'
    var: 'name', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'hvg'
    layers: 'counts'

### Data integration with `scVI`

In [10]:
# Register the object with scVI: the model reads the 'counts' layer, with the sample
# as a categorical covariate and the two per-cell RNA totals as continuous covariates.
scvi.model.SCVI.setup_anndata(
    pbmc_healthy,
    layer = "counts",
    continuous_covariate_keys = ["nFeature_RNA", "nCount_RNA"],
    categorical_covariate_keys = ["sample_id"],
)

In [11]:
# Build the scVI model: 3 hidden layers, a 50-dimensional latent space, negative-binomial likelihood.
# NOTE(review): `dispersion = "gene-batch"` is per-batch, but the setup cell registers `sample_id`
# as a categorical covariate and passes no `batch_key` -- confirm this is the intended configuration.
model = scvi.model.SCVI(
    pbmc_healthy,
    n_latent = 50,
    n_layers = 3,
    dispersion = "gene-batch",
    gene_likelihood = "nb",
)
model



In [12]:
# Train on the GPU and stop early once the monitored metric stops improving.
model.train(
    early_stopping = True,
    use_gpu = True,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 2/46:   2%|▏         | 1/46 [04:03<3:02:59, 243.99s/it, loss=1.73e+03, v_num=1]

In [None]:
# Extract the scVI latent embedding, keeping it both as `latent` and in `obsm` for the neighbour graph.
latent = pbmc_healthy.obsm["X_scVI"] = model.get_latent_representation()

In [None]:
# Build the neighbour graph on the scVI latent space, embed it with UMAP (fixed seed),
# then colour the embedding by the available metadata to inspect the integration.
sc.pp.neighbors(
    pbmc_healthy,
    use_rep = "X_scVI",
    n_neighbors = 50,
    metric = 'minkowski',
)
sc.tl.umap(
    pbmc_healthy,
    min_dist = 0.3,
    spread = 5,
    random_state = 1712,
)
sc.pl.umap(
    pbmc_healthy,
    frameon = False,
    color = [
        'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT',
        'nFeature_ADT', 'Age_group', 'BMI', 'COVID_severity',
        'COVID_status', 'Ethnicity', 'Group', 'Sex',
        'Smoker', 'annotation_broad', 'patient_id', 'sample_id',
    ],
    size = 0.4,
    legend_fontsize = 5,
    ncols = 4,
)

In [None]:
# Colour the UMAP by the obs annotations not covered in the previous panel grid.
# The earlier version of this cell used an undefined `heart_immune` object and QC columns
# ('pct_counts_mt', 'doublet_scores', ...) that are not present in `pbmc_healthy.obs`.
sc.pl.umap(pbmc_healthy, frameon = False, color = ['annotation_detailed', 'sequencing_library', 'Protein_modality_weight'], size = 0.4, legend_fontsize = 4, ncols = 4)

In [None]:
# Canonical human PBMC lineage markers (T cells, cytotoxic/NK, B cells, monocytes, platelets).
# These replace the mouse heart markers this cell previously requested from an undefined `heart_immune` object.
# Assumes `var_names` are human gene symbols -- TODO confirm (`var` also carries a 'name' column).
pbmc_markers = ['CD3E', 'CD8A', 'CD40LG', 'NKG7', 'MS4A1', 'CD14', 'FCGR3A', 'C1QA', 'PPBP']
# Look the genes up in `.raw`, which still has all genes; the main matrix was cut down to the 7000 HVGs.
markers_present = [gene for gene in pbmc_markers if gene in pbmc_healthy.raw.var_names]
assert markers_present, "none of the marker genes were found in pbmc_healthy.raw.var_names"
# NOTE: `.raw` was captured before any normalisation in this notebook, so the colour scale is in stored count units.
sc.pl.umap(pbmc_healthy, frameon = False, color = markers_present, use_raw = True, size = 0.4, legend_fontsize = 5, legend_loc = 'on data', ncols = 4)

### Use `SCCAF` to select `leiden` resolution

In [None]:
# Cluster the scVI neighbour graph of the PBMC object (fixed seed for reproducibility).
# `heart_immune` is never defined in this notebook; the object being analysed is `pbmc_healthy`.
sc.tl.leiden(pbmc_healthy, resolution = 1, random_state = 1786)

In [None]:
# SCCAF is imported here because its import in the first cell is commented out; pyplot is already imported there.
from SCCAF import SCCAF_assessment, plot_roc
# Assess how separable the leiden clusters are, using `pbmc_healthy` (the undefined `heart_immune` raised a NameError).
# The scVI latent space is used as the feature matrix because the clusters were derived from it and
# `pbmc_healthy.X` still holds unnormalised counts in this notebook.
# NOTE(review): switch to `pbmc_healthy.X` after normalisation if an expression-based assessment is preferred.
y_prob, y_pred, y_test, clf, cvsm, acc = SCCAF_assessment(pbmc_healthy.obsm['X_scVI'], pbmc_healthy.obs['leiden'], n = 100)

In [None]:
# Draw the ROC curves from the SCCAF assessment outputs computed in the previous cell,
# passing the cross-validation score (`cvsm`) and test accuracy (`acc`) through to the plot.
plot_roc(y_prob, y_test, clf, cvsm = cvsm, acc = acc)
plt.show()

In [None]:
# Show the leiden clusters on the PBMC UMAP (`heart_immune` is not defined in this notebook).
sc.pl.umap(pbmc_healthy, frameon = False, color = ['leiden'], size = 0.8, legend_fontsize = 5, legend_loc = 'on data')

In [None]:
# Show the leiden clusters next to one marker each for T cells (CD3E), monocytes (CD14) and B cells (MS4A1).
# These replace the mouse macrophage markers previously requested from an undefined `heart_immune` object.
# Assumes `var_names` are human gene symbols -- TODO confirm. Genes are looked up in `.raw`, which keeps all genes.
cluster_markers = [gene for gene in ['CD3E', 'CD14', 'MS4A1'] if gene in pbmc_healthy.raw.var_names]
sc.pl.umap(pbmc_healthy, frameon = False, color = ['leiden'] + cluster_markers, use_raw = True, size = 0.8, legend_fontsize = 5, legend_loc = 'on data')