## scANVI analysis for healthy PBMC pilot study (Cai 2020 and Cai 2022)

**Objective**: Run scANVI analysis for label transfer for healthy PBMCs [Cai 2020 and Cai 2022]


- **Developed by**: Mairi McClean

- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**

- v230317

- Following this tutorial: https://docs.scvi-tools.org/en/stable/tutorials/notebooks/scarches_scvi_tools.html
> "This particular workflow is useful in the case where a model is trained on some data (called reference here) and new samples are received (called query)."



In [1]:
# Sanity check
import os 
os.write(1, b"text\n")

text


5

### Import modules 

In [2]:
import sys

from scvi_colab import install

install()

IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
    %pip install --quiet scrublet

                Not currently in Google Colab environment.

                Please run with `run_outside_colab=True` to override.

                Returning with no further action.
                
  warn(


In [3]:
import sys
import warnings

import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scrublet as scr
import scvi



Global seed set to 0
  from .autonotebook import tqdm as notebook_tqdm
  jax.tree_util.register_keypaths(data_clz, keypaths)
  jax.tree_util.register_keypaths(data_clz, keypaths)


In [4]:
warnings.simplefilter(action="ignore", category=FutureWarning)


sc.set_figure_params(figsize=(4, 4))
scvi.settings.seed = 94705

%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

Global seed set to 94705


### Read in and prepare data
- Following steps from scVI portion of tutorial

In [5]:
adata_ref = sc.read_h5ad('/Volumes/Lacie/data_lake/Mairi_example/INBOX/sc_downloads/yoshida_2021/meyer_nikolic_covid_pbmc.cellxgene.20210813.h5ad')
adata_ref

AnnData object with n_obs × n_vars = 422220 × 33751
    obs: 'patient_id', 'Ethnicity', 'BMI', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'Age_group', 'COVID_severity', 'COVID_status', 'Group', 'Sex', 'Smoker', 'sample_id', 'sequencing_library', 'Protein_modality_weight'
    var: 'name'
    obsm: 'X_ umap (wnn derived)', 'X_umap (after harmony ADT)', 'X_umap (after harmony RNA)', 'X_umap (before harmony ADT)', 'X_umap (before harmony RNA)'

In [6]:
adata_ref.obs

Unnamed: 0,patient_id,Ethnicity,BMI,annotation_broad,annotation_detailed,annotation_detailed_fullNames,Age_group,COVID_severity,COVID_status,Group,Sex,Smoker,sample_id,sequencing_library,Protein_modality_weight
CV001_KM10202384-CV001_KM10202394_AAACCTGAGGCAGGTT-1,AN5,EUR,Unknown,Monocyte,Monocyte CD14,Classical monocyte,Adult,Healthy,Healthy,Adult,Female,Non-smoker,AN5,CV001_KM10202384-CV001_KM10202394,0.359517
CV001_KM10202384-CV001_KM10202394_AAACCTGAGTGTCCCG-1,AN5,EUR,Unknown,T CD4+,T CD4 helper,T CD4 helper,Adult,Healthy,Healthy,Adult,Female,Non-smoker,AN5,CV001_KM10202384-CV001_KM10202394,0.577522
CV001_KM10202384-CV001_KM10202394_AAACCTGCAGATGGGT-1,AN3,EUR,Unknown,T CD4+,T CD4 helper,T CD4 helper,Adult,Healthy,Healthy,Adult,Male,Non-smoker,AN3,CV001_KM10202384-CV001_KM10202394,0.369143
CV001_KM10202384-CV001_KM10202394_AAACCTGGTATAGTAG-1,AN5,EUR,Unknown,T CD8+,T CD8 naive,T CD8 naive,Adult,Healthy,Healthy,Adult,Female,Non-smoker,AN5,CV001_KM10202384-CV001_KM10202394,0.785563
CV001_KM10202384-CV001_KM10202394_AAACCTGGTGTGCGTC-1,AN5,EUR,Unknown,T CD4+,T CD4 naive,T CD4 naive,Adult,Healthy,Healthy,Adult,Female,Non-smoker,AN5,CV001_KM10202384-CV001_KM10202394,0.564174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S28_TTTGTCAGTTCTGTTT-1,PC9,EUR,27.17,NK,NK,NK,Adult,Severe,Post-COVID-19,Adult,Male,Non-smoker,PC9,CV001_KM9294396-CV001_KM9294404,0.429398
S28_TTTGTCATCAACCAAC-1,PC9,EUR,27.17,Monocyte,Monocyte CD14,Classical monocyte,Adult,Severe,Post-COVID-19,Adult,Male,Non-smoker,PC9,CV001_KM9294396-CV001_KM9294404,0.677910
S28_TTTGTCATCATTATCC-1,PC9,EUR,27.17,Monocyte,Monocyte CD14,Classical monocyte,Adult,Severe,Post-COVID-19,Adult,Male,Non-smoker,PC9,CV001_KM9294396-CV001_KM9294404,0.422796
S28_TTTGTCATCCTATGTT-1,PC9,EUR,27.17,DC,pDC,pDC,Adult,Severe,Post-COVID-19,Adult,Male,Non-smoker,PC9,CV001_KM9294396-CV001_KM9294404,0.471905


- Filter query dataset to online include healthy cells

In [7]:
adata_ref.obs['COVID_status'].value_counts()

Healthy          173684
COVID-19         151312
Post-COVID-19     97224
Name: COVID_status, dtype: int64

In [8]:
adata_ref_new = adata_ref[~adata_ref.obs['COVID_status'].isin(['COVID-19', 'Post-COVID-19']),:]
adata_ref_new

View of AnnData object with n_obs × n_vars = 173684 × 33751
    obs: 'patient_id', 'Ethnicity', 'BMI', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'Age_group', 'COVID_severity', 'COVID_status', 'Group', 'Sex', 'Smoker', 'sample_id', 'sequencing_library', 'Protein_modality_weight'
    var: 'name'
    obsm: 'X_ umap (wnn derived)', 'X_umap (after harmony ADT)', 'X_umap (after harmony RNA)', 'X_umap (before harmony ADT)', 'X_umap (before harmony RNA)'

In [9]:
adata_ref_new.obs['COVID_status'].value_counts()

Healthy    173684
Name: COVID_status, dtype: int64

- Read in query data

In [11]:
# Note: Query data is that of only Cai, not the Cai+Yoshida data, but clustered following scvi. This is not the correct data object?

# adata_query = sc.read_h5ad('/Volumes/Lacie/data_lake/Mairi_example/processed_files/scvi/post_sccaf/CaiY_healthy_scRNA_PBMC_mm230316_scVI-clustered.raw.h5ad')

adata_query_2020 = sc.read_h5ad('/Volumes/LaCie/data_lake/Mairi_example/processed_files/abridged_qc/human/Cai2020_scRNA_PBMC_mm230315_qcd.h5ad')
adata_query_2020

AnnData object with n_obs × n_vars = 73146 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'predicted_doublets'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'donor_colors'
    layers: 'counts', 'sqrt_norm'

In [12]:
adata_query_2022 = sc.read_h5ad('/Volumes/LaCie/data_lake/Mairi_example/processed_files/abridged_qc/human/Cai2022_scRNA_PBMC_mm230315_qcd.h5ad')
adata_query_2022

AnnData object with n_obs × n_vars = 25417 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'data_type', 'centre', 'version', 'object', 'protocol', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'predicted_doublets'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'donor_colors', 'sample_colors'
    layers: 'counts', 'sqrt_norm'

- Concatenate both query datasets


In [13]:
adata_query = adata_query_2020.concatenate(adata_query_2022, join='inner', batch_categories=['2020', '2022'], batch_key='year')

- Concatenate anndata objects

In [14]:
adata_healthy = adata_ref_new.concatenate(adata_query, join="inner", batch_categories=['reference', 'query'], batch_key='dataset')

  warn(


In [15]:
adata_healthy

AnnData object with n_obs × n_vars = 272247 × 22792
    obs: 'patient_id', 'Ethnicity', 'BMI', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'Age_group', 'COVID_severity', 'COVID_status', 'Group', 'Sex', 'Smoker', 'sample_id', 'sequencing_library', 'Protein_modality_weight', 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'predicted_doublets', 'object', 'protocol', 'year', 'dataset'
    var: 'gene_id-query', 'mt-query', 'ribo-query', 'n_cells_by_counts-2020-query', 'mean_counts-2020-query', 'pct_dropout_by_counts-2020-query', 'total_counts-2020-query', 'n_cells_by_counts-2022-query', 'mean_counts-2022-query', 'pct_dropout_by_counts-2022-query', 'total_counts-2022-query', 'name-refe

In [16]:
adata_healthy.obs

Unnamed: 0,patient_id,Ethnicity,BMI,annotation_broad,annotation_detailed,annotation_detailed_fullNames,Age_group,COVID_severity,COVID_status,Group,...,n_counts,percent_chrY,XIST-counts,S_score,G2M_score,predicted_doublets,object,protocol,year,dataset
CV001_KM10202384-CV001_KM10202394_AAACCTGAGGCAGGTT-1-reference,AN5,EUR,Unknown,Monocyte,Monocyte CD14,Classical monocyte,Adult,Healthy,Healthy,Adult,...,,,,,,,,,,reference
CV001_KM10202384-CV001_KM10202394_AAACCTGAGTGTCCCG-1-reference,AN5,EUR,Unknown,T CD4+,T CD4 helper,T CD4 helper,Adult,Healthy,Healthy,Adult,...,,,,,,,,,,reference
CV001_KM10202384-CV001_KM10202394_AAACCTGCAGATGGGT-1-reference,AN3,EUR,Unknown,T CD4+,T CD4 helper,T CD4 helper,Adult,Healthy,Healthy,Adult,...,,,,,,,,,,reference
CV001_KM10202384-CV001_KM10202394_AAACCTGGTATAGTAG-1-reference,AN5,EUR,Unknown,T CD8+,T CD8 naive,T CD8 naive,Adult,Healthy,Healthy,Adult,...,,,,,,,,,,reference
CV001_KM10202384-CV001_KM10202394_AAACCTGGTGTGCGTC-1-reference,AN5,EUR,Unknown,T CD4+,T CD4 naive,T CD4 naive,Adult,Healthy,Healthy,Adult,...,,,,,,,,,,reference
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAGTCCATCCT-HRS100513-pbmc_scRNAseq-2022-query,,,,,,,,,,,...,746.0,0.000000,0.0,-0.137711,-0.152166,0.0,HRS100513,pbmc_scRNAseq,2022,query
TTTGTCAGTGCTGTAT-HRS100513-pbmc_scRNAseq-2022-query,,,,,,,,,,,...,369.0,0.000000,0.0,0.130521,0.092987,0.0,HRS100513,pbmc_scRNAseq,2022,query
TTTGTCAGTGTGGCTC-HRS100513-pbmc_scRNAseq-2022-query,,,,,,,,,,,...,348.0,0.000000,0.0,-0.043432,-0.034049,0.0,HRS100513,pbmc_scRNAseq,2022,query
TTTGTCATCGTTTATC-HRS100513-pbmc_scRNAseq-2022-query,,,,,,,,,,,...,1340.0,0.074627,0.0,0.236621,0.071757,0.0,HRS100513,pbmc_scRNAseq,2022,query


In [17]:
print(adata_healthy.obs.dtypes)

patient_id                       category
Ethnicity                        category
BMI                              category
annotation_broad                 category
annotation_detailed              category
annotation_detailed_fullNames    category
Age_group                        category
COVID_severity                   category
COVID_status                     category
Group                            category
Sex                              category
Smoker                           category
sample_id                        category
sequencing_library               category
Protein_modality_weight           float64
study                              object
individual                         object
sample                             object
tissue                           category
donor                              object
age                              category
gender                           category
status                           category
data_type                        c

In [18]:
adata_healthy.obs['annotation_broad'].value_counts()

T CD4+       49415
T CD8+       34110
Monocyte     28637
B            26132
NK           21871
T reg         3251
T g/d         3183
MAIT          2213
DC            2151
Cycling       1012
Platelets      626
HPC            416
Plasma         352
ILC            199
RBC            106
Baso/Eos        10
Name: annotation_broad, dtype: int64

In [21]:
adata_healthy.obs["annotation_broad"] = adata_healthy.obs["annotation_broad"].cat.add_categories("unknown").fillna("unknown")

In [19]:
adata_healthy.obs["annotation_detailed"] = adata_healthy.obs["annotation_detailed"].cat.add_categories("unknown").fillna("unknown")


In [20]:
adata_healthy.obs["annotation_detailed_fullNames"] = adata_healthy.obs["annotation_detailed_fullNames"].cat.add_categories("unknown").fillna("unknown")


In [22]:
adata_healthy.obs

Unnamed: 0,patient_id,Ethnicity,BMI,annotation_broad,annotation_detailed,annotation_detailed_fullNames,Age_group,COVID_severity,COVID_status,Group,...,n_counts,percent_chrY,XIST-counts,S_score,G2M_score,predicted_doublets,object,protocol,year,dataset
CV001_KM10202384-CV001_KM10202394_AAACCTGAGGCAGGTT-1-reference,AN5,EUR,Unknown,Monocyte,Monocyte CD14,Classical monocyte,Adult,Healthy,Healthy,Adult,...,,,,,,,,,,reference
CV001_KM10202384-CV001_KM10202394_AAACCTGAGTGTCCCG-1-reference,AN5,EUR,Unknown,T CD4+,T CD4 helper,T CD4 helper,Adult,Healthy,Healthy,Adult,...,,,,,,,,,,reference
CV001_KM10202384-CV001_KM10202394_AAACCTGCAGATGGGT-1-reference,AN3,EUR,Unknown,T CD4+,T CD4 helper,T CD4 helper,Adult,Healthy,Healthy,Adult,...,,,,,,,,,,reference
CV001_KM10202384-CV001_KM10202394_AAACCTGGTATAGTAG-1-reference,AN5,EUR,Unknown,T CD8+,T CD8 naive,T CD8 naive,Adult,Healthy,Healthy,Adult,...,,,,,,,,,,reference
CV001_KM10202384-CV001_KM10202394_AAACCTGGTGTGCGTC-1-reference,AN5,EUR,Unknown,T CD4+,T CD4 naive,T CD4 naive,Adult,Healthy,Healthy,Adult,...,,,,,,,,,,reference
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAGTCCATCCT-HRS100513-pbmc_scRNAseq-2022-query,,,,unknown,unknown,unknown,,,,,...,746.0,0.000000,0.0,-0.137711,-0.152166,0.0,HRS100513,pbmc_scRNAseq,2022,query
TTTGTCAGTGCTGTAT-HRS100513-pbmc_scRNAseq-2022-query,,,,unknown,unknown,unknown,,,,,...,369.0,0.000000,0.0,0.130521,0.092987,0.0,HRS100513,pbmc_scRNAseq,2022,query
TTTGTCAGTGTGGCTC-HRS100513-pbmc_scRNAseq-2022-query,,,,unknown,unknown,unknown,,,,,...,348.0,0.000000,0.0,-0.043432,-0.034049,0.0,HRS100513,pbmc_scRNAseq,2022,query
TTTGTCATCGTTTATC-HRS100513-pbmc_scRNAseq-2022-query,,,,unknown,unknown,unknown,,,,,...,1340.0,0.074627,0.0,0.236621,0.071757,0.0,HRS100513,pbmc_scRNAseq,2022,query


- Rename cell type labels column

In [23]:
adata_healthy.obs["labels_scanvi"] = adata_healthy.obs["annotation_broad"].values

In [24]:
adata_healthy.obs['labels_scanvi'].value_counts()

unknown      98563
T CD4+       49415
T CD8+       34110
Monocyte     28637
B            26132
NK           21871
T reg         3251
T g/d         3183
MAIT          2213
DC            2151
Cycling       1012
Platelets      626
HPC            416
Plasma         352
ILC            199
RBC            106
Baso/Eos        10
Name: labels_scanvi, dtype: int64

- Filter genes

In [27]:
# Save counts in layer

adata_healthy.layers['counts'] = adata_healthy.X.copy()
adata_healthy

AnnData object with n_obs × n_vars = 272247 × 22792
    obs: 'patient_id', 'Ethnicity', 'BMI', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'Age_group', 'COVID_severity', 'COVID_status', 'Group', 'Sex', 'Smoker', 'sample_id', 'sequencing_library', 'Protein_modality_weight', 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'predicted_doublets', 'object', 'protocol', 'year', 'dataset', 'labels_scanvi'
    var: 'gene_id-query', 'mt-query', 'ribo-query', 'n_cells_by_counts-2020-query', 'mean_counts-2020-query', 'pct_dropout_by_counts-2020-query', 'total_counts-2020-query', 'n_cells_by_counts-2022-query', 'mean_counts-2022-query', 'pct_dropout_by_counts-2022-query', 'total_counts-2022-q

In [28]:
# Note: For best practices of how/when to perform feature selection, 
# please refer to the model-specific tutorial. For scVI, we recommend anywhere from 1,000 to 10,000 HVGs, but it will be context-dependent.

sc.pp.highly_variable_genes(
    adata_healthy,
    n_top_genes=1200,
    subset=True,
    layer="counts",
    flavor="seurat_v3",
    batch_key="dataset",
)



- Set labels and prepare adata subsets for model training

In [29]:
# Taken from Anna's notebook

condition_key = 'dataset'
cell_type_key = 'labels_scanvi'
target_conditions = ['query']

In [32]:
from scarches.dataset.trvae.data_handling import remove_sparsity

 captum (see https://github.com/pytorch/captum).


In [31]:
# Taken from Anna's notebook

adata_healthy = remove_sparsity(adata_healthy)
source_adata = adata_healthy[~adata_healthy.obs[condition_key].isin(target_conditions)].copy()
target_adata = adata_healthy[adata_healthy.obs[condition_key].isin(target_conditions)].copy()

ModuleNotFoundError: No module named 'scarches'

### Train reference

> From tutorial: SCANVI tends to perform better in situations where it has been initialized using a pre-trained SCVI model. 

- scVI model

In [None]:
scvi.model.SCVI.setup_anndata(adata_ref_new, batch_key="sequencing_library", layer="counts")

In [None]:
# Added for sarches params following tutorial; check if we require it?
arches_params = dict(
    use_layer_norm="both",
    use_batch_norm="none",
    encode_covariates=True,
    dropout_rate=0.2,
    n_layers=2,
)

scvi_ref_new_model = scvi.model.SCVI(adata_ref_new, **arches_params)
scvi_ref_new_model.train()

- scANVI model

In [None]:
# unlabeled category does not exist in adata.obs[labels_key]
# so all cells are treated as labeled
vae_ref_scanvi = scvi.model.SCANVI.from_scvi_model(
    scvi_ref_new_model,
    unlabeled_category="Unknown",
    labels_key="labels_scanvi",
)

In [None]:
vae_ref_scanvi.train()

In [None]:
adata_ref_new.obsm["X_scANVI"] = vae_ref_scanvi.get_latent_representation()
sc.pp.neighbors(adata_ref_new, use_rep="X_scANVI")
sc.tl.leiden(adata_ref_new)
sc.tl.umap(adata_ref_new)


In [None]:
sc.pl.umap(
    adata_ref_new,
    color=["sequencing_library", "annotation_detailed"],
    frameon=False,
    ncols=1,
)

### Update with query

Question: Do you cluster your query data before entering it into the scanvi model?

In [None]:
dir_path_scan = "/Volumes/LaCie/data_lake/Mairi_example/processed_files/scanvi/models/"
vae_ref_scanvi.save(dir_path_scan, overwrite=True)

In [None]:
# again a no-op in this tutorial, but good practice to use
scvi.model.SCANVI.prepare_query_anndata(adata_query, dir_path_scan)

In [None]:
vae_q = scvi.model.SCANVI.load_query_data(
    adata_query,
    dir_path_scan,
)

In [None]:
vae_q.train(
    max_epochs=100,
    plan_kwargs=dict(weight_decay=0.0),
    check_val_every_n_epoch=10,
)

In [None]:
adata_query.obsm["X_scANVI"] = vae_q.get_latent_representation()
adata_query.obs["predictions"] = vae_q.predict()

In [None]:
df = adata_query.obs.groupby(["celltype", "predictions"]).size().unstack(fill_value=0)
norm_df = df / df.sum(axis=0)

plt.figure(figsize=(8, 8))
_ = plt.pcolor(norm_df)
_ = plt.xticks(np.arange(0.5, len(df.columns), 1), df.columns, rotation=90)
_ = plt.yticks(np.arange(0.5, len(df.index), 1), df.index)
plt.xlabel("Predicted")
plt.ylabel("Observed")

### Analyze reference and query

In [None]:
adata_full = adata_query.concatenate(adata_ref)

In [None]:
adata_full.obs.batch.cat.rename_categories(["Query", "Reference"], inplace=True)

In [None]:
full_predictions = vae_q.predict(adata_full)
print(f"Acc: {np.mean(full_predictions == adata_full.obs.celltype)}")

adata_full.obs["predictions"] = full_predictions

In [None]:
sc.pp.neighbors(adata_full, use_rep="X_scANVI")
sc.tl.leiden(adata_full)
sc.tl.umap(adata_full)

In [None]:
sc.pl.umap(
    adata_full,
    color=["tech", "celltype"],
    frameon=False,
    ncols=1,
)

In [None]:
ax = sc.pl.umap(
    adata_full,
    frameon=False,
    show=False,
)
sc.pl.umap(
    adata_full[: adata_query.n_obs],
    color=["predictions"],
    frameon=False,
    title="Query predictions",
    ax=ax,
    alpha=0.7,
)

ax = sc.pl.umap(
    adata_full,
    frameon=False,
    show=False,
)
sc.pl.umap(
    adata_full[: adata_query.n_obs],
    color=["celltype"],
    frameon=False,
    title="Query observed cell types",
    ax=ax,
    alpha=0.7,
)