### Notebook for the integration using `scVI` and clustering evaluation with `SCCAF` for healthy PBMCs

**Objective**: Integration of healthy PBMCs from [Cai 2020](https://pubmed.ncbi.nlm.nih.gov/32114394/) and [Yoshida 2021](https://www.nature.com/articles/s41591-021-01329-2#data-availability).

**Developed by**: Mairi McClean

**Institute of Computational Biology - Helmholtz Zentrum Munich**

**v230425**


### Import required modules

In [1]:
import sys
import scvi
import torch
import anndata
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns

import numpy as np
import pandas as pd
import scanpy as sc
import numpy.random as random


from umap import UMAP
import warnings; warnings.simplefilter('ignore')

Global seed set to 0
  from .autonotebook import tqdm as notebook_tqdm
  jax.tree_util.register_keypaths(data_clz, keypaths)
  jax.tree_util.register_keypaths(data_clz, keypaths)
  jax.tree_util.register_keypaths(data_clz, keypaths)


In [2]:
# Set scanpy's default figure size for the plots in this notebook.
sc.set_figure_params(figsize=(4, 4))

# Inline-backend options: white figure background and 'retina' figure format.
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

### Read in datasets for integration


- Read in _Cai Y et al 2020_

In [3]:
# Load the Cai et al. 2020 PBMC object; the path is kept relative to the
# notebook rather than absolute.
cai_h5ad = '../data/Cai2020_scRNA_PBMC_mm230315_qcd.h5ad'
caiy2020 = sc.read_h5ad(cai_h5ad)
caiy2020

AnnData object with n_obs × n_vars = 73146 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'predicted_doublets'
    var: 'gene_id', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
    uns: 'donor_colors'
    layers: 'counts', 'sqrt_norm'

In [4]:
# Number of cells per disease status in the full Cai object.
cai_status_counts = caiy2020.obs['status'].value_counts()
cai_status_counts

status
active_TB    33104
Healthy      22049
latent_TB    17993
Name: count, dtype: int64

In [6]:
# Drop the tuberculosis samples (active and latent); everything else is kept.
cai_is_tb = caiy2020.obs['status'].isin(['active_TB', 'latent_TB'])
caiy_healthy = caiy2020[~cai_is_tb, :]

In [7]:
# Sanity check: only non-TB statuses should remain after subsetting.
cai_healthy_counts = caiy_healthy.obs['status'].value_counts()
cai_healthy_counts

status
Healthy    22049
Name: count, dtype: int64

- Read in _Yoshida et al 2021_

In [8]:
# Load the Yoshida et al. 2021 PBMC object (cellxgene export).
yoshida_h5ad = '../data/meyer_nikolic_covid_pbmc.cellxgene.20210813.h5ad'
adata_yoshida = sc.read_h5ad(yoshida_h5ad)
adata_yoshida

AnnData object with n_obs × n_vars = 422220 × 33751
    obs: 'patient_id', 'Ethnicity', 'BMI', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'Age_group', 'COVID_severity', 'COVID_status', 'Group', 'Sex', 'Smoker', 'sample_id', 'sequencing_library', 'Protein_modality_weight'
    var: 'name'
    obsm: 'X_ umap (wnn derived)', 'X_umap (after harmony ADT)', 'X_umap (after harmony RNA)', 'X_umap (before harmony ADT)', 'X_umap (before harmony RNA)'

In [11]:
# Number of cells per COVID status in the full Yoshida object.
yoshida_status_counts = adata_yoshida.obs['COVID_status'].value_counts()
yoshida_status_counts

COVID_status
Healthy          173684
COVID-19         151312
Post-COVID-19     97224
Name: count, dtype: int64

- Subset healthy Yoshida data 

In [12]:
# Drop the COVID-19 and post-COVID-19 samples; everything else is kept.
yoshida_is_covid = adata_yoshida.obs['COVID_status'].isin(['COVID-19', 'Post-COVID-19'])
yoshida_healthy = adata_yoshida[~yoshida_is_covid, :]

In [13]:
# Sanity check: only non-COVID statuses should remain after subsetting.
yoshida_healthy_counts = yoshida_healthy.obs['COVID_status'].value_counts()
yoshida_healthy_counts

COVID_status
Healthy    173684
Name: count, dtype: int64

- Merge _Cai_ and _Yoshida_ data

In [14]:
# Stack the two healthy subsets with an inner join; the source study of each
# cell is recorded in obs['dataset'].
# NOTE(review): the 'caiy2022' label looks like a typo for the Cai 2020 study.
# It ends up in the obs names and var column suffixes, so confirm before renaming.
adata_healthy = caiy_healthy.concatenate(
    yoshida_healthy,
    join='inner',
    batch_key='dataset',
    batch_categories=['caiy2022', 'yoshida2021'],
)
adata_healthy

AnnData object with n_obs × n_vars = 195733 × 22792
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'percent_mt2', 'n_counts', 'percent_chrY', 'XIST-counts', 'S_score', 'G2M_score', 'predicted_doublets', 'patient_id', 'Ethnicity', 'BMI', 'annotation_broad', 'annotation_detailed', 'annotation_detailed_fullNames', 'Age_group', 'COVID_severity', 'COVID_status', 'Group', 'Sex', 'Smoker', 'sample_id', 'sequencing_library', 'Protein_modality_weight', 'dataset'
    var: 'gene_id-caiy2022', 'mt-caiy2022', 'ribo-caiy2022', 'n_cells_by_counts-caiy2022', 'mean_counts-caiy2022', 'pct_dropout_by_counts-caiy2022', 'total_counts-caiy2022', 'name-yoshida2021'

In [15]:
# Inspect the merged cell metadata; columns that exist in only one study are
# empty for the cells of the other study.
adata_healthy.obs

Unnamed: 0,study,individual,sample,tissue,donor,age,gender,status,data_type,centre,...,Age_group,COVID_severity,COVID_status,Group,Sex,Smoker,sample_id,sequencing_library,Protein_modality_weight,dataset
AAACCTGAGATGGCGT-H2-caiy2022,CaiY_2021,SAMN14048020,PBMC_HC_2,PBMC,SAMN14048020,30,male,Healthy,scRNAseq,Shenzhen University,...,,,,,,,,,,caiy2022
AAACCTGAGATGTGTA-H2-caiy2022,CaiY_2021,SAMN14048020,PBMC_HC_2,PBMC,SAMN14048020,30,male,Healthy,scRNAseq,Shenzhen University,...,,,,,,,,,,caiy2022
AAACCTGAGCAACGGT-H2-caiy2022,CaiY_2021,SAMN14048020,PBMC_HC_2,PBMC,SAMN14048020,30,male,Healthy,scRNAseq,Shenzhen University,...,,,,,,,,,,caiy2022
AAACCTGAGCCATCGC-H2-caiy2022,CaiY_2021,SAMN14048020,PBMC_HC_2,PBMC,SAMN14048020,30,male,Healthy,scRNAseq,Shenzhen University,...,,,,,,,,,,caiy2022
AAACCTGAGGTGATTA-H2-caiy2022,CaiY_2021,SAMN14048020,PBMC_HC_2,PBMC,SAMN14048020,30,male,Healthy,scRNAseq,Shenzhen University,...,,,,,,,,,,caiy2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S22_TTTGTCAGTCGCATCG-1-yoshida2021,,,,,,,,,,,...,Adolescent,Healthy,Healthy,Paediatric,Male,Non-smoker,NP32,CV001_KM9166642-CV001_KM9166650,0.343914,yoshida2021
S22_TTTGTCAGTGTAAGTA-1-yoshida2021,,,,,,,,,,,...,Adolescent,Healthy,Healthy,Paediatric,Male,Non-smoker,NP32,CV001_KM9166642-CV001_KM9166650,0.264285,yoshida2021
S22_TTTGTCATCATGTCCC-1-yoshida2021,,,,,,,,,,,...,Child,Healthy,Healthy,Paediatric,Male,Non-smoker,NP31,CV001_KM9166642-CV001_KM9166650,0.353094,yoshida2021
S22_TTTGTCATCGAGGTAG-1-yoshida2021,,,,,,,,,,,...,Child,Healthy,Healthy,Paediatric,Male,Non-smoker,NP31,CV001_KM9166642-CV001_KM9166650,0.611991,yoshida2021


### Harmonize dataset labels

In [18]:
# tissue
# Both studies are PBMC datasets, but only the Cai cells carry a 'tissue' value
# after the merge. Assign the label to every cell (the previous `==` was only a
# comparison and changed nothing).
adata_healthy.obs['tissue'] = 'PBMC'
adata_healthy.obs['tissue'].value_counts(dropna=False)

In [None]:
# TODO: harmonize donor labels - Cai stores them in "donor" (2 donors), Yoshida in "sample_id" (X donors, count still to be filled in)


### Calculate HVGs

In [None]:
# Work on a copy so that `adata_healthy` keeps the full gene set for export.
adata = adata_healthy.copy()

# Keep an untouched copy of X as the 'counts' layer used by HVG selection and scVI.
# NOTE(review): this assumes X holds raw counts in both studies - confirm.
x_snapshot = adata.X.copy()
adata.layers['counts'] = x_snapshot

In [None]:
# Open question (Carlos): is "dataset" the most important covariate to use as
# the batch key here?

# Select the top 3000 HVGs from the 'counts' layer, per dataset, and subset
# `adata` to those genes.
hvg_settings = dict(
    flavor="seurat_v3",
    n_top_genes=3000,
    layer="counts",
    batch_key="dataset",
    subset=True,
)
sc.pp.highly_variable_genes(adata, **hvg_settings)

#### Remove unrequired variables prior to model run

In [None]:
# Free the per-study objects before training; only `adata_healthy` and `adata`
# are needed from here on.
# `caiy2022` was never defined, so the previous `del caiy2022` raised a NameError.
# The healthy subsets were created by slicing their parent objects, so they are
# deleted together with the parents.
del caiy2020, caiy_healthy
del adata_yoshida, yoshida_healthy

### Integration with scVI


In [None]:
# Register the 'counts' layer and the 'dataset' batch covariate with scVI.
scvi.model.SCVI.setup_anndata(
    adata,
    batch_key="dataset",
    layer="counts",
)

In [None]:
# scVI model: 3 layers, 50 latent dimensions, "nb" gene likelihood and
# "gene-batch" dispersion.
vae = scvi.model.SCVI(
    adata,
    n_layers=3,
    n_latent=50,
    gene_likelihood="nb",
    dispersion="gene-batch",
)

In [None]:
# Print the fields that were registered for this model.
vae.view_anndata_setup()

In [None]:
# Train the scVI model with its default training settings.
vae.train()

In [None]:
# Store the scVI latent representation for graph construction below.
scvi_latent = vae.get_latent_representation()
adata.obsm["X_scVI"] = scvi_latent

In [None]:
# Build the neighbour graph on the scVI latent space, cluster it and embed it.
# Seeds are passed explicitly so the stochastic steps are visibly reproducible
# (0 is also scanpy's default, so results do not change).
sc.pp.neighbors(adata, n_neighbors=50, use_rep="X_scVI", random_state=0)
sc.tl.leiden(adata, resolution=1, random_state=0)

# Open question: why is a spread of 8 used?
sc.tl.umap(adata, min_dist=0.5, spread=8, random_state=0)

In [None]:
# Overview UMAP coloured by cluster and by the main metadata columns.
# A standalone `plt.figure(figsize=(5, 5))` only creates an empty figure and
# does not affect scanpy's plot, so the 5x5 size is applied through a scoped
# rc context instead (it is restored once the plot is drawn).
with plt.rc_context({"figure.figsize": (5, 5)}):
    sc.pl.umap(
        adata,
        color=["leiden", "tissue", "dataset", "donor"],
        frameon=False,
        ncols=4,
        size=4,
    )


### SCCAF clustering analysis

In [None]:
import SCCAF as sccaf
from SCCAF import SCCAF_assessment, plot_roc

In [None]:
# Resolution controls how coarse the clustering is (higher values give more clusters).

# Open question (Carlos): why is leiden run again here?
# NOTE(review): this overwrites obs['leiden'] from the earlier run with a
# different seed, so the clusters assessed below can differ from those plotted above.
sc.tl.leiden(
    adata,
    random_state=1786,
    resolution=1.0,
)

In [None]:
# Open questions: what are `clf` and `cvsm`?
# NOTE(review): presumably `clf` is the fitted classifier and `cvsm` a
# cross-validation score - confirm against the SCCAF documentation.

# Assess how well the leiden clusters can be told apart from the expression matrix.
expression_matrix = adata.X
cluster_labels = adata.obs['leiden']
y_prob, y_pred, y_test, clf, cvsm, acc = SCCAF_assessment(expression_matrix, cluster_labels, n = 100)



In [None]:
# Curves from the SCCAF assessment above.
plot_roc(y_prob, y_test, clf, cvsm = cvsm, acc = acc)

# Adjust the figure margins and the gaps between subplots before rendering.
layout_margins = dict(
    left=0.1,
    bottom=0.1,
    right=0.9,
    top=0.9,
    wspace=1.5,
    hspace=1.5,
)
plt.subplots_adjust(**layout_margins)

plt.show()

In [None]:
# Comment from Carlos - what is wrong with the plotting function?

In [None]:
# Markers chosen here are of interest in healthy blood cells.
# 'FOXI1' was removed because it raised a "could not find" error.

# Reviewer request: do not use the viridis palette (hard to see); use magma or
# RdPu. The colormap is therefore set explicitly instead of relying on the default.
sc.pl.umap(
    adata,
    color=['leiden', 'CD74', 'CDH1', 'CD3E', 'DUSP4'],
    color_map='magma',
    frameon=False,
    size=0.8,
    legend_fontsize=5,
    legend_loc='on data',
)

### Export clustered object

In [None]:
# Summary of the HVG-subset, clustered object.
adata

In [None]:
# Summary of the merged object created before HVG selection.
adata_healthy

In [None]:
# Build a hybrid object for export: the expression matrix, gene annotations and
# layers come from `adata_healthy`; the cell metadata, unstructured annotations,
# embeddings and graphs come from the clustered `adata`.
export_parts = dict(
    X = adata_healthy.X,
    var = adata_healthy.var,
    layers = adata_healthy.layers,
    obs = adata.obs,
    uns = adata.uns,
    obsm = adata.obsm,
    obsp = adata.obsp,
)
adata_export = anndata.AnnData(**export_parts)
adata_export

In [None]:
# NOTE(review): this is an absolute, machine-specific path (external volume).
# The input files are read from a relative '../data' directory - consider
# writing to a configurable relative location as well.
export_h5ad = '/Volumes/Lacie/data_lake/Mairi_example/processed_files/scvi/post_sccaf/CaiY_Yoshida_healthy_scRNA_PBMC_mm230426_scVI-clustered.raw.h5ad'
adata_export.write(export_h5ad)
