# Scanpy analysis

Run Scanpy on both the concatenated (gene + protein) data and the RNA-only data, and report ARI and NMI between the Leiden clusters and the annotated cell types. We follow the standard Scanpy preprocessing pipeline.

In [1]:
import os
os.environ[ 'NUMBA_CACHE_DIR' ] = '/scratch/st-jiaruid-1/yinian/tmp/' # https://github.com/scverse/scanpy/issues/2113

import h5py
import hdf5plugin
import tables

import pandas as pd
import numpy as np
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

import scanpy as sc
import anndata as ad
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score

import yaml
from pathlib import Path

Matplotlib created a temporary config/cache directory at /tmp/pbs.4865804.pbsha.ib.sockeye/matplotlib-yf_1cj5j because the default path (/home/jovyan/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


## Concatenated gene + protein data analysis

### Load the data

In [2]:
# Parse the experiment YAML (input file paths and model parameters) and show it.
experiment_yaml = Path('../experiments/4_32606.yaml')
config = yaml.safe_load(experiment_yaml.read_text())
config

{'files': {'rna': ['/arc/project/st-jiaruid-1/yinian/pbmc/4_32606_raw_rna.h5ad'],
  'protein': ['/arc/project/st-jiaruid-1/yinian/pbmc/4_32606_raw_protein.h5ad'],
  'combined': ['/arc/project/st-jiaruid-1/yinian/pbmc/4_32606_raw_combined.h5ad'],
  'gene_indices': '/scratch/st-jiaruid-1/yinian/my_jupyter/scETM/scripts/gene_indices_3.pkl'},
 'model_params': {'n_epochs': 12000,
  'eval_every': 3000,
  'cell_type_col': 'cell_type',
  'day': 4,
  'donor': 32606,
  'rna_n_vars': 22085},
 'ckpt_dir': '/scratch/st-jiaruid-1/yinian/my_jupyter/scETM/results/'}

In [3]:
# Split the config into its two sections.
files = config['files']
model_params = config['model_params']

# The config may spell "no label column" as the string 'None';
# turn that into a real None and leave any other value untouched.
model_params['cell_type_col'] = (
    None
    if model_params['cell_type_col'] == 'None'
    else model_params['cell_type_col']
)

In [4]:
# The concatenated data: read every combined (gene + protein) file and stack
# them into one AnnData. The `batch_indices` obs column holds the batch label
# that `ad.concat` assigns to each input.
combined_files = files['combined']
adata = ad.concat(
    list(map(ad.read_h5ad, combined_files)),
    label="batch_indices",
)
adata

AnnData object with n_obs × n_vars = 9504 × 22225
    obs: 'day', 'donor', 'cell_type', 'technology', 'batch_indices'

### Run the standard Scanpy workflow

In [5]:
def print_ari_nmi(
    adata: ad.AnnData,
    resolutions=(0.07, 0.1, 0.13, 0.19, 0.22, 0.25, 0.28, 0.31, 0.35, 0.44, 0.64, 1),
    cell_type_col: str = 'cell_type',
) -> None:
    """
    Runs the standard Scanpy workflow to cluster cells using the Leiden algorithm.
    Prints out the resolution, ARI, NMI, and ARI+NMI for each resolution.

    Parameters
    ----------
    adata
        Annotated data matrix. It is preprocessed IN PLACE (cell/gene filtering,
        normalization, log1p, scaling, PCA, neighbor graph, UMAP) and gains a
        `leiden` column in `.obs`, so calling this twice on the same object
        would preprocess it twice. Pass freshly loaded data.
    resolutions
        Iterable of Leiden resolution values to evaluate, in order.
    cell_type_col
        Name of the `adata.obs` column holding the reference labels that the
        Leiden clusters are scored against.

    Raises
    ------
    KeyError
        If `cell_type_col` is not a column of `adata.obs`.
    """
    # Fail fast: without this check a missing label column would only surface
    # after the whole (expensive) preprocessing pipeline had already run.
    if cell_type_col not in adata.obs.columns:
        raise KeyError(
            f"'{cell_type_col}' is not a column of adata.obs; "
            f"available columns: {list(adata.obs.columns)}"
        )

    # Basic quality filtering.
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)

    # Library-size normalization, log transform, then per-gene scaling
    # with values clipped at 10.
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.scale(adata, max_value=10)

    sc.pp.pca(adata)
    # Build the neighbor graph once with the settings used for clustering;
    # both UMAP and Leiden read this same graph.
    sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50)
    sc.tl.umap(adata)

    # Reference labels do not change between resolutions, so encode them once.
    # They must be read after filtering so they align with the remaining cells.
    # `astype('category')` is a no-op for categorical columns and makes plain
    # string/object label columns work as well.
    real_assignment = adata.obs[cell_type_col].astype('category').cat.codes.to_numpy()

    for r in resolutions:
        sc.tl.leiden(adata, resolution=r)
        leiden_assignment = adata.obs['leiden'].cat.codes.to_numpy()
        ari = adjusted_rand_score(real_assignment, leiden_assignment)
        nmi = normalized_mutual_info_score(real_assignment, leiden_assignment)
        print(r, ari, nmi, ari+nmi)

In [6]:
# Cluster the concatenated gene + protein data and print, for each Leiden
# resolution: resolution, ARI, NMI, ARI+NMI. Note: `adata` is preprocessed in place.
print_ari_nmi(adata)

2023-04-17 11:52:07.226473: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-17 11:52:07.363691: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /.singularity.d/libs
2023-04-17 11:52:07.363722: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-04-17 11:52:10.000005: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shar

0.07 0.4463800060744941 0.5562742321623159 1.00265423823681
0.1 0.4463800060744941 0.5562742321623159 1.00265423823681
0.13 0.5690842655411174 0.6685767423319435 1.237661007873061
0.19 0.4841344796588286 0.5889983934456764 1.073132873104505
0.22 0.5209161597011335 0.6133008385392172 1.1342169982403507
0.25 0.5351993992877032 0.6205841514138986 1.1557835507016017
0.28 0.5586437575746039 0.6174498776536965 1.1760936352283005
0.31 0.4921024257082286 0.6038123675326545 1.0959147932408833
0.35 0.5552920630362442 0.6175760300573301 1.1728680930935744
0.44 0.5124131527721807 0.6140249738687729 1.1264381266409536
0.64 0.49037271712564207 0.5930774071709046 1.0834501242965466
1 0.2991545474872334 0.5196564591247927 0.8188110066120262


## Only RNA data analysis

In [8]:
# Read every RNA-only file and stack them into one AnnData. This rebinds
# `adata`, replacing the (already preprocessed) concatenated data loaded above.
rna_files = files['rna']
adata = ad.concat(
    list(map(ad.read_h5ad, rna_files)),
    label="batch_indices",
)

In [9]:
# Cluster the RNA-only data and print, for each Leiden resolution:
# resolution, ARI, NMI, ARI+NMI. Note: `adata` is preprocessed in place.
print_ari_nmi(adata)

0.07 0.44854094769014685 0.5630887512139058 1.0116296989040525
0.1 0.45732085641728987 0.577613527707321 1.0349343841246108
0.13 0.4566485358345694 0.576300167555953 1.0329487033905225
0.19 0.47684798591892563 0.5889666364479906 1.0658146223669163
0.22 0.6301773416635736 0.6765469551805815 1.306724296844155
0.25 0.5925765586279255 0.6614106960584858 1.2539872546864113
0.28 0.6093026732099329 0.6565362222931946 1.2658388955031277
0.31 0.605572999045663 0.6518152540964525 1.2573882531421154
0.35 0.601144100356212 0.6490849576591647 1.2502290580153768
0.44 0.592161541231454 0.6401315946359681 1.232293135867422
0.64 0.43239768638904147 0.5760237912582062 1.0084214776472478
1 0.2861983276655965 0.5314457684005477 0.8176440960661442
