## Prepare a study's raw anndata file for use with CellTypist cell-type annotation prediction models

- CellTypist expects the expression data to be normalized to 10,000 counts per cell and then log1p-transformed

In [None]:
# print the current date/time (shell `date`) to timestamp the start of the run
!date

#### import libraries

In [None]:
import scanpy as sc
import nb_util_funcs as nuf
from pandas import read_csv
import matplotlib.pyplot as plt
from seaborn import barplot

# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

In [None]:
# import importlib
# mm = importlib.reload(nuf)

#### set notebook variables

In [None]:
# parameters
# modality of the study to process; it is embedded in the input/output file names
# and in the CellTypist output prefix built later in this notebook
modality = 'GEX' # 'GEX' or 'ARC'

In [None]:
# variables
project = 'aging_phase2'
DEBUG = True  # when true, extra diagnostics are printed by the cells below
MAX_MITO_PERCENT = 20

# highly variable feature handling
DETECT_HV_FEATURES = True    # run the highly variable feature detection
FILTER_HV_FEATURES = False   # passed as `subset=` to the detection call
TOP_FEATURES_PERCENT = 0.15  # used as a fraction of adata.n_vars, not a percentage

# CellTypist models: key is the abbreviation used in output file prefixes,
# value is the model name or path without the '.pkl' extension
celltypist_models = {
    'DHB': 'Developing_Human_Brain',
    'DLPFC': '/data/celltypist/Adult_Human_PrefrontalCortex',
    'MTG': '/data/celltypist/Adult_Human_MTG',
}

# directories
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
quants_dir = f'{wrk_dir}/quants'
celltypist_dir = f'{wrk_dir}/celltypist'

# input/output files share the same '<quants_dir>/<project>_<modality>' stem
file_stem = f'{quants_dir}/{project}_{modality}'
raw_anndata_file = f'{file_stem}.raw.h5ad'
celltypist_in_file = f'{file_stem}.celltypist.h5ad'

if DEBUG:
    print(f'{raw_anndata_file=}',
          f'{celltypist_in_file=}',
          f'{celltypist_models=}',
          sep='\n')

## load data

### load the raw anndata object

In [None]:
%%time
# read the study's raw AnnData object from the path set in the variables cell
adata = sc.read_h5ad(raw_anndata_file)
# NOTE(review): peek_anndata is a project helper (nb_util_funcs); presumably it
# prints a summary of the object when DEBUG is true — confirm
nuf.peek_anndata(adata, '## input raw anndata:', DEBUG)

### subset to just the gene features if multiome

In [None]:
# A `modality` column in adata.var marks an object with more than one feature
# type (multiome); keep only the 'Gene Expression' features in that case.
if 'modality' in adata.var.columns:
    # Slicing an AnnData returns a view. The QC cell that follows writes new
    # columns into adata.var, which on a view forces an implicit copy and an
    # ImplicitModificationWarning; materialize the subset explicitly instead.
    adata = adata[:, adata.var.modality == 'Gene Expression'].copy()
    nuf.peek_anndata(adata, '## adata just gene features:', DEBUG)

## typical checks and preprocessing

In [None]:
# Flag gene families by symbol pattern, then compute QC metrics for each family.
gene_names = adata.var_names

# mitochondrial genes, "MT-" for human, "Mt-" for mouse
adata.var["mt"] = gene_names.str.startswith("MT-")
# ribosomal genes
adata.var["ribo"] = gene_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
adata.var["hb"] = gene_names.str.contains("^HB[^(P)]")

# the boolean adata.var columns created above
qc_families = ["mt", "ribo", "hb"]
sc.pp.calculate_qc_metrics(adata, qc_vars=qc_families, inplace=True, log1p=True)


## identify high variance features
optionally subset the data to these features if `FILTER_HV_FEATURES` is set

In [None]:
%%time
# Flag the highly variable features; skipped entirely when DETECT_HV_FEATURES is false.
if DETECT_HV_FEATURES:
    # TOP_FEATURES_PERCENT is applied as a fraction of the total feature count
    n_top_genes = int(adata.n_vars * TOP_FEATURES_PERCENT)
    # flavor 'seurat_v3' expects count data, which is why this cell runs
    # before the normalization cell; `subset` is driven by FILTER_HV_FEATURES
    sc.pp.highly_variable_genes(adata, n_top_genes=n_top_genes, 
                                flavor='seurat_v3', 
                                subset=FILTER_HV_FEATURES)
    nuf.peek_anndata(adata, 'study adata after typical filtering', DEBUG)

## Normalization

In [None]:
%%time
# Scale every cell to 10,000 total counts, then log1p-transform: the input
# scale stated at the top of this notebook for CellTypist.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# NOTE: `.raw` is assigned after normalization, so it holds the log-normalized
# values, not the original counts; the counts are not kept in this object.
adata.raw = adata  # freeze the state in `.raw`
# NOTE(review): called without the message/DEBUG arguments used by the other
# peek_anndata calls — confirm the helper's defaults give the intended output
nuf.peek_anndata(adata)

## save the prepped anndata object

In [None]:
%%time
# write the prepared object; its file name matches the --indata argument of the
# CellTypist commands printed in the next section
adata.write(celltypist_in_file)

## format the CellTypist command
- for running on the NIH HPC using Singularity

In [None]:
# Session setup lines are the same for every model, so build them once.
# Each one is emitted followed by ' \n'.
setup_steps = [
    'sinteractive --constraint=gpuk80 --gres=lscratch:10,gpu:k80:1 --mem=96g',
    'module load rapids-singlecell',
    'module load singularity',
    'cd /data/ADRD/brain_aging/phase2',
    'singularity pull celltypist-latest.sif docker://quay.io/teichlab/celltypist:latest',
]
setup_text = ''.join(f'{step} \n' for step in setup_steps)

# Print one copy-and-paste command block per CellTypist model.
for abbr, model_name in celltypist_models.items():
    print(f'\n### {model_name}')
    # pieces of the single `singularity run` invocation, joined below with
    # shell line continuations (' \' + newline)
    run_args = [
        'singularity run',
        '-B /data/ADRD/brain_aging/phase2:/data',
        'celltypist-latest.sif',
        f'celltypist --indata /data/quants/{project}_{modality}.celltypist.h5ad',
        f'--model {model_name}.pkl --majority-voting',
        f'--prefix {project}_{modality}_{abbr}_ --outdir /data/celltypist',
    ]
    this_cmd = setup_text + ' \\\n'.join(run_args)
    print(this_cmd)

## what are the predicted cell-type labels

In [None]:
# Summarize and plot the CellTypist majority-voting labels for each model.
# These CSVs are produced by running the commands printed above, so they must
# exist before this cell is run.

# set the plotting style once, before any figure is created
plt.style.use('seaborn-v0_8-talk')

for abbr in celltypist_models:
    cell_typist_file = f'{celltypist_dir}/{project}_{modality}_{abbr}_predicted_labels.csv'
    celltypist_pred = read_csv(cell_typist_file, index_col=0)
    nuf.peek_dataframe(celltypist_pred, DEBUG)

    # cells per predicted label, computed once and reused for display and plot
    label_counts = celltypist_pred.majority_voting.value_counts()
    if DEBUG:
        print(f'{celltypist_pred.majority_voting.nunique()=}')
        display(label_counts)

    # Build explicit 'majority_voting' and 'count' columns rather than relying
    # on the names value_counts() assigns (they differ between pandas versions)
    # or on seaborn resolving a plot variable from the index.
    counts_df = label_counts.rename_axis('majority_voting').reset_index(name='count')

    fig, ax = plt.subplots(figsize=(15, 20))
    barplot(data=counts_df, x='count', y='majority_voting', ax=ax)
    ax.tick_params(axis='y', labelsize=11)
    ax.set_title(f'{modality} {abbr}')
    plt.show()

In [None]:
# print the current date/time (shell `date`) to timestamp the end of the run
!date