# CellTypist
CellTypist is a tool for automated cell type annotation of scRNA-seq data. It uses machine learning models trained on curated reference datasets to rapidly and accurately assign cell type labels. CellTypist supports both quick predictions with pre-trained models and customizable training on user-defined datasets for more tailored annotations.

In [6]:
import sys
from pathlib import Path

# `load_and_prepare_data` is a local helper module that lives in the
# preprocessing folder, so that folder has to be on the import path *before*
# the module is imported (otherwise a fresh kernel raises ModuleNotFoundError).
# NOTE: the path is relative to the current working directory.
script_dir = Path('../../preprocessing')
if str(script_dir) not in sys.path:
    # Guard so that re-running this cell does not keep appending duplicates.
    sys.path.append(str(script_dir))

import celltypist
from celltypist import models
from load_and_prepare_data import adult_human_heart

# Load the heart dataset from its expression matrix and per-cell cluster info.
# NOTE(review): presumably returns an AnnData object -- confirm in
# load_and_prepare_data.
adata = adult_human_heart(
    expression_matrix=Path("../../data/GSE109816_normal_heart_umi_matrix.csv.gz"),
    metadata_path=Path("../../data/GSE109816_normal_heart_cell_cluster_info.txt"),
)

# Since we're using adult human heart data, we can use the model trained on
# healthy adult heart data.
model = models.Model.load(model='Healthy_Adult_Heart.pkl')
# Uncomment the next line to see all available models:
# models.models_description()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Run CellTypist on the AnnData object (it has to be an actual AnnData, not a
# Seurat object) using the adult human heart model, with majority voting on.
# NOTE(review): `combined_adata` is not created in the cells shown here --
# confirm it is defined earlier in the session before running this cell.
# NOTE(review): the CellTypist docs ask for log1p-normalised expression
# (10,000 counts per cell) as input -- confirm `combined_adata` matches.
result = celltypist.annotate(
    combined_adata,
    model=model,
    majority_voting=True,
)

In [None]:
# Table of CellTypist predictions, indexed by cell name.
pred_labels = result.predicted_labels

# Remove duplicated cell names from the predictions, keeping the first one.
pred_labels = pred_labels[~pred_labels.index.duplicated(keep='first')]

# Drop duplicated cell names from the AnnData as well, to be safe.
# Subsetting an AnnData returns a view; `.copy()` materialises it so that the
# later writes into `.obs` act on a real object instead of triggering an
# implicit view-to-copy conversion (and its warning).
is_first_occurrence = ~combined_adata.obs_names.duplicated(keep='first')
combined_adata = combined_adata[is_first_occurrence].copy()

# Restrict to the cell names present in both the AnnData and the predictions.
shared_cells = combined_adata.obs_names.intersection(pred_labels.index)

# Pull out, for the shared cells only, the three prediction columns:
# the per-cell label, the majority-voting label and the over-clustering id.
cell_types = pred_labels.loc[shared_cells, "predicted_labels"]
majority_voting = pred_labels.loc[shared_cells, "majority_voting"]
over_clustering = pred_labels.loc[shared_cells, "over_clustering"]

In [None]:
# Store the CellTypist outputs as new columns of the per-cell metadata.
# The values only cover the shared cells.
# NOTE(review): presumably pandas aligns on the index, so any cell missing
# from the predictions ends up as NaN -- confirm.
new_obs_columns = {
    'cell_type': cell_types,
    'cell_type_majority': majority_voting,
    'over_clustering': over_clustering,
}
for column_name, column_values in new_obs_columns.items():
    combined_adata.obs[column_name] = column_values

In [None]:
# Plot the UMAP embedding coloured by the majority-voting cell type, with the
# legend placed in the right margin.
# NOTE(review): `sc` (presumably `import scanpy as sc`) is not imported in the
# cells shown here, and a UMAP embedding must already exist on
# `combined_adata` -- confirm both before running.
sc.pl.umap(
    combined_adata,
    color='cell_type_majority',
    legend_loc='right margin',
)