# Celltyping Interactive Workflow

Run this notebook top-to-bottom, or cell-by-cell for iterative exploration.

In [None]:
from pathlib import Path
import os
import sys
import warnings

# Make `pipeline/` importable from notebooks/.
#
# A later cell calls os.chdir(OUTPUT_DIR), so when this cell is re-run the
# working directory is no longer notebooks/ and "parent of cwd" would point at
# the wrong directory. Walk upwards to the nearest directory that contains
# `pipeline/`; if none is found, fall back to the original "parent of cwd" rule.
_cwd = Path.cwd().resolve()
repo_root = next(
    (p for p in (_cwd, *_cwd.parents) if (p / "pipeline").is_dir()),
    _cwd.parent,
)
pipeline_dir = repo_root / "pipeline"
if str(pipeline_dir) not in sys.path:  # avoid stacking duplicates on re-run
    sys.path.insert(0, str(pipeline_dir))

# Deliberately silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from scipy import sparse

import stcr

from celltyping import (
    annotate,
    gene_entropy,
    load_trb,
    phenotype_tcells,
    remove_meaningless_genes,
    run_harmony_workflow,
)
from modules.celltyping_io import load_directory_manual, load_directory_scirpy
from modules.celltyping_validation import run_validation_plots
from modules.celltyping_clonality import (
    compute_clonality_patient,
    compute_temporal_correlations,
    plot_clonality_boxplots,
    plot_clonality_boxplots_by_timepoint,
    plot_clonality_heatmap,
    plot_clonality_lines,
    plot_correlation_heatmap,
    plot_top_correlations,
)
from modules.celltyping_geometry import plot_clone_simplex, plot_clone_transitions

In [None]:
# ---- Config ----
DATA_DIR = repo_root / "data" / "btc_gbm_gex_vdj"
# Signature table handed to phenotype_tcells(beltra_path=...) further down.
# The default is a machine-specific location; set the CELLTYPING_TABLE_SIG
# environment variable to point at your own copy instead of editing this cell.
TABLE_SIG = Path(
    os.environ.get(
        "CELLTYPING_TABLE_SIG",
        "/Users/ceglian/Downloads/41586_2025_9989_MOESM10_ESM.xlsx",
    )
)
OUTPUT_DIR = repo_root / "pipeline" / "outputs" / "celltyping_interactive"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Later cells pass bare filenames as `savepath`, so make OUTPUT_DIR the cwd.
os.chdir(OUTPUT_DIR)

print(f"DATA_DIR   : {DATA_DIR}")
print(f"TABLE_SIG  : {TABLE_SIG}")
print(f"OUTPUT_DIR : {OUTPUT_DIR}")

# Flag missing inputs here rather than deep inside a loader. Uses print()
# because the warnings module is globally silenced earlier in the notebook.
for _label, _path in (("DATA_DIR", DATA_DIR), ("TABLE_SIG", TABLE_SIG)):
    if not _path.exists():
        print(f"WARNING: {_label} does not exist: {_path}")

In [None]:
# ---- Load + merge TRB annotations ----
# The same directory is read twice: once through the scirpy-based loader and
# once through the manual loader driven by `load_trb`.
adata_scirpy = load_directory_scirpy(str(DATA_DIR))
adata_manual = load_directory_manual(str(DATA_DIR), load_trb)

adata_scirpy = annotate(adata_scirpy)
adata_manual = annotate(adata_manual)

# Stringify the VDJ junction so missing values become the literal "nan".
adata_scirpy.obs["trb"] = [str(x) for x in adata_scirpy.obs["IR_VDJ_1_junction_aa"]]

# Keep cells flagged as having a receptor and carrying a junction sequence.
# NOTE(review): `has_ir` is compared against the string "True" — confirm that
# matches the dtype produced by the loader in use.
_keep = (adata_scirpy.obs["has_ir"] == "True") & (adata_scirpy.obs["trb"] != "nan")
adata_scirpy = adata_scirpy[_keep].copy()

# Restrict the manual object to the same cells, in the same order.
# This indexing raises if a retained scirpy barcode is absent from the manual
# object after its own "None" filter.
adata_manual = adata_manual[adata_manual.obs["trb"] != "None"].copy()
adata_manual = adata_manual[adata_scirpy.obs.index].copy()

# Carry the manual annotations across onto the scirpy object.
for _dst, _src in (
    ("mait", "mait"),
    ("inkt", "inkt"),
    ("trb_all", "trb"),
    ("tra_all", "tra"),
):
    adata_scirpy.obs[_dst] = adata_manual.obs[_src]

# Rows with trb == "nan" were already removed above, so a plain copy suffices.
adata = adata_scirpy.copy()
# Snapshot the matrix: without .copy() the layer aliases X, and any later
# in-place change to X would silently change "counts" as well.
adata.layers["counts"] = adata.X.copy()

print(adata)

In [None]:
# ---- Basic preprocessing + embedding ----
# Score genes first; the "entropy" column read below is presumably written by
# gene_entropy() — verify against its implementation.
gene_entropy(adata)
adata = remove_meaningless_genes(adata)

# Reduced object used only to compute the embedding: genes above the entropy
# threshold, with a second pruning pass that also passes include_tcr=True.
high_entropy = adata.var["entropy"] > 1.5
adata_trim = adata[:, high_entropy].copy()
adata_trim = remove_meaningless_genes(adata_trim, include_tcr=True)

# Normalise and log-transform the reduced copy only; `adata` is left untouched.
sc.pp.normalize_total(adata_trim)
sc.pp.log1p(adata_trim)
adata_trim = run_harmony_workflow(adata_trim, "sample")

# Reuse the embedding computed on the reduced object for the full object.
adata.obsm["X_umap"] = adata_trim.obsm["X_umap"]

umap_colors = ["patient", "timepoint", "tissue", "clone_size"]
sc.pl.umap(
    adata,
    color=umap_colors,
    ncols=2,
    frameon=False,
    add_outline=True,
    s=10,
)

In [None]:
# ---- Phenotyping ----
# Assign phenotypes using the signature table configured as TABLE_SIG.
adata = phenotype_tcells(adata, beltra_path=str(TABLE_SIG))

# Cell counts per phenotype (rows) and tissue (columns).
phenotype_by_tissue = pd.crosstab(adata.obs["phenotype"], adata.obs["tissue"])
print(phenotype_by_tissue)

In [None]:
# ---- Validation panel figures ----
# The return value is bound to `_` so the notebook does not echo it.
_ = run_validation_plots(
    adata,
    key="phenotype",
    save_prefix="phenotype_validation",
)

In [None]:
# ---- Clonality summary ----
df = compute_clonality_patient(adata)

# The heatmap takes a grouping column and a title in addition to the save path.
plot_clonality_heatmap(
    df,
    "tissue",
    "Clonality by Phenotype × Tissue (mean ± SEM)",
    savepath="clonality_tissue.png",
)

# The remaining summary plots share the (df, savepath=...) call shape.
for _plot_fn, _png in (
    (plot_clonality_lines, "clonality_lines.png"),
    (plot_clonality_boxplots, "clonality_boxplots_tissue.png"),
    (plot_clonality_boxplots_by_timepoint, "clonality_boxplots_timepoint.png"),
):
    _plot_fn(df, savepath=_png)

# Temporal correlations, computed with min_obs=4, then plotted.
corr_df = compute_temporal_correlations(df, min_obs=4)
plot_correlation_heatmap(corr_df, savepath="clonality_correlation_heatmap.png")
plot_top_correlations(corr_df, df, savepath="clonality_top_correlations.png")

In [None]:
# ---- Optional: simplex / transitions ----
# Display label -> phenotypes grouped under it. The labels are two-line
# strings; the line break must be written as an escaped "\n" because a plain
# quoted string cannot span physical lines.
cd8_groups = {
    "Circulating\n(TEMRA/Naive)": ["TEMRA", "Naive"],
    "Exhaustion\n(TEXprog/eff/term)": ["TEXprog", "TEXeff", "TEXterm"],
    "Resident\n(TRM/Memory)": ["TRM", "Memory"],
}

# One fixed colour per tissue, shared by both figures below.
tissue_colors = {"CSF": "#cd442a", "PBMC": "#f0bd00", "TP": "#7e9437"}

_ = plot_clone_simplex(
    adata,
    cd8_groups,
    tissue_colors=tissue_colors,
    lineage="CD8",
    a=50,
    b=1.8,
)

# Transition plot between the "TP" and "CSF" tissues, coloured to match.
_ = plot_clone_transitions(
    adata,
    cd8_groups,
    "TP",
    "CSF",
    lineage="CD8",
    tissue_labels=("TP", "CSF"),
    point_color_from=tissue_colors["TP"],
    point_color_to=tissue_colors["CSF"],
    a=100,
    b=3,
)

In [None]:
# ---- Save final AnnData ----
# Write the annotated object into OUTPUT_DIR and report where it went.
out_h5ad = OUTPUT_DIR.joinpath("GBM_TCR_POS_TCELLS_interactive.h5ad")
adata.write(out_h5ad)
print(f"Saved: {out_h5ad}")