In [1]:
# Suppress warnings before importing packages
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

# Configure dask before importing packages that use it
import dask
dask.config.set({"dataframe.query-planning": True})

# Now import packages
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt



In [2]:
# %%
# ============================================
# 0. SETUP & CONFIGURATION
# ============================================
import os
import warnings
import dask
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt

# Suppress warnings
warnings.filterwarnings('ignore', category=FutureWarning)
dask.config.set({"dataframe.query-planning": True})

# --- VISUALIZATION SETTINGS (WHITE THEME) ---
# 1. Enforce White Background for Seaborn
sns.set_style("white")
sns.set_context("paper", font_scale=1.2)

# 2. Enforce White Background for Matplotlib (Global)
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['savefig.facecolor'] = 'white'

# 3. Configure Scanpy
sc.set_figure_params(dpi=150, figsize=(8, 8), facecolor='white')

# Define Paths
DATA_PATH = "../../data/Kidney_ST/GSE211785_7_13_23_slide0_annotated_iPTsubclusters.h5ad"
OUTPUT_DIR = "../../analysis/Qc"

# Create Output Directory
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Outputs will be saved to: {OUTPUT_DIR}")


Outputs will be saved to: ../../analysis/Qc


In [3]:

# --- HELPER FUNCTION FOR SAVING PLOTS ---


def save_plot(filename_base):
    """Saves the current matplotlib figure as PDF and PNG."""
    # Save PNG (High Res Raster for PPT)
    png_path = os.path.join(OUTPUT_DIR, f"{filename_base}.png")
    plt.savefig(png_path, dpi=300, bbox_inches='tight', facecolor='white')

    # Save PDF (Vector for Paper)
    pdf_path = os.path.join(OUTPUT_DIR, f"{filename_base}.pdf")
    plt.savefig(pdf_path, format='pdf', bbox_inches='tight', facecolor='white')

    print(f"Saved: {filename_base}")



# %%
# ============================================
# 1. DATA LOADING & FIXES
# ============================================
print("Loading data...")
adata = sc.read_h5ad(DATA_PATH)

# CRITICAL FIX: Convert coordinates to Numpy Array
if hasattr(adata.obsm['spatial'], 'to_numpy'):
    print("Fixing spatial coordinates format...")
    adata.obsm['spatial'] = adata.obsm['spatial'].to_numpy()
elif not isinstance(adata.obsm['spatial'], np.ndarray):
    adata.obsm['spatial'] = np.array(adata.obsm['spatial'])

print("Data Loaded Successfully.")




# %%
# ============================================
# 2. SPATIAL QC PLOTS (Noise & Condition)
# ============================================
print("Generating Spatial QC Plots...")

# Plot A: Condition vs Noise Level
# This uses 'viridis' for noise to make high noise (yellow) stand out against white
fig = sc.pl.spatial(
    adata,
    color=['type', 'propNegative'],
    title=["Condition (Disease vs Healthy)",
           "Noise Level (% Negative Probes)"],
    spot_size=0.01,
    wspace=0.3,
    cmap='viridis',
    palette={'Healthy': 'lightgrey', 'Disease': 'firebrick'},  # Grey vs Red
    return_fig=True,
    show=False
)
save_plot("1_QC_Spatial_Condition_vs_Noise")
plt.close()


Loading data...


Fixing spatial coordinates format...
Data Loaded Successfully.
Generating Spatial QC Plots...
Saved: 1_QC_Spatial_Condition_vs_Noise


In [4]:
import scanpy as sc

print("--- 1. General AnnData Overview ---")
print(adata)

print("\n--- 2. Check Unstructured Data (.uns) ---")
# This is where Scanpy looks for spatial metadata (images, scale factors)
print("Keys in adata.uns:", list(adata.uns.keys()))

if 'spatial' in adata.uns:
    print("\n'spatial' key found in .uns. Contents:")
    print(adata.uns['spatial'].keys())
else:
    print("\n[!] 'spatial' key NOT found in adata.uns (This causes the error)")

print("\n--- 3. Check Spatial Coordinates (.obsm) ---")
# Ensure your coordinates are actually stored here
if 'spatial' in adata.obsm:
    print("Spatial coordinates shape:", adata.obsm['spatial'].shape)
    print("First 5 coords:\n", adata.obsm['spatial'][:5])
else:
    print("[!] 'spatial' not found in .obsm")


--- 1. General AnnData Overview ---
AnnData object with n_obs Ã— n_vars = 558601 Ã— 1001
    obs: 'nn_33131227.a09b.4d87.addf.d7bb907597e5_1_cluster_cluster_e1ad5c7b.3174.4ee9.bfbc.d6efb6833675_1', 'RNA_nbclust_41a5ffc1.6941.4e83.af9c.d661c9c543e5_1_clusters', 'RNA_nbclust_41a5ffc1.6941.4e83.af9c.d661c9c543e5_1_posterior_probability', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_negprobes', 'nFeature_negprobes', 'nCount_falsecode', 'nFeature_falsecode', 'fov', 'Area', 'AspectRatio', 'Width', 'Height', 'Mean.B', 'Max.B', 'Mean.CK8.18', 'Max.CK8.18', 'Mean.Membrane', 'Max.Membrane', 'Mean.R', 'Max.R', 'Mean.DAPI', 'Max.DAPI', 'cell_id', 'assay_type', 'slide_ID_numeric', 'Run_Tissue_name', 'Panel', 'cell_ID', 'x_FOV_px', 'y_FOV_px', 'x_slide_mm', 'y_slide_mm', 'propNegative', 'complexity', 'errorCtEstimate', 'percOfDataFromError', 'qcFlagsRNACounts', 'qcFlagsCellCounts', 'qcFlagsCellPropNeg', 'qcFlagsCellComplex', 'qcFlagsCellArea', 'median_negprobes', 'negprobes_quantile_0.9', 'me

In [5]:
# %%
# ============================================
# 3. QUALITY METRICS DISTRIBUTIONS (FINAL LABELS)
# ============================================

from matplotlib.ticker import FuncFormatter 

# Define the formatter function
def thousands_formatter(x, pos):
    """Converts Y-axis tick values (Count) to 'k' notation (0, 1k, 2k, etc.)."""
    if x >= 1000:
        return f'{x/1000:1.0f}k'
    return f'{x:1.0f}'

print("Generating Quality Histograms with '# of cells' Y-axis label...")

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# --- Plot 1: Transcripts ---
sns.histplot(data=adata.obs, x='nCount_RNA', hue='type',
             bins=50, ax=axes[0], palette=['firebrick', 'grey'])
axes[0].set_title("Transcripts per Cell")
axes[0].set_xlim(0, 1000)
axes[0].grid(False)
if axes[0].legend_:
    axes[0].legend_.set_title("Condition")
axes[0].yaxis.set_major_formatter(FuncFormatter(thousands_formatter))
axes[0].set_ylabel("# of cells") # <-- Y-AXIS LABEL CHANGE


# --- Plot 2: Genes ---
sns.histplot(data=adata.obs, x='nFeature_RNA', hue='type',
             bins=50, ax=axes[1], palette=['firebrick', 'grey'])
axes[1].set_title("Genes per Cell")
axes[1].grid(False)
if axes[1].legend_:
    axes[1].legend_.set_title("Condition")
axes[1].yaxis.set_major_formatter(FuncFormatter(thousands_formatter))
axes[1].set_ylabel("# of cells") # <-- Y-AXIS LABEL CHANGE


# --- Plot 3: Area ---
sns.histplot(data=adata.obs, x='Area', hue='type', bins=50,
             ax=axes[2], palette=['firebrick', 'grey' ])
axes[2].set_title("Cell Size (Area)")
axes[2].grid(False)
if axes[2].legend_:
    axes[2].legend_.set_title("Condition")
axes[2].yaxis.set_major_formatter(FuncFormatter(thousands_formatter))
axes[2].set_ylabel("# of cells") # <-- Y-AXIS LABEL CHANGE

plt.tight_layout()
save_plot("2_QC_Histograms_Quality_Metrics")
plt.close()

Generating Quality Histograms with '# of cells' Y-axis label...


Saved: 2_QC_Histograms_Quality_Metrics


In [6]:
# %%
# ============================================
# 4. CELL TYPE COMPOSITION (X-AXIS LABEL CHANGED)
# ============================================
print("Generating Composition Plots...")


def plot_proportions(adata, cell_type_col, condition_col, title, filename):
    # Create contingency table
    df = pd.crosstab(adata.obs[condition_col], adata.obs[cell_type_col])

    # Normalize to 0-1 scale
    df_prop = df.div(df.sum(axis=1), axis=0)

    # Sort for visual cleanliness
    if 'Disease' in df_prop.index:
        df_prop = df_prop.sort_values(by='Disease', axis=1, ascending=False)

    ax = df_prop.plot(
        kind='bar', stacked=True, figsize=(6, 6),
        colormap='tab20', width=0.8
    )
    plt.title(title, fontsize=14, fontweight='bold')
    plt.ylabel("Proportion of Cells (0-1)")

    # --- ADDED: EXPLICITLY SET X-AXIS LABEL ---
    plt.xlabel("Condition")

    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=0)
    plt.tight_layout()
    save_plot(filename)
    plt.close()


# Broad Types
plot_proportions(adata, 'cellType_CosMx_1', 'type',
                 'Cells Dynamics', "3_Composition_Broad")

# Detailed Types
plot_proportions(adata, 'cellType_CosMx_2', 'type',
                 'Cells Dynamics', "3_Composition_Detailed")


Generating Composition Plots...


Saved: 3_Composition_Broad
Saved: 3_Composition_Detailed


In [7]:
# %%
# ============================================
# 5. BIOLOGICAL VALIDATION (REDUCED SIZE)
# ============================================
print("Generating Validation Plots...")

# A. Protein Validation (CK8/18)
subset_types = ['PT', 'iPT', 'Fibro', 'T-cell', 'Podocyte']
subset_adata = adata[adata.obs['cellType_CosMx_2'].isin(subset_types)]

plt.figure(figsize=(4, 3))  # <-- REDUCED SIZE
sns.boxplot(data=subset_adata.obs, x='cellType_CosMx_2',
            y='Mean.CK8.18', showfliers=False, palette='Blues')
plt.title("Protein Validation: Epithelial Marker (CK8/18)")
plt.xticks(rotation=45, fontsize=8)  # <-- Adjusted font size for smaller plot
plt.yticks(fontsize=8)
plt.tight_layout()
save_plot("4_Validation_Protein_CK818")
plt.close()


Generating Validation Plots...


Saved: 4_Validation_Protein_CK818


In [8]:
# %%
# ============================================
# B. Cell Hypertrophy (Area) - Y-Axis Font Corrected
# ============================================
from matplotlib.ticker import FuncFormatter


def thousands_formatter(x, pos):
    """Converts Y-axis tick values (Area) to 'k' notation."""
    if x >= 1000:
        return f'{x/1000:1.0f}k'
    return f'{x:1.0f}'


# B. Cell Hypertrophy (Area)
pt_subset = adata[adata.obs['cellType_CosMx_2'].str.contains(
    'TAL|iPT', regex=True)]

plt.figure(figsize=(4, 2.5))  # <-- Increased height to 2.5 inches

sns.boxplot(
    data=pt_subset.obs, x='cellType_CosMx_2', y='Area', hue='type',
    showfliers=False, palette={'Healthy': 'grey', 'Disease': 'tomato'}
)

# --- Apply Y-Axis formatting ---
ax = plt.gca()
ax.yaxis.set_major_formatter(FuncFormatter(thousands_formatter))
# -------------------------------

# --- CORRECTED FONT SIZES ---
plt.title("Cell Hypertrophy: Tubule Size",
          fontsize=8)  # Increased title slightly
plt.ylabel("Cell Area (pixels)", fontsize=7)
plt.xticks(rotation=45, fontsize=7)  # X-ticks increased
plt.yticks(fontsize=7)  # <-- Y-TICKS INCREASED TO LEGIBLE SIZE 7

if ax.get_legend():
    ax.get_legend().remove()

plt.tight_layout()
save_plot("4_Validation_Morphology_Area")
plt.close()


Saved: 4_Validation_Morphology_Area


In [9]:

# C. Spatial Gene Markers (Reference Genes)
genes_of_interest = ['LRP2', 'SPP1', 'HAVCR1']
available_genes = [g for g in genes_of_interest if g in adata.var_names]

if available_genes:
    # Note: sc.pl.spatial size is controlled by the number of subplots,
    # but the smaller font in the surrounding plots helps with visual consistency.
    sc.pl.spatial(
        adata,
        color=available_genes + ['type'],
        title=available_genes + ['Reference: Condition'],
        spot_size=0.01,
        cmap='plasma',
        wspace=0.3,
        palette={'Healthy': 'lightgrey', 'Disease': 'firebrick'},
        show=False
    )
    save_plot("4_Validation_Spatial_Markers")
    plt.close()


Saved: 4_Validation_Spatial_Markers


In [10]:

# %%
# ============================================
# 6. iPT SUBCLUSTER DEEP DIVE
# ============================================
print("Generating iPT Subcluster Analysis...")

# Update this column name if your specific file uses a different one
TARGET_SUBCLUSTER_COL = 'leiden_1_sub11'

if TARGET_SUBCLUSTER_COL in adata.obs.columns:
    # 1. Spatial Map of Subclusters
    sc.pl.spatial(
        adata,
        color=TARGET_SUBCLUSTER_COL,
        title="iPT Subclusters Spatial Map",
        spot_size=0.01,
        palette='tab20',  # distinct colors
        show=False
    )
    save_plot("5_iPT_Subclusters_Spatial")
    plt.close()

    # 2. HAVCR1 Expression Violin Plot
    if 'HAVCR1_expression' in adata.obs.columns:
        plt.figure(figsize=(12, 6))
        # Sort by mean expression for readability
        order = adata.obs.groupby(TARGET_SUBCLUSTER_COL)[
            'HAVCR1_expression'].mean().sort_values().index

        sns.violinplot(
            data=adata.obs, x=TARGET_SUBCLUSTER_COL, y='HAVCR1_expression',
            order=order, palette='viridis'
        )
        plt.title("HAVCR1 Injury Marker by Subcluster")
        plt.xticks(rotation=45)
        plt.tight_layout()
        save_plot("5_iPT_Subclusters_Violin")
        plt.close()
else:
    print(
        f"Warning: Column {TARGET_SUBCLUSTER_COL} not found. Skipping iPT specific plots.")

print("\n--- ALL PLOTS SAVED SUCCESSFULLY TO ./analysis/Qc ---")


Generating iPT Subcluster Analysis...


Saved: 5_iPT_Subclusters_Spatial
Saved: 5_iPT_Subclusters_Violin

--- ALL PLOTS SAVED SUCCESSFULLY TO ./analysis/Qc ---


In [11]:
# %%
# ============================================
# 6. ALL CELL TYPE DEEP DIVE (Replacing iPT Subclusters)
# ============================================
print("Generating All Cell Type Analysis...")

# Update this column name to the primary cell type column
# <-- Using the primary cell type column
TARGET_CELL_TYPE_COL = 'cellType_CosMx_2'

if TARGET_CELL_TYPE_COL in adata.obs.columns:
    # 1. Spatial Map of Cell Types
    sc.pl.spatial(
        adata,
        color=TARGET_CELL_TYPE_COL,
        title="All Cell Types Spatial Map",
        spot_size=0.01,
        palette='tab20',  # distinct colors
        show=False
    )
    save_plot("5_CellTypes_Spatial")  # <-- Updated filename
    plt.close()

    # 2. HAVCR1 Expression Violin Plot
    if 'HAVCR1_expression' in adata.obs.columns:
        plt.figure(figsize=(12, 6))

        # Sort by mean HAVCR1 expression across all cell types for readability
        order = adata.obs.groupby(TARGET_CELL_TYPE_COL)[
            'HAVCR1_expression'].mean().sort_values().index

        sns.violinplot(
            data=adata.obs, x=TARGET_CELL_TYPE_COL, y='HAVCR1_expression',
            order=order, palette='viridis'
        )
        plt.title("HAVCR1 Injury Marker by Cell Type")
        plt.xticks(rotation=45)
        plt.tight_layout()
        save_plot("5_CellTypes_Violin")  # <-- Updated filename
        plt.close()
else:
    print(
        f"Warning: Column {TARGET_CELL_TYPE_COL} not found. Skipping cell type plots.")

print("\n--- ALL PLOTS SAVED SUCCESSFULLY TO ./analysis/Qc ---")


Generating All Cell Type Analysis...


Saved: 5_CellTypes_Spatial
Saved: 5_CellTypes_Violin

--- ALL PLOTS SAVED SUCCESSFULLY TO ./analysis/Qc ---


In [12]:
# %%
# ============================================
# 7. HAVCR1 EXPRESSION SPLIT BY CONDITION
# ============================================
print("Generating HAVCR1 Expression Violin Plots split by Condition...")

TARGET_CELL_TYPE_COL = 'cellType_CosMx_2'
MARKER_COL = 'HAVCR1_expression'
HEALTHY_COLOR = 'grey'
DISEASE_COLOR = 'tomato'

if TARGET_CELL_TYPE_COL in adata.obs.columns and MARKER_COL in adata.obs.columns:

    full_obs = adata.obs.copy()

    # 1. Filter the dataframes
    healthy_obs = full_obs[full_obs['type'] == 'Healthy']
    disease_obs = full_obs[full_obs['type'] == 'Disease']

    # --- Plot 1: HEALTHY CONDITION ---
    plt.figure(figsize=(10, 5))

    # Sort by mean expression in the Healthy subset
    order_healthy = healthy_obs.groupby(TARGET_CELL_TYPE_COL)[
        MARKER_COL].mean().sort_values().index

    sns.violinplot(
        data=healthy_obs,
        x=TARGET_CELL_TYPE_COL,
        y=MARKER_COL,
        order=order_healthy,
        palette=[HEALTHY_COLOR],  # Use 'grey'
        inner='quartile'
    )
    plt.title("HAVCR1 Expression: HEALTHY Condition Only", fontsize=12)
    plt.ylabel("HAVCR1 Expression (Log Norm)", fontsize=10)
    plt.xlabel("Cell Type", fontsize=10)
    plt.xticks(rotation=45, fontsize=8)
    plt.yticks(fontsize=8)
    plt.tight_layout()
    save_plot("6_HAVCR1_Violin_Healthy")
    plt.close()

    # --- Plot 2: DISEASE CONDITION ---
    plt.figure(figsize=(10, 5))

    # Sort by mean expression in the Disease subset
    order_disease = disease_obs.groupby(TARGET_CELL_TYPE_COL)[
        MARKER_COL].mean().sort_values().index

    sns.violinplot(
        data=disease_obs,
        x=TARGET_CELL_TYPE_COL,
        y=MARKER_COL,
        order=order_disease,
        palette=[DISEASE_COLOR],  # Use 'tomato'
        inner='quartile'
    )
    plt.title("HAVCR1 Expression: DISEASE Condition Only", fontsize=12)
    plt.ylabel("HAVCR1 Expression (Log Norm)", fontsize=10)
    plt.xlabel("Cell Type", fontsize=10)
    plt.xticks(rotation=45, fontsize=8)
    plt.yticks(fontsize=8)
    plt.tight_layout()
    save_plot("6_HAVCR1_Violin_Disease")
    plt.close()

else:
    print(
        f"Warning: Columns {TARGET_CELL_TYPE_COL} or {MARKER_COL} not found. Skipping plots.")

print("\n--- HAVCR1 Condition-Split Violin Plots Saved Successfully ---")


Generating HAVCR1 Expression Violin Plots split by Condition...


The palette list has fewer values (1) than needed (17) and will cycle, which may produce an uninterpretable plot.
  sns.violinplot(


Saved: 6_HAVCR1_Violin_Healthy


The palette list has fewer values (1) than needed (17) and will cycle, which may produce an uninterpretable plot.
  sns.violinplot(


Saved: 6_HAVCR1_Violin_Disease

--- HAVCR1 Condition-Split Violin Plots Saved Successfully ---


In [13]:
# %%
# ============================================
# 9. BOX PLOT: iTAL EXPRESSION ONLY
# ============================================
print("Generating Box Plot for 'TAL|iPT' expression only...")

MARKER_COL = 'HAVCR1_expression'
HEALTHY_COLOR = 'grey'
DISEASE_COLOR = 'tomato'

# 1. Filter data using the user's exact logic (only cells where cellType_CosMx_2 is 'iTAL')
if MARKER_COL in adata.obs.columns:
    subset_obs = adata.obs[adata.obs['cellType_CosMx_2'].str.contains(
        'Injured|iPT', regex=True)].copy()

    plt.figure(figsize=(3, 3))  # Compact size

    sns.boxplot(
        data=subset_obs,
        # X-axis is the Condition (Healthy vs Disease)
        x='type',
        y=MARKER_COL,                 # HAVCR1 expression on Y-axis
        showfliers=False,             # Hide outliers
        palette={'Healthy': HEALTHY_COLOR, 'Disease': DISEASE_COLOR},
        linewidth=1
    )

    # Add labels and title
    plt.title(f"{MARKER_COL} in iTAL Cells", fontsize=9)
    plt.ylabel(f"{MARKER_COL} Expression (Log Norm)", fontsize=8)
    plt.xlabel("Condition", fontsize=8)

    # Adjust ticks
    plt.xticks(rotation=0, fontsize=8)
    plt.yticks(fontsize=7)

    plt.tight_layout()
    save_plot("8_iTAL_Expression_Condition_BoxPlot")
    plt.close()

else:
    print(f"Warning: Marker column '{MARKER_COL}' not found. Skipping plot.")

print("\n--- iTAL Expression Condition Box Plot Saved Successfully ---")


Generating Box Plot for 'TAL|iPT' expression only...


Saved: 8_iTAL_Expression_Condition_BoxPlot

--- iTAL Expression Condition Box Plot Saved Successfully ---


In [None]:
# %%
# ============================================
# 7. MICROENVIRONMENT ANALYSIS (Violin Plot, Fixed Scale & Star Position)
# ============================================
import seaborn as sns
from sklearn.neighbors import KDTree
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu
print("Generating Microenvironment Analysis (Violin Plot) with Fixed Y-Scale and Star Position...")

# Define the necessary imports here for the function to run cleanly

# Define the fixed Y-axis range
Y_AXIS_LIMITS = (0, 100)
DISEASE_COLOR = 'tomato'
HEALTHY_COLOR = 'lightgrey'
# Define the exact Y-coordinate for the star annotation
STAR_Y_POSITION = 90.0

# --- A. CALCULATE NEIGHBORS (If not already present) ---
if 'immune_neighbors' not in adata.obs.columns:
    print("Calculating neighborhood composition (this may take a moment)...")
    coords = adata.obsm['spatial']
    tree = KDTree(coords)

    immune_mask = adata.obs['cellType_CosMx_2'].str.contains(
        'Immune|T-cell|Macrophage|B-cell', case=False).values
    fibro_mask = adata.obs['cellType_CosMx_2'].str.contains(
        'Fibroblast|Myofibroblast', case=False).values

    indices = tree.query_radius(coords, r=50)

    adata.obs['immune_neighbors'] = [np.sum(immune_mask[i]) for i in indices]
    adata.obs['fibro_neighbors'] = [np.sum(fibro_mask[i]) for i in indices]
    print("Neighborhood metrics calculated.")

# --- B. STATISTICAL ANALYSIS ---

# Function to run Mann-Whitney U test and format p-value as stars


def get_p_star(data_h, data_d, column):
    stat, p = mannwhitneyu(
        data_h[column], data_d[column], alternative='two-sided')
    if p < 0.01:
        return f'** (p={p:.2e})'
    elif p < 0.05:
        return f'* (p={p:.2e})'
    else:
        return f'n.s. (p={p:.2f})'

# Function to add annotation (FIXED Y-POSITION)


def add_annotation(ax, p_star, y_coord=STAR_Y_POSITION):
    # Fixed position for the horizontal line and annotation text
    y_line = y_coord

    # Draw horizontal line
    line_height_offset = 0.5
    ax.plot([0, 0, 1, 1], [y_line, y_line + line_height_offset, y_line + line_height_offset, y_line],
            lw=1.5, c='black')

    # Add significance star and p-value
    text_height_offset = 0.75
    ax.text(0.5, y_line + text_height_offset, p_star,
            ha='center', va='bottom', fontsize=12, fontweight='bold')


# Separate data by condition
healthy_data = adata.obs[adata.obs['type'] == 'Healthy']
disease_data = adata.obs[adata.obs['type'] == 'Disease']

immune_p_star = get_p_star(healthy_data, disease_data, 'immune_neighbors')
fibro_p_star = get_p_star(healthy_data, disease_data, 'fibro_neighbors')


# --- C. PLOT THE "BAD NEIGHBORHOOD" EFFECT ---
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# Plot 1: Inflammation Environment (VIOLIN PLOT)
sns.violinplot(
    data=adata.obs, x='type', y='immune_neighbors',
    inner='quartile',  # Show the quartiles inside the violin
    bw=.2,  # Bandwidth for smoother violins
    ax=axes[0],
    palette={'Healthy': HEALTHY_COLOR, 'Disease': DISEASE_COLOR},
    order=['Healthy', 'Disease']
)
axes[0].set_title(f"Inflammation: Immune Neighbors per Cell",
                  fontsize=14, fontweight='bold')
axes[0].set_ylabel("Count (within 50$\mu$m)")
axes[0].set_xlabel("Condition")
axes[0].set_xticklabels(['Healthy', 'Disease'])
axes[0].grid(False)
axes[0].set_ylim(Y_AXIS_LIMITS)  # Apply fixed scale
# Add annotation for significance at Y=25
add_annotation(axes[0], immune_p_star.split(' ')[0])


# Plot 2: Fibrotic Environment (VIOLIN PLOT)
sns.violinplot(
    data=adata.obs, x='type', y='fibro_neighbors',
    inner='quartile',  # Show the quartiles inside the violin
    bw=.2,  # Bandwidth for smoother violins
    ax=axes[1],
    palette={'Healthy': HEALTHY_COLOR, 'Disease': DISEASE_COLOR},
    order=['Healthy', 'Disease']
)
axes[1].set_title(f"Scarring: Fibroblast Neighbors per Cell",
                  fontsize=14, fontweight='bold')
axes[1].set_ylabel("Count (within 50$\mu$m)")
axes[1].set_xlabel("Condition")
axes[1].set_xticklabels(['Healthy', 'Disease'])
axes[1].grid(False)
axes[1].set_ylim(Y_AXIS_LIMITS)  # Apply fixed scale
# Add annotation for significance at Y=25
add_annotation(axes[1], fibro_p_star.split(' ')[0])


plt.tight_layout()

# --- D. SAVE ---
save_plot("6_Microenvironment_Neighborhood_Stats_Violin_Final")
plt.close()
print("--- Microenvironment Analysis Complete ---")


  axes[0].set_ylabel("Count (within 50$\mu$m)")
  axes[1].set_ylabel("Count (within 50$\mu$m)")


Generating Microenvironment Analysis (Violin Plot) with Fixed Y-Scale and Star Position...


  axes[0].set_xticklabels(['Healthy', 'Disease'])
  axes[1].set_xticklabels(['Healthy', 'Disease'])


Saved: 6_Microenvironment_Neighborhood_Stats_Violin_Final
--- Microenvironment Analysis Complete ---


: 

In [None]:
# %%
# ============================================
# 7. MICROENVIRONMENT ANALYSIS (Fixed Scale & Star Position)
# ============================================
print("Generating Microenvironment Analysis with Star Position Fixed at Y=25...")

# Define the necessary imports here for the function to run cleanly
from scipy.stats import mannwhitneyu
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import KDTree
import seaborn as sns 

# Define the fixed Y-axis range
Y_AXIS_LIMITS = (0, 30)
DISEASE_COLOR = 'tomato'
HEALTHY_COLOR = 'lightgrey'
# Define the exact Y-coordinate for the star annotation
STAR_Y_POSITION = 25.0

# --- A. CALCULATE NEIGHBORS (If not already present) ---
if 'immune_neighbors' not in adata.obs.columns:
    print("Calculating neighborhood composition (this may take a moment)...")
    coords = adata.obsm['spatial']
    tree = KDTree(coords)

    immune_mask = adata.obs['cellType_CosMx_2'].str.contains(
        'Immune|T-cell|Macrophage|B-cell', case=False).values
    fibro_mask = adata.obs['cellType_CosMx_2'].str.contains(
        'Fibroblast|Myofibroblast', case=False).values

    indices = tree.query_radius(coords, r=50) 

    adata.obs['immune_neighbors'] = [np.sum(immune_mask[i]) for i in indices]
    adata.obs['fibro_neighbors'] = [np.sum(fibro_mask[i]) for i in indices]
    print("Neighborhood metrics calculated.")

# --- B. STATISTICAL ANALYSIS ---

# Function to run Mann-Whitney U test and format p-value as stars
def get_p_star(data_h, data_d, column):
    stat, p = mannwhitneyu(data_h[column], data_d[column], alternative='two-sided')
    if p < 0.01:
        return f'** (p={p:.2e})'
    elif p < 0.05:
        return f'* (p={p:.2e})'
    else:
        return f'n.s. (p={p:.2f})'

# Function to add annotation (FIXED Y-POSITION)
def add_annotation(ax, p_star, y_coord=STAR_Y_POSITION):
    # Fixed position for the horizontal line and annotation text
    y_line = y_coord
    
    # Draw horizontal line
    line_height_offset = 0.5 
    ax.plot([0, 0, 1, 1], [y_line, y_line + line_height_offset, y_line + line_height_offset, y_line], 
            lw=1.5, c='black')
    
    # Add significance star and p-value
    text_height_offset = 0.75 
    ax.text(0.5, y_line + text_height_offset, p_star, 
            ha='center', va='bottom', fontsize=12, fontweight='bold')
    

# Separate data by condition
healthy_data = adata.obs[adata.obs['type'] == 'Healthy']
disease_data = adata.obs[adata.obs['type'] == 'Disease']

immune_p_star = get_p_star(healthy_data, disease_data, 'immune_neighbors')
fibro_p_star = get_p_star(healthy_data, disease_data, 'fibro_neighbors')


# --- C. PLOT THE "BAD NEIGHBORHOOD" EFFECT ---
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# Plot 1: Inflammation Environment
sns.boxplot(
    data=adata.obs, x='type', y='immune_neighbors',
    showfliers=False, ax=axes[0],
    palette={'Healthy': HEALTHY_COLOR, 'Disease': DISEASE_COLOR}, 
    order=['Healthy', 'Disease'] # Ensures 'Healthy' is 0 (left) and 'Disease' is 1 (right)
)
axes[0].set_title(f"Inflammation: Immune Neighbors per Cell", 
                   fontsize=14, fontweight='bold')
axes[0].set_ylabel("Count (within 50$\mu$m)")
axes[0].set_xlabel("Condition") 
axes[0].set_xticklabels(['Healthy', 'Disease'])
axes[0].grid(False)
axes[0].set_ylim(Y_AXIS_LIMITS) # Apply fixed scale
# Add annotation for significance at Y=25
add_annotation(axes[0], immune_p_star.split(' ')[0])


# Plot 2: Fibrotic Environment
sns.boxplot(
    data=adata.obs, x='type', y='fibro_neighbors',
    showfliers=False, ax=axes[1],
    palette={'Healthy': HEALTHY_COLOR, 'Disease': DISEASE_COLOR}, # Using confirmed colors
    order=['Healthy', 'Disease'] # Ensures 'Healthy' is 0 (left) and 'Disease' is 1 (right)
)
axes[1].set_title(f"Scarring: Fibroblast Neighbors per Cell", 
                   fontsize=14, fontweight='bold')
axes[1].set_ylabel("Count (within 50$\mu$m)")
axes[1].set_xlabel("Condition")
axes[1].set_xticklabels(['Healthy', 'Disease'])
axes[1].grid(False)
axes[1].set_ylim(Y_AXIS_LIMITS) # Apply fixed scale
# Add annotation for significance at Y=25
add_annotation(axes[1], fibro_p_star.split(' ')[0])


plt.tight_layout()

# --- D. SAVE ---
save_plot("6_Microenvironment_Neighborhood_Stats")
plt.close()
print("--- Microenvironment Analysis Complete ---")

  axes[0].set_ylabel("Count (within 50$\mu$m)")
  axes[1].set_ylabel("Count (within 50$\mu$m)")


Generating Microenvironment Analysis with Star Position Fixed at Y=25...


  axes[0].set_xticklabels(['Healthy', 'Disease'])
  axes[1].set_xticklabels(['Healthy', 'Disease'])


Saved: 6_Microenvironment_Neighborhood_Stats
--- Microenvironment Analysis Complete ---


In [None]:
# %%
# ============================================
# 8. SPATIAL CELL TYPE MAPS (The Architecture)
# ============================================
print("Generating Spatial Cell Type Maps...")

# --- A. Broad Cell Types (CosMx_1) ---
# This shows the major zones: Tubules vs Glomeruli vs Interstitium
fig = sc.pl.spatial(
    adata,
    color='cellType_CosMx_1',
    title="Tissue Architecture: Broad Cell Types",
    spot_size=0.01,
    palette='tab20',  # High contrast for many categories
    return_fig=True,
    show=False
)
save_plot("7_Spatial_Map_Broad_CosMx1")
plt.close()

# --- B. Detailed Cell Types (CosMx_2) ---
# This shows the specific pathology: iPT, Injured TAL, etc.
fig = sc.pl.spatial(
    adata,
    color='cellType_CosMx_2',
    title="Tissue Pathology: Detailed Cell States",
    spot_size=0.01,
    palette='tab20',
    return_fig=True,
    show=False
)
save_plot("7_Spatial_Map_Detailed_CosMx2")
plt.close()

# --- C. Side-by-Side Comparison (Corrected) ---
# Plot Healthy vs Disease side-by-side for the Detailed types
# Fix: Removed palette='set1' to avoid string parsing error
fig = sc.pl.spatial(
    adata,
    color='cellType_CosMx_2',
    # Highlight only the "Bad" cells
    groups=['Injured TAL', 'iPT', 'Fibroblast'],
    title="Pathology Highlight: Fibrotic Drivers",
    spot_size=0.01,
    # palette='set1',  <-- REMOVED THIS LINE
    return_fig=True,
    show=False
)
save_plot("7_Spatial_Map_Pathology_Highlights")
plt.close()

print("Spatial Cell Type plots saved.")


Generating Spatial Cell Type Maps...


Saved: 7_Spatial_Map_Broad_CosMx1
Saved: 7_Spatial_Map_Detailed_CosMx2
Saved: 7_Spatial_Map_Pathology_Highlights
Spatial Cell Type plots saved.


In [None]:
# %%
# ============================================
# 9. BIOLOGICAL PROOF PLOTS (Genes & Protein)
# ============================================
from matplotlib.colors import LinearSegmentedColormap
print("Generating Biological Proof Plots...")
# --- PLOT A: The "Smoking Gun" Genes (Spatial Expression) ---
# Why keep this? It visualizes the actual molecular pathology.
# HAVCR1 = Acute Injury marker (KIM-1)
# SPP1 = Fibrotic Driver (Osteopontin)
# LRP2 = Healthy Tubule marker (Megalin)
# We define a gradient from 'whitesmoke' (very light gray) to 'tomato' (vibrant red)
# 'whitesmoke' is better than 'lightgray' because it looks cleaner on white paper
custom_cmap = LinearSegmentedColormap.from_list(
    "gray_to_tomato", ["whitesmoke", "tomato"])

genes_of_interest = ['LRP2', 'HAVCR1', 'SPP1']
# Filter for genes actually present in your matrix
valid_genes = [g for g in genes_of_interest if g in adata.var_names]

if valid_genes:
    # We use 'plasma' or 'magma' because they look like fluorescent microscopy
    sc.pl.spatial(
        adata,
        color=valid_genes,
        title=[f"Healthy Marker: {valid_genes[0]}",
               f"Injury Marker: {valid_genes[1]}", f"Fibrosis Signal: {valid_genes[2]}"],
        spot_size=0.01,
        cmap=custom_cmap,
        wspace=0.3,
        show=False
    )
    save_plot("8_Biological_Gene_Expression_Maps")
    plt.close()

# --- PLOT B: Multi-Modal Validation (Protein vs RNA) ---
# Why keep this? CosMx gives you protein (Immunofluorescence) AND RNA.
# Overlapping them proves your "Epithelial" clusters are real physical structures.

if 'Mean.CK8.18' in adata.obs.columns:
    # CK8/18 is the Pan-Epithelial protein stain
    sc.pl.spatial(
        adata,
        color=['Mean.CK8.18', 'cellType_CosMx_1'],
        title=[
            "Ground Truth: Protein Stain (CK8/18)", "RNA Cluster Prediction"],
        cmap=custom_cmap,
        palette='tab20',
        spot_size=0.01,
        wspace=0.3,
        show=False
    )
    save_plot("9_MultiModal_Protein_Validation")
    plt.close()

# --- PLOT C: Quantifying the Injury (Violin Plot) ---
# Why keep this? You need to prove "iPT" is distinct from "PT".
# This plot shows that iPT has statistically higher injury signal (HAVCR1).

# Define the "Tubule" types we want to compare
tubule_types = [ct for ct in adata.obs['cellType_CosMx_2'].unique()
                if 'PT' in ct or 'TAL' in ct]
subset_adata = adata[adata.obs['cellType_CosMx_2'].isin(tubule_types)]

if 'HAVCR1_expression' in subset_adata.obs.columns:
    plt.figure(figsize=(10, 6))

    # Order by expression to show the "Health -> Injury" trajectory
    order = subset_adata.obs.groupby('cellType_CosMx_2')[
        'HAVCR1_expression'].median().sort_values().index

    sns.violinplot(
        data=subset_adata.obs,
        x='cellType_CosMx_2',
        y='HAVCR1_expression',
        order=order,
        palette='magma_r'  # Red = High Injury
    )

    plt.title("Quantitative Phenotyping: HAVCR1 Expression by Cell State")
    plt.ylabel("Injury Marker Expression (Log Norm)")
    plt.xlabel("Tubular Cell States")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    save_plot("10_Quantitative_Injury_Phenotyping")
    plt.close()

print("\n--- BIOLOGICAL PROOF ANALYSIS COMPLETE ---")


Generating Biological Proof Plots...


Saved: 8_Biological_Gene_Expression_Maps
Saved: 9_MultiModal_Protein_Validation
Saved: 10_Quantitative_Injury_Phenotyping

--- BIOLOGICAL PROOF ANALYSIS COMPLETE ---


In [5]:
# %%
# ============================================
# 10. CELLULAR LANDSCAPE (UMAP)
# ============================================
print("Generating Plot 11: UMAP of Cell Types...")

# Ensure the cell type column is categorical for proper coloring
CELL_TYPE_KEY = 'cellType_CosMx_1'
if CELL_TYPE_KEY in adata.obs.columns:

    # Create the plot
    fig, ax = plt.subplots(figsize=(10, 10))

    sc.pl.umap(
        adata,
        color=CELL_TYPE_KEY,
        title="Cellular Landscape (UMAP)",
        frameon=True,           # Clean look (no box)
        legend_loc="on data",    # Labels directly on clusters
        legend_fontsize=8,
        legend_fontoutline=2,    # White halo around text for readability
        ax=ax,
        show=False
    )

    # Save
    save_plot("11_UMAP_CellType_Annotated")
    plt.close()

else:
    print(f"Warning: Column '{CELL_TYPE_KEY}' not found. Skipping UMAP.")

print("--- UMAP GENERATION COMPLETE ---")


Generating Plot 11: UMAP of Cell Types...


Saved: 11_UMAP_CellType_Annotated
--- UMAP GENERATION COMPLETE ---


In [None]:
# %%
# ============================================
# 11. INJURY PROFILING BY CELL TYPE (Violin Plot)
# ============================================
print("Generating Plot 12: HAVCR1 Expression by Cell Type...")

# Define the columns
CELL_TYPE_COL = 'cellType_CosMx_2'
MARKER_COL = 'HAVCR1_expression'

if CELL_TYPE_COL in adata.obs.columns and MARKER_COL in adata.obs.columns:

    # Filter for relevant tubule and duct types where injury marker comparison is most meaningful
    # This prevents the plot from being cluttered by irrelevant immune or endothelial cells
    tubule_types = [ct for ct in adata.obs[CELL_TYPE_COL].unique()
                    if 'PT' in ct or 'TAL' in ct or 'DT' in ct or 'CD' in ct]

    # Subset the AnnData object
    subset_adata = adata[adata.obs[CELL_TYPE_COL].isin(tubule_types)].copy()

    plt.figure(figsize=(10, 6))

    # Order by median expression to visually display the Health -> Injury gradient
    order = subset_adata.obs.groupby(CELL_TYPE_COL)[
        MARKER_COL].median().sort_values().index

    sns.violinplot(
        data=subset_adata.obs,
        x=CELL_TYPE_COL,
        y=MARKER_COL,
        order=order,
        # Using a reverse magma palette to show high expression (red) = high injury
        palette='magma_r'
    )

    plt.title("HAVCR1 Injury Marker Expression")
    plt.ylabel("HAVCR1 Expression (Log Norm)")
    plt.xlabel("Tubular and Duct Cells")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    # Save
    save_plot("12_CellType_Injury_Violin")
    plt.close()

else:
    print(
        f"Warning: Columns '{CELL_TYPE_COL}' or '{MARKER_COL}' not found. Skipping plot.")

print("--- Cell Type Injury Profiling Complete ---")


Generating Plot 12: HAVCR1 Expression by Cell Type...


Saved: 12_CellType_Injury_Violin
--- Cell Type Injury Profiling Complete ---


In [None]:
# %%
# ============================================
# 12. FIBROSIS PROFILING BY CELL TYPE (Violin Plot for SPP1)
# ============================================
print("Generating Plot 13: SPP1 Expression by Epithelial Cell Type...")

# Define the columns for the new plot
CELL_TYPE_COL = 'cellType_CosMx_2'
# Assumes SPP1 expression column exists
FIBROSIS_MARKER_COL = 'SPP1_expression'

if CELL_TYPE_COL in adata.obs.columns and FIBROSIS_MARKER_COL in adata.obs.columns:

    # Filter for relevant tubule and duct types (same as the HAVCR1 plot)
    tubule_types = [ct for ct in adata.obs[CELL_TYPE_COL].unique()
                    if 'PT' in ct or 'TAL' in ct or 'DT' in ct or 'CD' in ct]

    # Subset the AnnData object
    subset_adata = adata[adata.obs[CELL_TYPE_COL].isin(tubule_types)].copy()

    plt.figure(figsize=(10, 6))

    # Order by median SPP1 expression to display the Health -> Fibrosis gradient
    order = subset_adata.obs.groupby(CELL_TYPE_COL)[
        FIBROSIS_MARKER_COL].median().sort_values().index

    sns.violinplot(
        data=subset_adata.obs,
        x=CELL_TYPE_COL,
        y=FIBROSIS_MARKER_COL,
        order=order,
        palette='rocket'  # Using a dark red/pink gradient for inflammation/fibrosis
    )

    plt.title("SPP1 Fibrosis Marker Expression by Epithelial Cell State")
    plt.ylabel("SPP1 Expression (Log Norm)")
    plt.xlabel("Tubular and Duct Cell States")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    # Save
    save_plot("13_CellType_SPP1_Violin")
    plt.close()

else:
    print(
        f"Warning: Columns '{CELL_TYPE_COL}' or '{FIBROSIS_MARKER_COL}' not found. Skipping plot.")

print("--- Cell Type Fibrosis Profiling Complete ---")


Generating Plot 13: SPP1 Expression by Epithelial Cell Type...
--- Cell Type Fibrosis Profiling Complete ---


In [None]:
# %%
# ============================================
# 12. HAVCR1 EXPRESSION PROFILING (3 SPLIT VIOLIN PLOTS)
# ============================================
print("Generating Plot 12: HAVCR1 Expression by Cell Type (3 plots)...")

# Define the columns and colors
CELL_TYPE_COL = 'cellType_CosMx_2'
MARKER_COL = 'HAVCR1_expression'
HEALTHY_COLOR = 'grey'
DISEASE_COLOR = 'tomato'
ALL_COLOR = 'dodgerblue'  # Single color for the combined plot


def plot_injury_violin(data, cell_type_col, marker_col, title, filename, color):

    plt.figure(figsize=(10, 6))

    # Order by median expression in the current data subset
    # This ensures the gradient is relevant to the displayed data
    order = data.groupby(cell_type_col)[
        marker_col].median().sort_values().index

    sns.violinplot(
        data=data,
        x=cell_type_col,
        y=marker_col,
        order=order,
        # Using a single specified color
        palette=[color]
    )

    plt.title(title)
    plt.ylabel("HAVCR1 Expression (Log Norm)")
    plt.xlabel("Tubular and Duct Cells")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    save_plot(filename)
    plt.close()


if CELL_TYPE_COL in adata.obs.columns and MARKER_COL in adata.obs.columns:

    # 1. Filter for relevant cell types (PT, TAL, DT, CD)
    tubule_types = [ct for ct in adata.obs[CELL_TYPE_COL].unique()
                    if 'PT' in ct or 'TAL' in ct or 'DT' in ct or 'CD' in ct]

    full_subset_obs = adata.obs[adata.obs[CELL_TYPE_COL].isin(
        tubule_types)].copy()

    # Filter by condition
    healthy_subset_obs = full_subset_obs[full_subset_obs['type'] == 'Healthy'].copy(
    )
    disease_subset_obs = full_subset_obs[full_subset_obs['type'] == 'Disease'].copy(
    )

    # A. Combined Plot (Original Plot, but Single Color)
    plot_injury_violin(
        full_subset_obs,
        CELL_TYPE_COL,
        MARKER_COL,
        "HAVCR1 Expression: All Conditions (Combined)",
        "12_CellType_Injury_Violin_All",
        ALL_COLOR
    )

    # B. New Plot: DISEASE only (Tomato Color)
    plot_injury_violin(
        disease_subset_obs,
        CELL_TYPE_COL,
        MARKER_COL,
        "HAVCR1 Expression: DISEASE Condition Only",
        "12_CellType_Injury_Violin_Disease",
        DISEASE_COLOR
    )

    # C. New Plot: HEALTHY only (Grey Color)
    plot_injury_violin(
        healthy_subset_obs,
        CELL_TYPE_COL,
        MARKER_COL,
        "HAVCR1 Expression: HEALTHY Condition Only",
        "12_CellType_Injury_Violin_Healthy",
        HEALTHY_COLOR
    )

else:
    print(
        f"Warning: Columns '{CELL_TYPE_COL}' or '{MARKER_COL}' not found. Skipping plots.")

print("--- Cell Type Injury Profiling Complete (3 Plots Saved) ---")


Generating Plot 12: HAVCR1 Expression by Cell Type (3 plots)...


The palette list has fewer values (1) than needed (17) and will cycle, which may produce an uninterpretable plot.
  sns.violinplot(


Saved: 12_CellType_Injury_Violin_All


The palette list has fewer values (1) than needed (17) and will cycle, which may produce an uninterpretable plot.
  sns.violinplot(


Saved: 12_CellType_Injury_Violin_Disease


The palette list has fewer values (1) than needed (17) and will cycle, which may produce an uninterpretable plot.
  sns.violinplot(


Saved: 12_CellType_Injury_Violin_Healthy
--- Cell Type Injury Profiling Complete (3 Plots Saved) ---


In [None]:
# %%
# ============================================
# 13. FIBROTIC MICROENVIRONMENT (FME) SCORE BY CELL TYPE
# ============================================
print("Generating Plot 14: FME Score by Epithelial Cell Type...")

# FME Genes from Abedini et al. (Nature Genetics, s41588-024-01802-x)
fme_genes = [
    'CCL2', 'CXCL12', 'MMP7', 'VCAM1', 'LCN2',
    'CSF1', 'IL34',
    'TGFB1', 'SPP1', 'FN1', 'COL1A1'
]

SCORE_NAME = 'FME_Signature_Score'
CELL_TYPE_COL = 'cellType_CosMx_2'

# --- 1. Calculate FME Score ---
if SCORE_NAME not in adata.obs.columns:
    print(f"Calculating {SCORE_NAME}...")

    # Filter for genes actually present in the dataset
    available_fme = [g for g in fme_genes if g in adata.var_names]

    if available_fme:
        # Calculate the score (Scanpy function for gene signature)
        sc.tl.score_genes(adata, gene_list=available_fme,
                          score_name=SCORE_NAME)
        print(f"{SCORE_NAME} calculated.")
    else:
        print("Warning: None of the FME genes were found in the data. Skipping score calculation.")

# --- 2. Plotting ---
if SCORE_NAME in adata.obs.columns and CELL_TYPE_COL in adata.obs.columns:

    # Filter for relevant tubule and duct types
    tubule_types = [ct for ct in adata.obs[CELL_TYPE_COL].unique()
                    if 'PT' in ct or 'TAL' in ct or 'DT' in ct or 'CD' in ct]

    # Subset the AnnData object
    subset_adata = adata[adata.obs[CELL_TYPE_COL].isin(tubule_types)].copy()

    plt.figure(figsize=(10, 6))

    # Order by median FME Score to display the Health -> Fibrosis trajectory
    order = subset_adata.obs.groupby(CELL_TYPE_COL)[
        SCORE_NAME].median().sort_values().index

    sns.violinplot(
        data=subset_adata.obs,
        x=CELL_TYPE_COL,
        y=SCORE_NAME,
        order=order,
        palette='Spectral_r'  # A high-contrast palette for a critical score
    )

    plt.title("14. Fibrotic Microenvironment (FME) Score by Epithelial Cell State")
    plt.ylabel("FME Signature Score")
    plt.xlabel("Tubular and Duct Cell States")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    # Save
    save_plot("14_CellType_FME_Score_Violin")
    plt.close()

else:
    print(
        f"Warning: Score column '{SCORE_NAME}' not found or cell type column is missing. Skipping plot.")

print("--- FME Score Profiling Complete ---")


Generating Plot 14: FME Score by Epithelial Cell Type...
Calculating FME_Signature_Score...
FME_Signature_Score calculated.
Saved: 14_CellType_FME_Score_Violin
--- FME Score Profiling Complete ---


In [None]:
# %%
# ============================================
# 13. MODIFIED FME SCORE BY ALL CELL TYPES (Plot 15)
# ============================================
print("Generating Plot 15: FME Score across ALL Cell Types...")

# FME Genes from Abedini et al. (Nature Genetics)
fme_genes = [
    'CCL2', 'CXCL12', 'MMP7', 'VCAM1', 'LCN2',
    'CSF1', 'IL34',
    'TGFB1', 'SPP1', 'FN1', 'COL1A1'
]

SCORE_NAME = 'FME_Signature_Score'
CELL_TYPE_COL = 'cellType_CosMx_2'

# --- 1. Calculate FME Score (must be calculated on all cells) ---
if SCORE_NAME not in adata.obs.columns:
    print(f"Calculating {SCORE_NAME}...")
    available_fme = [g for g in fme_genes if g in adata.var_names]
    if available_fme:
        # Calculate the score (Scanpy function for gene signature)
        sc.tl.score_genes(adata, gene_list=available_fme,
                          score_name=SCORE_NAME)
        print(f"{SCORE_NAME} calculated.")
    else:
        print("Warning: None of the FME genes were found in the data. Skipping score calculation.")

# --- 2. Plotting (Using ALL cell types) ---
if SCORE_NAME in adata.obs.columns and CELL_TYPE_COL in adata.obs.columns:

    # INCREASE FIGURE SIZE to accommodate all labels
    plt.figure(figsize=(14, 6))

    # Order by median FME Score to display the full signature gradient
    order = adata.obs.groupby(CELL_TYPE_COL)[
        SCORE_NAME].median().sort_values().index

    sns.violinplot(
        data=adata.obs,  # Use full adata.obs - NO SUBSETTING
        x=CELL_TYPE_COL,
        y=SCORE_NAME,
        order=order,
        palette='Spectral_r'
    )

    plt.title("15. Fibrotic Microenvironment (FME) Score by ALL Cell States")
    plt.ylabel("FME Signature Score")
    plt.xlabel("All Cell States Present")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    # Save
    save_plot("15_AllCellType_FME_Score_Violin")
    plt.close()

else:
    print(
        f"Warning: Score column '{SCORE_NAME}' not found or cell type column is missing. Skipping plot.")

print("--- All Cell Type FME Score Profiling Complete ---")


Generating Plot 15: FME Score across ALL Cell Types...
Saved: 15_AllCellType_FME_Score_Violin
--- All Cell Type FME Score Profiling Complete ---


In [None]:
# %%
print("Fixing x-axis to show ONLY TAL + iPT cell types...")

CELL_TYPE_COL = 'cellType_CosMx_2'
MARKER_COL = 'HAVCR1_expression'
Y_AXIS_LIMITS = (0, 1.5)

full_obs = adata.obs.copy()

# ðŸ”¥ Select only TAL + iPT
selected_types = [
    ct for ct in full_obs[CELL_TYPE_COL].unique()
    if ("TAL" in ct) or ("iPT" in ct)
]

# --- Subset disease ---
disease_obs = full_obs[
    (full_obs["type"] == "Disease") &
    (full_obs[CELL_TYPE_COL].isin(selected_types))
].copy()

# Remove unused categories
if hasattr(disease_obs[CELL_TYPE_COL], "cat"):
    disease_obs[CELL_TYPE_COL] = disease_obs[CELL_TYPE_COL].cat.remove_unused_categories()

# --- Subset healthy ---
healthy_obs = full_obs[
    (full_obs["type"] == "Healthy") &
    (full_obs[CELL_TYPE_COL].isin(selected_types))
].copy()

# Remove unused categories
if hasattr(healthy_obs[CELL_TYPE_COL], "cat"):
    healthy_obs[CELL_TYPE_COL] = healthy_obs[CELL_TYPE_COL].cat.remove_unused_categories()

# Determine x-order
x_order = (
    disease_obs.groupby(CELL_TYPE_COL)[MARKER_COL]
    .median()
    .sort_values()
    .index
)

# ============================
# ðŸš¨ PLOT 1 â€” DISEASE ONLY
# ============================
plt.figure(figsize=(7, 5))
sns.violinplot(
    data=disease_obs,
    x=CELL_TYPE_COL,
    y=MARKER_COL,
    order=x_order,
    palette=["tomato"],
    inner="quartile"
)
plt.title("HAVCR1 Expression in TAL + iPT (Disease)")
plt.ylim(Y_AXIS_LIMITS)
plt.xticks(rotation=45, ha="right")
plt.ylabel("HAVCR1 Expression")
plt.xlabel("Cell Types")

save_plot("TAL_iPT_HAVCR1_Disease_clean")
plt.close()


# ============================
# âšª PLOT 2 â€” HEALTHY ONLY
# ============================
plt.figure(figsize=(7, 5))
sns.violinplot(
    data=healthy_obs,
    x=CELL_TYPE_COL,
    y=MARKER_COL,
    order=x_order,
    palette=["grey"],
    inner="quartile"
)
plt.title("HAVCR1 Expression in TAL + iPT (Healthy)")
plt.ylim(Y_AXIS_LIMITS)
plt.xticks(rotation=45, ha="right")
plt.ylabel("HAVCR1 Expression")
plt.xlabel("Cell Types")

# ðŸ”¥ Hide Y-axis completely
ax = plt.gca()
ax.yaxis.set_visible(False)
ax.spines['left'].set_visible(False)

save_plot("TAL_iPT_HAVCR1_Healthy_clean")
plt.close()

print("Done â€” x-axis now correctly shows ONLY TAL + iPT.")


Fixing x-axis to show ONLY TAL + iPT cell types...


The palette list has fewer values (1) than needed (7) and will cycle, which may produce an uninterpretable plot.
  sns.violinplot(


Saved: TAL_iPT_HAVCR1_Disease_clean


The palette list has fewer values (1) than needed (7) and will cycle, which may produce an uninterpretable plot.
  sns.violinplot(


Saved: TAL_iPT_HAVCR1_Healthy_clean
Done â€” x-axis now correctly shows ONLY TAL + iPT.


In [None]:
# %%
# ============================================
# 14. HAVCR1 CONDITION X CELL TYPE INTERACTION PLOT (FINAL LAYOUT OPTIMIZED)
# ============================================
print("Re-generating Plot 16: HAVCR1 Expression with final width and axis spacing adjustments...")

# Define the columns and parameters
CELL_TYPE_COL = 'cellType_CosMx_2'
MARKER_COL = 'HAVCR1_expression'
CONDITION_COL = 'type'

# Defined Font Sizes for 6x4 presentation format
TITLE_FONTSIZE = 4
LABEL_FONTSIZE = 6
TICK_FONTSIZE = 8
LEGEND_FONTSIZE = 7
LEGEND_TITLE_FONTSIZE = 9
CUSTOM_PALETTE = {'Healthy': '#cccccc', 'Disease': '#b34747'}

if CELL_TYPE_COL in adata.obs.columns and MARKER_COL in adata.obs.columns and CONDITION_COL in adata.obs.columns:

    # 1. Filter for relevant epithelial types (Tubules/Ducts)
    tubule_types = [ct for ct in adata.obs[CELL_TYPE_COL].unique()
                    if 'PT' in ct or 'TAL' in ct or 'DT' in ct or 'CD' in ct]
    subset_adata = adata[adata.obs[CELL_TYPE_COL].isin(tubule_types)].copy()

    # --- AESTHETIC ENHANCEMENT ---
    sns.set_style('whitegrid')
    plt.figure(figsize=(6, 4))  # <-- INCREASED WIDTH

    # Order the cell types by overall median HAVCR1 expression
    order = subset_adata.obs.groupby(CELL_TYPE_COL)[
        MARKER_COL].median().sort_values().index

    sns.violinplot(
        data=subset_adata.obs,
        x=CELL_TYPE_COL,
        y=MARKER_COL,
        hue=CONDITION_COL,
        order=order,
        split=False,
        palette=CUSTOM_PALETTE,
        inner='quartile',
        linewidth=0.8
    )

    # Setting all font sizes explicitly
    plt.title("",
              fontsize=TITLE_FONTSIZE)
    plt.ylabel("HAVCR1 Expression (Log Norm)", fontsize=LABEL_FONTSIZE)
    # plt.xlabel is removed to reduce vertical spacing

    plt.xticks(rotation=45, ha='right', fontsize=TICK_FONTSIZE)
    plt.yticks(fontsize=TICK_FONTSIZE)

    # --- X-AXIS SPACING FIX ---
    # Reduce padding for the X-axis tick labels to pull them closer to the plot
    plt.tick_params(axis='x', pad=-3)

    # Legend fixed location and size
    plt.legend(
        title="Condition",
        loc='lower center',
        bbox_to_anchor=(0.5, 1.05),
        ncol=2,
        fontsize=LEGEND_FONTSIZE,
        title_fontsize=LEGEND_TITLE_FONTSIZE
    )

    # Final layout adjustment (rect bottom adjusted to ensure rotated ticks are visible)
    plt.tight_layout(rect=[0, 0.15, 1, 1])

    # Save
    save_plot("16_HAVCR1_Condition_CellType_Violin_Final")
    plt.close()
    sns.set_style('white')

print("--- Plot 16 Final Layout Optimized and Saved as 16_HAVCR1_Condition_CellType_Violin_Final.png/pdf ---")


Re-generating Plot 16: HAVCR1 Expression with final width and axis spacing adjustments...


Saved: 16_HAVCR1_Condition_CellType_Violin_Final
--- Plot 16 Final Layout Optimized and Saved as 16_HAVCR1_Condition_CellType_Violin_Final.png/pdf ---


In [None]:
# %%
# ============================================
# 15. HAVCR1 CONDITION X ALL CELL TYPE INTERACTION PLOT (FINAL)
# ============================================
print("Generating Plot 17: HAVCR1 Expression split by Condition across ALL Cell Types...")

# Define the columns and parameters
CELL_TYPE_COL = 'cellType_CosMx_2'
MARKER_COL = 'HAVCR1_expression'
CONDITION_COL = 'type'

# Defined Font Sizes for 9x4 presentation format
TITLE_FONTSIZE = 12
LABEL_FONTSIZE = 10
TICK_FONTSIZE = 8
LEGEND_FONTSIZE = 7
LEGEND_TITLE_FONTSIZE = 9
CUSTOM_PALETTE = {'Healthy': '#cccccc',
                  'Disease': '#b34747'}  # Nature-style colors

if CELL_TYPE_COL in adata.obs.columns and MARKER_COL in adata.obs.columns and CONDITION_COL in adata.obs.columns:

    # --- 1. USE FULL DATA: NO FILTERING ---
    # We use the full AnnData observation dataframe.
    full_obs_df = adata.obs.copy()

    # --- AESTHETIC ENHANCEMENT ---
    sns.set_style('whitegrid')
    plt.figure(figsize=(9, 4))  # <-- INCREASED WIDTH for ALL cell types

    # Order the cell types by overall median HAVCR1 expression
    order = full_obs_df.groupby(CELL_TYPE_COL)[
        MARKER_COL].median().sort_values().index

    sns.violinplot(
        data=full_obs_df,  # <-- USING ALL CELL TYPES
        x=CELL_TYPE_COL,
        y=MARKER_COL,
        hue=CONDITION_COL,
        order=order,
        split=False,
        palette=CUSTOM_PALETTE,
        inner='quartile',
        linewidth=0.8
    )

    # Setting all font sizes explicitly
    plt.title("17. HAVCR1 Expression: All Cell States $\\times$ Disease",
              fontsize=TITLE_FONTSIZE)
    plt.ylabel("HAVCR1 Expression (Log Norm)", fontsize=LABEL_FONTSIZE)
    # plt.xlabel is removed

    plt.xticks(rotation=45, ha='right', fontsize=TICK_FONTSIZE)
    plt.yticks(fontsize=TICK_FONTSIZE)

    # X-AXIS SPACING FIX (Reduced padding to pull labels closer)
    plt.tick_params(axis='x', pad=-3)

    # Legend fixed location and size
    plt.legend(
        title="Condition",
        loc='lower center',
        bbox_to_anchor=(0.5, 1.05),
        ncol=2,
        fontsize=LEGEND_FONTSIZE,
        title_fontsize=LEGEND_TITLE_FONTSIZE
    )

    # Final layout adjustment
    plt.tight_layout(rect=[0, 0.15, 1, 1])

    # --- AESTHETIC ENHANCEMENT ---
    sns.despine(left=True, bottom=True)
    plt.grid(axis='y', linestyle='--', alpha=0.6)

    # Save
    save_plot("17_HAVCR1_AllCellType_Condition_Final")
    plt.close()
    sns.set_style('white')

print("--- Plot 17 Final All Cell Type Plot Saved as 17_HAVCR1_AllCellType_Condition_Final.png/pdf ---")


Generating Plot 17: HAVCR1 Expression split by Condition across ALL Cell Types...
Saved: 17_HAVCR1_AllCellType_Condition_Final
--- Plot 17 Final All Cell Type Plot Saved as 17_HAVCR1_AllCellType_Condition_Final.png/pdf ---


In [None]:
# %%
# ============================================
# 16. HAVCR1 CONDITION X ALL CELL TYPE (SPLIT VIOLIN FINAL)
# ============================================
print("Generating Plot 18: HAVCR1 Expression split by Condition across ALL Cell Types (Final Layout)...")

# Define the columns and parameters
CELL_TYPE_COL = 'cellType_CosMx_2'
MARKER_COL = 'HAVCR1_expression'
CONDITION_COL = 'type'

# Defined Font Sizes for 9x4 presentation format
TITLE_FONTSIZE = 12
LABEL_FONTSIZE = 10
TICK_FONTSIZE = 8
LEGEND_FONTSIZE = 7
LEGEND_TITLE_FONTSIZE = 9
# Sophisticated color contrast
CUSTOM_PALETTE = {'Healthy': '#cccccc', 'Disease': '#b34747'}

if CELL_TYPE_COL in adata.obs.columns and MARKER_COL in adata.obs.columns and CONDITION_COL in adata.obs.columns:

    # 1. USE FULL DATA
    full_obs_df = adata.obs.copy()

    # --- AESTHETIC ENHANCEMENT ---
    sns.set_style('white')  # Plain white background for clean split plot
    plt.figure(figsize=(9, 4))

    # Order the cell types by overall median HAVCR1 expression
    order = full_obs_df.groupby(CELL_TYPE_COL)[
        MARKER_COL].median().sort_values().index

    sns.violinplot(
        data=full_obs_df,
        x=CELL_TYPE_COL,
        y=MARKER_COL,
        hue=CONDITION_COL,
        # <-- Disease will be on the top/left half
        hue_order=['Disease', 'Healthy'],
        order=order,  # <-- Keeps X-axis order same
        split=True,  # <-- SPLIT VIOLIN PLOT
        palette=CUSTOM_PALETTE,
        inner='quartile',
        linewidth=0.8
    )

    # Setting all font sizes explicitly
    plt.title("18. HAVCR1 Expression: All Cell States Split by Disease Status",
              fontsize=TITLE_FONTSIZE)
    plt.ylabel("HAVCR1 Expression (Log Norm)", fontsize=LABEL_FONTSIZE)

    plt.xticks(rotation=45, ha='right', fontsize=TICK_FONTSIZE)
    plt.yticks(fontsize=TICK_FONTSIZE)

    # X-AXIS SPACING FIX
    plt.tick_params(axis='x', pad=-3)

    # Legend fixed location and size
    plt.legend(
        title="Condition",
        loc='lower center',
        bbox_to_anchor=(0.5, 1.05),
        ncol=2,
        fontsize=LEGEND_FONTSIZE,
        title_fontsize=LEGEND_TITLE_FONTSIZE
    )

    # Final layout adjustment
    sns.despine(trim=True)  # Remove all spines for a clean look
    plt.tight_layout(rect=[0, 0.15, 1, 1])

    # Save
    save_plot("18_HAVCR1_AllCellType_Split_Final")
    plt.close()

else:
    print(f"Warning: Necessary columns not found. Skipping plot.")

print("--- Plot 18 Final Split Violin Plot Saved as 18_HAVCR1_AllCellType_Split_Final.png/pdf ---")


Generating Plot 18: HAVCR1 Expression split by Condition across ALL Cell Types (Final Layout)...
Saved: 18_HAVCR1_AllCellType_Split_Final
--- Plot 18 Final Split Violin Plot Saved as 18_HAVCR1_AllCellType_Split_Final.png/pdf ---
