In [None]:
import glob
import logging
from pathlib import Path

import arviz as az
import numpy as np
import pandas as pd
import pymc as pm
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

from src.clustering import preproc_features
from src.config import ConfigManager
from src.database import (
    get_dic_analysis_by_ids,
    get_dic_analysis_ids,
    get_dic_data,
    get_image,
    get_multi_dic_data,
)
from src.preprocessing import apply_dic_filters, spatial_subsample
from src.roi import PolygonROISelector, filter_dataframe

# Interactive matplotlib backend for the notebook, plus the ArviZ plotting style
%matplotlib widget
az.style.use("arviz-darkgrid")

# Single seed for the notebook; it is passed to pm.sample() in the sampling cell.
# NOTE(review): `rng` is not referenced in the visible cells - confirm it is still needed.
RANDOM_SEED = 8927
rng = np.random.default_rng(RANDOM_SEED)

# INFO-level logging with timestamps (the filtering pipeline reports through it)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Load configuration
config = ConfigManager()
db_engine = create_engine(config.db_url)

# Parameters: the DIC analyses are looked up by reference date and camera
target_date = "2024-08-23"
camera_name = "PPCX_Tele"

# NOTE(review): `dic_id` is not referenced in the visible cells; the analyses
# actually used come from the database query on target_date/camera_name.
dic_id = 629

# Root folder for saved figures (a per-camera sub-folder is created later)
base_output_dir = "output"

In [8]:
# === DATA EXTRACTION ===

# Get DIC analysis metadata
dic_ids = get_dic_analysis_ids(
    db_engine, reference_date=target_date, camera_name=camera_name
)
print(f"Found {len(dic_ids)} DIC analyses")

# Fail fast: this check must come before anything indexes into the analysis
# table, otherwise an empty query dies on `.iloc[0]` with an unrelated
# error and this message is never shown.
if len(dic_ids) == 0:
    raise ValueError("No DIC analyses found for the given criteria")

dic_analyses = get_dic_analysis_by_ids(db_engine=db_engine, dic_ids=dic_ids)

# Get master image (taken from the first analysis returned by the query)
master_image_id = dic_analyses["master_image_id"].iloc[0]
img = get_image(master_image_id, camera_name=camera_name)

# Fetch DIC data: one analysis -> single-analysis loader, several -> multi loader
if len(dic_ids) == 1:
    print(f"Using DIC ID: {dic_ids[0]}")
    df = get_dic_data(dic_ids[0])
else:
    df = get_multi_dic_data(dic_ids)

# Apply filters: every threshold is read from the "dic.*" section of the config
dic_filter_options = (
    "filter_outliers",
    "tails_percentile",
    "min_velocity",
    "apply_2d_median",
    "median_window_size",
    "median_threshold_factor",
)
df = apply_dic_filters(
    df, **{option: config.get(f"dic.{option}") for option in dic_filter_options}
)

# Apply ROI filter: keep only the points inside the configured polygon
selector = PolygonROISelector.from_file(config.get("data.roi_path"))
df = filter_dataframe(df, selector.polygon_path, x_col="x", y_col="y")
print(f"Data shape after filtering: {df.shape}")


# Apply subsampling AFTER ROI filtering
SUBSAMPLE_FACTOR = 2  # Take every n-th point
SUBSAMPLE_METHOD = "regular"  # or 'random', 'stratified'
if SUBSAMPLE_FACTOR > 0:
    print(f"Data shape before subsampling: {df.shape}")
    df = df_subsampled = spatial_subsample(
        df, method=SUBSAMPLE_METHOD, n_subsample=SUBSAMPLE_FACTOR
    )
    print(f"Data shape after subsampling: {df.shape}")

# === FEATURE PREPARATION ===

# Names of the columns used as clustering features (from config)
variables_names = config.get("clustering.variables_names")
print(f"Using features: {variables_names}")

# Derive the feature columns, then pull the selected ones into a 2-D array
df_features = preproc_features(df)
X = df_features[variables_names].values
ndata, n_features = X.shape

# Standardize features (the fitted scaler is reused later to map the model
# parameters back to the original units)
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)
print(f"Feature matrix shape: {X_scaled.shape}")

# Per-camera output directory and common file-name stem for saved figures
output_dir = Path(base_output_dir, camera_name)
output_dir.mkdir(parents=True, exist_ok=True)
base_name = f"{camera_name}_{target_date}_PyMC_GMM"

Found 1 DIC analyses


2025-08-28 11:37:03,238 - INFO - Starting DIC filtering pipeline with 5251 points
2025-08-28 11:37:03,243 - INFO - Percentile filtering: 5251 -> 5147 points (removed 104 outliers)
2025-08-28 11:37:03,244 - INFO - Min velocity filtering: 5147 -> 3835 points (removed 1312 points below 1)
2025-08-28 11:37:03,245 - INFO - Applying 2D median filter: window_size=5, threshold_factor=3.0
2025-08-28 11:37:03,253 - INFO - Estimated grid spacing: 64.00


Using DIC ID: 629


2025-08-28 11:37:03,430 - INFO - Created 2D grid: (89, 59), 3835 valid points
2025-08-28 11:37:03,434 - INFO - Detected 362 outliers in 2D median filter
2025-08-28 11:37:03,598 - INFO - 2D median filtering: 3835 -> 3473 points (removed 362 outliers)
2025-08-28 11:37:03,599 - INFO - DIC filtering pipeline completed: 5251 -> 3473 points (removed 1778 total)


Filtered 3473 points to 2138 points inside polygon
Data shape after filtering: (2138, 5)
Data shape before subsampling: (2138, 5)
Subsampled from 2138 to 1069 points (50.0%)
Data shape after subsampling: (1069, 5)
Using features: ['V']
Feature matrix shape: (1069, 1)


In [None]:
# === SPATIAL PRIOR SETUP ===
def assign_spatial_priors_dic(df, selectors, prior_strength=0.8):
    """Assign spatial prior probabilities based on polygon sectors.

    Every point starts with a uniform prior over the ``k = len(selectors)``
    clusters. A point inside sector ``idx`` gets ``prior_strength`` for cluster
    ``idx`` and the remaining mass split evenly over the other clusters.
    If sectors overlap, the last selector containing a point determines its row.

    Parameters
    ----------
    df : DataFrame with ``"x"`` and ``"y"`` columns.
    selectors : sequence of objects exposing ``contains_points(x, y)`` that
        returns one boolean per point.
    prior_strength : float in [0, 1], prior probability of the sector's own
        cluster for points inside that sector.

    Returns
    -------
    numpy.ndarray of shape ``(len(df), k)``; each row sums to 1.

    Raises
    ------
    ValueError
        If ``prior_strength`` is outside [0, 1] (it would yield invalid
        probabilities).
    """
    if not 0.0 <= prior_strength <= 1.0:
        raise ValueError(
            f"prior_strength must be within [0, 1], got {prior_strength}"
        )

    ndata = len(df)
    k = len(selectors)
    if k == 0:
        # No sectors: nothing to assign (keeps the (ndata, 0) result shape)
        return np.empty((ndata, 0))

    prior_probs = np.full((ndata, k), 1.0 / k)  # default uniform

    # With a single cluster there are no "other" clusters to share the
    # remaining mass (and k - 1 == 0), so the only valid row is [1.0].
    single_cluster = k == 1
    off_sector_prob = 0.0 if single_cluster else (1 - prior_strength) / (k - 1)
    in_sector_prob = 1.0 if single_cluster else prior_strength

    x_coords = df["x"].values
    y_coords = df["y"].values
    for idx, selector in enumerate(selectors):
        mask = np.asarray(selector.contains_points(x_coords, y_coords), dtype=bool)
        prior_probs[mask] = off_sector_prob  # small prob for other clusters
        prior_probs[mask, idx] = in_sector_prob  # high prob for this cluster
        print(f"Sector {idx}: {mask.sum()} points with strong prior")

    return prior_probs


# Load sector polygons for priors
sector_files = sorted(glob.glob("data/sectors_prior/*.json"))
if not sector_files:
    # Without this guard the cell only fails further down (division by zero
    # while sizing the subplot grid), which hides the real cause.
    raise FileNotFoundError(
        "No sector polygon files found matching data/sectors_prior/*.json"
    )
sector_selectors = [PolygonROISelector.from_file(f) for f in sector_files]
k = len(sector_selectors)  # number of clusters = number of sectors
print(f"Found {k} sector polygons for priors")

# Assign priors
PRIOR_STRENGTH = 0.5
prior_probs = assign_spatial_priors_dic(
    df, sector_selectors, prior_strength=PRIOR_STRENGTH
)

# Visualize priors: one panel per cluster on a near-square grid
nrows = int(np.ceil(np.sqrt(k)))
ncols = int(np.ceil(k / nrows))
# squeeze=False always returns a 2-D array of axes, so k == 1 needs no special case
fig, axes = plt.subplots(
    nrows, ncols, figsize=(4 * ncols, 4 * nrows), squeeze=False
)
axes = axes.flatten()
for cluster in range(k):
    ax = axes[cluster]
    ax.imshow(img, alpha=0.3)
    scatter = ax.scatter(
        df["x"],
        df["y"],
        c=prior_probs[:, cluster],
        cmap="Reds",
        s=1,
        alpha=0.7,
        vmin=0,
        vmax=1,
    )
    ax.set_title(f"Prior for Cluster {cluster}")
    ax.xaxis.set_ticks([])
    ax.yaxis.set_ticks([])
    plt.colorbar(scatter, ax=ax)

# The grid can have more cells than clusters (e.g. k=3 on 2x2): hide the extras
for unused_ax in axes[k:]:
    unused_ax.set_visible(False)

plt.show()

In [None]:
# === PYMC MODEL WITH SPATIAL PRIORS ===
with pm.Model(
    coords=dict(
        cluster=range(k),
        feature=range(n_features),
        obs=range(ndata),
    )
) as model_with_priors:
    # Per-cluster, per-feature location and scale (independent features,
    # i.e. a diagonal covariance for each cluster)
    μ = pm.Normal("μ", mu=0, sigma=3, dims=("cluster", "feature"))
    σ = pm.HalfNormal("σ", sigma=2, dims=("cluster", "feature"))

    # Discrete cluster label per observation; each row of prior_probs is that
    # observation's prior over clusters
    z = pm.Categorical("z", p=prior_probs, dims="obs")

    # Likelihood: every observation is drawn from the Normal of its own cluster
    pm.Normal(
        "x_obs",
        mu=μ[z],
        sigma=σ[z],
        observed=X_scaled,
        dims=("obs", "feature"),
    )

print("Model with spatial priors created")
pm.model_to_graphviz(model_with_priors)

In [None]:
# === SAMPLE FROM MODEL ===
print("Starting MCMC sampling...")
with model_with_priors:
    # 4 chains x (1000 tuning + 1000 kept draws), seeded for repeatability
    idata_with_priors = pm.sample(
        draws=1000,
        tune=1000,
        chains=4,
        target_accept=0.9,
        random_seed=RANDOM_SEED,
    )

print("Sampling completed!")

In [None]:
# Convergence diagnostics for every variable in the trace:
#   R-hat (should be < 1.01), then effective sample size (should be > 100)
print(az.rhat(idata_with_priors), az.ess(idata_with_priors), sep="\n")

In [None]:
# Plot trace plots for the cluster means and standard deviations only
# (the per-observation labels "z" are left out via var_names)
az.plot_trace(idata_with_priors, var_names=["μ", "σ"])

In [None]:
idata = idata_with_priors

# Hard cluster assignment = most frequent label per observation across all
# chains and draws (ties resolve to the smallest label)
z_posterior = idata.posterior["z"]
z_samples = z_posterior.values.reshape(-1, z_posterior.shape[-1])
cluster_pred = np.array(
    [np.bincount(label_draws).argmax() for label_draws in z_samples.T],
    dtype=int,
)

# Posterior means of the model parameters, flattened to 1-D
mu_posterior = idata.posterior["μ"].mean(dim=["chain", "draw"]).to_numpy().ravel()
sigma_posterior = idata.posterior["σ"].mean(dim=["chain", "draw"]).to_numpy().ravel()

In [None]:
# For 1D velocity clustering
from matplotlib.colors import Normalize
from scipy.stats import norm as scipy_norm


def plot_1d_velocity_clustering(df_features, idata, img, scaler=None):
    """Specialized plot for 1D velocity-only clustering.

    Builds a 2x2 figure: spatial map of the cluster labels, per-cluster
    velocity histograms with the fitted Normal densities, a quiver plot of the
    velocity field, and a text panel with per-cluster statistics.

    Parameters
    ----------
    df_features : DataFrame with columns ``"x"``, ``"y"``, ``"u"``, ``"v"`` and
        ``"V"``; rows must be in the same order as the model's observations.
    idata : inference data whose ``posterior`` holds ``"z"`` (labels, last axis
        = observation), ``"μ"`` and ``"σ"`` with ``chain``/``draw`` dims.
    img : background image for the spatial panels, or ``None`` to skip it.
    scaler : optional fitted scaler; when given, the model μ/σ are converted
        back to original units via ``inverse_transform`` / ``scale_[0]``.
        The flattened μ/σ are indexed by cluster label, which is only
        meaningful for a single-feature model.

    Returns
    -------
    (cluster_pred, fig) : the per-observation posterior-mode label (int array)
        and the matplotlib figure.
    """
    # --- Posterior summaries -------------------------------------------------
    # Hard assignment: most frequent label per observation over all samples
    z_posterior = idata.posterior["z"]
    n_obs = z_posterior.shape[-1]
    z_samples = z_posterior.values.reshape(-1, n_obs)

    cluster_pred = np.zeros(n_obs, dtype=int)
    for i in range(n_obs):
        values, counts = np.unique(z_samples[:, i], return_counts=True)
        cluster_pred[i] = values[np.argmax(counts)]

    mu_posterior = idata.posterior["μ"].mean(dim=["chain", "draw"]).values.flatten()
    sigma_posterior = idata.posterior["σ"].mean(dim=["chain", "draw"]).values.flatten()

    # Labels come from np.unique(cluster_pred), so every label has >= 1 point
    unique_labels = np.unique(cluster_pred)

    def _model_params_original_scale(label):
        """Return (mu, sigma) of one cluster in original units."""
        if scaler is None:
            return mu_posterior[label], sigma_posterior[label]
        mu_orig = scaler.inverse_transform([[mu_posterior[label]]])[0, 0]
        sigma_orig = sigma_posterior[label] * scaler.scale_[0]
        return mu_orig, sigma_orig

    # Computed once; used by both the density overlay and the statistics panel
    model_params = {
        label: _model_params_original_scale(label) for label in unique_labels
    }

    # Distinct colors; cycle through the palette so that more clusters than
    # colors cannot raise an IndexError
    palette = ["#E31A1C", "#1F78B4", "#33A02C", "#FF7F00", "#6A3D9A"]
    color_map = {
        label: palette[i % len(palette)] for i, label in enumerate(unique_labels)
    }

    # Create figure with custom layout
    fig, axes = plt.subplots(2, 2, figsize=(8, 12))

    # --- Plot 1: Spatial clusters ---------------------------------------------
    ax1 = axes[0, 0]
    ax1.set_title("Velocity-Based Spatial Clustering", fontsize=14, pad=10)

    if img is not None:
        ax1.imshow(img, alpha=0.3, cmap="gray")

    for label in unique_labels:
        mask = cluster_pred == label
        ax1.scatter(
            df_features.loc[mask, "x"],
            df_features.loc[mask, "y"],
            c=color_map[label],
            s=8,
            alpha=0.8,
            label=f"Velocity Cluster {label}",
            edgecolors="none",
        )
    ax1.legend(loc="upper right", framealpha=0.9, fontsize=10)
    ax1.set_aspect("equal")
    ax1.set_xticks([])
    ax1.set_yticks([])

    # --- Plot 2: Velocity distribution ----------------------------------------
    ax3 = axes[0, 1]
    ax3.set_title("Velocity Distribution by Cluster", fontsize=14, pad=10)

    # Histograms for each cluster (each normalised to unit area)
    velocity = df_features["V"].values
    for label in unique_labels:
        mask = cluster_pred == label
        ax3.hist(
            velocity[mask],
            bins=35,
            alpha=0.7,
            density=True,
            color=color_map[label],
            label=f"Cluster {label}",
            edgecolor="white",
            linewidth=0.5,
        )

    # Overlay the fitted Normal density of each cluster
    v_range = np.linspace(velocity.min(), velocity.max(), 200)
    for label in unique_labels:
        mu_orig, sigma_orig = model_params[label]
        model_dist = scipy_norm.pdf(v_range, mu_orig, sigma_orig)
        ax3.plot(
            v_range,
            model_dist,
            "--",
            color=color_map[label],
            linewidth=2.5,
            alpha=0.9,
            label=f"Model {label}",
        )
    ax3.set_xlabel("Velocity Magnitude", fontsize=12)
    ax3.set_ylabel("Density", fontsize=12)
    ax3.grid(True, alpha=0.3)
    ax3.legend(fontsize=10, framealpha=0.9)

    # --- Plot 3: Velocity field with quiver plot ------------------------------
    ax2 = axes[1, 0]
    ax2.set_title("Velocity Vector Field", fontsize=14, pad=10)

    if img is not None:
        ax2.imshow(img, alpha=0.7, cmap="gray")

    # Arrows coloured by magnitude on a scale starting at 0
    magnitudes = df_features["V"].to_numpy()
    norm = Normalize(vmin=0.0, vmax=np.max(magnitudes))
    q = ax2.quiver(
        df_features["x"].to_numpy(),
        df_features["y"].to_numpy(),
        df_features["u"].to_numpy(),
        df_features["v"].to_numpy(),
        magnitudes,
        scale=None,
        scale_units="xy",
        angles="xy",
        cmap="viridis",
        norm=norm,
        width=0.003,
        headwidth=2.5,
        alpha=1.0,
    )

    # Add colorbar
    cbar = fig.colorbar(q, ax=ax2, shrink=0.8, aspect=20, pad=0.02)
    cbar.set_label("Velocity Magnitude", rotation=270, labelpad=15)
    ax2.set_aspect("equal")
    ax2.set_xticks([])
    ax2.set_yticks([])
    ax2.grid(False)

    # --- Plot 4: Statistics ---------------------------------------------------
    ax4 = axes[1, 1]
    ax4.axis("off")
    stats_text = "VELOCITY CLUSTERING STATISTICS\n" + "=" * 30 + "\n"
    for label in unique_labels:
        mask = cluster_pred == label
        count = mask.sum()

        cluster_velocity = velocity[mask]
        v_mean = cluster_velocity.mean()
        v_std = cluster_velocity.std()
        v_median = np.median(cluster_velocity)
        # Normalised MAD: robust spread estimate (1.4826 scales MAD to sigma
        # for normally distributed data)
        nmad = np.median(np.abs(cluster_velocity - v_median)) * 1.4826

        # Model parameters (in original scale)
        model_mu, model_sigma = model_params[label]

        stats_text += f"VELOCITY CLUSTER {label} (pts: {count})\n"
        stats_text += f"├─ Velocity: {v_mean:.4f} ± {v_std:.4f}\n"
        stats_text += f"├─ Median/NMAD: {v_median:.4f}/{nmad:.4f}\n"
        stats_text += f"├─ Model μ/σ: {model_mu:.4f}/{model_sigma:.4f}\n\n"

    ax4.text(
        0.05,
        0.95,
        stats_text,
        transform=ax4.transAxes,
        fontsize=8,
        verticalalignment="top",
        fontfamily="monospace",
        bbox=dict(
            boxstyle="round,pad=0.4", facecolor="lightblue", alpha=0.8, edgecolor="navy"
        ),
    )

    return cluster_pred, fig


# Render the velocity-only summary figure and write it next to the other outputs
cluster_pred_1d, fig_1d = plot_1d_velocity_clustering(
    df_features=df_features,
    idata=idata_with_priors,
    img=img,
    scaler=scaler,
)
fig_1d.savefig(
    output_dir.joinpath(f"{base_name}_velocity_only_results.png"),
    bbox_inches="tight",
    dpi=300,
)

In [None]:
# plt.close("all")

In [None]:
def plot_nd_velocity_clustering(
    df_features, X_scaled, idata, prior_probs, img, variables_names, scaler=None
):
    """Improved DIC clustering results with comprehensive statistics.

    Builds a 2x2 figure: spatial map of cluster labels, per-point assignment
    entropy, the clusters in feature space (scatter for >1 feature, histograms
    for one feature) and a text panel of per-cluster statistics.

    Parameters
    ----------
    df_features : DataFrame with ``"x"``, ``"y"`` and the feature columns; rows
        must be in the same order as the model's observations.
    X_scaled : 2-D array (observations x features) of the scaled features the
        model was fitted on; columns follow ``variables_names``.
    idata : inference data whose ``posterior`` holds ``"z"`` (labels, last axis
        = observation) and ``"μ"``/``"σ"`` which, after averaging over
        ``chain``/``draw``, are indexed as ``[cluster, feature]``.
    prior_probs : accepted for interface compatibility; not used here.
    img : background image for the spatial panels, or ``None`` to skip it.
    variables_names : list of feature column names used for clustering.
    scaler : optional fitted scaler used to report feature statistics in
        original units via ``inverse_transform``.

    Returns
    -------
    (cluster_pred, z_probs, uncertainty, fig)
        ``cluster_pred``: posterior-mode label per observation;
        ``z_probs``: (observations x clusters) posterior label frequencies;
        ``uncertainty``: per-observation entropy of ``z_probs`` (natural log);
        ``fig``: the matplotlib figure.
    """
    sample_dims = ["chain", "draw"]

    # --- Posterior summaries -------------------------------------------------
    z_posterior = idata.posterior["z"]
    z_samples = z_posterior.values
    n_obs = z_samples.shape[-1]
    z_flat = z_samples.reshape(-1, n_obs)

    # Hard assignment: most frequent label per observation over all samples
    cluster_pred = np.zeros(n_obs, dtype=int)
    for i in range(n_obs):
        values, counts = np.unique(z_flat[:, i], return_counts=True)
        cluster_pred[i] = values[np.argmax(counts)]

    # Posterior means of the model parameters, indexed [cluster, feature]
    mu_posterior = idata.posterior["μ"].mean(dim=sample_dims)
    sigma_posterior = idata.posterior["σ"].mean(dim=sample_dims)

    # Number of mixture components is taken from the fitted model itself
    # rather than from a notebook-level global.
    n_components = mu_posterior.shape[0]

    # Assignment probabilities: fraction of samples in which each point
    # carries each label
    z_probs = np.stack(
        [
            (z_posterior == component).mean(dim=sample_dims).values
            for component in range(n_components)
        ],
        axis=1,
    )

    # Uncertainty (entropy); the small constant avoids log(0)
    uncertainty = -np.sum(z_probs * np.log(z_probs + 1e-10), axis=1)

    # Labels come from np.unique(cluster_pred), so every label has >= 1 point
    unique_labels = np.unique(cluster_pred)

    # Distinct colors; cycle through the palette so that more clusters than
    # colors cannot raise an IndexError
    palette = [
        "#E31A1C",
        "#1F78B4",
        "#33A02C",
        "#FF7F00",
        "#6A3D9A",
        "#B15928",
        "#A6CEE3",
        "#B2DF8A",
        "#FB9A99",
        "#FDBF6F",
    ]
    color_map = {
        label: palette[i % len(palette)] for i, label in enumerate(unique_labels)
    }

    def _scatter_by_cluster(ax, x_col, y_col, **scatter_kwargs):
        """Scatter two df_features columns, one colored series per cluster."""
        for label in unique_labels:
            mask = cluster_pred == label
            ax.scatter(
                df_features.loc[mask, x_col],
                df_features.loc[mask, y_col],
                c=color_map[label],
                label=f"Cluster {label}",
                **scatter_kwargs,
            )

    # Create simplified 2x2 plot
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # --- Plot 1: Cluster assignments with image overlay -----------------------
    ax1 = axes[0, 0]
    if img is not None:
        ax1.imshow(img, alpha=0.3, cmap="gray")

    _scatter_by_cluster(ax1, "x", "y", s=4, alpha=0.8, edgecolors="none")

    ax1.set_title("Cluster Assignments (Spatial)", fontsize=14, fontweight="bold")
    ax1.legend(loc="upper right", framealpha=0.9)
    ax1.set_aspect("equal")
    ax1.grid(False)
    ax1.set_xticks([])
    ax1.set_yticks([])

    # --- Plot 2: Assignment uncertainty ----------------------------------------
    ax2 = axes[0, 1]
    if img is not None:
        ax2.imshow(img, alpha=0.3, cmap="gray")

    scatter = ax2.scatter(
        df_features["x"],
        df_features["y"],
        c=uncertainty,
        cmap="plasma",
        s=4,
        alpha=0.8,
        vmin=0,
        vmax=uncertainty.max(),
    )
    ax2.set_title("Assignment Uncertainty", fontsize=14, fontweight="bold")
    ax2.set_aspect("equal")
    ax2.grid(False)
    ax2.set_xticks([])
    ax2.set_yticks([])
    cbar = plt.colorbar(scatter, ax=ax2, shrink=0.8)
    cbar.set_label("Entropy", rotation=270, labelpad=20)

    # --- Plot 3: Feature space (if multidimensional) ---------------------------
    ax3 = axes[1, 0]
    feature_scatter_style = dict(s=20, alpha=0.7, edgecolors="black", linewidth=0.3)
    if len(variables_names) > 1:
        # NOTE(review): the V/angle view is chosen whenever df_features has
        # both columns, even if they are not the features in variables_names
        # - confirm this is intended.
        if "V" in df_features.columns and "angle_rad" in df_features.columns:
            _scatter_by_cluster(ax3, "V", "angle_rad", **feature_scatter_style)
            ax3.set_xlabel("Velocity Magnitude", fontweight="bold")
            ax3.set_ylabel("Flow Direction (rad)", fontweight="bold")
            ax3.set_title("Clusters in Feature Space", fontsize=14, fontweight="bold")
        else:
            feat1, feat2 = variables_names[0], variables_names[1]
            _scatter_by_cluster(ax3, feat1, feat2, **feature_scatter_style)
            ax3.set_xlabel(feat1, fontweight="bold")
            ax3.set_ylabel(feat2, fontweight="bold")
            ax3.set_title(
                f"Clusters: {feat1} vs {feat2}", fontsize=14, fontweight="bold"
            )
    else:
        # For 1D case, show the feature distribution per cluster
        feat = variables_names[0]
        for label in unique_labels:
            mask = cluster_pred == label
            ax3.hist(
                df_features.loc[mask, feat],
                bins=30,
                alpha=0.7,
                color=color_map[label],
                label=f"Cluster {label}",
                density=True,
            )
        ax3.set_xlabel(feat, fontweight="bold")
        ax3.set_ylabel("Density", fontweight="bold")
        ax3.set_title(f"{feat} Distribution by Cluster", fontsize=14, fontweight="bold")
    ax3.grid(True, alpha=0.3)
    ax3.legend()

    # --- Plot 4: Comprehensive cluster statistics ------------------------------
    ax4 = axes[1, 1]
    ax4.axis("off")

    # Back-transform once for all clusters/features instead of once per
    # (cluster, feature) pair; the transform acts row by row, so masking
    # afterwards gives the same numbers.
    X_original = scaler.inverse_transform(X_scaled) if scaler is not None else None

    stats_text = "CLUSTER STATISTICS\n" + "=" * 50 + "\n\n"

    for label in unique_labels:
        mask = cluster_pred == label
        count = mask.sum()

        # Spatial statistics
        x_mean = df_features.loc[mask, "x"].mean()
        y_mean = df_features.loc[mask, "y"].mean()
        x_std = df_features.loc[mask, "x"].std()
        y_std = df_features.loc[mask, "y"].std()

        # Feature statistics (original scale if scaler provided).
        # Note: the scaler branch uses numpy's std, the other pandas' std.
        feature_stats = {}
        for i, feat_name in enumerate(variables_names):
            if X_original is not None:
                feat_original = X_original[mask, i]
                feature_stats[feat_name] = (feat_original.mean(), feat_original.std())
            else:
                feat_column = df_features.loc[mask, feat_name]
                feature_stats[feat_name] = (feat_column.mean(), feat_column.std())

        # Model parameters (posterior means, in scaled units)
        model_mu = mu_posterior.values[label, :]
        model_sigma = sigma_posterior.values[label, :]

        # Uncertainty statistics: mean entropy and mean probability of the
        # assigned label over the cluster's points
        avg_uncertainty = uncertainty[mask].mean()
        max_prob = z_probs[mask, label].mean()

        stats_text += f"CLUSTER {label} ({count} points)\n"
        stats_text += f"├─ Spatial Center: ({x_mean:.1f}, {y_mean:.1f})\n"
        stats_text += f"├─ Spatial Spread: (σx={x_std:.1f}, σy={y_std:.1f})\n"

        for feat_name, (mean_val, std_val) in feature_stats.items():
            if feat_name == "V":  # Velocity
                stats_text += f"├─ Velocity: {mean_val:.3f} ± {std_val:.3f}\n"
            elif feat_name == "angle_rad":  # Angle
                stats_text += f"├─ Direction: {mean_val:.2f} ± {std_val:.2f} rad\n"
            else:
                stats_text += f"├─ {feat_name}: {mean_val:.3f} ± {std_val:.3f}\n"

        stats_text += f"├─ Model μ: [{', '.join([f'{x:.2f}' for x in model_mu])}]\n"
        stats_text += f"├─ Model σ: [{', '.join([f'{x:.2f}' for x in model_sigma])}]\n"
        stats_text += f"├─ Avg Uncertainty: {avg_uncertainty:.3f}\n"
        stats_text += f"└─ Avg Probability: {max_prob:.3f}\n\n"

    ax4.text(
        0.05,
        0.95,
        stats_text,
        transform=ax4.transAxes,
        fontsize=10,
        verticalalignment="top",
        fontfamily="monospace",
        bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgray", alpha=0.8),
    )

    plt.tight_layout()
    return cluster_pred, z_probs, uncertainty, fig


# Usage: build the full (n-dimensional) summary figure from the fitted model
cluster_pred, z_probs, uncertainty, fig = plot_nd_velocity_clustering(
    df_features=df_features,
    X_scaled=X_scaled,
    idata=idata_with_priors,
    prior_probs=prior_probs,
    img=img,
    variables_names=variables_names,
    scaler=scaler,
)

# Save the main results
fig.savefig(
    output_dir.joinpath(f"{base_name}_improved_results.png"),
    bbox_inches="tight",
    dpi=300,
)
