# Clustering Variance Analysis

This section analyzes how well clustering explains variance in PC1 deltas.

In [8]:
import numpy as np
import pandas as pd
import json
import os
from pathlib import Path
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score

In [23]:
# Configuration: Set paths to your data
model_name = "qwen-3-32b"
EMBEDDINGS_DIR = Path(f"/workspace/{model_name}/dynamics/embedding/user_turns")
PC1_FILE = Path(f"/workspace/{model_name}/dynamics/embedding/standardscaler/pc1_deltas.parquet")
CLUSTERING_DIR = Path(f"/workspace/{model_name}/dynamics/embedding/standardscaler/next/kmeans")  # or hdbscan clusters

In [24]:
# Load all embedding shards
def load_all_parquet_files(directory):
    """Load and concatenate all parquet files from directory."""
    parquet_files = sorted(directory.glob("shard-*.parquet"))
    if not parquet_files:
        parquet_files = sorted(directory.glob("*.parquet"))
    
    print(f"Loading {len(parquet_files)} parquet file(s) from {directory}")
    dfs = [pd.read_parquet(f) for f in parquet_files]
    combined = pd.concat(dfs, ignore_index=True)
    print(f"Loaded {len(combined)} total rows")
    return combined

# Load embeddings
embeddings_df = load_all_parquet_files(EMBEDDINGS_DIR)

# Load PC1 deltas
pc1_df = pd.read_parquet(PC1_FILE)
print(f"Loaded {len(pc1_df)} PC1 delta rows")

Loading 2 parquet file(s) from /workspace/qwen-3-32b/dynamics/embedding/user_turns
Loaded 15527 total rows
Loaded 13901 PC1 delta rows


In [25]:
# Join embeddings with PC1 deltas
join_keys = ['short_model', 'short_auditor_model', 'domain', 'persona_id', 'topic_id', 'response_id']
joined_df = embeddings_df.merge(
    pc1_df[join_keys + ['pc1_delta', 'prev_pc1', 'next_pc1']],
    on=join_keys,
    how='left'
)

print(f"Joined: {joined_df['pc1_delta'].notna().sum()}/{len(joined_df)} rows have PC1 delta data")

# Extract PC1 deltas (only rows with data)
pc1_deltas = joined_df['pc1_delta'].dropna()
print(f"Using {len(pc1_deltas)} rows with PC1 delta for analysis")

Joined: 13901/15527 rows have PC1 delta data
Using 13901 rows with PC1 delta for analysis


In [26]:
# Sort by largest absolute pc1_delta
sorted_df = joined_df.dropna(subset=['pc1_delta']).copy()
sorted_df = sorted_df.sort_values('pc1_delta', ascending=False)

print("Top 10 largest PC1 deltas:")
print(sorted_df[['short_model', 'domain', 'persona_id', 'topic_id', 'response_id', 'pc1_delta', 'text']].head(10))
print("\n" + "="*80 + "\n")
print("Bottom 10 smallest PC1 deltas:")
print(sorted_df[['short_model', 'domain', 'persona_id', 'topic_id', 'response_id', 'pc1_delta', 'text']].tail(10))

# Save CSV excluding specified columns
columns_to_exclude = ['source_file', 'created_at', 'run_id', 'embedding']
columns_to_save = [col for col in sorted_df.columns if col not in columns_to_exclude]
output_path = f"./results/{model_name}/pc1_deltas_sorted.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# sorted_df[columns_to_save].to_csv(output_path, index=False)
# print(f"\n\nSaved {len(sorted_df)} rows to {output_path}")
# print(f"Columns saved: {columns_to_save}")

Top 10 largest PC1 deltas:
      short_model      domain  persona_id  topic_id  response_id  pc1_delta  \
7525   qwen-3-32b  philosophy          17         5           12  39.504836   
12429  qwen-3-32b  philosophy          19        19           16  33.677241   
9637   qwen-3-32b     writing           8        18            2  32.635768   
13446  qwen-3-32b     therapy          13        14            4  32.557598   
7744   qwen-3-32b  philosophy          19        12            6  32.157588   
13321  qwen-3-32b     therapy          12         7            6  31.394822   
11563  qwen-3-32b  philosophy          15         8           12  30.484329   
12365  qwen-3-32b  philosophy          19        14            2  29.986081   
11985  qwen-3-32b  philosophy          17         5           18  29.228801   
10530  qwen-3-32b      coding           1         6           20  28.943143   

                                                    text  
7525   Perfect - that's exactly the tonal wh

## Baseline (No Clustering)

In [27]:
# Overall variance (no clustering)

metric = 'next_pc1'

overall = joined_df[metric].dropna()

overall_mean = overall.mean()
overall_std = overall.std()
overall_variance = overall.var()

print(f"Baseline (no clustering):")
print(f"  mean = {overall_mean:.3f}")
print(f"  std = {overall_std:.3f}")
print(f"  variance = {overall_variance:.3f}")
print(f"\nIf within-cluster stds are much smaller than {overall_std:.3f} → clustering helps!")

Baseline (no clustering):
  mean = -5.911
  std = 15.513
  variance = 240.643

If within-cluster stds are much smaller than 15.513 → clustering helps!


## Clustering Variance Analysis

Analyze how well clustering explains variance in PC1 deltas.

In [28]:
def analyze_clustering(clustering_subdir, overall_std, overall_variance, metric='next_pc1'):
    """Analyze variance explained by a single clustering result."""
    # Load metrics
    metrics_path = clustering_subdir / "metrics.json"
    with open(metrics_path, 'r') as f:
        metrics = json.load(f)
    
    clusters = list(metrics['clusters'].values())
    
    # Average within-cluster std (weighted by cluster size)
    metric_std_key = f'{metric}_std'
    
    # Filter out clusters with None or NaN std values
    valid_clusters = [
        c for c in clusters 
        if c.get(metric_std_key) is not None 
        and not np.isnan(c.get(metric_std_key, float('nan')))
    ]
    
    if not valid_clusters:
        print(f"No valid clusters with {metric} data in {clustering_subdir.name}")
        return None
    
    weighted_std = np.average(
        [c[metric_std_key] for c in valid_clusters],
        weights=[c['n_with_metric'] for c in valid_clusters]
    )
    
    # Variance explained
    variance_explained = 1 - (weighted_std**2 / overall_variance)
    
    return {
        'name': clustering_subdir.name,
        'weighted_std': weighted_std,
        'variance_explained': variance_explained,
        'metrics': metrics
    }


## Compare All Clustering Results

In [29]:
# Analyze all clustering results in the directory
if CLUSTERING_DIR.exists():
    clustering_subdirs = sorted([d for d in CLUSTERING_DIR.iterdir() if d.is_dir()])
    
    results = []
    for subdir in clustering_subdirs:
        result = analyze_clustering(subdir, overall_std, overall_variance, metric=metric)
        if result:
            results.append(result)
    
    # Display results sorted by variance explained
    results.sort(key=lambda x: x['variance_explained'], reverse=True)
    
    print(f"\nVariance Explained by Clustering (sorted by best):\n")
    print(f"{'Clustering':<15} {'Weighted Std':<15} {'Variance Explained':<20}")
    print("-" * 50)
    for r in results:
        print(f"{r['name']:<15} {r['weighted_std']:<15.3f} {r['variance_explained']:<20.2%}")
else:
    print(f"Clustering directory not found: {CLUSTERING_DIR}")


Variance Explained by Clustering (sorted by best):

Clustering      Weighted Std    Variance Explained  
--------------------------------------------------
k100            8.665           68.80%              
k50             8.710           68.48%              
