# Whisker Clustering Analysis

This notebook clusters whisker lines by their length and visualizes them in PCA space with interactive plots.

In [1]:
# OpenBLAS/NumPy compatibility workaround - this MUST run before anything else.
# Every threading backend listed below is capped at a single thread.
import os

os.environ.update(
    dict.fromkeys(
        (
            'OMP_NUM_THREADS',
            'OPENBLAS_NUM_THREADS',
            'MKL_NUM_THREADS',
            'VECLIB_MAXIMUM_THREADS',
            'NUMEXPR_NUM_THREADS',
        ),
        '1',
    )
)

# Import the clustering script
import whisker_clustering as wc
import pandas as pd
import numpy as np

## Alternative: Manual Clustering (if errors persist)

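If the main pipeline ("Run Complete Analysis" below) keeps failing or has to be interrupted, use this alternative approach instead. It reads `CSV_PATH`, `N_CLUSTERS` and `RANDOM_STATE`, so run the Configuration cell first: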

In [None]:
# Alternative approach: manual clustering with quantiles (no KMeans needed).
#
# NOTE: this cell reads CSV_PATH, N_CLUSTERS and RANDOM_STATE, which are
# assigned in the Configuration cell further down - run that cell first.
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load and prepare data
df_alt = wc.prepare_whisker_features(CSV_PATH, verbose=True)

# Quantile-based clustering: equal-population bins over Length.
# duplicates='drop' merges bins whose edges coincide, so the number of
# clusters actually produced can be SMALLER than N_CLUSTERS.
df_alt['Cluster'] = pd.qcut(df_alt['Length'], q=N_CLUSTERS, labels=False, duplicates='drop')

# PCA for visualization (features are standardized first)
features = ['Length', 'Num_Points']
X = df_alt[features].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Use the shared seed from the Configuration cell instead of a magic number.
pca_alt = PCA(n_components=2, random_state=RANDOM_STATE)
X_pca = pca_alt.fit_transform(X_scaled)

df_alt['PC1'] = X_pca[:, 0]
df_alt['PC2'] = X_pca[:, 1]

# Create hover text column-wise rather than with a row-wise apply: a Python
# call per row is very slow on millions of whiskers, and a row view can
# upcast integer columns to float (e.g. "Frame: 12.0").
df_alt['Hover_Text'] = (
    "Frame: " + df_alt['Frame'].astype(str)
    + "<br>Length: " + df_alt['Length'].map("{:.2f}".format) + " px"
    + "<br>Cluster: " + df_alt['Cluster'].astype(str)
    + "<br>Points: " + df_alt['Num_Points'].astype(str)
    + "<br>Row Index: " + df_alt['Row_Index'].astype(str)
)
df_alt['Cluster_Label'] = "Cluster " + df_alt['Cluster'].astype(str)

# Create interactive plot. Hover_Text is the only hover_data column kept,
# so it is what customdata[0] refers to in the hovertemplate below.
fig_alt = px.scatter(
    df_alt, x='PC1', y='PC2', color='Cluster_Label',
    hover_data={'PC1': False, 'PC2': False, 'Cluster_Label': False, 'Hover_Text': True},
    title='Whisker Clustering in PCA Space (Quantile-based)',
    labels={'PC1': 'Principal Component 1', 'PC2': 'Principal Component 2', 'Cluster_Label': 'Cluster'},
    color_discrete_sequence=px.colors.qualitative.Bold
)

fig_alt.update_traces(marker=dict(size=6, line=dict(width=0.5, color='white')), hovertemplate='%{customdata[0]}<extra></extra>')
fig_alt.update_layout(width=900, height=700, hovermode='closest', plot_bgcolor='white', font=dict(size=12))
fig_alt.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray', zeroline=True, zerolinewidth=2, zerolinecolor='gray')
fig_alt.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray', zeroline=True, zerolinewidth=2, zerolinecolor='gray')

fig_alt.show()

# Print cluster info. Iterate over the clusters that actually exist (see the
# duplicates='drop' note above) instead of range(N_CLUSTERS), which would
# report empty clusters with NaN statistics when bins were merged.
n_clusters_found = df_alt['Cluster'].nunique()
print(f"\nQuantile-based Clustering with {n_clusters_found} clusters")
for cluster_id, cluster_data in df_alt.groupby('Cluster'):
    print(f"\nCluster {cluster_id}: {len(cluster_data)} whiskers, Mean Length: {cluster_data['Length'].mean():.2f} px, Range: [{cluster_data['Length'].min():.2f}, {cluster_data['Length'].max():.2f}]")

df = df_alt  # Use this for remaining cells

Loading data from: 1027_lines.csv
Total whiskers: 2331481

Length Statistics:
  Mean: 291.99 pixels
  Std: 128.30 pixels
  Min: 49.61 pixels
  Max: 660.79 pixels

Length Statistics:
  Mean: 291.99 pixels
  Std: 128.30 pixels
  Min: 49.61 pixels
  Max: 660.79 pixels


## Configuration

In [3]:
# --- Analysis settings -------------------------------------------------
RANDOM_STATE = 42            # seed, kept fixed so runs are reproducible
N_CLUSTERS = 5               # how many clusters to form
CSV_PATH = "1027_lines.csv"  # location of the lines CSV file

## Run Complete Analysis

This will:
1. Load whisker data and calculate lengths
2. Cluster whiskers by length using K-means
3. Perform PCA for 2D visualization
4. Create an interactive plot with hover information

In [8]:
# End-to-end run: one call to the clustering module with the configured
# settings. The four returned objects are bound to df, fig, pca and kmeans
# (df is reused by the cells below).
# save_html presumably names the HTML file the figure is written to -
# confirm against whisker_clustering.
df, fig, pca, kmeans = wc.analyze_and_visualize_whiskers(
    CSV_PATH,
    verbose=True,
    random_state=RANDOM_STATE,
    n_clusters=N_CLUSTERS,
    save_html="whisker_clusters_pca.html",
)

Loading data from: 1027_lines.csv
Total whiskers: 2331481


KeyboardInterrupt: 

## View Cluster Summary Statistics

In [None]:
# Per-cluster summary: whisker count and length statistics, plus the number
# of distinct frames each cluster appears in (all rounded to 2 decimals).
cluster_summary = (
    df.groupby('Cluster')
    .agg(
        Count=('Length', 'count'),
        Mean_Length=('Length', 'mean'),
        Std_Length=('Length', 'std'),
        Min_Length=('Length', 'min'),
        Max_Length=('Length', 'max'),
        Unique_Frames=('Frame', 'nunique'),
    )
    .round(2)
)

print("\nCluster Summary:")
display(cluster_summary)

## Explore Specific Clusters

View whiskers from a specific cluster:

In [None]:
# Which cluster to inspect
cluster_id = 0

# Rows belonging to that cluster, restricted to the descriptive columns and
# ordered from shortest to longest whisker
in_cluster = df['Cluster'] == cluster_id
detail_columns = ['Frame', 'Row_Index', 'Length', 'Num_Points']
cluster_whiskers = df.loc[in_cluster, detail_columns].sort_values('Length')

print(f"\nWhiskers in Cluster {cluster_id}:")
print(f"Total: {len(cluster_whiskers)} whiskers")
print("\nFirst 10 whiskers:")
display(cluster_whiskers.head(10))

## Analyze Cluster Distribution Across Frames

In [None]:
# Count whiskers per cluster per frame
import plotly.express as px

cluster_frame_counts = df.groupby(['Frame', 'Cluster']).size().reset_index(name='Count')

# Plotly Express maps a numeric color column to a continuous color scale, in
# which case color_discrete_sequence is ignored. Convert the cluster ids to
# strings so each cluster gets its own discrete color and legend entry.
# The order is captured before the conversion so it stays numeric
# (e.g. 2 before 10) rather than lexicographic.
cluster_order = [str(c) for c in sorted(cluster_frame_counts['Cluster'].unique())]
cluster_frame_counts['Cluster'] = cluster_frame_counts['Cluster'].astype(str)

# Create stacked bar chart
fig_bars = px.bar(
    cluster_frame_counts,
    x='Frame',
    y='Count',
    color='Cluster',
    category_orders={'Cluster': cluster_order},
    title='Whisker Cluster Distribution Across Frames',
    labels={'Count': 'Number of Whiskers', 'Cluster': 'Cluster'},
    color_discrete_sequence=px.colors.qualitative.Bold
)

fig_bars.update_layout(width=1000, height=500, barmode='stack')
fig_bars.show()

## Export Results

In [None]:
# Write the per-whisker table (cluster assignment and PCA coordinates
# included) to CSV, without the index column.
output_csv = "whisker_data_with_clusters.csv"
export_columns = ['Frame', 'Row_Index', 'Length', 'Num_Points', 'Cluster', 'PC1', 'PC2']
df[export_columns].to_csv(output_csv, index=False)
print(f"Data with cluster assignments saved to: {output_csv}")

## Quick Re-run with Different Parameters

Try different numbers of clusters:

In [None]:
# Repeat the pipeline with another cluster count. Results go to df_new and
# fig_new so the earlier df / fig are left untouched; the last two returned
# objects are discarded.
n_clusters_new = 6

df_new, fig_new, _, _ = wc.analyze_and_visualize_whiskers(
    CSV_PATH,
    verbose=True,
    random_state=RANDOM_STATE,
    n_clusters=n_clusters_new,
)