# 🧭 Phrasely Evaluation & Visualization Notebook
### This notebook summarizes clustering quality, structure, and representative examples for the Phrasely pipeline (embeddings → SVD → UMAP → HDBSCAN).

### Phase 1: Load the data, embed it, reduce its dimensionality, and generate clusters — all on the GPU (if one is available)

In [None]:
import logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import dump, load
from pathlib import Path
import os

from phrasely.data_loading.csv_loader import CSVLoader
from phrasely.embeddings.phrase_embedder import PhraseEmbedder
from phrasely.reduction.svd_reducer import SVDReducer
from phrasely.clustering.hdbscan_clusterer import HDBSCANClusterer
from phrasely.medoids.medoid_selector import MedoidSelector
from phrasely.evaluation import ClusterEvaluator
from phrasely.reduction.visualization_reducer import UMAPReducer

# Configure logging at INFO level with a message-only format (no timestamp
# or level prefix) so pipeline log lines stay compact in the cell output.
logging.basicConfig(level=logging.INFO, format="%(message)s")

In [None]:
# Read the phrase corpus from the MS MARCO CSV and report how many
# phrases were loaded.
csv_path = "../data/msmarco.csv"
loader = CSVLoader(input_path=csv_path)
phrases = loader.load()
n_phrases = len(phrases)
print(f"Loaded {n_phrases} phrases.")

In [None]:
from phrasely.embeddings.phrase_embedder import PhraseEmbedder

# Embed every loaded phrase and show the shape of the resulting array.
# NOTE(review): `dataset_name` presumably keys an on-disk embedding cache --
# confirm against PhraseEmbedder.
embedder = PhraseEmbedder(batch_size=8)
embeddings = embedder.embed(
    phrases,
    dataset_name="msmarco_full",
)
embedding_shape = embeddings.shape
print("Embeddings:", embedding_shape)

In [None]:
# Draw a random subset (without replacement) of at most 100,000 rows so the
# downstream reduction and clustering stay tractable. The size is capped at the
# corpus size because np.random.choice(..., replace=False) raises ValueError
# when asked for more items than the population holds.
sample_size = min(100_000, len(embeddings))
sample_idx = np.random.choice(len(embeddings), size=sample_size, replace=False)

# Keep embeddings and phrases aligned: row i of `embeddings_sample`
# corresponds to `phrases_sample[i]`.
embeddings_sample = embeddings[sample_idx]
phrases_sample = [phrases[i] for i in sample_idx]

In [None]:
from phrasely.reduction.two_stage_reducer import TwoStageReducer

# Settings for the two-stage reduction, gathered in one place so they are
# easy to tweak between runs.
reducer_config = {
    "svd_components": 256,
    "umap_components": 10,
    "n_neighbors": 15,
    "min_dist": 0.0,
    "metric": "cosine",
    "use_gpu": True,
}
reducer = TwoStageReducer(**reducer_config)

# Reduce the sampled embeddings and report the shape of the result.
reduced_two_stage = reducer.reduce(embeddings_sample)
output_shape = reduced_two_stage.shape
print(f"Output shape: {output_shape}")

In [None]:
from phrasely.clustering.hdbscan_clusterer import HDBSCANClusterer
import numpy as np

# Cluster the reduced points; the label -1 is treated as noise below.
clusterer = HDBSCANClusterer(min_cluster_size=10, min_samples=3, use_gpu=True)
labels = clusterer.cluster(reduced_two_stage)

# Tally how many points carry each label.
unique, counts = np.unique(labels, return_counts=True)

# The noise label is not a cluster, so it is excluded from the cluster count
# and reported separately.
if -1 in unique:
    n_clusters = len(unique) - 1
    n_noise = counts[unique == -1][0]
else:
    n_clusters = len(unique)
    n_noise = 0

print(f"Found {n_clusters} clusters, with {n_noise} noise points.")

In [None]:
# Project the already-reduced points down to two dimensions; `points_2d` is
# what the scatter plot in the visualization section draws.
viz_reducer = UMAPReducer(
    n_components=2,
    use_gpu=True,
)
points_2d = viz_reducer.reduce(reduced_two_stage)

In [None]:
from phrasely.evaluation.dbcv_score import compute_dbcv
# Score only the points that were assigned to a cluster (label -1 is noise).
mask = labels != -1
clustered_points = reduced_two_stage[mask]
clustered_labels = labels[mask]
score = compute_dbcv(clustered_points, clustered_labels)
# A bare expression on the last line makes the notebook display the value.
score

# 🧭 Phrasely Evaluation & Visualization

## 1. Summary Metrics

In [None]:

from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
from phrasely.medoids.medoid_selector import MedoidSelector
from phrasely.evaluation.dbcv_score import compute_dbcv

# Evaluate only points assigned to a cluster (label -1 is noise).
mask = labels != -1
X_valid, y_valid = reduced_two_stage[mask], labels[mask]

# Silhouette is computed on a random subsample of at most 5000 points. It is
# deliberately best-effort: a failure is reported and the other metrics still
# run.
silhouette = None
try:
    # Cap the subsample at the number of clustered points so that
    # np.random.choice(..., replace=False) cannot fail on small runs.
    silhouette_n = min(5000, len(X_valid))
    # Use a dedicated name: `sample_idx` already maps the sampled rows back to
    # the full corpus and must not be overwritten here.
    silhouette_idx = np.random.choice(len(X_valid), size=silhouette_n, replace=False)
    silhouette = silhouette_score(X_valid[silhouette_idx], y_valid[silhouette_idx])
except Exception as e:
    print(f"Silhouette skipped: {e}")

ch = calinski_harabasz_score(X_valid, y_valid)
db = davies_bouldin_score(X_valid, y_valid)
dbcv = compute_dbcv(X_valid, y_valid)

# Compare against None explicitly: a silhouette of exactly 0.0 is a valid
# score and must not be reported as "N/A".
print(f"Silhouette: {silhouette:.3f}" if silhouette is not None else "Silhouette: N/A")
print(f"Calinskiâ€“Harabasz: {ch:.1f}")
print(f"Daviesâ€“Bouldin: {db:.3f}")
print(f"DBCV: {dbcv:.3f}")


## 2. UMAP Visualization

In [None]:

import matplotlib.pyplot as plt

# Scatter the 2-D projection, colouring each point by its cluster label
# (noise points, labelled -1, are drawn too).
fig, ax = plt.subplots(figsize=(10, 8))
x_coords = points_2d[:, 0]
y_coords = points_2d[:, 1]
ax.scatter(x_coords, y_coords, c=labels, s=2, cmap="Spectral")
ax.set_title(f"HDBSCAN Clusters (DBCV={dbcv:.2f})")
# The axes of the projection carry no meaning, so hide them.
ax.axis("off")
plt.show()


## 3. Cluster Size Distribution

In [None]:

# Count members per label and drop the noise label (-1) to get the sizes of
# the real clusters.
unique, counts = np.unique(labels, return_counts=True)
is_noise_label = unique == -1
cluster_sizes = counts[~is_noise_label]

# Histogram of cluster sizes.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(cluster_sizes, bins=60, color="gray")
ax.set_title("Cluster Size Distribution")
ax.set_xlabel("Cluster Size")
ax.set_ylabel("Frequency")
plt.show()

# Report totals; the noise count is 0 when no point was labelled -1.
if -1 in unique:
    noise_points = counts[is_noise_label][0]
else:
    noise_points = 0
print(f"Total clusters: {len(cluster_sizes)}")
print(f"Noise points: {noise_points}")


## 4. Medoid Phrase Inspection

In [None]:

# `reduced_two_stage` and `labels` were computed on the sampled subset, so the
# phrase list passed alongside them must be `phrases_sample` (row-aligned with
# that subset), not the full `phrases` corpus. Passing the full corpus would
# pair each row with an unrelated phrase.
selector = MedoidSelector()
medoids = selector.select(phrases_sample, reduced_two_stage, labels)

# Show the first few medoids as a quick sanity check of cluster content.
print("Sample medoid phrases:")
for i, m in enumerate(medoids[:15]):
    print(f"{i:3d}: {m}")


## 5. Cohesion / Separation Diagnostics (Optional)

In [None]:

from sklearn.metrics import pairwise_distances

# Restrict to clustered points (label -1 is noise) and keep only the first
# 5000 of them so the dense distance matrix stays small. The subset is derived
# from `mask` here so this cell does not rely on variables left behind by the
# Summary Metrics cell.
mask = labels != -1
X_sub, y_sub = reduced_two_stage[mask][:5000], labels[mask][:5000]  # sample for speed
D = pairwise_distances(X_sub)

# Cluster ids present in the subset, computed once and reused below.
cluster_ids = np.unique(y_sub)

# intra: mean distance between members of the same cluster, averaged over
#        clusters (the zero self-distances on the diagonal are included).
# inter: mean distance from a cluster's members to every point outside it,
#        averaged over clusters.
intra = np.mean([D[y_sub == c][:, y_sub == c].mean() for c in cluster_ids])
inter = np.mean([D[y_sub == c][:, y_sub != c].mean() for c in cluster_ids])
print(f"Intra-cluster mean distance: {intra:.3f}")
print(f"Inter-cluster mean distance: {inter:.3f}")
print(f"Separation ratio (inter/intra): {inter/intra:.2f}")


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances, pairwise_distances_chunked

# Per-cluster cohesion (mean intra-cluster distance) versus separation
# (distance to the nearest point of any other cluster), for clustered points
# only (label -1 is noise).
mask = labels != -1
X, y = reduced_two_stage[mask], labels[mask]
unique_labels = np.unique(y)

cohesions, separations, sizes = [], [], []

# Distances are computed per cluster and in chunks. A single dense
# pairwise_distances(X) over every clustered point needs O(n^2) memory, which
# is tens of gigabytes for a 100k-point sample.
for c in unique_labels:
    in_cluster = y == c
    n_members = int(in_cluster.sum())
    if n_members < 5:
        continue  # too small for a meaningful average

    members, others = X[in_cluster], X[~in_cluster]
    if len(others) == 0:
        continue  # a single cluster has nothing to be separated from

    # Mean of the full member-by-member distance matrix (diagonal included),
    # accumulated chunk by chunk in float64.
    intra_total = sum(
        chunk.sum(dtype=np.float64) for chunk in pairwise_distances_chunked(members)
    )
    intra = intra_total / (n_members ** 2)

    # Smallest distance from any member to any point outside the cluster.
    inter = min(
        chunk.min() for chunk in pairwise_distances_chunked(members, others)
    )

    cohesions.append(intra)
    separations.append(inter)
    sizes.append(n_members)

if not sizes:
    print("No clusters with at least 5 points to plot.")
else:
    plt.figure(figsize=(8,6))
    # `cmap` only takes effect when `c` is given, so colour (as well as size)
    # encodes the cluster size.
    sc = plt.scatter(
        cohesions, separations, s=np.sqrt(sizes), c=sizes, alpha=0.6, cmap='viridis'
    )
    plt.colorbar(sc, label="Cluster size")
    plt.xlabel("Intra-cluster distance (Cohesion â†“)")
    plt.ylabel("Nearest inter-cluster distance (Separation â†‘)")
    plt.title("Cluster Cohesion vs Separation")
    plt.grid(True, alpha=0.3)
    plt.show()

## 6. Save Results

In [None]:
import pandas as pd
from pathlib import Path

# One row per sampled phrase with its cluster label (-1 = noise).
# `phrases_sample` is used because `labels` covers only the sampled subset.
df = pd.DataFrame({
    "phrase": phrases_sample,
    "label": labels
})

# Create the output directory first: to_parquet does not create missing
# parent directories and would otherwise fail on a fresh checkout.
out_path = Path("data/cluster_results.parquet")
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(out_path, index=False)
print("âœ… Saved cluster results to cluster_results.parquet")

