In [None]:
!pip install umap-learn

In [15]:
import pandas as pd

# Load the pre-processed synthetic dataset.
# (Ask Vasish or Rolf if a different dataset is needed.)
df = pd.read_csv(filepath_or_buffer='./synthetic_processed.csv')
# Report how many distinct test sets the file contains.
print(df.loc[:, 'test_set_id'].nunique())

10


In [17]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def generate_window_from_df(df):
  """Turn the rows of one test set into a "window" dictionary.

  Args:
    df: DataFrame with `title`, `url` and `task` columns.

  Returns:
    dict with three parallel lists: `titles` (tab titles), `urls` and
    `group_name` (the `task` column, used as the group label).
  """
  column_for_key = {'titles': 'title', 'urls': 'url', 'group_name': 'task'}
  return {key: list(df[column]) for key, column in column_for_key.items()}

# Sentence-embedding model; later cells call model.encode(...) on the tab titles.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# One window per distinct test_set_id, in order of first appearance in the data.
windows = [
  generate_window_from_df(df[df['test_set_id'] == test_set_id])
  for test_set_id in df['test_set_id'].unique()
]

In [None]:
from sklearn.cluster import (
    KMeans, DBSCAN, AgglomerativeClustering, Birch, OPTICS, SpectralClustering
)
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm import tqdm
import umap
import warnings
import time

sns.set_theme()

warnings.filterwarnings("ignore")

# ---------- Setup ----------
# Dimensionality reducers, keyed by the prefix used in the result column names.
reducers = dict(pca=PCA, svd=TruncatedSVD, umap=umap.UMAP)

# Every feature space that gets clustered: the raw embeddings plus each
# reducer at 5 and 15 output components (e.g. 'pca_5', 'umap_15').
dims = ['raw', *(f'{k}_{n}' for k in reducers for n in (5, 15))]

# (column prefix, estimator class, flag). The main loop unpacks the flag as
# `supports_k`; it is True for the estimators that take `n_clusters`.
methods = [
    ('kmeans', KMeans, True),
    ('agglo', AgglomerativeClustering, True),
    ('birch', Birch, True),
    ('spectral', SpectralClustering, True),
    ('optics', OPTICS, False),
    ('dbscan', DBSCAN, False),
]

# Accumulators filled by the main loop. `results` holds one score list per
# '<method>_<dim>' column plus the window labels; `timings` mirrors it with
# elapsed seconds.
results = {'window': []}
timings = {}
for method, _, _ in methods:
    for dim in dims:
        results[f'{method}_{dim}'] = []
        timings[f'{method}_{dim}'] = []

# DBSCAN setting recorded per window, one list per feature space.
dbscan_configs = dict((d, []) for d in dims)

# ---------- Reduction Helper ----------
def safe_reduce(X, reducer_cls, n_components):
    """Project `X` down to `n_components` dimensions, or return None if that is not possible.

    Args:
        X: sequence of samples (anything `len()` and the reducer accept).
        reducer_cls: reducer class exposing `fit_transform`; constructed with
            `n_components` and `random_state=42`. `umap.UMAP` additionally
            gets `n_neighbors` capped at `n_samples - 1`.
        n_components: target dimensionality.

    Returns:
        The reducer's `fit_transform(X)` output, or None when there are too few
        samples (`n_samples <= n_components` or `n_samples <= 2`) or the
        reducer raises (the error is printed, not propagated).
    """
    sample_count = len(X)
    too_few_samples = sample_count <= 2 or sample_count <= n_components
    if too_few_samples:
        return None

    init_kwargs = {'n_components': n_components, 'random_state': 42}
    try:
        if reducer_cls is umap.UMAP:
            # UMAP's neighbourhood size cannot exceed the number of other samples.
            init_kwargs['n_neighbors'] = min(15, sample_count - 1)
        return reducer_cls(**init_kwargs).fit_transform(X)
    except Exception as e:
        print(f"{reducer_cls.__name__} failed (n_components={n_components}): {e}")
        return None

# ---------- Clustering Helpers ----------
def run_kmeans_auto(X, true_labels):
    """Cluster `X` with KMeans, choosing k by silhouette, and score against `true_labels`.

    Tries k = 2 .. min(len(X) - 1, 10) and keeps the labelling with the highest
    silhouette score. The labels of the winning fit are reused instead of
    refitting; with the fixed `random_state=42` a refit would give the same
    result, so this only saves work.

    Args:
        X: feature matrix to cluster.
        true_labels: ground-truth group id per sample.

    Returns:
        Adjusted Rand score of the selected clustering, or `np.nan` if no
        clustering could be produced at all.
    """
    best_score = -1
    best_labels = None
    max_k = min(len(X) - 1, 10)
    for k in range(2, max_k + 1):
        try:
            labels = KMeans(n_clusters=k, n_init='auto', random_state=42).fit_predict(X)
            score = silhouette_score(X, labels)
        except Exception:
            # A single k failing must not abort the search. `Exception` (not a
            # bare except) keeps Ctrl-C / kernel interrupt working.
            continue
        if score > best_score:
            best_score = score
            best_labels = labels

    if best_labels is None:
        # No k could be scored (e.g. too few samples): fall back to k=2, and
        # report NaN rather than raising, like run_fixed_cluster_method does.
        try:
            best_labels = KMeans(n_clusters=2, n_init='auto', random_state=42).fit_predict(X)
        except Exception:
            return np.nan
    return adjusted_rand_score(true_labels, best_labels)

def run_fixed_cluster_method(X, method_cls, true_labels):
    """Cluster `X` with an `n_clusters`-style estimator, choosing k by silhouette.

    Tries k = 2 .. min(len(X) - 1, 10) and keeps the labelling with the highest
    silhouette score. The labels that were actually scored are the ones
    evaluated: estimators are built without a fixed seed here, so refitting a
    stochastic method could produce a different clustering than the one that
    won the selection.

    Args:
        X: feature matrix to cluster.
        method_cls: estimator class accepting `n_clusters` and exposing `fit_predict`.
        true_labels: ground-truth group id per sample.

    Returns:
        Adjusted Rand score of the selected clustering, or `np.nan` if no
        clustering could be produced.
    """
    best_score = -1
    best_labels = None
    max_k = min(len(X) - 1, 10)
    for k in range(2, max_k + 1):
        try:
            labels = method_cls(n_clusters=k).fit_predict(X)
            score = silhouette_score(X, labels)
        except Exception:
            # Skip a k that cannot be fitted or scored. `Exception` (not a bare
            # except) keeps Ctrl-C / kernel interrupt working.
            continue
        if score > best_score:
            best_score = score
            best_labels = labels

    try:
        if best_labels is None:
            # Nothing could be scored: fall back to a plain k=2 fit.
            best_labels = method_cls(n_clusters=2).fit_predict(X)
        return adjusted_rand_score(true_labels, best_labels)
    except Exception:
        return np.nan

def run_dbscan(X, true_labels, label):
    """Run DBSCAN over a small eps grid and return the best Adjusted Rand score.

    Note that eps is selected using `true_labels` directly, whereas the
    k-based helpers select k by silhouette score, so this score is an
    oracle-tuned upper bound rather than a like-for-like comparison.

    Side effect: appends the chosen setting ("eps=<value>", or "skipped" when
    no eps produced more than one distinct label) to the module-level
    `dbscan_configs[label]`.

    Args:
        X: feature matrix to cluster.
        true_labels: ground-truth group id per sample.
        label: feature-space key under which the chosen config is recorded.

    Returns:
        Best Adjusted Rand score over the grid, or `np.nan` if every eps was skipped.
    """
    best_ars = None  # None (not -1) marks "nothing usable yet"
    best_cfg = "skipped"
    for eps in [0.2, 0.3, 0.5]:
        try:
            labels = DBSCAN(eps=eps, min_samples=2, metric='euclidean').fit_predict(X)
            if len(set(labels)) <= 1:
                # A single label (e.g. everything noise) is not a usable clustering.
                continue
            ars = adjusted_rand_score(true_labels, labels)
        except Exception:
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
            continue
        if best_ars is None or ars > best_ars:
            best_ars = ars
            best_cfg = f"eps={eps}"
    dbscan_configs[label].append(best_cfg)
    return np.nan if best_ars is None else best_ars

def run_optics(X, true_labels):
    """Cluster `X` with OPTICS (euclidean metric, otherwise defaults) and score it.

    Args:
        X: feature matrix to cluster.
        true_labels: ground-truth group id per sample.

    Returns:
        Adjusted Rand score of the OPTICS labelling, or `np.nan` when the fit
        fails or produces a single distinct label.
    """
    try:
        labels = OPTICS(metric='euclidean').fit_predict(X)
        if len(set(labels)) <= 1:
            return np.nan
        return adjusted_rand_score(true_labels, labels)
    except Exception:
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        return np.nan

# ---------- Main Loop ----------
# For every window: embed the titles, build each reduced feature space, then
# run every clustering method on every space, recording its Adjusted Rand
# score and wall-clock time.
# tqdm wraps `windows` itself (not the enumerate iterator) so the bar knows the total.
for i, window in enumerate(tqdm(windows)):
    titles = window.get('titles', [])
    groups = window.get('group_name', [])

    # A window needs at least two tabs and two distinct groups to be scored.
    if len(titles) < 2 or len(set(groups)) <= 1:
        print(f"Window {i}: Skipped (not enough data or only one group)")
        continue

    embeddings = model.encode(titles)
    le = LabelEncoder()
    true_labels = le.fit_transform(groups)

    # Keys here must match the entries of `dims`; a value of None means the
    # reduction was not possible for this window.
    reduced_data = {'raw': embeddings}
    for name, reducer in reducers.items():
        for dim in (5, 15):
            key = f'{name}_{dim}'
            reduced_data[key] = safe_reduce(embeddings, reducer, dim)

    results['window'].append(f"Window {i}")

    for method_name, method_cls, supports_k in methods:
        for dim_key, X in reduced_data.items():
            # perf_counter is a monotonic clock intended for measuring intervals.
            start = time.perf_counter()
            if X is None:
                ars = np.nan
                if method_name == 'dbscan':
                    # run_dbscan records one config per call; record a
                    # placeholder here too so dbscan_configs stays aligned
                    # row-for-row with the processed windows.
                    dbscan_configs[dim_key].append('skipped')
            elif method_name == 'kmeans':
                ars = run_kmeans_auto(X, true_labels)
            elif method_name == 'dbscan':
                ars = run_dbscan(X, true_labels, dim_key)
            elif method_name == 'optics':
                ars = run_optics(X, true_labels)
            else:
                ars = run_fixed_cluster_method(X, method_cls, true_labels)
            elapsed = time.perf_counter() - start
            results[f'{method_name}_{dim_key}'].append(ars)
            timings[f'{method_name}_{dim_key}'].append(elapsed)

# ---------- Save to CSV ----------
# One row per processed window: score columns first, then one '<col>_time'
# column per score column, then the DBSCAN setting per feature space.
# NOTE(review): this rebinds `df`, which earlier cells use for the input dataset.
df = pd.DataFrame(results)
for col, times in timings.items():
    df[f'{col}_time'] = times

for dim in dims:
    if dim not in dbscan_configs:
        continue
    # Right-pad with 'skipped' so the list is as long as the frame.
    padded_configs = list(dbscan_configs[dim])
    padded_configs += ['skipped'] * (len(df) - len(padded_configs))
    df[f'dbscan_{dim}_config'] = padded_configs
df.to_csv('ars_scores_full.csv', index=False)
print("\nSaved full ARS comparison to 'ars_scores_full.csv'")

# ---------- Analysis ----------
top_n = 50
# Every '<method>_<dim>' score column, in methods-then-dims order.
method_dims = [f'{m}_{d}' for m, _, _ in methods for d in dims]
# Mean score per column, ignoring windows where the combination produced NaN.
avg_scores = {col: df[col].mean(skipna=True) for col in method_dims}
avg_times = {col: np.mean(timings.get(col, [np.nan])) for col in method_dims}
# Rank best-first. A column that is NaN for every window has a NaN average, and
# NaN compares False against everything, which would leave a plain sort in an
# ill-defined order; the (is-NaN, -score) key pushes NaN averages to the end.
top_methods = sorted(avg_scores.items(), key=lambda x: (np.isnan(x[1]), -x[1]))[:top_n]
top_cols = [name for name, _ in top_methods]
top_avgs = [score for _, score in top_methods]

def get_color(col):
    """Map a result column name to a colour based on the reduction it mentions.

    The first matching substring wins, checked in the order raw, pca, svd,
    umap; names matching none of them map to 'black'.
    """
    palette = (
        ('raw', 'gray'),
        ('pca', 'steelblue'),
        ('svd', 'seagreen'),
        ('umap', 'darkorange'),
    )
    for marker, colour in palette:
        if marker in col:
            return colour
    return 'black'

# Colour for each top-ranked column, in ranking order.
colors = list(map(get_color, top_cols))

# ---------- Summary Table ----------
# One row per (clustering method, feature space) with its mean score and runtime.
summary_data = []
for col in method_dims:
    # Method names contain no underscore, so the first '_' separates method from dim.
    method, dim = col.split('_', 1)
    score = avg_scores.get(col, np.nan)
    avg_time = avg_times.get(col, np.nan)
    param = '-'
    if method == 'dbscan' and len(dbscan_configs[dim]) > 0:
        # Report the setting chosen most often across windows rather than
        # whatever the first window happened to pick. Ties resolve to the
        # alphabetically first config; 'skipped' is shown only if no window
        # produced a usable clustering.
        used_configs = [cfg for cfg in dbscan_configs[dim] if cfg != 'skipped']
        if used_configs:
            param = max(sorted(set(used_configs)), key=used_configs.count)
        else:
            param = 'skipped'
    summary_data.append({
        'Clustering Method': method.capitalize(),
        'Dim Reduction': dim if dim != 'raw' else 'None',
        'Best Params': param,
        'Avg ARS': round(score, 4),
        'Avg Time (s)': round(avg_time, 4)
    })

summary_df = pd.DataFrame(summary_data).sort_values(by='Avg ARS', ascending=False).reset_index(drop=True)
print("\nTop Clustering Methods Summary:")
print(summary_df.head(top_n))

# ---------- Plot 1: Avg Runtime ----------
# Bars grouped by clustering method, coloured by feature space, fastest rows first.
sorted_df = summary_df.head(top_n).sort_values(by='Avg Time (s)')
plt.figure(figsize=(14, 6))
sns.barplot(x='Clustering Method', y='Avg Time (s)', hue='Dim Reduction', data=sorted_df)
plt.xticks(rotation=45)
plt.title(f'Average Runtime (Top {top_n} Methods)')
plt.tight_layout()
plt.show()

# ---------- Plot 2: Avg ARS by Method ----------
# Same layout, showing the mean Adjusted Rand score, best rows first.
sorted_df_score = summary_df.head(top_n).sort_values(by='Avg ARS', ascending=False)
plt.figure(figsize=(14, 6))
sns.barplot(x='Clustering Method', y='Avg ARS', hue='Dim Reduction', data=sorted_df_score)
plt.xticks(rotation=45)
plt.title(f'Average ARS (Top {top_n} Methods)')
plt.tight_layout()
plt.show()
