# Issue Clustering

This notebook clusters issue/feedback items from a CSV, evaluates against provided labels, and compares three strategies:

- Agglomerative clustering (cosine, threshold-based, batch)
- Vector-style incremental assignment (simulates vector DB neighbor-join)
- Centroid-style incremental assignment

It uses Gemini embeddings by default (set `GOOGLE_API_KEY`) or any SentenceTransformer model if you change `model_name`. Ensure `issues_raw.csv` exists with columns `title`, `body`, and optional `label`.

Add a `.env` file with `GOOGLE_API_KEY=your_key` if you want Gemini embeddings. To run offline, switch `model_name` to a SentenceTransformer model.

In [1]:
# Install dependencies
# !python3 -m venv venv && source venv/bin/activate && pip install -r requirements.txt

In [2]:
import os
from dotenv import load_dotenv
import pandas as pd
from IPython.display import display
import sklearn
from packaging import version
from google import genai

from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np

load_dotenv()

genai_client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY")) if os.getenv("GOOGLE_API_KEY") else None

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# =============================================================================
# CONFIGURATION - Set this for each repo you analyze
# =============================================================================
REPO_NAME = "plotly/plotly.js"  # e.g., "anthropics/claude-code", "facebook/react"

# Derived paths (auto-generated from REPO_NAME)
DATA_DIR = "data"
EMBEDDINGS_DIR = "embeddings"
RESULTS_DIR = "results"
FORCE_RECOMPUTE = False  # Set True to recompute embeddings even if cached

# Create directories
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(EMBEDDINGS_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

def get_safe_name(repo_name):
    """Convert repo name to filesystem-safe name."""
    return repo_name.replace("/", "__").replace("\\", "__")

def get_csv_path(repo_name):
    """Get CSV path for a repo's issues."""
    return os.path.join(DATA_DIR, f"{get_safe_name(repo_name)}.csv")

def get_embedding_path(repo_name, model_name="gemini-embedding-001"):
    """Get embedding cache path for a repo."""
    model_suffix = model_name.replace("/", "_").replace("-", "_")
    return os.path.join(EMBEDDINGS_DIR, f"{get_safe_name(repo_name)}__{model_suffix}.npy")

def get_results_path(repo_name):
    """Get results CSV path for a repo."""
    return os.path.join(RESULTS_DIR, f"{get_safe_name(repo_name)}_clusters.csv")


def fetch_github_issues(repo_name, state="all", max_issues=500):
    """Fetch issues from a GitHub repo and save to CSV."""
    import requests
    
    github_token = os.getenv("GITHUB_TOKEN")
    if not github_token:
        raise RuntimeError("GITHUB_TOKEN not set in .env")
    
    headers = {
        "Authorization": f"token {github_token}",
        "Accept": "application/vnd.github.v3+json"
    }
    
    issues = []
    page = 1
    per_page = 100
    
    print(f"Fetching issues from {repo_name}...")
    
    while len(issues) < max_issues:
        url = f"https://api.github.com/repos/{repo_name}/issues"
        params = {"state": state, "per_page": per_page, "page": page, "sort": "created", "direction": "desc"}
        
        resp = requests.get(url, headers=headers, params=params)
        if resp.status_code != 200:
            raise RuntimeError(f"GitHub API error {resp.status_code}: {resp.text}")
        
        batch = resp.json()
        if not batch:
            break
        
        for item in batch:
            if "pull_request" in item:
                continue
            issues.append({
                "number": item["number"],
                "title": item["title"],
                "body": item.get("body") or "",
                "state": item["state"],
                "created_at": item["created_at"],
                "labels": ",".join([l["name"] for l in item.get("labels", [])]),
                "url": item["html_url"]
            })
        
        print(f"  Fetched page {page} ({len(issues)} issues so far)")
        if len(batch) < per_page:
            break
        page += 1
    
    df = pd.DataFrame(issues[:max_issues])
    csv_path = get_csv_path(repo_name)
    df.to_csv(csv_path, index=False)
    print(f"Saved {len(df)} issues to {csv_path}")
    return df


def load_or_fetch_issues(repo_name):
    """Load issues from CSV, or fetch from GitHub if not cached."""
    csv_path = get_csv_path(repo_name)
    if os.path.exists(csv_path):
        print(f"Loading cached issues from {csv_path}")
        return pd.read_csv(csv_path)
    else:
        print(f"CSV not found, fetching from GitHub...")
        return fetch_github_issues(repo_name)


def save_clustering_results(df_clusters, repo_name, sim_threshold=0.72):
    """Save clustering results to CSV for comparison across repos."""
    results_path = get_results_path(repo_name)
    
    # Add repo metadata
    df_out = df_clusters.copy()
    df_out.insert(0, "repo", repo_name)
    df_out.insert(1, "sim_threshold", sim_threshold)
    
    df_out.to_csv(results_path, index=False)
    print(f"Saved clustering results to {results_path}")
    return results_path


def load_all_results():
    """Load and combine all clustering results for comparison."""
    import glob
    
    result_files = glob.glob(os.path.join(RESULTS_DIR, "*_clusters.csv"))
    if not result_files:
        print("No results found in results/")
        return None
    
    dfs = []
    for f in result_files:
        df = pd.read_csv(f)
        dfs.append(df)
    
    combined = pd.concat(dfs, ignore_index=True)
    print(f"Loaded results from {len(result_files)} repos ({len(combined)} total issues)")
    return combined


def summarize_clusters(df_clusters, repo_name=None):
    """Print a summary of clustering results."""
    repo_name = repo_name or REPO_NAME
    
    n_issues = len(df_clusters)
    n_clusters = df_clusters["cluster_display"].nunique()
    n_singletons = df_clusters["is_singleton"].sum()
    
    # Cluster size distribution
    cluster_sizes = df_clusters.groupby("cluster_display").size()
    
    print(f"\n{'='*60}")
    print(f"CLUSTERING SUMMARY: {repo_name}")
    print(f"{'='*60}")
    print(f"Total issues:    {n_issues}")
    print(f"Clusters:        {n_clusters}")
    print(f"Singletons:      {n_singletons} ({100*n_singletons/n_issues:.1f}%)")
    print(f"Clustered:       {n_issues - n_singletons} ({100*(n_issues-n_singletons)/n_issues:.1f}%)")
    print(f"\nCluster sizes:   min={cluster_sizes.min()}, max={cluster_sizes.max()}, median={cluster_sizes.median():.0f}")
    print(f"{'='*60}\n")
    
    return {
        "repo": repo_name,
        "n_issues": n_issues,
        "n_clusters": n_clusters,
        "n_singletons": n_singletons,
        "pct_clustered": 100 * (n_issues - n_singletons) / n_issues
    }


# Show current paths
print(f"Repo:       {REPO_NAME}")
print(f"CSV:        {get_csv_path(REPO_NAME)}")
print(f"Embeddings: {get_embedding_path(REPO_NAME)}")
print(f"Results:    {get_results_path(REPO_NAME)}")

Repo:       plotly/plotly.js
CSV:        data/plotly__plotly.js.csv
Embeddings: embeddings/plotly__plotly.js__gemini_embedding_001.npy
Results:    results/plotly__plotly.js_clusters.csv


In [4]:
def cluster_issues(
    issues,                      # list[dict] with keys: title, body (body optional)
    model_name="gemini-embedding-001",
    sim_threshold=0.70,          # higher => fewer, tighter clusters
    min_cluster_size=2,
    truncate_body_chars=1500,    # prevent very long bodies from dominating
    label_singletons_as_minus_one=True,
):
    """Cluster issue dicts by semantic similarity."""
    def _to_text(val):
        if val is None:
            return ""
        if isinstance(val, str):
            return val
        if isinstance(val, float):
            return "" if np.isnan(val) else str(val)
        return str(val)

    texts = []
    for it in issues:
        title = _to_text(it.get("title")).strip()
        body = _to_text(it.get("body")).strip()
        if truncate_body_chars:
            body = body[:truncate_body_chars]
        text = f"{title}\n\n{body}".strip()
        if not text:
            text = "[empty]"
        texts.append(text)

    if not texts:
        return {
            "labels": np.array([], dtype=int),
            "display_labels": np.array([], dtype=int),
            "clusters": {},
            "singletons": [],
            "texts": texts,
        }

    if len(texts) == 1:
        labels = np.array([0], dtype=int)
        display_labels = np.array([-1], dtype=int) if label_singletons_as_minus_one else labels.copy()
        clusters = {0: [0]} if min_cluster_size <= 1 else {}
        singletons = [0]
        return {
            "labels": labels,
            "display_labels": display_labels,
            "clusters": clusters,
            "singletons": singletons,
            "texts": texts,
        }

    if model_name.lower().startswith("gemini"):
        if genai_client is None:
            raise RuntimeError("GOOGLE_API_KEY not set; populate .env or environment")
        
        # Batch embeddings (Gemini has 100 item limit per batch)
        all_embeddings = []
        batch_size = 100
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            print(f"Embedding batch {i//batch_size + 1}/{(len(texts)-1)//batch_size + 1} ({len(batch)} items)...")
            resp = genai_client.models.embed_content(
                model=model_name,
                contents=batch,
                config={"output_dimensionality": 768},
            )
            all_embeddings.extend([e.values for e in resp.embeddings])
        emb = np.asarray(all_embeddings, dtype=np.float32)
        print("Using Gemini Text Embedding")
    else:
        model = SentenceTransformer(model_name)
        emb = model.encode(
            texts,
            batch_size=32,
            show_progress_bar=True,
            normalize_embeddings=True,
        )
        emb = np.asarray(emb, dtype=np.float32)
        print("Using HF Sentence Transformer Embedding")

    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    emb = emb / norms

    dist_threshold = 1.0 - float(sim_threshold)

    kwargs = dict(n_clusters=None, linkage="average", distance_threshold=dist_threshold)
    if version.parse(sklearn.__version__) >= version.parse("1.2"):
        kwargs["metric"] = "cosine"
    else:
        kwargs["affinity"] = "cosine"

    cl = AgglomerativeClustering(**kwargs)
    labels = cl.fit_predict(emb)

    clusters = {}
    for i, lab in enumerate(labels):
        clusters.setdefault(int(lab), []).append(i)

    kept = {k: v for k, v in clusters.items() if len(v) >= min_cluster_size}
    singletons = [v[0] for k, v in clusters.items() if len(v) == 1]
    kept = dict(sorted(kept.items(), key=lambda kv: len(kv[1]), reverse=True))

    display_labels = labels.copy()
    if label_singletons_as_minus_one:
        for i in singletons:
            display_labels[i] = -1

    return {
        "labels": labels,
        "display_labels": display_labels,
        "clusters": kept,
        "singletons": singletons,
        "texts": texts,
    }

def print_clusters(result, issues, max_items_per_cluster=8):
    for cid, idxs in result["clusters"].items():
        print(f"\n=== Cluster {cid}  (n={len(idxs)}) ===")
        for j in idxs[:max_items_per_cluster]:
            t = (issues[j].get("title") or "").strip().replace("\n", " ")
            print(f"- [{j:02d}] {t[:140]}")
    if result["singletons"]:
        print(f"\nSingletons (n={len(result['singletons'])}): {result['singletons']}")

## Tabular cluster view helper
Creates a DataFrame with cluster id, cluster size, issue index, title, and body so you can sort/filter in the notebook UI. Use the raw issues CSV helper if you want to label without clusters.

In [5]:
def clusters_as_dataframe(result, issues):
    """Return a DataFrame with raw/display labels, cluster size, issue index, title, body."""
    rows = []
    cluster_sizes = {cid: len(idxs) for cid, idxs in result.get("clusters", {}).items()}
    display_labels = result.get("display_labels", result.get("labels", []))
    labels = result.get("labels", [])

    def _to_text(val):
        if val is None:
            return ""
        if isinstance(val, str):
            return val
        if isinstance(val, float):
            return "" if np.isnan(val) else str(val)
        return str(val)

    for j, issue in enumerate(issues):
        raw_lab = int(labels[j]) if len(labels) > j else None
        disp_lab = int(display_labels[j]) if len(display_labels) > j else None
        cluster_size = cluster_sizes.get(raw_lab, 1) if raw_lab is not None else 1
        rows.append({
            "cluster_raw": raw_lab,
            "cluster_display": disp_lab,
            "is_singleton": disp_lab == -1,
            "cluster_size": cluster_size,
            "idx": j,
            "title": _to_text(issue.get("title")),
            "body": _to_text(issue.get("body")),
        })

    df = pd.DataFrame(rows)
    if not df.empty:
        df = df.sort_values(["is_singleton", "cluster_size", "cluster_display", "idx"], ascending=[True, False, True, True]).reset_index(drop=True)
    return df


def save_issues_csv(issues, path="issues_raw.csv"):
    """Save the raw (unclustered) issues to CSV for manual labeling."""
    def _to_text(val):
        if val is None:
            return ""
        if isinstance(val, str):
            return val
        if isinstance(val, float):
            return "" if np.isnan(val) else str(val)
        return str(val)

    rows = []
    for idx, it in enumerate(issues):
        rows.append({
            "idx": idx,
            "title": _to_text(it.get("title")).strip(),
            "body": _to_text(it.get("body")).strip(),
        })
    df = pd.DataFrame(rows)
    df.to_csv(path, index=False)
    print(f"Wrote {len(df)} issues to {path}")
    return df

## Example: cluster from CSV
Load labeled issues from `issues_raw.csv`, cluster them, and evaluate against the provided labels.

In [6]:
# Load issues from CSV (or fetch from GitHub if not cached)
df_src = load_or_fetch_issues(REPO_NAME)
issues = df_src[["title", "body"]].to_dict("records")
true_labels = df_src["label"].tolist() if "label" in df_src.columns else None
print(f"Loaded {len(issues)} issues")

# Cluster
SIM_THRESHOLD = 0.72
res = cluster_issues(issues, sim_threshold=SIM_THRESHOLD, min_cluster_size=2, label_singletons_as_minus_one=False)
print_clusters(res, issues)

# Create results dataframe
df_clusters = clusters_as_dataframe(res, issues)
display(df_clusters.head(50))

# Save results for comparison
save_clustering_results(df_clusters, REPO_NAME, sim_threshold=SIM_THRESHOLD)

# Show summary
summarize_clusters(df_clusters, REPO_NAME)

# Evaluate if labels exist
if true_labels is not None:
    from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, homogeneity_completeness_v_measure
    true_enc, _ = pd.factorize(true_labels)
    pred_enc, _ = pd.factorize(res["display_labels"])
    ari = adjusted_rand_score(true_enc, pred_enc)
    ami = adjusted_mutual_info_score(true_enc, pred_enc, average_method="arithmetic")
    h, c, v = homogeneity_completeness_v_measure(true_enc, pred_enc)
    print(f"ARI: {ari:.3f}  |  AMI: {ami:.3f}  |  H: {h:.3f}  C: {c:.3f}  V: {v:.3f}")

Loading cached issues from data/plotly__plotly.js.csv
Loaded 500 issues
Embedding batch 1/5 (100 items)...
Embedding batch 2/5 (100 items)...
Embedding batch 3/5 (100 items)...
Embedding batch 4/5 (100 items)...
Embedding batch 5/5 (100 items)...
Using Gemini Text Embedding

=== Cluster 63  (n=20) ===
- [13] [BUG]: Coupled hoverinfo tooltips are misaligned
- [78] Cannot display a tooltip label showing the two values of the two series together without also merging the label
- [103] When using hovermode "x unified", the tooltip box covers the graph right at the spot the mouse cursor points at.
- [161] show tooltip when hovering over selected points on scatter plot
- [214] Boxplot hoverinfo/hovertext fails to take
- [263] Hover labels do not match x-axis value in scattermode "group"
- [288] x hover label hidden when using `hoversubplots='axis'` and `hovermode='x'`
- [304] Incorrect hover label for missing last value with hovermode `x-unified` and `tickson=boundaries`

=== Cluster 15  (n=1

Unnamed: 0,cluster_raw,cluster_display,is_singleton,cluster_size,idx,title,body
0,63,63,False,20,13,[BUG]: Coupled hoverinfo tooltips are misaligned,### Description\n\nI have setup a [coupled hov...
1,63,63,False,20,78,Cannot display a tooltip label showing the two...,I am attaching this script because it is too l...
2,63,63,False,20,103,"When using hovermode ""x unified"", the tooltip ...","<img width=""882"" height=""430"" alt=""Image"" src=..."
3,63,63,False,20,161,show tooltip when hovering over selected point...,"Hello,\n\nI‚Äôve encountered an issue when using..."
4,63,63,False,20,214,Boxplot hoverinfo/hovertext fails to take,Hello! I'm trying to customize the hover behav...
5,63,63,False,20,263,Hover labels do not match x-axis value in scat...,We have come across an issue with scatter plot...
6,63,63,False,20,288,x hover label hidden when using `hoversubplots...,"Hi,\r\nWhen the `hovermode` is set to `'x'` an..."
7,63,63,False,20,304,Incorrect hover label for missing last value w...,In this [codepen ](https://codepen.io/Djeramon...
8,63,63,False,20,315,Mismatched Tooltip Behavior in Heatmap Charts ...,**Issue Description:**\r\nWhen using a heatmap...
9,63,63,False,20,345,pointNumber passed to Plotly.Fx.Hover() is not...,When attempting to build off of the [Coupled H...


Saved clustering results to results/plotly__plotly.js_clusters.csv

CLUSTERING SUMMARY: plotly/plotly.js
Total issues:    500
Clusters:        182
Singletons:      0 (0.0%)
Clustered:       500 (100.0%)

Cluster sizes:   min=1, max=20, median=2



In [7]:
# Compare agglomerative vs vector-like vs centroid-like on CSV labels
import math
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, homogeneity_completeness_v_measure

def _prep_texts(issues, truncate_body_chars=1500):
    def _to_text(val):
        if val is None:
            return ""
        if isinstance(val, str):
            return val
        if isinstance(val, float):
            return "" if np.isnan(val) else str(val)
        return str(val)
    texts = []
    for it in issues:
        title = _to_text(it.get("title")).strip()
        body = _to_text(it.get("body")).strip()
        if truncate_body_chars:
            body = body[:truncate_body_chars]
        txt = f"{title}\n\n{body}".strip()
        texts.append(txt or "[empty]")
    return texts


def embed_texts(texts, model_name="gemini-embedding-001", repo_name=None, force_recompute=False):
    """Embed texts with caching support."""
    repo_name = repo_name or REPO_NAME
    cache_path = get_embedding_path(repo_name, model_name)
    
    if not force_recompute and not FORCE_RECOMPUTE and os.path.exists(cache_path):
        emb = np.load(cache_path)
        if len(emb) == len(texts):
            print(f"Loaded cached embeddings from {cache_path} ({len(emb)} vectors)")
            return emb
        else:
            print(f"Cache size mismatch ({len(emb)} vs {len(texts)} texts), recomputing...")
    
    if model_name.lower().startswith("gemini"):
        if genai_client is None:
            raise RuntimeError("GOOGLE_API_KEY not set; populate .env or environment")
        print(f"Computing embeddings with {model_name}...")

        # Batch embeddings (Gemini has 100 item limit per batch)
        all_embeddings = []
        batch_size = 100
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            print(f"  Embedding batch {i//batch_size + 1}/{(len(texts)-1)//batch_size + 1} ({len(batch)} items)...")
            resp = genai_client.models.embed_content(
                model=model_name,
                contents=batch,
                config={"output_dimensionality": 768},
            )
            all_embeddings.extend([e.values for e in resp.embeddings])
        emb = np.asarray(all_embeddings, dtype=np.float32)
    else:
        print(f"Computing embeddings with {model_name}...")
        model = SentenceTransformer(model_name)
        emb = model.encode(texts, batch_size=32, show_progress_bar=True, normalize_embeddings=True)
        emb = np.asarray(emb, dtype=np.float32)
    
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    emb = emb / norms
    
    np.save(cache_path, emb)
    print(f"Saved embeddings to {cache_path} ({len(emb)} vectors)")
    
    return emb


def cosine(a, b):
    return float(np.dot(a, b))

def vector_like_cluster(embeddings, threshold=0.72):
    labels = []
    next_cluster = 0
    for i, emb in enumerate(embeddings):
        sims = [cosine(emb, embeddings[j]) for j in range(i)]
        similars = [j for j, s in enumerate(sims) if s >= threshold]
        if similars:
            labels.append(labels[similars[0]])
        else:
            labels.append(next_cluster)
            next_cluster += 1
    return np.array(labels, dtype=int)

def centroid_cluster(embeddings, threshold=0.65):
    centroids = []
    labels = []
    for emb in embeddings:
        if not centroids:
            labels.append(0)
            centroids.append(emb.copy())
            continue
        sims = [cosine(emb, c) for c in centroids]
        best_idx = int(np.argmax(sims))
        if sims[best_idx] >= threshold:
            k = best_idx
            count_k = labels.count(k)
            centroids[k] = (centroids[k] * count_k + emb) / (count_k + 1)
            labels.append(k)
        else:
            labels.append(len(centroids))
            centroids.append(emb.copy())
    return np.array(labels, dtype=int)

def evaluate(true_labels, pred_labels, name):
    true_enc, _ = pd.factorize(pd.Series(true_labels))
    pred_enc, _ = pd.factorize(pd.Series(pred_labels))
    ari = adjusted_rand_score(true_enc, pred_enc)
    ami = adjusted_mutual_info_score(true_enc, pred_enc, average_method="arithmetic")
    h, c, v = homogeneity_completeness_v_measure(true_enc, pred_enc)
    print(f"{name} -> ARI: {ari:.3f} | AMI: {ami:.3f} | H: {h:.3f} C: {c:.3f} V: {v:.3f}")

# Load data (from cache or GitHub)
df_src = load_or_fetch_issues(REPO_NAME)
issues = df_src[["title", "body"]].to_dict("records")
true_labels = df_src["label"].tolist() if "label" in df_src.columns else None
print(f"Loaded {len(issues)} issues")

texts = _prep_texts(issues)
emb = embed_texts(texts, model_name="gemini-embedding-001")

# Agglomerative
res_aggl = cluster_issues(issues, sim_threshold=0.72, min_cluster_size=2, label_singletons_as_minus_one=False)

# Vector-style
vec_labels = vector_like_cluster(emb, threshold=0.72)

# Centroid-style
cent_labels = centroid_cluster(emb, threshold=0.65)

if true_labels is not None:
    evaluate(true_labels, res_aggl["display_labels"], "Agglomerative")
    evaluate(true_labels, vec_labels, "Vector-like")
    evaluate(true_labels, cent_labels, "Centroid-like")
else:
    print("No labels column found; skipping evaluation.")

Loading cached issues from data/plotly__plotly.js.csv
Loaded 500 issues
Computing embeddings with gemini-embedding-001...
  Embedding batch 1/5 (100 items)...
  Embedding batch 2/5 (100 items)...
  Embedding batch 3/5 (100 items)...
  Embedding batch 4/5 (100 items)...
  Embedding batch 5/5 (100 items)...
Saved embeddings to embeddings/plotly__plotly.js__gemini_embedding_001.npy (500 vectors)
Embedding batch 1/5 (100 items)...
Embedding batch 2/5 (100 items)...
Embedding batch 3/5 (100 items)...
Embedding batch 4/5 (100 items)...
Embedding batch 5/5 (100 items)...
Using Gemini Text Embedding
No labels column found; skipping evaluation.


In [8]:
# Hyperparameter sweep for vector/centroid thresholds
from itertools import product

vector_thresholds = [0.68, 0.70, 0.72, 0.74, 0.76, 0.78]
centroid_thresholds = [0.60, 0.63, 0.66, 0.69, 0.72, 0.75]
agg_thresholds = [0.68, 0.70, 0.72, 0.74, 0.76, 0.78]

results = []
true_labels = df_src["label"].tolist() if "label" in df_src.columns else None
if true_labels is None:
    print("No labels; skipping sweep.")
else:
    true_enc, _ = pd.factorize(pd.Series(true_labels))

    # Vector sweeps
    for vt in vector_thresholds:
        vec_labels = vector_like_cluster(emb, threshold=vt)
        pred_enc, _ = pd.factorize(pd.Series(vec_labels))
        ari = adjusted_rand_score(true_enc, pred_enc)
        ami = adjusted_mutual_info_score(true_enc, pred_enc, average_method="arithmetic")
        h, c, v = homogeneity_completeness_v_measure(true_enc, pred_enc)
        results.append({"method": "vector", "threshold": vt, "ari": ari, "ami": ami, "h": h, "c": c, "v": v})

    # Centroid sweeps
    for ct in centroid_thresholds:
        cent_labels = centroid_cluster(emb, threshold=ct)
        pred_enc, _ = pd.factorize(pd.Series(cent_labels))
        ari = adjusted_rand_score(true_enc, pred_enc)
        ami = adjusted_mutual_info_score(true_enc, pred_enc, average_method="arithmetic")
        h, c, v = homogeneity_completeness_v_measure(true_enc, pred_enc)
        results.append({"method": "centroid", "threshold": ct, "ari": ari, "ami": ami, "h": h, "c": c, "v": v})

    # Agglomerative sweeps
    for th in agg_thresholds:
        dist_threshold = 1.0 - float(th)
        kwargs = dict(n_clusters=None, linkage="average", distance_threshold=dist_threshold)
        if version.parse(sklearn.__version__) >= version.parse("1.2"):
            kwargs["metric"] = "cosine"
        else:
            kwargs["affinity"] = "cosine"
        cl = AgglomerativeClustering(**kwargs)
        labels = cl.fit_predict(emb)
        pred_enc, _ = pd.factorize(pd.Series(labels))
        ari = adjusted_rand_score(true_enc, pred_enc)
        ami = adjusted_mutual_info_score(true_enc, pred_enc, average_method="arithmetic")
        h, c, v = homogeneity_completeness_v_measure(true_enc, pred_enc)
        results.append({"method": "agglomerative", "threshold": th, "ari": ari, "ami": ami, "h": h, "c": c, "v": v})

    df_sweep = pd.DataFrame(results).sort_values(["method", "ari"], ascending=[True, False]).reset_index(drop=True)
    display(df_sweep)

    # Show best per method
    best = df_sweep.sort_values(["method", "ari"], ascending=[True, False]).groupby("method").head(1)
    print("Best per method (by ARI):")
    display(best)

No labels; skipping sweep.


## Vector Visualization

Visualize the high-dimensional embeddings in 2D/3D space using UMAP dimensionality reduction. This helps understand how vectors cluster and whether similar items are actually close together in vector space.

In [9]:
import umap
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


def _safe_label(val):
    """Convert label to string, handling NaN and None."""
    if val is None or (isinstance(val, float) and np.isnan(val)):
        return "unlabeled"
    return str(val)


def visualize_embeddings_2d(embeddings, true_labels=None, pred_labels=None, texts=None, titles=None):
    """
    Reduce embeddings to 2D using UMAP and create interactive visualizations.
    
    Args:
        embeddings: numpy array of shape (n_samples, n_features)
        true_labels: ground truth labels (optional)
        pred_labels: predicted cluster labels (optional)
        texts: list of text content for hover display (optional)
        titles: list of titles for each point (optional)
    """
    print("Reducing dimensions with UMAP (2D)...")
    reducer_2d = umap.UMAP(n_components=2, random_state=42, n_neighbors=15, min_dist=0.1)
    embedding_2d = reducer_2d.fit_transform(embeddings)
    
    # Normalize labels to handle NaN
    if true_labels is not None:
        true_labels = [_safe_label(l) for l in true_labels]
    if pred_labels is not None:
        pred_labels = [_safe_label(l) for l in pred_labels]
    
    # Prepare hover text
    hover_texts = []
    for i in range(len(embeddings)):
        hover = f"Index: {i}<br>"
        if titles is not None and i < len(titles):
            title = titles[i]
            # Handle NaN or non-string values
            if title is None or (isinstance(title, float) and np.isnan(title)):
                title = ""
            else:
                title = str(title)[:100]  # truncate long titles
            hover += f"Title: {title}<br>"
        if true_labels is not None:
            hover += f"True Label: {true_labels[i]}<br>"
        if pred_labels is not None:
            hover += f"Predicted: {pred_labels[i]}<br>"
        hover_texts.append(hover)
    
    # Create figure(s)
    if true_labels is not None and pred_labels is not None:
        # Side-by-side comparison
        fig = make_subplots(
            rows=1, cols=2,
            subplot_titles=("Ground Truth Labels", "Predicted Clusters"),
            horizontal_spacing=0.1
        )
        
        # Ground truth plot
        for label in sorted(set(true_labels)):
            mask = [tl == label for tl in true_labels]
            indices = [i for i, m in enumerate(mask) if m]
            fig.add_trace(
                go.Scatter(
                    x=embedding_2d[indices, 0],
                    y=embedding_2d[indices, 1],
                    mode='markers',
                    name=f'True {label}',
                    text=[hover_texts[i] for i in indices],
                    hovertemplate='%{text}<extra></extra>',
                    marker=dict(size=8, opacity=0.7),
                    legendgroup='true'
                ),
                row=1, col=1
            )
        
        # Predicted clusters plot
        for label in sorted(set(pred_labels)):
            mask = [pl == label for pl in pred_labels]
            indices = [i for i, m in enumerate(mask) if m]
            fig.add_trace(
                go.Scatter(
                    x=embedding_2d[indices, 0],
                    y=embedding_2d[indices, 1],
                    mode='markers',
                    name=f'Pred {label}',
                    text=[hover_texts[i] for i in indices],
                    hovertemplate='%{text}<extra></extra>',
                    marker=dict(size=8, opacity=0.7),
                    legendgroup='pred'
                ),
                row=1, col=2
            )
        
        fig.update_xaxes(title_text="UMAP Dimension 1", row=1, col=1)
        fig.update_xaxes(title_text="UMAP Dimension 1", row=1, col=2)
        fig.update_yaxes(title_text="UMAP Dimension 2", row=1, col=1)
        fig.update_yaxes(title_text="UMAP Dimension 2", row=1, col=2)
        
        fig.update_layout(
            height=600,
            width=1400,
            title_text="2D Vector Space Visualization",
            showlegend=True
        )
        
    elif pred_labels is not None:
        # Just predicted labels
        df_plot = pd.DataFrame({
            'x': embedding_2d[:, 0],
            'y': embedding_2d[:, 1],
            'cluster': pred_labels,
            'hover': hover_texts
        })
        
        fig = px.scatter(
            df_plot, x='x', y='y', color='cluster',
            hover_data={'hover': True, 'x': False, 'y': False, 'cluster': False},
            title='2D Vector Space - Predicted Clusters',
            labels={'x': 'UMAP Dimension 1', 'y': 'UMAP Dimension 2'}
        )
        fig.update_traces(marker=dict(size=8, opacity=0.7))
        fig.update_layout(height=600, width=900)
        
    elif true_labels is not None:
        # Just ground truth
        df_plot = pd.DataFrame({
            'x': embedding_2d[:, 0],
            'y': embedding_2d[:, 1],
            'label': true_labels,
            'hover': hover_texts
        })
        
        fig = px.scatter(
            df_plot, x='x', y='y', color='label',
            hover_data={'hover': True, 'x': False, 'y': False, 'label': False},
            title='2D Vector Space - Ground Truth Labels',
            labels={'x': 'UMAP Dimension 1', 'y': 'UMAP Dimension 2'}
        )
        fig.update_traces(marker=dict(size=8, opacity=0.7))
        fig.update_layout(height=600, width=900)
    else:
        # No labels - just show points
        df_plot = pd.DataFrame({
            'x': embedding_2d[:, 0],
            'y': embedding_2d[:, 1],
            'hover': hover_texts
        })
        
        fig = px.scatter(
            df_plot, x='x', y='y',
            hover_data={'hover': True, 'x': False, 'y': False},
            title='2D Vector Space',
            labels={'x': 'UMAP Dimension 1', 'y': 'UMAP Dimension 2'}
        )
        fig.update_traces(marker=dict(size=8, opacity=0.7))
        fig.update_layout(height=600, width=900)
    
    fig.show()
    return embedding_2d


def visualize_embeddings_3d(embeddings, true_labels=None, pred_labels=None, texts=None, titles=None):
    """
    Reduce embeddings to 3D using UMAP and create interactive 3D visualization.
    """
    print("Reducing dimensions with UMAP (3D)...")
    reducer_3d = umap.UMAP(n_components=3, random_state=42, n_neighbors=15, min_dist=0.1)
    embedding_3d = reducer_3d.fit_transform(embeddings)
    
    # Normalize labels to handle NaN
    if true_labels is not None:
        true_labels = [_safe_label(l) for l in true_labels]
    if pred_labels is not None:
        pred_labels = [_safe_label(l) for l in pred_labels]
    
    # Prepare hover text
    hover_texts = []
    for i in range(len(embeddings)):
        hover = f"Index: {i}<br>"
        if titles is not None and i < len(titles):
            title = titles[i]
            # Handle NaN or non-string values
            if title is None or (isinstance(title, float) and np.isnan(title)):
                title = ""
            else:
                title = str(title)[:100]
            hover += f"Title: {title}<br>"
        if true_labels is not None:
            hover += f"True Label: {true_labels[i]}<br>"
        if pred_labels is not None:
            hover += f"Predicted: {pred_labels[i]}<br>"
        hover_texts.append(hover)
    
    # Use predicted labels if available, otherwise true labels
    labels_to_plot = pred_labels if pred_labels is not None else true_labels
    label_name = "Predicted Cluster" if pred_labels is not None else "Ground Truth Label"
    
    if labels_to_plot is not None:
        df_plot = pd.DataFrame({
            'x': embedding_3d[:, 0],
            'y': embedding_3d[:, 1],
            'z': embedding_3d[:, 2],
            'label': labels_to_plot,
            'hover': hover_texts
        })
        
        fig = px.scatter_3d(
            df_plot, x='x', y='y', z='z', color='label',
            hover_data={'hover': True, 'x': False, 'y': False, 'z': False, 'label': False},
            title=f'3D Vector Space - {label_name}',
            labels={'x': 'UMAP Dim 1', 'y': 'UMAP Dim 2', 'z': 'UMAP Dim 3'}
        )
    else:
        df_plot = pd.DataFrame({
            'x': embedding_3d[:, 0],
            'y': embedding_3d[:, 1],
            'z': embedding_3d[:, 2],
            'hover': hover_texts
        })
        
        fig = px.scatter_3d(
            df_plot, x='x', y='y', z='z',
            hover_data={'hover': True, 'x': False, 'y': False, 'z': False},
            title='3D Vector Space',
            labels={'x': 'UMAP Dim 1', 'y': 'UMAP Dim 2', 'z': 'UMAP Dim 3'}
        )
    
    fig.update_traces(marker=dict(size=5, opacity=0.7))
    fig.update_layout(height=700, width=900)
    fig.show()
    return embedding_3d

In [10]:
# Visualize the embeddings in 2D space
# This shows ground truth labels vs predicted clusters side-by-side

titles = [issue.get("title", "") for issue in issues]
embedding_2d = visualize_embeddings_2d(
    emb, 
    true_labels=true_labels,
    pred_labels=res_aggl["display_labels"],
    titles=titles
)

Reducing dimensions with UMAP (2D)...


  warn(


In [11]:
# Visualize in 3D - interactive, can rotate and zoom
# Shows predicted clusters by default

embedding_3d = visualize_embeddings_3d(
    emb,
    true_labels=true_labels,
    pred_labels=res_aggl["display_labels"],
    titles=titles
)

Reducing dimensions with UMAP (3D)...



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



## Vector Similarity Explorer

Explore which items are most similar to each other in vector space. This helps understand what the embeddings consider "similar".

In [12]:
def _safe_str(val, max_len=None):
    """Convert value to string, handling NaN and None."""
    if val is None or (isinstance(val, float) and np.isnan(val)):
        return ""
    s = str(val)
    if max_len:
        s = s[:max_len]
    return s


def find_most_similar(embeddings, query_idx, top_k=10, issues=None):
    """
    Find the most similar items to a given query item.
    
    Args:
        embeddings: numpy array of embeddings
        query_idx: index of the query item
        top_k: number of most similar items to return
        issues: list of issue dicts with 'title' and 'body' (optional)
    
    Returns:
        DataFrame with similarity scores and content
    """
    query_emb = embeddings[query_idx]
    
    # Compute cosine similarities
    similarities = np.dot(embeddings, query_emb)
    
    # Get top-k most similar (excluding the query itself)
    similar_indices = np.argsort(similarities)[::-1]
    similar_indices = [i for i in similar_indices if i != query_idx][:top_k]
    
    results = []
    for idx in similar_indices:
        row = {
            'idx': idx,
            'similarity': float(similarities[idx]),
        }
        if issues is not None:
            row['title'] = _safe_str(issues[idx].get('title', ''), 100)
            row['body'] = _safe_str(issues[idx].get('body', ''), 200)
        results.append(row)
    
    df = pd.DataFrame(results)
    
    # Show query item
    print(f"Query item [{query_idx}]:")
    if issues is not None:
        print(f"  Title: {_safe_str(issues[query_idx].get('title', ''), 100)}")
        print(f"  Body: {_safe_str(issues[query_idx].get('body', ''), 200)}")
    print(f"\nMost similar items:")
    
    return df


def visualize_similarity_heatmap(embeddings, indices=None, labels=None, max_items=50):
    """
    Create a heatmap showing pairwise similarities between items.
    
    Args:
        embeddings: numpy array of embeddings
        indices: specific indices to visualize (optional, defaults to first max_items)
        labels: labels for each item (optional)
        max_items: maximum number of items to show (for performance)
    """
    if indices is None:
        indices = list(range(min(len(embeddings), max_items)))
    
    # Compute similarity matrix for selected indices
    selected_embs = embeddings[indices]
    similarity_matrix = np.dot(selected_embs, selected_embs.T)
    
    # Create labels for axes (handle NaN labels)
    if labels is not None:
        tick_labels = [f"{i} (L{_safe_label(labels[i])})" for i in indices]
    else:
        tick_labels = [str(i) for i in indices]
    
    # Create heatmap
    fig = go.Figure(data=go.Heatmap(
        z=similarity_matrix,
        x=tick_labels,
        y=tick_labels,
        colorscale='RdBu',
        zmid=0.7,  # center the colorscale at typical similarity threshold
        colorbar=dict(title="Cosine Similarity")
    ))
    
    fig.update_layout(
        title=f'Pairwise Similarity Heatmap ({len(indices)} items)',
        xaxis_title='Item Index',
        yaxis_title='Item Index',
        width=800,
        height=800
    )
    
    fig.show()
    return similarity_matrix

In [13]:
# Example: Find items most similar to item 0
df_similar = find_most_similar(emb, query_idx=0, top_k=10, issues=issues)
display(df_similar)

Query item [0]:
  Title: [maintenance] Remove Chrome version 143 pin in `webgl-jasmine-chromeLatest`
  Body: #7690 adds a temporary pin of the Chrome version to 143 for the `webgl-jasmine-chromeLatest` job because the CI was showing many failures with Chrome 144 ([see here](https://app.circleci.com/pipelines

Most similar items:


Unnamed: 0,idx,similarity,title,body
0,314,0.843058,Some WebGL `jasmine` tests start failing with ...,That's the source of problem we (cc: @birkskyu...
1,401,0.798615,Missing Chrome on CircleCI,"Since March 19, 2024 the jasmine tests fail on..."
2,356,0.791632,Temporarily remove Jasmine test `restyle radia...,The Jasmine test `Test relayout on polar subpl...
3,193,0.756954,Fix failing noci tests,"Follow-up to #7343, these are the tests that a..."
4,313,0.722608,`Chrome 128` seems to have problem rendering W...,As discovered in #7126 this might be either a ...
5,397,0.710377,Update release process to specify updating the...,"Whenever we issue a new release of Plotly.js, ..."
6,355,0.698551,Fix Jasmine test `restyle radial axis title`,See #7028 for context.\r\n\r\nWe should diagno...
7,95,0.697095,plotly.js 3.0.2 regression -- Re-introduces [V...,After updating my plotly.js dependency from 3....
8,317,0.694688,[tooling] Replace Karma w/ a modern test runner,Karma got [deprecated](https://github.com/karm...
9,167,0.694293,Plotly-strict - Calls to WebGLRenderingContext...,"In the `plotly-strict.js` dist, there is not a..."


In [14]:
# Visualize pairwise similarities as a heatmap
# This shows which items are similar to each other
# Darker red = more similar, darker blue = less similar

sim_matrix = visualize_similarity_heatmap(
    emb, 
    indices=None,  # Uses first 50 items by default
    labels=true_labels,
    max_items=50
)

## Compare Results Across Repos

Load and compare clustering results from all analyzed repos.

In [15]:
# Load all results and compare across repos
all_results = load_all_results()

if all_results is not None:
    # Summary by repo
    summary = all_results.groupby("repo").agg({
        "idx": "count",
        "cluster_display": "nunique", 
        "is_singleton": "sum",
        "cluster_size": "mean"
    }).rename(columns={
        "idx": "total_issues",
        "cluster_display": "n_clusters",
        "is_singleton": "n_singletons",
        "cluster_size": "avg_cluster_size"
    })
    summary["pct_clustered"] = 100 * (summary["total_issues"] - summary["n_singletons"]) / summary["total_issues"]
    summary = summary.round(1)
    
    print("=" * 70)
    print("COMPARISON ACROSS REPOS")
    print("=" * 70)
    display(summary)
    
    # Show top clusters from each repo
    print("\n" + "=" * 70)
    print("TOP CLUSTERS BY REPO (showing largest cluster from each)")
    print("=" * 70)
    
    for repo in all_results["repo"].unique():
        repo_df = all_results[all_results["repo"] == repo]
        largest_cluster = repo_df[~repo_df["is_singleton"]].groupby("cluster_display").size().idxmax()
        cluster_issues = repo_df[repo_df["cluster_display"] == largest_cluster]
        
        print(f"\n{repo} - Cluster {largest_cluster} ({len(cluster_issues)} issues):")
        for _, row in cluster_issues.head(5).iterrows():
            print(f"  - {row['title'][:80]}")

Loaded results from 4 repos (673 total issues)
COMPARISON ACROSS REPOS


Unnamed: 0_level_0,total_issues,n_clusters,n_singletons,avg_cluster_size,pct_clustered
repo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
altock/soulcaster,87,58,0,1.9,100.0
naga-k/bad-ux-mart,56,29,0,2.8,100.0
naga-k/math_functions_soulcaster,30,16,0,2.7,100.0
plotly/plotly.js,500,182,0,6.2,100.0



TOP CLUSTERS BY REPO (showing largest cluster from each)

altock/soulcaster - Cluster 2 (4 issues):
  - Check coding agent runtimes (E2B vs ECS/Fargate vs local)
  - Run coding agent on AWS Fargate triggered via Vercel API
  - Decide model/provider for coding agent (Gemini via Kilocode)
  - Decide where to run Soulcaster (hosting/runtime)

naga-k/math_functions_soulcaster - Cluster 0 (4 issues):
  - UserManager.delete_user seems to be a no-op
  - Users still exist after calling delete_user
  - Memory leak? Users not being deleted.
  - delete_user method doesn't remove the user from the list

naga-k/bad-ux-mart - Cluster 11 (5 issues):
  - modulo function also has zero division issue
  - Unhandled exception in math_ops
  - App crashes when dividing by zero
  - Need input validation in divide function
  - ZeroDivisionError in math_ops.divide

plotly/plotly.js - Cluster 63 (20 issues):
  - [BUG]: Coupled hoverinfo tooltips are misaligned
  - Cannot display a tooltip label showing the two