# Issue Clustering

This notebook clusters issue/feedback items from a CSV, evaluates against provided labels, and compares three strategies:

- Agglomerative clustering (cosine, threshold-based, batch)
- Vector-style incremental assignment (simulates vector DB neighbor-join)
- Centroid-style incremental assignment

It uses Gemini embeddings by default (set `GOOGLE_API_KEY`) or any SentenceTransformer model if you change `model_name`. Ensure `issues_raw.csv` exists with columns `title`, `body`, and optional `label`.

Add a `.env` file with `GOOGLE_API_KEY=your_key` if you want Gemini embeddings. To run offline, switch `model_name` to a SentenceTransformer model.

In [None]:
# Install dependencies
!pip install -U sentence-transformers scikit-learn numpy requests python-dotenv pandas packaging google-genai

In [15]:
import os
from dotenv import load_dotenv
import pandas as pd
from IPython.display import display
import sklearn
from packaging import version
from google import genai

from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np

load_dotenv()

genai_client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY")) if os.getenv("GOOGLE_API_KEY") else None

In [16]:
def cluster_issues(
    issues,                      # list[dict] with keys: title, body (body optional)
    model_name="gemini-embedding-001",
    sim_threshold=0.70,          # higher => fewer, tighter clusters
    min_cluster_size=2,
    truncate_body_chars=1500,    # prevent very long bodies from dominating
    label_singletons_as_minus_one=True,
):
    """Cluster issue dicts by semantic similarity.

    Embeddings source is chosen by model_name:
    - If model_name starts with "gemini" (e.g., "gemini-embedding-001"), use Gemini via GOOGLE_API_KEY.
    - Otherwise, use SentenceTransformer(model_name).

    Returns both raw labels from sklearn and display labels (optionally -1 for singletons).
    """
    def _to_text(val):
        if val is None:
            return ""
        if isinstance(val, str):
            return val
        if isinstance(val, float):
            return "" if np.isnan(val) else str(val)
        return str(val)

    texts = []
    for it in issues:
        title = _to_text(it.get("title")).strip()
        body = _to_text(it.get("body")).strip()
        if truncate_body_chars:
            body = body[:truncate_body_chars]
        text = f"{title}\n\n{body}".strip()
        if not text:
            text = "[empty]"
        texts.append(text)

    if not texts:
        return {
            "labels": np.array([], dtype=int),
            "display_labels": np.array([], dtype=int),
            "clusters": {},
            "singletons": [],
            "texts": texts,
        }

    if len(texts) == 1:
        labels = np.array([0], dtype=int)
        display_labels = np.array([-1], dtype=int) if label_singletons_as_minus_one else labels.copy()
        clusters = {0: [0]} if min_cluster_size <= 1 else {}
        singletons = [0]
        return {
            "labels": labels,
            "display_labels": display_labels,
            "clusters": clusters,
            "singletons": singletons,
            "texts": texts,
        }

    if model_name.lower().startswith("gemini"):
        if genai_client is None:
            raise RuntimeError("GOOGLE_API_KEY not set; populate .env or environment")
        resp = genai_client.models.embed_content(
            model=model_name,
            contents=texts,
            config={"output_dimensionality": 768},
        )
        emb = np.asarray([e.values for e in resp.embeddings], dtype=np.float32)
        print("Using Gemini Text Embedding")
    else:
        model = SentenceTransformer(model_name)
        emb = model.encode(
            texts,
            batch_size=32,
            show_progress_bar=True,
            normalize_embeddings=True,
        )
        emb = np.asarray(emb, dtype=np.float32)
        print("Using HF Sentence Transformer Embedding")

    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    emb = emb / norms

    dist_threshold = 1.0 - float(sim_threshold)

    kwargs = dict(n_clusters=None, linkage="average", distance_threshold=dist_threshold)
    if version.parse(sklearn.__version__) >= version.parse("1.2"):
        kwargs["metric"] = "cosine"
    else:
        kwargs["affinity"] = "cosine"

    cl = AgglomerativeClustering(**kwargs)
    labels = cl.fit_predict(emb)

    clusters = {}
    for i, lab in enumerate(labels):
        clusters.setdefault(int(lab), []).append(i)

    kept = {k: v for k, v in clusters.items() if len(v) >= min_cluster_size}
    singletons = [v[0] for k, v in clusters.items() if len(v) == 1]
    kept = dict(sorted(kept.items(), key=lambda kv: len(kv[1]), reverse=True))

    display_labels = labels.copy()
    if label_singletons_as_minus_one:
        for i in singletons:
            display_labels[i] = -1

    return {
        "labels": labels,
        "display_labels": display_labels,
        "clusters": kept,
        "singletons": singletons,
        "texts": texts,
    }

def print_clusters(result, issues, max_items_per_cluster=8):
    for cid, idxs in result["clusters"].items():
        print(f"\n=== Cluster {cid}  (n={len(idxs)}) ===")
        for j in idxs[:max_items_per_cluster]:
            t = (issues[j].get("title") or "").strip().replace("\n", " ")
            print(f"- [{j:02d}] {t[:140]}")
    if result["singletons"]:
        print(f"\nSingletons (n={len(result['singletons'])}): {result['singletons']}")

## Tabular cluster view helper
Creates a DataFrame with cluster id, cluster size, issue index, title, and body so you can sort/filter in the notebook UI. Use the raw issues CSV helper if you want to label without clusters.

In [17]:
def clusters_as_dataframe(result, issues):
    """Return a DataFrame with raw/display labels, cluster size, issue index, title, body."""
    rows = []
    cluster_sizes = {cid: len(idxs) for cid, idxs in result.get("clusters", {}).items()}
    display_labels = result.get("display_labels", result.get("labels", []))
    labels = result.get("labels", [])

    def _to_text(val):
        if val is None:
            return ""
        if isinstance(val, str):
            return val
        if isinstance(val, float):
            return "" if np.isnan(val) else str(val)
        return str(val)

    for j, issue in enumerate(issues):
        raw_lab = int(labels[j]) if len(labels) > j else None
        disp_lab = int(display_labels[j]) if len(display_labels) > j else None
        cluster_size = cluster_sizes.get(raw_lab, 1) if raw_lab is not None else 1
        rows.append({
            "cluster_raw": raw_lab,
            "cluster_display": disp_lab,
            "is_singleton": disp_lab == -1,
            "cluster_size": cluster_size,
            "idx": j,
            "title": _to_text(issue.get("title")),
            "body": _to_text(issue.get("body")),
        })

    df = pd.DataFrame(rows)
    if not df.empty:
        df = df.sort_values(["is_singleton", "cluster_size", "cluster_display", "idx"], ascending=[True, False, True, True]).reset_index(drop=True)
    return df


def save_issues_csv(issues, path="issues_raw.csv"):
    """Save the raw (unclustered) issues to CSV for manual labeling."""
    def _to_text(val):
        if val is None:
            return ""
        if isinstance(val, str):
            return val
        if isinstance(val, float):
            return "" if np.isnan(val) else str(val)
        return str(val)

    rows = []
    for idx, it in enumerate(issues):
        rows.append({
            "idx": idx,
            "title": _to_text(it.get("title")).strip(),
            "body": _to_text(it.get("body")).strip(),
        })
    df = pd.DataFrame(rows)
    df.to_csv(path, index=False)
    print(f"Wrote {len(df)} issues to {path}")
    return df

## Example: cluster from CSV
Load labeled issues from `issues_raw.csv`, cluster them, and evaluate against the provided labels.

In [None]:
# Load issues from CSV, cluster, and evaluate
csv_path = "issues_raw.csv"

df_src = pd.read_csv(csv_path)
issues = df_src[["title", "body"]].to_dict("records")
true_labels = df_src["label"].tolist() if "label" in df_src.columns else None

res = cluster_issues(issues, sim_threshold=0.72, min_cluster_size=2, label_singletons_as_minus_one=False)
print_clusters(res, issues)

df_clusters = clusters_as_dataframe(res, issues)
display(df_clusters.head(200))

if true_labels is not None:
    # Evaluate clustering vs. provided labels
    from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, homogeneity_completeness_v_measure

    # Map string labels to ints for metrics
    true_enc, _ = pd.factorize(true_labels)
    pred_enc, _ = pd.factorize(res["display_labels"])

    ari = adjusted_rand_score(true_enc, pred_enc)
    ami = adjusted_mutual_info_score(true_enc, pred_enc, average_method="arithmetic")
    h, c, v = homogeneity_completeness_v_measure(true_enc, pred_enc)

    print(f"ARI: {ari:.3f}")
    print(f"AMI: {ami:.3f}")
    print(f"Homogeneity: {h:.3f}  Completeness: {c:.3f}  V-Measure: {v:.3f}")
else:
    print("No labels column found; skipping evaluation.")

In [19]:
# Compare agglomerative vs vector-like vs centroid-like on CSV labels
import math
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, homogeneity_completeness_v_measure

def _prep_texts(issues, truncate_body_chars=1500):
    def _to_text(val):
        if val is None:
            return ""
        if isinstance(val, str):
            return val
        if isinstance(val, float):
            return "" if np.isnan(val) else str(val)
        return str(val)
    texts = []
    for it in issues:
        title = _to_text(it.get("title")).strip()
        body = _to_text(it.get("body")).strip()
        if truncate_body_chars:
            body = body[:truncate_body_chars]
        txt = f"{title}\n\n{body}".strip()
        texts.append(txt or "[empty]")
    return texts

def embed_texts(texts, model_name="gemini-embedding-001"):
    if model_name.lower().startswith("gemini"):
        if genai_client is None:
            raise RuntimeError("GOOGLE_API_KEY not set; populate .env or environment")
        resp = genai_client.models.embed_content(
            model=model_name,
            contents=texts,
            config={"output_dimensionality": 768},
        )
        emb = np.asarray([e.values for e in resp.embeddings], dtype=np.float32)
    else:
        model = SentenceTransformer(model_name)
        emb = model.encode(
            texts,
            batch_size=32,
            show_progress_bar=True,
            normalize_embeddings=True,
        )
        emb = np.asarray(emb, dtype=np.float32)
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return emb / norms

def cosine(a, b):
    return float(np.dot(a, b))

def vector_like_cluster(embeddings, threshold=0.72):
    labels = []
    next_cluster = 0
    for i, emb in enumerate(embeddings):
        sims = [cosine(emb, embeddings[j]) for j in range(i)]
        similars = [j for j, s in enumerate(sims) if s >= threshold]
        if similars:
            labels.append(labels[similars[0]])
        else:
            labels.append(next_cluster)
            next_cluster += 1
    return np.array(labels, dtype=int)

def centroid_cluster(embeddings, threshold=0.65):
    centroids = []
    labels = []
    for emb in embeddings:
        if not centroids:
            labels.append(0)
            centroids.append(emb.copy())
            continue
        sims = [cosine(emb, c) for c in centroids]
        best_idx = int(np.argmax(sims))
        if sims[best_idx] >= threshold:
            k = best_idx
            count_k = labels.count(k)
            centroids[k] = (centroids[k] * count_k + emb) / (count_k + 1)
            labels.append(k)
        else:
            labels.append(len(centroids))
            centroids.append(emb.copy())
    return np.array(labels, dtype=int)

def evaluate(true_labels, pred_labels, name):
    true_enc, _ = pd.factorize(pd.Series(true_labels))
    pred_enc, _ = pd.factorize(pd.Series(pred_labels))
    ari = adjusted_rand_score(true_enc, pred_enc)
    ami = adjusted_mutual_info_score(true_enc, pred_enc, average_method="arithmetic")
    h, c, v = homogeneity_completeness_v_measure(true_enc, pred_enc)
    print(f"{name} -> ARI: {ari:.3f} | AMI: {ami:.3f} | H: {h:.3f} C: {c:.3f} V: {v:.3f}")

# Load data
csv_path = "issues_raw.csv"
df_src = pd.read_csv(csv_path)
issues = df_src[["title", "body"]].to_dict("records")
true_labels = df_src["label"].tolist() if "label" in df_src.columns else None

texts = _prep_texts(issues)
emb = embed_texts(texts, model_name="gemini-embedding-001")

# Agglomerative (existing)
res_aggl = cluster_issues(issues, sim_threshold=0.72, min_cluster_size=2, label_singletons_as_minus_one=False)

# Vector-style (incremental neighbor above threshold)
vec_labels = vector_like_cluster(emb, threshold=0.72)

# Centroid-style (assign to best centroid above threshold; else new)
cent_labels = centroid_cluster(emb, threshold=0.65)

if true_labels is not None:
    evaluate(true_labels, res_aggl["display_labels"], "Agglomerative")
    evaluate(true_labels, vec_labels, "Vector-like")
    evaluate(true_labels, cent_labels, "Centroid-like")
else:
    print("No labels column found; skipping evaluation.")

Using Gemini Text Embedding
Agglomerative -> ARI: 0.879 | AMI: 0.873 | H: 0.979 C: 0.956 V: 0.967
Vector-like -> ARI: 0.865 | AMI: 0.836 | H: 0.977 C: 0.942 V: 0.959
Centroid-like -> ARI: 0.543 | AMI: 0.681 | H: 0.845 C: 0.949 V: 0.894


In [20]:
# Compare agglomerative vs vector-like vs centroid-like on CSV labels
import math
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, homogeneity_completeness_v_measure

def _prep_texts(issues, truncate_body_chars=1500):
    def _to_text(val):
        if val is None:
            return ""
        if isinstance(val, str):
            return val
        if isinstance(val, float):
            return "" if np.isnan(val) else str(val)
        return str(val)
    texts = []
    for it in issues:
        title = _to_text(it.get("title")).strip()
        body = _to_text(it.get("body")).strip()
        if truncate_body_chars:
            body = body[:truncate_body_chars]
        txt = f"{title}\n\n{body}".strip()
        texts.append(txt or "[empty]")
    return texts

def embed_texts(texts, model_name="gemini-embedding-001"):
    if model_name.lower().startswith("gemini"):
        if genai_client is None:
            raise RuntimeError("GOOGLE_API_KEY not set; populate .env or environment")
        resp = genai_client.models.embed_content(
            model=model_name,
            contents=texts,
            config={"output_dimensionality": 768},
        )
        emb = np.asarray([e.values for e in resp.embeddings], dtype=np.float32)
    else:
        model = SentenceTransformer(model_name)
        emb = model.encode(
            texts,
            batch_size=32,
            show_progress_bar=True,
            normalize_embeddings=True,
        )
        emb = np.asarray(emb, dtype=np.float32)
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return emb / norms

def cosine(a, b):
    return float(np.dot(a, b))

def vector_like_cluster(embeddings, threshold=0.72):
    labels = []
    next_cluster = 0
    for i, emb in enumerate(embeddings):
        sims = [cosine(emb, embeddings[j]) for j in range(i)]
        similars = [j for j, s in enumerate(sims) if s >= threshold]
        if similars:
            labels.append(labels[similars[0]])
        else:
            labels.append(next_cluster)
            next_cluster += 1
    return np.array(labels, dtype=int)

def centroid_cluster(embeddings, threshold=0.65):
    centroids = []
    labels = []
    for emb in embeddings:
        if not centroids:
            labels.append(0)
            centroids.append(emb.copy())
            continue
        sims = [cosine(emb, c) for c in centroids]
        best_idx = int(np.argmax(sims))
        if sims[best_idx] >= threshold:
            k = best_idx
            count_k = labels.count(k)
            centroids[k] = (centroids[k] * count_k + emb) / (count_k + 1)
            labels.append(k)
        else:
            labels.append(len(centroids))
            centroids.append(emb.copy())
    return np.array(labels, dtype=int)

def evaluate(true_labels, pred_labels, name):
    true_enc, _ = pd.factorize(pd.Series(true_labels))
    pred_enc, _ = pd.factorize(pd.Series(pred_labels))
    ari = adjusted_rand_score(true_enc, pred_enc)
    ami = adjusted_mutual_info_score(true_enc, pred_enc, average_method="arithmetic")
    h, c, v = homogeneity_completeness_v_measure(true_enc, pred_enc)
    print(f"{name} -> ARI: {ari:.3f} | AMI: {ami:.3f} | H: {h:.3f} C: {c:.3f} V: {v:.3f}")

# Load data
csv_path = "issues_raw.csv"
df_src = pd.read_csv(csv_path)
issues = df_src[["title", "body"]].to_dict("records")
true_labels = df_src["label"].tolist() if "label" in df_src.columns else None

texts = _prep_texts(issues)
emb = embed_texts(texts, model_name="gemini-embedding-001")

# Agglomerative (existing)
res_aggl = cluster_issues(issues, sim_threshold=0.72, min_cluster_size=2, label_singletons_as_minus_one=False)

# Vector-style (incremental neighbor above threshold)
vec_labels = vector_like_cluster(emb, threshold=0.72)

# Centroid-style (assign to best centroid above threshold; else new)
cent_labels = centroid_cluster(emb, threshold=0.65)

if true_labels is not None:
    evaluate(true_labels, res_aggl["display_labels"], "Agglomerative")
    evaluate(true_labels, vec_labels, "Vector-like")
    evaluate(true_labels, cent_labels, "Centroid-like")
else:
    print("No labels column found; skipping evaluation.")

Using Gemini Text Embedding
Agglomerative -> ARI: 0.879 | AMI: 0.873 | H: 0.979 C: 0.956 V: 0.967
Vector-like -> ARI: 0.865 | AMI: 0.836 | H: 0.977 C: 0.942 V: 0.959
Centroid-like -> ARI: 0.543 | AMI: 0.681 | H: 0.845 C: 0.949 V: 0.894


In [21]:
# Hyperparameter sweep for vector/centroid thresholds
from itertools import product

vector_thresholds = [0.68, 0.70, 0.72, 0.74, 0.76, 0.78]
centroid_thresholds = [0.60, 0.63, 0.66, 0.69, 0.72, 0.75]
agg_thresholds = [0.68, 0.70, 0.72, 0.74, 0.76, 0.78]

results = []
true_labels = df_src["label"].tolist() if "label" in df_src.columns else None
if true_labels is None:
    print("No labels; skipping sweep.")
else:
    true_enc, _ = pd.factorize(pd.Series(true_labels))

    # Vector sweeps
    for vt in vector_thresholds:
        vec_labels = vector_like_cluster(emb, threshold=vt)
        pred_enc, _ = pd.factorize(pd.Series(vec_labels))
        ari = adjusted_rand_score(true_enc, pred_enc)
        ami = adjusted_mutual_info_score(true_enc, pred_enc, average_method="arithmetic")
        h, c, v = homogeneity_completeness_v_measure(true_enc, pred_enc)
        results.append({"method": "vector", "threshold": vt, "ari": ari, "ami": ami, "h": h, "c": c, "v": v})

    # Centroid sweeps
    for ct in centroid_thresholds:
        cent_labels = centroid_cluster(emb, threshold=ct)
        pred_enc, _ = pd.factorize(pd.Series(cent_labels))
        ari = adjusted_rand_score(true_enc, pred_enc)
        ami = adjusted_mutual_info_score(true_enc, pred_enc, average_method="arithmetic")
        h, c, v = homogeneity_completeness_v_measure(true_enc, pred_enc)
        results.append({"method": "centroid", "threshold": ct, "ari": ari, "ami": ami, "h": h, "c": c, "v": v})

    # Agglomerative sweeps
    for th in agg_thresholds:
        dist_threshold = 1.0 - float(th)
        kwargs = dict(n_clusters=None, linkage="average", distance_threshold=dist_threshold)
        if version.parse(sklearn.__version__) >= version.parse("1.2"):
            kwargs["metric"] = "cosine"
        else:
            kwargs["affinity"] = "cosine"
        cl = AgglomerativeClustering(**kwargs)
        labels = cl.fit_predict(emb)
        pred_enc, _ = pd.factorize(pd.Series(labels))
        ari = adjusted_rand_score(true_enc, pred_enc)
        ami = adjusted_mutual_info_score(true_enc, pred_enc, average_method="arithmetic")
        h, c, v = homogeneity_completeness_v_measure(true_enc, pred_enc)
        results.append({"method": "agglomerative", "threshold": th, "ari": ari, "ami": ami, "h": h, "c": c, "v": v})

    df_sweep = pd.DataFrame(results).sort_values(["method", "ari"], ascending=[True, False]).reset_index(drop=True)
    display(df_sweep)

    # Show best per method
    best = df_sweep.sort_values(["method", "ari"], ascending=[True, False]).groupby("method").head(1)
    print("Best per method (by ARI):")
    display(best)

Unnamed: 0,method,threshold,ari,ami,h,c,v
0,agglomerative,0.72,0.878614,0.872694,0.97918,0.955741,0.967319
1,agglomerative,0.76,0.868667,0.865061,1.0,0.938823,0.968446
2,agglomerative,0.74,0.861964,0.863814,0.98959,0.94499,0.966776
3,agglomerative,0.78,0.843574,0.840381,1.0,0.929737,0.963589
4,agglomerative,0.7,0.81975,0.819377,0.946099,0.954265,0.950164
5,agglomerative,0.68,0.554317,0.707515,0.86805,0.950357,0.907341
6,centroid,0.72,0.861964,0.863814,0.98959,0.94499,0.966776
7,centroid,0.75,0.849854,0.846509,0.98959,0.938219,0.96322
8,centroid,0.69,0.715355,0.775847,0.917247,0.952894,0.93473
9,centroid,0.66,0.559797,0.714185,0.860493,0.957934,0.906603


Best per method (by ARI):


Unnamed: 0,method,threshold,ari,ami,h,c,v
0,agglomerative,0.72,0.878614,0.872694,0.97918,0.955741,0.967319
6,centroid,0.72,0.861964,0.863814,0.98959,0.94499,0.966776
12,vector,0.72,0.864654,0.836323,0.977328,0.941743,0.959205
