# Issue Clustering

This notebook clusters issue/feedback items from a CSV, evaluates against provided labels, and compares three strategies:

- Agglomerative clustering (cosine, threshold-based, batch)
- Vector-style incremental assignment (simulates vector DB neighbor-join)
- Centroid-style incremental assignment

It uses Gemini embeddings by default (set `GOOGLE_API_KEY`) or any SentenceTransformer model if you change `model_name`. Ensure `issues_raw.csv` exists with columns `title`, `body`, and optional `label`.

Add a `.env` file with `GOOGLE_API_KEY=your_key` if you want Gemini embeddings. To run offline, switch `model_name` to a SentenceTransformer model.

In [None]:
# Install dependencies
!python3 -m venv venv && source venv/bin/activate && pip install -r requirements.txt

: 

In [6]:
import os
from dotenv import load_dotenv
import pandas as pd
from IPython.display import display
import sklearn
from packaging import version
from google import genai

from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Build the Gemini client only when a key is configured; None means Gemini is unavailable.
genai_client = None
if os.getenv("GOOGLE_API_KEY"):
    genai_client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

In [7]:
def cluster_issues(
    issues,                      # list[dict] with keys: title, body (body optional)
    model_name="gemini-embedding-001",
    sim_threshold=0.70,          # higher => more, tighter clusters (and more singletons)
    min_cluster_size=2,
    truncate_body_chars=1500,    # prevent very long bodies from dominating
    label_singletons_as_minus_one=True,
    gemini_batch_size=100,       # max texts sent per Gemini embed request
):
    """Cluster issue dicts by semantic similarity.

    Each issue is turned into one text (title, blank line, truncated body), embedded,
    L2-normalised and clustered with average-linkage agglomerative clustering using
    cosine distance and a distance threshold of ``1 - sim_threshold``.

    Embeddings source is chosen by model_name:
    - If model_name starts with "gemini" (e.g., "gemini-embedding-001"), use Gemini via GOOGLE_API_KEY.
    - Otherwise, use SentenceTransformer(model_name).

    Args:
        issues: list of dicts; ``title`` and ``body`` are read with ``.get`` and may be
            missing, None, NaN or non-string.
        model_name: embedding model identifier (see above).
        sim_threshold: cosine-similarity cut-off; merging stops once the average-linkage
            distance between clusters exceeds ``1 - sim_threshold``.
        min_cluster_size: clusters smaller than this are left out of ``clusters``.
        truncate_body_chars: keep at most this many body characters (falsy = no limit).
        label_singletons_as_minus_one: if True, size-1 clusters get display label -1.
        gemini_batch_size: number of texts sent per Gemini embedding request.

    Returns:
        dict with keys:
        - "labels": raw integer labels from sklearn, one per issue.
        - "display_labels": copy of labels, with singletons set to -1 when requested.
        - "clusters": {label: [issue indices]} for clusters with at least
          min_cluster_size members, largest first.
        - "singletons": indices of issues that are alone in their cluster.
        - "texts": the texts that were embedded.
        Clusters with more than one member but fewer than min_cluster_size appear in
        neither "clusters" nor "singletons" and keep their raw label in "display_labels".

    Raises:
        RuntimeError: a Gemini model is requested but no client is configured, or the
            number of embeddings returned does not match the number of texts.
    """
    def _to_text(val):
        # Rows loaded through pandas carry NaN floats for missing cells.
        if val is None:
            return ""
        if isinstance(val, str):
            return val
        if isinstance(val, float):
            return "" if np.isnan(val) else str(val)
        return str(val)

    texts = []
    for it in issues:
        title = _to_text(it.get("title")).strip()
        body = _to_text(it.get("body")).strip()
        if truncate_body_chars:
            body = body[:truncate_body_chars]
        text = f"{title}\n\n{body}".strip()
        # Never send an empty string to the embedding model.
        texts.append(text or "[empty]")

    if not texts:
        return {
            "labels": np.array([], dtype=int),
            "display_labels": np.array([], dtype=int),
            "clusters": {},
            "singletons": [],
            "texts": texts,
        }

    if len(texts) == 1:
        # A single item cannot be clustered; skip embedding entirely.
        labels = np.array([0], dtype=int)
        display_labels = np.array([-1], dtype=int) if label_singletons_as_minus_one else labels.copy()
        return {
            "labels": labels,
            "display_labels": display_labels,
            "clusters": {0: [0]} if min_cluster_size <= 1 else {},
            "singletons": [0],
            "texts": texts,
        }

    if model_name.lower().startswith("gemini"):
        if genai_client is None:
            raise RuntimeError("GOOGLE_API_KEY not set; populate .env or environment")
        # Send the texts in chunks: a single request with every text fails once the
        # dataset grows past the API's per-request batch limit.
        step = max(1, int(gemini_batch_size))
        vectors = []
        for start in range(0, len(texts), step):
            resp = genai_client.models.embed_content(
                model=model_name,
                contents=texts[start:start + step],
                config={"output_dimensionality": 768},
            )
            vectors.extend(e.values for e in resp.embeddings)
        emb = np.asarray(vectors, dtype=np.float32)
        print("Using Gemini Text Embedding")
    else:
        model = SentenceTransformer(model_name)
        emb = model.encode(
            texts,
            batch_size=32,
            show_progress_bar=True,
            normalize_embeddings=True,
        )
        emb = np.asarray(emb, dtype=np.float32)
        print("Using HF Sentence Transformer Embedding")

    if emb.shape[0] != len(texts):
        # Labels are matched to issues by position, so a count mismatch would
        # silently attach clusters to the wrong issues.
        raise RuntimeError(
            f"Embedding backend returned {emb.shape[0]} vectors for {len(texts)} texts"
        )

    # L2-normalise rows; zero vectors are left untouched instead of dividing by 0.
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    emb = emb / norms

    dist_threshold = 1.0 - float(sim_threshold)

    kwargs = dict(n_clusters=None, linkage="average", distance_threshold=dist_threshold)
    # sklearn renamed `affinity` to `metric` in 1.2.
    if version.parse(sklearn.__version__) >= version.parse("1.2"):
        kwargs["metric"] = "cosine"
    else:
        kwargs["affinity"] = "cosine"

    cl = AgglomerativeClustering(**kwargs)
    labels = cl.fit_predict(emb)

    clusters = {}
    for i, lab in enumerate(labels):
        clusters.setdefault(int(lab), []).append(i)

    kept = {k: v for k, v in clusters.items() if len(v) >= min_cluster_size}
    singletons = [v[0] for v in clusters.values() if len(v) == 1]
    # Largest clusters first for display.
    kept = dict(sorted(kept.items(), key=lambda kv: len(kv[1]), reverse=True))

    display_labels = labels.copy()
    if label_singletons_as_minus_one:
        for i in singletons:
            display_labels[i] = -1

    return {
        "labels": labels,
        "display_labels": display_labels,
        "clusters": kept,
        "singletons": singletons,
        "texts": texts,
    }

def print_clusters(result, issues, max_items_per_cluster=8):
    """Print a compact text summary of a clustering result.

    Args:
        result: dict with "clusters" ({cluster id: [issue indices]}) and
            "singletons" (list of issue indices), as returned by cluster_issues.
        issues: list of issue dicts; only "title" is read.
        max_items_per_cluster: maximum number of titles printed per cluster.
    """
    def _title_of(issue):
        # Titles loaded through pandas may be NaN floats or other non-strings;
        # calling .strip() on them directly would raise.
        raw = issue.get("title")
        if raw is None or (isinstance(raw, float) and np.isnan(raw)):
            return ""
        return str(raw).strip().replace("\n", " ")

    for cid, idxs in result["clusters"].items():
        print(f"\n=== Cluster {cid}  (n={len(idxs)}) ===")
        for j in idxs[:max_items_per_cluster]:
            t = _title_of(issues[j])
            print(f"- [{j:02d}] {t[:140]}")
    if result["singletons"]:
        print(f"\nSingletons (n={len(result['singletons'])}): {result['singletons']}")

## Tabular cluster view helper
Creates a DataFrame with cluster id, cluster size, issue index, title, and body so you can sort/filter in the notebook UI. If you want to label issues by hand without clustering them first, use `save_issues_csv` to export the raw issues to a CSV instead.

In [8]:
def clusters_as_dataframe(result, issues):
    """Return a DataFrame with raw/display labels, cluster size, issue index, title, body.

    Args:
        result: dict with "labels", "display_labels" and "clusters" as returned by
            cluster_issues; missing keys are tolerated.
        issues: list of issue dicts ("title" / "body" read with ``.get``).

    Returns:
        DataFrame with columns cluster_raw, cluster_display, is_singleton,
        cluster_size, idx, title, body. Non-singletons come first, then larger
        clusters, then display label, then issue index.
    """
    from collections import Counter

    rows = []
    labels = result.get("labels", [])
    display_labels = result.get("display_labels", labels)

    # result["clusters"] only holds clusters that passed min_cluster_size, so sizes
    # are counted from the raw labels; otherwise filtered-out groups would be
    # reported with size 1. The kept clusters are only a fallback.
    cluster_sizes = {cid: len(idxs) for cid, idxs in result.get("clusters", {}).items()}
    cluster_sizes.update(Counter(int(lab) for lab in labels))

    def _to_text(val):
        if val is None:
            return ""
        if isinstance(val, str):
            return val
        if isinstance(val, float):
            return "" if np.isnan(val) else str(val)
        return str(val)

    for j, issue in enumerate(issues):
        raw_lab = int(labels[j]) if len(labels) > j else None
        disp_lab = int(display_labels[j]) if len(display_labels) > j else None
        cluster_size = cluster_sizes.get(raw_lab, 1) if raw_lab is not None else 1
        # A singleton is either relabelled to -1 or alone under its raw label
        # (the latter covers results built with label_singletons_as_minus_one=False).
        is_singleton = disp_lab == -1 or (raw_lab is not None and cluster_size == 1)
        rows.append({
            "cluster_raw": raw_lab,
            "cluster_display": disp_lab,
            "is_singleton": is_singleton,
            "cluster_size": cluster_size,
            "idx": j,
            "title": _to_text(issue.get("title")),
            "body": _to_text(issue.get("body")),
        })

    df = pd.DataFrame(rows)
    if not df.empty:
        df = df.sort_values(
            ["is_singleton", "cluster_size", "cluster_display", "idx"],
            ascending=[True, False, True, True],
        ).reset_index(drop=True)
    return df


def save_issues_csv(issues, path="issues_raw.csv"):
    """Write the raw (unclustered) issues to a CSV file for manual labeling.

    Each row holds the issue's position (``idx``) plus its stripped ``title`` and
    ``body``; missing values (None / NaN) become empty strings. Returns the
    DataFrame that was written.
    """
    def _clean(value):
        # None and NaN both mean "no value".
        if value is None or (isinstance(value, float) and np.isnan(value)):
            return ""
        as_text = value if isinstance(value, str) else str(value)
        return as_text.strip()

    records = [
        {
            "idx": position,
            "title": _clean(item.get("title")),
            "body": _clean(item.get("body")),
        }
        for position, item in enumerate(issues)
    ]
    frame = pd.DataFrame(records)
    frame.to_csv(path, index=False)
    print(f"Wrote {len(frame)} issues to {path}")
    return frame

## Example: cluster from CSV
Load labeled issues from `issues_raw.csv`, cluster them, and evaluate against the provided labels.

In [9]:
# Load issues from CSV, cluster, and evaluate
csv_path = "issues_raw.csv"

df_src = pd.read_csv(csv_path)
issues = df_src[["title", "body"]].to_dict("records")
true_labels = df_src["label"].tolist() if "label" in df_src.columns else None

res = cluster_issues(issues, sim_threshold=0.72, min_cluster_size=2, label_singletons_as_minus_one=False)
print_clusters(res, issues)

df_clusters = clusters_as_dataframe(res, issues)
display(df_clusters.head(200))

if true_labels is not None:
    # Evaluate clustering vs. provided labels
    from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, homogeneity_completeness_v_measure

    # Map labels to ints for metrics. Wrap in a Series first: passing a plain list
    # to pd.factorize is deprecated in recent pandas releases.
    true_enc, _ = pd.factorize(pd.Series(true_labels))
    pred_enc, _ = pd.factorize(pd.Series(res["display_labels"]))

    ari = adjusted_rand_score(true_enc, pred_enc)
    ami = adjusted_mutual_info_score(true_enc, pred_enc, average_method="arithmetic")
    h, c, v = homogeneity_completeness_v_measure(true_enc, pred_enc)

    print(f"ARI: {ari:.3f}")
    print(f"AMI: {ami:.3f}")
    print(f"Homogeneity: {h:.3f}  Completeness: {c:.3f}  V-Measure: {v:.3f}")
else:
    print("No labels column found; skipping evaluation.")

RuntimeError: GOOGLE_API_KEY not set; populate .env or environment

In [10]:
# Compare agglomerative vs vector-like vs centroid-like on CSV labels
import math
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, homogeneity_completeness_v_measure

def _prep_texts(issues, truncate_body_chars=1500):
    def _to_text(val):
        if val is None:
            return ""
        if isinstance(val, str):
            return val
        if isinstance(val, float):
            return "" if np.isnan(val) else str(val)
        return str(val)
    texts = []
    for it in issues:
        title = _to_text(it.get("title")).strip()
        body = _to_text(it.get("body")).strip()
        if truncate_body_chars:
            body = body[:truncate_body_chars]
        txt = f"{title}\n\n{body}".strip()
        texts.append(txt or "[empty]")
    return texts

def embed_texts(texts, model_name="gemini-embedding-001", gemini_batch_size=100):
    """Embed texts and return L2-normalised float32 row vectors.

    Args:
        texts: sequence of strings to embed.
        model_name: names starting with "gemini" use the module-level ``genai_client``;
            anything else is loaded as a SentenceTransformer model.
        gemini_batch_size: number of texts sent per Gemini embedding request.

    Returns:
        Array with one unit-length row per text (all-zero rows are left as zeros).
        Empty input returns an empty (0, 0) array without calling any model.

    Raises:
        RuntimeError: a Gemini model is requested but no client is configured, or the
            number of embeddings returned does not match the number of texts.
    """
    if len(texts) == 0:
        # Nothing to embed; the norm computation below would also fail on an empty array.
        return np.zeros((0, 0), dtype=np.float32)

    if model_name.lower().startswith("gemini"):
        if genai_client is None:
            raise RuntimeError("GOOGLE_API_KEY not set; populate .env or environment")
        # Send the texts in chunks: a single request with every text fails once the
        # dataset grows past the API's per-request batch limit.
        step = max(1, int(gemini_batch_size))
        vectors = []
        for start in range(0, len(texts), step):
            resp = genai_client.models.embed_content(
                model=model_name,
                contents=texts[start:start + step],
                config={"output_dimensionality": 768},
            )
            vectors.extend(e.values for e in resp.embeddings)
        emb = np.asarray(vectors, dtype=np.float32)
    else:
        model = SentenceTransformer(model_name)
        emb = model.encode(
            texts,
            batch_size=32,
            show_progress_bar=True,
            normalize_embeddings=True,
        )
        emb = np.asarray(emb, dtype=np.float32)

    if emb.shape[0] != len(texts):
        # Downstream code matches rows to issues by position.
        raise RuntimeError(
            f"Embedding backend returned {emb.shape[0]} vectors for {len(texts)} texts"
        )

    # L2-normalise rows; zero vectors are left untouched instead of dividing by 0.
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return emb / norms

def cosine(a, b):
    """Return the dot product of ``a`` and ``b`` as a Python float.

    This equals the cosine similarity only when both inputs are unit-length.
    """
    dot_product = np.dot(a, b)
    return float(dot_product)

def vector_like_cluster(embeddings, threshold=0.72):
    """Incrementally assign each embedding to a cluster, in input order.

    An item joins the cluster of the earliest previously seen item whose similarity
    (via ``cosine``) reaches ``threshold``; otherwise it opens a new cluster.
    Returns an integer label array with one entry per embedding.
    """
    assigned = []
    cluster_count = 0
    for position, current in enumerate(embeddings):
        # Earliest earlier item that is similar enough, or None.
        match = next(
            (
                earlier
                for earlier in range(position)
                if cosine(current, embeddings[earlier]) >= threshold
            ),
            None,
        )
        if match is None:
            assigned.append(cluster_count)
            cluster_count += 1
        else:
            assigned.append(assigned[match])
    return np.array(assigned, dtype=int)

def centroid_cluster(embeddings, threshold=0.65):
    """Incrementally assign each embedding to the most similar cluster centroid.

    Items are processed in input order. An item joins the cluster whose centroid has
    the highest cosine similarity to it, provided that similarity reaches
    ``threshold``; otherwise it starts a new cluster.

    The similarity is a true cosine (normalised by both vector lengths). A centroid
    that is the mean of unit vectors has length below 1, so a plain dot product
    against it under-reports similarity as a cluster grows and spreads.

    Args:
        embeddings: iterable of 1-D vectors (e.g. rows of a 2-D array).
        threshold: minimum cosine similarity to join an existing cluster.

    Returns:
        Integer label array with one entry per embedding.
    """
    # Per-cluster running sum of members. Cosine is scale-invariant, so the sum
    # stands in for the mean and no per-cluster member count is needed.
    sums = []
    labels = []
    for emb in embeddings:
        vec = np.asarray(emb, dtype=np.float64)
        vec_norm = np.linalg.norm(vec)

        best_idx, best_sim = -1, -np.inf
        for k, total in enumerate(sums):
            denom = vec_norm * np.linalg.norm(total)
            # A zero-length vector has no direction; treat it as dissimilar.
            sim = float(np.dot(vec, total) / denom) if denom > 0 else 0.0
            if sim > best_sim:  # strict ">" keeps the first of equal maxima
                best_idx, best_sim = k, sim

        if best_idx >= 0 and best_sim >= threshold:
            sums[best_idx] = sums[best_idx] + vec
            labels.append(best_idx)
        else:
            labels.append(len(sums))
            sums.append(vec.copy())
    return np.array(labels, dtype=int)

def evaluate(true_labels, pred_labels, name):
    """Print ARI, AMI, homogeneity, completeness and V-measure for one labelling.

    Both label sequences are integer-encoded with pandas before scoring; ``name``
    prefixes the printed line.
    """
    encoded_true = pd.factorize(pd.Series(true_labels))[0]
    encoded_pred = pd.factorize(pd.Series(pred_labels))[0]

    rand_index = adjusted_rand_score(encoded_true, encoded_pred)
    mutual_info = adjusted_mutual_info_score(encoded_true, encoded_pred, average_method="arithmetic")
    homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(encoded_true, encoded_pred)

    summary = (
        f"{name} -> ARI: {rand_index:.3f} | AMI: {mutual_info:.3f} | "
        f"H: {homogeneity:.3f} C: {completeness:.3f} V: {v_measure:.3f}"
    )
    print(summary)

# Load data
csv_path = "issues_raw.csv"
df_src = pd.read_csv(csv_path)
issues = df_src[["title", "body"]].to_dict("records")
true_labels = None
if "label" in df_src.columns:
    true_labels = df_src["label"].tolist()

# Embeddings for the two incremental strategies
texts = _prep_texts(issues)
emb = embed_texts(texts, model_name="gemini-embedding-001")

# Agglomerative (existing); cluster_issues computes its own embeddings
res_aggl = cluster_issues(
    issues,
    sim_threshold=0.72,
    min_cluster_size=2,
    label_singletons_as_minus_one=False,
)

# Vector-style (incremental neighbor above threshold)
vec_labels = vector_like_cluster(emb, threshold=0.72)

# Centroid-style (assign to best centroid above threshold; else new)
cent_labels = centroid_cluster(emb, threshold=0.65)

if true_labels is None:
    print("No labels column found; skipping evaluation.")
else:
    evaluate(true_labels, res_aggl["display_labels"], "Agglomerative")
    evaluate(true_labels, vec_labels, "Vector-like")
    evaluate(true_labels, cent_labels, "Centroid-like")

RuntimeError: GOOGLE_API_KEY not set; populate .env or environment

In [11]:
# Compare agglomerative vs vector-like vs centroid-like on CSV labels
import math
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, homogeneity_completeness_v_measure

def _prep_texts(issues, truncate_body_chars=1500):
    def _to_text(val):
        if val is None:
            return ""
        if isinstance(val, str):
            return val
        if isinstance(val, float):
            return "" if np.isnan(val) else str(val)
        return str(val)
    texts = []
    for it in issues:
        title = _to_text(it.get("title")).strip()
        body = _to_text(it.get("body")).strip()
        if truncate_body_chars:
            body = body[:truncate_body_chars]
        txt = f"{title}\n\n{body}".strip()
        texts.append(txt or "[empty]")
    return texts

def embed_texts(texts, model_name="gemini-embedding-001", gemini_batch_size=100):
    """Embed texts and return L2-normalised float32 row vectors.

    Args:
        texts: sequence of strings to embed.
        model_name: names starting with "gemini" use the module-level ``genai_client``;
            anything else is loaded as a SentenceTransformer model.
        gemini_batch_size: number of texts sent per Gemini embedding request.

    Returns:
        Array with one unit-length row per text (all-zero rows are left as zeros).
        Empty input returns an empty (0, 0) array without calling any model.

    Raises:
        RuntimeError: a Gemini model is requested but no client is configured, or the
            number of embeddings returned does not match the number of texts.
    """
    if len(texts) == 0:
        # Nothing to embed; the norm computation below would also fail on an empty array.
        return np.zeros((0, 0), dtype=np.float32)

    if model_name.lower().startswith("gemini"):
        if genai_client is None:
            raise RuntimeError("GOOGLE_API_KEY not set; populate .env or environment")
        # Send the texts in chunks: a single request with every text fails once the
        # dataset grows past the API's per-request batch limit.
        step = max(1, int(gemini_batch_size))
        vectors = []
        for start in range(0, len(texts), step):
            resp = genai_client.models.embed_content(
                model=model_name,
                contents=texts[start:start + step],
                config={"output_dimensionality": 768},
            )
            vectors.extend(e.values for e in resp.embeddings)
        emb = np.asarray(vectors, dtype=np.float32)
    else:
        model = SentenceTransformer(model_name)
        emb = model.encode(
            texts,
            batch_size=32,
            show_progress_bar=True,
            normalize_embeddings=True,
        )
        emb = np.asarray(emb, dtype=np.float32)

    if emb.shape[0] != len(texts):
        # Downstream code matches rows to issues by position.
        raise RuntimeError(
            f"Embedding backend returned {emb.shape[0]} vectors for {len(texts)} texts"
        )

    # L2-normalise rows; zero vectors are left untouched instead of dividing by 0.
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return emb / norms

def cosine(a, b):
    """Return the dot product of ``a`` and ``b`` as a Python float.

    This equals the cosine similarity only when both inputs are unit-length.
    """
    dot_product = np.dot(a, b)
    return float(dot_product)

def vector_like_cluster(embeddings, threshold=0.72):
    """Incrementally assign each embedding to a cluster, in input order.

    An item joins the cluster of the earliest previously seen item whose similarity
    (via ``cosine``) reaches ``threshold``; otherwise it opens a new cluster.
    Returns an integer label array with one entry per embedding.
    """
    assigned = []
    cluster_count = 0
    for position, current in enumerate(embeddings):
        # Earliest earlier item that is similar enough, or None.
        match = next(
            (
                earlier
                for earlier in range(position)
                if cosine(current, embeddings[earlier]) >= threshold
            ),
            None,
        )
        if match is None:
            assigned.append(cluster_count)
            cluster_count += 1
        else:
            assigned.append(assigned[match])
    return np.array(assigned, dtype=int)

def centroid_cluster(embeddings, threshold=0.65):
    """Incrementally assign each embedding to the most similar cluster centroid.

    Items are processed in input order. An item joins the cluster whose centroid has
    the highest cosine similarity to it, provided that similarity reaches
    ``threshold``; otherwise it starts a new cluster.

    The similarity is a true cosine (normalised by both vector lengths). A centroid
    that is the mean of unit vectors has length below 1, so a plain dot product
    against it under-reports similarity as a cluster grows and spreads.

    Args:
        embeddings: iterable of 1-D vectors (e.g. rows of a 2-D array).
        threshold: minimum cosine similarity to join an existing cluster.

    Returns:
        Integer label array with one entry per embedding.
    """
    # Per-cluster running sum of members. Cosine is scale-invariant, so the sum
    # stands in for the mean and no per-cluster member count is needed.
    sums = []
    labels = []
    for emb in embeddings:
        vec = np.asarray(emb, dtype=np.float64)
        vec_norm = np.linalg.norm(vec)

        best_idx, best_sim = -1, -np.inf
        for k, total in enumerate(sums):
            denom = vec_norm * np.linalg.norm(total)
            # A zero-length vector has no direction; treat it as dissimilar.
            sim = float(np.dot(vec, total) / denom) if denom > 0 else 0.0
            if sim > best_sim:  # strict ">" keeps the first of equal maxima
                best_idx, best_sim = k, sim

        if best_idx >= 0 and best_sim >= threshold:
            sums[best_idx] = sums[best_idx] + vec
            labels.append(best_idx)
        else:
            labels.append(len(sums))
            sums.append(vec.copy())
    return np.array(labels, dtype=int)

def evaluate(true_labels, pred_labels, name):
    """Print ARI, AMI, homogeneity, completeness and V-measure for one labelling.

    Both label sequences are integer-encoded with pandas before scoring; ``name``
    prefixes the printed line.
    """
    encoded_true = pd.factorize(pd.Series(true_labels))[0]
    encoded_pred = pd.factorize(pd.Series(pred_labels))[0]

    rand_index = adjusted_rand_score(encoded_true, encoded_pred)
    mutual_info = adjusted_mutual_info_score(encoded_true, encoded_pred, average_method="arithmetic")
    homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(encoded_true, encoded_pred)

    summary = (
        f"{name} -> ARI: {rand_index:.3f} | AMI: {mutual_info:.3f} | "
        f"H: {homogeneity:.3f} C: {completeness:.3f} V: {v_measure:.3f}"
    )
    print(summary)

# Load data
csv_path = "issues_raw.csv"
df_src = pd.read_csv(csv_path)
issues = df_src[["title", "body"]].to_dict("records")
true_labels = None
if "label" in df_src.columns:
    true_labels = df_src["label"].tolist()

# Embeddings for the two incremental strategies
texts = _prep_texts(issues)
emb = embed_texts(texts, model_name="gemini-embedding-001")

# Agglomerative (existing); cluster_issues computes its own embeddings
res_aggl = cluster_issues(
    issues,
    sim_threshold=0.72,
    min_cluster_size=2,
    label_singletons_as_minus_one=False,
)

# Vector-style (incremental neighbor above threshold)
vec_labels = vector_like_cluster(emb, threshold=0.72)

# Centroid-style (assign to best centroid above threshold; else new)
cent_labels = centroid_cluster(emb, threshold=0.65)

if true_labels is None:
    print("No labels column found; skipping evaluation.")
else:
    evaluate(true_labels, res_aggl["display_labels"], "Agglomerative")
    evaluate(true_labels, vec_labels, "Vector-like")
    evaluate(true_labels, cent_labels, "Centroid-like")

RuntimeError: GOOGLE_API_KEY not set; populate .env or environment

In [12]:
# Hyperparameter sweep for vector/centroid thresholds
from itertools import product

vector_thresholds = [0.68, 0.70, 0.72, 0.74, 0.76, 0.78]
centroid_thresholds = [0.60, 0.63, 0.66, 0.69, 0.72, 0.75]
agg_thresholds = [0.68, 0.70, 0.72, 0.74, 0.76, 0.78]


def _sweep_row(method, threshold, true_enc, pred_labels):
    """Score one labelling against the encoded ground truth and return a result row."""
    pred_enc, _ = pd.factorize(pd.Series(pred_labels))
    ari = adjusted_rand_score(true_enc, pred_enc)
    ami = adjusted_mutual_info_score(true_enc, pred_enc, average_method="arithmetic")
    h, c, v = homogeneity_completeness_v_measure(true_enc, pred_enc)
    return {"method": method, "threshold": threshold, "ari": ari, "ami": ami, "h": h, "c": c, "v": v}


def _agglomerative_labels(embeddings, sim_threshold):
    """Average-linkage cosine agglomerative clustering cut at distance 1 - sim_threshold."""
    kwargs = dict(n_clusters=None, linkage="average", distance_threshold=1.0 - float(sim_threshold))
    # sklearn renamed `affinity` to `metric` in 1.2.
    if version.parse(sklearn.__version__) >= version.parse("1.2"):
        kwargs["metric"] = "cosine"
    else:
        kwargs["affinity"] = "cosine"
    return AgglomerativeClustering(**kwargs).fit_predict(embeddings)


results = []
true_labels = df_src["label"].tolist() if "label" in df_src.columns else None
if true_labels is None:
    print("No labels; skipping sweep.")
else:
    true_enc, _ = pd.factorize(pd.Series(true_labels))

    # Labels are passed straight into the scorer, so the sweep does not overwrite
    # vec_labels / cent_labels produced by the comparison cell.

    # Vector sweeps
    for vt in vector_thresholds:
        results.append(_sweep_row("vector", vt, true_enc, vector_like_cluster(emb, threshold=vt)))

    # Centroid sweeps
    for ct in centroid_thresholds:
        results.append(_sweep_row("centroid", ct, true_enc, centroid_cluster(emb, threshold=ct)))

    # Agglomerative sweeps
    for th in agg_thresholds:
        results.append(_sweep_row("agglomerative", th, true_enc, _agglomerative_labels(emb, th)))

    df_sweep = pd.DataFrame(results).sort_values(["method", "ari"], ascending=[True, False]).reset_index(drop=True)
    display(df_sweep)

    # Show best per method: df_sweep is already ordered by method then descending
    # ARI, so the first row of each group is that method's best threshold.
    best = df_sweep.groupby("method").head(1)
    print("Best per method (by ARI):")
    display(best)

NameError: name 'emb' is not defined

## Vector Visualization

Visualize the high-dimensional embeddings in 2D/3D space using UMAP dimensionality reduction. This helps understand how vectors cluster and whether similar items are actually close together in vector space.

In [None]:
import umap
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def visualize_embeddings_2d(embeddings, true_labels=None, pred_labels=None, texts=None, titles=None):
    """
    Reduce embeddings to 2D using UMAP and create interactive visualizations.

    With both label sets, ground truth and predictions are drawn side by side;
    with one, a single scatter is coloured by it; with none, points are uncoloured.

    Args:
        embeddings: numpy array of shape (n_samples, n_features)
        true_labels: ground truth labels (optional)
        pred_labels: predicted cluster labels (optional)
        texts: accepted for interface compatibility; currently not used
        titles: list of titles for each point (optional); None/NaN/non-string
            entries are tolerated

    Returns:
        The 2D UMAP coordinates, one row per embedding.
    """
    print("Reducing dimensions with UMAP (2D)...")
    reducer_2d = umap.UMAP(n_components=2, random_state=42, n_neighbors=15, min_dist=0.1)
    embedding_2d = reducer_2d.fit_transform(embeddings)

    def _is_missing(value):
        # NaN is the only float that differs from itself.
        return value is None or (isinstance(value, float) and value != value)

    def _title_snippet(raw):
        # Titles read from a CSV can be NaN floats; slicing those would raise.
        return "" if _is_missing(raw) else str(raw)[:100]  # truncate long titles

    # Prepare hover text
    hover_texts = []
    for i in range(len(embeddings)):
        parts = [f"Index: {i}<br>"]
        if titles is not None and i < len(titles):
            parts.append(f"Title: {_title_snippet(titles[i])}<br>")
        if true_labels is not None:
            parts.append(f"True Label: {true_labels[i]}<br>")
        if pred_labels is not None:
            parts.append(f"Predicted: {pred_labels[i]}<br>")
        hover_texts.append("".join(parts))

    axis_labels = {'x': 'UMAP Dimension 1', 'y': 'UMAP Dimension 2'}

    def _group_indices(labels):
        # Map each distinct label to the point indices carrying it. Missing labels
        # (None/NaN) share one None group instead of being dropped, because NaN
        # never compares equal to itself.
        groups = {}
        for i, label in enumerate(labels):
            groups.setdefault(None if _is_missing(label) else label, []).append(i)
        try:
            ordered = sorted(groups)
        except TypeError:
            # Mixed label types (e.g. str and None) cannot be ordered directly.
            ordered = sorted(groups, key=str)
        return [(label, groups[label]) for label in ordered]

    def _add_label_traces(fig, labels, name_prefix, legendgroup, col):
        # One scatter trace per distinct label in the given subplot column.
        for label, indices in _group_indices(labels):
            fig.add_trace(
                go.Scatter(
                    x=embedding_2d[indices, 0],
                    y=embedding_2d[indices, 1],
                    mode='markers',
                    name=f'{name_prefix} {label}',
                    text=[hover_texts[i] for i in indices],
                    hovertemplate='%{text}<extra></extra>',
                    marker=dict(size=8, opacity=0.7),
                    legendgroup=legendgroup
                ),
                row=1, col=col
            )

    def _single_panel(title, color_column=None, color_values=None):
        # Single scatter, optionally coloured by the stringified values of one column.
        data = {'x': embedding_2d[:, 0], 'y': embedding_2d[:, 1]}
        hover_data = {'hover': True, 'x': False, 'y': False}
        if color_column is not None:
            data[color_column] = [str(l) for l in color_values]
            hover_data[color_column] = False
        data['hover'] = hover_texts
        df_plot = pd.DataFrame(data)

        fig = px.scatter(
            df_plot, x='x', y='y', color=color_column,
            hover_data=hover_data,
            title=title,
            labels=axis_labels
        )
        fig.update_traces(marker=dict(size=8, opacity=0.7))
        fig.update_layout(height=600, width=900)
        return fig

    # Create figure(s)
    if true_labels is not None and pred_labels is not None:
        # Side-by-side comparison
        fig = make_subplots(
            rows=1, cols=2,
            subplot_titles=("Ground Truth Labels", "Predicted Clusters"),
            horizontal_spacing=0.1
        )
        _add_label_traces(fig, true_labels, 'True', 'true', 1)
        _add_label_traces(fig, pred_labels, 'Pred', 'pred', 2)

        for col in (1, 2):
            fig.update_xaxes(title_text="UMAP Dimension 1", row=1, col=col)
            fig.update_yaxes(title_text="UMAP Dimension 2", row=1, col=col)

        fig.update_layout(
            height=600,
            width=1400,
            title_text="2D Vector Space Visualization",
            showlegend=True
        )
    elif pred_labels is not None:
        # Just predicted labels
        fig = _single_panel('2D Vector Space - Predicted Clusters', 'cluster', pred_labels)
    elif true_labels is not None:
        # Just ground truth
        fig = _single_panel('2D Vector Space - Ground Truth Labels', 'label', true_labels)
    else:
        # No labels - just show points
        fig = _single_panel('2D Vector Space')

    fig.show()
    return embedding_2d


def visualize_embeddings_3d(embeddings, true_labels=None, pred_labels=None, texts=None, titles=None):
    """
    Reduce embeddings to 3D using UMAP and create interactive 3D visualization.

    Points are coloured by pred_labels when given, otherwise by true_labels,
    otherwise left uncoloured.

    Args:
        embeddings: numpy array of shape (n_samples, n_features)
        true_labels: ground truth labels (optional)
        pred_labels: predicted cluster labels (optional)
        texts: accepted for interface compatibility; currently not used
        titles: list of titles for each point (optional); None/NaN/non-string
            entries are tolerated

    Returns:
        The 3D UMAP coordinates, one row per embedding.
    """
    print("Reducing dimensions with UMAP (3D)...")
    reducer_3d = umap.UMAP(n_components=3, random_state=42, n_neighbors=15, min_dist=0.1)
    embedding_3d = reducer_3d.fit_transform(embeddings)

    def _title_snippet(raw):
        # Titles read from a CSV can be NaN floats (NaN != NaN); slicing those would raise.
        if raw is None or (isinstance(raw, float) and raw != raw):
            return ""
        return str(raw)[:100]

    # Prepare hover text
    hover_texts = []
    for i in range(len(embeddings)):
        parts = [f"Index: {i}<br>"]
        if titles is not None and i < len(titles):
            parts.append(f"Title: {_title_snippet(titles[i])}<br>")
        if true_labels is not None:
            parts.append(f"True Label: {true_labels[i]}<br>")
        if pred_labels is not None:
            parts.append(f"Predicted: {pred_labels[i]}<br>")
        hover_texts.append("".join(parts))

    # Use predicted labels if available, otherwise true labels
    labels_to_plot = pred_labels if pred_labels is not None else true_labels
    label_name = "Predicted Cluster" if pred_labels is not None else "Ground Truth Label"

    data = {
        'x': embedding_3d[:, 0],
        'y': embedding_3d[:, 1],
        'z': embedding_3d[:, 2],
    }
    hover_data = {'hover': True, 'x': False, 'y': False, 'z': False}
    color_column = None
    plot_title = '3D Vector Space'
    if labels_to_plot is not None:
        # Stringify so the labels are treated as discrete categories.
        data['label'] = [str(l) for l in labels_to_plot]
        hover_data['label'] = False
        color_column = 'label'
        plot_title = f'3D Vector Space - {label_name}'
    data['hover'] = hover_texts
    df_plot = pd.DataFrame(data)

    fig = px.scatter_3d(
        df_plot, x='x', y='y', z='z', color=color_column,
        hover_data=hover_data,
        title=plot_title,
        labels={'x': 'UMAP Dim 1', 'y': 'UMAP Dim 2', 'z': 'UMAP Dim 3'}
    )

    fig.update_traces(marker=dict(size=5, opacity=0.7))
    fig.update_layout(height=700, width=900)
    fig.show()
    return embedding_3d

In [None]:
# 2D UMAP view of the embeddings:
# ground-truth labels and agglomerative clusters are passed together for a side-by-side plot

titles = [record.get("title", "") for record in issues]
embedding_2d = visualize_embeddings_2d(
    embeddings=emb,
    true_labels=true_labels,
    pred_labels=res_aggl["display_labels"],
    titles=titles,
)

In [None]:
# Interactive 3D UMAP view (rotate / zoom)
# Predicted clusters take precedence over ground truth for colouring

embedding_3d = visualize_embeddings_3d(
    embeddings=emb,
    true_labels=true_labels,
    pred_labels=res_aggl["display_labels"],
    titles=titles,
)

## Vector Similarity Explorer

Explore which items are most similar to each other in vector space. This helps understand what the embeddings consider "similar".

In [None]:
def find_most_similar(embeddings, query_idx, top_k=10, issues=None):
    """
    Find the most similar items to a given query item.

    Similarity is the dot product between rows, which is the cosine similarity
    when the embeddings are unit-length. Also prints the query item.

    Args:
        embeddings: numpy array of embeddings
        query_idx: index of the query item (negative indices count from the end)
        top_k: number of most similar items to return
        issues: list of issue dicts with 'title' and 'body' (optional); None/NaN
            and non-string values are tolerated

    Returns:
        DataFrame with similarity scores and content, most similar first;
        the query item itself is never included.

    Raises:
        IndexError: if query_idx is out of range.
    """
    def _snippet(value, limit):
        # Fields loaded through pandas may be NaN floats or other non-strings.
        if value is None or (isinstance(value, float) and np.isnan(value)):
            return ""
        return str(value)[:limit]

    embeddings = np.asarray(embeddings)
    query_emb = embeddings[query_idx]
    # Normalise negative indices so the self-match filter below still applies.
    query_pos = int(query_idx) % len(embeddings)

    # Compute cosine similarities
    similarities = np.dot(embeddings, query_emb)

    # Get top-k most similar (excluding the query itself)
    ranked = np.argsort(similarities)[::-1]
    similar_indices = [int(i) for i in ranked if i != query_pos][:max(int(top_k), 0)]

    results = []
    for idx in similar_indices:
        row = {
            'idx': idx,
            'similarity': float(similarities[idx]),
        }
        if issues is not None:
            row['title'] = _snippet(issues[idx].get('title', ''), 100)
            row['body'] = _snippet(issues[idx].get('body', ''), 200)
        results.append(row)

    df = pd.DataFrame(results)

    # Show query item
    print(f"Query item [{query_idx}]:")
    if issues is not None:
        print(f"  Title: {_snippet(issues[query_idx].get('title', ''), 100)}")
        print(f"  Body: {_snippet(issues[query_idx].get('body', ''), 200)}")
    print(f"\nMost similar items:")

    return df


def visualize_similarity_heatmap(embeddings, indices=None, labels=None, max_items=50):
    """
    Create a heatmap showing pairwise similarities between items.

    Similarity is the dot product between rows, which is the cosine similarity
    when the embeddings are unit-length. Red cells are more similar, blue cells
    less similar, with the colour midpoint at 0.7.

    Args:
        embeddings: numpy array of embeddings
        indices: specific indices to visualize (optional, defaults to first max_items)
        labels: labels for each item (optional)
        max_items: maximum number of items to show when indices is not given
            (for performance)

    Returns:
        The pairwise similarity matrix for the selected items.
    """
    if indices is None:
        indices = list(range(min(len(embeddings), max_items)))

    # Compute similarity matrix for selected indices
    # (asarray so that list-of-lists input supports index-list selection).
    selected_embs = np.asarray(embeddings)[indices]
    similarity_matrix = np.dot(selected_embs, selected_embs.T)

    # Create labels for axes
    if labels is not None:
        tick_labels = [f"{i} (L{labels[i]})" for i in indices]
    else:
        tick_labels = [str(i) for i in indices]

    # Create heatmap
    fig = go.Figure(data=go.Heatmap(
        z=similarity_matrix,
        x=tick_labels,
        y=tick_labels,
        # plotly.py's 'RdBu' runs red (low) -> blue (high); the reversed scale makes
        # high similarity red, which is how this notebook describes the plot.
        colorscale='RdBu_r',
        zmid=0.7,  # center the colorscale at typical similarity threshold
        colorbar=dict(title="Cosine Similarity")
    ))

    fig.update_layout(
        title=f'Pairwise Similarity Heatmap ({len(indices)} items)',
        xaxis_title='Item Index',
        yaxis_title='Item Index',
        width=800,
        height=800
    )

    fig.show()
    return similarity_matrix

In [None]:
# Example: rank the 10 items closest to item 0 and show them as a table
df_similar = find_most_similar(
    embeddings=emb,
    query_idx=0,
    top_k=10,
    issues=issues,
)
display(df_similar)

In [None]:
# Pairwise similarity heatmap over the first 50 items
# Axis ticks show each item's index together with its ground-truth label
# Colours follow the function's diverging colorscale, centred at 0.7

sim_matrix = visualize_similarity_heatmap(
    embeddings=emb,
    indices=None,  # None => first `max_items` items
    labels=true_labels,
    max_items=50,
)