# Clustering Exploration

Este notebook explora diferentes configuraciones de UMAP + HDBSCAN para encontrar clusters de intents.

In [None]:
import numpy as np
import pandas as pd
import hdbscan
import umap
import matplotlib.pyplot as plt
from google.cloud import bigquery
from src.config import get_settings

# Resolve the project configuration once, then bind the BigQuery client to the
# GCP project id it names. Both `settings` and `client` are reused further down.
settings = get_settings()
gcp_project = settings.gcp_project_id
client = bigquery.Client(project=gcp_project)

In [None]:
# Load embeddings
# Fetch every (ticket_id, embedding_vector) row from the `features.embeddings`
# table of the configured project and materialise the result as a DataFrame.
embeddings_table = f"{settings.gcp_project_id}.features.embeddings"
query = f"""
SELECT ticket_id, embedding_vector
FROM `{embeddings_table}`
"""
query_job = client.query(query)
df_emb = query_job.to_dataframe()
print(f"Loaded {len(df_emb)} embeddings")

In [None]:
# Convert to numpy array
# Pull the per-ticket vectors out of the DataFrame column as a plain Python
# list first, then hand that list to NumPy to build one array.
vector_rows = df_emb['embedding_vector'].tolist()
embeddings = np.array(vector_rows)
print(f"Embedding shape: {embeddings.shape}")

In [None]:
# UMAP reduction for clustering (to 25 dimensions)
# The reducer settings are collected in one dict so they can be read (and
# tweaked) in a single place before the model is built.
umap_params = {
    'n_neighbors': 15,
    'min_dist': 0.1,
    'n_components': 25,
    'metric': 'cosine',
    'random_state': 42,  # fixed seed
}
umap_model = umap.UMAP(**umap_params)
embeddings_reduced = umap_model.fit_transform(embeddings)
print(f"Reduced shape: {embeddings_reduced.shape}")

In [None]:
# HDBSCAN clustering
# min_cluster_size is 3% of the number of embeddings, floored at 20 points.
floor_size = 20
proportional_size = int(len(embeddings) * 0.03)
min_cluster_size = max(floor_size, proportional_size)
print(f"min_cluster_size: {min_cluster_size}")

# Cluster the UMAP-reduced vectors (not the raw embeddings).
hdbscan_params = {
    'min_cluster_size': min_cluster_size,
    'min_samples': 5,
    'metric': 'euclidean',
}
clusterer = hdbscan.HDBSCAN(**hdbscan_params)
cluster_labels = clusterer.fit_predict(embeddings_reduced)

In [None]:
# Cluster statistics
# Label -1 is treated as noise here: it is excluded from the cluster count and
# tallied separately.
distinct_labels = set(cluster_labels)
n_clusters = len(distinct_labels)
if -1 in distinct_labels:
    n_clusters -= 1
n_noise = sum(1 for label in cluster_labels if label == -1)
print(f"Number of clusters: {n_clusters}")

# Share of all points that were labelled as noise, as a percentage.
noise_share = n_noise / len(cluster_labels) * 100
print(f"Noise points: {n_noise} ({noise_share:.1f}%)")

In [None]:
# 2D visualization
# Project the original embeddings to two dimensions purely for plotting.
# The metric is set to 'cosine' so this projection uses the same similarity
# measure as the UMAP reduction the clusters were computed on; without it
# UMAP falls back to its default ('euclidean') and the picture is laid out
# under a different notion of distance than the clustering.
umap_2d = umap.UMAP(n_components=2, metric='cosine', random_state=42)
embeddings_2d = umap_2d.fit_transform(embeddings)

plt.figure(figsize=(12, 8))
# Colour encodes each point's HDBSCAN label; label -1 marks noise points.
points = plt.scatter(
    embeddings_2d[:, 0],
    embeddings_2d[:, 1],
    c=cluster_labels,
    cmap='Spectral',
    s=1,
)
# Pass the scatter artist explicitly so the colorbar is tied to this plot.
plt.colorbar(points)
plt.title('Clusters de Tickets')
plt.show()