In [1]:
import numpy as np
from dataclasses import dataclass, field
from typing import List, Optional, Tuple

In [2]:
def l2_normalize(X: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    return X / np.maximum(norms, eps)

def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    # a, b: (d,)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))

In [3]:
def spherical_kmeans(X: np.ndarray, k: int, iters: int = 20, seed: int = 0) -> Tuple[np.ndarray, np.ndarray]:
    """
    X: (n,d) asumimos L2-normalizado; devuelve (labels, centroids(L2=1))
    """
    n, d = X.shape
    rng = np.random.default_rng(seed)
    # init++ simple: elige k puntos aleatorios
    centroids = X[rng.choice(n, size=k, replace=False)].copy()

    for _ in range(iters):
        # asignación
        sims = X @ centroids.T               # (n,k)
        labels = np.argmax(sims, axis=1)     # (n,)
        # actualización
        new_centroids = np.zeros_like(centroids)
        counts = np.bincount(labels, minlength=k)
        for j in range(k):
            idx = (labels == j)
            if counts[j] == 0:
                # reinit a un punto aleatorio si vacío
                new_centroids[j] = X[rng.integers(0, n)]
            else:
                c = X[idx].mean(axis=0)
                # normaliza para mantener métrica de coseno
                nc = np.linalg.norm(c)
                new_centroids[j] = c / (nc + 1e-12)
        if np.allclose(new_centroids, centroids, atol=1e-6):
            centroids = new_centroids
            break
        centroids = new_centroids
    # asignación final
    sims = X @ centroids.T
    labels = np.argmax(sims, axis=1)
    return labels, l2_normalize(centroids)

In [179]:
import numpy as np

def _cluster_anisotropy(Xc: np.ndarray, eps: float = 1e-8) -> float:
    """
    Xc: puntos del cluster centrados (n_k, d) en float32.
    Devuelve A = 1 - sphericity in [0,1).
    """
    if Xc.shape[0] <= 1:
        return 0.0
    # Covarianza vía momentos (sin formar la matriz completa)
    # tr(Sigma) = sum_j Var_j ;  tr(Sigma^2) = ||Sigma||_F^2
    # Calculamos Sigma explícita solo si d es moderada; para d muy grande,
    # puedes estimar ||Sigma||_F^2 por muestreo.
    C = np.cov(Xc, rowvar=False)
    tr1 = np.trace(C)
    tr2 = np.sum(C * C)  # Frobenius^2
    d = C.shape[0]
    if tr2 <= eps:
        return 0.0
    sphericity = (tr1 * tr1) / (d * tr2 + eps)
    sphericity = float(np.clip(sphericity, 0.0, 1.0))
    return 1.0 - sphericity  # anisotropía

def spherical_kmeans_iso(X: np.ndarray, k: int, iters: int = 20, seed: int = 0,
                         lambda_iso: float = 0.02, reg_mu: float = 1e-3):
    """
    Spherical K-Means con sesgo de asignación por anisotropía de cluster.
    X: (n,d) L2-normalizado.
    k: nº de clusters.
    lambda_iso: peso del regularizador (0 => K-Means estándar).
    """
    n, d = X.shape
    rng = np.random.default_rng(seed)
    # init: elige k puntos aleatorios como centroides
    centroids = X[rng.choice(n, size=k, replace=False)].copy()

    for _ in range(iters):
        # ----- ASIGNACIÓN con sesgo por anisotropía -----
        # Similitud coseno equivale a distancia euclídea en L2=1:
        # argmax cos <=> argmin ||x - c||^2 = 2(1 - cos) (constante por x)
        sims = X @ centroids.T                    # (n,k)
        # sesgo por cluster (igual para todos los puntos, cambia cada iter.)
        # estimamos anisotropía del cluster "actual" usando las asignaciones previas;
        # en la 1ª iteración no tenemos labels: usa 0.
        if _ == 0 or 'labels' not in locals():
            bias = np.zeros(k, dtype=np.float32)
        else:
            bias = np.zeros(k, dtype=np.float32)
            for j in range(k):
                idx = (labels == j)
                if not np.any(idx):
                    bias[j] = 0.0
                else:
                    Xj = X[idx]
                    mu = Xj.mean(axis=0, dtype=np.float32)
                    Xc = (Xj - mu).astype(np.float32)
                    A = _cluster_anisotropy(Xc)
                    bias[j] = lambda_iso * A
        # Convertimos sesgo a espacio “distancia”: distancia_eff ≈ 2(1 - cos) + bias
        # Como 2 y el término constante no afectan al argmin, basta con:
        eff = -sims + bias  # menor es mejor
        labels = np.argmin(eff, axis=1)

        # ----- ACTUALIZACIÓN de centroides -----
        new_centroids = np.zeros_like(centroids)
        counts = np.bincount(labels, minlength=k)
        for j in range(k):
            idx = (labels == j)
            if counts[j] == 0:
                new_centroids[j] = X[rng.integers(0, n)]
            else:
                c = X[idx].mean(axis=0)
                # opcional: pequeña retracción hacia 0 para evitar medios muy sesgados
                if reg_mu > 0:
                    c = c * (1.0 - reg_mu)
                new_centroids[j] = c / (np.linalg.norm(c) + 1e-12)

        # parada por convergencia de centroides
        if np.allclose(new_centroids, centroids, atol=1e-6):
            centroids = new_centroids
            break
        centroids = new_centroids

    # asignación final
    sims = X @ centroids.T
    labels = np.argmax(sims, axis=1)
    return labels, (centroids / (np.linalg.norm(centroids, axis=1, keepdims=True) + 1e-12))


In [253]:
@dataclass
class Node:
    is_leaf: bool
    centroids: Optional[np.ndarray] = None     # (B,d) si interno
    children: List["Node"] = field(default_factory=list)
    idxs: Optional[np.ndarray] = None          # ids en hoja
    nid: int = -1

class KMeansTree:
    def __init__(self, B: int = 8, max_depth: int = 3, min_leaf_size: int = 256, kmeans_iters: int = 20, seed: int = 0, iso: bool = False):
        """
        B: branching factor
        max_depth: profundidad máxima del árbol
        min_leaf_size: tamaño mínimo para no seguir dividiendo
        """
        self.B = B
        self.max_depth = max_depth
        self.min_leaf_size = min_leaf_size
        self.kmeans_iters = kmeans_iters
        self.seed = seed
        self._next_id = 0
        self.iso = iso
        self.root: Optional[Node] = None
        self.X: Optional[np.ndarray] = None  # embeddings normalizados
        self._id2leaf = None   # dict: int -> Node
        self._id2path = None   # dict: int -> tuple(int, ...)

    # -------- build --------
    # --- helpers internos ---
    def _build_id_maps(self):
        """Construye mapas doc_id -> hoja y doc_id -> path (tupla de índices hijo)."""
        self._id2leaf = {}
        self._id2path = {}

        def dfs(node, path_prefix):
            if node.is_leaf:
                if node.idxs is None:
                    return
                for doc_id in node.idxs.tolist():
                    self._id2leaf[doc_id] = node
                    self._id2path[doc_id] = tuple(path_prefix)
                return
            # recorre hijos guardando el índice de hijo en la ruta
            for child_idx, ch in enumerate(node.children):
                dfs(ch, path_prefix + [child_idx])

        dfs(self.root, [])

    def fit(self, X: np.ndarray):
        """
        X: (n,d) embeddings (se normalizan internamente a L2=1)
        """
        X = l2_normalize(X.astype(np.float32))
        self.X = X
        n = X.shape[0]
        idxs = np.arange(n, dtype=np.int64)
        self.root = self._build_recursive(idxs, depth=0, seed=self.seed)
        self._build_id_maps()

    def _new_node(self, **kwargs):
        node = Node(**kwargs)
        node.nid = self._next_id
        self._next_id += 1
        return node 

    def _build_recursive(self, idxs: np.ndarray, depth: int, seed: int) -> Node:
        # condición de hoja
        if depth >= self.max_depth or len(idxs) <= self.min_leaf_size:
            return self._new_node(is_leaf=True, idxs=idxs)

        X_sub = self.X[idxs]
        k = min(self.B, max(1, len(idxs)))  # por si hay pocos puntos
        if k == 1:
            return self._new_node(is_leaf=True, idxs=idxs)

        if self.iso:
            labels, centroids = spherical_kmeans_iso(X_sub, k=k, iters=self.kmeans_iters, seed=seed)
        else:
            labels, centroids = spherical_kmeans(X_sub, k=k, iters=self.kmeans_iters, seed=seed)
        children = []
        for j in range(k):
            child_idxs = idxs[labels == j]
            if len(child_idxs) == 0:
                # crea hoja vacía para mantener aridad (opcional)
                children.append(self._new_node(is_leaf=True, idxs=np.array([], dtype=np.int64)))
            else:
                children.append(self._build_recursive(child_idxs, depth + 1, seed + j + 1))

        # Si hay menos hijos que B por pocos datos, centramos en k real
        node = self._new_node(is_leaf=False, centroids=centroids[:k], children=children)
        return node
    
    # --- API pública ---
    def get_leaf_path(self, doc_id: int):
        """
        Devuelve la ruta desde la raíz hasta la hoja que contiene doc_id
        como lista de índices de hijo [i0, i1, ..., iL-1].
        Lanza KeyError si doc_id no existe.
        """
        if self._id2path is None:
            raise RuntimeError("El índice aún no ha sido construido. Llama a fit() primero.")
        try:
            return list(self._id2path[int(doc_id)])
        except KeyError:
            raise KeyError(f"doc_id {doc_id} no existe en este árbol")

    def get_leaf_node(self, doc_id: int):
        """
        Devuelve el objeto Node (hoja) que contiene doc_id.
        Lanza KeyError si doc_id no existe.
        """
        if self._id2leaf is None:
            raise RuntimeError("El índice aún no ha sido construido. Llama a fit() primero.")
        try:
            return self._id2leaf[int(doc_id)]
        except KeyError:
            raise KeyError(f"doc_id {doc_id} no existe en este árbol")

    # -------- query --------
    def query(self, q: np.ndarray, topN: int = 10) -> List[Tuple[int, float]]:
        """
        q: (d,) embedding de consulta (se normaliza)
        Devuelve lista de (idx, similitud) ordenada desc.
        """
        assert self.root is not None and self.X is not None, "Primero llama a fit()"
        q = l2_normalize(q.reshape(1, -1).astype(np.float32))[0]
        leaf = self._route_greedy(q, self.root)
        if leaf.idxs.size == 0:
            return []
        X_leaf = self.X[leaf.idxs]  # ya normalizados
        sims = X_leaf @ q
        # topN parcial eficiente
        k = min(topN, sims.size)
        top_idx = np.argpartition(-sims, k - 1)[:k]
        pairs = list(zip(leaf.idxs[top_idx].tolist(), sims[top_idx].tolist()))
        # ordena exacto
        pairs.sort(key=lambda t: -t[1])
        return pairs

    def _route_greedy(self, q: np.ndarray, node: Node) -> Node:
        while not node.is_leaf:
            C = node.centroids  # (k,d)
            sims = C @ q        # coseno (centroides y q están L2=1)
            j = int(np.argmax(sims))
            node = node.children[j]
        return node
    
    def print_tree(self, max_children: int = 8):
        assert self.root is not None
        def _rec(node, depth):
            indent = "  " * depth
            if node.is_leaf:
                size = 0 if node.idxs is None else len(node.idxs)
                print(f"{indent}• [Leaf nid={node.nid}] size={size}")
            else:
                k = 0 if node.centroids is None else node.centroids.shape[0]
                print(f"{indent}◦ [Inner nid={node.nid}] k={k}")
                # por si hay muchos hijos, recorta la muestra visual
                children = node.children
                if len(children) > max_children:
                    head = children[:max_children//2]
                    tail = children[-max_children//2:]
                    for ch in head: _rec(ch, depth+1)
                    print(f"{indent}  ... ({len(children)-len(head)-len(tail)} hijos omitidos) ...")
                    for ch in tail: _rec(ch, depth+1)
                else:
                    for ch in children:
                        _rec(ch, depth+1)
        _rec(self.root, 0)

    def tree_stats(self):
        assert self.root is not None
        num_nodes = 0
        num_leaves = 0
        max_depth = 0
        leaf_sizes = []

        def _rec(node, depth):
            nonlocal num_nodes, num_leaves, max_depth
            num_nodes += 1
            max_depth = max(max_depth, depth)
            if node.is_leaf:
                num_leaves += 1
                leaf_sizes.append(0 if node.idxs is None else len(node.idxs))
            else:
                for ch in node.children:
                    _rec(ch, depth+1)
        _rec(self.root, 0)
        leaf_sizes = np.array(leaf_sizes)
        return {
            "num_nodes": num_nodes,
            "num_leaves": num_leaves,
            "max_depth": max_depth,
            "leaf_size_min": int(leaf_sizes.min()) if len(leaf_sizes) else 0,
            "leaf_size_med": float(np.median(leaf_sizes)) if len(leaf_sizes) else 0,
            "leaf_size_max": int(leaf_sizes.max()) if len(leaf_sizes) else 0,
        }

In [254]:
class KMeansTreeBeamSearch(KMeansTree):
    def __init__(
        self,
        B: int = 8,
        max_depth: int = 3,
        min_leaf_size: int = 256,
        kmeans_iters: int = 20,
        seed: int = 0,
        iso: bool = False,
        # --- nuevos parámetros de búsqueda ---
        beam: int = 3,              # ancho máximo del beam
        tau_margin: float = 0.02,   # margen adaptativo s1-s2
        last_level_probe: int = 2,  # nº extra de hojas a probar en el último nivel
        last_level_delta: float = 0.01  # umbral de similitud respecto al mejor centroide
    ):
        super().__init__(B, max_depth, min_leaf_size, kmeans_iters, seed, iso)
        # búsqueda mejorada
        self.beam = max(1, beam)
        self.tau_margin = float(tau_margin)
        self.last_level_probe = max(0, last_level_probe)
        self.last_level_delta = float(last_level_delta)

     # -------- routing mejorado --------
    def _route_adaptive_beam(self, q: np.ndarray) -> List[Node]:
        assert self.root is not None
        frontier = [self.root]
        depth = 0

        while True:
            new_frontier: List[Node] = []
            all_leaves = True

            for node in frontier:
                if node.is_leaf:
                    new_frontier.append(node)
                    continue

                all_leaves = False
                C = node.centroids  # (k,d)
                sims = C @ q
                order = np.argsort(-sims)

                s1 = sims[order[0]]
                s2 = sims[order[1]] if len(order) > 1 else -1.0
                local_beam = 1 if (s1 - s2) >= self.tau_margin else self.beam

                # ¿sus hijos son hojas? (último salto)
                next_is_leaf = all(ch.is_leaf for ch in node.children)

                if not next_is_leaf:
                    chosen = order[:local_beam]
                    new_frontier.extend([node.children[i] for i in chosen])
                else:
                    # multi-probe: añade hasta last_level_probe extras
                    chosen = order[:local_beam]
                    # añade vecinos cercanos por umbral Δ
                    if self.last_level_probe > 0:
                        s_best = sims[order[0]]
                        extra = []
                        for j in order[local_beam:]:
                            if s_best - sims[j] <= self.last_level_delta:
                                extra.append(j)
                                if len(extra) >= self.last_level_probe:
                                    break
                        chosen = np.concatenate([chosen, np.array(extra, dtype=int)]) if len(extra) else chosen
                    new_frontier.extend([node.children[i] for i in chosen])

            frontier = new_frontier
            depth += 1
            if all_leaves or depth >= self.max_depth:
                break

            # si por beam nos quedamos sin nodos (poco probable), salimos
            if len(frontier) == 0:
                break

        # aquí frontier debe ser una lista de hojas candidatas
        # quitamos duplicadas por si algún nodo se añadió dos veces
        unique_leaves = []
        seen = set()
        for lf in frontier:
            key = id(lf)
            if key not in seen:
                seen.add(key)
                unique_leaves.append(lf)
        return unique_leaves

    # -------- búsqueda sobre hojas candidatas --------
    def query(self, q: np.ndarray, topN: int = 10) -> List[Tuple[int, float]]:
        assert self.root is not None and self.X is not None, "Primero llama a fit()"
        q = l2_normalize(q.reshape(1, -1).astype(np.float32))[0]
        leaves = self._route_adaptive_beam(q)

        cand_ids = []
        for leaf in leaves:
            if leaf.idxs is None or leaf.idxs.size == 0:
                continue
            cand_ids.extend(leaf.idxs.tolist())
        if not cand_ids:
            return []

        cand_ids = np.unique(np.array(cand_ids, dtype=np.int64))
        sims = self.X[cand_ids] @ q
        k = min(topN, sims.size)
        top_idx = np.argpartition(-sims, k - 1)[:k]
        pairs = list(zip(cand_ids[top_idx].tolist(), sims[top_idx].tolist()))
        pairs.sort(key=lambda t: -t[1])
        return pairs

In [255]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    model_kwargs={"device": "cuda"}
)

loaded_vectorstore=FAISS.load_local(
    "../data/db/parliament_db/parliament_all_docs_embeddings_sentence-transformers_paraphrase-multilingual-mpnet-base-v2",
    model,
    allow_dangerous_deserialization=True
)

print(f"Loaded vector store contains {loaded_vectorstore.index.ntotal} vectors")

n = loaded_vectorstore.index.ntotal
d = loaded_vectorstore.index.d  # dimensión de los embeddings

Loaded vector store contains 11162 vectors


In [256]:
# get metadata of the index
pos_to_store_id = [loaded_vectorstore.index_to_docstore_id[i] for i in range(n)]

doc_ids = []
for store_id in pos_to_store_id:
    doc = loaded_vectorstore.docstore.search(store_id)  # recupera el Document
    md = getattr(doc, "metadata", {}) if doc is not None else {}
    # Ajusta la prioridad de claves según cómo lo guardaste
    for key in ("id", "doc_id", "document_id", "uid"):
        if key in md:
            doc_ids.append(md[key])
            break
    else:
        # Si no hay id en metadata, usa el store_id como fallback
        doc_ids.append(store_id)
pos_to_doc_id = {i: doc_ids[i] for i in range(n)}

In [257]:
X = np.array([emb for emb in loaded_vectorstore.index.reconstruct_n(0, loaded_vectorstore.index.ntotal)])
print(f"All embeddings shape: {X.shape}")

All embeddings shape: (11162, 768)


In [380]:
tree = KMeansTree(B=8, max_depth=1, min_leaf_size=10, kmeans_iters=20, seed=42, iso=False)
tree.fit(X)
tree.print_tree(max_children=10)
tree.tree_stats()

◦ [Inner nid=8] k=8
  • [Leaf nid=0] size=1731
  • [Leaf nid=1] size=366
  • [Leaf nid=2] size=680
  • [Leaf nid=3] size=1081
  • [Leaf nid=4] size=2986
  • [Leaf nid=5] size=2024
  • [Leaf nid=6] size=1101
  • [Leaf nid=7] size=1193


{'num_nodes': 9,
 'num_leaves': 8,
 'max_depth': 1,
 'leaf_size_min': 366,
 'leaf_size_med': 1147.0,
 'leaf_size_max': 2986}

In [381]:
doc_id = 3
path = tree.get_leaf_path(doc_id)   # p.ej. [0, 3, 1]
leaf = tree.get_leaf_node(doc_id)   # nodo hoja que contiene ese id
print("ruta:", path, "id:", "".join(str(x) for x in path))
print("tamaño hoja:", len(leaf.idxs))

ruta: [5] id: 5
tamaño hoja: 2024


In [382]:
from datasets import load_from_disk

dataset = load_from_disk("../data/processed/parliament_qa")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'response', 'cost', 'documents', 'type', 'retrieved_pks', 'oracle_context', 'formatted_context'],
        num_rows: 614
    })
    validation: Dataset({
        features: ['id', 'question', 'response', 'cost', 'documents', 'type', 'retrieved_pks', 'oracle_context', 'formatted_context'],
        num_rows: 161
    })
    test: Dataset({
        features: ['question', 'id', 'response', 'type', 'retrieved_pks', 'oracle_context', 'injected_oracle', 'formatted_context', 'documents'],
        num_rows: 205
    })
})

In [383]:
# create labels in dataset using the tree
def assign_labels(example):
    doc_id = example['id']
    id = next((k for k, v in pos_to_doc_id.items() if v == doc_id), None)
    try:
        path = tree.get_leaf_path(id)
        label = "".join(str(x) for x in path)
    except KeyError:
        label = "unknown"
    return {"label": label}

dataset = dataset.map(assign_labels)

Map: 100%|██████████| 614/614 [00:00<00:00, 4759.96 examples/s]
Map: 100%|██████████| 161/161 [00:00<00:00, 4434.80 examples/s]
Map: 100%|██████████| 205/205 [00:00<00:00, 1901.95 examples/s]


In [384]:
len(set(dataset['train'][:]['label']))

8

In [386]:
from tqdm import tqdm

labels_real = []
labels_predicted = []
topN = 100
for idx in tqdm(range(len(dataset['test']))):
    query = dataset['test'][idx]['question']
    id_real = dataset['test'][idx]['id']
    query_emb = np.array(model.embed_query(query))
    results = tree.query(query_emb, topN=topN)
    labels_predicted.append([pos_to_doc_id[idx] for idx, sim in results])
    labels_real.append([id_real])    

  0%|          | 0/205 [00:00<?, ?it/s]


AttributeError: 'Qwen3ForSequenceClassification' object has no attribute 'embed_query'

In [316]:
from ranking_metrics import calc_ranking_metrics

metrics = calc_ranking_metrics(labels_predicted, labels_real, ks=[1, 5, 10, 20, 100], one_relevant_per_query=True)

print("Ranking Metrics:")
for k, v in metrics.items():
    print(f"  {k}: {v:.4f}")

Ranking Metrics:
  MRR: 0.2741
  mAP: 0.2741
  AvgRank: 7.9315
  CMC@1: 0.2537
  Recall@k (macro)@1: 0.2537
  Precision@k (macro)@1: 0.2537
  Accuracy@1: 0.2537
  F1@k (macro)@1: 0.2537
  CMC@5: 0.2927
  Recall@k (macro)@5: 0.2927
  Precision@k (macro)@5: 0.0585
  Accuracy@5: 0.2927
  F1@k (macro)@5: 0.0976
  CMC@10: 0.3073
  Recall@k (macro)@10: 0.3073
  Precision@k (macro)@10: 0.0307
  Accuracy@10: 0.3073
  F1@k (macro)@10: 0.0559
  CMC@20: 0.3171
  Recall@k (macro)@20: 0.3171
  Precision@k (macro)@20: 0.0159
  Accuracy@20: 0.3171
  F1@k (macro)@20: 0.0302
  CMC@100: 0.3561
  Recall@k (macro)@100: 0.3561
  Precision@k (macro)@100: 0.0036
  Accuracy@100: 0.3561
  F1@k (macro)@100: 0.0071


## LLM for routing

In [387]:
from datasets import DatasetDict, concatenate_datasets
dataset_clf = {}
dataset_clf['train'] = dataset['train'].remove_columns([col for col in dataset['train'].column_names if col not in ['question', 'label']])
dataset_clf['validation'] = dataset['validation'].remove_columns([col for col in dataset['validation'].column_names if col not in ['question', 'label']])
dataset_clf['test'] = dataset['test'].remove_columns([col for col in dataset['test'].column_names if col not in ['question', 'label']])

dataset_clf = DatasetDict(dataset_clf)

In [388]:
dataset_clf = dataset_clf.rename_column("question", "text")
dataset_clf

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 614
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 161
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 205
    })
})

In [389]:
all_data = concatenate_datasets([dataset_clf['train'], dataset_clf['validation'], dataset_clf['test']])
num_labels = len(all_data['label'])
print(f"Number of labels: {num_labels}")
labels_list = all_data.unique('label')
print(f"Labels: {labels_list}")

# map labels to integers
label_to_id = {label: i for i, label in enumerate(labels_list)}
def map_labels(example):
    return {
        "label": label_to_id[example['label']]
    }
dataset_clf = dataset_clf.map(map_labels)

Number of labels: 980
Labels: ['6', '3', '7', '0', '4', '1', '2', '5']


Map: 100%|██████████| 614/614 [00:00<00:00, 47970.62 examples/s]
Map: 100%|██████████| 161/161 [00:00<00:00, 34910.97 examples/s]
Map: 100%|██████████| 205/205 [00:00<00:00, 37029.82 examples/s]


In [390]:
dataset_clf["train"] = concatenate_datasets([dataset_clf['train'], dataset_clf['validation']]).shuffle(seed=42)                                            

In [419]:
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model

model_name = "Qwen/Qwen3-0.6B"
MAX_LENGTH = 1024

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [420]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    low_cpu_mem_usage=True,
    device_map={"": 0}
)
model.config.pad_token_id = tokenizer.pad_token_id

OutOfMemoryError: CUDA out of memory. Tried to allocate 298.00 MiB. GPU 0 has a total capacity of 23.99 GiB of which 0 bytes is free. Of the allocated memory 22.72 GiB is allocated by PyTorch, and 287.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [416]:
lora_r = 512
lora_alpha = lora_r * 2
lora_dropout = 0.0
lora_bias = "none"
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj"]

In [417]:
config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=lora_bias,
    target_modules=target_modules
)
model = get_peft_model(model, config)

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 23.99 GiB of which 0 bytes is free. Of the allocated memory 22.72 GiB is allocated by PyTorch, and 287.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [409]:
for p in model.base_model.model.score.parameters():
    p.requires_grad_(True)

In [410]:
def preprocess(examples):
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH, padding=False)

# tokenize test from dataset
tokenized_datasets = dataset_clf.map(preprocess, batched=True)


Map: 100%|██████████| 161/161 [00:00<00:00, 19201.63 examples/s]


In [411]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

def softmax(x, axis=-1):
    x = np.asarray(x, dtype=np.float64)
    # Restar el máximo para evitar overflow
    x_shift = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(x_shift)
    return exps / np.sum(exps, axis=axis, keepdims=True)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = (preds == labels).mean()

    return {
        "accuracy": acc
    }

In [412]:
SEED = 42
EPOCHS = 10
BATCH_SIZE = 8

In [413]:
training_args = TrainingArguments(
    output_dir=f"models/parlamento_clf_{model_name.replace('/', '_')}_cluster",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    num_train_epochs=EPOCHS,
    learning_rate=5e-5,
    weight_decay=0.05,
    warmup_ratio=0.2,
    lr_scheduler_type="cosine",
    eval_strategy="steps",
    save_strategy="no",        # <-- no guarda checkpoints ni el modelo final
    eval_steps=10,
    logging_steps=10,
    load_best_model_at_end=False,  # <-- desactivado porque no hay checkpoints
    fp16=True,
    report_to="none",
    seed=SEED,
    label_smoothing_factor=0.1,
)

# IDs de tokens
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.bos_token_id = tokenizer.bos_token_id

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [414]:
model.print_trainable_parameters()  # Verificar parámetros entrenables

trainable params: 530,489,344 || all params: 1,126,539,264 || trainable%: 47.0902


In [415]:
trainer.train()
metrics = trainer.evaluate(eval_dataset=tokenized_datasets["test"])

Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 23.99 GiB of which 0 bytes is free. Of the allocated memory 22.72 GiB is allocated by PyTorch, and 287.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [404]:
metrics

{'eval_loss': 3.4565346240997314,
 'eval_accuracy': 0.4097560975609756,
 'eval_runtime': 1.5787,
 'eval_samples_per_second': 129.85,
 'eval_steps_per_second': 16.469,
 'epoch': 10.0}

## Other

In [246]:
tree_beam = KMeansTreeBeamSearch(B=10, max_depth=2, min_leaf_size=20, kmeans_iters=100, beam=10, tau_margin=0.01, last_level_probe=10, last_level_delta=0.01, seed=42, iso=True)
tree_beam.fit(X)
stats = tree_beam.tree_stats()
print("Estadísticas del árbol con beam search:", stats)

Estadísticas del árbol con beam search: {'num_nodes': 111, 'num_leaves': 100, 'max_depth': 2, 'leaf_size_min': 2, 'leaf_size_med': 78.5, 'leaf_size_max': 463}


In [247]:
labels_real = []
labels_predicted = []
topN = 100
for idx in tqdm(range(len(dataset['test']))):
    query = dataset['test'][idx]['question']
    id_real = dataset['test'][idx]['id']
    query_emb = np.array(model.embed_query(query))
    results = tree_beam.query(query_emb, topN=topN)
    labels_predicted.append([pos_to_doc_id[idx] for idx, sim in results])
    labels_real.append([id_real])

100%|██████████| 205/205 [00:02<00:00, 78.26it/s]


In [248]:
metrics = calc_ranking_metrics(labels_predicted, labels_real, ks=[1, 5, 10, 20, 100], one_relevant_per_query=True)

print("Ranking Metrics:")
for k, v in metrics.items():
    print(f"  {k}: {v:.4f}")

Ranking Metrics:
  MRR: 0.3442
  mAP: 0.3442
  AvgRank: 6.8261
  CMC@1: 0.3171
  Recall@k (macro)@1: 0.3171
  Precision@k (macro)@1: 0.3171
  Accuracy@1: 0.3171
  F1@k (macro)@1: 0.3171
  CMC@5: 0.3756
  Recall@k (macro)@5: 0.3756
  Precision@k (macro)@5: 0.0751
  Accuracy@5: 0.3756
  F1@k (macro)@5: 0.1252
  CMC@10: 0.3951
  Recall@k (macro)@10: 0.3951
  Precision@k (macro)@10: 0.0395
  Accuracy@10: 0.3951
  F1@k (macro)@10: 0.0718
  CMC@20: 0.4098
  Recall@k (macro)@20: 0.4098
  Precision@k (macro)@20: 0.0205
  Accuracy@20: 0.4098
  F1@k (macro)@20: 0.0390
  CMC@100: 0.4488
  Recall@k (macro)@100: 0.4488
  Precision@k (macro)@100: 0.0045
  Accuracy@100: 0.4488
  F1@k (macro)@100: 0.0089


In [249]:
from itertools import product
VALUES_B = [5, 10, 20, 40, 80]
VALUES_MAX_DEPTH = [1, 2, 3, 4]
VALUES_MIN_LEAF_SIZE = [5, 10, 20, 40, 80, 100, 200]
VALUES_KMEANS_ITERS = [5, 10, 20, 40, 80]
# contruir combinaciones en una lista
param_combinations = list(product(VALUES_B, VALUES_MAX_DEPTH, VALUES_MIN_LEAF_SIZE, VALUES_KMEANS_ITERS))
print(f"Total parameter combinations to try: {len(param_combinations)}")

Total parameter combinations to try: 700


In [252]:
best_mrr = 0.0
for b, depth, min_size, iters in tqdm(param_combinations):
    #print(f"Probando B={b}, max_depth={depth}, min_leaf_size={min_size}, kmeans_iters={iters}")
    tree = KMeansTreeBeamSearch(B=b, max_depth=depth, min_leaf_size=min_size, kmeans_iters=iters, seed=42)
    tree.fit(X)
    
    labels_predicted = []

    for idx in range(len(dataset['test'])):
        query = dataset['test'][idx]['question']
        query_emb = np.array(model.embed_query(query))
        results = tree.query(query_emb, topN=topN)
        labels_predicted.append([pos_to_doc_id[idx] for idx, sim in results])
    
    metrics = calc_ranking_metrics(labels_predicted, labels_real, ks=[1, 5, 10, 20, 100], one_relevant_per_query=True)
    
    if metrics['MRR'] > best_mrr:
        best_mrr = metrics['MRR']
        print(f"MRR:{metrics['MRR']:.4f} @ B={b}, max_depth={depth}, min_leaf_size={min_size}, kmeans_iters={iters}")

print(f"Best MRR: {best_mrr:.4f}")

  0%|          | 1/700 [00:03<36:37,  3.14s/it]

MRR:0.3664 @ B=5, max_depth=1, min_leaf_size=5, kmeans_iters=5


  0%|          | 2/700 [00:05<34:09,  2.94s/it]

MRR:0.3990 @ B=5, max_depth=1, min_leaf_size=5, kmeans_iters=10


 20%|██        | 142/700 [06:51<26:26,  2.84s/it]

MRR:0.4048 @ B=10, max_depth=1, min_leaf_size=5, kmeans_iters=10


 74%|███████▍  | 519/700 [20:47<07:15,  2.40s/it]


KeyboardInterrupt: 

In [81]:
labels_predicted = []
retriever=loaded_vectorstore.as_retriever(search_kwargs={"k":topN})

for idx in tqdm(range(len(dataset['test']))):
    query = dataset['test'][idx]['question']
    docs = retriever.get_relevant_documents(query)
    labels_predicted.append([doc.metadata['id'] for doc in docs])



100%|██████████| 205/205 [00:01<00:00, 108.46it/s]


In [82]:
metrics = calc_ranking_metrics(labels_predicted, labels_real, ks=[1, 5, 10, 20, 100], one_relevant_per_query=True)

print("Ranking Metrics:")
for k, v in metrics.items():
    print(f"  {k}: {v:.4f}")

Ranking Metrics:
  MRR: 0.4780
  mAP: 0.4780
  AvgRank: 8.3931
  CMC@1: 0.4049
  Recall@k (macro)@1: 0.4049
  Precision@k (macro)@1: 0.4049
  Accuracy@1: 0.4049
  F1@k (macro)@1: 0.4049
  CMC@5: 0.5463
  Recall@k (macro)@5: 0.5463
  Precision@k (macro)@5: 0.1093
  Accuracy@5: 0.5463
  F1@k (macro)@5: 0.1821
  CMC@10: 0.5902
  Recall@k (macro)@10: 0.5902
  Precision@k (macro)@10: 0.0590
  Accuracy@10: 0.5902
  F1@k (macro)@10: 0.1073
  CMC@20: 0.6146
  Recall@k (macro)@20: 0.6146
  Precision@k (macro)@20: 0.0307
  Accuracy@20: 0.6146
  F1@k (macro)@20: 0.0585
  CMC@100: 0.7073
  Recall@k (macro)@100: 0.7073
  Precision@k (macro)@100: 0.0071
  Accuracy@100: 0.7073
  F1@k (macro)@100: 0.0140
