In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# === System packages (Octave) ===
# Refresh the apt index and install Octave plus its development headers, non-interactively.
!sudo DEBIAN_FRONTEND=noninteractive apt-get update -yq
!sudo DEBIAN_FRONTEND=noninteractive apt-get install -yq octave liboctave-dev

# Check that Octave starts and print its version:
!octave --quiet --eval "printf('Octave %s\n', version());"

Hit:1 https://cli.github.com/packages stable InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists...
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lis

In [None]:
# === Python (libs) ===
# Sugerencias:
# - --quiet para menos ruido
# - hf-transfer acelera descargas de modelos
# - 'transformers' y 'accelerate' ayudan con flujos HF/vLLM y utilidades
# - 'sentencepiece' para tokenizadores (Qwen, Llama, etc.)
# - 'pandas' √∫til para CSVs de resultados; 'openpyxl' para Excel

!pip install -q --upgrade pip
!pip install -q datasets huggingface_hub transformers accelerate \
  sentencepiece pandas openpyxl hf-transfer \
  vllm math-verify

# Activar transferencia acelerada en HF (m√°s r√°pido al bajar modelos)
!export HF_HUB_ENABLE_HF_TRANSFER=1

In [None]:
# Extra dependencies: pymilvus with the milvus-lite extra, tqdm and math-verify.
# (math-verify is also installed by the previous cell; re-installing is a no-op.)
!pip install pymilvus[milvus-lite]
!pip -q install tqdm
!pip -q install math-verify



In [None]:
# === Quick environment checks ===

import os, sys, subprocess

# Interpreter version and whether the HF transfer flag is visible to this kernel process.
print("Python:", sys.version.split()[0])
print("HF_HUB_ENABLE_HF_TRANSFER:", os.environ.get("HF_HUB_ENABLE_HF_TRANSFER"))

# Visible GPUs (name, total memory, driver version) as reported by nvidia-smi
try:
    out = subprocess.check_output(["nvidia-smi", "--query-gpu=name,memory.total,driver_version", "--format=csv,noheader"])
    print("GPU(s):\n", out.decode().strip())
except Exception as e:
    print("nvidia-smi no disponible:", e)

# Critical imports: report each package individually instead of stopping at the first failure
for pkg in ["datasets","huggingface_hub","transformers","accelerate","vllm","math_verify","openpyxl","pandas"]:
    try:
        __import__(pkg.replace("-", "_"))
        print(f"OK import {pkg}")
    except Exception as e:
        print(f"Fallo import {pkg} -> {e}")

# vLLM sanity check (no server is started; import and version only)
try:
    import vllm
    print("vLLM:", getattr(vllm, "__version__", "unknown"))
except Exception as e:
    print("Error vLLM:", e)

Python: 3.12.12
HF_HUB_ENABLE_HF_TRANSFER: None
GPU(s):
 NVIDIA L4, 23034 MiB, 550.54.15
OK import datasets
OK import huggingface_hub
OK import transformers
OK import accelerate
OK import vllm
OK import math_verify
OK import openpyxl
OK import pandas
vLLM: 0.11.0


In [None]:
# Confirm the octave binary is on PATH and that it can evaluate a trivial expression.
!which octave
!octave --eval "disp(2 + 2)"

/usr/bin/octave
octave: X11 DISPLAY environment variable not set
octave: disabling GUI features
4


# Definiciones de clases

In [None]:
import os
import time
import logging
from typing import List, Dict, Optional, Union

from huggingface_hub import snapshot_download
from vllm import LLM, SamplingParams

# Opcional: transformers solo para chat_template
try:
    from transformers import AutoTokenizer
    _HAS_TRANSFORMERS = True
except Exception:
    _HAS_TRANSFORMERS = False

# Configure logging for the notebook: INFO level, records prefixed with a
# "YYYY-MM-DD HH:MM:SS" timestamp, the level and the logger name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
# Module-level logger used by the VLLMGenerator class.
logger = logging.getLogger("VLLMGenerator")


class VLLMGenerator:
    """Convenience wrapper around a vLLM ``LLM`` engine.

    Provides:
    - Optional download of the model snapshot from the Hugging Face Hub.
    - Prompt formatting through the tokenizer's chat template when the model
      ships one, with a plain ``<|role|>``-tagged fallback otherwise.
    - Default sampling parameters that can be overridden per call.
    """

    def __init__(
        self,
        model_name: str,
        download: bool = False,
        local_dir: Optional[str] = None,
        hf_token: Optional[str] = None,
        temperature: float = 0.0,
        top_p: float = 1.0,
        max_tokens: int = 1024,
        seed: int = 42,
        log_prompts: bool = False,
        # vLLM / GPU settings
        dtype: str = "auto",                    # "auto" | "float16" | "bfloat16" | "float32"
        tensor_parallel_size: int = 1,
        gpu_memory_utilization: float = 0.92,   # 0.90-0.95 works well on a Colab L4
        max_model_len: Optional[int] = None,    # e.g. 8192 to cap the context if needed
        trust_remote_code: bool = True,
    ):
        """Load the tokenizer (best effort) and the vLLM engine.

        Args:
            model_name: Hugging Face repo id or local path of the model.
            download: When True, fetch a snapshot with ``snapshot_download``
                and load the model from the downloaded folder.
            local_dir: Target folder for the download; defaults to the last
                path component of ``model_name``.
            hf_token: Token forwarded to ``snapshot_download`` and to
                ``AutoTokenizer.from_pretrained``.
            temperature, top_p, max_tokens, seed: Defaults stored in
                ``self.sampling_params``.
            log_prompts: Print formatted prompts in ``chat``/``batch_chat``.
            dtype, tensor_parallel_size, gpu_memory_utilization,
            max_model_len, trust_remote_code: Forwarded to ``vllm.LLM``
                (``trust_remote_code`` is also forwarded to the tokenizer).
        """
        self.repo_or_path = model_name
        self.log_prompts = log_prompts
        self._tokenizer = None
        self._use_chat_template = False
        self._eos_token = None
        self._stop_tokens = None  # never populated by this class

        # Optional model download (useful to repeat runs without network latency)
        if download:
            folder = local_dir or model_name.split("/")[-1]
            logger.info("Descargando modelo %s a ./%s ...", model_name, folder)
            self.repo_or_path = snapshot_download(
                repo_id=model_name,
                local_dir=folder,
                local_dir_use_symlinks=False,  # avoid problematic symlinks on Colab/Drive
                token=hf_token
            )
            logger.info("Modelo descargado en: %s", self.repo_or_path)

        # Load the tokenizer for chat templating (when transformers is available)
        if _HAS_TRANSFORMERS:
            try:
                self._tokenizer = AutoTokenizer.from_pretrained(
                    self.repo_or_path,
                    use_fast=True,
                    token=hf_token,
                    trust_remote_code=trust_remote_code
                )
                # The apply_chat_template *method* exists on every HF tokenizer,
                # so its presence says nothing. A model really ships a template
                # only when the `chat_template` attribute is set; otherwise
                # apply_chat_template would raise on every call.
                has_template = bool(getattr(self._tokenizer, "chat_template", None))
                if has_template and callable(getattr(self._tokenizer, "apply_chat_template", None)):
                    self._use_chat_template = True
                    logger.info("Chat template detectado para este modelo.")
                # EOS token doubles as the default stop string below
                self._eos_token = getattr(self._tokenizer, "eos_token", None)
            except Exception as e:
                logger.warning("No se pudo cargar tokenizer (%s). Se usar√° formateo fallback.", e)
        else:
            logger.info("transformers no disponible; usando formateo fallback.")

        # Initialise the vLLM engine
        load_start = time.time()
        self.llm = LLM(
            model=self.repo_or_path,
            dtype=dtype,
            tensor_parallel_size=tensor_parallel_size,
            gpu_memory_utilization=gpu_memory_utilization,
            max_model_len=max_model_len,
            trust_remote_code=trust_remote_code,
            # enforce_eager=True can be added for debugging (slower)
        )
        logger.info("Modelo cargado en %.2f s", time.time() - load_start)

        # Default sampling (can be overridden per call to generate/chat)
        stops = []
        if self._eos_token:
            stops.append(self._eos_token)
        # Model-specific end markers (e.g. "<|im_end|>", "</s>") are not added
        # here; pass a custom SamplingParams to use them.

        self.sampling_params = SamplingParams(
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            seed=seed,
            stop=stops or None
        )
        logger.info("Sampling por defecto: %s", self.sampling_params)

    # ---------- Formatting helpers ----------
    def _format_messages(self, messages: List[Dict[str, str]]) -> str:
        """Turn a list of chat messages into a single prompt string.

        Each message is a dict with ``role`` and ``content``. Roles are
        lower-cased and stripped; anything other than system/user/assistant
        is treated as ``user``. Contents are stripped.

        Uses the tokenizer chat template when one was detected; if that call
        fails (or no template exists) the ``<|role|>``-tagged fallback format
        is produced, ending with an open ``<|assistant|>`` turn.
        """
        # Normalise roles
        norm = []
        for m in messages:
            role = m["role"].strip().lower()
            if role not in {"system", "user", "assistant"}:
                role = "user"
            norm.append({"role": role, "content": m["content"].strip()})

        if self._use_chat_template:
            try:
                # add_generation_prompt=True appends the assistant turn header
                prompt = self._tokenizer.apply_chat_template(
                    norm,
                    tokenize=False,
                    add_generation_prompt=True
                )
                return prompt
            except Exception as e:
                logger.warning("Fallo apply_chat_template (%s). Usando fallback.", e)

        # Fallback format
        prompt = []
        for m in norm:
            if m["role"] == "system":
                prompt.append("<|system|>\n" + m["content"] + "\n")
            elif m["role"] == "user":
                prompt.append("<|user|>\n" + m["content"] + "\n")
            else:
                prompt.append("<|assistant|>\n" + m["content"] + "\n")
        prompt.append("<|assistant|>\n")
        return "".join(prompt)

    # ---------- Public API ----------
    def generate(
        self,
        prompts: List[str],
        sampling: Optional[SamplingParams] = None
    ) -> List[str]:
        """Generate one completion per prompt.

        Args:
            prompts: Already-formatted prompt strings.
            sampling: Sampling parameters; defaults to ``self.sampling_params``.

        Returns:
            The stripped text of the first candidate for each prompt, or ""
            when a request produced no candidates.
        """
        sp = sampling or self.sampling_params
        start = time.time()
        outputs = self.llm.generate(prompts, sp)
        logger.info("Generaci√≥n completada en %.2f s", time.time() - start)

        texts = []
        for out in outputs:
            if out.outputs:
                texts.append(out.outputs[0].text.strip())
            else:
                texts.append("")
        return texts

    def chat(
        self,
        messages: List[Dict[str, str]],
        debug: Optional[bool] = None,
        sampling: Optional[SamplingParams] = None
    ) -> str:
        """Answer a single conversation.

        Args:
            messages: Conversation as role/content dicts.
            debug: True/False forces prompt printing on/off; None defers to
                ``self.log_prompts``.
            sampling: Optional per-call sampling parameters.

        Returns:
            The generated reply, or "" if nothing was returned.
        """
        prompt = self._format_messages(messages)
        should_log = self.log_prompts if debug is None else debug
        if should_log:
            print("üì§ Prompt enviado al modelo\n" + "-"*40 + f"\n{prompt}\n" + "-"*40)
        resp = self.generate([prompt], sampling=sampling)
        return resp[0] if resp else ""

    def batch_chat(
        self,
        conversations: List[List[Dict[str, str]]],
        debug: bool = False,
        sampling: Optional[SamplingParams] = None
    ) -> List[str]:
        """Answer several conversations in one vLLM batch.

        Prompts are printed when ``debug`` is True or ``self.log_prompts`` is
        set. Returns one stripped reply per conversation, in input order.
        """
        prompts = [self._format_messages(msgs) for msgs in conversations]
        if debug or self.log_prompts:
            for i, p in enumerate(prompts, 1):
                print(f"\nüì§ Prompt #{i}\n" + "-"*40 + f"\n{p}\n" + "-"*40)
        results = self.generate(prompts, sampling=sampling)
        return [r.strip() for r in results]

INFO 11-02 17:09:30 [__init__.py:216] Automatically detected platform cuda.


In [None]:
from typing import Dict, List

class ModelRegistry:
    """
    Registry of supported Hugging Face models for vLLMGenerator.

    Holds a fixed mapping from short model keys to Hugging Face repo ids,
    descriptive metadata per key, and helpers that derive a model family,
    default stop strings and default sampling values from a key.
    """
    # Mapping from model key to huggingface repo id
    MODEL_REPOS: Dict[str, str] = {
        "llama3-8b-instruct": "nreHieW/Llama-3.1-8B-Instruct",
        "deepseek-math-7b": "deepseek-ai/deepseek-math-7b-instruct",
        "qwen2-7b": "Qwen/Qwen2-7B",
        "qwen2-7b-instruct": "Qwen/Qwen2-7B-Instruct",
        "qwen2-math-7b-instruct": "Qwen/Qwen2-Math-7B-Instruct",
        "mistral-7b-instruct": "mistralai/Mistral-7B-Instruct-v0.3",
        "mathstral-7b": "mistralai/Mathstral-7B-v0.1",
        "deepseek-coder-7b": "deepseek-ai/deepseek-coder-7b-instruct-v1.5",
        "mathcoder-l-7b": "MathLLMs/MathCoder-L-7B",
        "open-reasoner-zero-7b": "Open-Reasoner-Zero/Open-Reasoner-Zero-7B",
    }

    # Free-text metadata for each model key (informational only; nothing in
    # this class reads these values programmatically).
    MODEL_DETAILS: Dict[str, Dict[str, str]] = {
        "llama3-8b-instruct": {
            "description": "LLaMA-3 8B instruct-tuned model",
            "parameters": "8 billion",
            "context_length": "8192 tokens",
            "suitable_for": "Instruction following tasks with improved reasoning",
        },
        "deepseek-math-7b": {
            "description": "DeepSeek Math 7B instruct model",
            "parameters": "7 billion",
            "context_length": "4096 tokens",
            "suitable_for": "Mathematical problem solving and proofs",
        },
        "qwen2-7b": {
            "description": "Qwen 2 7B base model",
            "parameters": "7 billion",
            "context_length": "4096 tokens",
            "suitable_for": "General purpose text generation",
        },
        "qwen2-7b-instruct": {
            "description": "Qwen 2 7B instruct-tuned model",
            "parameters": "7 billion",
            "context_length": "4096 tokens",
            "suitable_for": "Instruction-based tasks and chat",
        },
        "qwen2-math-7b-instruct": {
            "description": "Qwen 2 Math 7B instruct model",
            "parameters": "7 billion",
            "context_length": "4096 tokens",
            "suitable_for": "Mathematical reasoning and code generation",
        },
        "mistral-7b-instruct": {
            "description": "Mistral 7B Instruct v0.3",
            "parameters": "7 billion",
            "context_length": "8192 tokens",
            "suitable_for": "Instruction following with long context",
        },
        "mathstral-7b": {
            "description": "Mathstral 7B",
            "parameters": "7 billion",
            "context_length": "8192 tokens",
            "suitable_for": "Mathematics-specific tasks",
        },
        "deepseek-coder-7b": {
            "description": "DeepSeek Coder 7B instruct model",
            "parameters": "7 billion",
            "context_length": "4096 tokens",
            "suitable_for": "Code generation and debugging",
        },
        "mathcoder-l-7b": {
            "description": "MathCoder-L-7B: open-source 7B model tailored for mathematical reasoning & code generation",
            "parameters": "7 billion",
            "context_length": "unknown (use safe 8192 tokens)",
            "suitable_for": "Mathematical problem solving with code generation"
        },
        "open-reasoner-zero-7b": {
            "description": "Open-Reasoner-Zero-7B: open-source 7B reasoning-oriented model (RL trained) for math/logic tasks",
            "parameters": "7 billion",
            "context_length": "safe ~8192 tokens",
            "suitable_for": "Logical & mathematical reasoning (chain-of-thought)"
        },
    }

    @classmethod
    def get_supported_models(cls) -> List[str]:
        """Return list of supported model keys."""
        return list(cls.MODEL_REPOS.keys())

    @classmethod
    def get_model_repo(cls, model_key: str) -> str:
        """Given an exact model key, return the HF repository identifier.

        Raises:
            KeyError: if ``model_key`` is not registered (no normalisation
                is applied; see ``safe_get_model_repo`` for a lenient lookup).
        """
        return cls.MODEL_REPOS[model_key]

    @classmethod
    def get_model_details(cls, model_key: str) -> Dict[str, str]:
        """Return metadata for a given model key, or {} when unknown."""
        return cls.MODEL_DETAILS.get(model_key, {})

    @classmethod
    def validate_model_key(cls, model_key: str) -> bool:
        """Check if a model key is registered (exact match)."""
        return model_key in cls.MODEL_REPOS

    @classmethod
    def as_list_of_repos(cls) -> List[str]:
        """Return list of HF repo strings for all supported models."""
        return list(cls.MODEL_REPOS.values())


    # --- Helpers ---
    @classmethod
    def canonical_key(cls, model_key: str) -> str:
        """Normalise a user-supplied key: strip whitespace and lower-case."""
        return model_key.strip().lower()

    @classmethod
    def safe_get_model_repo(cls, model_key: str) -> str:
        """Look up a repo id after normalising the key.

        Raises:
            KeyError: if the normalised key is not registered. The message
                suggests up to three close matches when any exist, otherwise
                it lists every registered key.
        """
        # Imported here because the surrounding cell never imports difflib;
        # without it the "did you mean" path crashed with NameError.
        import difflib

        key = cls.canonical_key(model_key)
        if key in cls.MODEL_REPOS:
            return cls.MODEL_REPOS[key]
        candidates = difflib.get_close_matches(key, list(cls.MODEL_REPOS.keys()), n=3, cutoff=0.5)
        if candidates:
            raise KeyError(f"Modelo '{model_key}' no registrado. ¬øQuisiste decir: {', '.join(candidates)}?")
        raise KeyError(f"Modelo '{model_key}' no registrado. Usa uno de: {', '.join(cls.MODEL_REPOS.keys())}")

    @classmethod
    def get_family(cls, model_key: str) -> str:
        """Classify a key as llama/qwen/mistral/deepseek/generic.

        The decision is a substring test on the normalised key plus its repo
        id (if registered), checked in that order; the first match wins.
        """
        key = cls.canonical_key(model_key)
        repo = cls.MODEL_REPOS.get(key, "")
        low = f"{key} {repo}".lower()
        if "llama" in low: return "llama"
        if "qwen" in low: return "qwen"
        if "mistral" in low or "mathstral" in low: return "mistral"
        if "deepseek" in low: return "deepseek"
        return "generic"

    @classmethod
    def default_stops_for(cls, model_key: str):
        """Return the default stop strings for the key's family, or None."""
        fam = cls.get_family(model_key)
        if fam == "qwen": return ["<|im_end|>"]
        if fam in ("llama", "mistral"): return ["</s>"]
        if fam == "deepseek": return ["<|EOT|>", "</s>"]
        return None

    @classmethod
    def sampling_defaults_for(cls, model_key: str):
        """Return default ``temperature``/``top_p`` values for the key's family."""
        fam = cls.get_family(model_key)
        if fam in ("llama", "mistral"): return {"temperature": 0.2, "top_p": 0.9}
        if fam in ("qwen", "deepseek"): return {"temperature": 0.1, "top_p": 0.95}
        return {"temperature": 0.0, "top_p": 1.0}

In [None]:
import subprocess
import tempfile
import os
import shutil
import signal
from typing import Optional, Tuple, List, Dict

class OctaveExecutionError(Exception):
    """Raised when the Octave process exits with a non-zero status."""


class OctaveTimeoutError(Exception):
    """Raised when the Octave process runs longer than the configured timeout."""


class OctaveCodeExecutor:
    """Run snippets of Octave code in a subprocess with a timeout.

    The code is written to a temporary ``.m`` file and executed through
    ``octave --eval "run('<file>')"`` inside its own process group so the
    whole group can be killed on timeout.
    """

    def __init__(
        self,
        timeout: int = 10,
        octave_cmd: str = "octave",
        max_output_size: int = 10000,
        workdir: Optional[str] = None,
        packages: Optional[List[str]] = None,
        env: Optional[Dict[str, str]] = None,
    ):
        """
        Args:
            timeout: Seconds allowed per execution.
            octave_cmd: Name or path of the Octave executable.
            max_output_size: Maximum number of stdout characters kept.
            workdir: Directory to run in; a fresh temporary directory is
                created per execution when None.
            packages: Octave packages to ``pkg load`` (failures are ignored
                by the generated Octave code).
            env: Extra environment variables; merged over the defaults that
                pin OMP/OpenBLAS to one thread.

        Raises:
            FileNotFoundError: if ``octave_cmd`` is not found on PATH.
        """
        self.timeout = int(timeout)
        self.octave_cmd = octave_cmd
        self.max_output_size = int(max_output_size)
        self.workdir = workdir
        self.packages = packages or []
        self.env = {"OMP_NUM_THREADS": "1", "OPENBLAS_NUM_THREADS": "1", **(env or {})}

        if shutil.which(self.octave_cmd) is None:
            raise FileNotFoundError(
                f"El binario '{self.octave_cmd}' no se encontr√≥.\n"
                f"Instala Octave con:\n!apt-get update && apt-get install -y octave"
            )

    def _safe_path_for_octave(self, path: str) -> str:
        """Escape a path for use inside an Octave single-quoted string."""
        # Octave single-quoted strings escape a quote by doubling it
        return path.replace("'", "''")

    def _write_temp_file(self, code: str, dir_path: Optional[str]) -> str:
        """Write ``code`` to a new ``.m`` file in ``dir_path``; return its path."""
        fd, path = tempfile.mkstemp(suffix=".m", dir=dir_path)
        with os.fdopen(fd, "w") as f:
            f.write(code)
        return path

    def _build_eval(self, mfile_path: str) -> str:
        """Build the one-line Octave program passed to ``--eval``.

        The program disables paging and warnings, tries to load each
        configured package, runs the script and exits 0; on error it prints
        an error report to stderr and exits 2.
        """
        # Prelude: quick options and package loading
        prelude_lines = [
            "more off;",
            "warning('off','all');",
        ]
        for p in self.packages:
            prelude_lines.append(f"try; pkg load {p}; catch; end;")
        prelude = " ".join(prelude_lines)

        spath = self._safe_path_for_octave(mfile_path)
        # getReport is a MATLAB MException method that Octave may not provide.
        # If the call itself fails we fall back to err.message, so the real
        # error text always reaches stderr instead of an "undefined" error
        # raised from inside the catch block.
        eval_str = (
            f"{prelude} "
            f"try, run('{spath}'); exit(0); "
            "catch err, "
            "try; msg = getReport(err, 'extended', 'hyperlinks', 'off'); "
            "catch; msg = err.message; end; "
            "fprintf(2, '%s\\n', msg); exit(2); end;"
        )
        return eval_str

    def execute_with_timeout(self, code: str, keep_temp: bool = False) -> Tuple[Optional[str], Optional[str]]:
        """Execute ``code`` and return ``(stdout, stderr_or_None)``.

        stdout is truncated to ``max_output_size`` characters and stripped;
        stderr is stripped and returned as None when empty.

        Args:
            code: Octave source to run.
            keep_temp: Keep the temporary script (and temporary directory)
                instead of deleting them afterwards.

        Raises:
            OctaveTimeoutError: if the timeout is exceeded.
            OctaveExecutionError: if Octave exits with a non-zero status.
        """
        # Use a temporary working directory unless one was configured
        temp_dir = None
        cwd = self.workdir
        if cwd is None:
            temp_dir = tempfile.mkdtemp(prefix="oct_")
            cwd = temp_dir

        mfile_path = None
        try:
            # Inside the try so the temp dir is cleaned up even if writing fails
            mfile_path = self._write_temp_file(code, dir_path=cwd)
            cmd = [
                self.octave_cmd,
                "-qf",            # quiet + no init files
                "--no-gui",
                "--eval",
                self._build_eval(mfile_path),
            ]

            proc = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                errors="replace",        # never crash on undecodable output bytes
                cwd=cwd,
                env={**os.environ, **self.env},
                # New session => new process group (same effect as
                # preexec_fn=os.setsid, without preexec_fn's thread-safety caveats)
                start_new_session=True,
            )
            try:
                stdout, stderr = proc.communicate(timeout=self.timeout)
            except subprocess.TimeoutExpired:
                # Kill the whole group (Octave plus any children it spawned)
                try:
                    os.killpg(proc.pid, signal.SIGKILL)
                except Exception:
                    proc.kill()
                # Reap the child and close its pipes; without this every
                # timeout leaves a zombie process and open file descriptors.
                try:
                    proc.communicate(timeout=5)
                except Exception:
                    pass  # best effort: cleanup must not mask the timeout
                raise OctaveTimeoutError("Tiempo de ejecuci√≥n excedido para el c√≥digo Octave.")

            # stdout is capped; stderr is kept whole so error reports stay complete
            stdout = (stdout or "")[:self.max_output_size].strip()
            stderr = (stderr or "").strip()

            if proc.returncode != 0:
                # On failure, raise with whatever Octave wrote to stderr
                raise OctaveExecutionError(stderr or "Ejecuci√≥n de Octave fallida (sin stderr).")

            return stdout, (stderr if stderr else None)

        finally:
            # Cleanup
            if not keep_temp:
                try:
                    if mfile_path and os.path.exists(mfile_path):
                        os.remove(mfile_path)
                except Exception:
                    pass
                if temp_dir and os.path.isdir(temp_dir):
                    shutil.rmtree(temp_dir, ignore_errors=True)

    # Extra (optional) method: returns a full metadata dict instead of raising
    def execute(self, code: str, keep_temp: bool = False) -> Dict[str, Optional[str]]:
        """Run ``code`` and return a result dict instead of raising.

        Keys: ``stdout``, ``stderr``, ``returncode`` (0 on success, 2 on an
        Octave error, None on timeout) and ``timed_out``.
        """
        try:
            out, err = self.execute_with_timeout(code, keep_temp=keep_temp)
            return {"stdout": out, "stderr": err, "returncode": 0, "timed_out": False}
        except OctaveTimeoutError as e:
            return {"stdout": None, "stderr": str(e), "returncode": None, "timed_out": True}
        except OctaveExecutionError as e:
            return {"stdout": None, "stderr": str(e), "returncode": 2, "timed_out": False}

In [None]:
from typing import List, Optional
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from pymilvus import MilvusClient

class MilvusRetriever:
    """Nearest-neighbour retrieval of (question, Octave code) examples via Milvus."""

    def __init__(self, df, encoder, db_path, collection_name, rebuild: bool = False):
        """
        df: DataFrame with columns question, octave_code, embedding
            (embedding is a string such as "[f1 f2 ...]")
        encoder: object exposing ``encode(list_of_str, convert_to_numpy=True)``
            (used for queries)
        db_path: URI handed to ``MilvusClient`` (e.g. a Milvus Lite file path)
        collection_name: name of the collection
        rebuild: when True, drop and re-index the collection from scratch
        """
        self.df = df.reset_index(drop=True)
        self.encoder = encoder
        self.client = MilvusClient(uri=db_path)
        self.collection_name = collection_name
        self.dimension = 384  # must match the width of the stored embeddings

        already_exists = self.collection_name in self.client.list_collections()

        # An existing, populated collection is reused unless a rebuild was requested.
        if already_exists and not rebuild and self._row_count() != 0:
            print(f"Milvus collection '{self.collection_name}' ya existe con datos.")
            return

        # Existing but empty (or rebuild requested): start from a clean collection.
        if already_exists:
            try:
                self.client.drop_collection(self.collection_name)
            except Exception:
                pass

        self.client.create_collection(
            self.collection_name,
            dimension=self.dimension,
            consistency_level="Eventually",
            auto_id=True
        )
        self._index_data()

    def _row_count(self) -> int:
        """
        Best-effort entity count for the collection.

        Tries, in order: ``get_collection_stats``, ``describe_collection``,
        and finally a one-hit dummy search (which can only tell 0 from "some",
        reported as 1). Returns 0 if every strategy fails.
        """
        # Strategy 1: collection stats (row_count may arrive as a string)
        try:
            collection_stats = self.client.get_collection_stats(self.collection_name)
            return int(collection_stats.get("row_count", 0))
        except Exception:
            pass

        # Strategy 2: describe_collection, under whichever count key is present
        try:
            description = self.client.describe_collection(self.collection_name)
            present = [k for k in ("row_count", "num_entities", "count") if k in description]
            if present:
                return int(description[present[0]])
        except Exception:
            pass

        # Strategy 3: search with an all-zero vector and see whether anything comes back
        try:
            probe = np.zeros((1, self.dimension), dtype=np.float32)
            hits = self.client.search(
                self.collection_name,
                data=probe,
                output_fields=["question"],
                limit=1,
                consistency_level="Eventually"
            )
            got_hit = isinstance(hits, list) and len(hits) > 0 and len(hits[0]) > 0
            return int(got_hit)
        except Exception:
            return 0

    def _index_data(self):
        """Parse the embedding strings of ``self.df`` and insert every row."""
        print("üì• Indexando vectores en Milvus...")
        parsed = [
            np.fromstring(str(raw).strip("[]"), sep=" ")
            for raw in self.df["embedding"]
        ]
        # Vectors are stored exactly as parsed (no normalisation is applied here).
        vectors = np.stack(parsed).astype(np.float32)

        entities = []
        for record, vec in zip(self.df.to_dict(orient="records"), vectors):
            entities.append(
                {
                    "question": record["question"],
                    "octave_code": record["octave_code"],
                    "vector": vec,
                }
            )
        self.client.insert(self.collection_name, data=entities, progress_bar=True)
        print("Indexaci√≥n completada.")

    def retrieve(self, query, top_k=3):
        """
        Return up to ``top_k`` distinct examples similar to ``query``.

        The query embedding is L2-normalised before searching. Hits whose
        question equals the (stripped) query, or repeats an earlier hit, are
        skipped. The result is one string of "Problem: ...\\nCode: ..." blocks
        joined by newlines.
        """
        encoded = self.encoder.encode([query], convert_to_numpy=True)
        unit = F.normalize(torch.tensor(encoded), p=2, dim=1)
        query_vec = unit.numpy().astype(np.float32)

        # Ask for one extra hit because the query itself may be in the index.
        results = self.client.search(
            self.collection_name,
            data=query_vec,
            output_fields=["question", "octave_code"],
            limit=top_k + 1,
            consistency_level="Eventually"
        )

        already_used = set()
        blocks = []
        for hit in results[0]:
            question_text = hit.entity["question"].strip()
            if question_text == query.strip() or question_text in already_used:
                continue
            already_used.add(question_text)
            blocks.append(f"Problem: {question_text}\nCode: {hit.entity['octave_code']}")
            if len(blocks) == top_k:
                break

        return "\n".join(blocks)


    def list_all_problems(self) -> List[str]:
        """Return the distinct questions of the DataFrame, in first-seen order."""
        unique_questions = self.df["question"].drop_duplicates()
        return unique_questions.tolist()

In [None]:
import pandas as pd
import random
from collections import defaultdict
from typing import List, Dict, Tuple, Optional

class FewShotRetriever:
    """Random few-shot example retriever grouped by ``problem_type``."""

    def __init__(self, csv_path: str, seed: Optional[int] = None):
        """
        Load the CSV and bucket its examples by problem_type.

        Requires columns: 'question', 'octave_code', 'problem_type'.
        Rows whose question or code is blank are skipped; a blank or missing
        problem_type is stored under "Unknown".

        Args:
            csv_path: Path of the CSV file.
            seed: When given, seeds Python's module-level ``random`` generator
                (a process-wide effect, shared with any other user of ``random``).

        Raises:
            ValueError: if a required column is missing.
        """
        self.examples_by_type: Dict[str, List[Tuple[str, str]]] = defaultdict(list)
        self.seed = seed
        if seed is not None:
            random.seed(seed)

        df = pd.read_csv(csv_path)
        # Basic column check
        required_cols = {"question", "octave_code", "problem_type"}
        missing = required_cols - set(df.columns)
        if missing:
            raise ValueError(f"Faltan columnas en el CSV: {', '.join(sorted(missing))}")

        # Cleaning and loading (column-wise zip avoids the per-row Series
        # construction that iterrows performs)
        df = df.fillna({"question": "", "octave_code": "", "problem_type": "Unknown"})
        for raw_type, raw_question, raw_code in zip(df["problem_type"], df["question"], df["octave_code"]):
            problem_type = self._normalize_type(raw_type)
            question = str(raw_question).strip()
            code = str(raw_code).strip()
            if question and code:
                self.examples_by_type[problem_type].append((question, code))

        total_types = len(self.examples_by_type)
        total_items = sum(len(v) for v in self.examples_by_type.values())
        print(f"Indexados {total_items} ejemplos en {total_types} categor√≠as (problem_type).")

    @staticmethod
    def _normalize_type(value) -> str:
        """Normalise a problem_type the same way for indexing and lookup."""
        return str(value).strip() or "Unknown"

    def retrieve(self, problem_type: str, k: int = 3, exclude_question: Optional[str] = None) -> str:
        """
        Sample up to k examples of the given problem_type.

        The type is normalised exactly like the index keys (stripped, blank
        -> "Unknown"), so surrounding whitespace does not cause a miss.

        Args:
            problem_type: Category to draw from.
            k: Maximum number of examples; values <= 0 yield "".
            exclude_question: If given, examples whose question equals this
                text (after stripping) are left out - useful when the current
                problem comes from the same CSV.

        Returns:
            "Problem: ...\\nCode: ..." blocks joined by newlines, or "" when
            nothing is available.
        """
        if k <= 0:
            # random.sample would raise ValueError on a negative size
            return ""

        candidates = self.examples_by_type.get(self._normalize_type(problem_type), [])
        if not candidates:
            return ""

        pool = candidates
        if exclude_question:
            eq = exclude_question.strip()
            pool = [(q, c) for (q, c) in candidates if q.strip() != eq]

        if not pool:
            return ""

        sampled = random.sample(pool, min(k, len(pool)))
        return "\n".join(f"Problem: {q}\nCode: {c}" for q, c in sampled)

    # Optional helpers
    def categories(self) -> List[str]:
        """Sorted list of the available problem_type values."""
        return sorted(self.examples_by_type.keys())

    def stats(self) -> pd.DataFrame:
        """Table of example counts per problem_type, largest first."""
        return pd.DataFrame(
            [(t, len(v)) for t, v in self.examples_by_type.items()],
            columns=["problem_type", "count"]
        ).sort_values("count", ascending=False).reset_index(drop=True)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
import pickle
import re
from typing import List, Dict, Any, Optional

# ==================== Modelo ====================

class BertMultiTaskClassifier(nn.Module):
    """Frozen ``bert-base-cased`` encoder with two linear classification heads.

    One head scores problem types, the other question types; both read the
    same sentence representation.
    """

    def __init__(self, dropout: float, num_problem_types: int, num_question_types: int):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-cased")
        # The encoder is frozen: only the two heads keep trainable weights
        for weight in self.bert.parameters():
            weight.requires_grad = False

        self.dropout = nn.Dropout(dropout)
        self.activation = nn.ReLU()
        encoder_width = self.bert.config.hidden_size  # 768 for bert-base
        self.classifier_problem = nn.Linear(encoder_width, num_problem_types)
        self.classifier_question = nn.Linear(encoder_width, num_question_types)

    def forward(self, input_ids, attention_mask):
        """Return ``(logits_problem, logits_question)`` for a tokenised batch."""
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        )
        # Prefer the pooled output; fall back to the first ([CLS]) position
        sentence_repr = (
            encoded.pooler_output
            if encoded.pooler_output is not None
            else encoded.last_hidden_state[:, 0, :]
        )
        features = self.dropout(self.activation(sentence_repr))
        return self.classifier_problem(features), self.classifier_question(features)

# ==================== Preprocesamiento ====================

def preprocess_function(example: Dict[str, str], tokenizer, seq_len: int) -> Dict[str, Any]:
    """Normalize ``example["problem"]`` and tokenize it to a fixed length.

    The text is lowercased (kept for consistency with training, per the
    original note), stripped of characters outside the allowed set, and
    whitespace-collapsed before being handed to ``tokenizer``.

    Args:
        example: Mapping holding the raw text under the ``"problem"`` key.
        tokenizer: Callable invoked with the cleaned text plus padding/truncation options.
        seq_len: Length used for both max-length padding and truncation.

    Returns:
        Whatever ``tokenizer`` returns (requested with ``return_tensors="pt"``).
    """
    substitutions = (
        # Keep word characters, whitespace and the listed math operators/punctuation.
        (r"[^\w\d\s\+\-\*/=^‚àö%.,()]", " "),
        # Collapse standalone runs of two or more 'x' into a single 'x'.
        (r"\b[x]{2,}\b", "x"),
        # Squeeze repeated spaces.
        (" +", " "),
    )
    cleaned = example["problem"].lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip()

    return tokenizer(
        cleaned,
        padding="max_length",
        max_length=seq_len,
        truncation=True,
        return_tensors="pt",
    )

# ==================== Utils pickle ====================

def save_file(name, obj):
    """Pickle ``obj`` into the file at path ``name``, overwriting any existing file."""
    with open(name, mode="wb") as handle:
        pickle.dump(obj, handle)

def load_file(name):
    """Return the object unpickled from the file at path ``name``.

    NOTE(security): unpickling can execute arbitrary code; only load trusted files.
    """
    with open(name, mode="rb") as handle:
        loaded = pickle.load(handle)
    return loaded

# ==================== Pipeline de Inferencia ====================

class MultiTaskInferencePipeline:
    """Inference wrapper that maps raw problem text to predicted labels.

    Construction loads two pickled label encoders and the trained
    ``BertMultiTaskClassifier`` weights; the public methods then return the
    predicted ``problem_type`` / ``question_type`` for one text or a list.
    """

    # Prefix that nn.DataParallel adds to every key of a saved state dict.
    _DATAPARALLEL_PREFIX = "module."

    def __init__(
        self,
        model_path: str,
        encoder_problem_path: str,
        encoder_question_path: str,
        dropout: float = 0.5,
        seq_len: int = 512
    ):
        """Load encoders, tokenizer and model weights, and switch to eval mode.

        Args:
            model_path: File passed to ``torch.load``; expected to hold the model state dict.
            encoder_problem_path: Pickle of the problem-type encoder (must expose
                ``classes_`` and ``inverse_transform``).
            encoder_question_path: Pickle of the question-type encoder (same contract).
            dropout: Dropout probability forwarded to the classifier.
            seq_len: Padding/truncation length forwarded to ``preprocess_function``.
        """
        self.preprocess_function = preprocess_function
        self.seq_len = seq_len

        # NOTE(security): pickle.load can execute arbitrary code; only point
        # these paths at trusted files.
        with open(encoder_problem_path, "rb") as f:
            self.le_problem = pickle.load(f)
        with open(encoder_question_path, "rb") as f:
            self.le_question = pickle.load(f)

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

        self.model = BertMultiTaskClassifier(
            dropout=dropout,
            num_problem_types=len(self.le_problem.classes_),
            num_question_types=len(self.le_question.classes_)
        )
        state = torch.load(model_path, map_location="cpu")
        # Checkpoints saved from nn.DataParallel carry a "module." key prefix.
        # Strip it and keep strict loading: strict=False would silently skip
        # every mismatched key and leave the heads randomly initialized.
        self.model.load_state_dict(self._strip_dataparallel_prefix(state), strict=True)
        self.model.to(self.device)
        self.model.eval()

    @staticmethod
    def _strip_dataparallel_prefix(state):
        """Return ``state`` without the ``module.`` prefix when every key carries it."""
        prefix = MultiTaskInferencePipeline._DATAPARALLEL_PREFIX
        keys = list(state.keys())
        if keys and all(key.startswith(prefix) for key in keys):
            return {key[len(prefix):]: value for key, value in state.items()}
        return state

    @staticmethod
    def _ranked_probs(classes, probs) -> Dict[str, float]:
        """Map each label to its probability, ordered from most to least likely."""
        pairs = [(label, float(prob)) for label, prob in zip(classes, probs)]
        return dict(sorted(pairs, key=lambda pair: pair[1], reverse=True))

    @torch.inference_mode()
    def _infer_chunk(self, texts: List[str], return_probs: bool) -> List[Dict[str, Any]]:
        """Run preprocess -> model -> label decoding for a non-empty list of texts."""
        encoded = [
            self.preprocess_function({"problem": text}, self.tokenizer, seq_len=self.seq_len)
            for text in texts
        ]
        input_ids = torch.cat([enc["input_ids"] for enc in encoded], dim=0).to(self.device)
        attention_mask = torch.cat([enc["attention_mask"] for enc in encoded], dim=0).to(self.device)

        logits_problem, logits_question = self.model(input_ids, attention_mask)
        idx_problem = torch.argmax(logits_problem, dim=1).cpu().tolist()
        idx_question = torch.argmax(logits_question, dim=1).cpu().tolist()

        # Decode the whole chunk at once instead of one encoder call per item.
        labels_problem = self.le_problem.inverse_transform(idx_problem)
        labels_question = self.le_question.inverse_transform(idx_question)
        items: List[Dict[str, Any]] = [
            {"problem_type": lp, "question_type": lq}
            for lp, lq in zip(labels_problem, labels_question)
        ]

        if return_probs:
            probs_problem = F.softmax(logits_problem, dim=1).cpu().tolist()
            probs_question = F.softmax(logits_question, dim=1).cpu().tolist()
            for item, row_problem, row_question in zip(items, probs_problem, probs_question):
                item["problem_type_probs"] = self._ranked_probs(self.le_problem.classes_, row_problem)
                item["question_type_probs"] = self._ranked_probs(self.le_question.classes_, row_question)
        return items

    @torch.inference_mode()
    def predict(self, text: str) -> Dict[str, str]:
        """Return the predicted ``problem_type`` and ``question_type`` for one text."""
        return self._infer_chunk([text], return_probs=False)[0]

    @torch.inference_mode()
    def predict_with_probs(self, text: str) -> Dict[str, Any]:
        """Return predicted labels plus softmax probabilities for one text.

        The result adds ``problem_type_probs`` and ``question_type_probs``:
        ``{label: probability}`` dicts ordered by descending probability,
        useful for thresholds and inspection.
        """
        return self._infer_chunk([text], return_probs=True)[0]

    @torch.inference_mode()
    def batch_predict(self, texts: List[str], return_probs: bool = False, batch_size: int = 32) -> List[Dict[str, Any]]:
        """Predict labels for many texts, ``batch_size`` texts per forward pass.

        Args:
            texts: Texts to classify; any iterable is accepted.
            return_probs: When True each item also carries the ranked probability maps.
            batch_size: Number of texts per forward pass; must be positive.

        Returns:
            One result dict per input text, in input order.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        texts = list(texts)
        results: List[Dict[str, Any]] = []
        for start in range(0, len(texts), batch_size):
            results.extend(self._infer_chunk(texts[start:start + batch_size], return_probs))
        return results

In [None]:
import logging
import pickle
import re
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel, BertTokenizer

# ==================== Modelo ====================

class BertMultiTaskClassifier(nn.Module):
    """Frozen ``bert-base-cased`` encoder feeding two independent linear heads.

    Both heads consume the same sentence representation: one scores the
    problem types, the other the question types.
    """

    def __init__(self, dropout: float, num_problem_types: int, num_question_types: int):
        """Build the encoder and the two classification heads.

        Args:
            dropout: Probability used by the dropout layer placed before the heads.
            num_problem_types: Output width of the problem-type head.
            num_question_types: Output width of the question-type head.
        """
        super().__init__()
        encoder = BertModel.from_pretrained("bert-base-cased")
        # Freeze the encoder so that only the two heads remain trainable.
        encoder.requires_grad_(False)
        self.bert = encoder

        self.dropout = nn.Dropout(dropout)
        self.activation = nn.ReLU()
        width = self.bert.config.hidden_size  # 768 for bert-base
        self.classifier_problem = nn.Linear(width, num_problem_types)
        self.classifier_question = nn.Linear(width, num_question_types)

    def forward(self, input_ids, attention_mask):
        """Return ``(logits_problem, logits_question)`` for a tokenized batch."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        sentence = encoded.pooler_output
        if sentence is None:
            # No pooled output available: fall back to the first ([CLS]) token state.
            sentence = encoded.last_hidden_state[:, 0, :]
        features = self.dropout(self.activation(sentence))
        return self.classifier_problem(features), self.classifier_question(features)

# ==================== Preprocesamiento ====================

def preprocess_function(example: Dict[str, str], tokenizer, seq_len: int) -> Dict[str, Any]:
    """Normalize ``example["problem"]`` and tokenize it to a fixed length.

    The text is lowercased (kept for consistency with training, per the
    original note), stripped of characters outside the allowed set, and
    whitespace-collapsed before being handed to ``tokenizer``.

    Args:
        example: Mapping holding the raw text under the ``"problem"`` key.
        tokenizer: Callable invoked with the cleaned text plus padding/truncation options.
        seq_len: Length used for both max-length padding and truncation.

    Returns:
        Whatever ``tokenizer`` returns (requested with ``return_tensors="pt"``).
    """
    substitutions = (
        # Keep word characters, whitespace and the listed math operators/punctuation.
        (r"[^\w\d\s\+\-\*/=^‚àö%.,()]", " "),
        # Collapse standalone runs of two or more 'x' into a single 'x'.
        (r"\b[x]{2,}\b", "x"),
        # Squeeze repeated spaces.
        (" +", " "),
    )
    cleaned = example["problem"].lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip()

    return tokenizer(
        cleaned,
        padding="max_length",
        max_length=seq_len,
        truncation=True,
        return_tensors="pt",
    )

# ==================== Utils pickle ====================

def save_file(name, obj):
    """Pickle ``obj`` into the file at path ``name``, overwriting any existing file."""
    with open(name, mode="wb") as handle:
        pickle.dump(obj, handle)

def load_file(name):
    """Return the object unpickled from the file at path ``name``.

    NOTE(security): unpickling can execute arbitrary code; only load trusted files.
    """
    with open(name, mode="rb") as handle:
        loaded = pickle.load(handle)
    return loaded

# ==================== Pipeline de Inferencia ====================

class MultiTaskInferencePipeline:
    """Inference wrapper that maps raw problem text to predicted labels.

    Construction loads two pickled label encoders and the trained
    ``BertMultiTaskClassifier`` weights; the public methods then return the
    predicted ``problem_type`` / ``question_type`` for one text or a list.
    """

    # Prefix that nn.DataParallel adds to every key of a saved state dict.
    _DATAPARALLEL_PREFIX = "module."

    def __init__(
        self,
        model_path: str,
        encoder_problem_path: str,
        encoder_question_path: str,
        dropout: float = 0.5,
        seq_len: int = 512
    ):
        """Load encoders, tokenizer and model weights, and switch to eval mode.

        Args:
            model_path: File passed to ``torch.load``; expected to hold the model state dict.
            encoder_problem_path: Pickle of the problem-type encoder (must expose
                ``classes_`` and ``inverse_transform``).
            encoder_question_path: Pickle of the question-type encoder (same contract).
            dropout: Dropout probability forwarded to the classifier.
            seq_len: Padding/truncation length forwarded to ``preprocess_function``.
        """
        self.preprocess_function = preprocess_function
        self.seq_len = seq_len

        # NOTE(security): pickle.load can execute arbitrary code; only point
        # these paths at trusted files.
        with open(encoder_problem_path, "rb") as f:
            self.le_problem = pickle.load(f)
        with open(encoder_question_path, "rb") as f:
            self.le_question = pickle.load(f)

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

        self.model = BertMultiTaskClassifier(
            dropout=dropout,
            num_problem_types=len(self.le_problem.classes_),
            num_question_types=len(self.le_question.classes_)
        )
        state = torch.load(model_path, map_location="cpu")
        # Checkpoints saved from nn.DataParallel carry a "module." key prefix.
        # Strip it and keep strict loading: strict=False would silently skip
        # every mismatched key and leave the heads randomly initialized.
        self.model.load_state_dict(self._strip_dataparallel_prefix(state), strict=True)
        self.model.to(self.device)
        self.model.eval()

    @staticmethod
    def _strip_dataparallel_prefix(state):
        """Return ``state`` without the ``module.`` prefix when every key carries it."""
        prefix = MultiTaskInferencePipeline._DATAPARALLEL_PREFIX
        keys = list(state.keys())
        if keys and all(key.startswith(prefix) for key in keys):
            return {key[len(prefix):]: value for key, value in state.items()}
        return state

    @staticmethod
    def _ranked_probs(classes, probs) -> Dict[str, float]:
        """Map each label to its probability, ordered from most to least likely."""
        pairs = [(label, float(prob)) for label, prob in zip(classes, probs)]
        return dict(sorted(pairs, key=lambda pair: pair[1], reverse=True))

    @torch.inference_mode()
    def _infer_chunk(self, texts: List[str], return_probs: bool) -> List[Dict[str, Any]]:
        """Run preprocess -> model -> label decoding for a non-empty list of texts."""
        encoded = [
            self.preprocess_function({"problem": text}, self.tokenizer, seq_len=self.seq_len)
            for text in texts
        ]
        input_ids = torch.cat([enc["input_ids"] for enc in encoded], dim=0).to(self.device)
        attention_mask = torch.cat([enc["attention_mask"] for enc in encoded], dim=0).to(self.device)

        logits_problem, logits_question = self.model(input_ids, attention_mask)
        idx_problem = torch.argmax(logits_problem, dim=1).cpu().tolist()
        idx_question = torch.argmax(logits_question, dim=1).cpu().tolist()

        # Decode the whole chunk at once instead of one encoder call per item.
        labels_problem = self.le_problem.inverse_transform(idx_problem)
        labels_question = self.le_question.inverse_transform(idx_question)
        items: List[Dict[str, Any]] = [
            {"problem_type": lp, "question_type": lq}
            for lp, lq in zip(labels_problem, labels_question)
        ]

        if return_probs:
            probs_problem = F.softmax(logits_problem, dim=1).cpu().tolist()
            probs_question = F.softmax(logits_question, dim=1).cpu().tolist()
            for item, row_problem, row_question in zip(items, probs_problem, probs_question):
                item["problem_type_probs"] = self._ranked_probs(self.le_problem.classes_, row_problem)
                item["question_type_probs"] = self._ranked_probs(self.le_question.classes_, row_question)
        return items

    @torch.inference_mode()
    def predict(self, text: str) -> Dict[str, str]:
        """Return the predicted ``problem_type`` and ``question_type`` for one text."""
        return self._infer_chunk([text], return_probs=False)[0]

    @torch.inference_mode()
    def predict_with_probs(self, text: str) -> Dict[str, Any]:
        """Return predicted labels plus softmax probabilities for one text.

        The result adds ``problem_type_probs`` and ``question_type_probs``:
        ``{label: probability}`` dicts ordered by descending probability,
        useful for thresholds and inspection.
        """
        return self._infer_chunk([text], return_probs=True)[0]

    @torch.inference_mode()
    def batch_predict(self, texts: List[str], return_probs: bool = False, batch_size: int = 32) -> List[Dict[str, Any]]:
        """Predict labels for many texts, ``batch_size`` texts per forward pass.

        Args:
            texts: Texts to classify; any iterable is accepted.
            return_probs: When True each item also carries the ranked probability maps.
            batch_size: Number of texts per forward pass; must be positive.

        Returns:
            One result dict per input text, in input order.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        texts = list(texts)
        results: List[Dict[str, Any]] = []
        for start in range(0, len(texts), batch_size):
            results.extend(self._infer_chunk(texts[start:start + batch_size], return_probs))
        return results
import re
import json
import time
import torch.nn.functional as F
import time
from pymilvus import MilvusClient

# Configure the root logger, then grab this module's logger (the two steps are independent).
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Whole-word, case-insensitive matchers for the literal answers "true" / "false".
pattern_true = re.compile(r"\btrue\b", flags=re.IGNORECASE)
pattern_false = re.compile(r"\bfalse\b", flags=re.IGNORECASE)

# Strong instruction for the final turn (discourages prose/markdown).
# This output policy is shared by every strategy in this cell.
CODE_ONLY_MSG = " ".join((
    "Output ONLY a valid GNU Octave script (no prose/markdown).",
    "When executed, print ONLY the final answer (numeric or single letter) using",
    "printf('%.15g\\n', value) or printf('%c\\n', letter). No extra text.",
))

def parse_resolvability(resp: str):
    """Classify a model reply by the first standalone 'true' or 'false' it contains.

    Matching is case-insensitive and whole-word, so 'untrue' is ignored, and
    only the earliest occurrence counts, so a later contradictory word does
    not change the verdict.

    Args:
        resp: Raw reply text. ``None`` or any non-string value is treated as
            "no answer" instead of raising.

    Returns:
        True if 'true' appears first, False if 'false' appears first,
        None when neither word is present.
    """
    if not isinstance(resp, str):
        return None
    # One alternation finds whichever word comes first in a single scan.
    first = re.search(r"\b(true|false)\b", resp, flags=re.IGNORECASE)
    if first is None:
        return None
    return first.group(1).lower() == "true"

class ConversationalPromptStrategy(ABC):
    """Abstract base class for prompting strategies.

    A concrete strategy implements the complete exchange with the model for
    one problem: the resolvability check, the feature extraction (when the
    strategy uses it) and the final code generation.
    """

    @abstractmethod
    def run_conversation(self, row: dict, generator) -> dict:
        """Run the full exchange with the model for a single problem.

        Args:
            row (dict): One dataset row (from a CSV).
            generator: Generator instance (e.g., VLLMGenerator).

        Returns:
            dict: A dictionary with at least:
                - 'resolvability_response'
                - 'is_octave_resolvable' (bool)
                - 'problem_features' (optional)
                - 'model_output' (final code or reasoning)
        """

class NonConversationalZeroShotStrategy(ConversationalPromptStrategy):
    """Single-shot (non-conversational) strategy.

    Sends the problem once and demands a GNU Octave script that prints the
    final answer; no resolvability check or feature extraction is performed.
    """
    def run_conversation(self, row: dict, generator) -> dict:
        """Build one prompt from ``row["question"]`` and return the model's reply.

        Args:
            row: Dataset row; only the ``"question"`` field is read.
            generator: Object whose ``generate(list_of_prompts)`` returns a list of outputs.

        Returns:
            ``{"inference_time": 0.0}`` when the question is missing or blank;
            otherwise the usual result dict with the resolvability fields set to None.
        """
        start_time = time.perf_counter()  # monotonic clock for elapsed time

        raw_question = row.get("question", "")
        # A missing value arrives as None or float NaN (NaN != NaN); str() would
        # turn it into the literal "None"/"nan" and send that to the model.
        is_missing = raw_question is None or (isinstance(raw_question, float) and raw_question != raw_question)
        question = "" if is_missing else str(raw_question).strip()
        if not question:
            return {"inference_time": 0.0}

        # Single-shot prompt (no chat structure).
        prompt = (
            "You are given a math problem.\n"
            "Solve it programmatically using GNU Octave and return ONLY the executable script.\n\n"
            "Problem:\n"
            f"{question}\n\n"
            # Shared strict output policy.
            f"{CODE_ONLY_MSG}\n"
        )

        # generate() is used instead of chat() to avoid any chat structure.
        out_list = generator.generate([prompt])
        model_output = out_list[0] if out_list else ""

        return {
            "resolvability_prompt": None,
            "resolvability_response": None,
            "is_octave_resolvable": None,
            "problem_features": None,
            "prompt": prompt,
            "model_output": model_output,
            "inference_time": time.perf_counter() - start_time
        }


class NonConversationalReasonedCodeStrategy(ConversationalPromptStrategy):
    """Single-shot (non-conversational) strategy with in-prompt reasoning steps.

    One prompt asks the model to judge resolvability, analyze the problem's
    features and then emit only the GNU Octave script. The reply is returned
    as-is: resolvability and features are NOT parsed out of it.
    """
    def run_conversation(self, row: dict, generator) -> dict:
        """Build one prompt from ``row["question"]`` and return the model's reply.

        Args:
            row: Dataset row; only the ``"question"`` field is read.
            generator: Object whose ``generate(list_of_prompts)`` returns a list of outputs.

        Returns:
            ``{"inference_time": 0.0}`` when the question is missing or blank;
            otherwise the usual result dict with the resolvability fields set to None.
        """
        start_time = time.perf_counter()  # monotonic clock for elapsed time

        raw_question = row.get("question", "")
        # A missing value arrives as None or float NaN (NaN != NaN); str() would
        # turn it into the literal "None"/"nan" and send that to the model.
        is_missing = raw_question is None or (isinstance(raw_question, float) and raw_question != raw_question)
        question = "" if is_missing else str(raw_question).strip()
        if not question:
            return {"inference_time": 0.0}

        # NOTE(review): some adjacent sentences below are concatenated without a
        # separating space/newline ("Octave.Answer", "'False'.In case"). Left
        # untouched so results stay comparable with earlier runs.
        prompt = (
            "You are given a math problem.\n"
            "Determine if the math problem can be solved programmatically using GNU Octave.Answer with 'True' or 'False'."
            "In case of 'False', try to generate the solution to the math problem in the most suitable way:\n"
            "In case of 'True', proceed as follows:\n"
            "Analyze the problem and extract its key data and the following characteristics:\n"
                    "'given_data', 'unknowns', 'constraints','applicable methods',"
                    "'If a unit of measurement is used, specify which one'.\n"
            "With this information:\n\n"
            "Problem:\n"
            f"{question}\n\n"
            f"{CODE_ONLY_MSG}\n"
        )

        out_list = generator.generate([prompt])
        model_output = out_list[0] if out_list else ""

        return {
            "resolvability_prompt": None,
            "resolvability_response": None,
            "is_octave_resolvable": None,
            "problem_features": None,
            "prompt": prompt,
            "model_output": model_output,
            "inference_time": time.perf_counter() - start_time
        }

class ZeroShotConversationalStrategy(ConversationalPromptStrategy):
    """Multi-turn zero-shot strategy: resolvability check, feature extraction, final answer."""

    def run_conversation(self, row: dict, generator) -> dict:
        """Run the chat for ``row["question"]``.

        Turns: present the problem, ask for a True/False resolvability verdict,
        then (only when the verdict parses as True) ask for the problem features
        and an Octave-only script; otherwise ask for a free-form solution.

        Args:
            row: Dataset row; only the ``"question"`` field is read.
            generator: Object whose ``chat(messages)`` returns the assistant reply.

        Returns:
            ``{"inference_time": 0.0}`` when the question is missing or blank;
            otherwise the result dict, with the whole chat JSON-encoded under ``"prompt"``.
        """
        start_time = time.perf_counter()  # monotonic clock for elapsed time

        raw_question = row.get("question", "")
        # A missing value arrives as None or float NaN (NaN != NaN); str() would
        # turn it into the literal "None"/"nan" and send that to the model.
        is_missing = raw_question is None or (isinstance(raw_question, float) and raw_question != raw_question)
        question = "" if is_missing else str(raw_question).strip()
        if not question:
            return {"inference_time": 0.0}

        messages = [
            {"role": "user", "content": f"Here is a math problem:\n\n{question}"},
            {"role": "assistant", "content": "OK, I've received the problem."},
            {
                "role": "user",
                "content": (
                    "Determine if the math problem can be solved programmatically using GNU Octave. "
                    "Answer with 'True' or 'False' and justify your choice."
                )
            },
        ]
        resolvability_response = generator.chat(messages)
        messages.append({"role": "assistant", "content": resolvability_response})

        # None (no verdict found) is treated like False below.
        is_octave_resolvable = parse_resolvability(resolvability_response)

        problem_features = None
        if is_octave_resolvable:
            messages.append({
                "role": "user",
                "content": (
                    "Analyze the problem and extract its key data and the following characteristics:\n"
                    "'given_data', 'unknowns', 'constraints','applicable methods',"
                    "'If a unit of measurement is used, specify which one'.\n"
                )
            })
            problem_features = generator.chat(messages)
            messages.append({"role": "assistant", "content": problem_features})
            messages.append({"role": "user", "content": f"{CODE_ONLY_MSG}"})
        else:
            messages.append({
                "role": "user",
                "content": "Provide the solution to the math problem in the most suitable way."
            })

        model_output = generator.chat(messages)
        messages.append({"role": "assistant", "content": model_output})

        return {
            # NOTE(review): messages[0] is the problem statement, not the
            # True/False question (messages[2]) - confirm this is intended.
            "resolvability_prompt": messages[0]["content"],
            "resolvability_response": resolvability_response,
            "is_octave_resolvable": is_octave_resolvable,
            "problem_features": problem_features,
            "prompt": json.dumps(messages),
            "model_output": model_output,
            "inference_time": time.perf_counter() - start_time
        }


class ChainOfThoughtConversationalStrategy(ConversationalPromptStrategy):
    """Multi-turn strategy that shows the dataset's own rationale alongside the problem."""

    def run_conversation(self, row: dict, generator) -> dict:
        """Run the chat for ``row["question"]``, adding ``row["rationale"]`` when present.

        Turns: present the problem (plus rationale), ask for a True/False
        resolvability verdict, then (only when the verdict parses as True) ask
        for the problem features and an Octave-only script; otherwise ask for a
        free-form solution.

        Args:
            row: Dataset row; the ``"question"`` and ``"rationale"`` fields are read.
            generator: Object whose ``chat(messages)`` returns the assistant reply.

        Returns:
            ``{"inference_time": 0.0}`` when the question is missing or blank;
            otherwise the result dict, with the whole chat JSON-encoded under ``"prompt"``.
        """
        start_time = time.perf_counter()  # monotonic clock for elapsed time

        # A missing value arrives as None or float NaN (NaN != NaN); str() would
        # turn it into the literal "None"/"nan" - for the rationale that meant
        # prompting with "Rationale: nan".
        raw_question = row.get("question", "")
        question_missing = raw_question is None or (isinstance(raw_question, float) and raw_question != raw_question)
        question = "" if question_missing else str(raw_question).strip()

        raw_rationale = row.get("rationale", "")
        rationale_missing = raw_rationale is None or (isinstance(raw_rationale, float) and raw_rationale != raw_rationale)
        rationale = "" if rationale_missing else str(raw_rationale).strip()

        if not question:
            return {"inference_time": 0.0}

        if rationale:
            opening = f"Here is a math problem:\n\n{question}.\nRationale: {rationale}\n"
        else:
            opening = f"Here is a math problem:\n\n{question}."
        messages = [
            {"role": "user", "content": opening},
            {"role": "assistant", "content": "OK, I've received the problem."},
            {
                "role": "user",
                "content": (
                    "Determine if the math problem can be solved programmatically using GNU Octave. "
                    "Answer with 'True' or 'False' and justify your choice."
                )
            },
        ]
        resolvability_response = generator.chat(messages)
        messages.append({"role": "assistant", "content": resolvability_response})
        # None (no verdict found) is treated like False below.
        is_octave_resolvable = parse_resolvability(resolvability_response)

        problem_features = None
        if is_octave_resolvable:
            messages.append({
                "role": "user",
                "content": (
                    "Analyze the problem and extract its key data and the following characteristics:\n"
                    "'given_data', 'unknowns', 'constraints','applicable methods',"
                    "'If a unit of measurement is used, specify which one'.\n"
                )
            })
            problem_features = generator.chat(messages)
            messages.append({"role": "assistant", "content": problem_features})
            messages.append({"role": "user", "content": f"{CODE_ONLY_MSG}"})
        else:
            messages.append({"role": "user", "content": "Provide the solution to the math problem in the most suitable way."})

        model_output = generator.chat(messages)
        messages.append({"role": "assistant", "content": model_output})

        return {
            # NOTE(review): messages[0] is the problem statement, not the
            # True/False question (messages[2]) - confirm this is intended.
            "resolvability_prompt": messages[0]["content"],
            "resolvability_response": resolvability_response,
            "is_octave_resolvable": is_octave_resolvable,
            "problem_features": problem_features,
            "prompt": json.dumps(messages),
            "model_output": model_output,
            "inference_time": time.perf_counter() - start_time
        }


class ChainOfThoughtReasoningConversationalStrategy(ConversationalPromptStrategy):
    """Multi-turn strategy whose rationale is first generated by the model itself."""

    def run_conversation(self, row: dict, generator) -> dict:
        """Generate a step-by-step rationale, then run the chat for ``row["question"]``.

        A first ``generate`` call produces the rationale; the chat then presents
        the problem (plus rationale), asks for a True/False resolvability verdict
        and, only when it parses as True, asks for the problem features and an
        Octave-only script; otherwise it asks for a free-form solution.

        Args:
            row: Dataset row; only the ``"question"`` field is read.
            generator: Object exposing ``generate(list_of_prompts)`` (returns a
                list) and ``chat(messages)`` (returns the assistant reply).

        Returns:
            ``{"inference_time": 0.0}`` when the question is missing or blank;
            otherwise the result dict, with the whole chat JSON-encoded under ``"prompt"``.
        """
        start_time = time.perf_counter()  # monotonic clock for elapsed time

        raw_question = row.get("question", "")
        # A missing value arrives as None or float NaN (NaN != NaN); str() would
        # turn it into the literal "None"/"nan" and send that to the model.
        is_missing = raw_question is None or (isinstance(raw_question, float) and raw_question != raw_question)
        question = "" if is_missing else str(raw_question).strip()
        if not question:
            return {"inference_time": 0.0}

        # generator.generate returns a list; take its first element if any.
        rationale_prompt = "Think step by step to solve the following math problem:\n" + question + "\n"
        rationale_list = generator.generate([rationale_prompt])
        rationale = rationale_list[0] if rationale_list else ""

        if rationale:
            opening = f"Here is a math problem:\n\n{question}.\nRationale: {rationale}\n"
        else:
            opening = f"Here is a math problem:\n\n{question}."
        messages = [
            {"role": "user", "content": opening},
            {"role": "assistant", "content": "OK, I've received the problem."},
            {
                "role": "user",
                "content": (
                    "Determine if the math problem can be solved programmatically using GNU Octave. "
                    "Answer with 'True' or 'False' and justify your choice."
                )
            },
        ]
        resolvability_response = generator.chat(messages)
        messages.append({"role": "assistant", "content": resolvability_response})
        # None (no verdict found) is treated like False below.
        is_octave_resolvable = parse_resolvability(resolvability_response)

        problem_features = None
        if is_octave_resolvable:
            messages.append({
                "role": "user",
                "content": (
                    "Analyze the problem and extract its key data and the following characteristics:\n"
                    "'given_data', 'unknowns', 'constraints','applicable methods',"
                    "'If a unit of measurement is used, specify which one'.\n"
                )
            })
            problem_features = generator.chat(messages)
            messages.append({"role": "assistant", "content": problem_features})
            messages.append({"role": "user", "content": f"{CODE_ONLY_MSG}"})
        else:
            messages.append({"role": "user", "content": "Provide the solution to the math problem in the most suitable way."})

        model_output = generator.chat(messages)
        messages.append({"role": "assistant", "content": model_output})

        return {
            # NOTE(review): messages[0] is the problem statement, not the
            # True/False question (messages[2]) - confirm this is intended.
            "resolvability_prompt": messages[0]["content"],
            "resolvability_response": resolvability_response,
            "is_octave_resolvable": is_octave_resolvable,
            "problem_features": problem_features,
            "prompt": json.dumps(messages),
            "model_output": model_output,
            "inference_time": time.perf_counter() - start_time
        }


class RAGConversationalStrategy(ConversationalPromptStrategy):
    """Multi-turn strategy that adds retrieved examples to the final code request."""

    def __init__(self, retriever):
        """Store the retriever; its ``retrieve(question)`` result is pasted into the prompt."""
        self.retriever = retriever

    def run_conversation(self, row: dict, generator) -> dict:
        """Run the chat for ``row["question"]`` with retrieval on the resolvable branch.

        Turns: present the problem, ask for a True/False resolvability verdict,
        then (only when the verdict parses as True) ask for the problem features
        and request an Octave-only script with the retrieved examples, features
        and problem repeated in the prompt; otherwise ask for a free-form solution.

        Args:
            row: Dataset row; only the ``"question"`` field is read.
            generator: Object whose ``chat(messages)`` returns the assistant reply.

        Returns:
            ``{"inference_time": 0.0}`` when the question is missing or blank;
            otherwise the result dict, with the whole chat JSON-encoded under ``"prompt"``.
        """
        start_time = time.perf_counter()  # monotonic clock for elapsed time

        raw_question = row.get("question", "")
        # A missing value arrives as None or float NaN (NaN != NaN); str() would
        # turn it into the literal "None"/"nan" and send that to the model.
        is_missing = raw_question is None or (isinstance(raw_question, float) and raw_question != raw_question)
        question = "" if is_missing else str(raw_question).strip()
        if not question:
            return {"inference_time": 0.0}

        messages = [
            {"role": "user", "content": f"Here is a math problem:\n\n{question}"},
            {"role": "assistant", "content": "OK, I've received the problem."},
            {
                "role": "user",
                "content": (
                    "Determine if the math problem can be solved programmatically using GNU Octave. "
                    "Answer with 'True' or 'False' and justify your choice."
                )
            },
        ]
        resolvability_response = generator.chat(messages)
        messages.append({"role": "assistant", "content": resolvability_response})
        # None (no verdict found) is treated like False below.
        is_octave_resolvable = parse_resolvability(resolvability_response)

        problem_features = None
        if is_octave_resolvable:
            messages.append({
                "role": "user",
                "content": (
                    "Analyze the problem and extract its key data and the following characteristics:\n"
                    "'given_data', 'unknowns', 'constraints','applicable methods',"
                    "'If a unit of measurement is used, specify which one'.\n"
                )
            })
            problem_features = generator.chat(messages)
            messages.append({"role": "assistant", "content": problem_features})

            # Retrieval only runs once the problem is judged resolvable.
            retrieved_context = self.retriever.retrieve(question)
            messages.append({
                "role": "user",
                "content": (
                    f"You are given related examples:\n{retrieved_context}\n\n"
                    f"Problem features: {problem_features}\n\n"
                    f"Problem: {question}\n\n"
                    f"{CODE_ONLY_MSG}"
                )
            })
        else:
            messages.append({"role": "user", "content": f"Problem: {question}\n\nProvide the solution to the math problem in the most suitable way."})

        model_output = generator.chat(messages)
        messages.append({"role": "assistant", "content": model_output})

        return {
            # NOTE(review): messages[0] is the problem statement, not the
            # True/False question (messages[2]) - confirm this is intended.
            "resolvability_prompt": messages[0]["content"],
            "resolvability_response": resolvability_response,
            "is_octave_resolvable": is_octave_resolvable,
            "problem_features": problem_features,
            "prompt": json.dumps(messages),
            "model_output": model_output,
            "inference_time": time.perf_counter() - start_time
        }


class FewShotConversationalStrategy(ConversationalPromptStrategy):
    """
    Multi-turn prompting strategy that adds retrieved few-shot examples to the
    final code-generation request.

    Conversation flow:
      1. Present the problem and ask whether GNU Octave can solve it.
      2. If the answer is parsed as positive, ask the model to extract the
         problem's features.
      3. Ask for the final answer: Octave code (preceded by the retrieved
         examples when there are any), or a free-form solution when the
         problem was judged not resolvable with Octave.
    """

    def __init__(self, retriever, classifier=None, k=3):
        # retriever: called as retrieve(problem_type=..., k=...)
        # classifier: optional; predict(question) must return a mapping that is
        #             read through .get("problem_type")
        # k: number of examples requested from the retriever
        self.retriever = retriever
        self.classifier = classifier
        self.k = k

    def _predict_problem_type(self, question):
        """Return the classifier's ``problem_type`` for the question, or None."""
        if not self.classifier:
            return None
        try:
            return self.classifier.predict(question).get("problem_type", None)
        except Exception:
            # Classification is best-effort: any failure means "type unknown".
            return None

    def _fetch_examples(self, question):
        """Retrieve few-shot examples; return "" when the fallback lookup fails."""
        problem_type = self._predict_problem_type(question)
        if problem_type:
            # A lookup for a known type is not guarded: its errors propagate.
            return self.retriever.retrieve(problem_type=problem_type, k=self.k)
        try:
            # No predicted type: try the generic "Unknown" bucket.
            return self.retriever.retrieve(problem_type="Unknown", k=self.k)
        except Exception:
            return ""

    def run_conversation(self, row: dict, generator) -> dict:
        """
        Run the whole conversation for one dataset row.

        Args:
            row: mapping with at least a ``question`` entry.
            generator: object exposing ``chat(messages)`` and returning the
                assistant reply.

        Returns:
            A dict with the prompts, intermediate replies, the final
            ``model_output`` and the elapsed ``inference_time``.  When the
            question is empty only ``{"inference_time": 0.0}`` is returned.
        """
        started = time.time()
        question = str(row.get("question", "")).strip()
        if not question:
            return {"inference_time": 0.0}

        # Turn 1: hand over the problem and ask about Octave resolvability.
        messages = [
            {"role": "user", "content": f"Here is a math problem:\n\n{question}"},
            {"role": "assistant", "content": "OK, I've received the problem."},
            {
                "role": "user",
                "content": (
                    "Determine if the math problem can be solved programmatically using GNU Octave. "
                    "Answer with 'True' or 'False' and justify your choice."
                )
            },
        ]
        resolvability_response = generator.chat(messages)
        messages.append({"role": "assistant", "content": resolvability_response})
        is_octave_resolvable = parse_resolvability(resolvability_response)

        problem_features = None
        if not is_octave_resolvable:
            final_request = f"Problem: {question}\n\nProvide the solution to the math problem in the most suitable way."
        else:
            # Turn 2: feature extraction.
            messages.append({
                "role": "user",
                "content": (
                    "Analyze the problem and extract its key data and the following characteristics:\n"
                    "'given_data', 'unknowns', 'constraints','applicable methods',"
                    "'If a unit of measurement is used, specify which one'.\n"
                )
            })
            problem_features = generator.chat(messages)
            messages.append({"role": "assistant", "content": problem_features})

            # Turn 3: code request; the examples section is left out when
            # nothing was retrieved.
            retrieved_context = self._fetch_examples(question)
            examples_section = (
                f"You are given related examples:\n{retrieved_context}\n\n"
                if retrieved_context else ""
            )
            final_request = (
                f"{examples_section}"
                f"Problem features: {problem_features}\n\n"
                f"Problem: {question}\n\n"
                f"{CODE_ONLY_MSG}"
            )

        messages.append({"role": "user", "content": final_request})
        model_output = generator.chat(messages)
        messages.append({"role": "assistant", "content": model_output})

        return {
            "resolvability_prompt": messages[0]["content"],
            "resolvability_response": resolvability_response,
            "is_octave_resolvable": is_octave_resolvable,
            "problem_features": problem_features,
            "prompt": json.dumps(messages),
            "model_output": model_output,
            "inference_time": time.time() - started
        }

In [None]:
# Fixed Google Drive locations of the raw file for each supported dataset key.
DATASET_PATHS = {
    "aqua": "/content/drive/MyDrive/tesis/datasets/AQUA/test.json",
    "gsm8k": "/content/drive/MyDrive/tesis/datasets/GSM-8K/test.jsonl",
    "math_data": "/content/drive/MyDrive/tesis/datasets/MATH/math_data.jsonl",
    "math_shuffled": "/content/drive/MyDrive/tesis/datasets/MATH/shuffled_math.jsonl",
    "mmlu": "/content/drive/MyDrive/tesis/datasets/MMLU/MMLU_test.jsonl",
    "benchmark": "/content/drive/MyDrive/tesis/datasets/benchmarks/benchmark_math_gsm8k_3x300.jsonl",
}

In [None]:
import json
import os
import re
from typing import Dict, List, Optional
import sympy as sp
from sympy.parsing.latex import parse_latex as latex2sympy


import pandas as pd

def boxed_to_value_string(boxed_expr: str, precision: int = 15) -> str:
    r"""
    Turn the contents of a \boxed{...} into the string stored in the dataset.

    - If the expression parses as LaTeX and evaluates to a real number, the
      value is returned formatted with ``%.{precision}g``.
    - Otherwise (non-real value, letter, unparsable expression) the stripped
      input is returned unchanged.
    - The result is always a ``str``; empty or missing input yields ``""``.
    """
    text = (boxed_expr or "").strip()
    if not text:
        return text

    try:
        # Evaluate with a few guard digits beyond the requested precision.
        numeric = sp.N(latex2sympy(text), precision + 5)
        # ``is_real`` may be None for values that still convert to float, so
        # only an explicit False is treated as "not a real number".
        if numeric.is_real is False:
            formatted = text
        else:
            formatted = f"{float(numeric):.{precision}g}"
    except Exception:
        # Parsing or evaluation failed: fall back to the original text.
        formatted = text
    return formatted

def extract_boxed_solution(latex: str) -> str:
    r"""
    Return the contents of the first \boxed{...} found in ``latex``.

    Nested braces inside the box are balanced.  If the box is never closed,
    everything after ``\boxed{`` is returned.  ``None`` input or text without
    a box yields an empty string.  The result is stripped of surrounding
    whitespace.
    """
    if latex is None:
        return ""

    text = str(latex)
    marker = r"\boxed{"
    found_at = text.find(marker)
    if found_at < 0:
        return ""

    body_start = found_at + len(marker)
    open_braces = 0  # braces opened inside the box and not yet closed
    for pos in range(body_start, len(text)):
        ch = text[pos]
        if ch == "{":
            open_braces += 1
        elif ch == "}":
            if open_braces == 0:
                # This brace closes the \boxed{ itself.
                return text[body_start:pos].strip()
            open_braces -= 1

    # No closing brace found.
    return text[body_start:].strip()



class DatasetCSVBuilder:
    """
    Convert raw JSON/JSONL datasets into the standardized CSV used by the
    experiment pipeline.

    Every dataset is normalized to string columns ``question``, ``answer`` and
    ``rationale`` (missing values become ``""``), receives an incremental
    ``id`` column when it has none, and is padded with the bookkeeping columns
    listed in ``experiment_columns``.  ``max_items`` lets callers process only
    the first records of a file.
    """

    # Columns every standardized frame must expose.
    _BASE_COLUMNS = ("question", "answer", "rationale")
    # Letters used to label options that carry no explicit "A)" prefix.
    _CHOICE_LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

    def __init__(self, dataset_paths: Dict[str, str]):
        """
        Args:
            dataset_paths: mapping from dataset key to the path of its raw
                ``.json`` / ``.jsonl`` file.
        """
        self.dataset_paths = dataset_paths

        # Common currency/percent symbols and length/mass units; extend as needed.
        self.unit_pattern = re.compile(r'[$€£%]|(?<![a-zA-Z])m\b|cm\b|kg\b|km\b')
        # "A) text" style option: captures the letter and the option text.
        self.choice_pattern = re.compile(r'^\s*([A-Za-z])\)\s*(.*)$')
        # Not used inside this class; kept as public attributes.
        self.boxed_pattern = re.compile(r'\\boxed\{([^}]*)\}')
        self.simple_tex_pattern = re.compile(r'\\text\{([^}]*)\}')

        # Extra columns required by the full experiment pipeline.
        self.experiment_columns = [
            "strategy",
            "model",
            "dataset",
            "resolvability_prompt",
            "resolvability_response",
            "is_octave_resolvable",
            "problem_features",
            "prompt",
            "model_output",
            "inference_time",
            "octave_code",
            "execution_output",
            "execution_error",
            "is_correct",
        ]

    # --------------------------------------------------------------------- #
    # Public API
    # --------------------------------------------------------------------- #
    def create_dataset(
        self,
        key: str,
        max_items: Optional[int] = None,
    ) -> pd.DataFrame:
        """
        Load the dataset registered under ``key``, standardize up to
        ``max_items`` records (all of them when None), write the resulting CSV
        next to the source file and return the DataFrame.

        Raises:
            KeyError: if ``key`` is not in ``dataset_paths``.
        """
        path = self.dataset_paths[key]
        records = self._load_records(path, max_items=max_items)
        df = pd.DataFrame(records)
        df = self._transform_dataframe(key, df)
        df = self._finalize_schema(df, dataset_name=key)
        self._save_as_csv(df, path, max_items=max_items)
        return df

    # --------------------------------------------------------------------- #
    # Internal helpers
    # --------------------------------------------------------------------- #
    def _load_records(
        self,
        path: str,
        *,
        max_items: Optional[int] = None,
    ) -> List[dict]:
        """
        Read a ``.jsonl`` file (one JSON object per non-blank line) or a
        ``.json`` file (a list, or a single object) into a list of records.

        When ``max_items`` is set, at most that many records are returned.  A
        JSON decoding error is reported with ``print`` and whatever was loaded
        up to that point is returned (best effort).
        """
        data = []
        with open(path, "r", encoding="utf-8") as f:
            try:
                if path.endswith(".jsonl"):
                    for line in f:
                        stripped = line.strip()
                        if not stripped:
                            continue
                        data.append(json.loads(stripped))
                        if max_items and len(data) >= max_items:
                            break
                else:  # .json (a list or a single object)
                    content = json.load(f)
                    if isinstance(content, list):
                        data.extend(content[:max_items] if max_items else content)
                    else:
                        data.append(content)
            except json.JSONDecodeError as e:
                print(f"Error al leer {path}: {e}")

        return data

    @staticmethod
    def _blank_missing(series: pd.Series) -> pd.Series:
        """Return ``series`` as strings, mapping None/NaN to "" (no stripping)."""
        # Going through object dtype first keeps this safe for numeric and
        # nullable columns, where "" is not a valid replacement value.
        as_object = series.astype(object)
        return as_object.where(as_object.notna(), "").astype(str)

    @classmethod
    def _to_clean_str(cls, series: pd.Series) -> pd.Series:
        """Return ``series`` as stripped strings, mapping None/NaN to ""."""
        return cls._blank_missing(series).str.strip()

    def _resolve_source_columns(self, name: str, df: pd.DataFrame) -> pd.DataFrame:
        """
        Map dataset-specific column names onto ``question``/``answer``/
        ``rationale``.

        This must run BEFORE the base columns are created as placeholders:
        otherwise every "target column is missing" check is already false and
        a rename would produce duplicate column labels.
        """
        if name == "aqua":
            if "correct" in df.columns:
                # ``correct`` holds the answer letter; it replaces any
                # pre-existing ``answer`` column so labels stay unique.
                df = df.drop(columns=["answer"], errors="ignore")
                df = df.rename(columns={"correct": "answer"})
            if "question" not in df.columns and "Problem" in df.columns:
                df = df.rename(columns={"Problem": "question"})
        elif name in ("gsm8k", "mmlu"):
            if "question" not in df.columns and "Question" in df.columns:
                df = df.rename(columns={"Question": "question"})
        elif name == "benchmark":
            # Accept raw ``problem``/``solution`` records as a fallback; the
            # solution is copied as-is (no \boxed{} extraction).
            fallbacks = (
                ("question", "problem"),
                ("answer", "solution"),
                ("rationale", "solution"),
            )
            for target, source in fallbacks:
                if target not in df.columns and source in df.columns:
                    df[target] = df[source]
        return df

    def _transform_dataframe(self, name: str, df: pd.DataFrame) -> pd.DataFrame:
        """
        Normalize a raw frame to the common schema:
        - ``question``, ``answer``, ``rationale`` as stripped strings
          (missing values become "");
        - dataset-specific extras (``options``, ``answer_alternative``,
          ``choices``, ``dataset`` ...);
        - an incremental ``id`` column when the data has none.

        Unknown dataset names only get the generic normalization.
        """
        df = self._resolve_source_columns(name, df.copy())

        # Make sure the base columns exist; they are filled in below.
        for c in self._BASE_COLUMNS:
            if c not in df.columns:
                df[c] = None

        if name == "aqua":
            df = self._transform_aqua(df)
        elif name == "gsm8k":
            df = self._transform_gsm8k(df)
        elif name in ("math_data", "math_shuffled"):
            df = self._transform_math(df)
        elif name == "mmlu":
            df = self._transform_mmlu(df)
        elif name == "benchmark":
            df = self._transform_benchmark(df)

        # Missing values are blanked BEFORE the string conversion; converting
        # first would turn them into the literal text "None"/"nan".
        for c in self._BASE_COLUMNS:
            df[c] = self._to_clean_str(df[c])

        if "id" not in df.columns:
            df.insert(0, "id", range(len(df)))

        return df

    def _transform_aqua(self, df: pd.DataFrame) -> pd.DataFrame:
        """AQUA: join list options with ';', upper-case the answer letter and map it to its option text."""
        if "options" in df.columns:
            df["options"] = df["options"].apply(
                lambda x: ';'.join(map(str, x)) if isinstance(x, list) else (x if isinstance(x, str) else "")
            )

        df["answer"] = self._to_clean_str(df["answer"]).str.upper()

        # Letter -> option text, with units removed when present.
        df["answer_alternative"] = [
            self._map_if_unit(record) for record in df.to_dict("records")
        ]
        return df

    def _transform_gsm8k(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        GSM8K: split ``answer`` of the form "rationale #### final" row by row.

        A row without the ``####`` separator keeps its text as the answer and
        gets an empty rationale.
        """
        pieces = self._blank_missing(df["answer"]).map(lambda text: text.partition("####"))
        df["rationale"] = pieces.map(lambda p: p[0].strip() if p[1] else "")
        df["answer"] = pieces.map(lambda p: p[2].strip() if p[1] else p[0].strip())
        return df

    def _transform_math(self, df: pd.DataFrame) -> pd.DataFrame:
        """MATH: question := problem, rationale := solution, answer := contents of the first \\boxed{...}."""
        if "problem" in df.columns:
            df["question"] = df["problem"]
        if "solution" in df.columns:
            df["rationale"] = df["solution"]
        df["rationale"] = self._blank_missing(df["rationale"])
        # The boxed content is stored verbatim (no numeric conversion).
        df["answer"] = df["rationale"].apply(extract_boxed_solution)
        return df

    def _transform_mmlu(self, df: pd.DataFrame) -> pd.DataFrame:
        """MMLU: build a ';'-joined ``choices`` column from "Option*" columns when ``choices`` is absent."""
        if "choices" not in df.columns:
            option_cols = [c for c in df.columns if str(c).lower().startswith("option")]
            if option_cols:
                df["choices"] = [
                    ';'.join(values)
                    for values in df[option_cols].astype(str).values.tolist()
                ]
        # ``answer`` usually already is a letter or text; it is left untouched.
        return df

    def _transform_benchmark(self, df: pd.DataFrame) -> pd.DataFrame:
        """Benchmark: keep content as-is; make sure ``dataset`` is filled (default "benchmark")."""
        if "dataset" not in df.columns:
            df["dataset"] = "benchmark"
        else:
            df["dataset"] = self._blank_missing(df["dataset"]).replace("", "benchmark")
        return df

    def _finalize_schema(self, df: pd.DataFrame, dataset_name: str) -> pd.DataFrame:
        """
        Add any missing experiment column (``dataset`` is filled with
        ``dataset_name``, the others with None) and make sure the minimal
        columns exist.  ``df`` is modified in place and returned.
        """
        for col in self.experiment_columns:
            if col not in df.columns:
                df[col] = dataset_name if col == "dataset" else None

        for c in ["question", "answer", "rationale", "dataset"]:
            if c not in df.columns:
                df[c] = "" if c != "dataset" else dataset_name

        return df

    def _map_if_unit(self, row):
        """
        AQUA helper.

        - ``row['answer']`` is a letter (A/B/C/...).
        - ``row['options']`` is ``'A) ...;B) ...;...'``.

        Returns the text of the option selected by the letter with common
        units removed, or None when the options are missing or the letter
        cannot be mapped.  ``row`` only needs a ``.get`` method (dict or
        pandas Series).
        """
        letter = str(row.get("answer", "")).strip().upper()
        options = row.get("options", "")
        if not isinstance(options, str) or not options:
            return None

        parts = [p for p in options.split(';') if p.strip()]

        # Explicit "A) text" prefixes win.
        mapping = {}
        for raw in parts:
            m = self.choice_pattern.match(raw)
            if m:
                mapping[m.group(1).upper()] = m.group(2).strip()

        # No option carried a prefix: label them positionally A, B, C, ...
        if not mapping:
            mapping = {
                label: part.strip()
                for label, part in zip(self._CHOICE_LETTERS, parts)
            }

        value = mapping.get(letter, None)
        if not value:
            return None

        # If stripping units leaves nothing, keep the original text.
        cleaned = self.unit_pattern.sub('', value).strip()
        return cleaned if cleaned else value

    def _save_as_csv(
        self,
        df: pd.DataFrame,
        original_path: str,
        *,
        max_items: Optional[int] = None,
    ):
        """
        Write ``df`` next to ``original_path`` as
        ``<name>_experiment.csv``, or ``<name>_experiment_<max_items>.csv``
        when only a partial cut was processed.
        """
        base_dir = os.path.dirname(original_path)
        base_name = os.path.splitext(os.path.basename(original_path))[0]
        suffix = f"_experiment_{max_items}" if max_items else "_experiment"
        output_path = os.path.join(base_dir, f"{base_name}{suffix}.csv")
        df.to_csv(output_path, index=False)
        print(f"✅ CSV experimental guardado en: {output_path}")

# EXPERIMENTOS

# 1) Configuración

In [None]:
# =========================
# CONFIG GLOBAL
# =========================
import os, re, json, random
import pandas as pd
from datetime import datetime



# LLM generation / code execution settings
MAX_TOKENS       = 1024  # default max_tokens handed to build_generator
MAX_FIX_ATTEMPTS = 1     # default number of repair rounds in run_single_example
EXEC_TIMEOUT     = 10    # passed as OctaveCodeExecutor(timeout=...); presumably seconds — TODO confirm

# Outputs: result CSVs are written here (the directory is created if missing)
OUT_DIR = "/content/drive/MyDrive/tesis/experiments_final"
os.makedirs(OUT_DIR, exist_ok=True)

# --- RAG CONFIG ---
RAG_CSV_PATH = "/content/drive/MyDrive/tesis/datasets/NuminaMath-1.5_rag_corpus/NuminaMath-1.5_rag_corpus_final.csv"
RAG_DB_PATH = "./milvus_data.db" # Milvus Lite database file; previous Drive location: "/content/drive/MyDrive/tesis/milvus_lite/milvus.db"
RAG_COLLECTION = "numinamath_rag_v1"  # change this if the index is regenerated
RAG_ENCODER_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # original note: 384 dims, matching MilvusRetriever (defined elsewhere) — TODO confirm

def results_path(dataset_key: str, model_key: str, strategy_name: str) -> str:
    """Return the path under OUT_DIR of the results CSV for one (dataset, strategy, model) combination."""
    file_name = f"results_{dataset_key}_{strategy_name}_{model_key}.csv"
    return os.path.join(OUT_DIR, file_name)

# 2) Validación de stdout y verificación (número **o** letra)

In [None]:
# =========================
# VALIDACI√ìN STRICTA & VERIFICACI√ìN
# =========================
import re
from math_verify import parse, verify

# One number: optional sign, integer/decimal mantissa, optional exponent,
# optional trailing imaginary unit (i or j).
_NUMERIC_PATTERN = r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?(?:[ij])?"
# One ASCII letter (multiple-choice answers).
_LETTER_PATTERN  = r"[A-Za-z]"
# The whole output must be exactly one of the above, surrounded by optional whitespace.
_VALID_OUTPUT = re.compile(rf"^\s*(?:{_NUMERIC_PATTERN}|{_LETTER_PATTERN})\s*$")

def is_valid_stdout(s: str) -> bool:
    """Return True only when stdout is a single valid number or a single letter, with no extra text."""
    candidate = s if s else ""
    return _VALID_OUTPUT.match(candidate) is not None

def verify_output(gold: str, out: str) -> bool:
    """
    Compare the gold answer with the program's stdout.

    Stdout must pass ``is_valid_stdout``.  A single letter is compared
    case-insensitively with the gold text; anything else is compared through
    ``math_verify`` (``verify(parse(gold), parse(out))``).  Any exception
    raised by that comparison counts as a mismatch.
    """
    expected = "" if gold is None else str(gold).strip()
    produced = "" if out is None else str(out).strip()

    if not is_valid_stdout(produced):
        return False

    is_single_letter = len(produced) == 1 and produced.isalpha()
    if is_single_letter:
        return expected.upper() == produced.upper()

    try:
        matched = verify(parse(expected), parse(produced))
    except Exception:
        matched = False
    return matched

# 3) Preprocesamiento del código generado por el modelo

In [None]:
import re

# Optional text right after an opening fence:
#  - a single info-string token that ends the fence line (```octave, ```python,
#    ```text ...): in Markdown that token names the language and is never code;
#  - or one of the known language tags glued to code on the same line
#    (```octave x = 1```), which was already accepted before.
_FENCE_INFO = r"(?:[ \t]*[A-Za-z][\w+#.-]*[ \t]*\r?\n|(?:octave|matlab|m))?"

# Closed Markdown fence: ```[info] ... ```
_CODE_FENCE_RE = re.compile(
    r"```" + _FENCE_INFO + r"\s*(.*?)```",
    re.DOTALL | re.IGNORECASE
)

# Fence that is opened but never closed (e.g. the reply was cut off by the
# token limit): everything up to the end of the text belongs to the block.
_OPEN_FENCE_RE = re.compile(
    r"```" + _FENCE_INFO + r"\s*(.*)\Z",
    re.DOTALL | re.IGNORECASE
)

# Alternative: HTML <code>...</code> blocks
_HTML_CODE_RE = re.compile(
    r"<code[^>]*>(.*?)</code>",
    re.DOTALL | re.IGNORECASE
)

def extract_octave_code(text: str) -> str:
    """
    Extract the Octave code from a model reply.

    Lookup order:
      1. the first closed Markdown fence (its language tag is dropped);
      2. the first HTML ``<code>...</code>`` block;
      3. an unclosed Markdown fence with a non-empty body;
      4. the whole text, for replies that already contain only code.

    The result is stripped; ``None`` input yields ``""``.
    """
    if text is None:
        return ""
    s = str(text)

    # 1) Closed Markdown fence, with or without a language tag
    closed = _CODE_FENCE_RE.search(s)
    if closed:
        return closed.group(1).strip()

    # 2) HTML <code>...</code> block
    html_block = _HTML_CODE_RE.search(s)
    if html_block:
        return html_block.group(1).strip()

    # 3) Truncated reply: opening fence without its closing counterpart.
    #    Without this step the fence marker itself would be run as code.
    unclosed = _OPEN_FENCE_RE.search(s)
    if unclosed:
        body = unclosed.group(1).strip()
        if body:
            return body

    # 4) Fallback: return the text as-is
    return s.strip()

# 4) Builders (modelo, estrategia, dataset)

In [None]:
# =========================
# BUILDERS
# =========================

# ============ RAG retriever builder (Milvus Lite) ============
from sentence_transformers import SentenceTransformer
import pandas as pd

# Process-wide cache so the retriever is built only once per kernel session.
_RAG_CACHE = {"retriever": None}

def build_rag_retriever():
    """
    Return the shared MilvusRetriever, building it on first use.

    The first call reads the corpus CSV at RAG_CSV_PATH, loads the
    RAG_ENCODER_NAME sentence encoder and creates the retriever on
    RAG_DB_PATH / RAG_COLLECTION; later calls return the cached instance.
    """
    cached = _RAG_CACHE["retriever"]
    if cached is None:
        corpus_df = pd.read_csv(RAG_CSV_PATH)
        query_encoder = SentenceTransformer(RAG_ENCODER_NAME)
        cached = MilvusRetriever(
            df=corpus_df,
            encoder=query_encoder,
            db_path=RAG_DB_PATH,
            collection_name=RAG_COLLECTION
        )
        _RAG_CACHE["retriever"] = cached
    return cached

def build_generator(model_key: str, max_tokens: int = MAX_TOKENS):
    """
    Create the VLLMGenerator for the model registered under ``model_key``.

    Sampling is fixed (temperature 0.0, top_p 1.0) so that runs stay
    comparable across models and strategies.
    """
    settings = {
        "model_name": ModelRegistry.get_model_repo(model_key),
        "download": True,
        "temperature": 0.0,
        "top_p": 1.0,
        "max_tokens": max_tokens,
        "log_prompts": False,
    }
    return VLLMGenerator(**settings)

def build_strategy(name: str):
    """
    Instantiate the prompting strategy registered under ``name``
    (case-insensitive).

    Raises:
        ValueError: if ``name`` is not one of the supported strategies.
    """
    def _few_shots():
        # The retriever is built before the classifier.
        retr = FewShotRetriever(
            csv_path="/content/drive/MyDrive/tesis/datasets/NuminaMath-1.5_rag_corpus/NuminaMath-1.5_rag_corpus_final.csv"
        )
        clf = MultiTaskInferencePipeline(
            model_path="/content/drive/MyDrive/tesis/categorizacion/bert_pre_trained_math_f1.pth",
            encoder_problem_path="/content/drive/MyDrive/tesis/categorizacion/problem_type_encoder.pkl",
            encoder_question_path="/content/drive/MyDrive/tesis/categorizacion/question_type_encoder.pkl"
        )
        return FewShotConversationalStrategy(retriever=retr, classifier=clf, k=3)

    def _rag():
        return RAGConversationalStrategy(retriever=build_rag_retriever())

    # Factories are zero-argument callables so nothing is built (or even
    # looked up) until the requested strategy is known.
    factories = {
        "nonconv_zeroshot": lambda: NonConversationalZeroShotStrategy(),
        "nonconv_packed": lambda: NonConversationalReasonedCodeStrategy(),
        "few_shots": _few_shots,
        "zero_shot": lambda: ZeroShotConversationalStrategy(),
        "cot": lambda: ChainOfThoughtConversationalStrategy(),
        "cot_reasoning": lambda: ChainOfThoughtReasoningConversationalStrategy(),
        "rag": _rag,
    }

    factory = factories.get(name.lower())
    if factory is None:
        raise ValueError(f"Estrategia no soportada: {name}")
    return factory()

def load_subset_df(dataset_key: str,
                   start_index: int,
                   limit: int,
                   n_samples: int ) -> pd.DataFrame:
    """
    Return the first ``n_samples`` rows (in order, no randomness) of the
    window ``[start_index : start_index + limit)`` of a dataset.

    The dataset is rebuilt through ``DatasetCSVBuilder.create_dataset`` on
    every call, which also rewrites its standardized CSV.

    Returns a frame with the columns ``row_id`` ("<dataset_key>:<index label>"),
    ``question`` and ``answer``.
    """
    full_df = DatasetCSVBuilder(dataset_paths=DATASET_PATHS).create_dataset(dataset_key)

    # Fixed window of the dataset
    window_stop = start_index + limit
    subset = full_df.iloc[start_index:window_stop].copy()

    # Keep the leading n_samples rows; shorter windows are returned whole
    if n_samples < len(subset):
        subset = subset.head(n_samples).copy()

    # Stable row_id built from the dataset key and the original index label
    row_ids = [f"{dataset_key}:{label}" for label in subset.index]
    subset.insert(0, "row_id", row_ids)

    # Guarantee the minimal columns
    absent = [col for col in ("question", "answer") if col not in subset]
    for col in absent:
        subset[col] = ""

    return subset.loc[:, ["row_id", "question", "answer"]]

# 5) Núcleo de ejecución por ejemplo (con política)

In [None]:
import time

# =========================
# CORE: ejecutar 1 ejemplo con auto-fix y verificaci√≥n
# =========================
def run_single_example(row: dict, strategy, generator, executor,
                       max_fix_attempts: int = MAX_FIX_ATTEMPTS) -> dict:
    """
    Run one dataset example end to end: conversation, code extraction,
    execution with automatic repair, and answer verification.

    Args:
        row: mapping with ``question``, ``answer`` and ``row_id``.
        strategy: object exposing ``run_conversation(row, generator)``.
        generator: object exposing ``chat(messages)``.
        executor: object exposing ``execute_with_timeout(code)`` and returning
            ``(stdout, stderr)``; ``stderr is None`` is treated as success.
        max_fix_attempts: maximum number of repair requests sent to the model
            after a failed or invalid execution.

    Returns:
        A dict with the final code, its output/error, ``is_correct`` and audit
        fields.  ``inference_time`` is the conversation time plus the time
        spent executing and repairing the code.
    """
    question = str(row.get("question", "")).strip()
    answer   = str(row.get("answer", "")).strip()
    row_id   = row.get("row_id")

    # 1) Base conversation (the strategy decides how the code is requested)
    conversation_started = time.time()
    base = strategy.run_conversation(row, generator)
    conversation_ended = time.time()

    # 2) Take the model output and extract the code from it
    raw_output = base.get("model_output", "")
    code = extract_octave_code(raw_output)

    # 3) Execute; ask the model for a fix on error or invalid output
    attempts = 0
    stdout = None
    stderr = None
    last_code = code

    while attempts <= max_fix_attempts:
        try:
            stdout, stderr = executor.execute_with_timeout(last_code)
        except Exception as e:
            stdout, stderr = None, str(e)

        if (stderr is None) and is_valid_stdout(stdout):
            break

        attempts += 1
        if attempts > max_fix_attempts:
            break

        repair_messages = [
            {"role": "user", "content":
                "The following GNU Octave code failed or produced an invalid output.\n"
                "Please fix it and return ONLY the corrected Octave script (no prose/markdown).\n\n"
                "Previous code:\n```octave\n" + last_code + "\n```\n"
                "stderr:\n" + (stderr or "(none)") + "\n"
                "stdout:\n" + (stdout or "(none)") + "\n"
            }
        ]
        repaired = generator.chat(repair_messages)
        last_code = extract_octave_code(repaired)

    # 4) Strict check of stdout (number or letter) against the gold answer
    is_correct = (stderr is None) and is_valid_stdout(stdout) and verify_output(answer, stdout)

    # Total time = conversation + execution/repair.  The strategy's own
    # measurement is used for the conversation when it reports one; the
    # conversation must not be counted a second time through a timer that
    # also spans it.
    conversation_time = base.get("inference_time") or (conversation_ended - conversation_started)
    total_time = conversation_time + (time.time() - conversation_ended)

    return {
        "row_id": row_id,
        "question": question,
        "answer": answer,
        "octave_code": last_code,
        "execution_output": stdout,
        "execution_error": stderr,
        "is_correct": is_correct,
        # extras kept for auditing
        "resolvability_response": base.get("resolvability_response"),
        "is_octave_resolvable": base.get("is_octave_resolvable"),
        "problem_features": base.get("problem_features"),
        "inference_time": total_time,  # total time
        "prompt_base": base.get("prompt"),
        "model_output_raw": raw_output,
    }

In [None]:
from collections import defaultdict
from IPython.display import display

def expected_row_ids(dataset_key: str, start_index: int, limit: int, n_samples: int) -> list[str]:
    """
    Return, as strings, the ``row_id`` values that ``load_subset_df`` yields
    for ``dataset_key`` with the given window (``start_index``, ``limit``)
    and sample count.
    """
    subset = load_subset_df(dataset_key, start_index, limit, n_samples)
    return [str(row_id) for row_id in subset["row_id"]]

def progress_table(models, datasets, strategy_name, out_dir=OUT_DIR,
                   start_index: int = 0,
                   limit: Optional[int] = None,
                   n_samples: Optional[int] = None) -> pd.DataFrame:
    """
    Build (and display) a progress table per (dataset, model, strategy):
        - total_target: number of expected row_ids
        - done: expected row_ids already present in the results CSV
        - pending: total_target - done
        - out_csv: path of the CSV the results are written to

    Args:
        models, datasets: iterables of model / dataset keys.
        strategy_name: strategy whose result files are inspected.
        out_dir: directory holding the result CSVs.
        start_index, limit, n_samples: window forwarded to
            ``expected_row_ids``.  At least one of ``limit`` / ``n_samples``
            is required; a missing one defaults to the other.

    Raises:
        ValueError: if both ``limit`` and ``n_samples`` are None.
    """
    if limit is None and n_samples is None:
        raise ValueError("progress_table requires `limit` and/or `n_samples` to define the expected rows")
    if limit is None:
        limit = n_samples
    if n_samples is None:
        n_samples = limit

    columns = ["dataset", "model", "strategy", "total_target", "done", "pending", "out_csv"]
    expected_by_dataset = {}  # the expected ids do not depend on the model
    rows = []
    for model_key in models:
        for dataset_key in datasets:
            if dataset_key not in expected_by_dataset:
                expected_by_dataset[dataset_key] = set(
                    expected_row_ids(dataset_key, start_index, limit, n_samples)
                )
            exp_ids = expected_by_dataset[dataset_key]

            # Same file name as results_path, placed under the requested out_dir.
            file_name = os.path.basename(results_path(dataset_key, model_key, strategy_name))
            out_csv = os.path.join(out_dir, file_name)

            done_ids = set()
            if os.path.exists(out_csv):
                try:
                    tmp = pd.read_csv(out_csv, usecols=["row_id"])
                    done_ids = set(tmp["row_id"].astype(str).tolist())
                except Exception as e:
                    # Best effort: an unreadable CSV counts as "nothing done",
                    # but the problem is reported instead of hidden.
                    print(f"Error al leer {out_csv}: {e}")
                    done_ids = set()
            rows.append({
                "dataset": dataset_key,
                "model": model_key,
                "strategy": strategy_name,
                "total_target": len(exp_ids),
                "done": len(exp_ids & done_ids),
                "pending": len(exp_ids - done_ids),
                "out_csv": out_csv
            })
    # Explicit columns keep the sort valid even when there are no rows.
    dfp = pd.DataFrame(rows, columns=columns).sort_values(["dataset","model"]).reset_index(drop=True)
    display(dfp)
    return dfp

def list_pending_ids(dataset_key: str, model_key: str, strategy_name: str, start_index: int, limit: int, n_samples: int) -> list[str]:
    """
    Return, sorted, the expected row_ids that are not yet present in the
    results CSV of this (dataset, model, strategy) combination.

    A missing or unreadable CSV is treated as "nothing done yet".
    """
    out_csv = results_path(dataset_key, model_key, strategy_name)
    expected = set(expected_row_ids(dataset_key, start_index, limit, n_samples))

    completed = set()
    if os.path.exists(out_csv):
        try:
            stored = pd.read_csv(out_csv, usecols=["row_id"])
            completed = set(stored["row_id"].astype(str))
        except Exception:
            # Best effort: keep the empty set when the file cannot be read.
            completed = set()

    return sorted(row_id for row_id in expected if row_id not in completed)

# 6) Reanudación (checkpoint) y orquestador multi-modelo/dataset/política

In [None]:
# =========================
# REANUDACI√ìN & ORQUESTADOR
# =========================

from tqdm.auto import tqdm

def _completed_row_ids(out_csv):
    """Return the row_ids (as str) logged in `out_csv`; empty set if missing or unreadable."""
    if not os.path.exists(out_csv):
        return set()
    try:
        return set(pd.read_csv(out_csv, usecols=["row_id"])["row_id"].astype(str))
    except Exception as exc:
        # Best effort, as before, but make the failure visible: an unreadable
        # checkpoint means every row will be treated as pending.
        print(f"Aviso: no se pudo leer el checkpoint {out_csv}: {exc}")
        return set()


def _existing_csv_columns(out_csv):
    """Return the header columns of an existing, non-empty CSV, or None."""
    if not os.path.exists(out_csv) or os.path.getsize(out_csv) == 0:
        return None
    try:
        return pd.read_csv(out_csv, nrows=0).columns.tolist()
    except Exception:
        return None


def run_experiments_progress(models, datasets, strategy_name, start_index, limit, n_samples):
    """
    Run (or resume) every (model, dataset) combination for one strategy.

    For each combination the expected row_ids are compared with the row_ids
    already present in the results CSV; only the missing ones are processed and
    each result is appended to the CSV as soon as it is produced, so the CSV
    doubles as a checkpoint.

    Args:
        models: iterable of model keys (passed to build_generator).
        datasets: iterable of dataset keys.
        strategy_name: strategy key (passed to build_strategy / results_path).
        start_index, limit, n_samples: forwarded unchanged to expected_row_ids
            and load_subset_df to select the subset of each dataset.

    Notes:
        - The generator, strategy and executor are built lazily, the first time
          a dataset with pending rows is found, so a fully completed model does
          not pay the model-loading cost.
        - Every appended row is aligned to the CSV's column set (the existing
          header, or the first row written to a new file). Rows produced by the
          error path have fewer keys than regular rows; without this alignment
          the file becomes ragged and can no longer be read back.
        - The file is flushed after every row so an interrupted session loses
          at most the example in flight.
    """
    for model_key in models:
        print(f"\n================= MODEL: {model_key} =================")
        # Built on demand; see the lazy initialisation below.
        generator = strategy = executor = None

        for dataset_key in datasets:
            print(f"-- dataset: {dataset_key}")
            # Expected ids vs. ids already checkpointed.
            all_ids = expected_row_ids(dataset_key, start_index, limit, n_samples)
            out_csv = results_path(dataset_key, model_key, strategy_name)
            done_ids = _completed_row_ids(out_csv)
            pend_ids = [rid for rid in all_ids if rid not in done_ids]

            # Nothing left to do for this combination.
            if not pend_ids:
                print("No hay pendientes. Saltando.")
                continue

            if generator is None:
                generator = build_generator(model_key)
                strategy  = build_strategy(strategy_name)
                executor  = OctaveCodeExecutor(timeout=EXEC_TIMEOUT)

            # Load the subset once and keep only the pending rows.
            df = load_subset_df(dataset_key, start_index, limit, n_samples)
            df = df[df["row_id"].isin(pend_ids)].copy()

            columns = _existing_csv_columns(out_csv)
            # An existing but empty file still needs a header.
            write_header = (not os.path.exists(out_csv)) or os.path.getsize(out_csv) == 0
            warned_dropped = False
            # newline="" is what pandas recommends when handing to_csv an open handle.
            with open(out_csv, "a", encoding="utf-8", newline="") as fout:
                for _, row in tqdm(df.iterrows(), total=len(df), desc=f"{dataset_key} ({len(pend_ids)} pendientes)"):
                    rowd = row.to_dict()
                    try:
                        out = run_single_example(
                            row=rowd,
                            strategy=strategy,
                            generator=generator,
                            executor=executor,
                            max_fix_attempts=MAX_FIX_ATTEMPTS
                        )
                        out.update({
                            "dataset": dataset_key,
                            "model": model_key,
                            "strategy": strategy_name,
                            "timestamp": datetime.now().isoformat(timespec="seconds")
                        })
                    except Exception as e:
                        # Record the failure as an incorrect row so the id is
                        # checkpointed and the run can continue.
                        out = {
                            "row_id": rowd.get("row_id"),
                            "question": rowd.get("question"),
                            "answer": rowd.get("answer"),
                            "octave_code": None,
                            "execution_output": None,
                            "execution_error": f"PIPELINE_ERROR: {e}",
                            "is_correct": False,
                            "dataset": dataset_key,
                            "model": model_key,
                            "strategy": strategy_name,
                            "timestamp": datetime.now().isoformat(timespec="seconds")
                        }

                    row_df = pd.DataFrame([out])
                    if columns is None:
                        # First row of a new file fixes the schema.
                        columns = row_df.columns.tolist()
                    else:
                        dropped = [c for c in row_df.columns if c not in columns]
                        if dropped and not warned_dropped:
                            print(f"Aviso: columnas fuera del encabezado de {out_csv}, se omiten: {dropped}")
                            warned_dropped = True
                        row_df = row_df.reindex(columns=columns)

                    row_df.to_csv(fout, header=write_header, index=False)
                    fout.flush()
                    write_header = False

            # Short per-combination summary once the pending rows are processed.
            new_done = len(set(all_ids) & _completed_row_ids(out_csv))
            print(f"  ‚Üí Progreso {dataset_key}/{model_key}: {new_done}/{len(all_ids)} completados")

    print("Experimentos finalizados / checkpoints actualizados.")

# 7) Lanzar los experimentos

In [None]:
# Run sizes (forwarded to run_experiments_progress below)
N_SAMPLES          = 900      # exercises per dataset (original note: sized for quick iteration)
START_INDEX        = 0       # index from which sampling starts
LIMIT_PER_DATASET  = 900     # per-dataset item cap; NOTE(review): the original comment said "first 400 items" but the value is 900 — confirm which is intended

# Datasets / models to evaluate
DATASETS_TO_RUN = ["benchmark"] # full list: ["gsm8k", "math_data", "math_shuffled", "benchmark"]
MODELS_TO_RUN   = ["deepseek-math-7b"] # full list: ["deepseek-math-7b", "mathstral-7b", "mistral-7b-instruct", "qwen2-math-7b-instruct"]
# Strategy to run; one of ("nonconv_zeroshot", "nonconv_packed", "zero_shot", "few_shots", "cot_reasoning", "rag")
STRATEGY_NAME   = "few_shots"

# Launch (or resume) the selected model x dataset combinations for STRATEGY_NAME.
run_experiments_progress(
    models=MODELS_TO_RUN,
    datasets=DATASETS_TO_RUN,
    strategy_name=STRATEGY_NAME,
    start_index=START_INDEX,
    limit=LIMIT_PER_DATASET,
    n_samples=N_SAMPLES
)





For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: a9d4f501-4393-4763-9295-75638d5ea02d)')' thrown while requesting HEAD https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct/resolve/0a5828f800a36df0fd7f0ed581b983246c0677ff/LICENSE
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 098e2ca6-1afb-42d4-b5f1-fc958dfcb1cb)')' thrown while requesting HEAD https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct/resolve/0a5828f800a36df0fd7f0ed581b983246c0677ff/pytorch_model-00001-of-00002.bin
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: c75c1333-c742-4d9b-bc23-395429acf5d6)')' thrown while requesting HEAD https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct/resolve/0a5828f800a36df0fd7f0ed581b983246c0677ff/generation_config.json
Retrying in 1s [Retry

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

LICENSE: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

INFO 11-02 17:10:45 [utils.py:233] non-default args: {'trust_remote_code': True, 'gpu_memory_utilization': 0.92, 'disable_log_stats': True, 'model': '/content/deepseek-math-7b-instruct'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 11-02 17:10:45 [model.py:547] Resolved architecture: LlamaForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 11-02 17:10:45 [model.py:1510] Using max model len 4096
INFO 11-02 17:10:47 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 11-02 17:12:12 [llm.py:306] Supported_tasks: ['generate']
Indexados 1994 ejemplos en 6 categor√≠as (problem_type).
-- dataset: benchmark
‚úÖ CSV experimental guardado en: /content/drive/MyDrive/tesis/datasets/benchmarks/benchmark_math_gsm8k_3x300_experiment.csv
No hay pendientes. Saltando.
Experimentos finalizados / checkpoints actualizados.


# Resumen & Leaderboard

In [None]:
# =========================
# SUMMARY & RANKING (penalises missing coverage) + TIMINGS
# =========================
import os, glob, math
import pandas as pd
from datetime import datetime
import numpy as np

OUT_DIR = "/content/drive/MyDrive/tesis/experiments_final"


def _as_bool(series):
    """Coerce a column to plain bool: missing -> False, anything else -> bool(value).

    Same result as `series.fillna(False).astype(bool)`, but without calling
    fillna on an object column, which emits a downcasting FutureWarning on
    recent pandas versions.
    """
    return series.map(lambda v: False if pd.isna(v) else bool(v)).astype(bool)


def _safe_ratio(num, den, fill=0.0):
    """Element-wise num/den, replaced by `fill` wherever den is not > 0 (zero or missing)."""
    return (num / den).where(den > 0, fill)


# 1) Load every results CSV
csv_paths = sorted(glob.glob(os.path.join(OUT_DIR, "results_*.csv")))
if not csv_paths:
    raise FileNotFoundError(f"No se encontraron CSVs de resultados en {OUT_DIR}")

dfs = []
for p in csv_paths:
    try:
        df = pd.read_csv(p)
        # Guarantee the minimum set of columns so older CSVs can be concatenated.
        for c in [
            "row_id","dataset","model","strategy","is_correct",
            "octave_code","execution_output","execution_error",
            "inference_time"
        ]:
            if c not in df.columns:
                df[c] = None
        df["__source_file"] = os.path.basename(p)
        dfs.append(df)
    except Exception as e:
        print(f"‚ö†Ô∏è Error leyendo {p}: {e}")

if not dfs:
    raise RuntimeError("No se pudo cargar ning√∫n CSV v√°lido.")
all_df = pd.concat(dfs, ignore_index=True)

# 2) Type normalisation
# - is_correct: boolean (missing -> False)
all_df["is_correct"] = _as_bool(all_df["is_correct"])

# Make sure is_octave_resolvable exists and is boolean (missing -> False)
if "is_octave_resolvable" not in all_df.columns:
    all_df["is_octave_resolvable"] = False
all_df["is_octave_resolvable"] = _as_bool(all_df["is_octave_resolvable"])

# Strategies treated as conversational
CONV_STRATS = {
    "zero_shot", "few_shots", "cot", "cot_reasoning", "rag"
}

# Attempt marker for non-conversational strategies: any execution trace present
attempted_nonconv = (
    all_df["octave_code"].notna() |
    all_df["execution_output"].notna() |
    all_df["execution_error"].notna()
)

# `attempted` depends on the strategy type
all_df["attempted"] = np.where(
    all_df["strategy"].astype(str).isin(CONV_STRATS),
    all_df["is_octave_resolvable"],      # conversational: only when flagged resolvable
    attempted_nonconv                    # non-conversational: execution trace present
).astype(bool)

# - inference_time: numeric (NaN when unavailable)
all_df["inference_time"] = pd.to_numeric(all_df["inference_time"], errors="coerce")

# 3) Items per dataset = number of distinct row_ids seen for that dataset
total_by_dataset = (
    all_df.groupby("dataset")["row_id"]
    .nunique()
    .rename("total_items")
    .reset_index()
)

# 4) Aggregate per (dataset, model, strategy)
agg = (
    all_df.groupby(["dataset","model","strategy"], dropna=False)
    .agg(
        attempted=("attempted", "sum"),                 # problems attempted
        correct=("is_correct", "sum"),                  # problems solved correctly
        total_rows=("row_id","count"),                  # logged rows (reference only)
        inf_time_sum=("inference_time", "sum"),         # total time (NaN ignored)
        inf_time_mean=("inference_time", "mean"),       # plain mean (NaN ignored)
        inf_time_median=("inference_time", "median")    # median (robust)
    )
    .reset_index()
)

# 5) Attach the dataset-level item count
agg = agg.merge(total_by_dataset, on="dataset", how="left")

# 6) Metrics (vectorised; a non-positive denominator yields the fallback value)
# - coverage        = attempted / total_items (fraction of the dataset attempted)
# - attempt_success = correct / attempted     (accuracy over attempted items)
# - overall_success = correct / total_items   (penalises missing coverage)
agg["coverage"] = _safe_ratio(agg["attempted"], agg["total_items"])
agg["attempt_success"] = _safe_ratio(agg["correct"], agg["attempted"])
agg["overall_success"] = _safe_ratio(agg["correct"], agg["total_items"])

# Mean time per attempt (comparable across different coverage levels)
agg["inf_time_per_attempt"] = _safe_ratio(agg["inf_time_sum"], agg["attempted"], fill=float("nan"))

# 7) Rankings
# 7.1 Per-dataset ranking: overall_success desc; ties broken by attempt_success, coverage, correct
rank_per_dataset = (
    agg.sort_values(
        ["dataset","overall_success","attempt_success","coverage","correct"],
        ascending=[True, False, False, False, False]
    )
)

# 7.2 Global ranking: sums across datasets
global_score = (
    agg.groupby(["model","strategy"], dropna=False)
       .agg(
           total_correct=("correct","sum"),
           total_attempted=("attempted","sum"),
           total_items=("total_items","sum"),
           total_inf_time=("inf_time_sum","sum"),
           mean_inf_time=("inf_time_mean","mean"),          # mean of means (informative)
           median_inf_time=("inf_time_median","median")     # median of medians (robust)
       )
       .reset_index()
)

# Global metrics
global_score["global_overall_success"] = _safe_ratio(global_score["total_correct"], global_score["total_items"])
global_score["global_attempt_success"] = _safe_ratio(global_score["total_correct"], global_score["total_attempted"])
global_score["global_coverage"] = _safe_ratio(global_score["total_attempted"], global_score["total_items"])
# Global mean time per attempt (weighted by attempts)
global_score["global_inf_time_per_attempt"] = _safe_ratio(
    global_score["total_inf_time"], global_score["total_attempted"], fill=float("nan")
)

global_rank = global_score.sort_values(
    ["global_overall_success","global_attempt_success","global_coverage","total_correct"],
    ascending=[False, False, False, False]
).reset_index(drop=True)

# 8) Save and show
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
agg_path = os.path.join(OUT_DIR, f"agg_overall_{ts}.csv")
rank_ds_path = os.path.join(OUT_DIR, f"rank_per_dataset_{ts}.csv")
global_rank_path = os.path.join(OUT_DIR, f"global_rank_{ts}.csv")

agg.to_csv(agg_path, index=False)
rank_per_dataset.to_csv(rank_ds_path, index=False)
global_rank.to_csv(global_rank_path, index=False)

print("‚úÖ Guardados:")
print("  -", agg_path)
print("  -", rank_ds_path)
print("  -", global_rank_path)

# The heading announces the GLOBAL ranking, so show global_rank here (the
# original displayed the per-dataset table under this title).
print("\n=== üèÅ Ranking GLOBAL (mejor ‚Üí peor) ===")
display(global_rank[[
    "model","strategy",
    "total_correct","total_attempted","total_items",
    "global_overall_success","global_attempt_success","global_coverage",
    "global_inf_time_per_attempt","total_inf_time","mean_inf_time","median_inf_time"
]].head(20))



‚úÖ Guardados:
  - /content/drive/MyDrive/tesis/experiments_final/agg_overall_20251030_210027.csv
  - /content/drive/MyDrive/tesis/experiments_final/rank_per_dataset_20251030_210027.csv
  - /content/drive/MyDrive/tesis/experiments_final/global_rank_20251030_210027.csv

=== üèÅ Ranking GLOBAL (mejor ‚Üí peor) ===


  all_df["is_octave_resolvable"] = all_df["is_octave_resolvable"].fillna(False).astype(bool)


Unnamed: 0,dataset,model,strategy,correct,attempted,total_items,overall_success,attempt_success,coverage,inf_time_per_attempt,inf_time_mean,inf_time_median,inf_time_sum
6,benchmark,mathstral-7b,cot_reasoning,422,900,900,0.468889,0.468889,1.0,101.477798,101.477798,97.070483,91330.01803
3,benchmark,deepseek-math-7b,nonconv_zeroshot,379,811,900,0.421111,0.467324,0.901111,26.336648,23.732246,18.86735,21359.021514
18,benchmark,qwen2-math-7b-instruct,cot_reasoning,379,868,900,0.421111,0.436636,0.964444,170.554202,164.490053,151.67298,148041.047634
10,benchmark,mathstral-7b,rag,377,900,900,0.418889,0.418889,1.0,60.705703,60.705703,56.027802,54635.132286
2,benchmark,deepseek-math-7b,nonconv_packed,376,827,900,0.417778,0.454655,0.918889,25.215631,23.170363,19.114528,20853.326453
11,benchmark,mathstral-7b,zero_shot,365,900,900,0.405556,0.405556,1.0,63.291097,63.291097,59.663431,56961.987339
7,benchmark,mathstral-7b,few_shots,364,900,900,0.404444,0.404444,1.0,61.473006,61.473006,57.597412,55325.705601
9,benchmark,mathstral-7b,nonconv_zeroshot,355,900,900,0.394444,0.394444,1.0,24.845617,24.845617,21.301638,22361.055521
23,benchmark,qwen2-math-7b-instruct,zero_shot,352,777,900,0.391111,0.453024,0.863333,137.642868,118.831676,107.985078,106948.508755
19,benchmark,qwen2-math-7b-instruct,few_shots,347,782,900,0.385556,0.443734,0.868889,140.296614,121.766873,110.719337,109711.952215


In [None]:
# Global leaderboard: first 20 rows of the already-sorted (model, strategy) ranking.
display(global_rank.loc[:, [
    "model","strategy",
    "total_correct","total_attempted","total_items",
    "global_overall_success","global_attempt_success","global_coverage",
    "global_inf_time_per_attempt","total_inf_time","mean_inf_time","median_inf_time"
]].iloc[:20])

# Per-dataset leaderboard, in the order produced by the ranking cell.
print("\n=== üß© Ranking por DATASET (ordenado por overall_success) ===")
display(rank_per_dataset.loc[:, [
    "dataset","model","strategy",
    "correct","attempted","total_items",
    "overall_success","attempt_success","coverage",
    "inf_time_per_attempt","inf_time_mean","inf_time_median","inf_time_sum"
]])

Unnamed: 0,model,strategy,total_correct,total_attempted,total_items,global_overall_success,global_attempt_success,global_coverage,global_inf_time_per_attempt,total_inf_time,mean_inf_time,median_inf_time
0,mathstral-7b,cot_reasoning,422,900,900,0.468889,0.468889,1.0,101.477798,91330.01803,101.477798,97.070483
1,deepseek-math-7b,nonconv_zeroshot,379,811,900,0.421111,0.467324,0.901111,26.336648,21359.021514,23.732246,18.86735
2,qwen2-math-7b-instruct,cot_reasoning,379,868,900,0.421111,0.436636,0.964444,170.554202,148041.047634,164.490053,151.67298
3,mathstral-7b,rag,377,900,900,0.418889,0.418889,1.0,60.705703,54635.132286,60.705703,56.027802
4,deepseek-math-7b,nonconv_packed,376,827,900,0.417778,0.454655,0.918889,25.215631,20853.326453,23.170363,19.114528
5,mathstral-7b,zero_shot,365,900,900,0.405556,0.405556,1.0,63.291097,56961.987339,63.291097,59.663431
6,mathstral-7b,few_shots,364,900,900,0.404444,0.404444,1.0,61.473006,55325.705601,61.473006,57.597412
7,mathstral-7b,nonconv_zeroshot,355,900,900,0.394444,0.394444,1.0,24.845617,22361.055521,24.845617,21.301638
8,qwen2-math-7b-instruct,zero_shot,352,777,900,0.391111,0.453024,0.863333,137.642868,106948.508755,118.831676,107.985078
9,qwen2-math-7b-instruct,few_shots,347,782,900,0.385556,0.443734,0.868889,140.296614,109711.952215,121.766873,110.719337



=== üß© Ranking por DATASET (ordenado por overall_success) ===


Unnamed: 0,dataset,model,strategy,correct,attempted,total_items,overall_success,attempt_success,coverage,inf_time_per_attempt,inf_time_mean,inf_time_median,inf_time_sum
6,benchmark,mathstral-7b,cot_reasoning,422,900,900,0.468889,0.468889,1.0,101.477798,101.477798,97.070483,91330.01803
3,benchmark,deepseek-math-7b,nonconv_zeroshot,379,811,900,0.421111,0.467324,0.901111,26.336648,23.732246,18.86735,21359.021514
18,benchmark,qwen2-math-7b-instruct,cot_reasoning,379,868,900,0.421111,0.436636,0.964444,170.554202,164.490053,151.67298,148041.047634
10,benchmark,mathstral-7b,rag,377,900,900,0.418889,0.418889,1.0,60.705703,60.705703,56.027802,54635.132286
2,benchmark,deepseek-math-7b,nonconv_packed,376,827,900,0.417778,0.454655,0.918889,25.215631,23.170363,19.114528,20853.326453
11,benchmark,mathstral-7b,zero_shot,365,900,900,0.405556,0.405556,1.0,63.291097,63.291097,59.663431,56961.987339
7,benchmark,mathstral-7b,few_shots,364,900,900,0.404444,0.404444,1.0,61.473006,61.473006,57.597412,55325.705601
9,benchmark,mathstral-7b,nonconv_zeroshot,355,900,900,0.394444,0.394444,1.0,24.845617,24.845617,21.301638,22361.055521
23,benchmark,qwen2-math-7b-instruct,zero_shot,352,777,900,0.391111,0.453024,0.863333,137.642868,118.831676,107.985078,106948.508755
19,benchmark,qwen2-math-7b-instruct,few_shots,347,782,900,0.385556,0.443734,0.868889,140.296614,121.766873,110.719337,109711.952215


In [None]:
# =========================
# STATISTICAL ANALYSIS: COCHRAN'S Q and McNEMAR (penalising missing coverage)
# =========================
import os, glob
import pandas as pd
import numpy as np
from statsmodels.stats.contingency_tables import cochrans_q, mcnemar
from statsmodels.stats.multitest import multipletests

OUT_DIR = "/content/drive/MyDrive/tesis/experiments_final"

# 1) Load results_*.csv
csv_paths = sorted(glob.glob(os.path.join(OUT_DIR, "results_*.csv")))
if not csv_paths:
    raise FileNotFoundError(f"No se encontraron CSVs de resultados en {OUT_DIR}")

dfs = []
for p in csv_paths:
    try:
        df = pd.read_csv(p)
        df["__source_file"] = os.path.basename(p)
        dfs.append(df)
    except Exception as e:
        print(f"‚ö†Ô∏è Error leyendo {p}: {e}")
if not dfs:
    raise RuntimeError("No se pudo cargar ning√∫n CSV v√°lido.")
all_df = pd.concat(dfs, ignore_index=True)

# (optional) restrict to the benchmark dataset
# all_df = all_df[all_df["dataset"] == "benchmark"].copy()

# 2) Minimum columns and typing
for c in ["row_id","dataset","model","strategy","is_correct"]:
    if c not in all_df.columns:
        all_df[c] = None
# row_id is compared as text everywhere. Cast it BEFORE pivoting so the pivot
# index and the reindex labels share one dtype; otherwise numeric ids would
# not match their string form and every cell would be filled with 0.
all_df["row_id"] = all_df["row_id"].astype(str)
# is_correct -> 0/1 (missing -> 0); mapped element-wise to avoid the fillna
# downcasting FutureWarning on object columns.
all_df["is_correct"] = all_df["is_correct"].map(lambda v: 0 if pd.isna(v) else int(bool(v))).astype(int)

# 3) Build "pair" and the full row_id x pair grid so missing coverage counts as 0
# NOTE(review): the grid is keyed by row_id only; if several datasets reuse the
# same row_ids their rows collapse onto one index entry (aggfunc="first").
all_df["pair"] = all_df["model"].astype(str) + "_" + all_df["strategy"].astype(str)
row_ids = all_df["row_id"].unique()
pairs   = all_df["pair"].unique()

pivot = (all_df
         .pivot_table(index="row_id", columns="pair", values="is_correct", aggfunc="first"))
# Reindex to the full grid; missing -> 0
pivot = pivot.reindex(index=row_ids, columns=pairs).fillna(0).astype(int)

print(f"‚úÖ Matriz creada con {pivot.shape[0]} problemas y {pivot.shape[1]} combinaciones.")

# 4) Cochran's Q (global test)
print("\n=== üßÆ COCHRAN'S Q TEST ===")
q_res = cochrans_q(pivot.to_numpy())
print(f"Q = {q_res.statistic:.4f}, p = {q_res.pvalue:.6f}")
if q_res.pvalue < 0.05:
    print("‚Üí Hay diferencias significativas entre combinaciones (rechaza H0).")
else:
    print("‚Üí No hay evidencia suficiente de diferencias globales (no se rechaza H0).")

# 5) Pairwise McNemar + Holm correction
print("\n=== ‚öñÔ∏è MCNEMAR PAIRWISE TESTS ===")
pairs_list = list(pivot.columns)
pw = []
for i, a in enumerate(pairs_list):
    col_a = pivot[a].to_numpy()
    for b in pairs_list[i + 1:]:
        col_b = pivot[b].to_numpy()
        # Always build the complete 2x2 table (rows: a = 0/1, columns: b = 0/1).
        # pd.crosstab drops unobserved levels, which silently excluded any pair
        # where one column was constant.
        tbl = np.array([
            [np.sum((col_a == 0) & (col_b == 0)), np.sum((col_a == 0) & (col_b == 1))],
            [np.sum((col_a == 1) & (col_b == 0)), np.sum((col_a == 1) & (col_b == 1))],
        ])
        if tbl[0, 1] + tbl[1, 0] == 0:
            # No discordant items: the two combinations agree on every problem.
            # The chi-square form would divide by zero (stat = inf, p = 0), so
            # report "no difference" explicitly.
            pw.append({"A": a, "B": b, "stat": 0.0, "p": 1.0})
            continue
        res = mcnemar(tbl, exact=False, correction=True)
        pw.append({"A": a, "B": b, "stat": float(res.statistic), "p": float(res.pvalue)})

# Explicit columns keep sort_values from raising KeyError when `pw` is empty,
# so the "no comparable pairs" branch below is actually reachable.
pairwise_df = (
    pd.DataFrame(pw, columns=["A", "B", "stat", "p"])
    .sort_values("p")
    .reset_index(drop=True)
)
if not pairwise_df.empty:
    # Holm-Bonferroni
    rej, p_adj, *_ = multipletests(pairwise_df["p"].values, method="holm")
    pairwise_df["p_holm"] = p_adj
    pairwise_df["signif_0.05"] = rej

    print("\nParejas significativas (p_holm < 0.05):")
    display(pairwise_df[pairwise_df["p_holm"] < 0.05])
    print("\nResumen completo (ordenado por p cruda):")
    display(pairwise_df)
else:
    print("No hay pares comparables (matriz vac√≠a o columnas insuficientes).")

# 6) "Best combination" by hit rate (column mean of the penalised grid)
acc_by_pair = pivot.mean(axis=0).sort_values(ascending=False)
best_pair   = acc_by_pair.index[0]
best_score  = acc_by_pair.iloc[0]
print("\n=== üèÜ MEJOR COMBINACI√ìN (por proporci√≥n de aciertos penalizando no-cobertura) ===")
print(f"{best_pair}: accuracy = {best_score:.4f}")

# (Optional) sorted table
rank_table = acc_by_pair.reset_index()
rank_table.columns = ["pair", "accuracy"]
display(rank_table)

‚úÖ Matriz creada con 900 problemas y 24 combinaciones.

=== üßÆ COCHRAN'S Q TEST ===
Q = 678.7573, p = 0.000000
‚Üí Hay diferencias significativas entre combinaciones (rechaza H0).

=== ‚öñÔ∏è MCNEMAR PAIRWISE TESTS ===

Parejas significativas (p_holm < 0.05):


Unnamed: 0,A,B,stat,p,p_holm,signif_0.05
0,mathstral-7b_cot_reasoning,mistral-7b-instruct_nonconv_packed,165.120623,8.609151e-38,2.376126e-35,True
1,mathstral-7b_cot_reasoning,mistral-7b-instruct_zero_shot,131.390335,2.034066e-30,5.593681e-28,True
2,mathstral-7b_cot_reasoning,mistral-7b-instruct_nonconv_zeroshot,129.322449,5.764936e-30,1.579592e-27,True
3,mathstral-7b_cot_reasoning,mistral-7b-instruct_few_shots,116.964000,2.923258e-27,7.980495e-25,True
4,mistral-7b-instruct_nonconv_packed,deepseek-math-7b_nonconv_zeroshot,114.521552,1.001716e-26,2.724668e-24,True
...,...,...,...,...,...,...
138,deepseek-math-7b_few_shots,mathstral-7b_nonconv_packed,16.497854,4.870517e-05,6.721313e-03,True
139,mathstral-7b_nonconv_packed,mistral-7b-instruct_nonconv_packed,15.297561,9.183501e-05,1.258140e-02,True
140,qwen2-math-7b-instruct_nonconv_packed,deepseek-math-7b_zero_shot,15.250000,9.417676e-05,1.280804e-02,True
141,deepseek-math-7b_cot_reasoning,mistral-7b-instruct_nonconv_zeroshot,13.425620,2.482110e-04,3.350848e-02,True



Resumen completo (ordenado por p cruda):


Unnamed: 0,A,B,stat,p,p_holm,signif_0.05
0,mathstral-7b_cot_reasoning,mistral-7b-instruct_nonconv_packed,165.120623,8.609151e-38,2.376126e-35,True
1,mathstral-7b_cot_reasoning,mistral-7b-instruct_zero_shot,131.390335,2.034066e-30,5.593681e-28,True
2,mathstral-7b_cot_reasoning,mistral-7b-instruct_nonconv_zeroshot,129.322449,5.764936e-30,1.579592e-27,True
3,mathstral-7b_cot_reasoning,mistral-7b-instruct_few_shots,116.964000,2.923258e-27,7.980495e-25,True
4,mistral-7b-instruct_nonconv_packed,deepseek-math-7b_nonconv_zeroshot,114.521552,1.001716e-26,2.724668e-24,True
...,...,...,...,...,...,...
271,qwen2-math-7b-instruct_cot_reasoning,mathstral-7b_rag,0.003906,9.501647e-01,1.000000e+00,False
272,qwen2-math-7b-instruct_cot_reasoning,deepseek-math-7b_nonconv_zeroshot,0.003817,9.507379e-01,1.000000e+00,False
273,mathstral-7b_few_shots,mathstral-7b_zero_shot,0.000000,1.000000e+00,1.000000e+00,False
274,deepseek-math-7b_nonconv_packed,mathstral-7b_rag,0.000000,1.000000e+00,1.000000e+00,False



=== üèÜ MEJOR COMBINACI√ìN (por proporci√≥n de aciertos penalizando no-cobertura) ===
mathstral-7b_cot_reasoning: accuracy = 0.4689


Unnamed: 0,pair,accuracy
0,mathstral-7b_cot_reasoning,0.468889
1,qwen2-math-7b-instruct_cot_reasoning,0.421111
2,deepseek-math-7b_nonconv_zeroshot,0.421111
3,mathstral-7b_rag,0.418889
4,deepseek-math-7b_nonconv_packed,0.417778
5,mathstral-7b_zero_shot,0.405556
6,mathstral-7b_few_shots,0.404444
7,mathstral-7b_nonconv_zeroshot,0.394444
8,qwen2-math-7b-instruct_zero_shot,0.391111
9,qwen2-math-7b-instruct_few_shots,0.385556


In [None]:
# ===============================
# Best combination (model + strategy), penalising missing coverage
# ===============================
import os
import glob
import pandas as pd
import numpy as np

OUT_DIR = "/content/drive/MyDrive/tesis/experiments_final"

# 1) Load every results CSV (only the columns needed here)
csv_paths = sorted(glob.glob(os.path.join(OUT_DIR, "results_*.csv")))
if not csv_paths:
    raise FileNotFoundError(f"No se encontraron CSVs en {OUT_DIR}")

dfs = []
for p in csv_paths:
    try:
        df = pd.read_csv(
            p,
            usecols=["row_id","dataset","model","strategy","is_correct"]
        )
        df["__source_file"] = os.path.basename(p)
        dfs.append(df)
    except Exception as e:
        print(f"‚ö†Ô∏è Error leyendo {p}: {e}")

if not dfs:
    raise RuntimeError("No se pudo cargar ning√∫n CSV v√°lido.")

all_df = pd.concat(dfs, ignore_index=True)

# 2) Typing and combined key
all_df["row_id"]   = all_df["row_id"].astype(str)
all_df["dataset"]  = all_df["dataset"].astype(str)
all_df["model"]    = all_df["model"].astype(str)
all_df["strategy"] = all_df["strategy"].astype(str)
all_df["pair"]     = all_df["model"] + "_" + all_df["strategy"]

# is_correct -> 0/1 (missing -> 0); mapped element-wise to avoid the fillna
# downcasting FutureWarning on object columns.
all_df["is_correct"] = all_df["is_correct"].map(lambda v: 0 if pd.isna(v) else int(bool(v))).astype(int)

# Lookup pair -> (model, strategy). Used below instead of splitting the pair
# string on "_", which would mis-split any model key containing an underscore.
pair_parts = (
    all_df[["pair", "model", "strategy"]]
    .drop_duplicates("pair")
    .set_index("pair")
)

# 3) Per-dataset ranking (missing coverage is penalised by building the full grid)
pairs_global = sorted(all_df["pair"].unique())
per_dataset_rows = []

for ds, g in all_df.groupby("dataset", dropna=False):
    row_ids = sorted(g["row_id"].unique())
    # 0/1 hit matrix reindexed to the full row_id x pair grid; missing -> 0
    piv = (g.pivot_table(index="row_id", columns="pair", values="is_correct", aggfunc="first")
             .reindex(index=row_ids, columns=pairs_global)
             .fillna(0).astype(int))

    # Per-pair metrics inside this dataset
    correct_total = piv.sum(axis=0)                    # hits of the pair in this dataset
    total_items   = len(row_ids)                       # number of problems in the dataset
    overall_succ  = correct_total / total_items        # hit rate, penalising missing coverage

    tmp = (pd.DataFrame({
            "dataset": ds,
            "pair": correct_total.index,
            "correct_total": correct_total.values,
            "total_items": total_items,
            "overall_success": overall_succ.values
          })
          .sort_values(["overall_success","correct_total"], ascending=[False, False])
          .reset_index(drop=True))
    per_dataset_rows.append(tmp)

rank_per_dataset = pd.concat(per_dataset_rows, ignore_index=True)

# Attach model / strategy for readability (via the lookup, not string splitting)
rank_per_dataset["model"]    = rank_per_dataset["pair"].map(pair_parts["model"])
rank_per_dataset["strategy"] = rank_per_dataset["pair"].map(pair_parts["strategy"])

# 4) Global ranking (summing across datasets)
global_rank = (rank_per_dataset
               .groupby(["pair","model","strategy"], dropna=False)
               .agg(total_correct=("correct_total","sum"),
                    total_items=("total_items","sum"))
               .reset_index())

global_rank["global_overall_success"] = global_rank["total_correct"] / global_rank["total_items"]

global_rank = (global_rank
               .sort_values(["global_overall_success","total_correct","total_items"],
                             ascending=[False, False, False])
               .reset_index(drop=True))

# 5) Show the best combination and the top 5
print("‚úÖ Ranking GLOBAL (top 5 por proporci√≥n de aciertos penalizando no-cobertura):")
print(global_rank.head(5)[["pair","model","strategy","total_correct","total_items","global_overall_success"]])

best = global_rank.iloc[0]
print("\nüéØ Mejor combinaci√≥n GLOBAL:")
print(f"  Par: {best['pair']}  |  Modelo: {best['model']}  |  Estrategia: {best['strategy']}")
print(f"  Aciertos/Total: {int(best['total_correct'])}/{int(best['total_items'])}  "
      f"(global_overall_success = {best['global_overall_success']:.4f})")

# 6) Save both rankings to CSV
rank_ds_path     = os.path.join(OUT_DIR, "ranking_model_strategy_per_dataset_penalized.csv")
global_rank_path = os.path.join(OUT_DIR, "ranking_model_strategy_global_penalized.csv")

rank_per_dataset.to_csv(rank_ds_path, index=False)
global_rank.to_csv(global_rank_path, index=False)

print("\nüíæ Guardados:")
print("  -", rank_ds_path)
print("  -", global_rank_path)

# (Optional) Per-dataset ranking, best to worst, top 10 per dataset
print("\n=== üß© Ranking por DATASET (mejor ‚Üí peor en cada dataset) ===")
disp_cols = ["dataset","pair","model","strategy","correct_total","total_items","overall_success"]
for ds in rank_per_dataset["dataset"].unique():
    print(f"\n[Dataset: {ds}]")
    print(rank_per_dataset[rank_per_dataset["dataset"]==ds][disp_cols]
          .sort_values("overall_success", ascending=False)
          .head(10)
          .to_string(index=False))

‚úÖ Ranking GLOBAL (top 5 por proporci√≥n de aciertos penalizando no-cobertura):
                                   pair                   model  \
0            mathstral-7b_cot_reasoning            mathstral-7b   
1     deepseek-math-7b_nonconv_zeroshot        deepseek-math-7b   
2  qwen2-math-7b-instruct_cot_reasoning  qwen2-math-7b-instruct   
3                      mathstral-7b_rag            mathstral-7b   
4       deepseek-math-7b_nonconv_packed        deepseek-math-7b   

           strategy  total_correct  total_items  global_overall_success  
0     cot_reasoning            422          900                0.468889  
1  nonconv_zeroshot            379          900                0.421111  
2     cot_reasoning            379          900                0.421111  
3               rag            377          900                0.418889  
4    nonconv_packed            376          900                0.417778  

üéØ Mejor combinaci√≥n GLOBAL:
  Par: mathstral-7b_cot_reasoning  |  M