In [2]:
!pip install transformers torch accelerate --upgrade

Collecting accelerate
  Downloading accelerate-1.11.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.11.0-py3-none-any.whl (375 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.8/375.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.24.1
    Uninstalling accelerate-0.24.1:
      Successfully uninstalled accelerate-0.24.1
Successfully installed accelerate-1.11.0
[0m

In [1]:
# llm_recommender.py

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time
import re

class LLMRecommender:
    """Generic wrapper around a HuggingFace causal language model.

    Loads a tokenizer/model pair, generates free-text recommendations from a
    prompt, and parses the generated text into a list of movie titles.
    """

    # Lower-cased phrases; a line containing one of them within its first 50
    # characters is treated as chatter and skipped by extract_movie_titles().
    _SKIP_PHRASES = (
        'here are', 'based on', 'you might', 'i recommend',
        'these are', 'recommendation', 'user:', 'assistant:',
        'given this', 'conversation',
    )

    # Leading list decoration: optional bullets/whitespace, then at most one
    # "N." / "N)" counter. The counter must be followed by '.' or ')' so that
    # digits belonging to the title itself ("48 Hrs.", "12 Angry Men") survive.
    _LIST_MARKER_RE = re.compile(r'^[\s\-\*•]*(?:\d{1,3}\s*[\.\)])?[\s\-\*•]*')

    # Release year in parentheses, e.g. "(1999)".
    _YEAR_RE = re.compile(r'\(\d{4}\)')

    # Markers that separate a title from a trailing description.
    _SEPARATORS = (':', ' -', ' –', '  ')

    def __init__(self, model_name: str, device: str = 'cuda',
                 load_in_8bit: bool = False, max_memory_gb: int = 15):
        """
        Load the tokenizer and model.

        Args:
            model_name: model identifier on the HuggingFace hub
            device: 'cuda' or 'cpu'; falls back to 'cpu' when CUDA is requested
                but not available
            load_in_8bit: whether to load with 8-bit quantization
            max_memory_gb: maximum GB to use on GPU 0 (only applied on CUDA)
        """
        self.model_name = model_name

        # Resolve the device up front so every later step agrees on it.
        use_cuda = device.startswith('cuda') and torch.cuda.is_available()
        if device.startswith('cuda') and not use_cuda:
            print("CUDA requested but not available; falling back to CPU.")
            device = 'cpu'
        self.device = device

        print(f"Loading {model_name}...")

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Some tokenizers ship without a pad token; reuse EOS in that case.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # On CUDA let accelerate place the weights; otherwise honor the
        # requested device. The GPU-0 memory cap only makes sense on CUDA.
        load_kwargs = {"device_map": "auto" if use_cuda else device}
        if use_cuda:
            load_kwargs["max_memory"] = {0: f"{max_memory_gb}GB"}

        if load_in_8bit:
            from transformers import BitsAndBytesConfig
            load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
        else:
            # Only query bf16 support when a CUDA device actually exists.
            bf16_ok = use_cuda and torch.cuda.is_bf16_supported()
            # NOTE(review): the captured output shows `torch_dtype` is deprecated
            # in favour of `dtype`; kept for compatibility with older releases.
            load_kwargs["torch_dtype"] = torch.bfloat16 if bf16_ok else torch.float32

        self.model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

        self.model.eval()  # inference mode (disables dropout etc.)
        print(f"Model loaded successfully on {self.device}!")
        print(f"Model size: ~{self.get_model_size_gb():.2f} GB")

    def get_model_size_gb(self) -> float:
        """Return the size of the model's parameters plus buffers, in GiB."""
        param_size = sum(p.nelement() * p.element_size() for p in self.model.parameters())
        buffer_size = sum(b.nelement() * b.element_size() for b in self.model.buffers())
        return (param_size + buffer_size) / (1024**3)

    def generate_recommendations(self, context: str, prompt_template,
                                max_new_tokens: int = 250,
                                temperature: float = 0.7) -> tuple:
        """
        Generate recommendations for a conversation context.

        Args:
            context: conversation context as a string
            prompt_template: callable that turns the context into a prompt
            max_new_tokens: maximum number of tokens to generate
            temperature: sampling temperature; values <= 0 select greedy decoding

        Returns:
            response: the newly generated text (prompt excluded)
            latency: wall-clock generation time in seconds
        """
        prompt = prompt_template(context)

        # Move the inputs to wherever the model actually lives rather than to
        # the nominal device string, so the two can never disagree.
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=2048  # cap the context to avoid OOM
        ).to(self.model.device)

        generation_kwargs = {
            "max_new_tokens": max_new_tokens,
            "pad_token_id": self.tokenizer.pad_token_id,
            "eos_token_id": self.tokenizer.eos_token_id,
            "repetition_penalty": 1.1,  # discourage repetition
        }
        if temperature > 0:
            # Sampling-only knobs are passed only when sampling is enabled.
            generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.9)
        else:
            generation_kwargs["do_sample"] = False

        start_time = time.time()

        with torch.no_grad():
            outputs = self.model.generate(**inputs, **generation_kwargs)

        latency = time.time() - start_time

        # Decode only the tokens generated after the prompt.
        input_length = inputs['input_ids'].shape[1]
        response = self.tokenizer.decode(
            outputs[0][input_length:],
            skip_special_tokens=True
        )

        return response, latency

    def extract_movie_titles(self, response: str, max_recommendations: int = 10) -> list:
        """
        Extract movie titles from the model's raw response.

        Each non-trivial line is stripped of list decoration ("1.", "2)", "- ",
        "* "), then cut down to the title:
          * if the line contains a "(YYYY)" year, everything up to and including
            the year is kept, so titles containing a colon or dash
            ("Star Wars: A New Hope (1977)") are not truncated;
          * otherwise the line is cut at the earliest separator
            (":", " -", " –" or a double space).

        Args:
            response: raw text produced by the model
            max_recommendations: maximum number of titles to return

        Returns:
            list of title strings, in the order they appear in the response
        """
        movies = []

        for raw_line in response.strip().split('\n'):
            line = raw_line.strip()

            # Skip empty or very short lines.
            if len(line) < 3:
                continue

            # Skip lines that look like chatter rather than a recommendation.
            head = line.lower()[:50]
            if any(phrase in head for phrase in self._SKIP_PHRASES):
                continue

            # Remove list numbering / bullets without touching title digits.
            line = self._LIST_MARKER_RE.sub('', line, count=1)

            year_match = self._YEAR_RE.search(line)
            if year_match:
                title = line[:year_match.end()]
            else:
                cut_points = [pos for pos in (line.find(sep) for sep in self._SEPARATORS)
                              if pos != -1]
                title = line[:min(cut_points)] if cut_points else line

            # Drop surrounding quotes and stray trailing punctuation.
            title = title.strip().strip('"\'')
            title = re.sub(r'[:\-–,\.]$', '', title).strip()

            # Quality filter: keep titles of a plausible length.
            if 3 < len(title) < 100:
                movies.append(title)

            if len(movies) >= max_recommendations:
                break

        return movies[:max_recommendations]

    def clear_memory(self):
        """Drop the model and tokenizer and release cached GPU memory."""
        import gc

        if hasattr(self, 'model'):
            del self.model
        if hasattr(self, 'tokenizer'):
            del self.tokenizer
        # Collect first so the freed tensors can actually leave the CUDA cache.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Test mejorado
if __name__ == "__main__":
    llm = LLMRecommender("google/gemma-3-1b-it", load_in_8bit=False)

    # try/finally guarantees the model is released even if a step below fails.
    try:
        test_context = """User: I love action movies with great special effects like "The Matrix (1999)"
Recommender: What kind of action movies do you prefer?
User: Sci-fi action with deep philosophical themes."""

        # Stricter prompt: asks for a bare numbered list of titles.
        def simple_prompt(context):
            return f"""Given this conversation about movies, recommend exactly 5 movies.

Conversation:
{context}

Output ONLY a numbered list of movie titles with years. No descriptions or explanations.

Recommendations:
1."""

        print("Testing LLM...")
        response, latency = llm.generate_recommendations(
            test_context,
            simple_prompt,
            max_new_tokens=150,  # fewer tokens
            temperature=0.3  # more deterministic
        )

        print(f"\n{'='*80}")
        print(f"Raw Response ({latency:.2f}s):")
        print(f"{'='*80}")
        print(response)

        print(f"\n{'='*80}")
        print("Extracted Movies:")
        print(f"{'='*80}")
        movies = llm.extract_movie_titles(response)
        for i, movie in enumerate(movies, 1):
            print(f"{i}. {movie}")

        # Extraction test on text that mixes titles with descriptions.
        print(f"\n{'='*80}")
        print("Testing extraction with descriptions:")
        print(f"{'='*80}")

        test_response = """1. Blade Runner 2049: A visually stunning sequel
2. Inception (2010) - Mind-bending thriller
3. The Matrix Reloaded
4.  Arrival  –  Sci-fi drama about communication
5. Interstellar (2014)"""

        extracted = llm.extract_movie_titles(test_response)
        print("Input:")
        print(test_response)
        print("\nExtracted:")
        for i, movie in enumerate(extracted, 1):
            print(f"{i}. {movie}")
    finally:
        llm.clear_memory()

Loading google/gemma-3-1b-it...


`torch_dtype` is deprecated! Use `dtype` instead!
  warn(
2025-10-27 01:19:45.951466: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-27 01:19:45.951539: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-27 01:19:45.953087: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-27 01:19:45.961253: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler fl

Model loaded successfully on cuda!
Model size: ~1.86 GB
Testing LLM...

Raw Response (7.95s):
 The Matrix (1999)
2. Blade Runner (1982)
3. Interstellar (2014)
4. Fifth Element (1997)
5. Dark City (1998)


Extracted Movies:
1. The Matrix (1999)
2. Blade Runner (1982)
3. Interstellar (2014)
4. Fifth Element (1997)
5. Dark City (1998)

Testing extraction with descriptions:
Input:
1. Blade Runner 2049: A visually stunning sequel
2. Inception (2010) - Mind-bending thriller
3. The Matrix Reloaded
4.  Arrival  –  Sci-fi drama about communication
5. Interstellar (2014)

Extracted:
1. Blade Runner 2049
2. Inception (2010)
3. The Matrix Reloaded
4. Arrival
5. Interstellar (2014)


In [5]:
# prompts.py

def zero_shot_prompt(context):
    """Basic zero-shot prompt: task description, conversation, output rule."""
    task = (
        "You are a movie recommendation expert. Given the following conversation "
        "between a user seeking movie recommendations and a recommender, suggest "
        "10 movies that the user would enjoy based on their preferences."
    )
    output_rule = (
        "Provide exactly 10 movie recommendations as a numbered list. Output ONLY "
        "the movie titles with their release year in parentheses, one per line, "
        "nothing else."
    )
    # Empty strings become the blank lines between sections.
    sections = [
        task,
        "",
        "Conversation:",
        f"{context}",
        "",
        output_rule,
        "",
        "Recommendations:",
        "1.",
    ]
    return "\n".join(sections)

def few_shot_prompt(context):
    """Few-shot prompt: two worked examples followed by the real conversation."""
    example_lines = [
        "Here are some examples:",
        "",
        "Example 1:",
        'User: I love action movies with great special effects like "The Matrix (1999)"',
        "Recommender: What aspects do you enjoy most?",
        "User: The sci-fi elements and philosophical themes",
        "",
        "Recommendations:",
        "1. Inception (2010)",
        "2. Blade Runner (1982)",
        "3. The Prestige (2006)",
        "4. Minority Report (2002)",
        "5. Total Recall (1990)",
        "",
        "Example 2:",
        'User: I enjoy romantic comedies like "When Harry Met Sally (1989)"',
        "Recommender: Do you prefer classic or modern rom-coms?",
        "User: I like both, but prefer ones with witty dialogue",
        "",
        "Recommendations:",
        "1. Sleepless in Seattle (1993)",
        "2. You've Got Mail (1998)",
        "3. Notting Hill (1999)",
        "4. The Proposal (2009)",
        "5. Crazy, Stupid, Love (2011)",
        # Two trailing empty entries: the examples block ends with a blank line.
        "",
        "",
    ]
    examples = "\n".join(example_lines)

    request_lines = [
        examples,
        "",
        "Now provide recommendations for this conversation:",
        "",
        "Conversation:",
        f"{context}",
        "",
        "Provide exactly 10 movie recommendations as a numbered list. "
        "Output ONLY the movie titles with their release year in parentheses, one per line.",
        "",
        "Recommendations:",
        "1.",
    ]
    return "\n".join(request_lines)

def cot_prompt(context):
    """Chain-of-thought prompt: analysis scaffold, then the recommendation list."""
    header = [
        "You are a movie recommendation expert. Analyze the following conversation and recommend movies.",
        "",
        "Conversation:",
        f"{context}",
        "",
    ]
    analysis_scaffold = [
        "First, analyze what the user likes (genres, themes, styles). Then provide 10 movie recommendations.",
        "",
        "Analysis of user preferences:",
        "- Genre preferences: [identify genres mentioned]",
        "- Specific movies liked: [list movies user responded positively to]",
        "- Themes/styles: [identify patterns]",
        "",
    ]
    answer_lead_in = [
        "Based on this analysis, here are 10 movie recommendations:",
        "1.",
    ]
    return "\n".join(header + analysis_scaffold + answer_lead_in)

def role_prompt(context):
    """Role-based prompt that casts the model as an expert movie critic."""
    persona = (
        "You are an experienced movie critic and recommendation specialist with "
        "deep knowledge of cinema history, genres, and audience preferences. "
        "A user is seeking movie recommendations based on this conversation:"
    )
    request = (
        "As an expert recommender, provide 10 carefully selected movie "
        "recommendations that match the user's taste. "
        "Consider genre, themes, era, and style."
    )
    # Assembled piece by piece; "\n\n" marks a blank line between sections.
    prompt = persona + "\n\n"
    prompt += "Conversation:\n" + f"{context}" + "\n\n"
    prompt += request + "\n\n"
    prompt += "Your 10 recommendations:\n1."
    return prompt

def structured_prompt(context):
    """Prompt that spells out the exact output format expected from the model."""
    format_spec = "\n".join((
        "Output format - provide exactly 10 movies in this format:",
        "1. [Movie Title] ([Year])",
        "2. [Movie Title] ([Year])",
        "...",
    ))
    blocks = (
        "Given this movie conversation, provide 10 movie recommendations.",
        "Conversation:\n" + f"{context}",
        format_spec,
        "Recommendations:\n1.",
    )
    # Blocks are separated by exactly one blank line.
    return "\n\n".join(blocks)

# Registry mapping each strategy name to its prompt-building function.
PROMPT_STRATEGIES = dict(
    zero_shot=zero_shot_prompt,
    few_shot=few_shot_prompt,
    chain_of_thought=cot_prompt,
    role_based=role_prompt,
    structured=structured_prompt,
)

# Test
if __name__ == "__main__":
    # Small fixed conversation used to preview every prompt strategy.
    test_context = """User: I love action movies with great special effects like "The Matrix (1999)"
Recommender: What kind of action movies do you prefer?
User: Sci-fi action with deep philosophical themes."""
    
    print("Testing prompts:\n")
    for name, prompt_fn in PROMPT_STRATEGIES.items():
        print(f"=== {name.upper()} ===")
        # Show only the first 300 characters of each rendered prompt.
        print(prompt_fn(test_context)[:300] + "...")
        print()

Testing prompts:

=== ZERO_SHOT ===
You are a movie recommendation expert. Given the following conversation between a user seeking movie recommendations and a recommender, suggest 10 movies that the user would enjoy based on their preferences.

Conversation:
User: I love action movies with great special effects like "The Matrix (1999)...

=== FEW_SHOT ===
Here are some examples:

Example 1:
User: I love action movies with great special effects like "The Matrix (1999)"
Recommender: What aspects do you enjoy most?
User: The sci-fi elements and philosophical themes

Recommendations:
1. Inception (2010)
2. Blade Runner (1982)
3. The Prestige (2006)
4. Mi...

=== CHAIN_OF_THOUGHT ===
You are a movie recommendation expert. Analyze the following conversation and recommend movies.

Conversation:
User: I love action movies with great special effects like "The Matrix (1999)"
Recommender: What kind of action movies do you prefer?
User: Sci-fi action with deep philosophical themes.

Fi...

=== ROL

In [4]:
import numpy as np
from typing import List, Dict
from difflib import SequenceMatcher

class RecommendationEvaluator:
    """Computes evaluation metrics for lists of recommended movie titles.

    Titles are compared after normalization using a fuzzy similarity ratio, so
    small differences (release year, punctuation, the article "the") do not
    prevent a match.
    """

    def __init__(self, fuzzy_match_threshold: float = 0.8):
        """
        Args:
            fuzzy_match_threshold: similarity (0-1) at or above which two titles
                are considered the same movie
        """
        self.fuzzy_match_threshold = fuzzy_match_threshold

    def normalize_title(self, title: str) -> str:
        """Normalize a movie title for comparison.

        Lower-cases the title, removes a "(YYYY)" year, strips commas, periods
        and colons, drops the standalone word "the", and collapses whitespace.
        """
        import re

        title = title.lower()
        # Replace the year with a space so surrounding words are not glued together.
        title = re.sub(r'\(\d{4}\)', ' ', title)
        title = title.replace(',', '').replace('.', '').replace(':', '')
        # Word boundaries keep "the" inside other words ("Breathe") intact.
        title = re.sub(r'\bthe\b', ' ', title)
        return ' '.join(title.split())

    def fuzzy_match(self, title1: str, title2: str) -> float:
        """Return the similarity (0-1) between two titles after normalization."""
        norm1 = self.normalize_title(title1)
        norm2 = self.normalize_title(title2)
        return SequenceMatcher(None, norm1, norm2).ratio()

    def _relevance_flags(self, recommended_k: List[str], ground_truth: List[str]) -> List[int]:
        """Return one 0/1 flag per recommendation, crediting each ground-truth item once.

        Going through the recommendations in rank order, a recommendation is
        relevant (1) when it matches a ground-truth title that no earlier
        recommendation has already claimed. Duplicate recommendations therefore
        cannot be rewarded twice.
        """
        claimed = set()
        flags = []
        for rec_title in recommended_k:
            hit = 0
            for gt_idx, gt_title in enumerate(ground_truth):
                if gt_idx in claimed:
                    continue
                if self.fuzzy_match(rec_title, gt_title) >= self.fuzzy_match_threshold:
                    claimed.add(gt_idx)
                    hit = 1
                    break
            flags.append(hit)
        return flags

    def match_titles(self, recommended: List[str], ground_truth: List[str]) -> List[str]:
        """
        Find the ground-truth titles that were recommended, using fuzzy matching.

        Returns:
            The ground_truth titles (in ground_truth order) that match at least
            one recommendation.
        """
        matched = []

        for gt_title in ground_truth:
            for rec_title in recommended:
                similarity = self.fuzzy_match(gt_title, rec_title)
                if similarity >= self.fuzzy_match_threshold:
                    matched.append(gt_title)
                    break  # this ground-truth title is already matched

        return matched

    def recall_at_k(self, recommended: List[str], ground_truth: List[str], k: int = 10) -> float:
        """
        Recall@K: fraction of relevant items found in the top-K recommendations.

        Returns 0.0 when ground_truth is empty.
        """
        if len(ground_truth) == 0:
            return 0.0

        recommended_k = recommended[:k]
        matched = self.match_titles(recommended_k, ground_truth)

        return len(matched) / len(ground_truth)

    def precision_at_k(self, recommended: List[str], ground_truth: List[str], k: int = 10) -> float:
        """
        Precision@K: fraction of the top-K recommendations that are relevant.

        The denominator is the number of recommendations actually present in
        the top-K. Returns 0.0 when that slice is empty (no recommendations,
        or k == 0).
        """
        recommended_k = recommended[:k]
        if not recommended_k:
            return 0.0

        flags = self._relevance_flags(recommended_k, ground_truth)
        return sum(flags) / len(recommended_k)

    def ndcg_at_k(self, recommended: List[str], ground_truth: List[str], k: int = 10) -> float:
        """
        NDCG@K: Normalized Discounted Cumulative Gain with binary relevance.

        Each ground-truth item contributes gain at most once, so the result
        stays within [0, 1] even when the recommendations contain duplicates.
        Returns 0.0 when the ideal DCG is zero.
        """
        recommended_k = recommended[:k]
        flags = self._relevance_flags(recommended_k, ground_truth)

        # DCG: rank i (0-based) is discounted by log2(i + 2).
        dcg = 0.0
        for i, flag in enumerate(flags):
            if flag:
                dcg += 1.0 / np.log2(i + 2)

        # IDCG: all relevant items placed at the top of the list.
        idcg = 0.0
        for i in range(min(len(ground_truth), k)):
            idcg += 1.0 / np.log2(i + 2)

        if idcg == 0:
            return 0.0

        return dcg / idcg

    def hit_rate_at_k(self, recommended: List[str], ground_truth: List[str], k: int = 10) -> float:
        """
        Hit Rate@K: 1.0 if at least one relevant item is in the top-K, else 0.0.
        """
        recommended_k = recommended[:k]
        matched = self.match_titles(recommended_k, ground_truth)
        return 1.0 if len(matched) > 0 else 0.0

    def mrr(self, recommended: List[str], ground_truth: List[str]) -> float:
        """
        Reciprocal rank: 1/rank of the first relevant recommendation, or 0.0
        when no recommendation is relevant.
        """
        for i, rec_title in enumerate(recommended):
            for gt_title in ground_truth:
                if self.fuzzy_match(rec_title, gt_title) >= self.fuzzy_match_threshold:
                    return 1.0 / (i + 1)
        return 0.0

    def evaluate_all(self, recommended: List[str], ground_truth: List[str],
                     k_values: List[int] = [5, 10]) -> Dict[str, float]:
        """
        Compute every metric.

        Args:
            recommended: recommended titles in rank order
            ground_truth: relevant titles
            k_values: cut-offs for the @K metrics (only read, never mutated)

        Returns:
            dict with 'recall@k', 'precision@k', 'ndcg@k' and 'hit_rate@k' for
            each k in k_values, plus 'mrr'
        """
        metrics = {}

        for k in k_values:
            metrics[f'recall@{k}'] = self.recall_at_k(recommended, ground_truth, k)
            metrics[f'precision@{k}'] = self.precision_at_k(recommended, ground_truth, k)
            metrics[f'ndcg@{k}'] = self.ndcg_at_k(recommended, ground_truth, k)
            metrics[f'hit_rate@{k}'] = self.hit_rate_at_k(recommended, ground_truth, k)

        metrics['mrr'] = self.mrr(recommended, ground_truth)

        return metrics

# Test
if __name__ == "__main__":
    evaluator = RecommendationEvaluator()
    
    # Test with exact titles
    recommended = [
        "The Matrix (1999)",
        "Inception (2010)",
        "Blade Runner (1982)",
        "The Prestige (2006)"
    ]
    
    ground_truth = [
        "The Matrix (1999)",
        "Inception (2010)",
        "Interstellar (2014)"
    ]
    
    print("Testing evaluator:")
    print(f"Recommended: {recommended}")
    print(f"Ground truth: {ground_truth}")
    print()
    
    # Compute every metric at cut-offs 3 and 5.
    metrics = evaluator.evaluate_all(recommended, ground_truth, k_values=[3, 5])
    
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
    
    # Fuzzy-matching test: print the similarity score for a few title pairs.
    print("\n\nTesting fuzzy matching:")
    test_pairs = [
        ("The Matrix (1999)", "Matrix (1999)"),
        ("The Lord of the Rings", "Lord of Rings"),
        ("Star Wars", "Star Wars: A New Hope"),
    ]
    
    for t1, t2 in test_pairs:
        similarity = evaluator.fuzzy_match(t1, t2)
        print(f"{t1} <-> {t2}: {similarity:.3f}")

Testing evaluator:
Recommended: ['The Matrix (1999)', 'Inception (2010)', 'Blade Runner (1982)', 'The Prestige (2006)']
Ground truth: ['The Matrix (1999)', 'Inception (2010)', 'Interstellar (2014)']

recall@3: 0.6667
precision@3: 0.6667
ndcg@3: 0.7654
hit_rate@3: 1.0000
recall@5: 0.6667
precision@5: 0.5000
ndcg@5: 0.7654
hit_rate@5: 1.0000
mrr: 1.0000


Testing fuzzy matching:
The Matrix (1999) <-> Matrix (1999): 1.000
The Lord of the Rings <-> Lord of Rings: 1.000
Star Wars <-> Star Wars: A New Hope: 0.621


In [6]:
import json
from typing import List, Dict, Tuple
import re

class ReDiALDataset:
    """Loads the ReDiAL dataset and turns conversations into evaluation samples."""

    def __init__(self, data_path: str, split: str = 'test'):
        """
        Args:
            data_path: directory containing the data files (e.g. './redial_data' or '.')
            split: 'train', 'validation', or 'test'
        """
        self.data_path = data_path
        self.split = split
        self.conversations = []
        self.load_data()

    @staticmethod
    def _as_dict(value) -> Dict:
        """Return value if it is a dict, otherwise an empty dict.

        NOTE(review): guards against mention/question maps serialized as an
        empty JSON list instead of an object — confirm against the data files.
        """
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _context_length(conversation: Dict, context_ratio: float) -> int:
        """Number of leading messages used as context (always at least 1)."""
        return max(1, int(len(conversation['messages']) * context_ratio))

    def load_data(self):
        """Load the conversations of the selected split into self.conversations.

        Looks for the file under data_path first and falls back to the current
        directory. Raises KeyError for an unknown split and FileNotFoundError
        when the file is found in neither location.
        """
        file_mapping = {
            'train': 'train_data.jsonl',
            'validation': 'valid_data.jsonl',
            'test': 'test_data.jsonl'
        }

        file_name = file_mapping[self.split]

        # Start from a clean list so calling load_data() again does not duplicate data.
        self.conversations = []

        missing_error = None
        for file_path in (f"{self.data_path}/{file_name}", file_name):
            try:
                handle = open(file_path, 'r', encoding='utf-8')
            except FileNotFoundError as exc:
                missing_error = exc
                continue

            with handle:
                print(f"Loading {self.split} data from {file_path}...")
                for line in handle:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    if not line.strip():
                        continue
                    self.conversations.append(json.loads(line))
            break
        else:
            raise missing_error

        print(f"Loaded {len(self.conversations)} conversations")

    def format_conversation_context(self, conversation: Dict,
                                    n_messages: int) -> str:
        """
        Format the first messages of a conversation as readable text for the LLM.

        Speakers are labelled "User" (the initiator) and "Recommender" (the
        respondent). Each "@<id>" reference is replaced by the quoted movie
        title; references without a known title are left untouched.

        Args:
            conversation: full conversation with metadata
            n_messages: how many leading messages to include

        Returns:
            the formatted conversation, one "<speaker>: <text>" line per message
        """
        messages = conversation['messages']
        movie_mentions = self._as_dict(conversation['movieMentions'])

        seeker_id = conversation['initiatorWorkerId']
        recommender_id = conversation['respondentWorkerId']

        def _mention_to_title(match):
            # Whole-id lookup: "@1115" can never be corrupted by the shorter id "111".
            title = movie_mentions.get(match.group(1))
            return f'"{title}"' if title else match.group(0)

        formatted = []

        for msg in messages[:n_messages]:
            sender_id = msg['senderWorkerId']

            if sender_id == seeker_id:
                sender = "User"  # the one asking for recommendations
            elif sender_id == recommender_id:
                sender = "Recommender"  # the one recommending
            else:
                sender = f"Speaker{sender_id}"  # fallback

            text = re.sub(r'@(\d+)', _mention_to_title, msg['text'])

            formatted.append(f"{sender}: {text}")

        return "\n".join(formatted)

    def extract_ground_truth(self, conversation: Dict,
                            from_message_idx: int) -> List[str]:
        """
        Extract the movies suggested after the context window.

        Only movies referenced by the recommender in messages at or after
        from_message_idx are considered, and only those flagged suggested == 1
        in respondentQuestions or initiatorQuestions are kept.

        Args:
            conversation: full conversation dictionary
            from_message_idx: index of the first message to scan

        Returns:
            de-duplicated list of suggested movie titles; respondent-flagged
            titles come first, each group in order of first mention
        """
        messages = conversation['messages'][from_message_idx:]
        movie_mentions = self._as_dict(conversation['movieMentions'])
        recommender_id = conversation['respondentWorkerId']

        # Ordered, de-duplicated ids so the result is reproducible across runs.
        mentioned_ids = []
        seen_ids = set()
        for msg in messages:
            if msg['senderWorkerId'] != recommender_id:
                continue
            for movie_id in re.findall(r'@(\d+)', msg['text']):
                if movie_id not in seen_ids:
                    seen_ids.add(movie_id)
                    mentioned_ids.append(movie_id)

        question_maps = (
            self._as_dict(conversation.get('respondentQuestions', {})),
            # Also check the seeker's form, in case the seeker marked it as suggested.
            self._as_dict(conversation.get('initiatorQuestions', {})),
        )

        ground_truth = []
        for questions in question_maps:
            for movie_id in mentioned_ids:
                if movie_id not in questions:
                    continue
                if questions[movie_id].get('suggested', 0) == 1:
                    title = movie_mentions.get(movie_id, '')
                    if title and title not in ground_truth:
                        ground_truth.append(title)

        return ground_truth

    def prepare_conversation_context(self, conversation: Dict,
                                     context_ratio: float = 0.7) -> Tuple[str, List[str]]:
        """
        Split a conversation into LLM context and ground truth.

        Args:
            conversation: conversation dictionary
            context_ratio: fraction of the messages to use as context

        Returns:
            context_text: the formatted context
            ground_truth: titles that should be recommended
        """
        n_context_messages = self._context_length(conversation, context_ratio)

        context_text = self.format_conversation_context(
            conversation,
            n_context_messages
        )

        # Ground truth comes from the messages that follow the context.
        ground_truth = self.extract_ground_truth(conversation, n_context_messages)

        return context_text, ground_truth

    def get_evaluation_samples(self, n_samples: int = None,
                              context_ratio: float = 0.7,
                              min_ground_truth: int = 1) -> List[Dict]:
        """
        Build evaluation samples.

        Args:
            n_samples: how many conversations to consider (None or 0 = all).
                Conversations are filtered afterwards, so fewer samples than
                n_samples may be returned.
            context_ratio: fraction of each conversation used as context
            min_ground_truth: minimum number of ground-truth titles required

        Returns:
            list of dicts with keys 'context', 'ground_truth', 'conv_id',
            'all_movies', 'n_messages_total' and 'n_messages_context'
        """
        samples = []

        conversations = self.conversations[:n_samples] if n_samples else self.conversations

        for conv in conversations:
            context, ground_truth = self.prepare_conversation_context(
                conv,
                context_ratio
            )

            # Keep only conversations with enough ground truth.
            if len(ground_truth) >= min_ground_truth:
                samples.append({
                    'context': context,
                    'ground_truth': ground_truth,
                    'conv_id': conv['conversationId'],
                    'all_movies': list(self._as_dict(conv['movieMentions']).values()),
                    'n_messages_total': len(conv['messages']),
                    # Same rule as the real split, so the reported count is accurate.
                    'n_messages_context': self._context_length(conv, context_ratio)
                })

        print(f"Prepared {len(samples)} samples with ground truth (min={min_ground_truth})")
        return samples

# Test mejorado
if __name__ == "__main__":
    dataset = ReDiALDataset('.', split='test')
    samples = dataset.get_evaluation_samples(n_samples=5)

    if not samples:
        # Nothing passed the ground-truth filter: indexing samples[0] or
        # averaging over an empty list below would raise.
        print("No samples with ground truth found; nothing to display.")
    else:
        print("\n" + "="*80)
        print("SAMPLE 1 - DETAILED")
        print("="*80)
        print(f"Conversation ID: {samples[0]['conv_id']}")
        print(f"Total messages: {samples[0]['n_messages_total']}")
        print(f"Context messages: {samples[0]['n_messages_context']}")
        print("\nContext:")
        print(samples[0]['context'])
        print("\nGround Truth:", samples[0]['ground_truth'])
        print("\nAll movies mentioned:", samples[0]['all_movies'])

        print("\n" + "="*80)
        print("STATISTICS")
        print("="*80)
        print(f"Total samples: {len(samples)}")
        gt_lengths = [len(s['ground_truth']) for s in samples]
        print(f"Avg ground truth size: {sum(gt_lengths)/len(gt_lengths):.2f}")
        print(f"Min/Max ground truth: {min(gt_lengths)}/{max(gt_lengths)}")

Loading test data from ./test_data.jsonl...
Loaded 1342 conversations
Prepared 5 samples with ground truth (min=1)

SAMPLE 1 - DETAILED
Conversation ID: 20001
Total messages: 15
Context messages: 10

Context:
User: Hi I am looking for a movie like "Super Troopers (2001)"
Recommender: You should watch "Police Academy  (1984)"
User: Is that a great one? I have never seen it. I have seen "American Pie "
User: I mean "American Pie  (1999)"
Recommender: Yes "Police Academy  (1984)" is very funny and so is "Police Academy 2: Their First Assignment (1985)"
User: It sounds like I need to check them out
Recommender: yes you will enjoy them
User: I appreciate your time. I will need to check those out. Are there any others you would recommend?
Recommender: yes "Lethal Weapon (1987)"
User: Thank you i will watch that too

Ground Truth: ['48 Hrs. (1982)', 'Beverly Hills Cop (1984)']

All movies mentioned: ['Super Troopers (2001)', 'Beverly Hills Cop (1984)', 'Police Academy  (1984)', 'American Pie 

In [10]:
# run_experiment.py
import json
import numpy as np
from tqdm import tqdm
import torch
from datetime import datetime
import os

def _preview(text, limit=300):
    """Return ``text`` as-is when it has at most ``limit`` characters, otherwise
    its first ``limit`` characters followed by ``"..."``."""
    return text[:limit] + "..." if len(text) > limit else text


def _aggregate_metrics(all_metrics):
    """Summarise per-sample metrics as mean/std/median/min/max per metric key.

    The keys are taken from the first entry; every entry is expected to carry
    the same keys.
    """
    aggregated = {}
    for key in all_metrics[0].keys():
        values = [m[key] for m in all_metrics]
        aggregated[f'{key}_mean'] = float(np.mean(values))
        aggregated[f'{key}_std'] = float(np.std(values))
        aggregated[f'{key}_median'] = float(np.median(values))
        aggregated[f'{key}_min'] = float(np.min(values))
        aggregated[f'{key}_max'] = float(np.max(values))
    return aggregated


def run_evaluation(
    model_name: str,
    dataset_path: str,
    output_dir: str,
    prompt_strategy: str = 'zero_shot',
    n_samples: int = None,
    load_in_8bit: bool = False,
    context_ratio: float = 0.7,
    temperature: float = 0.7
):
    """
    Full evaluation pipeline: load data, load the model, generate and score
    recommendations, aggregate the metrics and save them as JSON.

    Args:
        model_name: HuggingFace model name
        dataset_path: path to the ReDial dataset
        output_dir: directory where results are saved (created if missing)
        prompt_strategy: key of the prompting strategy in PROMPT_STRATEGIES
        n_samples: how many samples to evaluate (None = all)
        load_in_8bit: use 8-bit quantization
        context_ratio: fraction of the conversation used as context
        temperature: generation temperature

    Returns:
        The results dict that was written to disk (keys 'experiment_info',
        'aggregated_metrics', 'sample_results'), or None when the prompt
        strategy is unknown, no samples are available, the model fails to
        load, or no sample could be evaluated.
    """

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_short_name = model_name.split('/')[-1]
    output_file = f"{output_dir}/{model_short_name}_{prompt_strategy}_{timestamp}.json"

    print("\n" + "="*80)
    print("STARTING EVALUATION")
    print("="*80)
    print(f"Model: {model_name}")
    print(f"Prompt strategy: {prompt_strategy}")
    print(f"Samples: {n_samples if n_samples else 'all'}")
    print(f"Context ratio: {context_ratio}")
    print(f"Temperature: {temperature}")
    print(f"8-bit quantization: {load_in_8bit}")
    print("="*80)

    # Validate the cheap inputs first: rejecting a bad strategy name must not
    # cost a dataset load plus a model load (and must not leave a model behind).
    if prompt_strategy not in PROMPT_STRATEGIES:
        print(f"✗ Unknown prompt strategy: {prompt_strategy}")
        print(f"Available: {list(PROMPT_STRATEGIES.keys())}")
        return None
    prompt_fn = PROMPT_STRATEGIES[prompt_strategy]

    # Make sure the output location exists now, so a missing directory cannot
    # fail the run only after all samples have been evaluated.
    os.makedirs(output_dir, exist_ok=True)

    # 1. Load dataset
    print("\n[1/5] Loading dataset...")
    dataset = ReDiALDataset(dataset_path, split='test')
    samples = dataset.get_evaluation_samples(n_samples, context_ratio=context_ratio)
    print(f"✓ Loaded {len(samples)} samples with ground truth")

    if len(samples) == 0:
        print("✗ No samples with ground truth found!")
        return None

    # 2. Load model
    print(f"\n[2/5] Loading model: {model_name}")
    try:
        llm = LLMRecommender(model_name, load_in_8bit=load_in_8bit)
        print("✓ Model loaded successfully")
    except Exception as e:
        print(f"✗ Error loading model: {e}")
        return None

    # From here on the model is loaded; the finally clause releases it on
    # every exit path (normal return, early return, or an unexpected error).
    try:
        evaluator = RecommendationEvaluator(fuzzy_match_threshold=0.85)

        # 3. Evaluate
        print("\n[3/5] Evaluating...")
        all_metrics = []
        all_results = []
        failed_samples = 0

        for i, sample in enumerate(tqdm(samples, desc="Evaluating")):
            try:
                context = sample['context']
                ground_truth = sample['ground_truth']

                # Generate recommendations
                response, latency = llm.generate_recommendations(
                    context,
                    prompt_fn,
                    max_new_tokens=250,
                    temperature=temperature
                )

                # Extract titles
                recommended = llm.extract_movie_titles(response, max_recommendations=10)

                # Compute metrics
                metrics = evaluator.evaluate_all(recommended, ground_truth, k_values=[5, 10])
                metrics['latency'] = latency
                metrics['n_recommended'] = len(recommended)
                metrics['n_ground_truth'] = len(ground_truth)

                all_metrics.append(metrics)

                # Keep details only for the first 20 samples so the JSON stays small
                if len(all_results) < 20:
                    all_results.append({
                        'conv_id': sample['conv_id'],
                        'context_preview': _preview(context),
                        'ground_truth': ground_truth,
                        'recommended': recommended,
                        'response_preview': _preview(response),
                        'metrics': metrics
                    })

            except Exception as e:
                # A single bad sample is counted and skipped, not fatal.
                print(f"\n✗ Error in sample {i} (conv_id: {sample.get('conv_id', 'unknown')}): {e}")
                failed_samples += 1
                continue

        print("\n✓ Evaluation complete")
        print(f"  Successful: {len(all_metrics)}/{len(samples)}")
        print(f"  Failed: {failed_samples}/{len(samples)}")

        if len(all_metrics) == 0:
            print("✗ No successful evaluations!")
            return None

        # 4. Aggregate results
        print("\n[4/5] Computing aggregate metrics...")
        aggregated = _aggregate_metrics(all_metrics)

        # 5. Save results
        print("\n[5/5] Saving results...")
        results = {
            'experiment_info': {
                'model_name': model_name,
                'model_short_name': model_short_name,
                'prompt_strategy': prompt_strategy,
                'n_samples_requested': n_samples,
                'n_samples_evaluated': len(samples),
                'n_samples_successful': len(all_metrics),
                'n_samples_failed': failed_samples,
                'context_ratio': context_ratio,
                'temperature': temperature,
                'load_in_8bit': load_in_8bit,
                'timestamp': timestamp
            },
            'aggregated_metrics': aggregated,
            'sample_results': all_results
        }

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)

        print(f"✓ Results saved to {output_file}")

        # Show the headline metrics
        print("\n" + "="*80)
        print("RESULTS SUMMARY")
        print("="*80)

        key_metrics = ['recall@10', 'precision@10', 'ndcg@10', 'hit_rate@10', 'mrr', 'latency']
        for metric in key_metrics:
            mean_key = f'{metric}_mean'
            std_key = f'{metric}_std'
            if mean_key in aggregated:
                print(f"{metric:20s}: {aggregated[mean_key]:.4f} ± {aggregated[std_key]:.4f}")

        print("="*80)

        return results
    finally:
        # Release the model; a cleanup failure is reported but never masks
        # the evaluation outcome.
        print("\nCleaning up memory...")
        try:
            llm.clear_memory()
            torch.cuda.empty_cache()
            print("✓ Done")
        except Exception as e:
            print(f"Error limpiando memoria: {e}")


def run_multiple_experiments(experiments_config, pause_seconds: float = 5):
    """
    Run several experiments one after another.

    Args:
        experiments_config: list of dicts; each dict is passed to
            run_evaluation() as keyword arguments
        pause_seconds: how long to sleep between consecutive experiments
            (default 5, the previous fixed value); a value <= 0 disables
            the pause

    Returns:
        List with the result of every experiment whose run_evaluation() call
        returned a truthy value; experiments that returned None are omitted.
    """
    # Imported locally (once, not per iteration) so the function does not
    # depend on another notebook cell having imported `time`.
    import time

    all_results = []
    total = len(experiments_config)

    for i, config in enumerate(experiments_config):
        print(f"\n\n{'#'*80}")
        print(f"EXPERIMENT {i+1}/{total}")
        print(f"{'#'*80}")

        result = run_evaluation(**config)

        if result:
            all_results.append(result)

        # Short pause between experiments (never after the last one)
        if pause_seconds > 0 and i < total - 1:
            print(f"\nWaiting {pause_seconds} seconds before next experiment...")
            time.sleep(pause_seconds)

    return all_results


if __name__ == "__main__":
    # Create the results directory
    os.makedirs('./results', exist_ok=True)

    # EXPERIMENT 1: quick test with a small model.
    # The banner is built from these values so it cannot drift away from the
    # model that is actually evaluated.
    quick_test_model = "Qwen/Qwen3-4B-Instruct-2507"
    quick_test_samples = 100

    print("\n" + "🚀"*40)
    print(f"QUICK TEST - {quick_test_model} with {quick_test_samples} samples")
    print("🚀"*40)

    quick_test_results = run_evaluation(
        model_name=quick_test_model,
        dataset_path=".",
        output_dir="./results",
        prompt_strategy='zero_shot',
        n_samples=quick_test_samples,
        load_in_8bit=False,
        context_ratio=0.7,
        temperature=0.7
    )

    # run_evaluation() returns None on failure; only claim success when it
    # actually produced results.
    if quick_test_results is None:
        print("\n\n✗ Quick test failed - see the messages above.")
    else:
        print("\n\n✅ Quick test completed!")
        print("Check the results folder for the output JSON file.")
        print("\nNext steps:")
        print("1. Review the results")
        print("2. Run more experiments with different models/prompts")
        print("3. Use run_multiple_experiments() for batch evaluation")


🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀
QUICK TEST - Gemma 2B with 100 samples
🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀

STARTING EVALUATION
Model: Qwen/Qwen3-4B-Instruct-2507
Prompt strategy: zero_shot
Samples: 100
Context ratio: 0.7
Temperature: 0.7
8-bit quantization: False

[1/5] Loading dataset...
Loading test data from ./test_data.jsonl...
Loaded 1342 conversations
Prepared 52 samples with ground truth (min=1)
✓ Loaded 52 samples with ground truth

[2/5] Loading model: Qwen/Qwen3-4B-Instruct-2507
Loading Qwen/Qwen3-4B-Instruct-2507...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

Model loaded successfully on cuda!
Model size: ~7.49 GB
✓ Model loaded successfully

[3/5] Evaluating...


Evaluating: 100%|██████████| 52/52 [12:12<00:00, 14.09s/it]


✓ Evaluation complete
  Successful: 52/52
  Failed: 0/52

[4/5] Computing aggregate metrics...

[5/5] Saving results...
✓ Results saved to ./results/Qwen3-4B-Instruct-2507_zero_shot_20251027_020746.json

RESULTS SUMMARY
recall@10           : 0.1587 ± 0.3326
precision@10        : 0.0231 ± 0.0465
ndcg@10             : 0.1094 ± 0.2441
hit_rate@10         : 0.2115 ± 0.4084
mrr                 : 0.1085 ± 0.2567
latency             : 14.0647 ± 1.1320

Cleaning up memory...
✓ Done


✅ Quick test completed!
Check the results folder for the output JSON file.

Next steps:
1. Review the results
2. Run more experiments with different models/prompts
3. Use run_multiple_experiments() for batch evaluation



