
# 03 - Evaluación (DT vs Baseline)
Notebook de evaluación: carga checkpoint, evalúa en test users y compara con Popularity.


In [1]:

import sys, pathlib, pickle
import torch
import numpy as np

# Locate the project root (the directory holding config_dataset.py) and make
# it importable, so the `config_dataset` / `src` imports below resolve no
# matter which sub-directory the notebook is launched from.
def find_project_root(start, marker='config_dataset.py'):
    """Return the closest directory at or above ``start`` that contains ``marker``.

    Args:
        start: ``pathlib.Path`` of the directory where the search begins.
        marker: File name whose presence identifies the project root.

    Returns:
        The first directory (``start`` itself, then each ancestor in order)
        for which ``directory / marker`` exists, or ``None`` if none does.
    """
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    return None


ROOT = pathlib.Path.cwd()
project_root = find_project_root(ROOT)
if project_root is not None:
    ROOT = project_root
    # Guard the append so re-running this cell does not pile up duplicates.
    if str(ROOT) not in sys.path:
        sys.path.append(str(ROOT))
else:
    # ROOT stays at the current directory; the imports below can still work
    # if the project is importable by other means, but ROOT-relative data
    # paths are probably wrong in that case.
    print("[ADVERTENCIA] No se encontró config_dataset.py en el directorio actual ni en sus ancestros; ROOT queda como el directorio actual.")

from config_dataset import NUM_ITEMS
from src.data.load_data import load_training_data, load_test_data
from src.data.preprocessing import create_dt_dataset
from src.models.decision_transformer import DecisionTransformer
from src.models.baselines import PopularityRecommender
from src.evaluation.evaluate import evaluate_model

# Sanity check: echo the NUM_ITEMS constant imported from config_dataset and
# whether torch can see a CUDA device in this kernel.
print(f"NUM_ITEMS: {NUM_ITEMS}")
print(f"GPU disponible: {torch.cuda.is_available()}")


NUM_ITEMS: 472
GPU disponible: False



## Cargar datos
- Test users para evaluación del modelo.
- Trayectorias de train para el baseline Popularity (se cargan ya procesadas si existe el pickle; si no, se generan a partir de los datos de train).


In [2]:

# Held-out users on which both recommenders are evaluated.
test_users = load_test_data()
print('Usuarios de test:', len(test_users))

# Training trajectories, needed only to fit the popularity baseline.
# Reuse the preprocessed pickle when it exists; otherwise rebuild the
# trajectories from the raw training data.
processed_path = ROOT / 'data/processed/trajectories_train.pkl'
if not processed_path.exists():
    df_train = load_training_data()
    train_trajectories = create_dt_dataset(df_train)
    print(f"Trayectorias generadas desde train -> {len(train_trajectories)}")
else:
    # NOTE(review): pickle.load executes arbitrary code from the file; this
    # is only safe because the pickle is presumably produced by this
    # project's own preprocessing step - confirm before pointing at other files.
    with processed_path.open('rb') as pickle_file:
        train_trajectories = pickle.load(pickle_file)
    print(f"Trayectorias cargadas de {processed_path} -> {len(train_trajectories)}")


Usuarios de test: 1600
Trayectorias cargadas de /home/manu/Documentos/diplo/tp_decision_transformer/data/processed/trajectories_train.pkl -> 16000



## Cargar modelo y checkpoint


In [3]:

# Evaluate on the GPU when one is visible to torch, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Architecture hyperparameters. load_state_dict below is strict by default,
# so these presumably have to match the configuration the checkpoint was
# trained with - TODO confirm against the training notebook/script.
model = DecisionTransformer(
    num_items=NUM_ITEMS,
    num_groups=8,
    hidden_dim=128,
    n_layers=3,
    n_heads=4,
    context_length=20,
)

ckpt_path = ROOT / 'results/checkpoints/dt_checkpoint.pt'
if ckpt_path.exists():
    # map_location remaps the stored tensors onto `device`, so a checkpoint
    # saved on a GPU machine also loads on a CPU-only one.
    # Assumes the file holds a bare state_dict (it is passed directly to
    # load_state_dict) - TODO confirm against the training code.
    state = torch.load(ckpt_path, map_location=device)
    model.load_state_dict(state)
    print(f"Checkpoint cargado: {ckpt_path}")
else:
    # Deliberate best-effort: the notebook keeps running, but the metrics
    # below then describe an untrained model.
    print("[ADVERTENCIA] No se encontró checkpoint, usando pesos aleatorios.")

# Put the model on the same device that is later handed to evaluate_model,
# and switch to inference mode so dropout/normalisation layers (if any) are
# deterministic. Both calls are idempotent, so this is harmless even if
# evaluate_model already does the same internally.
model.to(device)
model.eval()


Checkpoint cargado: /home/manu/Documentos/diplo/tp_decision_transformer/results/checkpoints/dt_checkpoint.pt



## Evaluación del modelo


In [4]:

# Cut-offs at which the ranking metrics are reported.
k_list = [5, 10, 20]

# target_return=None is forwarded as-is to evaluate_model.
metrics_dt = evaluate_model(
    model,
    test_users,
    device,
    target_return=None,
    k_list=k_list,
)

print('Métricas Decision Transformer:')
for metric_name, metric_value in metrics_dt.items():
    print(f"  {metric_name}: {metric_value:.4f}")


Métricas Decision Transformer:
  HR@5: 0.0111
  HR@10: 0.0199
  HR@20: 0.0420
  NDCG@5: 0.0065
  NDCG@10: 0.0093
  NDCG@20: 0.0148
  MRR: 0.0144



## Evaluación baseline Popularidad
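Usa el ranking global de popularidad, excluyendo ítems ya vistos en el historial. Para este baseline solo se calcula HR@k, por lo que NDCG y MRR no tienen contraparte en la comparación.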


In [5]:

# Fit the popularity baseline on the training trajectories.
aff = PopularityRecommender(num_items=NUM_ITEMS)
aff.fit(train_trajectories)

k_list = [5, 10, 20]
# Number of leading interactions used only as history before scoring starts.
# NOTE(review): presumably meant to mirror context_length=20 of the
# DecisionTransformer above - keep the two in sync.
context_len = 20
# Loop-invariant: request the longest list once per step and slice it per k.
max_k = max(k_list)

# hits[k] collects one 0/1 outcome per evaluated (user, step) pair.
hits = {k: [] for k in k_list}

for user in test_users:
    items = user['items']
    # Simulate the session: the history grows one item at a time and the
    # item at position t is the target to be recovered from the top-k list.
    # Users with at most context_len items contribute no samples.
    for t in range(context_len, len(items)):
        history = items[:t]
        recs = aff.recommend(history, k=max_k)
        target = items[t]
        for k in k_list:
            hits[k].append(1.0 if target in recs[:k] else 0.0)

# np.mean([]) returns NaN and emits a RuntimeWarning; make the "no evaluable
# sample" case explicit instead of relying on that.
if not hits[max_k]:
    print(f"[ADVERTENCIA] Ningún usuario de test tiene más de {context_len} items; HR@k queda como NaN.")
metrics_pop = {
    f"HR@{k}": float(np.mean(vals)) if vals else float('nan')
    for k, vals in hits.items()
}
print('Métricas Popularidad:')
for k,v in metrics_pop.items():
    print(f"  {k}: {v:.4f}")


Métricas Popularidad:
  HR@5: 0.0619
  HR@10: 0.1283
  HR@20: 0.1969



## Resumen


In [6]:

# Raw metric dictionaries of both recommenders, one per line.
for label, metrics in (('DT:', metrics_dt), ('POP:', metrics_pop)):
    print(label, metrics)


DT: {'HR@5': 0.011061946902654867, 'HR@10': 0.01991150442477876, 'HR@20': 0.0420353982300885, 'NDCG@5': 0.006523139708865006, 'NDCG@10': 0.00931465942247779, 'NDCG@20': 0.014838351942269148, 'MRR': 0.014402976602653815}
POP: {'HR@5': 0.061946902654867256, 'HR@10': 0.12831858407079647, 'HR@20': 0.19690265486725664}
