In [1]:
import os
import sys
import yaml
import numpy as np
import pandas as pd
from datetime import datetime

from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv, VecNormalize
from stable_baselines3.common.callbacks import EvalCallback

# Make the parent of the current working directory importable so the
# project-local ``src`` package (which provides the environment) resolves.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
from src.environment_dqn import RetailEnvDQN

# Path to the YAML file with the run configuration.
# Use "ultra_fast" or "production" (better quality, but slower).
CONFIG_FILE = "../configs/ultra_fast.yaml"
# Explicit encoding so parsing does not depend on the platform default.
with open(CONFIG_FILE, "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# Training dataset; the path may be overridden through the "data_path" key.
csv_path = cfg.get("data_path", "../data/data_train.csv")

# Fail fast with a clear error: everything below needs `data_array`, so
# continuing without the dataset would only crash later with a NameError.
if not os.path.exists(csv_path):
    raise FileNotFoundError(f"{csv_path} no existe.")

df = pd.read_csv(csv_path)
# The whole frame is cast to float32, so every column must be numeric.
data_array = df.values.astype(np.float32)
print(f"Dataset cargado → {df.shape[0]} filas, {df.shape[1]} columnas.")

# Directory that collects the artifacts of this experiment.
experiment_dir = "../results/models/dqn/"
os.makedirs(experiment_dir, exist_ok=True)

# Split by row position (no shuffling): the first 80% of the rows are used
# for training and the remaining 20% for evaluation.
# NOTE(review): assumes rows are already in chronological order — confirm.
split_idx = int(len(data_array) * 0.8)
train_data = data_array[:split_idx]
eval_data = data_array[split_idx:]

def make_train_env(rank):
    """Return a zero-argument factory for training environment ``rank``.

    The factory builds a ``RetailEnvDQN`` over ``train_data`` and wraps it in
    a ``Monitor`` whose log file name contains ``rank``, so every environment
    writes to its own file and the logs do not collide.

    NOTE(review): SB3's ``Monitor`` may append its own ``monitor.csv`` suffix
    to file names that do not already end with it — confirm the final names.
    """
    def _init():
        monitor_path = os.path.join(experiment_dir, f"monitor_train_{rank}.csv")
        return Monitor(RetailEnvDQN(train_data), filename=monitor_path)

    return _init


def make_eval_env(rank):
    """Return a zero-argument factory for evaluation environment ``rank``.

    The factory builds a separate ``RetailEnvDQN`` over ``eval_data`` and
    wraps it in a ``Monitor`` that logs to ``monitor_eval_<rank>.csv`` inside
    ``experiment_dir``. No reward normalization wrapper is applied here.
    """
    def _init():
        log_name = f"monitor_eval_{rank}.csv"
        log_path = os.path.join(experiment_dir, log_name)
        return Monitor(RetailEnvDQN(eval_data), filename=log_path)

    return _init


# Training
# Number of parallel training environments, read from the configuration.
N_ENVS = cfg["n_envs"]

# Training environments run in subprocesses (one factory per rank);
# evaluation uses a single in-process environment.
train_env = SubprocVecEnv([make_train_env(i) for i in range(N_ENVS)])
eval_env = DummyVecEnv([make_eval_env(0)])


# DQN set-up: every hyperparameter comes from the configuration file.
policy_kwargs = dict(net_arch=cfg["policy_net"]["layers"])

model = DQN(
    "MlpPolicy",
    train_env,
    verbose=1,
    learning_rate=cfg["dqn"]["learning_rate"],
    gamma=cfg["dqn"]["gamma"],
    buffer_size=cfg["dqn"]["buffer_size"],
    learning_starts=cfg["dqn"]["learning_starts"],
    batch_size=cfg["dqn"]["batch_size"],
    train_freq=cfg["dqn"]["train_freq"],
    gradient_steps=cfg["dqn"]["gradient_steps"],
    n_steps=cfg["dqn"]["n_steps"],
    target_update_interval=cfg["dqn"]["target_update_interval"],
    exploration_fraction=cfg["exploration"]["fraction"],
    exploration_initial_eps=cfg["exploration"]["initial_eps"],
    exploration_final_eps=cfg["exploration"]["final_eps"],
    policy_kwargs=policy_kwargs,
    tensorboard_log=os.path.join(experiment_dir, "tensorboard")
)

# Periodic evaluation that also saves the best model seen so far.
# EvalCallback's `eval_freq` counts callback calls, and each call advances
# N_ENVS timesteps when training on a vectorized env, so the target interval
# is divided by N_ENVS to evaluate about every EVAL_EVERY_TIMESTEPS timesteps.
EVAL_EVERY_TIMESTEPS = 10_000
callback = EvalCallback(
    eval_env,
    best_model_save_path=os.path.join(experiment_dir, "best_model"),
    log_path=os.path.join(experiment_dir, "eval_logs"),
    eval_freq=max(EVAL_EVERY_TIMESTEPS // N_ENVS, 1),
    deterministic=True,
    render=False,
)


# Total number of environment timesteps to train for, read from the config.
TOTAL_STEPS = cfg["total_steps"]

print(f"Entrenando por {TOTAL_STEPS:,} pasos\n")

model.learn(
    total_timesteps=TOTAL_STEPS,
    callback=callback
)

print("\nEntrenamiento finalizado.")

# Save the final model (separately from the best model kept by the callback).
model_final_dir = os.path.join(experiment_dir, "model_final")
os.makedirs(model_final_dir, exist_ok=True)

model.save(os.path.join(model_final_dir, "dqn_final.zip"))

print("Modelo final guardado:")
print(f"   → {model_final_dir}/dqn_final.zip")
print("\nResultados completos en:")
print(f"   {experiment_dir}")


Dataset cargado → 3655 filas, 9 columnas.
Using cpu device
Entrenando por 300,000 pasos

Logging to ../results/models/dqn/tensorboard/DQN_15
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 90       |
|    ep_rew_mean      | 46.4     |
|    exploration_rate | 0.994    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 5017     |
|    time_elapsed     | 0        |
|    total_timesteps  | 360      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 90       |
|    ep_rew_mean      | 46.6     |
|    exploration_rate | 0.989    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 4967     |
|    time_elapsed     | 0        |
|    total_timesteps  | 720      |
----------------------------------




----------------------------------
| rollout/            |          |
|    ep_len_mean      | 90       |
|    ep_rew_mean      | 47.1     |
|    exploration_rate | 0.983    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 4596     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1080     |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.407    |
|    n_updates        | 5        |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 90       |
|    ep_rew_mean      | 47       |
|    exploration_rate | 0.977    |
| time/               |          |
|    episodes         | 16       |
|    fps              | 3877     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1440     |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.17     |
|    n_updates      