In [None]:
import numpy as np
import pandas as pd
import gc
import os
import matplotlib.pyplot as plt
import polars as pl
from sklearn.metrics import mean_squared_error, mean_absolute_error
from joblib import Parallel, delayed
from more_itertools import chunked
from functools import reduce
from typing import List
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import pandas as pd
import joblib
import os
import torch.nn as nn

In [None]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

In [None]:
df_full = pd.read_parquet('./data/product_train_val_NN_TORCH.parquet', engine='fastparquet')

In [None]:
# Separar conjuntos
df_train = df_full[df_full['PERIODO'] <= 201909].copy()
df_val = df_full[(df_full['PERIODO'] == 201910)].copy()
df_pred = df_full[df_full['PERIODO'] == 201912].copy()
del df_full
gc.collect()

In [None]:
import joblib

target_col = 'CLASE_LOG1P_Z'
cat_cols = ['ID_CAT1', 'ID_CAT2', 'ID_CAT3', 'ID_BRAND', 'SKU_SIZE', 'MES_PROBLEMATICO', 'PRODUCT_RANK_BIN']
label_encoders = {}

# Cargar los encoders entrenados
for col in cat_cols:
    le = joblib.load(f'encoders/{col}_encoder.pkl')
    label_encoders[col] = le

    # Transformar los datasets (train, val, pred) usando ese encoder
    for df in [df_train, df_val, df_pred]:
        df[col] = df[col].map(lambda x: le.transform([x])[0] if x in le.classes_ else 0)

# Definir embedding_sizes (lo mejor es usar el encoder para ver la cantidad de clases)
embedding_sizes = [
    (len(label_encoders[col].classes_) + 1, min(50, (len(label_encoders[col].classes_) + 1) // 2))
    for col in cat_cols
]

# Excluir columnas que no deben ir al modelo
excluir = ['PERIODO', 'CUSTOMER_ID', 'PRODUCT_ID', 'CLASE_LOG1P_Z', 'ORDINAL']
feature_cols = [col for col in df.columns if col not in excluir and col not in cat_cols]

In [None]:
# No los incluyas en ninguna de estas dos listas
assert 'CUSTOMER_ID' not in feature_cols
assert 'CUSTOMER_ID' not in cat_cols
assert 'PRODUCT_ID' not in feature_cols
assert 'PRODUCT_ID' not in cat_cols
assert 'PERIODO' not in feature_cols
assert 'PERIODO' not in cat_cols
assert 'CLASE_LOG1P_Z' not in feature_cols
assert 'CLASE_LOG1P_Z' not in cat_cols
assert 'ORDINAL' not in feature_cols
assert 'ORDINAL' not in cat_cols


In [None]:
assert all(col in df_train.columns for col in cat_cols), "Faltan columnas categóricas"
assert all(col in df_train.columns for col in feature_cols), "Faltan columnas numéricas"
assert target_col in df_train.columns, "Falta la variable objetivo"

In [None]:
from torch.utils.data import Dataset
import torch

class TabularDataset(Dataset):
    def __init__(self, df, cat_cols, num_cols, target_col=None):
        self.cat_data = torch.tensor(df[cat_cols].values, dtype=torch.long)
        self.num_data = torch.tensor(df[num_cols].values, dtype=torch.float32)
        self.has_target = target_col is not None
        if self.has_target:
            self.y = torch.tensor(df[target_col].values, dtype=torch.float32).unsqueeze(1)
        else:
            self.y = None

    def __len__(self):
        return len(self.cat_data)

    def __getitem__(self, idx):
        if self.has_target:
            return self.cat_data[idx], self.num_data[idx], self.y[idx]
        else:
            return self.cat_data[idx], self.num_data[idx]

In [None]:
batch_size = 4096

train_dataset = TabularDataset(df_train, cat_cols, feature_cols, target_col)
val_dataset = TabularDataset(df_val, cat_cols, feature_cols, target_col)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [None]:
print(cat_cols)
print(feature_cols)
print(target_col)

In [None]:
import torch.nn.functional as F

class TabularNNImproved(nn.Module):
    def __init__(self, embedding_sizes, num_numerical, hidden_sizes=[512, 512, 256, 128], dropout=0.1):
        super().__init__()
        
        # Embedding layers
        self.embeddings = nn.ModuleList([
            nn.Embedding(ni, nf) for ni, nf in embedding_sizes
        ])
        embedding_dim = sum([nf for _, nf in embedding_sizes])
        self.embedding_dropout = nn.Dropout(dropout)

        # Total input size after embedding + numerical
        input_size = embedding_dim + num_numerical

        # Hidden layers
        layers = []
        for h in hidden_sizes:
            layers.append(nn.Linear(input_size, h))
            layers.append(nn.BatchNorm1d(h))
            layers.append(nn.GELU())
            layers.append(nn.Dropout(dropout))
            input_size = h

        # Output layer
        layers.append(nn.Linear(input_size, 1))
        self.model = nn.Sequential(*layers)

    def forward(self, x_cat, x_num):
        x = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        x = torch.cat(x, dim=1)
        x = self.embedding_dropout(x)
        x = torch.cat([x, x_num], dim=1)
        return self.model(x)


In [None]:
import torch

# Detectar si hay GPU disponible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Crear el modelo
model = TabularNNImproved(
    embedding_sizes=embedding_sizes,
    num_numerical=len(feature_cols),
    hidden_sizes=[4096,2048,1024,512, 512, 256, 128],
    dropout=0.3
).to(device)
model.to(device)
print(model)


In [None]:
import torch
import torch.nn as nn

class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, y_pred, y_true):
        return torch.sqrt(torch.mean((y_pred - y_true) ** 2))


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import torch
import numpy as np

def train_model(
    model, train_loader, val_loader=None, n_epochs=20, lr=1e-3, patience=3):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = RMSELoss()
    best_val_loss = float('inf')
    best_model_state = None
    epochs_without_improvement = 0

    best_epoch = 0  # <--- Nuevo: para trackear la mejor epoch
    best_y_true = None
    best_y_pred = None

    for epoch in range(n_epochs):
        # Entrenamiento
        model.train()
        train_loss = 0.0
        for cats, conts, y in train_loader:
            cats, conts, y = cats.to(device), conts.to(device), y.to(device)
            optimizer.zero_grad()
            y_pred = model(cats, conts)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * y.size(0)
        train_loss /= len(train_loader.dataset)

        # Validación (solo si hay val_loader)
        if val_loader is not None:
            model.eval()
            val_loss = 0.0
            y_true_list = []
            y_pred_list = []
            with torch.no_grad():
                for cats, conts, y in val_loader:
                    cats, conts, y = cats.to(device), conts.to(device), y.to(device)
                    y_pred = model(cats, conts)
                    loss = criterion(y_pred, y)
                    val_loss += loss.item() * y.size(0)
                    y_true_list.append(y.cpu().numpy())
                    y_pred_list.append(y_pred.cpu().numpy())

            val_loss /= len(val_loader.dataset)
            y_true = np.concatenate(y_true_list)
            y_pred = np.concatenate(y_pred_list)

            mae = mean_absolute_error(y_true, y_pred)
            rmse = np.sqrt(mean_squared_error(y_true, y_pred))
            r2 = r2_score(y_true, y_pred)

            print(
                f"Epoch {epoch+1}/{n_epochs} | Train Loss: {train_loss:.4f} | "
                f"Val Loss: {val_loss:.4f} | MAE: {mae:.4f} | RMSE: {rmse:.4f} | R²: {r2:.4f}"
            )

            # Early stopping y trackeo de la mejor epoch
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_state = model.state_dict()
                best_epoch = epoch + 1  # <--- Guardamos la mejor epoch (1-based)
                best_y_true = y_true
                best_y_pred = y_pred
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1
                if patience is not None and epochs_without_improvement >= patience:
                    print("🔴 Early stopping triggered")
                    break
        else:
            # Si no hay val_loader, solo mostrar train_loss
            print(f"Epoch {epoch+1}/{n_epochs} | Train Loss: {train_loss:.4f}")

    # Restaurar el mejor modelo solo si hubo validación
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    # Retornar también la mejor epoch y sus preds
    if val_loader is not None:
        return best_y_true, best_y_pred, best_epoch
    else:
        return None, None, None


# Búsqueda de hiperparámetros (Grid Search)
Probamos distintas combinaciones de hiperparámetros y seleccionamos la que da mejor MAE en validación.


In [None]:
import torch
import gc
import pandas as pd
import numpy as np
import random
from sklearn.metrics import mean_absolute_error
from itertools import product

# ---- FUNCIÓN DE STRATIFIED SAMPLING ----
def stratified_sampling_grid(param_grid, stratify_by, N_TOTAL, seed=42):
    keys = list(param_grid.keys())
    all_combos = [
        dict(zip(keys, vals))
        for vals in product(*(param_grid[k] for k in keys))
    ]
    df = pd.DataFrame(all_combos)

    # --- Nueva columna hashable para deduplicar y comparar ---
    for col in stratify_by:
        if df[col].apply(lambda x: isinstance(x, list)).any():
            df[col + '_str'] = df[col].apply(lambda x: str(x))
    # Usar '_str' si existe, sino el nombre original
    stratify_keys = [col + '_str' if (col + '_str') in df.columns else col for col in stratify_by]

    random.seed(seed)
    uniq_strata = df.drop_duplicates(subset=stratify_keys)
    uniq_strata = uniq_strata.sample(frac=1, random_state=seed)  # shuffle

    sampled_rows = []
    for _, row in uniq_strata.iterrows():
        cond = pd.Series(True, index=df.index)
        for col in stratify_by:
            key = col + '_str' if (col + '_str') in df.columns else col
            val = str(row[col]) if key.endswith('_str') else row[col]
            cond &= (df[key] == val)
        group = df[cond]
        sampled_rows.append(group.sample(1, random_state=random.randint(0, 99999)).iloc[0])

    if len(sampled_rows) < N_TOTAL:
        ids_usados = set()
        for row in sampled_rows:
            row_tuple = tuple(
                str(row[k]) if (k + '_str') in df.columns else row[k] for k in keys
            )
            ids_usados.add(row_tuple)
        resto = df[
            ~df.apply(lambda r: tuple(
                str(r[k]) if (k + '_str') in df.columns else r[k] for k in keys
            ) in ids_usados, axis=1)
        ]
        faltan = N_TOTAL - len(sampled_rows)
        if len(resto) > 0:
            sampled_rows += list(resto.sample(n=min(faltan, len(resto)), random_state=seed+1).to_dict('records'))

    sampled_rows = sampled_rows[:N_TOTAL]
    # --- Devolver solo las columnas originales ---
    # ¡Asegurarse de que todos sean dicts!
    sampled_rows = [dict(r) for r in sampled_rows]
    result = pd.DataFrame(sampled_rows)[keys].reset_index(drop=True)
    return result

# ---- DEFINICIÓN DEL GRID Y PARÁMETROS ----
param_grid = {
    'lr': [1e-3, 5e-4, 2e-4, 1e-4],
    'dropout': [0.1, 0.2, 0.3, 0.4, 0.5],
    'hidden_sizes': [
        [512, 256], [1024, 512, 256], [2048, 1024, 512, 256], [512, 256, 128],
        [1024, 512, 256, 128], [4096, 2048, 1024, 512, 256], [256, 128],
        [512, 512, 256], [512, 512, 512], [256, 256], [1024, 256, 1024]
    ],
    'seed': [101, 307, 1009, 2029, 5003, 7777, 9871]
}
N_TOTAL = 300  # Ajustá según tus recursos
stratify_by = ['lr', 'dropout', 'hidden_sizes']

# ---- GENERAR COMBINACIONES A ENTRENAR ----
combos_to_run = stratified_sampling_grid(param_grid, stratify_by=stratify_by, N_TOTAL=N_TOTAL)

# ---- LOOP DE ENTRENAMIENTO ----
results = []
best_rmse = float('inf')

for i, row in combos_to_run.iterrows():
    lr = row['lr']
    dropout = row['dropout']
    hidden_sizes = row['hidden_sizes']
    seed = int(row['seed'])

    print(f"\n🔧 [{i+1}/{len(combos_to_run)}] Entrenando con: lr={lr}, dropout={dropout}, hidden_sizes={hidden_sizes}, seed={seed}")

    # Setear semillas
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    import random
    random.seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Crear modelo y mover a dispositivo
    model = TabularNNImproved(
        embedding_sizes=embedding_sizes,
        num_numerical=len(feature_cols),
        hidden_sizes=hidden_sizes,
        dropout=dropout
    ).to(device)

    # Entrenamiento (ajustá la función train_model según tu código)
    y_true_gs, y_pred_gs, best_epoch = train_model(
        model, train_loader, val_loader,
        n_epochs=20, lr=lr, patience=4)

    rmse = np.sqrt(np.mean((np.array(y_true_gs) - np.array(y_pred_gs)) ** 2))
    mae = mean_absolute_error(y_true_gs, y_pred_gs)

    results.append({
        'lr': lr,
        'dropout': dropout,
        'hidden_sizes': hidden_sizes,
        'seed': seed,
        'rmse': rmse,
        'mae': mae,
        'best_epoch': best_epoch
    })

    print(f"✅ RMSE = {rmse:.4f} | MAE = {mae:.4f}")

    if rmse < best_rmse:
        best_rmse = rmse
        torch.save(model.state_dict(), f"best_model_rmse{rmse:.4f}_lr{lr}_do{dropout}_seed{seed}.pth")
        print("💾 Modelo guardado (mejor hasta ahora)")

    del model
    torch.cuda.empty_cache()
    gc.collect()

# ---- GUARDAR Y MOSTRAR RESULTADOS ----
results_df = pd.DataFrame(results).sort_values(by='rmse')
print("\n📊 Mejores combinaciones:")
print(results_df.head(50))
results_df.to_csv("gridsearch_results.csv", index=False)


In [None]:
import torch
import math
from torch.utils.data import ConcatDataset, DataLoader

# ------------------------
# Prepara el dataset final
# ------------------------
# train_dataset y val_dataset ya deberían estar definidos
train_val_dataset = ConcatDataset([train_dataset, val_dataset])

batch_size = 1024
train_loader_full = DataLoader(train_val_dataset, batch_size=batch_size, shuffle=True)


In [None]:
import pandas as pd
import ast
import json

# --- CONFIGURACIÓN ---
csv_path = "gridsearch_results.csv"
ensemble_size = 50
top_n = 100  # Top-N por métrica combinada a considerar para maximizar diversidad

# --- CARGA Y PREPROCESAMIENTO ---
df = pd.read_csv(csv_path)

# Normalización z-score de métricas
df['mae_z'] = (df['mae'] - df['mae'].mean()) / df['mae'].std()
df['rmse_z'] = (df['rmse'] - df['rmse'].mean()) / df['rmse'].std()
df['score'] = df['mae_z'] + df['rmse_z']

# Selección top-N mejores por score combinado
df_top = df.sort_values('score').head(top_n).copy()

# Convertir string a lista en 'hidden_sizes' si corresponde
def safe_eval(val):
    if isinstance(val, str):
        try:
            return ast.literal_eval(val)
        except Exception:
            return val  # Si falla, lo deja como está
    return val

df_top['hidden_sizes'] = df_top['hidden_sizes'].apply(safe_eval)
# Crear columna auxiliar como string para deduplicar
df_top['hidden_sizes_str'] = df_top['hidden_sizes'].apply(str)

# Elegir modelos diversos: por combinación única de (lr, dropout, hidden_sizes_str)
diverse = df_top.drop_duplicates(subset=['lr', 'dropout', 'hidden_sizes_str'])

# Si hay menos de ensemble_size, completar con otros buenos modelos (distinta seed)
if len(diverse) < ensemble_size:
    faltan = ensemble_size - len(diverse)
    restantes = df_top[~df_top.index.isin(diverse.index)].sort_values('score')
    # Elegir por combinación diferente de seed
    ya = set(tuple(x) for x in diverse[['lr', 'dropout', 'hidden_sizes_str']].values)
    extras = []
    for _, row in restantes.iterrows():
        key = (row['lr'], row['dropout'], row['hidden_sizes_str'])
        if key not in ya:
            extras.append(row)
            ya.add(key)
        if len(extras) >= faltan:
            break
    diverse = pd.concat([diverse, pd.DataFrame(extras)], ignore_index=True)
else:
    diverse = diverse.head(ensemble_size)

# --- CREACIÓN DE CONFIGS ---
configs = []
for i, row in diverse.reset_index(drop=True).iterrows():
    name = f"mlp_lr_{row['lr']}_drop_{row['dropout']}_hs_{'-'.join(map(str, row['hidden_sizes']))}_seed_{int(row['seed'])}"
    name_short = f"modelo_{i+1}"
    configs.append({
        "lr": float(row['lr']),
        "dropout": float(row['dropout']),
        "hidden_sizes": list(row['hidden_sizes']),
        "seed": int(row['seed']),
        "n_epochs": int(row['best_epoch']),
        "name": name_short,
        "name_desc": name
    })

# --- IMPRESIÓN DE RESUMEN ---
print(f"\nModelos seleccionados para ensemble (total={len(configs)}):")
for cfg in configs:
    print(cfg)

print("\nResumen por (lr, dropout):")
print(diverse.groupby(['lr', 'dropout']).size().unstack(fill_value=0))

# (Opcional) Guardar configs a archivo para uso posterior
with open("ensemble_configs.json", "w") as f:
    json.dump(configs, f, indent=2)


In [None]:
# ------------------------
# Entrenamiento y guardado
# ------------------------

def set_seed(seed):
    import random
    import numpy as np
    import torch
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

for cfg in configs:
    print(f"\n🔧 Entrenando {cfg['name']} - lr={cfg['lr']} dropout={cfg['dropout']} hidden={cfg['hidden_sizes']} seed={cfg['seed']} epochs={cfg['n_epochs']}")
    
    set_seed(cfg['seed'])

    model = TabularNNImproved(
        embedding_sizes=embedding_sizes,
        num_numerical=len(feature_cols),
        hidden_sizes=cfg['hidden_sizes'],
        dropout=cfg['dropout']
    ).to(device)

    # Entrena sobre TODO el set (sin early stopping)
    y_true, y_pred, best_epoch = train_model(
        model,
        train_loader_full,
        val_loader=None,
        n_epochs=cfg['n_epochs'],
        lr=cfg['lr'],
        patience=None  # No early stopping en entrenamiento final
        # Quitar alpha y penalty_indices/coefficients si ya no los usás
    )

    torch.save(model.state_dict(), f"{cfg['name']}_final.pt")
    print(f"💾 Modelo guardado: {cfg['name']}_final.pt")

print("\n✅ ¡Entrenamiento y guardado de los 50 mejores modelos finalizado!")



In [None]:
from torch.utils.data import DataLoader, TensorDataset

# --- Predicción MLP (PyTorch) (50 modelos) ---

# Asegurarse de que las categóricas fueron correctamente encodeadas ANTES de esto.

for col in cat_cols:
    df_pred[col] = df_pred[col].astype(np.int64)
for col in feature_cols:
    df_pred[col] = df_pred[col].astype(np.float32)

X_cats = torch.LongTensor(df_pred[cat_cols].values)
X_conts = torch.FloatTensor(df_pred[feature_cols].values)
ds_pred = TensorDataset(X_cats, X_conts)
pred_loader = DataLoader(ds_pred, batch_size=8192, shuffle=False)

def predict_model(model, loader, device):
    model.eval()
    preds = []
    with torch.no_grad():
        for X_cats, X_conts in loader:
            X_cats = X_cats.to(device)
            X_conts = X_conts.to(device)
            output = model(X_cats, X_conts)
            preds.append(output.cpu().numpy().reshape(-1))
    return np.concatenate(preds)

# Inicializar DataFrame solo con PRODUCT_ID
df_preds_final = pd.DataFrame({'PRODUCT_ID': df_pred['PRODUCT_ID'].values})

for i, cfg in enumerate(configs):
    print(f"Prediciendo MLP {i+1}/50...")
    model = TabularNNImproved(
        embedding_sizes=embedding_sizes,
        num_numerical=len(feature_cols),
        hidden_sizes=cfg['hidden_sizes'],
        dropout=cfg['dropout']
    ).to(device)
    model.load_state_dict(torch.load(f"{cfg['name']}_final.pt", map_location=device))
    preds = predict_model(model, pred_loader, device)
    # Ir agregando cada predicción como nueva columna
    df_preds_final[f'mlp_pred_LOG1P_Z_{i+1}'] = preds

# Listo, df_preds_mlp contiene solo PRODUCT_ID y las 50 columnas de predicción



In [None]:
df_preds_final.to_csv("predicciones_mlp_50_modelos.csv", index=False)

In [None]:
# Imprimir Solo las columnas PRODUCT_ID, mlp_pred_LOG1P_Z_1 a mlp_pred_LOG1P_Z_10  mostrando todas las columnas dando mas ancho 
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', '{:.4f}'.format)
print(df_preds_final[['PRODUCT_ID'] + [f'mlp_pred_LOG1P_Z_{i+1}' for i in range(len(configs))]].head())

In [34]:
import gc
import optuna
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor, early_stopping
from sklearn.metrics import mean_absolute_error, mean_squared_error

# --- SETUP ---
target_col = 'CLASE_LOG1P_Z'
feature_cols = [col for col in df_train.columns if col != target_col]

X_tr = df_train[feature_cols]
y_tr = df_train[target_col]
X_val = df_val[feature_cols]
y_val = df_val[target_col]

# --- OPTUNA OBJECTIVE ---
def objective_lgbm(trial):
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.0005, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 1024),
        "max_depth": trial.suggest_int("max_depth", 4, 64),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 1.0),
        "random_state": trial.number,   # Distinto para cada trial
        "n_jobs": -1,
        "verbosity": -1,  # Mínimo verbose en LightGBM
    }
    model = LGBMRegressor(**params)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[early_stopping(stopping_rounds=30, verbose=False)],
    )
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    trial.set_user_attr("mae", mean_absolute_error(y_val, preds))
    gc.collect()
    return rmse

# --- OPTIMIZE ---
N_MODELS = 50
N_TRIALS = 1000  

study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=101))
study.optimize(objective_lgbm, n_trials=N_TRIALS, n_jobs=28, show_progress_bar=False)

# --- EXTRACT BEST 50 PARAMS ---
trials_df = study.trials_dataframe()
trials_df["mae"] = [t.user_attrs.get("mae", np.nan) for t in study.trials]

top_lgbm_trials = trials_df.sort_values("value").head(N_MODELS)
final_configs = []
for i, row in top_lgbm_trials.iterrows():
    params = row.filter(like='params_').to_dict()
    params = {k.replace('params_', ''): v for k, v in params.items()}
    for p in ["num_leaves", "max_depth", "n_estimators", "min_child_samples"]:
        params[p] = int(params[p])
    params["random_state"] = int(row["number"])
    params["n_jobs"] = -1
    params["verbosity"] = -1
    final_configs.append(params)

# --- GUARDADO ---
top_lgbm_trials.to_csv("optuna_lgbm_trials.csv", index=False)
import json
with open("lgbm_ensemble_configs.json", "w") as f:
    json.dump(final_configs, f, indent=2)

# --- PRINT RESUMEN SIMPLE ---
print(f"Top 50 RMSE: {top_lgbm_trials['value'].head(50).values}")
print(f"Top 50 MAE: {top_lgbm_trials['mae'].head(50).values}")
print(final_configs)



[I 2025-07-13 18:19:57,667] A new study created in memory with name: no-name-065eb71e-faa2-423a-987e-0c094bdf9c29
[I 2025-07-13 18:19:59,843] Trial 24 finished with value: 0.26702409138423416 and parameters: {'learning_rate': 0.15875633634463954, 'num_leaves': 939, 'max_depth': 41, 'n_estimators': 408, 'min_child_samples': 49, 'subsample': 0.9097488226116202, 'colsample_bytree': 0.914337130026857, 'reg_alpha': 0.0005700262612054611, 'reg_lambda': 3.155762378404478e-06, 'min_split_gain': 0.4055287629535739}. Best is trial 24 with value: 0.26702409138423416.
[I 2025-07-13 18:20:00,069] Trial 17 finished with value: 0.2653434658760615 and parameters: {'learning_rate': 0.12264239767230868, 'num_leaves': 467, 'max_depth': 17, 'n_estimators': 262, 'min_child_samples': 33, 'subsample': 0.5515815675632874, 'colsample_bytree': 0.876224717658568, 'reg_alpha': 1.5400179784928922, 'reg_lambda': 0.000177911181774729, 'min_split_gain': 0.8133126429426099}. Best is trial 17 with value: 0.265343465876

Top 50 RMSE: [0.25013803 0.25015897 0.25061038 0.25081296 0.2509764  0.25102951
 0.25118281 0.25129197 0.25131252 0.25131616 0.25144026 0.25147312
 0.25148403 0.2515632  0.25177001 0.25186962 0.25190384 0.25195924
 0.25202349 0.25206726 0.2520765  0.25209264 0.25217203 0.25224362
 0.25226043 0.252263   0.25226521 0.25234989 0.25235962 0.25248552
 0.25250802 0.25253931 0.25258298 0.25267482 0.25270297 0.25270371
 0.25272397 0.25274241 0.25277409 0.25282302 0.25287728 0.25297772
 0.25298461 0.25299827 0.25300125 0.25300566 0.25300669 0.25301852
 0.25311975 0.25317351]
Top 50 MAE: [0.16849681 0.16951867 0.16905083 0.17030283 0.16968168 0.16832455
 0.16824666 0.16813887 0.1684779  0.16923551 0.16981503 0.16900283
 0.16872481 0.17014403 0.16962674 0.16998693 0.17026417 0.16995742
 0.16963327 0.16953393 0.16983041 0.17064792 0.17044429 0.17077118
 0.1697433  0.17080232 0.17029002 0.16962541 0.16962285 0.1719581
 0.17116138 0.17104676 0.17043269 0.16908039 0.17090235 0.16940254
 0.17031577 0.

In [35]:
print(final_configs)

[{'colsample_bytree': 0.5366683455202211, 'learning_rate': 0.026505797796996997, 'max_depth': 56, 'min_child_samples': 37, 'min_split_gain': 0.00047950183767811314, 'n_estimators': 913, 'num_leaves': 1002, 'reg_alpha': 0.00435867686025894, 'reg_lambda': 4.619094356259703, 'subsample': 0.5915635117382665, 'random_state': 274, 'n_jobs': -1, 'verbosity': -1}, {'colsample_bytree': 0.5655732670083103, 'learning_rate': 0.05154252449493855, 'max_depth': 48, 'min_child_samples': 18, 'min_split_gain': 0.00021750437433087132, 'n_estimators': 899, 'num_leaves': 674, 'reg_alpha': 0.002067238789656793, 'reg_lambda': 4.979218537566189, 'subsample': 0.8305786077150802, 'random_state': 794, 'n_jobs': -1, 'verbosity': -1}, {'colsample_bytree': 0.5593843342661977, 'learning_rate': 0.05989633870201197, 'max_depth': 31, 'min_child_samples': 44, 'min_split_gain': 0.00040824601669856866, 'n_estimators': 480, 'num_leaves': 773, 'reg_alpha': 0.012387611253839035, 'reg_lambda': 2.224324207762724, 'subsample': 

In [37]:
import joblib
from lightgbm import LGBMRegressor

def train_and_save_model(i, params, X_full, y_full):
    print(f"Entrenando modelo {i+1}/50...")
    model = LGBMRegressor(**params)
    model.fit(X_full, y_full)
    joblib.dump(model, f'lgbm_model_{i+1:02d}.pkl')
    return f"Modelo {i+1} terminado"

results = joblib.Parallel(n_jobs=20)(
    joblib.delayed(train_and_save_model)(i, params, X_full, y_full)
    for i, params in enumerate(final_configs[:50])
)

print(results)
print("¡Entrenamiento y guardado de los 50 modelos finalizado!")



Entrenando modelo 1/50...
Entrenando modelo 3/50...
Entrenando modelo 4/50...
Entrenando modelo 10/50...
Entrenando modelo 8/50...
Entrenando modelo 6/50...
Entrenando modelo 20/50...
Entrenando modelo 15/50...
Entrenando modelo 2/50...
Entrenando modelo 5/50...
Entrenando modelo 18/50...
Entrenando modelo 7/50...
Entrenando modelo 14/50...
Entrenando modelo 11/50...
Entrenando modelo 16/50...
Entrenando modelo 9/50...
Entrenando modelo 19/50...
Entrenando modelo 17/50...
Entrenando modelo 12/50...
Entrenando modelo 13/50...
Entrenando modelo 21/50...
Entrenando modelo 22/50...
Entrenando modelo 23/50...
Entrenando modelo 24/50...
Entrenando modelo 25/50...
Entrenando modelo 26/50...
Entrenando modelo 27/50...
Entrenando modelo 28/50...
Entrenando modelo 29/50...
Entrenando modelo 30/50...
Entrenando modelo 31/50...
Entrenando modelo 32/50...
Entrenando modelo 33/50...
Entrenando modelo 34/50...
Entrenando modelo 35/50...
Entrenando modelo 36/50...
Entrenando modelo 37/50...
Entrenando

In [38]:
import joblib

# Cargar los 50 modelos LightGBM entrenados
lgbm_models = []
for i in range(1, 51):
    model = joblib.load(f'lgbm_model_{i:02d}.pkl')
    lgbm_models.append(model)


In [42]:
import numpy as np

# --- Predicción LightGBM (50 modelos) ---
lgbm_preds = []

feature_cols = [col for col in df_pred.columns if col != target_col]

# Asegurarse de que las columnas categóricas tengan el mismo dtype que en el entrenamiento
for col in cat_cols:
    if col in df_pred.columns and col in df_train.columns:
        df_pred[col] = df_pred[col].astype(df_train[col].dtype)

X_pred_lgbm = df_pred[feature_cols]  

for i, model in enumerate(lgbm_models):
    print(f"Prediciendo LightGBM {i+1}/50...")
    preds = model.predict(X_pred_lgbm)
    lgbm_preds.append(preds)

lgbm_preds = np.stack(lgbm_preds).T  # shape (N, 50)

# --- Agregar predicciones LGBM al DataFrame existente con resultados de MLP ---
for i in range(50):
    df_preds_final[f'lgbm_pred_LOG1P_Z_{i+1}'] = lgbm_preds[:, i]



Prediciendo LightGBM 1/50...
Prediciendo LightGBM 2/50...
Prediciendo LightGBM 3/50...
Prediciendo LightGBM 4/50...
Prediciendo LightGBM 5/50...
Prediciendo LightGBM 6/50...
Prediciendo LightGBM 7/50...
Prediciendo LightGBM 8/50...
Prediciendo LightGBM 9/50...
Prediciendo LightGBM 10/50...
Prediciendo LightGBM 11/50...
Prediciendo LightGBM 12/50...
Prediciendo LightGBM 13/50...
Prediciendo LightGBM 14/50...
Prediciendo LightGBM 15/50...
Prediciendo LightGBM 16/50...
Prediciendo LightGBM 17/50...
Prediciendo LightGBM 18/50...
Prediciendo LightGBM 19/50...
Prediciendo LightGBM 20/50...
Prediciendo LightGBM 21/50...
Prediciendo LightGBM 22/50...
Prediciendo LightGBM 23/50...
Prediciendo LightGBM 24/50...
Prediciendo LightGBM 25/50...
Prediciendo LightGBM 26/50...
Prediciendo LightGBM 27/50...
Prediciendo LightGBM 28/50...
Prediciendo LightGBM 29/50...
Prediciendo LightGBM 30/50...
Prediciendo LightGBM 31/50...
Prediciendo LightGBM 32/50...
Prediciendo LightGBM 33/50...
Prediciendo LightGB

  df_preds_final[f'lgbm_pred_LOG1P_Z_{i+1}'] = lgbm_preds[:, i]


In [46]:
# Imprimir Solo las columnas PRODUCT_ID, mlp_pred_LOG1P_Z_1 a mlp_pred_LOG1P_Z_10  mostrando todas las columnas dando mas ancho 
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 5000)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', '{:.4f}'.format)
print(df_preds_final.head(50))

    PRODUCT_ID  mlp_pred_LOG1P_Z_1  mlp_pred_LOG1P_Z_2  mlp_pred_LOG1P_Z_3  mlp_pred_LOG1P_Z_4  mlp_pred_LOG1P_Z_5  mlp_pred_LOG1P_Z_6  mlp_pred_LOG1P_Z_7  mlp_pred_LOG1P_Z_8  mlp_pred_LOG1P_Z_9  mlp_pred_LOG1P_Z_10  mlp_pred_LOG1P_Z_11  mlp_pred_LOG1P_Z_12  mlp_pred_LOG1P_Z_13  mlp_pred_LOG1P_Z_14  mlp_pred_LOG1P_Z_15  mlp_pred_LOG1P_Z_16  mlp_pred_LOG1P_Z_17  mlp_pred_LOG1P_Z_18  mlp_pred_LOG1P_Z_19  mlp_pred_LOG1P_Z_20  mlp_pred_LOG1P_Z_21  mlp_pred_LOG1P_Z_22  mlp_pred_LOG1P_Z_23  mlp_pred_LOG1P_Z_24  mlp_pred_LOG1P_Z_25  mlp_pred_LOG1P_Z_26  mlp_pred_LOG1P_Z_27  mlp_pred_LOG1P_Z_28  mlp_pred_LOG1P_Z_29  mlp_pred_LOG1P_Z_30  mlp_pred_LOG1P_Z_31  mlp_pred_LOG1P_Z_32  mlp_pred_LOG1P_Z_33  mlp_pred_LOG1P_Z_34  mlp_pred_LOG1P_Z_35  mlp_pred_LOG1P_Z_36  mlp_pred_LOG1P_Z_37  mlp_pred_LOG1P_Z_38  mlp_pred_LOG1P_Z_39  mlp_pred_LOG1P_Z_40  mlp_pred_LOG1P_Z_41  mlp_pred_LOG1P_Z_42  mlp_pred_LOG1P_Z_43  mlp_pred_LOG1P_Z_44  mlp_pred_LOG1P_Z_45  mlp_pred_LOG1P_Z_46  mlp_pred_LOG1P_Z_47  mlp_pr

In [47]:
# Cargar desde disco
scaler_y = joblib.load('scaler_y_CLASE_LOG1P.joblib')
#y_pred_log1p = scaler_y.inverse_transform(y_pred.reshape(-1, 1)).flatten()

In [48]:
def inverse_log_transform_signed(x):
    return np.sign(x) * np.expm1(np.abs(x))

def inv_transform_with_scaler(arr, scaler_y):
    # arr debe tener forma (N,)
    arr = arr.reshape(-1, 1)  # scaler espera 2D
    # Paso 1: inversa del escalado
    log1p_vals = scaler_y.inverse_transform(arr).flatten()
    # Paso 2: inversa de log1p con signo
    orig_vals = inverse_log_transform_signed(log1p_vals)
    return orig_vals

# Crear columnas nuevas con sufijo "_ORIG"
for col in df_preds_final.columns:
    if col not in ['CUSTOMER_ID', 'PRODUCT_ID']:
        arr = df_preds_final[col].values
        df_preds_final[col.replace('_LOG1P_Z_', '_ORIG_')] = inv_transform_with_scaler(arr, scaler_y)

  df_preds_final[col.replace('_LOG1P_Z_', '_ORIG_')] = inv_transform_with_scaler(arr, scaler_y)
  df_preds_final[col.replace('_LOG1P_Z_', '_ORIG_')] = inv_transform_with_scaler(arr, scaler_y)
  df_preds_final[col.replace('_LOG1P_Z_', '_ORIG_')] = inv_transform_with_scaler(arr, scaler_y)
  df_preds_final[col.replace('_LOG1P_Z_', '_ORIG_')] = inv_transform_with_scaler(arr, scaler_y)
  df_preds_final[col.replace('_LOG1P_Z_', '_ORIG_')] = inv_transform_with_scaler(arr, scaler_y)
  df_preds_final[col.replace('_LOG1P_Z_', '_ORIG_')] = inv_transform_with_scaler(arr, scaler_y)
  df_preds_final[col.replace('_LOG1P_Z_', '_ORIG_')] = inv_transform_with_scaler(arr, scaler_y)
  df_preds_final[col.replace('_LOG1P_Z_', '_ORIG_')] = inv_transform_with_scaler(arr, scaler_y)
  df_preds_final[col.replace('_LOG1P_Z_', '_ORIG_')] = inv_transform_with_scaler(arr, scaler_y)
  df_preds_final[col.replace('_LOG1P_Z_', '_ORIG_')] = inv_transform_with_scaler(arr, scaler_y)
  df_preds_final[col.replace('_LOG1P_Z_'

In [49]:
# Eliminar de df_preds_final las columnas con "_LOG1P_Z_"
cols_to_remove = [col for col in df_preds_final.columns if '_LOG1P_Z_' in col]
df_preds_final.drop(columns=cols_to_remove, inplace=True)

In [50]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 5000)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', '{:.4f}'.format)
print(df_preds_final.head(50))

    PRODUCT_ID  mlp_pred_ORIG_1  mlp_pred_ORIG_2  mlp_pred_ORIG_3  mlp_pred_ORIG_4  mlp_pred_ORIG_5  mlp_pred_ORIG_6  mlp_pred_ORIG_7  mlp_pred_ORIG_8  mlp_pred_ORIG_9  mlp_pred_ORIG_10  mlp_pred_ORIG_11  mlp_pred_ORIG_12  mlp_pred_ORIG_13  mlp_pred_ORIG_14  mlp_pred_ORIG_15  mlp_pred_ORIG_16  mlp_pred_ORIG_17  mlp_pred_ORIG_18  mlp_pred_ORIG_19  mlp_pred_ORIG_20  mlp_pred_ORIG_21  mlp_pred_ORIG_22  mlp_pred_ORIG_23  mlp_pred_ORIG_24  mlp_pred_ORIG_25  mlp_pred_ORIG_26  mlp_pred_ORIG_27  mlp_pred_ORIG_28  mlp_pred_ORIG_29  mlp_pred_ORIG_30  mlp_pred_ORIG_31  mlp_pred_ORIG_32  mlp_pred_ORIG_33  mlp_pred_ORIG_34  mlp_pred_ORIG_35  mlp_pred_ORIG_36  mlp_pred_ORIG_37  mlp_pred_ORIG_38  mlp_pred_ORIG_39  mlp_pred_ORIG_40  mlp_pred_ORIG_41  mlp_pred_ORIG_42  mlp_pred_ORIG_43  mlp_pred_ORIG_44  mlp_pred_ORIG_45  mlp_pred_ORIG_46  mlp_pred_ORIG_47  mlp_pred_ORIG_48  mlp_pred_ORIG_49  mlp_pred_ORIG_50  lgbm_pred_ORIG_1  lgbm_pred_ORIG_2  lgbm_pred_ORIG_3  lgbm_pred_ORIG_4  lgbm_pred_ORIG_5  lgb

In [51]:
# Columnas de predicción

mlp_cols = [c for c in df_preds_final.columns if c.startswith('mlp_pred_ORIG_')]
lgbm_cols = [c for c in df_preds_final.columns if c.startswith('lgbm_pred_ORIG_')]

df_preds_final['mlp_median'] = df_preds_final[mlp_cols].median(axis=1)
df_preds_final['lgbm_median'] = df_preds_final[lgbm_cols].median(axis=1)
df_preds_final['ensemble_median'] = df_preds_final[mlp_cols + lgbm_cols].median(axis=1)

# Medias (nuevo)
df_preds_final['mlp_mean'] = df_preds_final[mlp_cols].mean(axis=1)
df_preds_final['lgbm_mean'] = df_preds_final[lgbm_cols].mean(axis=1)
df_preds_final['ensemble_mean'] = df_preds_final[mlp_cols + lgbm_cols].mean(axis=1)

# Desvío estándar
df_preds_final['mlp_std'] = df_preds_final[mlp_cols].std(axis=1)
df_preds_final['lgbm_std'] = df_preds_final[lgbm_cols].std(axis=1)
df_preds_final['ensemble_std'] = df_preds_final[mlp_cols + lgbm_cols].std(axis=1)

  df_preds_final['mlp_median'] = df_preds_final[mlp_cols].median(axis=1)
  df_preds_final['lgbm_median'] = df_preds_final[lgbm_cols].median(axis=1)
  df_preds_final['ensemble_median'] = df_preds_final[mlp_cols + lgbm_cols].median(axis=1)
  df_preds_final['mlp_mean'] = df_preds_final[mlp_cols].mean(axis=1)
  df_preds_final['lgbm_mean'] = df_preds_final[lgbm_cols].mean(axis=1)
  df_preds_final['ensemble_mean'] = df_preds_final[mlp_cols + lgbm_cols].mean(axis=1)
  df_preds_final['mlp_std'] = df_preds_final[mlp_cols].std(axis=1)
  df_preds_final['lgbm_std'] = df_preds_final[lgbm_cols].std(axis=1)
  df_preds_final['ensemble_std'] = df_preds_final[mlp_cols + lgbm_cols].std(axis=1)


In [53]:
print(df_preds_final)

     PRODUCT_ID  mlp_pred_ORIG_1  mlp_pred_ORIG_2  mlp_pred_ORIG_3  mlp_pred_ORIG_4  mlp_pred_ORIG_5  mlp_pred_ORIG_6  mlp_pred_ORIG_7  mlp_pred_ORIG_8  mlp_pred_ORIG_9  mlp_pred_ORIG_10  mlp_pred_ORIG_11  mlp_pred_ORIG_12  mlp_pred_ORIG_13  mlp_pred_ORIG_14  mlp_pred_ORIG_15  mlp_pred_ORIG_16  mlp_pred_ORIG_17  mlp_pred_ORIG_18  mlp_pred_ORIG_19  mlp_pred_ORIG_20  mlp_pred_ORIG_21  mlp_pred_ORIG_22  mlp_pred_ORIG_23  mlp_pred_ORIG_24  mlp_pred_ORIG_25  mlp_pred_ORIG_26  mlp_pred_ORIG_27  mlp_pred_ORIG_28  mlp_pred_ORIG_29  mlp_pred_ORIG_30  mlp_pred_ORIG_31  mlp_pred_ORIG_32  mlp_pred_ORIG_33  mlp_pred_ORIG_34  mlp_pred_ORIG_35  mlp_pred_ORIG_36  mlp_pred_ORIG_37  mlp_pred_ORIG_38  mlp_pred_ORIG_39  mlp_pred_ORIG_40  mlp_pred_ORIG_41  mlp_pred_ORIG_42  mlp_pred_ORIG_43  mlp_pred_ORIG_44  mlp_pred_ORIG_45  mlp_pred_ORIG_46  mlp_pred_ORIG_47  mlp_pred_ORIG_48  mlp_pred_ORIG_49  mlp_pred_ORIG_50  lgbm_pred_ORIG_1  lgbm_pred_ORIG_2  lgbm_pred_ORIG_3  lgbm_pred_ORIG_4  lgbm_pred_ORIG_5  lg

In [54]:
df_preds_final.to_csv('predicciones_finales.csv', index=False)