In [None]:
import numpy as np
import pandas as pd
import gc
import os
import matplotlib.pyplot as plt
import polars as pl
from sklearn.metrics import mean_squared_error, mean_absolute_error
from joblib import Parallel, delayed
from more_itertools import chunked
from functools import reduce
from typing import List
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import pandas as pd
import joblib
import os
import torch.nn as nn

In [None]:
import torch
# Report CUDA availability, the number of visible devices, and the name of device 0.
print(
    torch.cuda.is_available(),
    torch.cuda.device_count(),
    torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU",
    sep="\n",
)

In [None]:
# Load the full feature table (all periods) from parquet.
df_full = pd.read_parquet(
    './data/train_val_NN_TORCH.parquet',
    engine='fastparquet',
)

In [None]:
# Split by PERIODO: train up to 201908, validation 201909-201910, prediction 201912.
df_train = df_full.loc[df_full['PERIODO'].le(201908)].copy()
df_val = df_full.loc[df_full['PERIODO'].between(201909, 201910)].copy()
df_pred = df_full.loc[df_full['PERIODO'].eq(201912)].copy()

# The three copies are independent, so the source frame can be released.
del df_full
gc.collect()

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoders = {}

target_col = 'CLASE_LOG1P_Z'

# Categorical columns that are fed to the embedding layers.
cat_cols = ['ID_CAT1', 'ID_CAT2', 'ID_CAT3', 'ID_BRAND', 'SKU_SIZE',
            'ANIO', 'MES', 'TRIMESTRE', 'MES_PROBLEMATICO', 'CUSTOMER_RANK_BIN',
            'PRODUCT_RANK_BIN']

# Label-encode every categorical column. Encoders are fitted on the training
# split only and then applied to the validation and prediction splits.
# NOTE: run this once, on the raw frames. Running it again on already-encoded
# columns would re-fit the encoders on the integer codes.
for col in cat_cols:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col].astype(str))

    # The encoder was fitted on the *string* form of the values, so lookups must
    # use the string form too; otherwise non-string columns never match a class.
    # A dict lookup also avoids one `le.transform` call per row.
    code_by_class = {cls: code for code, cls in enumerate(le.classes_)}
    # Categories unseen in training go to the extra index that the "+1" in
    # `embedding_sizes` reserves, instead of colliding with class 0.
    unknown_code = len(le.classes_)

    df_val[col] = (
        df_val[col].astype(str).map(code_by_class).fillna(unknown_code).astype('int64')
    )
    df_pred[col] = (
        df_pred[col].astype(str).map(code_by_class).fillna(unknown_code).astype('int64')
    )
    label_encoders[col] = le

# (number of embedding rows, embedding width) per categorical column:
# one row per training class plus one spare row for unseen categories.
embedding_sizes = [
    (df_train[col].nunique() + 1, min(50, (df_train[col].nunique() + 1) // 2))
    for col in cat_cols
]

# Columns that must never be used as model inputs.
excluir = ['PERIODO', 'CUSTOMER_ID', 'PRODUCT_ID', 'CLASE_LOG1P_Z', 'ORDINAL']

# Numerical features: everything that is neither excluded nor categorical.
feature_cols = [col for col in df_train.columns if col not in excluir and col not in cat_cols]


In [None]:
# Identifier, period, ordinal and target columns must not appear in either
# the numerical feature list or the categorical list.
for forbidden in ('CUSTOMER_ID', 'PRODUCT_ID', 'PERIODO', 'CLASE_LOG1P_Z', 'ORDINAL'):
    assert forbidden not in feature_cols
    assert forbidden not in cat_cols


In [None]:
from torch.utils.data import Dataset
import torch

class TabularDataset(Dataset):
    """Dataset over a DataFrame, split into categorical codes, numerical features and an optional target.

    Args:
        df: source DataFrame.
        cat_cols: columns converted to a ``torch.long`` tensor.
        num_cols: columns converted to a ``torch.float32`` tensor.
        target_col: optional target column; stored as a float32 tensor with a
            trailing dimension of size 1. When omitted, items carry no target.
    """

    def __init__(self, df, cat_cols, num_cols, target_col=None):
        self.cat_data = torch.tensor(df[cat_cols].values, dtype=torch.long)
        self.num_data = torch.tensor(df[num_cols].values, dtype=torch.float32)
        self.has_target = target_col is not None
        self.y = (
            torch.tensor(df[target_col].values, dtype=torch.float32).unsqueeze(1)
            if self.has_target
            else None
        )

    def __len__(self):
        # One item per row of the categorical tensor.
        return self.cat_data.size(0)

    def __getitem__(self, idx):
        # (categorical, numerical) when there is no target, else (categorical, numerical, target).
        sample = (self.cat_data[idx], self.num_data[idx])
        if self.has_target:
            sample = sample + (self.y[idx],)
        return sample


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class WeightedMSELossMultiStable(nn.Module):
    """MSE loss with a per-sample weight driven by selected numerical features.

    The weight of each sample is
    ``1 + alpha * sum_i(coef_i * |tanh(x_num[:, idx_i])|)`` and the loss is the
    mean of ``weight * (preds - targets) ** 2``.

    Args:
        penalty_indices: column indices into ``x_num`` (e.g. ``[10, 11, ..., 20]``).
            ``None`` means no penalty columns, i.e. plain MSE.
        coefficients: one coefficient per entry of ``penalty_indices``, in the
            same order. ``None`` is treated as an empty list.
        alpha: penalty strength.
        debug: when True, occasionally (about 1% of calls) prints the mean weight.

    Raises:
        ValueError: if ``penalty_indices`` and ``coefficients`` differ in length.
    """

    def __init__(self, penalty_indices, coefficients, alpha=0.1, debug=False):
        super().__init__()
        penalty_indices = [] if penalty_indices is None else list(penalty_indices)
        coefficients = [] if coefficients is None else list(coefficients)
        # zip() would silently drop the unmatched tail, hiding a misconfiguration.
        if len(penalty_indices) != len(coefficients):
            raise ValueError(
                "penalty_indices and coefficients must have the same length "
                f"(got {len(penalty_indices)} and {len(coefficients)})"
            )
        self.penalty_indices = penalty_indices
        self.coefficients = coefficients
        self.alpha = alpha
        self.debug = debug

    def forward(self, preds, targets, x_num):
        """Return the weighted mean squared error for one batch.

        ``preds`` and ``targets`` must be broadcast-compatible with a
        ``[batch_size, 1]`` weight column; ``x_num`` is indexed by column.
        """
        # Per-sample weight, starting at 1 (same shape as targets).
        penalty = torch.ones_like(targets).float()

        for idx, coef in zip(self.penalty_indices, self.coefficients):
            val = x_num[:, idx:idx+1]  # keep the column dimension: [batch_size, 1]
            safe_val = torch.tanh(val)  # bound the magnitude so outliers cannot dominate
            # |tanh| lies in [0, 1); the sign of the term follows the coefficient.
            penalty = penalty + self.alpha * coef * safe_val.abs()

        if self.debug and torch.rand(1).item() < 0.01:
            print(f"[LossDebug] Mean penalty: {penalty.mean().item():.4f}")

        error = (preds - targets) ** 2
        weighted_error = penalty * error

        return weighted_error.mean()


In [None]:
# Lag columns TN_LAG_01_Z ... TN_LAG_11_Z that drive the per-sample loss weights.
penalty_cols = [f'TN_LAG_{lag:02d}_Z' for lag in range(1, 12)]
# Position of each of those columns inside the numerical feature list.
penalty_indices = list(map(feature_cols.index, penalty_cols))
print(penalty_cols)
print(penalty_indices)

# One coefficient per lag column, in the same order as penalty_cols.
coefficients = [
    0.236558, 0.178208, -0.060031, -0.161875, -0.007775, 0.151936,
    0.043933, 0.142839, 0.103804, 0.119211, 0.073671,
]

# Stand-alone loss instance with a mild penalty and debug printing enabled.
loss_fn = WeightedMSELossMultiStable(penalty_indices, coefficients, alpha=0.1, debug=True)

In [None]:
# Every column the datasets will read must exist in the training frame.
assert set(cat_cols).issubset(df_train.columns), "Faltan columnas categóricas"
assert set(feature_cols).issubset(df_train.columns), "Faltan columnas numéricas"
assert target_col in df_train.columns, "Falta la variable objetivo"


In [None]:
batch_size = 4096

# Training split: batches are reshuffled on every pass.
train_dataset = TabularDataset(
    df=df_train, cat_cols=cat_cols, num_cols=feature_cols, target_col=target_col
)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)

# Validation split: fixed order.
val_dataset = TabularDataset(
    df=df_val, cat_cols=cat_cols, num_cols=feature_cols, target_col=target_col
)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False
)


In [None]:
# Show the categorical columns, the numerical columns and the target, one per line.
print(cat_cols, feature_cols, target_col, sep="\n")

In [None]:
import torch.nn.functional as F

class TabularNNImproved(nn.Module):
    """MLP over concatenated categorical embeddings and numerical features.

    Args:
        embedding_sizes: iterable of ``(num_embeddings, embedding_dim)`` pairs,
            one per categorical column.
        num_numerical: number of numerical input features.
        hidden_sizes: width of each hidden layer; every hidden layer is
            Linear -> BatchNorm1d -> GELU -> Dropout.
        dropout: dropout probability used after the embeddings and after
            every hidden layer.
    """

    def __init__(self, embedding_sizes, num_numerical, hidden_sizes=[512, 512, 256, 128], dropout=0.1):
        super().__init__()

        # One embedding table per categorical column; track the total embedded width.
        self.embeddings = nn.ModuleList()
        total_emb_width = 0
        for n_rows, emb_width in embedding_sizes:
            self.embeddings.append(nn.Embedding(n_rows, emb_width))
            total_emb_width += emb_width
        self.embedding_dropout = nn.Dropout(dropout)

        # Layer widths: input (embeddings + numerical) followed by the hidden widths.
        widths = [total_emb_width + num_numerical, *hidden_sizes]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend([
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.GELU(),
                nn.Dropout(dropout),
            ])

        # Single-output regression head.
        stack.append(nn.Linear(widths[-1], 1))
        self.model = nn.Sequential(*stack)

    def forward(self, x_cat, x_num):
        # Column i of x_cat is looked up in embedding table i.
        embedded = torch.cat(
            [table(x_cat[:, pos]) for pos, table in enumerate(self.embeddings)], dim=1
        )
        features = torch.cat([self.embedding_dropout(embedded), x_num], dim=1)
        return self.model(features)


In [None]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the network. `.to(device)` moves the parameters and returns the module,
# so a single call is enough (the second `model.to(device)` was redundant).
model = TabularNNImproved(
    embedding_sizes=embedding_sizes,
    num_numerical=len(feature_cols),
    hidden_sizes=[4096,2048,1024,512, 512, 256, 128],
    dropout=0.3
).to(device)
print(model)


In [None]:
from sklearn.metrics import mean_absolute_error, r2_score
import torch
import numpy as np

def train_model(
    model, train_loader, val_loader=None, n_epochs=20, lr=1e-3, alpha=0.5, patience=3,
    penalty_indices=None, coefficients=None
):
    """Train `model` with Adam and the weighted MSE loss, with optional validation and early stopping.

    Args:
        model: network called as ``model(cats, conts)``.
        train_loader: loader yielding ``(cats, conts, y)`` batches.
        val_loader: optional loader with the same batch layout. When given, the
            validation loss drives early stopping and best-weights restoration.
        n_epochs: maximum number of epochs.
        lr: Adam learning rate.
        alpha: penalty strength passed to the loss.
        patience: stop after this many consecutive epochs without a lower
            validation loss; ``None`` disables early stopping.
        penalty_indices: column indices into the numerical batch used by the
            loss; ``None`` means no penalty columns.
        coefficients: one coefficient per penalty index; ``None`` means none.

    Returns:
        ``(y_true, y_pred)`` as numpy arrays over the validation set, taken from
        the epoch with the lowest validation loss (the same epoch whose weights
        are restored into `model`). ``(None, None)`` when `val_loader` is None
        or no epoch ran.
    """
    # Run on the device the model already lives on rather than on a notebook global.
    device = next(model.parameters()).device

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = WeightedMSELossMultiStable(
        penalty_indices=[] if penalty_indices is None else penalty_indices,
        coefficients=[] if coefficients is None else coefficients,
        alpha=alpha,
    )
    best_val_loss = float('inf')
    best_model_state = None
    best_y_true = best_y_pred = None
    y_true = y_pred = None  # defined up front so n_epochs == 0 cannot leave them unbound
    epochs_without_improvement = 0

    for epoch in range(n_epochs):
        # --- Training pass ---
        model.train()
        train_loss = 0.0
        for cats, conts, y in train_loader:
            cats, conts, y = cats.to(device), conts.to(device), y.to(device)
            optimizer.zero_grad()
            batch_pred = model(cats, conts)
            loss = criterion(batch_pred, y, conts)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch figure is a per-sample mean.
            train_loss += loss.item() * y.size(0)

        train_loss /= len(train_loader.dataset)

        if val_loader is None:
            # No validation data: only the training loss can be reported.
            print(f"Epoch {epoch+1}/{n_epochs} | Train Loss: {train_loss:.4f}")
            continue

        # --- Validation pass ---
        model.eval()
        val_loss = 0.0
        y_true_list = []
        y_pred_list = []
        with torch.no_grad():
            for cats, conts, y in val_loader:
                cats, conts, y = cats.to(device), conts.to(device), y.to(device)
                batch_pred = model(cats, conts)
                loss = criterion(batch_pred, y, conts)
                val_loss += loss.item() * y.size(0)
                y_true_list.append(y.cpu().numpy())
                y_pred_list.append(batch_pred.cpu().numpy())

        val_loss /= len(val_loader.dataset)
        y_true = np.concatenate(y_true_list)
        y_pred = np.concatenate(y_pred_list)

        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)

        print(
            f"Epoch {epoch+1}/{n_epochs} | Train Loss: {train_loss:.4f} | "
            f"Val Loss: {val_loss:.4f} | MAE: {mae:.4f} | R²: {r2:.4f}"
        )

        # --- Early stopping bookkeeping ---
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() hands back references to the live tensors; clone them
            # so later optimizer steps cannot overwrite the snapshot.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            best_y_true, best_y_pred = y_true, y_pred
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if patience is not None and epochs_without_improvement >= patience:
                print("🔴 Early stopping triggered")
                break

    if val_loader is None:
        return None, None

    if best_model_state is None:
        # No epoch improved on +inf (no epoch ran, or the loss was never finite):
        # there is nothing to restore, so return whatever the last epoch produced.
        return y_true, y_pred

    # Restore the best weights and return the predictions that belong to them.
    model.load_state_dict(best_model_state)
    return best_y_true, best_y_pred



# Búsqueda de hiperparámetros (Grid Search)
Probamos distintas combinaciones de hiperparámetros y seleccionamos la que da mejor MAE en validación.


In [None]:
from itertools import product
import torch
import gc
import pandas as pd
from sklearn.metrics import mean_absolute_error

# Search space for the grid search.
param_grid = {
    'lr': [1e-3, 5e-4],
    'dropout': [0.2, 0.3],
    'hidden_sizes': [
        [1024, 512, 256],
        [2048, 1024, 512, 256]
    ],
    'alpha': [0, 0.1, 0.3, 0.5, 0.7, 0.9]
}

# Every combination, in the key order of param_grid: (lr, dropout, hidden_sizes, alpha).
param_combinations = list(product(*param_grid.values()))

results = []
best_mae = float('inf')

# One short training run per combination.
for lr, dropout, hidden_sizes, alpha in param_combinations:
    print(f"\n🔧 Entrenando con: lr={lr}, dropout={dropout}, hidden_sizes={hidden_sizes}, alpha={alpha}")

    # Fresh model on the target device.
    model = TabularNNImproved(
        embedding_sizes, len(feature_cols), hidden_sizes=hidden_sizes, dropout=dropout
    ).to(device)

    # Few epochs and a short patience: this is tuning, not final training.
    y_true_gs, y_pred_gs = train_model(
        model,
        train_loader,
        val_loader,
        n_epochs=8,
        lr=lr,
        alpha=alpha,
        patience=2,
        penalty_indices=penalty_indices,
        coefficients=coefficients,
    )

    mae = mean_absolute_error(y_true_gs, y_pred_gs)

    # Record the outcome of this combination.
    results.append(dict(lr=lr, dropout=dropout, hidden_sizes=hidden_sizes, alpha=alpha, mae=mae))

    print(f"✅ MAE = {mae:.4f}")

    # Keep a checkpoint whenever this run beats the best MAE so far.
    if mae < best_mae:
        best_mae = mae
        torch.save(model.state_dict(), f"best_model_mae{mae:.4f}_lr{lr}_do{dropout}_a{alpha}.pth")
        print("💾 Modelo guardado (mejor hasta ahora)")

    # Release the model and any cached GPU memory before the next run.
    del model
    torch.cuda.empty_cache()
    gc.collect()

# Rank the combinations by MAE and show the first rows.
results_df = pd.DataFrame(results).sort_values(by='mae')
print("\n📊 Mejores combinaciones:")
print(results_df.head())

# Persist the full ranking.
results_df.to_csv("gridsearch_results.csv", index=False)


In [None]:
import torch
import math
from torch.utils.data import ConcatDataset, DataLoader

# ------------------------
# Final training data: train and validation datasets chained together
# (train_dataset and val_dataset must already exist)
# ------------------------
train_val_dataset = ConcatDataset((train_dataset, val_dataset))

batch_size = 1024
train_loader_full = DataLoader(
    dataset=train_val_dataset,
    batch_size=batch_size,
    shuffle=True,
)


In [None]:

# ------------------------
# Hyper-parameters of the 10 selected models
# (fill this list in from your own search results)
# ------------------------
configs = [
    {"lr": 0.0005, "dropout": 0.3, "hidden_sizes": [2048, 1024, 512, 256], "alpha": 0,   "n_epochs": 8, "name": "modelo_1"},
    {"lr": 0.001,  "dropout": 0.2, "hidden_sizes": [2048, 1024, 512, 256], "alpha": 0,   "n_epochs": 4, "name": "modelo_2"},
    {"lr": 0.001,  "dropout": 0.3, "hidden_sizes": [2048, 1024, 512, 256], "alpha": 0.1, "n_epochs": 8, "name": "modelo_3"},
    {"lr": 0.0005, "dropout": 0.2, "hidden_sizes": [2048, 1024, 512, 256], "alpha": 0.3, "n_epochs": 6, "name": "modelo_4"},
    {"lr": 0.001,  "dropout": 0.3, "hidden_sizes": [2048, 1024, 512, 256], "alpha": 0.3, "n_epochs": 8, "name": "modelo_5"},
    {"lr": 0.001,  "dropout": 0.2, "hidden_sizes": [1024, 512, 256],       "alpha": 0.7, "n_epochs": 8, "name": "modelo_6"},
    {"lr": 0.001,  "dropout": 0.3, "hidden_sizes": [1024, 512, 256],       "alpha": 0.1, "n_epochs": 8, "name": "modelo_7"},
    {"lr": 0.001,  "dropout": 0.2, "hidden_sizes": [2048, 1024, 512, 256], "alpha": 0.7, "n_epochs": 5, "name": "modelo_8"},
    {"lr": 0.001,  "dropout": 0.3, "hidden_sizes": [2048, 1024, 512, 256], "alpha": 0.3, "n_epochs": 5, "name": "modelo_9"},
    {"lr": 0.0005, "dropout": 0.2, "hidden_sizes": [1024, 512, 256],       "alpha": 0.1, "n_epochs": 4, "name": "modelo_10"},
]

# Scale every epoch budget by 1.3 (rounded up) and show the resulting config.
for cfg in configs:
    cfg['n_epochs'] = math.ceil(cfg['n_epochs'] * 1.3)
    print(cfg)
    


In [None]:
# ------------------------
# Train each configuration on train+validation and save its weights
# ------------------------

for cfg in configs:
    print(f"\n🔧 Entrenando {cfg['name']} - lr={cfg['lr']} dropout={cfg['dropout']} hidden={cfg['hidden_sizes']} alpha={cfg['alpha']} epochs={cfg['n_epochs']}")

    model = TabularNNImproved(
        embedding_sizes,
        len(feature_cols),
        hidden_sizes=cfg['hidden_sizes'],
        dropout=cfg['dropout'],
    ).to(device)

    # No validation loader and no patience: the full epoch budget is always used.
    y_true, y_pred = train_model(
        model,
        train_loader_full,
        val_loader=None,
        patience=None,
        n_epochs=cfg['n_epochs'],
        lr=cfg['lr'],
        alpha=cfg['alpha'],
        penalty_indices=penalty_indices,
        coefficients=coefficients,
    )

    torch.save(model.state_dict(), f"{cfg['name']}_final.pt")
    print(f"💾 Modelo guardado: {cfg['name']}_final.pt")

print("\n✅ ¡Entrenamiento y guardado de los 10 mejores modelos finalizado!")


In [None]:
import lightgbm
# Show the installed LightGBM version.
print(f"{lightgbm.__version__}")

In [None]:
import gc
import optuna
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from lightgbm import early_stopping

# --- SETUP ---
# For LightGBM every column except the target is used as a feature.
target_col = 'CLASE_LOG1P_Z'
feature_cols = [name for name in df_train.columns if name != target_col]

X, y = df_train[feature_cols].values, df_train[target_col].values

# --- OPTUNA OBJECTIVE ---
def objective_lgbm(trial):
    """Optuna objective: fit one LGBMRegressor and return its hold-out MAE.

    Hyper-parameters are sampled from `trial`. The module-level X / y arrays are
    split 85/15 with a seed equal to the trial number, and early stopping
    (30 rounds) watches the hold-out part.
    """
    search_space = dict(
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        num_leaves=trial.suggest_int("num_leaves", 16, 256),
        max_depth=trial.suggest_int("max_depth", 4, 16),
        n_estimators=trial.suggest_int("n_estimators", 100, 1000),
        min_child_samples=trial.suggest_int("min_child_samples", 10, 100),
        subsample=trial.suggest_float("subsample", 0.5, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
        reg_alpha=trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        reg_lambda=trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        min_split_gain=trial.suggest_float("min_split_gain", 0.0, 1.0),
        random_state=42,
        n_jobs=-1,
    )
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=0.15, random_state=trial.number
    )
    regressor = LGBMRegressor(**search_space)
    regressor.fit(
        X_fit, y_fit,
        eval_set=[(X_holdout, y_holdout)],
        callbacks=[early_stopping(stopping_rounds=30, verbose=False)],
    )
    return mean_absolute_error(y_holdout, regressor.predict(X_holdout))


# --- OPTIMIZE ---
N_MODELS = 10
N_TRIALS = 80

study = optuna.create_study(direction="minimize")
study.optimize(objective_lgbm, n_trials=N_TRIALS, n_jobs=28)

# --- EXTRACT THE BEST N_MODELS PARAMETER SETS ---
top_lgbm_trials = study.trials_dataframe().sort_values("value").head(N_MODELS)
final_configs = []
for i, row in top_lgbm_trials.iterrows():
    # Keep only the "params_*" columns and strip that prefix from the names.
    params = {
        key.replace('params_', ''): value
        for key, value in row.filter(like='params_').to_dict().items()
    }
    # These parameters must be passed as integers.
    params.update({
        name: int(params[name])
        for name in ("num_leaves", "max_depth", "n_estimators", "min_child_samples")
    })
    params.update(random_state=42, n_jobs=-1)
    final_configs.append(params)

print(final_configs)


In [None]:
# Show the selected LightGBM parameter sets again.
print(f"{final_configs}")

In [None]:

# Stack train and validation rows into one frame with a fresh 0..n-1 index.
df_full_train = pd.concat([df_train, df_val], ignore_index=True)

# Every column except the target is a LightGBM feature.
target_col = 'CLASE_LOG1P_Z'
feature_cols = [name for name in df_full_train.columns if name != target_col]

X_full, y_full = df_full_train[feature_cols].values, df_full_train[target_col].values

# Hyper-parameter sets pasted in by hand (replace this literal if you load them from a file).
# One dict per LightGBM model; every dict carries the same twelve keys.
param_list = [{'colsample_bytree': 0.9956156662176132, 'learning_rate': 0.150647911151466, 'max_depth': 14, 'min_child_samples': 62, 'min_split_gain': 0.18618312765512604, 'n_estimators': 861, 'num_leaves': 222, 'reg_alpha': 0.0008184678353621553, 'reg_lambda': 0.00181071360138272, 'subsample': 0.8288045639730978, 'random_state': 42, 'n_jobs': -1}, 
{'colsample_bytree': 0.9900835480198793, 'learning_rate': 0.15285688565610836, 'max_depth': 14, 'min_child_samples': 22, 'min_split_gain': 0.21318221132522686, 'n_estimators': 863, 'num_leaves': 218, 'reg_alpha': 0.005832497428105893, 'reg_lambda': 0.00048332566435359725, 'subsample': 0.966212591402847, 'random_state': 42, 'n_jobs': -1}, 
{'colsample_bytree': 0.9810381286523686, 'learning_rate': 0.17126526528861452, 'max_depth': 15, 'min_child_samples': 20, 'min_split_gain': 0.18620761002164188, 'n_estimators': 887, 'num_leaves': 233, 'reg_alpha': 7.525959386634591, 'reg_lambda': 0.002932489509766247, 'subsample': 0.9855006418021718, 'random_state': 42, 'n_jobs': -1}, 
{'colsample_bytree': 0.9991282388618083, 'learning_rate': 0.04595382767022815, 'max_depth': 11, 'min_child_samples': 41, 'min_split_gain': 0.21027442904661392, 'n_estimators': 876, 'num_leaves': 216, 'reg_alpha': 0.0009948983243008944, 'reg_lambda': 0.0007851771585517262, 'subsample': 0.8348381160448405, 'random_state': 42, 'n_jobs': -1}, 
{'colsample_bytree': 0.9997415495087778, 'learning_rate': 0.207311488631083, 'max_depth': 11, 'min_child_samples': 40, 'min_split_gain': 0.21104245822261147, 'n_estimators': 883, 'num_leaves': 256, 'reg_alpha': 0.0010361594632463222, 'reg_lambda': 1.430438254956134e-07, 'subsample': 0.8317847564939153, 'random_state': 42, 'n_jobs': -1}, 
{'colsample_bytree': 0.587328586423979, 'learning_rate': 0.1861100303288766, 'max_depth': 11, 'min_child_samples': 17, 'min_split_gain': 0.003592855256288212, 'n_estimators': 879, 'num_leaves': 226, 'reg_alpha': 0.07837411453395908, 'reg_lambda': 2.5489037442005937e-07, 'subsample': 0.8302404263289589, 'random_state': 42, 'n_jobs': -1}, 
{'colsample_bytree': 0.5895955174460985, 'learning_rate': 0.14941003478023016, 'max_depth': 15, 'min_child_samples': 20, 'min_split_gain': 0.19615530216306443, 'n_estimators': 896, 'num_leaves': 221, 'reg_alpha': 5.010927643947572, 'reg_lambda': 0.0007780698199438967, 'subsample': 0.9998177336233038, 'random_state': 42, 'n_jobs': -1}, 
{'colsample_bytree': 0.5837101193747908, 'learning_rate': 0.15338979834994065, 'max_depth': 11, 'min_child_samples': 64, 'min_split_gain': 0.19477841520356795, 'n_estimators': 856, 'num_leaves': 217, 'reg_alpha': 0.07087365761730123, 'reg_lambda': 6.887002502845986e-08, 'subsample': 0.8244503737000027, 'random_state': 42, 'n_jobs': -1}, 
{'colsample_bytree': 0.9891294837026124, 'learning_rate': 0.14710934806711512, 'max_depth': 14, 'min_child_samples': 66, 'min_split_gain': 0.1888950104677536, 'n_estimators': 898, 'num_leaves': 230, 'reg_alpha': 0.045761956230443775, 'reg_lambda': 0.000545124313416254, 'subsample': 0.8224562428405393, 'random_state': 42, 'n_jobs': -1}, 
{'colsample_bytree': 0.9998748104919317, 'learning_rate': 0.15126642557505776, 'max_depth': 14, 'min_child_samples': 20, 'min_split_gain': 0.25087912345115637, 'n_estimators': 849, 'num_leaves': 219, 'reg_alpha': 0.001191096159067221, 'reg_lambda': 0.0016155410949331161, 'subsample': 0.8372252476108392, 'random_state': 42, 'n_jobs': -1}]

# Fit one regressor per parameter set on the full data and persist each one.
models = []
for i, params in enumerate(param_list):
    print(f"Entrenando modelo {i+1}/10...")
    model = LGBMRegressor(**params).fit(X_full, y_full)  # fit() returns the estimator
    models.append(model)
    # File names are 1-based and zero-padded: lgbm_model_01.pkl, lgbm_model_02.pkl, ...
    joblib.dump(model, f'lgbm_model_{i+1:02d}.pkl')

print("¡Entrenamiento y guardado de modelos finalizado!")


In [None]:
import joblib

# Load the 10 persisted LightGBM models (lgbm_model_01.pkl ... lgbm_model_10.pkl).
lgbm_models = []
for i in range(1, 11):
    model = joblib.load('lgbm_model_{:02d}.pkl'.format(i))
    lgbm_models += [model]


In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

# --- Column configuration ---
target_col = 'CLASE_LOG1P_Z'
# Take the feature list from df_train, not df_pred: this cell adds prediction
# columns to df_pred, so deriving the list from df_pred would change the
# feature set if the cell were run a second time.
feature_cols = [col for col in df_train.columns if col != target_col]

# --- LightGBM prediction (one pass per loaded model) ---
# A DataFrame (not .values) is passed so that column names travel with the data.
# NOTE(review): the models above were fitted on `.values` arrays; confirm that
# passing a DataFrame here does not itself trigger a feature-name warning.
X_pred_lgbm = df_pred[feature_cols]

n_models = len(lgbm_models)
lgbm_preds = []
for i, model in enumerate(lgbm_models):
    print(f"Prediciendo LightGBM {i+1}/{n_models}...")
    preds = model.predict(X_pred_lgbm)
    lgbm_preds.append(preds)

lgbm_preds = np.stack(lgbm_preds)  # shape (n_models, N)
df_pred['lgbm_ensemble_mean'] = lgbm_preds.mean(axis=0)
df_pred['lgbm_ensemble_median'] = np.median(lgbm_preds, axis=0)
# Also keep every individual prediction; iterate over what was actually
# predicted instead of assuming exactly 10 models.
for i, preds in enumerate(lgbm_preds):
    df_pred[f'lgbm_pred_LOG1P_Z_{i+1}'] = preds


In [None]:
from sklearn.preprocessing import LabelEncoder
target_col = 'CLASE_LOG1P_Z'

# Categorical columns fed to the embedding layers.
cat_cols = ['ID_CAT1', 'ID_CAT2', 'ID_CAT3', 'ID_BRAND', 'SKU_SIZE',
            'ANIO', 'MES', 'TRIMESTRE', 'MES_PROBLEMATICO', 'CUSTOMER_RANK_BIN',
            'PRODUCT_RANK_BIN']

# The frames were already label-encoded earlier in the notebook. Fitting new
# encoders here would run on the integer codes: the string-sorted refit
# permutes the training codes, and every df_val / df_pred value (now an integer,
# never equal to a string class) would fall back to 0. So the fitted encoders
# are reused and only the column lists the MLP needs are rebuilt, because the
# LightGBM cells overwrote `feature_cols`.
missing_encoders = [col for col in cat_cols if col not in label_encoders]
if missing_encoders:
    raise RuntimeError(
        "No fitted LabelEncoder for: " + ", ".join(missing_encoders)
        + ". Run the original encoding cell first."
    )

# Same sizes as at training time: one row per fitted class plus one spare row.
embedding_sizes = [
    (
        len(label_encoders[col].classes_) + 1,
        min(50, (len(label_encoders[col].classes_) + 1) // 2),
    )
    for col in cat_cols
]

# Columns that must never be used as model inputs.
excluir = ['PERIODO', 'CUSTOMER_ID', 'PRODUCT_ID', 'CLASE_LOG1P_Z', 'ORDINAL']

# Numerical features: everything in df_train that is neither excluded nor categorical.
feature_cols = [col for col in df_train.columns if col not in excluir and col not in cat_cols]


In [None]:

# --- MLP (PyTorch) prediction inputs (10 models) ---
# Cast categorical codes to int64 and numerical features to float32.
df_pred = df_pred.astype({
    **{col: np.int64 for col in cat_cols},
    **{col: np.float32 for col in feature_cols},
})

# Wrap both matrices in a fixed-order loader.
X_cats = torch.tensor(df_pred[cat_cols].values, dtype=torch.long)
X_conts = torch.tensor(df_pred[feature_cols].values, dtype=torch.float32)
ds_pred = TensorDataset(X_cats, X_conts)
pred_loader = DataLoader(dataset=ds_pred, batch_size=8192, shuffle=False)

def predict_model(model, loader, device):
    """Run `model` over `loader` in eval mode and return all outputs as one 1-D array.

    Args:
        model: module called as ``model(X_cats, X_conts)``.
        loader: iterable yielding ``(X_cats, X_conts)`` batches.
        device: device the batches are moved to before the forward pass.

    Returns:
        1-D numpy array with one value per output element, in loader order.
        An empty float32 array when the loader yields no batches.
    """
    model.eval()
    preds = []
    with torch.no_grad():
        for X_cats, X_conts in loader:
            output = model(X_cats.to(device), X_conts.to(device))
            # Flatten explicitly: a bare squeeze() turns a single-row batch into a
            # 0-d array, which np.concatenate refuses.
            preds.append(output.cpu().numpy().reshape(-1))
    if not preds:
        return np.empty(0, dtype=np.float32)
    return np.concatenate(preds)

# Rebuild each architecture, load its saved weights and predict.
mlp_preds = []

for i, cfg in enumerate(configs):
    print(f"Prediciendo MLP {i+1}/10...")
    model = TabularNNImproved(
        embedding_sizes,
        len(feature_cols),
        hidden_sizes=cfg['hidden_sizes'],
        dropout=cfg['dropout'],
    ).to(device)
    model.load_state_dict(torch.load(f"{cfg['name']}_final.pt", map_location=device))
    preds = predict_model(model, pred_loader, device)
    mlp_preds.append(preds)

# Store every individual prediction as its own column (1-based suffix).
for i, column_values in enumerate(mlp_preds):
    df_pred[f'mlp_pred_LOG1P_Z_{i+1}'] = column_values

In [None]:
# Keep the identifiers plus every individual MLP / LightGBM prediction column.
cols = ['CUSTOMER_ID', 'PRODUCT_ID']
cols += [
    col for col in df_pred.columns
    if any(tag in col for tag in ('mlp_pred_LOG1P_Z_', 'lgbm_pred_LOG1P_Z_'))
]
df_pred_final = df_pred.loc[:, cols].copy()


In [None]:
# Mean and standard deviation used by the original z-score transformation.
mean_train = 0.4756487705286022
std_train = 0.6596333033693771

def inverse_log_transform_signed(x):
    """Invert a signed log1p: sign(x) * (exp(|x|) - 1)."""
    magnitude = np.expm1(np.abs(x))
    return np.sign(x) * magnitude

def inv_transform(arr):
    """Undo the z-score scaling, then the signed log1p."""
    # Step 1: undo the z-score. Step 2: invert the signed log1p.
    return inverse_log_transform_signed(arr * std_train + mean_train)

# Add a back-transformed "_ORIG_" column for every "_LOG1P_Z_" prediction column.
# Only columns that carry the "_LOG1P_Z_" tag are processed: if the cell is run
# again, the "_ORIG_" columns it created earlier are skipped instead of being
# transformed a second time in place.
for col in [name for name in df_pred_final.columns if '_LOG1P_Z_' in name]:
    df_pred_final[col.replace('_LOG1P_Z_', '_ORIG_')] = inv_transform(df_pred_final[col].values)




In [None]:
# Wider console output for the frames printed below.
pd.options.display.max_columns = 40
pd.options.display.width = 400

# Keep only the columns whose name does not contain 'LOG1P_Z'.
df_pred_final = df_pred_final.loc[
    :, ['LOG1P_Z' not in name for name in df_pred_final.columns]
]

In [None]:
# Individual model predictions on the original scale (LightGBM and MLP).
pred_cols = [
    col for col in df_pred_final.columns
    if col.startswith(('lgbm_pred_ORIG_', 'mlp_pred_ORIG_'))
]
# Floor every prediction at zero.
for col in pred_cols:
    df_pred_final[col] = df_pred_final[col].clip(lower=0)

In [None]:


# Row-wise mean and median across all individual model predictions.
df_pred_final['df_pred_mean'] = df_pred_final.loc[:, pred_cols].mean(axis='columns')
df_pred_final['df_pred_median'] = df_pred_final.loc[:, pred_cols].median(axis='columns')


In [None]:
# Step 1: pick the LightGBM prediction columns only.
lgbm_cols = [col for col in df_pred_final.columns if col.startswith('lgbm_pred_ORIG_')]

# Step 2: row-wise mean and median (one row per CUSTOMER_ID + PRODUCT_ID pair).
df_pred_final['lgbm_pred_mean'] = df_pred_final.loc[:, lgbm_cols].mean(axis='columns')
df_pred_final['lgbm_pred_median'] = df_pred_final.loc[:, lgbm_cols].median(axis='columns')


In [None]:
# Identifiers plus the median over all models.
df_pred_final_mediana = df_pred_final.loc[
    :, ['CUSTOMER_ID', 'PRODUCT_ID', 'df_pred_median']
].copy()

In [None]:
# Identifiers plus the median over the LightGBM models only.
df_pred_final_lgbm_mediana = df_pred_final.loc[
    :, ['CUSTOMER_ID', 'PRODUCT_ID', 'lgbm_pred_median']
].copy()

In [None]:
# Persist the LightGBM median predictions without the index column.
df_pred_final_lgbm_mediana.to_csv(
    path_or_buf='df_pred_final_lgbm_mediana.csv',
    index=False,
)

In [None]:
# Show the LightGBM median predictions.
print(f"{df_pred_final_lgbm_mediana}")

In [None]:
# Load the historical averages from promedios_tn_no_mueven_aguja.csv.
df_promedios = pd.read_csv(filepath_or_buffer='promedios_tn_no_mueven_aguja.csv')

In [None]:
# Model predictions (median over all models), tagged with their origin.
df1 = (
    df_pred_final_mediana
    .rename(columns={'df_pred_median': 'TN'})
    .assign(tipo='mediana_pred')
)

# Historical averages, tagged with their origin.
df2 = (
    df_promedios
    .rename(columns={'TN_MEAN': 'TN'})
    .assign(tipo='media_train')
)

# Align both frames on the same columns and stack them.
cols = ['CUSTOMER_ID', 'PRODUCT_ID', 'TN', 'tipo']
df_union = pd.concat([df1.loc[:, cols], df2.loc[:, cols]], axis=0, ignore_index=True)

print(df_union.head(10))


In [None]:
# Total TN per product (swap 'sum' for 'mean' to average instead).
df_out = df_union.groupby('PRODUCT_ID', as_index=False)['TN'].agg('sum')
df_out = df_out.rename(columns={'PRODUCT_ID': 'product_id', 'TN': 'tn'})

# Write the result with five decimals.
df_out.to_csv('tn_por_producto.csv', index=False, float_format='%.5f')

print(df_out.head())


In [None]:
# LightGBM-only predictions (median over the LightGBM models), tagged with their origin.
df4 = df_pred_final_lgbm_mediana.copy()
df4 = df4.rename(columns={'lgbm_pred_median': 'TN'})
df4['tipo'] = 'mediana_lightGBMpred'
print(df4.head(10))

# `df_completa` is used here and by the merge in the next cell, but the cell
# that builds it sits further down the notebook. Build it here so that a
# top-to-bottom run does not stop with a NameError.
df_completa = (
    pd.read_parquet('./data/l_vm_completa_train.parquet', engine='fastparquet')
    .groupby(['CUSTOMER_ID', 'PRODUCT_ID'], as_index=False)['TN']
    .median()
    .rename(columns={'TN': 'TN_median'})
    .sort_values('TN_median', ascending=True)
    .reset_index(drop=True)
)
print(df_completa.head(10))

In [None]:
# Pair each LightGBM prediction with the historical median of the same
# (customer, product); the key column names must match exactly on both sides.
df_merge = pd.merge(
    df4.loc[:, ['CUSTOMER_ID', 'PRODUCT_ID', 'TN']],
    df_completa.loc[:, ['CUSTOMER_ID', 'PRODUCT_ID', 'TN_median']],
    on=['CUSTOMER_ID', 'PRODUCT_ID'],
    how='inner',
)

print(df_merge.head())


In [None]:

# Historical averages, tagged with their origin.
df2 = (
    df_promedios
    .rename(columns={'TN_MEAN': 'TN'})
    .assign(tipo='media_train')
)

# Align with the LightGBM predictions on the same columns and stack them.
cols = ['CUSTOMER_ID', 'PRODUCT_ID', 'TN', 'tipo']
df_union = pd.concat([df4.loc[:, cols], df2.loc[:, cols]], axis=0, ignore_index=True)

print(df_union.head(10))

In [None]:
# Total TN per product (swap 'sum' for 'mean' to average instead).
df_out = df_union.groupby('PRODUCT_ID', as_index=False)['TN'].agg('sum')
df_out = df_out.rename(columns={'PRODUCT_ID': 'product_id', 'TN': 'tn'})

# Write the result with five decimals.
df_out.to_csv('tn_por_producto_LightGBM.csv', index=False, float_format='%.5f')

print(df_out.head())

In [None]:
# Historical median TN per (customer, product) pair, sorted ascending.
df_completa = pd.read_parquet('./data/l_vm_completa_train.parquet', engine='fastparquet')
df_completa = df_completa.groupby(['CUSTOMER_ID', 'PRODUCT_ID'], as_index=False)['TN'].median()
df_completa = df_completa.rename(columns={'TN': 'TN_median'})
df_completa = df_completa.sort_values('TN_median', ascending=True).reset_index(drop=True)




In [None]:
# First rows of the historical medians.
print(df_completa.iloc[:5])


In [None]:
# First rows of the LightGBM median predictions.
print(df_pred_final_lgbm_mediana.iloc[:5])

In [None]:
# Historical medians next to the LightGBM median prediction of the same pair.
df_medianas_lightgbm_e_historicas = df_completa.merge(
    df_pred_final_lgbm_mediana.loc[:, ['CUSTOMER_ID', 'PRODUCT_ID', 'lgbm_pred_median']],
    on=['CUSTOMER_ID', 'PRODUCT_ID'],
    how='inner',
)

print(df_medianas_lightgbm_e_historicas.iloc[:5])


In [None]:
# Persist the comparison table with five decimals and no index column.
df_medianas_lightgbm_e_historicas.to_csv(
    path_or_buf='df_medianas_lightgbm_e_historicas.csv',
    index=False,
    float_format='%.5f',
)

In [None]:
# Join the historical medians with the stacked frame `df_union`
# (columns CUSTOMER_ID, PRODUCT_ID, TN, tipo) on the (customer, product) key.
df_merge = df_completa.merge(
    df_union,
    on=['CUSTOMER_ID', 'PRODUCT_ID'],
    how='inner',
)

In [None]:
# Signed gap between TN and the historical median ...
df_merge['diff_TN_vs_median'] = df_merge['TN'].sub(df_merge['TN_median'])

# ... and its absolute value.
df_merge['abs_diff_TN_vs_median'] = df_merge['diff_TN_vs_median'].abs()


In [None]:
# The 100 rows with the largest absolute gap to the historical median.
top_diff = df_merge.sort_values(by='abs_diff_TN_vs_median', ascending=False).iloc[:100]
print(top_diff.loc[:, ['CUSTOMER_ID', 'PRODUCT_ID', 'TN', 'TN_median', 'abs_diff_TN_vs_median']])
