In [None]:
# --- Standard Library ---
import os
import json
import gc
from functools import reduce
from typing import List

# --- Data Handling ---
import numpy as np
import pandas as pd
import polars as pl
import joblib

# --- Visualization ---
import matplotlib.pyplot as plt

# --- Machine Learning ---
import optuna
from lightgbm import LGBMRegressor, early_stopping
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler

# --- Parallel Processing ---
from joblib import Parallel, delayed
from more_itertools import chunked

# --- PyTorch (if needed) ---
#import torch.nn as nn


In [None]:
# Load the intermediate product-level dataset and split it by PERIODO
# (values look like YYYYMM integers).
df_full = pd.read_parquet('./data/product_interm_LGBM.parquet', engine='fastparquet')

# Train: PERIODO <= 201812 | validation: 201901..201909 | test: 201910 | target month: 201912
df_train = df_full.loc[df_full['PERIODO'].le(201812)].copy()
df_val = df_full.loc[df_full['PERIODO'].ge(201901) & df_full['PERIODO'].le(201909)].copy()
df_test = df_full.loc[df_full['PERIODO'].eq(201910)].copy()
df_target_201912 = df_full.loc[df_full['PERIODO'].eq(201912)].copy()
gc.collect()

In [None]:
def total_forecast_error(y_true, y_pred):
    """Total Forecast Error: sum(|y_true - y_pred|) / sum(y_true).

    Returns NaN when sum(y_true) equals zero, so the caller never divides by zero.
    """
    abs_error_sum = np.sum(np.abs(y_true - y_pred))
    actual_sum = np.sum(y_true)
    return np.nan if actual_sum == 0 else abs_error_sum / actual_sum


In [None]:
import gc
import optuna
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor, early_stopping
from sklearn.metrics import mean_absolute_error, mean_squared_error

# --- SETUP ---
# The target is the CLASE_LOG1P column; every other column of df_train is used as a feature.
target_col = 'CLASE_LOG1P'
feature_cols = [name for name in df_train.columns if name != target_col]

# Training and validation design matrices / targets
X_tr, y_tr = df_train[feature_cols], df_train[target_col]
X_val, y_val = df_val[feature_cols], df_val[target_col]

In [None]:

# --- 1. Objective function ---
def objective_lgbm(trial):
    """Optuna objective: fit one LightGBM regressor and return its validation TFE.

    Reads X_tr, y_tr, X_val and y_val from the notebook's global scope. The model is
    trained on the log1p-scale target with the MAE objective and early stopping on the
    validation set; the score is computed after mapping targets and predictions back
    with expm1.

    Args:
        trial: the optuna trial used to sample hyper-parameters.

    Returns:
        The total forecast error on the validation set. The validation MAE is stored
        on the trial as user attribute "mae".
    """
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.0005, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 256),
        "max_depth": trial.suggest_int("max_depth", 4, 32),
        "n_estimators": trial.suggest_int("n_estimators", 800, 3000),
        "min_child_samples": trial.suggest_int("min_child_samples", 20, 120),
        "subsample": trial.suggest_float("subsample", 0.7, 0.95),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 0.95),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 100.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 100.0, log=True),
        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 1.0),
        "n_jobs": -1,
        "verbosity": -1,
        "objective": "mae",
        # The dict used to list "random_state" twice (trial.number, then 42). The later
        # entry silently won, so 42 is kept as the single explicit value.
        "random_state": 42,
    }
    model = LGBMRegressor(**params)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[early_stopping(stopping_rounds=100, verbose=False)],
    )
    preds = model.predict(X_val)

    # Undo the log1p transform. np.expm1 replaces the helper `inv_transform_log1p`,
    # which is not defined anywhere in this notebook (NameError on a fresh kernel).
    y_val_orig = np.expm1(y_val.values)
    preds_orig = np.expm1(preds)

    tfe = total_forecast_error(y_val_orig, preds_orig)
    trial.set_user_attr("mae", mean_absolute_error(y_val_orig, preds_orig))
    gc.collect()
    return tfe

# --- 2. Create the study, persisted in SQLite ---
optuna_db_path = "sqlite:///optuna_lgbm_study.db"
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=101),
    storage=optuna_db_path,
    study_name="lgbm_tfe_optim",
    load_if_exists=True,  # resume the study if it already exists
)

# --- 3. Run the optimization ---
N_MODELS = 50
N_TRIALS = 500

# NOTE(review): 28 concurrent trials, each fitting a model with n_jobs=-1, may
# oversubscribe the CPU — confirm this matches the target machine.
study.optimize(objective_lgbm, n_trials=N_TRIALS, n_jobs=28, show_progress_bar=True)

# --- 4. Extract the best parameters ---
trials_df = study.trials_dataframe()
trials_df["mae"] = [t.user_attrs.get("mae", np.nan) for t in study.trials]

# Trials without an objective value (failed / still running in a resumed study) carry
# NaN parameters and would break the int() casts below, so drop them before ranking.
top_lgbm_trials = (
    trials_df.dropna(subset=["value"])
    .sort_values("value")
    .head(N_MODELS)
)

INT_PARAMS = ("num_leaves", "max_depth", "n_estimators", "min_child_samples")
PARAM_PREFIX = "params_"

final_configs = []
for _, row in top_lgbm_trials.iterrows():
    params = {}
    for col_name, value in row.items():
        if not col_name.startswith(PARAM_PREFIX):
            continue
        name = col_name[len(PARAM_PREFIX):]
        # Cast to plain Python numbers so the config is JSON-serializable.
        params[name] = int(value) if name in INT_PARAMS else float(value)
    # Each ensemble member is seeded with its trial number
    # (the tuning run itself used random_state=42).
    params["random_state"] = int(row["number"])
    params["n_jobs"] = -1
    params["verbosity"] = -1
    # Persist the objective the trials were tuned with, so any consumer of the JSON
    # trains with MAE instead of LightGBM's default objective.
    params["objective"] = "mae"
    final_configs.append(params)

# --- 5. Save to external files ---
top_lgbm_trials.to_csv("optuna_lgbm_trials.csv", index=False)

with open("lgbm_ensemble_configs.json", "w") as f:
    json.dump(final_configs, f, indent=2)

# --- 6. Console summary ---
print(f"Top {N_MODELS} TFE: {top_lgbm_trials['value'].values}")
print(f"Top {N_MODELS} MAE: {top_lgbm_trials['mae'].values}")
print(final_configs)


In [None]:
# Stack train + validation rows into one training set (features and target separately).
train_val_frames = [df_train, df_val]
X_full = pd.concat(train_val_frames, ignore_index=True)[feature_cols]
y_full = pd.concat([frame[target_col] for frame in train_val_frames], ignore_index=True)

In [None]:
import pandas as pd

# Reload the ranked trials written by the tuning cell, so the notebook can resume from
# here without re-running the study.
top_lgbm_trials = pd.read_csv("optuna_lgbm_trials.csv")


In [None]:
import json

# Reload the ensemble configurations (a JSON list of LightGBM parameter dicts)
# written by the tuning cell.
with open("lgbm_ensemble_configs.json", "r") as f:
    final_configs = json.load(f)


In [None]:
import joblib
from lightgbm import LGBMRegressor

def train_and_save_model(i, params, X_full, y_full):
    """Fit one LightGBM regressor with the MAE objective and pickle it to disk.

    Args:
        i: zero-based index of the model; the file is named with i + 1, zero-padded.
        params: LightGBM parameter dict; it is not modified.
        X_full: training features.
        y_full: training target.

    Returns:
        A status string for the finished model.
    """
    model_number = i + 1
    print(f"Entrenando modelo {model_number}/50...")
    # Build a new dict so the caller's config stays untouched; "objective" is always
    # forced to "mae", whether or not the config already carries one.
    model_params = {**params, "objective": "mae"}
    model = LGBMRegressor(**model_params)
    model.fit(X_full, y_full)
    joblib.dump(model, f'lgbm_model_{model_number:02d}.pkl')
    return f"Modelo {model_number} terminado"


# Train the first 50 configurations in parallel (20 workers); each job saves its own model.
training_jobs = (
    joblib.delayed(train_and_save_model)(i, params, X_full, y_full)
    for i, params in enumerate(final_configs[:50])
)
results = joblib.Parallel(n_jobs=20)(training_jobs)

print(results)
print("¡Entrenamiento y guardado de los 50 modelos finalizado!")


In [None]:
import joblib

# Load the 50 trained LightGBM models (lgbm_model_01.pkl ... lgbm_model_50.pkl)
lgbm_models = [joblib.load(f'lgbm_model_{i:02d}.pkl') for i in range(1, 51)]


In [None]:
import numpy as np

# --- LightGBM prediction (one column per ensemble member) ---
# Reuse the `feature_cols` list built from df_train in the setup cell. Re-deriving it
# from df_test is not idempotent: this cell adds prediction columns to df_test, so a
# second run would pass those columns to the models as features.
X_pred_lgbm = df_test[feature_cols]

lgbm_preds = []
for i, model in enumerate(lgbm_models):
    print(f"Prediciendo LightGBM {i+1}/{len(lgbm_models)}...")
    lgbm_preds.append(model.predict(X_pred_lgbm))

lgbm_preds = np.stack(lgbm_preds).T  # shape (n_rows, n_models)

# --- Store each model's log1p-scale prediction as its own df_test column ---
for i in range(lgbm_preds.shape[1]):
    df_test[f'lgbm_pred_LOG1P_{i+1}'] = lgbm_preds[:, i]


## 📊 Cálculo de métricas estadísticas sobre el ensemble de modelos LightGBM

Una vez agregadas las predicciones individuales de los 50 modelos LightGBM al DataFrame `df_test`, se calculan métricas agregadas por fila para capturar información estadística sobre la dispersión y tendencia central del ensemble.

Estas métricas permiten construir predicciones más robustas, como la media o mediana del conjunto de modelos, y analizar la incertidumbre asociada a cada predicción.

In [None]:
import numpy as np

# Select the per-model prediction columns (log1p scale)
lgbm_pred_cols = [col for col in df_test.columns if col.startswith('lgbm_pred_LOG1P_')]

# Row-wise summary statistics across the ensemble members
preds_log = df_test[lgbm_pred_cols]
row_stats = {
    'lgbm_pred_mean': preds_log.mean(axis=1),
    'lgbm_pred_median': preds_log.median(axis=1),
    'lgbm_pred_std': preds_log.std(axis=1),
    'lgbm_pred_q25': preds_log.quantile(0.25, axis=1),
    'lgbm_pred_q75': preds_log.quantile(0.75, axis=1),
    'lgbm_pred_min': preds_log.min(axis=1),
    'lgbm_pred_max': preds_log.max(axis=1),
}
for stat_name, stat_values in row_stats.items():
    df_test[stat_name] = stat_values

# Interquartile range derived from the two quantile columns
df_test['lgbm_pred_iqr'] = df_test['lgbm_pred_q75'] - df_test['lgbm_pred_q25']


In [None]:
import matplotlib.pyplot as plt

# Range of products to inspect
productos = range(20001, 20051)

plt.figure(figsize=(18, 7))

box_data, obs_pos, obs_vals = [], [], []
for i, pid in enumerate(productos, 1):
    filas_pid = df_test[df_test['PRODUCT_ID'] == pid]
    # Every prediction (all rows x all models) for this product, flattened into one sample
    box_data.append(filas_pid[lgbm_pred_cols].values.flatten())
    # Every observed value for this product, plotted at the same x position
    observados = filas_pid['CLASE_LOG1P'].values
    obs_pos.extend([i] * len(observados))
    obs_vals.extend(observados)

posiciones = range(1, len(productos) + 1)

# One box per product
plt.boxplot(box_data, positions=posiciones, widths=0.6, patch_artist=True, showmeans=True)

# Observed values drawn as red circles
plt.plot(obs_pos, obs_vals, 'ro', markersize=7, label='Valor observado')

plt.title('Dispersión de predicciones y observado para PRODUCT_ID 20001 a 20050')
plt.xlabel('PRODUCT_ID')
plt.ylabel('Predicción (LOG1P)')
plt.xticks(posiciones, list(map(str, productos)), rotation=90)
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.legend()

plt.tight_layout()
plt.show()




In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Ensemble dispersion (std across models) for each row
pred_cols = [c for c in df_test.columns if c.startswith('lgbm_pred_LOG1P_')]
df_test['pred_std'] = df_test[pred_cols].std(axis=1)
top_n = 20
productos_disp = df_test.sort_values('pred_std', ascending=False)['PRODUCT_ID'].unique()[:top_n]

# Rows belonging to the selected products
df_plot = df_test[df_test['PRODUCT_ID'].isin(productos_disp)].copy()
df_plot = df_plot[['PRODUCT_ID', 'CLASE_LOG1P'] + pred_cols]

# One box per row of df_plot (boxplot draws one box per column, hence the transpose)
plt.figure(figsize=(15, 6))
data = df_plot[pred_cols].values
plt.boxplot(data.T, labels=df_plot['PRODUCT_ID'], showfliers=True)

# Observed value of each row at its box position. A single scatter call replaces the
# per-row loop, and the unused `positions` array was removed.
plt.scatter(
    np.arange(1, len(df_plot) + 1),
    df_plot['CLASE_LOG1P'],
    color='crimson', marker='o', s=100, zorder=5, label='Valor observado',
)

plt.xticks(rotation=90)
plt.xlabel('PRODUCT_ID')
plt.ylabel('Predicción (LOG1P)')
plt.title(f'Dispersión de predicciones para los {top_n} productos más dispersos')
plt.legend(loc='upper right')
plt.tight_layout()
plt.show()



In [None]:
# Central prediction and dispersion across the ensemble (log1p scale)
preds_log = df_test[pred_cols]
df_test['pred_median'] = preds_log.median(axis=1)
df_test['pred_mean'] = preds_log.mean(axis=1)
df_test['pred_std'] = preds_log.std(axis=1)

# Absolute error of each central prediction against the observed target
observado_log = df_test['CLASE_LOG1P']
df_test['abs_err_median'] = df_test['pred_median'].sub(observado_log).abs()
df_test['abs_err_mean'] = df_test['pred_mean'].sub(observado_log).abs()

plt.figure(figsize=(8,5))
plt.scatter(df_test['pred_std'], df_test['abs_err_median'], alpha=0.6)
plt.title('Dispersión vs Error absoluto por producto')
plt.xlabel('Dispersión de predicción (std de 50 modelos)')
plt.ylabel('Error absoluto (mediana)')
plt.grid(True)
plt.show()


In [None]:
# Prediction columns on the log1p scale
pred_cols = [c for c in df_test.columns if c.startswith('lgbm_pred_LOG1P_')]

# Undo log1p on the observed target
df_test['CLASE_TN'] = np.expm1(df_test['CLASE_LOG1P'])

# Undo log1p on the whole prediction matrix at once
preds_orig = np.expm1(df_test[pred_cols].values)

# One original-scale column per model: lgbm_pred_LOG1P_k -> lgbm_pred_TN_k
for col, valores in zip(pred_cols, preds_orig.T):
    df_test[col.replace('LOG1P', 'TN')] = valores

In [None]:
# Original-scale prediction columns, their row-wise spread/median and the median's abs error
pred_cols_tn = [c for c in df_test.columns if c.startswith('lgbm_pred_TN_')]
preds_tn = df_test[pred_cols_tn]
df_test['pred_std_tn'] = preds_tn.std(axis=1)
df_test['pred_median_tn'] = preds_tn.median(axis=1)
df_test['abs_error_median_tn'] = np.abs(df_test['pred_median_tn'] - df_test['CLASE_TN'])


In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.scatter(df_test['pred_std_tn'], df_test['abs_error_median_tn'], alpha=0.7)

# Label only the points whose dispersion exceeds the threshold
umbral_disp = 5  # raise or lower to get fewer or more labels
df_label = df_test[df_test['pred_std_tn'] > umbral_disp]

estilo_etiqueta = dict(fontsize=6, color='crimson', alpha=0.8, ha='left', va='bottom')
for disp, err, pid in zip(df_label['pred_std_tn'], df_label['abs_error_median_tn'], df_label['PRODUCT_ID']):
    # PRODUCT_ID is rendered as an integer, without decimals
    plt.text(disp, err, str(int(pid)), **estilo_etiqueta)

plt.title('Dispersión vs Error absoluto por producto (escala original)')
plt.xlabel('Dispersión de predicción (std de 50 modelos) [TN]')
plt.ylabel('Error absoluto (mediana) [TN]')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()




In [None]:
import matplotlib.pyplot as plt

# Top-N products by ensemble dispersion (original scale)
top_n = 20
productos_disp = df_test.sort_values('pred_std_tn', ascending=False)['PRODUCT_ID'].unique()[:top_n]
df_plot = df_test[df_test['PRODUCT_ID'].isin(productos_disp)].copy()

plt.figure(figsize=(15, 6))

# One sample per row of df_plot: the row's model predictions (PRODUCT_ID column sliced off)
matriz = df_plot[['PRODUCT_ID'] + pred_cols_tn].values
data = list(matriz[:, 1:])
labels = df_plot['PRODUCT_ID'].astype(str).values

plt.boxplot(data, labels=labels, showfliers=True)

# Overlay the observed values; a product may have several rows in df_test
for posicion, prod_id in enumerate(df_plot['PRODUCT_ID'], start=1):
    etiqueta = 'Valor observado' if posicion == 1 else ""
    for vreal in df_test.loc[df_test['PRODUCT_ID'] == prod_id, 'CLASE_TN'].values:
        plt.plot(posicion, vreal, 'ro', markersize=7, label=etiqueta)

plt.title(f'Dispersión de predicciones y observados para los {top_n} productos más dispersos (escala original)')
plt.xlabel('PRODUCT_ID')
plt.ylabel('Predicción (TN)')
plt.xticks(rotation=90)

# Keep a single legend entry for the observed values
handles, labels = plt.gca().get_legend_handles_labels()
if handles:
    plt.legend([handles[0]], ['Valor observado'])

plt.tight_layout()
plt.show()



In [None]:
top_n = 30
columnas_resumen = ['PRODUCT_ID', 'pred_std_tn', 'abs_error_median_tn', 'CLASE_TN']

# Rows with the largest ensemble dispersion
top_disp = df_test.sort_values('pred_std_tn', ascending=False).head(top_n)
print("Top 30 productos por dispersión:")
print(top_disp[columnas_resumen])

# Rows with the largest absolute error of the ensemble median
top_error = df_test.sort_values('abs_error_median_tn', ascending=False).head(top_n)
print("\nTop 30 productos por error absoluto:")
print(top_error[columnas_resumen])



In [None]:
import numpy as np
import pandas as pd

# 1. Original-scale prediction columns
pred_cols_tn = [c for c in df_test.columns if c.startswith('lgbm_pred_TN_')]

# 2. Ensemble aggregates: row-wise median and mean
preds_tn = df_test[pred_cols_tn]
df_test['ensemble_median'] = preds_tn.median(axis=1)
df_test['ensemble_mean'] = preds_tn.mean(axis=1)

# 3. Row-level absolute error of each aggregate
df_test['error_median'] = np.abs(df_test['ensemble_median'] - df_test['CLASE_TN'])
df_test['error_mean'] = np.abs(df_test['ensemble_mean'] - df_test['CLASE_TN'])

# 4. MAE of each strategy (mean of the row-level absolute errors)
mae_median = np.mean(df_test['error_median'])
mae_mean = np.mean(df_test['error_mean'])

print(f"MAE usando la **mediana** de 50 modelos: {mae_median:.3f} TN")
print(f"MAE usando la **media** de 50 modelos:   {mae_mean:.3f} TN")

# 5. Rows where the mean beats the median, largest median error first
gana_media = df_test.loc[df_test['error_mean'] < df_test['error_median'],
                         ['PRODUCT_ID', 'error_median', 'error_mean']]
print("\nTop 10 productos donde la media le gana a la mediana:")
print(gana_media.sort_values('error_median', ascending=False).head(10))


In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 8))
plt.scatter(df_test['error_median'], df_test['error_mean'], alpha=0.7)

# y = x reference line spanning the range of the median error
err_min, err_max = df_test['error_median'].min(), df_test['error_median'].max()
plt.plot([err_min, err_max], [err_min, err_max], 'r--', label='y=x')

# Label PRODUCT_ID where either error exceeds 75
supera_umbral = (df_test['error_median'] > 75) | (df_test['error_mean'] > 75)
a_etiquetar = df_test[supera_umbral]
for err_med, err_avg, pid in zip(a_etiquetar['error_median'], a_etiquetar['error_mean'], a_etiquetar['PRODUCT_ID']):
    plt.text(
        err_med, err_avg, str(int(pid)),
        fontsize=5, color='crimson', alpha=0.85, ha='left', va='bottom',
    )

plt.title('Comparación error absoluto por producto: Media vs Mediana')
plt.xlabel('Error absoluto usando MEDIANA')
plt.ylabel('Error absoluto usando MEDIA')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()



In [None]:
import numpy as np

# Sweep the blend weight: lam on the ensemble median, (1 - lam) on the ensemble mean
lambdas = np.linspace(0, 1, 21)
maes = [
    np.mean(np.abs(
        (lam * df_test['ensemble_median'] + (1 - lam) * df_test['ensemble_mean'])
        - df_test['CLASE_TN']
    ))
    for lam in lambdas
]

plt.figure(figsize=(7, 4))
plt.plot(lambdas, maes, marker='o')
plt.title('Búsqueda de mejor combinación media/mediana')
plt.xlabel('Lambda (ponderación de la MEDIANA)')
plt.ylabel('MAE (ensemble)')
plt.grid(True)
plt.tight_layout()
plt.show()

# Weight with the lowest MAE
best_lambda = lambdas[np.argmin(maes)]
print(f"Mejor lambda = {best_lambda:.2f} (MAE={min(maes):.3f} TN)")


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Requires df_test to already hold 'ensemble_median', 'ensemble_mean' and the observed 'CLASE_TN'.
# Same blend sweep as the MAE search, scored with the total forecast error.
lambdas = np.linspace(0, 1, 21)
observado_tn = df_test['CLASE_TN'].values
tfe_scores = [
    total_forecast_error(
        observado_tn,
        (lam * df_test['ensemble_median'] + (1 - lam) * df_test['ensemble_mean']).values,
    )
    for lam in lambdas
]

plt.figure(figsize=(8, 4))
plt.plot(lambdas, tfe_scores, marker='o')
plt.title('Búsqueda de mejor combinación media/mediana (TFE)')
plt.xlabel('Lambda (ponderación de la MEDIANA)')
plt.ylabel('Total Forecast Error (TFE)')
plt.grid(True)
plt.show()

# Best weight found and its TFE
best_idx = np.argmin(tfe_scores)
print(f"Mejor lambda: {lambdas[best_idx]:.2f}  ->  TFE = {tfe_scores[best_idx]:.6f}")


In [None]:
# List every column currently present in df_test (features, target and derived columns).
print(df_test.columns.tolist())

In [None]:
import pandas as pd
import numpy as np

# Assumes one row per PRODUCT_ID in df_test, so the row-level error is also the
# product-level error — TODO confirm.
df_test['abs_error_median'] = (df_test['ensemble_median'] - df_test['CLASE_TN']).abs()

# Report columns, sorted from largest to smallest absolute error of the median
columnas_reporte = ['PRODUCT_ID', 'ensemble_median', 'CLASE_TN', 'abs_error_median']
df_ordenado = df_test[columnas_reporte].sort_values('abs_error_median', ascending=False)

# Attach the ensemble standard deviation (aligned on the row index)
df_ordenado['pred_std'] = df_test[pred_cols_tn].std(axis=1)

print(df_ordenado.head(50))
df_ordenado.to_csv('validacion_lgbm_ordenadas.csv', index=False)

In [None]:
# TFE per product for both aggregates (median and mean)
registros_tfe = [
    {
        'PRODUCT_ID': pid,
        'TFE_median': total_forecast_error(subdf['CLASE_TN'].values, subdf['ensemble_median'].values),
        'TFE_mean': total_forecast_error(subdf['CLASE_TN'].values, subdf['ensemble_mean'].values),
    }
    for pid, subdf in df_test.groupby('PRODUCT_ID')
]
df_tfe = pd.DataFrame(registros_tfe, columns=['PRODUCT_ID', 'TFE_median', 'TFE_mean'])

# Positive difference: the mean has the lower TFE for that product
df_tfe['TFE_diff'] = df_tfe['TFE_median'] - df_tfe['TFE_mean']

# Top 10 products where the mean improves most over the median
print(df_tfe.sort_values('TFE_diff', ascending=False).head(10))


In [None]:
# Overall TFE on the test set for each aggregate
observado_tn = df_test['CLASE_TN'].values
tfe_total_median = total_forecast_error(observado_tn, df_test['ensemble_median'].values)
tfe_total_mean = total_forecast_error(observado_tn, df_test['ensemble_mean'].values)
print(f"TFE total usando mediana: {tfe_total_median:.6f}")
print(f"TFE total usando media: {tfe_total_mean:.6f}")
print(np.sum(observado_tn))  # sanity check of the TFE denominator

In [None]:
import numpy as np
import pandas as pd

# --- Load the per-product validation errors ---
# Expected columns: PRODUCT_ID, CLASE_TN (observed), ensemble_median (LGBM prediction).
# A dedicated name is used so the full df_test frame is not overwritten by this
# reduced CSV, and the total_forecast_error defined earlier in the notebook is reused
# instead of being re-defined here.
df_aporte = pd.read_csv("validacion_lgbm_ordenadas.csv")

# --- 1. Total TFE ---
tfe_total = total_forecast_error(df_aporte["CLASE_TN"], df_aporte["ensemble_median"])
print(f"TFE total (mediana LGBM): {tfe_total:.4f}")

# --- 2. Absolute error and each row's contribution to the TFE ---
denominador = df_aporte["CLASE_TN"].sum()
df_aporte["abs_error_median"] = np.abs(df_aporte["CLASE_TN"] - df_aporte["ensemble_median"])
df_aporte["tfe_aporte"] = df_aporte["abs_error_median"] / denominador   # TFE points (sums to tfe_total)
df_aporte["tfe_pct"] = df_aporte["tfe_aporte"] * 100                    # same, times 100

# --- 3. Sort by contribution and accumulate ---
df_aporte = df_aporte.sort_values("tfe_aporte", ascending=False).reset_index(drop=True)
df_aporte["aporte_acumulado"] = df_aporte["tfe_aporte"].cumsum()
df_aporte["aporte_acum_pct"] = df_aporte["aporte_acumulado"] * 100

# --- 4. Products that generate 80 % of the TFE ---
# The cumulative contribution tops out at tfe_total (not at 1), so the cut-off must be
# 80 % of tfe_total. Comparing against the absolute value 0.80 flags every product
# whenever the total TFE is below 0.80.
df_aporte["top_80pct_error"] = df_aporte["aporte_acumulado"] <= 0.80 * tfe_total
criticos = df_aporte.loc[df_aporte["top_80pct_error"], ["PRODUCT_ID", "tfe_pct"]]
productos_criticos = criticos["PRODUCT_ID"].tolist()

# --- 5. Print as "PID:XX %" ---
# Built with zip rather than DataFrame.apply, so an empty selection prints an empty
# line instead of the frame's column names.
print("\nPRODUCT_ID y % del TFE (hasta cubrir el 80 % acumulado):")
formatted = [f"{int(pid)}:{pct:.1f}%" for pid, pct in zip(criticos["PRODUCT_ID"], criticos["tfe_pct"])]
print(", ".join(formatted))

# --- 6. Save the full detail ---
df_aporte.to_csv("aporte_TFE_por_producto.csv",
                 columns=["PRODUCT_ID", "CLASE_TN", "ensemble_median",
                          "abs_error_median", "tfe_pct", "aporte_acum_pct",
                          "top_80pct_error"],
                 index=False)

print("\nArchivo 'aporte_TFE_por_producto.csv' generado.")


In [None]:
# Products where the median beats the mean (negative TFE_diff), most negative first
median_mejor = df_tfe.loc[df_tfe['TFE_diff'].lt(0)]
print(median_mejor.sort_values('TFE_diff').head(10))


In [None]:
# Products where the mean beats the median (TFE_diff = TFE_median - TFE_mean > 0).
# Sorted descending so the head holds the largest improvements; ascending order listed
# the smallest positive differences instead.
print(df_tfe[df_tfe['TFE_diff'] > 0].sort_values('TFE_diff', ascending=False).head(10))


Entrenamiento final

In [None]:
# Reload the dataset from disk and rebuild the four period-based splits for final training.
df_full = pd.read_parquet('./data/product_interm_LGBM.parquet', engine='fastparquet')

periodo_es = df_full['PERIODO'].eq
df_train = df_full[df_full['PERIODO'].le(201812)].copy()                 # PERIODO <= 201812
df_val = df_full[df_full['PERIODO'].between(201901, 201909)].copy()      # 201901..201909, inclusive
df_test = df_full[periodo_es(201910)].copy()                             # PERIODO == 201910
df_target_201912 = df_full[periodo_es(201912)].copy()                    # PERIODO == 201912
del periodo_es
gc.collect()

In [None]:
# Final training set: train + validation + test rows stacked together
frames_finales = [df_train, df_val, df_test]
X_full = pd.concat(frames_finales, ignore_index=True)[feature_cols]
y_full = pd.concat([frame[target_col] for frame in frames_finales], ignore_index=True)

# Rows to predict: PERIODO == 201912
df_pred = df_full[df_full['PERIODO'] == 201912].copy()


In [None]:
import joblib
from lightgbm import LGBMRegressor

def train_and_save_model(i, params, X_full, y_full):
    """Fit one LightGBM regressor with the MAE objective and pickle it to disk.

    The saved configs hold only the tuned hyper-parameters plus random_state, n_jobs
    and verbosity, so the objective has to be set here. Without it the final models
    would be trained with LightGBM's default objective rather than the MAE objective
    used during tuning and in the earlier definition of this function.

    Args:
        i: zero-based index of the model; the file is named with i + 1, zero-padded.
        params: LightGBM parameter dict; it is not modified.
        X_full: training features.
        y_full: training target.

    Returns:
        A status string for the finished model.
    """
    print(f"Entrenando modelo {i+1}/50...")
    # New dict: leaves the caller's config untouched and forces the MAE objective.
    params = {**params, "objective": "mae"}
    model = LGBMRegressor(**params)
    model.fit(X_full, y_full)
    joblib.dump(model, f'lgbm_model_{i+1:02d}.pkl')
    return f"Modelo {i+1} terminado"

# Retrain the first 50 configurations on the final training set, 20 workers in parallel.
entrenador = joblib.delayed(train_and_save_model)
results = joblib.Parallel(n_jobs=20)(
    entrenador(i, params, X_full, y_full)
    for i, params in enumerate(final_configs[:50])
)

print(results)
print("¡Entrenamiento y guardado de los 50 modelos finalizado!")


In [None]:
import joblib

# Load the 50 trained LightGBM models back from their pickle files, in file order
rutas_modelos = [f'lgbm_model_{i:02d}.pkl' for i in range(1, 51)]
lgbm_models = [joblib.load(ruta) for ruta in rutas_modelos]

In [None]:
# List the columns of df_pred before the ensemble predictions are added.
print(df_pred.columns.tolist())

In [None]:
import numpy as np

# --- LightGBM prediction (one column per ensemble member) ---
# Reuse the `feature_cols` list the models were trained with. Re-deriving it from
# df_pred is not idempotent: this cell adds prediction columns to df_pred, so a second
# run would pass those columns to the models as features.
X_pred_lgbm = df_pred[feature_cols]

lgbm_preds = []
for i, model in enumerate(lgbm_models):
    print(f"Prediciendo LightGBM {i+1}/{len(lgbm_models)}...")
    lgbm_preds.append(model.predict(X_pred_lgbm))

lgbm_preds = np.stack(lgbm_preds).T  # shape (n_rows, n_models)

# --- Store each model's log1p-scale prediction as its own df_pred column ---
for i in range(lgbm_preds.shape[1]):
    df_pred[f'lgbm_pred_LOG1P_{i+1}'] = lgbm_preds[:, i]


In [None]:
# List the columns of df_pred again, now including the per-model prediction columns.
print(df_pred.columns.tolist())

In [None]:
# Prediction columns on the log1p scale
pred_cols = [c for c in df_pred.columns if c.startswith('lgbm_pred_LOG1P_')]

# Undo log1p on the target column
df_pred['CLASE_TN'] = np.expm1(df_pred['CLASE_LOG1P'].values)

# Undo log1p on the prediction matrix (n_rows x n_models)
preds_inv = df_pred[pred_cols].values
preds_orig = np.expm1(preds_inv)

# One original-scale column per model: lgbm_pred_LOG1P_k -> lgbm_pred_TN_k
for col, valores in zip(pred_cols, preds_orig.T):
    df_pred[col.replace('LOG1P', 'TN')] = valores



In [None]:
# Drop, in place, every df_pred column whose name contains "_LOG1P_"
# (this covers the per-model lgbm_pred_LOG1P_k columns).
cols_to_remove = list(filter(lambda col: '_LOG1P_' in col, df_pred.columns))
df_pred.drop(columns=cols_to_remove, inplace=True)

In [None]:
# Original-scale prediction columns that feed the final aggregates
lgbm_cols = list(filter(lambda c: c.startswith('lgbm_pred_TN'), df_pred.columns))
print(lgbm_cols)

In [None]:
# Row-wise mean and median across the ensemble's original-scale predictions
preds_finales = df_pred[lgbm_cols]
df_pred['lgbm_mean'] = preds_finales.mean(axis=1)
df_pred['lgbm_median'] = preds_finales.median(axis=1)

In [None]:
# Clip both aggregates at zero before exporting. The clipped median used to be written
# to a misspelled column ('lgbm_media'), so the median CSV exported unclipped values.
df_pred['lgbm_mean'] = df_pred['lgbm_mean'].clip(lower=0)
df_pred['lgbm_median'] = df_pred['lgbm_median'].clip(lower=0)

# One submission file per aggregate, with columns product_id and tn
exports = (
    ('lgbm_mean', 'lgbm_predictions_mean.csv'),
    ('lgbm_median', 'lgbm_predictions_median.csv'),
)
for pred_col, out_path in exports:
    (
        df_pred[['PRODUCT_ID', pred_col]]
        .rename(columns={'PRODUCT_ID': 'product_id', pred_col: 'tn'})
        .to_csv(out_path, index=False)
    )

