In [None]:
# Validation window extended to September, October and November 2019. Scored 0.260 on the public set (presumably the public leaderboard).
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from statsmodels.tsa.arima.model import ARIMA
import lightgbm as lgb
import xgboost as xgb
from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os

# Silence every library warning for the rest of the run.
warnings.filterwarnings("ignore")

# 1. Load the sales file and aggregate tons ('tn') per product and month.
#    'periodo' arrives as YYYYMM and is parsed to a timestamp before grouping.
df = (
    pd.read_csv("sell-in.txt", sep="\t")
    .assign(periodo=lambda tabla: pd.to_datetime(tabla['periodo'], format='%Y%m'))
    .groupby(['product_id', 'periodo'])['tn']
    .sum()
    .reset_index()
)

# 2. Load the fixed list of product ids to predict.
#    Lines that are not purely digits (after stripping whitespace) are skipped.
productos = []
with open("product_id_apredecir201912.TXT", "r") as f:
    for linea_cruda in f:
        texto = linea_cruda.strip()
        if texto.isdigit():
            productos.append(int(texto))

# 3. Output accumulators: predictions, human-readable log lines and
#    per-model validation MAEs.
resultados, log, maes_resumen = [], [], []

# Products that received a model-based prediction in the main loop.
productos_predichos = set()

# 4. Working directory for the AutoGluon predictors.
os.makedirs("autogluon_temp_ts", exist_ok=True)

# 5. Per-product loop.
#    For every product: fit each candidate model on the months before
#    September 2019, score it with MAE on the available Sep-Nov 2019 months,
#    and keep the February 2020 forecast of the model with the lowest MAE.
#    Products that are skipped (too little history, no validation data, or no
#    model produced a forecast) are NOT added to `productos_predichos`, so a
#    later fallback step can still cover them.
FECHA_CORTE = pd.Timestamp('2019-09-01')      # first month excluded from training
MESES_VALIDACION = [
    pd.Timestamp('2019-09-01'),
    pd.Timestamp('2019-10-01'),
    pd.Timestamp('2019-11-01'),
]
FECHA_OBJETIVO = pd.Timestamp('2020-02-01')   # month being forecast
MODELOS = ['regresion', 'arima', 'lgbm', 'xgboost', 'autogluon']

# One-row feature frame for the target month. Using the same column name as
# the training frame avoids feature-name mismatches at predict time.
X_objetivo = pd.DataFrame({'mes': [FECHA_OBJETIVO.month]})


def _meses_entre(inicio, fin):
    """Return the number of whole calendar months from `inicio` to `fin`."""
    return (fin.year - inicio.year) * 12 + (fin.month - inicio.month)


for prod in tqdm(productos, desc="Procesando productos"):
    datos = df[df['product_id'] == prod].sort_values('periodo').copy()
    datos['mes'] = datos['periodo'].dt.month

    train = datos[datos['periodo'] < FECHA_CORTE].copy()
    val = datos[datos['periodo'].isin(MESES_VALIDACION)].copy()

    # Need at least 12 training rows and at least one validation month.
    if len(train) < 12 or val.empty:
        continue

    X_train = train[['mes']]
    y_train = train['tn']
    X_val = val[['mes']]
    y_val = val['tn']

    # Time-series models forecast forward from the last training month, so the
    # horizon must reach FECHA_OBJETIVO (6 steps when training ends in 2019-08).
    ultimo_mes = train['periodo'].iloc[-1]
    horizonte = _meses_entre(ultimo_mes, FECHA_OBJETIVO)
    # 0-based forecast positions of the validation months actually present.
    # NOTE: this treats one forecast step as one calendar month, which assumes
    # the training series has no internal gaps.
    posiciones_val = [_meses_entre(ultimo_mes, mes) - 1 for mes in val['periodo']]

    maes = {}    # model name -> validation MAE (np.inf when the model failed)
    preds = {}   # model name -> forecast for FECHA_OBJETIVO (only if it succeeded)

    # 1. Linear regression on the month-of-year feature
    try:
        lr = LinearRegression()
        lr.fit(X_train, y_train)
        y_pred = lr.predict(X_val)
        maes['regresion'] = mean_absolute_error(y_val, y_pred)
        preds['regresion'] = lr.predict(X_objetivo)[0]
    except Exception as exc:
        maes['regresion'] = np.inf
        log.append(f"Producto {prod}: regresion falló ({exc!r})")

    # 2. ARIMA(1, 1, 1)
    try:
        serie = train.set_index('periodo')['tn']
        modelo_arima = ARIMA(serie, order=(1, 1, 1)).fit()
        # A single forecast covers both the validation months and the target.
        pronostico = modelo_arima.forecast(steps=horizonte)
        y_pred = pronostico.iloc[posiciones_val]
        maes['arima'] = mean_absolute_error(y_val.values, y_pred.values)
        # Positional access: the forecast index may be dates or integers.
        preds['arima'] = pronostico.iloc[-1]
    except Exception as exc:
        maes['arima'] = np.inf
        log.append(f"Producto {prod}: arima falló ({exc!r})")

    # 3. LightGBM
    try:
        lgb_model = lgb.LGBMRegressor(
            n_estimators=834,
            learning_rate=0.06449926163783713,
            max_depth=13,
            num_leaves=197,
            min_data_in_leaf=208,
            min_child_weight=3.7932779938198546,
            subsample=0.7032151245633396,
            subsample_freq=7,
            colsample_bytree=0.9893937066314805,
            colsample_bynode=0.8148358693555268,
            reg_alpha=4.962755134948597,
            reg_lambda=3.8191748367071927,
            max_bin=512,
            min_split_gain=0.006311109685921704,
            cat_smooth=49.82693114488869,
            random_state=42,
            boosting_type='dart',
            verbosity=-1,
            linear_tree=True
        )
        lgb_model.fit(X_train, y_train)
        y_pred = lgb_model.predict(X_val)
        maes['lgbm'] = mean_absolute_error(y_val, y_pred)
        preds['lgbm'] = lgb_model.predict(X_objetivo)[0]
    except Exception as exc:
        maes['lgbm'] = np.inf
        log.append(f"Producto {prod}: lgbm falló ({exc!r})")

    # 4. XGBoost with default hyperparameters
    try:
        xgb_model = xgb.XGBRegressor(verbosity=0)
        xgb_model.fit(X_train, y_train)
        y_pred = xgb_model.predict(X_val)
        maes['xgboost'] = mean_absolute_error(y_val, y_pred)
        preds['xgboost'] = xgb_model.predict(X_objetivo)[0]
    except Exception as exc:
        maes['xgboost'] = np.inf
        log.append(f"Producto {prod}: xgboost falló ({exc!r})")

    # 5. AutoGluon (ETS, AutoARIMA and Naive, no ensemble)
    try:
        df_serie = train[['periodo', 'tn']].copy()
        df_serie['item_id'] = str(prod)
        df_serie = df_serie.rename(columns={'periodo': 'timestamp'})
        df_serie = df_serie[['item_id', 'timestamp', 'tn']]

        ts_data = TimeSeriesDataFrame.from_data_frame(
            df_serie, id_column='item_id', timestamp_column='timestamp'
        ).fill_missing_values()

        predictor = TimeSeriesPredictor(
            # Must reach FECHA_OBJETIVO; a shorter horizon makes the target
            # lookup below fail for every product.
            prediction_length=horizonte,
            target='tn',
            freq='MS',
            eval_metric='MASE',
            path=f"autogluon_temp_ts/{prod}",
            verbosity=0
        )

        predictor.fit(
            ts_data,
            num_val_windows=2,
            time_limit=60,
            enable_ensemble=False,
            hyperparameters={"ETS": {}, "AutoARIMA": {}, "Naive": {}}
        )

        forecast = predictor.predict(ts_data)
        # Look up only the validation months this product actually has, so the
        # prediction list always lines up with y_val.
        val_preds = [forecast.loc[(str(prod), mes), 'mean'] for mes in val['periodo']]
        maes['autogluon'] = mean_absolute_error(y_val, val_preds)
        preds['autogluon'] = forecast.loc[(str(prod), FECHA_OBJETIVO), 'mean']
    except Exception as exc:
        maes['autogluon'] = np.inf
        log.append(f"Producto {prod}: autogluon falló ({exc!r})")

    # Record the per-model MAEs even when no model ends up being usable.
    mae_row = {'product_id': prod}
    for modelo in MODELOS:
        mae_row[f'mae_{modelo}'] = maes.get(modelo, np.nan)
    maes_resumen.append(mae_row)

    # Only models that produced a target forecast and a finite MAE compete.
    # Iterating `preds` keeps the original tie-breaking order.
    candidatos = {m: maes[m] for m in preds if np.isfinite(maes.get(m, np.inf))}
    if not candidatos:
        # Every model failed: leave the product for the fallback step instead
        # of crashing on a missing prediction.
        log.append(f"Producto {prod}: ningún modelo produjo predicción")
        continue

    mejor_modelo = min(candidatos, key=candidatos.get)
    pred_final = preds[mejor_modelo]
    resultados.append({'product_id': prod, 'tn_predicho': pred_final})
    productos_predichos.add(prod)
    log.append(f"Producto {prod}: mejor modelo = {mejor_modelo}, MAE sep-nov = {maes[mejor_modelo]:.4f}")

# 6. Fallback: every product that did not get a model-based prediction
#    receives the mean 'tn' of its last (up to) 12 rows dated before
#    January 2020, or 0 when it has no such rows.
productos_faltantes = set(productos) - productos_predichos
for prod in productos_faltantes:
    historial = df.loc[df['product_id'] == prod].sort_values('periodo')
    es_previo_a_2020 = historial['periodo'] < '2020-01-01'
    ultimos_12 = historial.loc[es_previo_a_2020].tail(12)

    if ultimos_12.empty:
        pred_fallback = 0
    else:
        pred_fallback = ultimos_12['tn'].mean()

    fila = {'product_id': prod, 'tn_predicho': pred_fallback}
    resultados.append(fila)
    log.append(f"Producto {prod}: fallback promedio últimos 12 meses = {pred_fallback:.2f}")

# 7. Save outputs: predictions CSV, per-model MAE CSV and the text log.

# Explicit columns keep the sort (and the CSV header) valid even when no
# prediction was produced at all.
predicciones_df = pd.DataFrame(resultados, columns=['product_id', 'tn_predicho'])
predicciones_df.sort_values("product_id").to_csv(
    "predicciones_febrero2020_porproducto3.csv", index=False
)

maes_df = pd.DataFrame(maes_resumen)
if maes_df.empty:
    # No product reached the model loop: keep the key column so that sorting
    # does not raise KeyError on a column-less frame.
    maes_df = pd.DataFrame(columns=['product_id'])
maes_df = maes_df.sort_values("product_id")
maes_df.to_csv("maes_por_modelo.csv", index=False)

# The log lines contain non-ASCII characters, so the encoding is fixed
# explicitly instead of depending on the platform default.
with open("log_modelos3.txt", "w", encoding="utf-8") as f:
    f.writelines(linea + "\n" for linea in log)

# 8. Local chart: average validation MAE per model.
maes_long = maes_df.melt(
    id_vars='product_id',
    value_vars=[col for col in maes_df.columns if col.startswith('mae_')],
    var_name='modelo',
    value_name='mae',
)
# Literal (non-regex) replacement so the result does not depend on the
# pandas-version default of `regex`.
maes_long['modelo'] = maes_long['modelo'].str.replace('mae_', '', regex=False)

# Failed fits are stored as +inf. Left in place, a single failure makes the
# model's average infinite and the bar chart meaningless, so they are treated
# as missing: each average covers only the products where the model ran.
maes_long['mae'] = maes_long['mae'].replace([np.inf, -np.inf], np.nan)
mae_promedios = (
    maes_long.groupby('modelo')['mae']
    .mean()
    .reset_index()
    .sort_values('mae')
)

plt.figure(figsize=(10, 5))
sns.barplot(data=mae_promedios, x='modelo', y='mae')
plt.title("MAE Promedio por Modelo (Sep-Nov 2019)")
plt.ylabel("MAE promedio")
plt.xlabel("Modelo")
plt.grid(True)
plt.tight_layout()
plt.savefig("grafico_mae_promedio.png")
plt.show()
