# 🧩 1. Setup inicial


In [20]:
# 1.1 Cargar librerías necesarias
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from autogluon.tabular import TabularPredictor
import os
import glob
from datetime import datetime

# 1.2 Paths de entrada y salida
ruta_parquet = "C:/Developer/Laboratorio_III/data/dataset_product_periodo_con_clusters.parquet"
output_dir = "output_forecasts_by_cluster_tabular_full"
os.makedirs(output_dir, exist_ok=True)

# 1.3 Archivo de log
log_file = open("log_forecast_clusters_3.txt", "w")
def log(msg):
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{timestamp}] {msg}\\n"
    log_file.write(line)
    log_file.flush()
    print(line)


# 📥 2. Cargar dataset y clusters

In [21]:
# 2.1 Cargar dataset con clusters
df = pd.read_parquet(ruta_parquet)
# Normalizar fecha
df['fecha'] = pd.to_datetime(df['fecha']).dt.normalize()

# 2.2 Crear campo clase (tn_total desplazado +2)
df['clase'] = df.groupby('product_id')['tn_total'].shift(-2)

# 2.3 Revisar clusters únicos
clusters_unicos = df['cluster_dtw'].dropna().unique()
log(f"Clusters únicos: {clusters_unicos}")

[2025-07-07 11:38:08] Clusters únicos: [ 1  0 31  4 39  7 43 48  9 41 24 45 35 10 20 18 32 12 15 21 44 19 30 47
 46 37 17 25  3 26 38 22  5  2 42 14 34  8 27 13 23 36 49 29 33 28 16 40
 11  6]\n


# ✅ 2.1 Bloque hyperparameters sugerido (árboles)

In [22]:
hyperparameters = {
    'GBM': {
        'extra_trees': True,     # Usar LightGBM con Extra Trees (más robustez)
        'num_boost_round': 300,  # N° de iteraciones boosting
        'early_stopping_rounds': 20,
    },
    'CAT': {
        'iterations': 300,
        'learning_rate': 0.05,
        'depth': 16,
    },
    'XGB': {
        'num_boost_round': 600,
        'learning_rate': 0.05,
        'max_depth': 16,
    },
    'RF': {
        'n_estimators': 300,
    },
    'XT': {
        'n_estimators': 300,
    }
}


# 🔄 3. Loop por cluster

In [23]:
# ====================================
#  Normalizar fecha
# ====================================
df['fecha'] = pd.to_datetime(df['fecha']).dt.normalize()

# ====================================
#  Crear clase = tn_total + 2 meses
# ====================================
df = df.sort_values(['product_id', 'fecha'])
df['clase'] = df.groupby('product_id')['tn_total'].shift(-2)

# ====================================
#  Ingeniería de features
# ====================================
for lag in range(1, 37):
    df[f'tn_{lag}'] = df.groupby('product_id')['tn_total'].shift(lag)
    df[f'diff_tn_{lag}'] = df['clase'] - df[f'tn_{lag}']

df['rollmean_3'] = df.groupby('product_id')['tn_total'].transform(lambda x: x.shift(1).rolling(3).mean())
df['rollmean_6'] = df.groupby('product_id')['tn_total'].transform(lambda x: x.shift(1).rolling(6).mean())
df['rollmean_9'] = df.groupby('product_id')['tn_total'].transform(lambda x: x.shift(1).rolling(9).mean())
df['rollmean_12'] = df.groupby('product_id')['tn_total'].transform(lambda x: x.shift(1).rolling(12).mean())
df['diff_rollmean_12'] = df['tn_total'] - df['rollmean_12']
df['diff_rollmean_9'] = df['tn_total'] - df['rollmean_9']
df['diff_rollmean_6'] = df['tn_total'] - df['rollmean_6']
df['diff_rollmean_3'] = df['tn_total'] - df['rollmean_3']

# ====================================
#  Factorizar cluster_dtw y otras categóricas
# ====================================
df['cluster_dtw_factorized'], _ = pd.factorize(df['cluster_dtw'].fillna(-1))

for col in df.select_dtypes(include='object').columns:
    if col not in ['cluster_dtw']:
        df[col + '_factorized'], _ = pd.factorize(df[col])

log("✅ Ingeniería de features completada para todo el dataset")

# ====================================
#  Definir lista de features
# ====================================
numeric_cols = ['tn_total'] + [f'tn_{i}' for i in range(1, 37)] + \
               [f'diff_tn_{i}' for i in range(1, 37)] + \
               ['rollmean_3', 'rollmean_6', 'rollmean_9', 'rollmean_12', 'diff_rollmean_3', 'diff_rollmean_6', 'diff_rollmean_9', 'diff_rollmean_12']

#factor_cols = ['cluster_dtw_factorized'] + [c for c in df.columns if c.endswith('_factorized') and c != 'cluster_dtw_factorized']

features = numeric_cols + ['product_id']

# ====================================
#  Split train / test
# ====================================
train_set = df[(df['fecha'] <= '2019-10-01') & df['clase'].notnull()].copy()
test_set = df[df['fecha'] == '2019-12-01'].copy()

# ====================================
#  Validación de shape y pesos
# ====================================
if train_set.empty or test_set.empty:
    log(f"Train o Test vacío. Verifica tus datos.")
else:
    train_mean_tn = train_set['tn_total'].mean()
    train_set['weight'] = train_set['tn_total']
    train_set['weight'] = train_set['weight'].fillna(1).clip(lower=0.1, upper=10)

    log(f"✅ Shapes => Train: {train_set.shape} | Test: {test_set.shape}")
    log(f"✅ Peso promedio: {train_set['weight'].mean():.4f}")

    # ====================================
    #  Entrenar modelo global
    # ====================================
    predictor = TabularPredictor(label='clase', problem_type='regression', eval_metric='mae')
    predictor.fit(
        train_data=train_set[features + ['clase']],
        ag_args_fit={'sample_weight': 'weight'},
        hyperparameters=hyperparameters,
        presets='best_quality',
        time_limit=7200,
        num_bag_folds=5,
        num_stack_levels=1
    )

    # ====================================
    #  Predicción final
    # ====================================
    test_set['tn'] = predictor.predict(test_set[features])
    test_set['tn'] = test_set['tn'].clip(lower=0)

    # ====================================
    #  Guardar salida final
    # ====================================
    output_file = f"{output_dir}/forecast_modelo_global_v4.csv"
    test_set[['product_id', 'tn']].to_csv(output_file, index=False)
    log(f"✅ Predicciones guardadas: {output_file} | Productos: {test_set['product_id'].nunique()}")



No path specified. Models will be saved in: "AutogluonModels\ag-20250707_143810"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.9.21
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
CPU Count:          14
Memory Avail:       1.69 GB / 15.31 GB (11.0%)
Disk Space Avail:   199.09 GB / 475.95 GB (41.8%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=5, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitt

[2025-07-07 11:38:10] ✅ Ingeniería de features completada para todo el dataset\n
[2025-07-07 11:38:10] ✅ Shapes => Train: (20815, 105) | Test: (780, 104)\n
[2025-07-07 11:38:10] ✅ Peso promedio: 7.0110\n


Leaderboard on holdout data (DyStack):
                 model  score_holdout  score_val          eval_metric  pred_time_test  pred_time_val     fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  WeightedEnsemble_L3      -2.086603  -2.556011  mean_absolute_error        9.755493       9.303828  1545.832778                 0.026768                0.000997           0.136171            3       True         10
1    ExtraTrees_BAG_L2      -2.093862  -2.615540  mean_absolute_error        7.560785       6.789911  1222.671886                 0.817483                2.116324          22.205643            2       True          8
2  RandomForest_BAG_L2      -2.172816  -2.710361  mean_absolute_error        7.586477       6.141037  1284.137637                 0.843175                1.467450          83.671394            2       True          7
3       XGBoost_BAG_L2      -2.245620  -3.041034  mean_absolute_error        7.733441       5

[2025-07-07 12:23:47] ✅ Predicciones guardadas: output_forecasts_by_cluster_tabular_full/forecast_modelo_global_v4.csv | Productos: 780\n


# 🗃️ 4. Combinar todos los resultados

In [24]:
# 4.1 Combinar todos los CSV
csv_files = glob.glob(f"{output_dir}/forecast_modelo_global_v4.csv")
dfs = [pd.read_csv(f) for f in csv_files]
df_final = pd.concat(dfs, axis=0)

df_final.to_csv("forecast_tabular_clusters_202002_full.csv", index=False)
log(f"Archivo final forecast_tabular_clusters_202002.csv generado.")

log_file.close()


[2025-07-07 12:23:47] Archivo final forecast_tabular_clusters_202002.csv generado.\n
