# Modelado tabular con AutoGluon

In [18]:
!sudo apt-get update
!sudo apt-get install gcsfuse

Hit:1 https://nvidia.github.io/libnvidia-container/stable/deb/amd64  InRelease
Hit:2 https://deb.debian.org/debian bullseye InRelease              
Hit:3 https://download.docker.com/linux/debian bullseye InRelease   
Hit:4 https://deb.debian.org/debian-security bullseye-security InRelease
Hit:5 https://deb.debian.org/debian bullseye-updates InRelease
Hit:6 https://deb.debian.org/debian bullseye-backports InRelease
Hit:7 https://packages.cloud.google.com/apt gcsfuse-bullseye InRelease
Hit:8 https://packages.cloud.google.com/apt google-compute-engine-bullseye-stable InRelease
Hit:9 https://packages.cloud.google.com/apt cloud-sdk-bullseye InRelease
Hit:10 https://packages.cloud.google.com/apt google-fast-socket InRelease
Reading package lists... Done
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
gcsfuse is already the newest version (3.1.0).
0 upgraded, 0 newly installed, 0 to remove and 4 not upgraded.


In [2]:
# Disabled (commented out): installs the autogluon.tabular package with pip.
#!pip install autogluon.tabular

# Carga de librerías

In [19]:
from autogluon.tabular import TabularPredictor
import pandas as pd
import numpy as np
import pickle

In [3]:
# Create the local directory that gcsfuse mounts the bucket on
# ("-p" makes this a no-op when the directory already exists).
!mkdir -p /home/jupyter/franco_maestria/gcs_model_dir_fullpower_escalando

In [22]:
# Disabled (commented out): unmounts the FUSE mount at the model directory.
# Presumably kept for manually releasing the mount before mounting again.
#!fusermount -u /home/jupyter/franco_maestria/gcs_model_dir_fullpower_escalando

In [23]:
!gcsfuse forecasting_customer_product /home/jupyter/franco_maestria/gcs_model_dir_fullpower_escalando

{"timestamp":{"seconds":1752536718,"nanos":520581833},"severity":"INFO","message":"Start gcsfuse/3.1.0 (Go version go1.24.0) for app \"\" using mount point: /home/jupyter/franco_maestria/gcs_model_dir_fullpower_escalando\n"}
{"timestamp":{"seconds":1752536718,"nanos":520621282},"severity":"INFO","message":"GCSFuse config","config":{"AppName":"","CacheDir":"","Debug":{"ExitOnInvariantViolation":false,"Fuse":false,"Gcs":false,"LogMutex":false},"DisableAutoconfig":false,"EnableAtomicRenameObject":true,"EnableGoogleLibAuth":false,"EnableHns":true,"EnableNewReader":false,"FileCache":{"CacheFileForRangeRead":false,"DownloadChunkSizeMb":200,"EnableCrc":false,"EnableODirect":false,"EnableParallelDownloads":false,"ExperimentalExcludeRegex":"","ExperimentalParallelDownloadsDefaultOn":true,"MaxParallelDownloads":96,"MaxSizeMb":-1,"ParallelDownloadsPerFile":16,"WriteBufferSize":4194304},"FileSystem":{"DirMode":"755","DisableParallelDirops":false,"ExperimentalEnableDentryCache":false,"ExperimentalE

# ✅ 1) Cálculo y aplicación de la estandarización

### 👉 Idea clave:

- Calculas media y desvío por product_id usando solo los registros de entrenamiento (train_set).
- Creas un scaler_dict para mapear cada product_id a su (mean, std).
- Normalizas tn y clase solo en training.
- En test, aplicas el mismo scaler_dict para transformar los features antes de predecir y reviertes la predicción después.

In [26]:
# -------------------------------
# 1) Load the scaled parquet + scaler_dict
# -------------------------------
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# a scaler_dict.pkl that was produced by a trusted step of this project.
with open("scaler_dict.pkl", 'rb') as scaler_file:
    scaler_dict = pickle.load(scaler_file)

# Feature panel read from the parquet file in the working directory.
df_scaled = pd.read_parquet("panel_cliente_producto_fe_scaled.parquet")

# Quick sanity report: frame dimensions and number of entries in the scaler.
print(f"✅ Datos cargados: {df_scaled.shape}")
print(f"✅ Diccionario de escalado cargado: {len(scaler_dict)} productos")

✅ Datos cargados: (12138186, 194)
✅ Diccionario de escalado cargado: 780 productos


In [27]:
# -------------------------------
# 2) Split train/test
# -------------------------------
# Train: every period up to 2019-10-01 whose target ('clase') is known.
# Test: the 2019-12-01 period.
train_set = df_scaled[(df_scaled['fecha'] <= '2019-10-01') & df_scaled['clase'].notnull()].copy()
test_set = df_scaled[df_scaled['fecha'] == '2019-12-01'].copy()

# Build sample_weight from tn. Sample weights must be non-negative and not
# NaN, so a standardized (z-scored) tn cannot be used directly.
tn_weight = train_set['tn']
if (tn_weight < 0).any():
    # NOTE(review): negative tn is taken to mean the column is standardized
    # per product with the statistics in scaler_dict (as the notebook's
    # markdown describes) — confirm. Map it back to original units first.
    train_product_ids = train_set['product_id']
    tn_mean = train_product_ids.map(
        {pid: stats['mean'] for pid, stats in scaler_dict.items()}
    ).astype('float64')
    tn_std = train_product_ids.map(
        {pid: stats['std'] for pid, stats in scaler_dict.items()}
    ).astype('float64')
    # Products missing from scaler_dict keep their value (mean 0, std 1),
    # mirroring the fallback used when predictions are de-scaled.
    tn_weight = tn_weight * tn_std.fillna(1.0) + tn_mean.fillna(0.0)

# Rows without a usable weight contribute nothing; rounding noise below
# zero is clipped away.
train_set['sample_weight'] = tn_weight.fillna(0).clip(lower=0)

print(f"Train shape: {train_set.shape} | Test shape: {test_set.shape}")

Train shape: (10297990, 195) | Test shape: (333840, 194)


In [28]:
# -------------------------------
# 3) Final features
# -------------------------------
# Every column of the scaled panel is a feature except the period/date
# identifiers and the target itself.
excluded_columns = ('periodo', 'fecha', 'clase')
features = [name for name in df_scaled.columns if name not in excluded_columns]

print(f"Total features: {len(features)}")

Total features: 191


In [None]:
# -------------------------------
# 4) Train AutoGluon Tabular
# -------------------------------
# Training frame: model features, the target and the per-row weight column.
# The weight column has to travel with the training data, and AutoGluon
# expects weights that are non-negative and not NaN, so sanitise them here.
train_data = train_set[features + ['clase']].assign(
    sample_weight=train_set['sample_weight'].fillna(0).clip(lower=0)
)

predictor = TabularPredictor(
    label='clase',
    problem_type='regression',
    eval_metric='mae',
    path='gcs_model_dir_fullpower_escalando',
    # Sample weights are declared on the predictor by column name; the column
    # is used for weighting only and is not treated as a predictive feature.
    # (Passing it through ag_args_fit does not enable weighting.)
    sample_weight='sample_weight'
)

predictor.fit(
    train_data=train_data,
    time_limit=14400,  # seconds (4 hours)
    presets='best_quality',
    num_bag_folds=5,
    num_stack_levels=4
)

print("✅ Entrenamiento finalizado.")

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.10.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Debian 5.10.237-1 (2025-05-19)
CPU Count:          48
Memory Avail:       259.39 GB / 377.89 GB (68.6%)
Disk Space Avail:   1048576.00 GB / 1048576.00 GB (100.0%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=4, num_bag_folds=5, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 3600s of t

In [None]:
# -------------------------------
# 5) Predictions
# -------------------------------
# Score the test period; the result is stored as 'clase_pred_scaled' and is
# converted back to original units by the inverse-scaling step.
scaled_predictions = predictor.predict(test_set[features])
test_set['clase_pred_scaled'] = scaled_predictions

# -------------------------------
# 6) Inverse scaling
# -------------------------------
def inverse_scale(row):
    """Undo the per-product standardization of one scaled prediction.

    Args:
        row: mapping-like record exposing 'product_id' and
            'clase_pred_scaled' (e.g. a DataFrame row).

    Returns:
        ``clase_pred_scaled * std + mean`` using the entry of the global
        ``scaler_dict`` for the row's product, or the scaled value unchanged
        when the product has no entry.
    """
    pid = row['product_id']
    # Guard clause: unknown products cannot be de-scaled, pass them through.
    if pid not in scaler_dict:
        return row['clase_pred_scaled']

    product_stats = scaler_dict[pid]
    center, spread = product_stats['mean'], product_stats['std']
    return (row['clase_pred_scaled'] * spread) + center

# Vectorized inverse scaling: look up each row's per-product mean/std once
# instead of running a Python function per row with DataFrame.apply(axis=1).
pred_product_ids = test_set['product_id']
pred_mean = pred_product_ids.map(
    {pid: stats['mean'] for pid, stats in scaler_dict.items()}
).astype('float64')
pred_std = pred_product_ids.map(
    {pid: stats['std'] for pid, stats in scaler_dict.items()}
).astype('float64')
has_scaler = pred_product_ids.isin(list(scaler_dict))

descaled_pred = test_set['clase_pred_scaled'] * pred_std + pred_mean
# Products without scaler statistics keep the scaled prediction unchanged,
# the same fallback the row-wise inverse_scale function applies.
test_set['tn_pred'] = descaled_pred.where(has_scaler, test_set['clase_pred_scaled'])
# Negative predictions are floored at zero.
test_set['tn_pred'] = test_set['tn_pred'].clip(lower=0)

# -------------------------------
# 7) Aggregate by product
# -------------------------------
# Sum the row-level predictions per product and expose the total as 'tn'.
df_final = (
    test_set
    .groupby('product_id')['tn_pred']
    .sum()
    .rename('tn')
    .reset_index()
)

print(df_final.head())

# -------------------------------
# 8) Export the final CSV
# -------------------------------
# Write the per-product forecast without the index column.
df_final.to_csv('forecast_final_desescalado.csv', index=False)
print("✅ Forecast final generado: forecast_final_desescalado.csv")

# Summary figures for a quick visual check of the exported file.
unique_products = df_final['product_id'].nunique()
total_tn = df_final['tn'].sum()
print(f"Productos únicos: {unique_products}")
print(f"Total TN predichas: {total_tn:,.2f}")


In [None]:
# -------------------------------
# 9) Leaderboard (internal performance)
# -------------------------------
print("\n🔍 Leaderboard:")
lb = predictor.leaderboard(silent=True)
print(lb)

# -------------------------------
# 10) Feature importance
# -------------------------------
# NOTE(review): importance is computed on the training rows; AutoGluon's
# documentation warns that scores computed on training data are biased by
# overfitting. Prefer a labelled hold-out set if one becomes available.
print("\n🔍 Importancia de Features:")
fi = (
    predictor
    .feature_importance(train_set[features + ['clase']])
    .reset_index()
    .rename(columns={'index': 'feature'})
)
# Show the first 50 rows of the importance table.
print(fi.head(50))