AutoGluon - Predicción de ventas (tn) por producto para febrero 2020

In [45]:
%pip install autogluon.timeseries




In [46]:
# 📦 1. Importar librerías
import pandas as pd

In [47]:
# 💬 Instalar AutoGluon si es necesario
#%pip install autogluon.timeseries

from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame

In [48]:
import os
import urllib.request
from pathlib import Path
from typing import Dict, List, Tuple
from datetime import datetime

import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna

# Create the working directory layout (a no-op for folders that already exist)
for folder in ("data/raw", "data/processed", "models"):
    os.makedirs(folder, exist_ok=True)

In [49]:
# Common URL prefix shared by every raw input file.
_BASE_URL = "https://storage.googleapis.com/open-courses/austral2025-af91/labo3v/"


def _raw_file(filename, str_columns, **reader_options):
    """Describe one raw file: local name, download URL and pd.read_csv kwargs.

    Every file is tab-separated; ``str_columns`` are read as strings and
    ``reader_options`` are forwarded as additional read arguments.
    """
    read_args = {"sep": "\t", **reader_options}
    read_args["dtype"] = {column: str for column in str_columns}
    return {"local": filename, "url": _BASE_URL + filename, "read_args": read_args}


# Notebook-wide settings: where each raw file lives and the date window of interest.
CONFIG = {
    "files": {
        "sellin": _raw_file(
            "sell-in.txt.gz",
            ("periodo", "customer_id", "product_id"),
            compression="gzip",
        ),
        "productos": _raw_file("tb_productos.txt", ("product_id",)),
        "stocks": _raw_file("tb_stocks.txt", ("periodo", "product_id")),
        "productos_a_predecir": _raw_file(
            "product_id_apredecir201912.txt", ("product_id",)
        ),
    },
    "dates": {"start": "2017-01-01", "end": "2019-12-31"},
}

In [50]:
def download_if_needed(local: str, url: str) -> str:
    """Download ``url`` to ``local`` unless that file already exists.

    The payload is first written to a sibling ``<local>.part`` file and only
    renamed to ``local`` once the transfer has finished, so an interrupted
    download is never mistaken for a complete file on the next run. Missing
    parent directories of ``local`` are created.

    Args:
        local: Destination path of the file.
        url: Location to fetch the file from when it is not present locally.

    Returns:
        ``local``, unchanged.

    Raises:
        Any exception raised by ``urllib.request.urlretrieve`` or by the
        final rename; the partial file is removed before re-raising.
    """
    if os.path.exists(local):
        return local

    parent = os.path.dirname(local)
    if parent:
        os.makedirs(parent, exist_ok=True)

    print(f"⬇️  Descargando {os.path.basename(local)} ...")
    partial = f"{local}.part"
    try:
        urllib.request.urlretrieve(url, partial)
        # Atomic rename: `local` only ever appears fully written.
        os.replace(partial, local)
    except BaseException:
        # Also covers KeyboardInterrupt, a common way downloads get cut short.
        if os.path.exists(partial):
            os.remove(partial)
        raise
    return local

def load_raw_files(file_cfg: dict) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Fetch every configured raw file into ``data/raw`` and parse four of them.

    Args:
        file_cfg: Mapping of dataset name to a dict with the keys ``"local"``
            (file name), ``"url"`` (download location) and ``"read_args"``
            (keyword arguments for ``pd.read_csv``). The entries ``"sellin"``,
            ``"productos"``, ``"stocks"`` and ``"productos_a_predecir"`` must
            be present.

    Returns:
        The sell-in, products, stocks and ids-to-predict frames, in that order.
    """
    # Make sure each configured file is available locally and remember its path.
    local_files = {
        key: download_if_needed(os.path.join("data/raw", spec["local"]), spec["url"])
        for key, spec in file_cfg.items()
    }

    def read_table(key: str) -> pd.DataFrame:
        # Each dataset carries its own pd.read_csv options in the config.
        return pd.read_csv(local_files[key], **file_cfg[key]["read_args"])

    return (
        read_table("sellin"),
        read_table("productos"),
        read_table("stocks"),
        read_table("productos_a_predecir"),
    )

# Download (when missing) and read the four raw tables in one call
raw_tables = load_raw_files(CONFIG["files"])
df_sellin, df_productos, stocks, product_ids = raw_tables
print(f"✅ Datos descargados: {len(df_sellin):,} registros de sell-in")

✅ Datos descargados: 2,945,818 registros de sell-in


In [51]:
# 📄 2. Load datasets
# Re-read sell-in and the product catalogue WITHOUT the dtype overrides used by
# load_raw_files, so pandas infers the column types (the product filter further
# down compares product_id against integer ids).
# Paths are relative to the working directory and reuse the file names declared
# in CONFIG instead of a hard-coded, Colab-only "/content/..." location.
raw_dir = Path("data/raw")
df_sellin = pd.read_csv(raw_dir / CONFIG["files"]["sellin"]["local"], sep="\t")
df_productos = pd.read_csv(raw_dir / CONFIG["files"]["productos"]["local"], sep="\t")

In [52]:
# 📄 Read the list of products to predict as integers
# The path is relative to the working directory (same folder load_raw_files
# downloads into) rather than a hard-coded, Colab-only "/content/..." location.
# Lines that are not purely numeric, such as the header, are skipped.
ids_path = Path("data/raw") / CONFIG["files"]["productos_a_predecir"]["local"]
with open(ids_path, "r") as f:
    product_ids = [int(line.strip()) for line in f if line.strip().isdigit()]

In [53]:
# 🧹 3. Preprocessing
# Parse the YYYYMM period into a datetime and store it as 'timestamp'
periodo_as_datetime = pd.to_datetime(df_sellin['periodo'], format='%Y%m')
df_sellin['timestamp'] = periodo_as_datetime

In [54]:
# Keep history up to December 2019, restricted to the products to predict
up_to_dec_2019 = df_sellin['timestamp'] <= '2019-12-01'
is_requested_product = df_sellin['product_id'].isin(product_ids)
df_filtered = df_sellin[up_to_dec_2019 & is_requested_product]

# An empty result means the filter criteria and the input data do not line up.
if df_filtered.empty:
    print("⚠️ Warning: The filtered DataFrame is empty. Please check the filtering criteria and the input data.")

In [55]:
# Sum tn per period, customer and product
customer_keys = ['timestamp', 'customer_id', 'product_id']
tn_per_customer = df_filtered.groupby(customer_keys, as_index=False)['tn']
df_grouped = tn_per_customer.sum()

In [56]:
# Sum tn per period and product (collapsing the customer dimension)
product_keys = ['timestamp', 'product_id']
tn_per_product = df_grouped.groupby(product_keys, as_index=False)['tn']
df_monthly_product = tn_per_product.sum()

In [57]:
# Duplicate product_id as 'item_id', the series identifier column handed to AutoGluon
df_monthly_product = df_monthly_product.assign(
    item_id=df_monthly_product['product_id']
)

In [58]:
# ⏰ 4. Build the TimeSeriesDataFrame
# Pass only the id, timestamp and target columns: any additional column
# (here the duplicate 'product_id') would be carried into the
# TimeSeriesDataFrame and interpreted by AutoGluon as a past covariate.
ts_data = TimeSeriesDataFrame.from_data_frame(
    df_monthly_product[['item_id', 'timestamp', 'tn']],
    id_column='item_id',
    timestamp_column='timestamp'
)

In [59]:
# Fill missing values in the time series (library default settings)
# NOTE(review): the training log below reports this data as frequency 'IRREG',
# resampled to 'MS' with a NaN fraction of 0.1%, so gaps created by that
# resampling are not covered by this call — confirm whether the frequency
# should be converted before filling.
ts_data = ts_data.fill_missing_values()

In [60]:
# ⚙️ 5. Define and train the predictor
predictor_settings = dict(
    prediction_length=2,  # forecast two steps ahead
    target='tn',
    freq='MS',  # monthly frequency (Month Start)
)
predictor = TimeSeriesPredictor(**predictor_settings)

# Training budget of one hour, expressed in seconds
one_hour_in_seconds = 60 * 60
predictor.fit(ts_data, num_val_windows=2, time_limit=one_hour_in_seconds)

Beginning AutoGluon training... Time limit = 3600s
AutoGluon will save models to '/content/AutogluonModels/ag-20250702_112222'
AutoGluon Version:  1.3.1
Python Version:     3.11.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sun Mar 30 16:01:29 UTC 2025
CPU Count:          2
GPU Count:          0
Memory Avail:       10.54 GB / 12.67 GB (83.2%)
Disk Space Avail:   64.99 GB / 107.72 GB (60.3%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': 'MS',
 'hyperparameters': 'default',
 'known_covariates_names': [],
 'num_val_windows': 2,
 'prediction_length': 2,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': False,
 'target': 'tn',
 'time_limit': 3600,
 'verbosity': 2}

train_data with frequency 'IRREG' has been resampled to frequency 'MS'.
Provided train_data has 22375 rows (NaN fraction=0.1%), 7

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/821M [00:00<?, ?B/s]

	-0.1905       = Validation score (-WQL)
	42.32   s     = Training runtime
	12.75   s     = Validation (prediction) runtime
Training timeseries model ChronosFineTuned[bolt_small]. Training for up to 577.5s of the 3464.7s of remaining time.
	Skipping covariate_regressor since the dataset contains no covariates or static features.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/191M [00:00<?, ?B/s]

	Fine-tuning on the CPU detected. We recommend using a GPU for faster fine-tuning of Chronos.
	Saving fine-tuned model to /content/AutogluonModels/ag-20250702_112222/models/ChronosFineTuned[bolt_small]/W0/fine-tuned-ckpt
	Skipping covariate_regressor since the dataset contains no covariates or static features.
	Fine-tuning on the CPU detected. We recommend using a GPU for faster fine-tuning of Chronos.
	Saving fine-tuned model to /content/AutogluonModels/ag-20250702_112222/models/ChronosFineTuned[bolt_small]/W1/fine-tuned-ckpt
	-0.1800       = Validation score (-WQL)
	512.47  s     = Training runtime
	2.21    s     = Validation (prediction) runtime
Training timeseries model TemporalFusionTransformer. Training for up to 590.0s of the 2950.0s of remaining time.
	-0.1858       = Validation score (-WQL)
	467.82  s     = Training runtime
	0.58    s     = Validation (prediction) runtime
Training timeseries model DeepAR. Training for up to 627.2s of the 2481.6s of remaining time.
	-0.1860    

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x7b2fd68dc450>

In [61]:
# 🔮 6. Generate the forecast
# No model is named here; per the run log the predictor then defaults to the
# model with the best validation score (WeightedEnsemble in this run).
forecast = predictor.predict(ts_data)

data with frequency 'IRREG' has been resampled to frequency 'MS'.
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


In [62]:
# Extract the mean forecast as a flat table and inspect its column names
# (no date filtering happens in this cell)
mean_forecast = forecast['mean']
forecast_mean = mean_forecast.reset_index()
print(forecast_mean.columns)

Index(['item_id', 'timestamp', 'mean'], dtype='object')


In [63]:
# Build the output table: one row per product with its mean forecast for
# February 2020, exposed as (product_id, tn).
# The forecast is flattened once and filtered directly; there is no
# intermediate unfiltered `resultado`.
forecast_long = forecast['mean'].reset_index()
is_february_2020 = forecast_long['timestamp'] == '2020-02-01'
resultado = (
    forecast_long.loc[is_february_2020, ['item_id', 'mean']]
    .rename(columns={'item_id': 'product_id', 'mean': 'tn'})
)

# Fail loudly instead of silently producing an empty table.
if resultado.empty:
    raise ValueError(
        "The forecast contains no rows for 2020-02-01; check prediction_length "
        "and the last period present in the training data."
    )


In [64]:
# 💾 7. Save the predictions to CSV (without the index) and preview the first rows
output_file = "predicciones_febrero2020_fecha_02_07.csv"
resultado.to_csv(output_file, index=False)
resultado.head()

Unnamed: 0,product_id,tn
1,20001,1309.387695
3,20002,1083.405365
5,20003,701.027577
7,20004,517.482591
9,20005,500.457023
