AutoGluon - Predicción de ventas (tn) por producto para febrero 2020

In [2]:
import torch
print(torch.cuda.is_available())  # Debe dar True
print(torch.cuda.device_count())  # Debe mostrar tu cantidad de GPUs


True
1


In [3]:
# 📦 1. Importar librerías
import pandas as pd

In [5]:
# 💬 Instalar AutoGluon si es necesario
#%pip install autogluon.timeseries

from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# 📄 2. Cargar datasets

df_sellin = pd.read_parquet('../data/l_vm_completa_train.parquet', engine='fastparquet')
df_sellin = df_sellin[df_sellin['A_PREDECIR'] != 'N']

In [7]:
# 🧹 3. Preprocesamiento
# Convertir periodo a datetime
df_sellin['timestamp'] = pd.to_datetime(df_sellin['PERIODO'], format='%Y%m')

In [13]:
# Filtrar hasta dic 2019 y productos requeridos
df_filtered = df_sellin[
    (df_sellin['timestamp'] <= '2019-12-01')
]

In [14]:
# Agregar tn por periodo, cliente y producto
df_grouped = df_filtered.groupby(['timestamp', 'CUSTOMER_ID', 'PRODUCT_ID'], as_index=False)['TN'].sum()

In [15]:
# Agregar tn total por periodo y producto
df_monthly_product = df_grouped.groupby(['timestamp', 'PRODUCT_ID'], as_index=False)['TN'].sum()

In [16]:
# Agregar columna 'item_id' para AutoGluon
df_monthly_product['item_id'] = df_monthly_product['PRODUCT_ID']

In [17]:
# ⏰ 4. Crear TimeSeriesDataFrame
ts_data = TimeSeriesDataFrame.from_data_frame(
    df_monthly_product,
    id_column='item_id',
    timestamp_column='timestamp'
)

In [19]:
# Simplemente reseteá el índice, te devuelve un DataFrame de pandas
df = ts_data.reset_index()

# Extraer año y mes
df['anio'] = df['timestamp'].dt.year
df['mes'] = df['timestamp'].dt.month

# Agrupar y contar filas por año y mes
resumen = df.groupby(['anio', 'mes']).size().reset_index(name='cantidad_filas')

print(resumen)


    anio  mes  cantidad_filas
0   2017    1             496
1   2017    2             500
2   2017    3             502
3   2017    4             502
4   2017    5             506
5   2017    6             513
6   2017    7             525
7   2017    8             530
8   2017    9             536
9   2017   10             549
10  2017   11             564
11  2017   12             564
12  2018    1             568
13  2018    2             569
14  2018    3             575
15  2018    4             587
16  2018    5             599
17  2018    6             599
18  2018    7             603
19  2018    8             608
20  2018    9             627
21  2018   10             646
22  2018   11             656
23  2018   12             656
24  2019    1             656
25  2019    2             660
26  2019    3             675
27  2019    4             705
28  2019    5             718
29  2019    6             734
30  2019    7             756
31  2019    8             771
32  2019  

In [20]:
# Completar valores faltantes
ts_data = ts_data.fill_missing_values()

In [21]:
# Simplemente reseteá el índice, te devuelve un DataFrame de pandas
df = ts_data.reset_index()

# Extraer año y mes
df['anio'] = df['timestamp'].dt.year
df['mes'] = df['timestamp'].dt.month

# Agrupar y contar filas por año y mes
resumen = df.groupby(['anio', 'mes']).size().reset_index(name='cantidad_filas')

print(resumen)

    anio  mes  cantidad_filas
0   2017    1             496
1   2017    2             500
2   2017    3             502
3   2017    4             502
4   2017    5             506
5   2017    6             513
6   2017    7             525
7   2017    8             530
8   2017    9             536
9   2017   10             549
10  2017   11             564
11  2017   12             564
12  2018    1             568
13  2018    2             569
14  2018    3             575
15  2018    4             587
16  2018    5             599
17  2018    6             599
18  2018    7             603
19  2018    8             608
20  2018    9             627
21  2018   10             646
22  2018   11             656
23  2018   12             656
24  2019    1             656
25  2019    2             660
26  2019    3             675
27  2019    4             705
28  2019    5             718
29  2019    6             734
30  2019    7             756
31  2019    8             771
32  2019  

In [23]:
# ⚙️ 5. Definir y entrenar predictor

predictor = TimeSeriesPredictor(
    prediction_length=2,
    target='TN',
    freq='MS'
)

hyperparameters = {
    "AutoARIMA": {},
    "ETS": {},
    "DeepAR": {"num_batches_per_epoch": 100, "num_workers": 20},
    "PatchTST": {"num_workers": 20},
    "TemporalFusionTransformer": {"num_workers": 20},
    "DLinear": {},
    # Podés sumar otros modelos de la lista...
}

predictor.fit(
    ts_data,
    num_val_windows=16,
    time_limit=60*60,
    hyperparameters=hyperparameters
)

Beginning AutoGluon training... Time limit = 3600s
AutoGluon will save models to '/workspace/ejemplos_clase/AutogluonModels/ag-20250719_001611'
AutoGluon Version:  1.3.1
Python Version:     3.10.14
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #29~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Jun 26 14:16:59 UTC 2
CPU Count:          28
GPU Count:          1
Memory Avail:       101.35 GB / 125.58 GB (80.7%)
Disk Space Avail:   143.13 GB / 543.17 GB (26.4%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': 'MS',
 'hyperparameters': {'AutoARIMA': {},
                     'DLinear': {},
                     'DeepAR': {'num_batches_per_epoch': 100,
                                'num_workers': 20},
                     'ETS': {},
                     'PatchTST': {'num_workers': 20},
                     'TemporalFusionTransformer': {'num_workers': 20}},
 'known_covariates_names': [],
 'num_val_windows': 16,
 'prediction_length': 2,
 'quan

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x7db0a961cc40>

In [24]:
# 🔮 6. Generar predicción
forecast = predictor.predict(ts_data)

Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


In [25]:
# Extraer predicción media y filtrar febrero 2020
forecast_mean = forecast['mean'].reset_index()
print(forecast_mean.columns)

Index(['item_id', 'timestamp', 'mean'], dtype='object')


In [26]:
# Tomar solo item_id y la predicción 'mean'
resultado = forecast['mean'].reset_index()[['item_id', 'mean']]
resultado.columns = ['product_id', 'tn']

# Filtrar solo febrero 2020
resultado = forecast['mean'].reset_index()
resultado = resultado[resultado['timestamp'] == '2020-02-01']

# Renombrar columnas
resultado = resultado[['item_id', 'mean']]
resultado.columns = ['product_id', 'tn']


In [27]:
# 💾 7. Guardar archivo
resultado.to_csv("predicciones_febrero2020_fecha_01_07.csv", index=False)
resultado.head()

Unnamed: 0,product_id,tn
1,20001,1154.43729
3,20002,914.357926
5,20003,661.944658
7,20004,490.075366
9,20005,465.822648
