# ML Forecast con categorías

## Importamos las librerías

In [61]:
import pandas as pd
from mlforecast import MLForecast
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
import numpy as np

from window_ops.rolling import rolling_mean
import optuna
from sklearn.model_selection import TimeSeriesSplit
from utilsforecast.feature_engineering import fourier

In [62]:
df = pd.read_csv('../data/sell-in.txt', sep='\t', encoding='utf-8')
df.head()

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn
0,201701,10234,20524,0,2,0.053,0.053
1,201701,10032,20524,0,1,0.13628,0.13628
2,201701,10217,20524,0,1,0.03028,0.03028
3,201701,10125,20524,0,1,0.02271,0.02271
4,201701,10012,20524,0,11,1.54452,1.54452


In [63]:
df_productos_predecir = pd.read_csv('../data/product_id_apredecir201912.txt', sep='\t', encoding='utf-8')
df_productos_predecir.head()

Unnamed: 0,product_id
0,20001
1,20002
2,20003
3,20004
4,20005


In [64]:
df = df[df['product_id'].isin(df_productos_predecir['product_id'])]

In [65]:
df['periodo'].sort_values().unique()

array([201701, 201702, 201703, 201704, 201705, 201706, 201707, 201708,
       201709, 201710, 201711, 201712, 201801, 201802, 201803, 201804,
       201805, 201806, 201807, 201808, 201809, 201810, 201811, 201812,
       201901, 201902, 201903, 201904, 201905, 201906, 201907, 201908,
       201909, 201910, 201911, 201912])

In [66]:
df_pivot = df.pivot_table(
    index=['product_id', 'customer_id'],
    columns='periodo',
    values='tn',
    aggfunc='sum',
    fill_value=None
)
df_pivot = df_pivot.reset_index()
df_pivot.columns.name = None
df_pivot.head()

Unnamed: 0,product_id,customer_id,201701,201702,201703,201704,201705,201706,201707,201708,...,201903,201904,201905,201906,201907,201908,201909,201910,201911,201912
0,20001,10001,99.43861,198.84365,92.46537,13.29728,101.00563,128.04792,101.20711,43.3393,...,130.54927,364.37071,439.90647,65.92436,144.78714,33.63991,109.05244,176.0298,236.65556,180.21938
1,20001,10002,35.72806,6.79415,29.94128,22.81133,31.22847,47.57025,21.84874,17.08052,...,31.97079,55.41679,30.87299,144.07021,37.14616,,72.08551,17.40806,45.61495,113.33165
2,20001,10003,143.49426,20.48319,137.87537,68.89292,135.1219,171.01785,64.66196,83.6341,...,170.89924,230.00152,1.84835,,138.23391,162.07198,233.20532,76.00625,86.14415,102.27517
3,20001,10004,184.72927,104.03894,295.43924,247.65632,188.37819,195.02683,379.4427,237.16848,...,102.64484,91.67799,389.02653,66.71971,228.62366,96.11402,288.34205,324.96172,195.67828,34.6481
4,20001,10005,19.08407,5.17117,5.17117,0.86186,37.95546,19.08407,43.35049,67.53856,...,6.90049,22.18016,15.89578,,8.25595,,12.804,17.13921,12.22149,19.60368


In [67]:
print("\n--- 1. Transformando datos a formato largo ---")
df = df_pivot.melt(
    id_vars=['product_id', 'customer_id'],
    var_name='periodo',
    value_name='y' # MLForecast usa 'y' como nombre de la variable objetivo
)

df.head()


--- 1. Transformando datos a formato largo ---


Unnamed: 0,product_id,customer_id,periodo,y
0,20001,10001,201701,99.43861
1,20001,10002,201701,35.72806
2,20001,10003,201701,143.49426
3,20001,10004,201701,184.72927
4,20001,10005,201701,19.08407


In [68]:
df_real_201912 = df[df['periodo'] == 201912][['product_id', 'customer_id', 'y']]

In [70]:
df_real_201912.shape

(262805, 3)

In [74]:
FECHA_CORTE = '2019-10-01'
horizonte_prediccion = 2

product_ids = df['product_id'].unique()
df_pred_final = pd.DataFrame()

for pid in product_ids:
    df_prod = df[df['product_id'] == pid].copy()
    if df_prod.empty:
        continue

    df_prod['unique_id'] = df_prod['product_id'].astype(str) + "_" + df_prod['customer_id'].astype(str)
    df_prod['ds'] = pd.to_datetime(df_prod['periodo'], format='%Y%m')
    df_prod['y'] = df_prod['y'].fillna(0)
    df_final_prod = df_prod[['unique_id', 'ds', 'y']].sort_values(by=['unique_id', 'ds']).reset_index(drop=True)
    df_final_prod = df_final_prod.loc[:, ~df_final_prod.columns.duplicated()]
    df_entrenamiento_prod = df_final_prod[df_final_prod['ds'] <= FECHA_CORTE]

    fcst_prod = MLForecast(
        models=LGBMRegressor(random_state=42, n_estimators=100),
        freq='MS',
        lags=list(range(1, 25)),
        date_features=['month', 'year'],
    )
    fcst_prod.fit(df_entrenamiento_prod, static_features=[])

    pred_prod = fcst_prod.predict(h=horizonte_prediccion)
    pred_prod['product_id'] = pid

    pred_prod_201912 = pred_prod[pred_prod['ds'] == '2019-12-01'].copy()
    pred_prod_201912['customer_id'] = pred_prod_201912['unique_id'].str.split('_').str[1].astype(int)
    pred_prod_201912.rename(columns={'LGBMRegressor': 'tn'}, inplace=True)

    df_pred_final = pd.concat([df_pred_final, pred_prod_201912[['product_id', 'customer_id', 'tn']]], ignore_index=True)

df_pred_sum = df_pred_final.groupby('product_id', as_index=False)['tn'].sum()
print(df_pred_sum)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000589 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6117
[LightGBM] [Info] Number of data points in the train set: 4440, number of used features: 25
[LightGBM] [Info] Start training from score 3.278093
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6123
[LightGBM] [Info] Number of data points in the train set: 4450, number of used features: 25
[LightGBM] [Info] Start training from score 2.605474
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough,

In [75]:
print(df_real_201912.shape, df_pred_final.shape)

(262805, 3) (262805, 3)


In [76]:
# Calcula el error cuadrático medio (MSE) entre las predicciones y los valores reales
# Para esto, necesitamos los valores reales correspondientes a las predicciones

# Unimos las predicciones con los valores reales
df_eval = pd.merge(df_pred_final, df_real_201912, on=['product_id', 'customer_id'], how='inner')

# Calculamos el error cuadrático medio
mse = mean_squared_error(df_eval['y'].fillna(0), df_eval['tn'].fillna(0))
print(f'Error cuadrático medio (MSE): {mse}')

Error cuadrático medio (MSE): 1.2775500343926338


In [77]:
df_pred_sum.head()

Unnamed: 0,product_id,tn
0,20001,1737.876264
1,20002,1248.617372
2,20003,857.382471
3,20004,754.845754
4,20005,829.485389


In [78]:
FECHA_CORTE = '2019-12-01'
horizonte_prediccion = 2  # enero y febrero 2020

product_ids = df['product_id'].unique()
df_pred_final = pd.DataFrame()

for pid in product_ids:
    df_prod = df[df['product_id'] == pid].copy()
    if df_prod.empty:
        continue

    df_prod['unique_id'] = df_prod['product_id'].astype(str) + "_" + df_prod['customer_id'].astype(str)
    df_prod['ds'] = pd.to_datetime(df_prod['periodo'], format='%Y%m')
    df_prod['y'] = df_prod['y'].fillna(0)
    df_final_prod = df_prod[['unique_id', 'ds', 'y']].sort_values(by=['unique_id', 'ds']).reset_index(drop=True)
    df_final_prod = df_final_prod.loc[:, ~df_final_prod.columns.duplicated()]
    df_entrenamiento_prod = df_final_prod[df_final_prod['ds'] <= FECHA_CORTE]

    fcst_prod = MLForecast(
        models=LGBMRegressor(random_state=42, n_estimators=100),
        freq='MS',
        lags=list(range(1, 25)),
        date_features=['month', 'year'],
    )
    fcst_prod.fit(df_entrenamiento_prod, static_features=[])

    pred_prod = fcst_prod.predict(h=horizonte_prediccion)
    pred_prod['product_id'] = pid

    pred_prod_202002 = pred_prod[pred_prod['ds'] == '2020-02-01'].copy()
    pred_prod_202002['customer_id'] = pred_prod_202002['unique_id'].str.split('_').str[1].astype(int)
    pred_prod_202002.rename(columns={'LGBMRegressor': 'tn'}, inplace=True)

    df_pred_final = pd.concat([df_pred_final, pred_prod_202002[['product_id', 'customer_id', 'tn']]], ignore_index=True)

df_pred_sum = df_pred_final.groupby('product_id', as_index=False)['tn'].sum()
print(df_pred_sum)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000773 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6133
[LightGBM] [Info] Number of data points in the train set: 5328, number of used features: 25
[LightGBM] [Info] Start training from score 3.276425
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000844 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6133
[LightGBM] [Info] Number of data points in the train set: 5340, number of used features: 25
[LightGBM] [Info] Start training from score 2.641432
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000138 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough,

In [79]:
df_pred_sum.head()

Unnamed: 0,product_id,tn
0,20001,1469.036526
1,20002,1054.117828
2,20003,712.381267
3,20004,562.806841
4,20005,697.190996


In [81]:
df_pred_sum.to_csv('df_pred_sum_b.csv', index=False)