In [1]:
from datetime import datetime
import pandas as pd
import seaborn as sns

In [2]:
classification_df = pd.read_csv('demand_classification_by_item.csv', sep=';')
raw_df = pd.read_csv('df_periodos_rellenados.csv', sep=';')
display(raw_df)
display(classification_df)

Unnamed: 0,time,item,sales,price,unit_price
0,2019-06-24,1002,24,0.72625,0.030260
1,2020-08-31,1002,224,0.28750,0.001283
2,2021-02-08,1002,40,0.28500,0.007125
3,2021-03-22,1002,80,0.28500,0.003562
4,2021-03-29,1002,16,0.28750,0.017969
...,...,...,...,...,...
3338696,2023-11-22,9998,0,,
3338697,2023-11-23,9998,0,,
3338698,2023-11-24,9998,0,,
3338699,2023-11-25,9998,0,,


Unnamed: 0,item,p,cv2,demand_type,unit_price
0,1002,128.071429,0.612206,lumpy,0.009178
1,1003,234.000000,0.213018,intermittent,1.005850
2,1006,84.652174,4.362716,lumpy,2.061909
3,1007,1.000000,0.000000,smooth,0.398125
4,1009,1.000000,0.000000,smooth,0.133375
...,...,...,...,...,...
3812,9988,312.000000,0.000000,intermittent,1.943750
3813,9989,1.000000,0.000000,smooth,0.191875
3814,9994,150.000000,0.291320,intermittent,4.370863
3815,9997,1.000000,0.000000,smooth,0.106354


In [3]:
raw_df.time.min()

'2018-08-20'

In [4]:
raw_df['time'] = pd.to_datetime(raw_df['time'], format="%Y-%m-%d")
raw_df['day'] = raw_df.time.dt.day
raw_df['month'] = raw_df.time.dt.month
raw_df['year'] = raw_df.time.dt.year
raw_df['weekday'] = raw_df.time.dt.weekday
raw_df['is_weekend'] = (raw_df.weekday >= 5).astype(int)
raw_df['days_since_first_data'] = (raw_df.time - raw_df.time.min()).dt.days
raw_df

Unnamed: 0,time,item,sales,price,unit_price,day,month,year,weekday,is_weekend,days_since_first_data
0,2019-06-24,1002,24,0.72625,0.030260,24,6,2019,0,0,308
1,2020-08-31,1002,224,0.28750,0.001283,31,8,2020,0,0,742
2,2021-02-08,1002,40,0.28500,0.007125,8,2,2021,0,0,903
3,2021-03-22,1002,80,0.28500,0.003562,22,3,2021,0,0,945
4,2021-03-29,1002,16,0.28750,0.017969,29,3,2021,0,0,952
...,...,...,...,...,...,...,...,...,...,...,...
3338696,2023-11-22,9998,0,,,22,11,2023,2,0,1920
3338697,2023-11-23,9998,0,,,23,11,2023,3,0,1921
3338698,2023-11-24,9998,0,,,24,11,2023,4,0,1922
3338699,2023-11-25,9998,0,,,25,11,2023,5,1,1923


### Solo consideramos los ítems de demanda Lumpy e Intermittent

In [5]:
intermittent_ids = classification_df[classification_df.demand_type == 'intermittent']['item'].unique()
lumpy_ids = classification_df[classification_df.demand_type == 'lumpy']['item'].unique()
raw_df = raw_df[(raw_df['item'].isin(intermittent_ids)) | (raw_df['item'].isin(lumpy_ids))]
df = raw_df[['item', 'time', 'day', 'month', 'year', 'weekday', 'is_weekend', 'days_since_first_data', 'sales']]
df

Unnamed: 0,item,time,day,month,year,weekday,is_weekend,days_since_first_data,sales
0,1002,2019-06-24,24,6,2019,0,0,308,24
1,1002,2020-08-31,31,8,2020,0,0,742,224
2,1002,2021-02-08,8,2,2021,0,0,903,40
3,1002,2021-03-22,22,3,2021,0,0,945,80
4,1002,2021-03-29,29,3,2021,0,0,952,16
...,...,...,...,...,...,...,...,...,...
3338696,9998,2023-11-22,22,11,2023,2,0,1920,0
3338697,9998,2023-11-23,23,11,2023,3,0,1921,0
3338698,9998,2023-11-24,24,11,2023,4,0,1922,0
3338699,9998,2023-11-25,25,11,2023,5,1,1923,0


### Dividimos en set de entrenamiento, test y validación

In [6]:
# Import necessary libraries
import numpy as np
import pandas as pd
from darts import TimeSeries
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from statsmodels.tsa.seasonal import seasonal_decompose

from sklearn.metrics import mean_squared_error, mean_absolute_error

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

from statsmodels.stats.diagnostic import acorr_ljungbox
import pmdarima as pm
from darts.models import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [7]:
train_df, test_df = train_test_split(df, train_size=0.8)

In [8]:
X_train = train_df[['item', 'day', 'month', 'year', 'weekday', 'is_weekend', 'days_since_first_data']]
X_test = test_df[['item', 'day', 'month', 'year', 'weekday', 'is_weekend', 'days_since_first_data']]
y_train = train_df['sales']
y_test = test_df['sales']

In [9]:
train_df.head()

Unnamed: 0,item,time,day,month,year,weekday,is_weekend,days_since_first_data,sales
3257833,9783,2020-07-26,26,7,2020,6,1,706,0
250979,1611,2023-07-01,1,7,2023,5,1,1776,0
1866316,6015,2022-12-25,25,12,2022,6,1,1588,0
2814620,8582,2023-06-13,13,6,2023,1,0,1758,0
8988,2783,2021-12-13,13,12,2021,0,0,1211,15


In [10]:
print(f'Tamaño de entrenamiento: {len(X_train)}')
print(f'Tamaño de prueba: {len(X_test)}')

Tamaño de entrenamiento: 2670194
Tamaño de prueba: 667549


In [11]:
scaler = StandardScaler()

# Escalar características de entrenamiento y prueba
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Entrenar el modelo con los datos de entrenamiento
model.fit(X_train_scaled, y_train)

In [14]:
y_pred = model.predict(X_test_scaled)

In [15]:
test_df['y_pred'] = y_pred
test_df

Unnamed: 0,item,time,day,month,year,weekday,is_weekend,days_since_first_data,sales,y_pred
2573159,7993,2018-12-17,17,12,2018,0,0,119,0,1.65
1276040,4386,2022-01-13,13,1,2022,3,0,1242,0,0.00
1076758,3804,2021-07-21,21,7,2021,2,0,1066,0,0.00
315569,1760,2021-04-27,27,4,2021,1,0,981,0,0.00
286445,1691,2024-07-10,10,7,2024,2,0,2151,0,0.00
...,...,...,...,...,...,...,...,...,...,...
3030369,9164,2021-05-23,23,5,2021,6,1,1007,0,0.00
1021691,3664,2022-03-12,12,3,2022,5,1,1300,0,0.00
2207936,6949,2021-04-18,18,4,2021,6,1,972,0,0.00
2207137,6948,2022-04-06,6,4,2022,2,0,1325,0,0.00


### XGBRegressor

In [16]:
import xgboost as xgb

In [17]:
model_xgb = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
model_xgb.fit(X_train_scaled, y_train)

In [18]:
y_pred_xgb = model_xgb.predict(X_test_scaled)
y_pred_xgb

array([ 8.0587044e+00, -1.2350057e-03, -1.0638222e-02, ...,
        2.8458143e-02,  3.0387618e-02, -8.4443539e-03], dtype=float32)

In [19]:
test_df['y_pred_xgb'] = y_pred_xgb
test_df

Unnamed: 0,item,time,day,month,year,weekday,is_weekend,days_since_first_data,sales,y_pred,y_pred_xgb
2573159,7993,2018-12-17,17,12,2018,0,0,119,0,1.65,8.058704
1276040,4386,2022-01-13,13,1,2022,3,0,1242,0,0.00,-0.001235
1076758,3804,2021-07-21,21,7,2021,2,0,1066,0,0.00,-0.010638
315569,1760,2021-04-27,27,4,2021,1,0,981,0,0.00,-0.013190
286445,1691,2024-07-10,10,7,2024,2,0,2151,0,0.00,0.004243
...,...,...,...,...,...,...,...,...,...,...,...
3030369,9164,2021-05-23,23,5,2021,6,1,1007,0,0.00,0.009339
1021691,3664,2022-03-12,12,3,2022,5,1,1300,0,0.00,-0.003237
2207936,6949,2021-04-18,18,4,2021,6,1,972,0,0.00,0.028458
2207137,6948,2022-04-06,6,4,2022,2,0,1325,0,0.00,0.030388


#### Se tienen las predicciones a nivel diario por ítem, pero se procede a agregarlas de forma mensual

In [20]:
test_df['month'] = test_df['time'].dt.to_period('M').dt.start_time
monthly_test_df = test_df[['month', 'item', 'sales', 'y_pred', 'y_pred_xgb']].groupby(['month', 'item']).sum().reset_index()[['month', 'item', 'sales', 'y_pred', 'y_pred_xgb']]
monthly_test_df

Unnamed: 0,month,item,sales,y_pred,y_pred_xgb
0,2018-08-01,1143,10,42.07,10.602197
1,2018-08-01,1159,5,62.42,25.489206
2,2018-08-01,1187,0,0.00,0.015132
3,2018-08-01,1248,20,28.65,54.370346
4,2018-08-01,1337,0,0.00,-0.033666
...,...,...,...,...,...
111647,2024-09-01,9627,100,69.58,43.607624
111648,2024-09-01,9803,0,0.00,-0.010781
111649,2024-09-01,9877,0,2.90,8.974721
111650,2024-09-01,9891,0,0.00,-0.011862


In [21]:
test_df

Unnamed: 0,item,time,day,month,year,weekday,is_weekend,days_since_first_data,sales,y_pred,y_pred_xgb
2573159,7993,2018-12-17,17,2018-12-01,2018,0,0,119,0,1.65,8.058704
1276040,4386,2022-01-13,13,2022-01-01,2022,3,0,1242,0,0.00,-0.001235
1076758,3804,2021-07-21,21,2021-07-01,2021,2,0,1066,0,0.00,-0.010638
315569,1760,2021-04-27,27,2021-04-01,2021,1,0,981,0,0.00,-0.013190
286445,1691,2024-07-10,10,2024-07-01,2024,2,0,2151,0,0.00,0.004243
...,...,...,...,...,...,...,...,...,...,...,...
3030369,9164,2021-05-23,23,2021-05-01,2021,6,1,1007,0,0.00,0.009339
1021691,3664,2022-03-12,12,2022-03-01,2022,5,1,1300,0,0.00,-0.003237
2207936,6949,2021-04-18,18,2021-04-01,2021,6,1,972,0,0.00,0.028458
2207137,6948,2022-04-06,6,2022-04-01,2022,2,0,1325,0,0.00,0.030388


#### Redondeamos el resultado hacia arriba

In [22]:
monthly_test_df['y_pred'] = np.ceil(monthly_test_df['y_pred'])
monthly_test_df['y_pred_xgb'] = np.ceil(monthly_test_df['y_pred_xgb'])
monthly_test_df

Unnamed: 0,month,item,sales,y_pred,y_pred_xgb
0,2018-08-01,1143,10,43.0,11.0
1,2018-08-01,1159,5,63.0,26.0
2,2018-08-01,1187,0,0.0,1.0
3,2018-08-01,1248,20,29.0,55.0
4,2018-08-01,1337,0,0.0,-0.0
...,...,...,...,...,...
111647,2024-09-01,9627,100,70.0,44.0
111648,2024-09-01,9803,0,0.0,-0.0
111649,2024-09-01,9877,0,3.0,9.0
111650,2024-09-01,9891,0,0.0,-0.0


In [23]:
y_test = monthly_test_df['sales']
y_pred = monthly_test_df['y_pred']
y_pred_xgb = monthly_test_df['y_pred_xgb']

In [24]:
mae_rf = mean_absolute_error(y_test, y_pred)
mae_rf

7.75225701286139

In [25]:
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mae_xgb

8.16182423960162

In [None]:
"""
from sklearn.model_selection import GridSearchCV

# Definir los parámetros a ajustar
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Realizar búsqueda en malla con validación cruzada
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Ver los mejores parámetros
print(grid_search.best_params_)
"""

## Desarrollo de métrica de utilidad

In [28]:
len(classification_df.item.unique())

3817

In [29]:
classification_df

Unnamed: 0,item,p,cv2,demand_type,unit_price
0,1002,128.071429,0.612206,lumpy,0.009178
1,1003,234.000000,0.213018,intermittent,1.005850
2,1006,84.652174,4.362716,lumpy,2.061909
3,1007,1.000000,0.000000,smooth,0.398125
4,1009,1.000000,0.000000,smooth,0.133375
...,...,...,...,...,...
3812,9988,312.000000,0.000000,intermittent,1.943750
3813,9989,1.000000,0.000000,smooth,0.191875
3814,9994,150.000000,0.291320,intermittent,4.370863
3815,9997,1.000000,0.000000,smooth,0.106354


In [30]:
classification_dict = classification_df.to_dict()
item_price = {}
for i in classification_dict['unit_price'].keys():
    item_price[classification_dict['item'][i]] = classification_dict['unit_price'][i]

item_price

{1002: 0.009177519132653,
 1003: 1.00585,
 1006: 2.061908695652174,
 1007: 0.398125,
 1009: 0.133375,
 1013: 0.005403125,
 1014: 0.0154515029761904,
 1018: 0.1462920673076923,
 1019: 0.1056544325017709,
 1020: 5.4347615740740745,
 1024: 1.9328125,
 1028: 0.1854576822916667,
 1030: 0.91075,
 1034: 9.77227356362773,
 1035: 0.4789583333333333,
 1037: 4.2075,
 1040: 5.03875,
 1044: 1.804375,
 1051: 0.3276114180672269,
 1055: 0.3879253787878787,
 1061: 0.7652068494776828,
 1062: 0.5791176470588235,
 1066: 0.2350694444444444,
 1070: 7.504453125,
 1074: 0.1416964285714285,
 1075: 5.9175,
 1080: 6.594414737654321,
 1082: 3.54,
 1084: 1.3653343461398166,
 1085: 11.880624999999998,
 1086: 7.70125,
 1091: 0.01221875,
 1092: 6.78,
 1093: 11.945,
 1095: 0.974625,
 1096: 9.362916666666669,
 1098: 3.889583333333333,
 1101: 0.4560589015151515,
 1103: 2.8982431881390434,
 1105: 2.493125,
 1111: 0.9206644613368542,
 1114: 3.872853331043957,
 1115: 0.4911770656389839,
 1118: 6.1025,
 1119: 0.824041666666

In [31]:
# Agregamos el precio unitario a cada item
monthly_test_df['unit_price'] = monthly_test_df['item'].apply(lambda x: item_price[x])
monthly_test_df

Unnamed: 0,month,item,sales,y_pred,y_pred_xgb,unit_price
0,2018-08-01,1143,10,43.0,11.0,0.259454
1,2018-08-01,1159,5,63.0,26.0,0.286071
2,2018-08-01,1187,0,0.0,1.0,1.667574
3,2018-08-01,1248,20,29.0,55.0,0.812460
4,2018-08-01,1337,0,0.0,-0.0,0.869505
...,...,...,...,...,...,...
111647,2024-09-01,9627,100,70.0,44.0,0.181902
111648,2024-09-01,9803,0,0.0,-0.0,0.070791
111649,2024-09-01,9877,0,3.0,9.0,1.519509
111650,2024-09-01,9891,0,0.0,-0.0,2.532289


In [32]:
utility_df = monthly_test_df.copy()[['month', 'item', 'sales', 'y_pred', 'y_pred_xgb', 'unit_price']]
utility_df

Unnamed: 0,month,item,sales,y_pred,y_pred_xgb,unit_price
0,2018-08-01,1143,10,43.0,11.0,0.259454
1,2018-08-01,1159,5,63.0,26.0,0.286071
2,2018-08-01,1187,0,0.0,1.0,1.667574
3,2018-08-01,1248,20,29.0,55.0,0.812460
4,2018-08-01,1337,0,0.0,-0.0,0.869505
...,...,...,...,...,...,...
111647,2024-09-01,9627,100,70.0,44.0,0.181902
111648,2024-09-01,9803,0,0.0,-0.0,0.070791
111649,2024-09-01,9877,0,3.0,9.0,1.519509
111650,2024-09-01,9891,0,0.0,-0.0,2.532289


In [33]:
# Precio promedio productos
classification_df['unit_price'].mean() 

2.374940656558404

In [34]:
# Definimos un costo fijo de inventario para todos los productos
# Este costo lo definimos como un 30% del precio promedio de todos los productos
STOCK_COST = classification_df['unit_price'].mean() * 0.3
STOCK_COST

0.7124821969675212

In [35]:
# Calculamos los costos por exceso de inventario por cada modelo
def get_stock_cost(row, model):
    """
    Calcula el costo de inventario cuando la cantidad predicha es mayor o igual a la cantidad observada,
    según cada modelo.
    """
    if model == 'rf':
        target = 'y_pred'
    elif model == 'xgb':
        target = 'y_pred_xgb'
    else:
        target = 'y_pred'


    if row[target] >= row['sales']:
        stock_in_excess = (row[target] - row['sales']) * STOCK_COST
    else:
        stock_in_excess = 0

    return stock_in_excess

for m in ['rf', 'xgb']:
    utility_df[f'excess_stock_cost_{m}'] = utility_df.apply(lambda x: get_stock_cost(x, m), axis=1)

utility_df

Unnamed: 0,month,item,sales,y_pred,y_pred_xgb,unit_price,excess_stock_cost_rf,excess_stock_cost_xgb
0,2018-08-01,1143,10,43.0,11.0,0.259454,23.511912,0.712482
1,2018-08-01,1159,5,63.0,26.0,0.286071,41.323967,14.962126
2,2018-08-01,1187,0,0.0,1.0,1.667574,0.000000,0.712482
3,2018-08-01,1248,20,29.0,55.0,0.812460,6.412340,24.936877
4,2018-08-01,1337,0,0.0,-0.0,0.869505,0.000000,-0.000000
...,...,...,...,...,...,...,...,...
111647,2024-09-01,9627,100,70.0,44.0,0.181902,0.000000,0.000000
111648,2024-09-01,9803,0,0.0,-0.0,0.070791,0.000000,-0.000000
111649,2024-09-01,9877,0,3.0,9.0,1.519509,2.137447,6.412340
111650,2024-09-01,9891,0,0.0,-0.0,2.532289,0.000000,-0.000000


In [36]:
# Calculamos los costos por quiebre de stock por cada modelo
def get_stock_out_cost(row, model):
    """
    Calcula el costo de quiebre de stock cuando la cantidad predicha es menor a la cantidad observada,
    según cada modelo. Este costo está dado por el precio de venta del item por la cantidad de ítems no vendidos 
    a causa del quiebre de stock
    """
    if model == 'rf':
        target = 'y_pred'
    elif model == 'xgb':
        target = 'y_pred_xgb'
    else:
        target = 'y_pred'


    if row['sales'] > row[target]:
        stock_out = (row['sales'] - row[target]) * row['unit_price']
    else:
        stock_out = 0

    return stock_out

for m in ['rf', 'xgb']:
    utility_df[f'stock_out_cost_{m}'] = utility_df.apply(lambda x: get_stock_out_cost(x, m), axis=1)

utility_df

Unnamed: 0,month,item,sales,y_pred,y_pred_xgb,unit_price,excess_stock_cost_rf,excess_stock_cost_xgb,stock_out_cost_rf,stock_out_cost_xgb
0,2018-08-01,1143,10,43.0,11.0,0.259454,23.511912,0.712482,0.000000,0.000000
1,2018-08-01,1159,5,63.0,26.0,0.286071,41.323967,14.962126,0.000000,0.000000
2,2018-08-01,1187,0,0.0,1.0,1.667574,0.000000,0.712482,0.000000,0.000000
3,2018-08-01,1248,20,29.0,55.0,0.812460,6.412340,24.936877,0.000000,0.000000
4,2018-08-01,1337,0,0.0,-0.0,0.869505,0.000000,-0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
111647,2024-09-01,9627,100,70.0,44.0,0.181902,0.000000,0.000000,5.457047,10.186488
111648,2024-09-01,9803,0,0.0,-0.0,0.070791,0.000000,-0.000000,0.000000,0.000000
111649,2024-09-01,9877,0,3.0,9.0,1.519509,2.137447,6.412340,0.000000,0.000000
111650,2024-09-01,9891,0,0.0,-0.0,2.532289,0.000000,-0.000000,0.000000,0.000000


In [37]:
# Calculamos los ingresos por venta de repuesto por cada modelo
def get_income_earned_by_sale(row, model):
    """
    Calcula el ingreso obtenido por la venta de repuesto
    """
    if model == 'rf':
        target = 'y_pred'
    elif model == 'xgb':
        target = 'y_pred_xgb'
    else:
        target = 'y_pred'


    if row[target] >= row['sales']:
        sales = row['sales'] * row['unit_price']
    else:
        sales = row[target] * row['unit_price']

    return sales

for m in ['rf', 'xgb']:
    utility_df[f'sales_income_{m}'] = utility_df.apply(lambda x: get_income_earned_by_sale(x, m), axis=1)

utility_df

Unnamed: 0,month,item,sales,y_pred,y_pred_xgb,unit_price,excess_stock_cost_rf,excess_stock_cost_xgb,stock_out_cost_rf,stock_out_cost_xgb,sales_income_rf,sales_income_xgb
0,2018-08-01,1143,10,43.0,11.0,0.259454,23.511912,0.712482,0.000000,0.000000,2.594539,2.594539
1,2018-08-01,1159,5,63.0,26.0,0.286071,41.323967,14.962126,0.000000,0.000000,1.430357,1.430357
2,2018-08-01,1187,0,0.0,1.0,1.667574,0.000000,0.712482,0.000000,0.000000,0.000000,0.000000
3,2018-08-01,1248,20,29.0,55.0,0.812460,6.412340,24.936877,0.000000,0.000000,16.249201,16.249201
4,2018-08-01,1337,0,0.0,-0.0,0.869505,0.000000,-0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
111647,2024-09-01,9627,100,70.0,44.0,0.181902,0.000000,0.000000,5.457047,10.186488,12.733110,8.003669
111648,2024-09-01,9803,0,0.0,-0.0,0.070791,0.000000,-0.000000,0.000000,0.000000,0.000000,0.000000
111649,2024-09-01,9877,0,3.0,9.0,1.519509,2.137447,6.412340,0.000000,0.000000,0.000000,0.000000
111650,2024-09-01,9891,0,0.0,-0.0,2.532289,0.000000,-0.000000,0.000000,0.000000,0.000000,0.000000


In [38]:
# Calculamos la utilidad para cada dato
def get_utility(row, model):
    """
    Calculamos la utilidad para cada dato:
    Suma de ingresos - Suma de costos
    """
    total_incomes = row[f'sales_income_{model}']
    total_costs = row[f'excess_stock_cost_{model}'] + row[f'stock_out_cost_{model}']
    utility = total_incomes - total_costs

    return utility

for m in ['rf', 'xgb']:
    utility_df[f'utility_{m}'] = utility_df.apply(lambda x: get_utility(x, m), axis=1)

utility_df

Unnamed: 0,month,item,sales,y_pred,y_pred_xgb,unit_price,excess_stock_cost_rf,excess_stock_cost_xgb,stock_out_cost_rf,stock_out_cost_xgb,sales_income_rf,sales_income_xgb,utility_rf,utility_xgb
0,2018-08-01,1143,10,43.0,11.0,0.259454,23.511912,0.712482,0.000000,0.000000,2.594539,2.594539,-20.917373,1.882057
1,2018-08-01,1159,5,63.0,26.0,0.286071,41.323967,14.962126,0.000000,0.000000,1.430357,1.430357,-39.893611,-13.531770
2,2018-08-01,1187,0,0.0,1.0,1.667574,0.000000,0.712482,0.000000,0.000000,0.000000,0.000000,0.000000,-0.712482
3,2018-08-01,1248,20,29.0,55.0,0.812460,6.412340,24.936877,0.000000,0.000000,16.249201,16.249201,9.836861,-8.687676
4,2018-08-01,1337,0,0.0,-0.0,0.869505,0.000000,-0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111647,2024-09-01,9627,100,70.0,44.0,0.181902,0.000000,0.000000,5.457047,10.186488,12.733110,8.003669,7.276063,-2.182819
111648,2024-09-01,9803,0,0.0,-0.0,0.070791,0.000000,-0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
111649,2024-09-01,9877,0,3.0,9.0,1.519509,2.137447,6.412340,0.000000,0.000000,0.000000,0.000000,-2.137447,-6.412340
111650,2024-09-01,9891,0,0.0,-0.0,2.532289,0.000000,-0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [39]:
# Calculamos la utilidad final para cada modelo
utility = {}
for m in ['rf', 'xgb']:
    utility[m] = utility_df[f'utility_{m}'].sum()

utility

{'rf': -491581.2316770716, 'xgb': -485878.7216307055}

### Transformers de series de tiempo

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import MinMaxScaler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [None]:
raw_df[raw_df.sales == 0]

In [None]:
hf_df = raw_df[['time', 'item', 'sales']]
pivot_df = pd.pivot_table(hf_df, index='time', columns=['item'])
pivot_df = pivot_df['sales']

In [None]:
items = pivot_df.columns

In [None]:
pivot_df

In [None]:
# Función para crear secuencias de datos
def crear_secuencias(datos, n_dias):
    X, y = [], []
    for i in range(len(datos) - n_dias):
        X.append(datos[i:i+n_dias])  # Datos históricos (ventas anteriores)
        y.append(datos[i + n_dias])  # El siguiente valor (ventas del día siguiente)
    return X, y

In [None]:
items.difference([])

In [None]:
# Definir el número de días para la secuencia
n_dias = 30

# Crear las secuencias para cada ítem
secuencias = {}
for item in items:
    item_serie = pivot_df[item].dropna()
    secuencias[item] = crear_secuencias(item_serie, n_dias)

# Dividir en conjunto de entrenamiento y prueba (80% entrenamiento, 20% prueba)
X_train, X_test, y_train, y_test = {}, {}, {}, {}

discarded_items = []
for item in items:
    if len(secuencias[item][0]) > 0 and len(secuencias[item][1]):
        X_item, y_item = secuencias[item]
        X_train[item], X_test[item], y_train[item], y_test[item] = train_test_split(X_item, y_item, test_size=0.2, shuffle=False)
    else:
        discarded_items.append(item)

items = items.difference(discarded_items)
# Ver el tamaño de los datos
for item in items:
    print(f"{item} - Entrenamiento: {len(X_train[item])}, Prueba: {len(X_test[item])}")


In [None]:
len(X_test.keys())

In [None]:
scalers = {}
X_train_scaled, X_test_scaled = {}, {}

# Escalar los datos de ventas por ítem
for item in items:
    scaler = MinMaxScaler(feature_range=(0, 1))
    X_train_scaled[item] = scaler.fit_transform(X_train[item])
    X_test_scaled[item] = scaler.transform(X_test[item])
    scalers[item] = scaler


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
class LSTMModel(nn.Module):
    def __init__(self, input_size=1, hidden_layer_size=50, output_size=1):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_layer_size, batch_first=True)
        self.fc = nn.Linear(hidden_layer_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)  # La salida de la LSTM
        out = self.fc(out[:, -1, :])  # Solo tomamos la última salida de la secuencia
        return out

In [None]:
# Entrenamiento del modelo para cada ítem
models = {}
criterions = {}
optimizers = {}

for item in items:
    print(f'Item: {item}')
    # Convertir los datos a tensores
    X_train_tensor = torch.tensor(X_train_scaled[item], dtype=torch.float32).unsqueeze(-1)
    y_train_tensor = torch.tensor(y_train[item], dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test_scaled[item], dtype=torch.float32).unsqueeze(-1)
    y_test_tensor = torch.tensor(y_test[item], dtype=torch.float32)

    # Inicializar el modelo, la función de pérdida y el optimizador
    model = LSTMModel(input_size=1, hidden_layer_size=50, output_size=1)
    criterion = nn.MSELoss()  # Usamos error cuadrático medio para regresión
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    models[item] = model
    criterions[item] = criterion
    optimizers[item] = optimizer

    # Entrenamiento del modelo
    num_epochs = 20
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs.squeeze(), y_train_tensor)
        loss.backward()
        optimizer.step()

        # if (epoch+1) % 5 == 0:
        #     print(f"Item: {item}, Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")


In [None]:
# Hacer predicciones y evaluar el rendimiento
predictions = {}
for item in items:
    model = models[item]
    X_test_tensor = torch.tensor(X_test_scaled[item], dtype=torch.float32).unsqueeze(-1)
    y_test_tensor = torch.tensor(y_test[item], dtype=torch.float32)

    model.eval()
    with torch.no_grad():
        y_pred_tensor = model(X_test_tensor)

    # Convertir predicciones y valores reales a numpy
    y_pred = y_pred_tensor.squeeze().numpy()
    y_test_actual = y_test_tensor.numpy()

    # Calcular el error absoluto medio (MAE)
    mae = mean_absolute_error(y_test_actual, y_pred)
    print(f"Item: {item}, MAE: {mae:.2f}")

    predictions[item] = {
        'mae': mae,
        'y_pred': y_pred,
        'y_test': y_test_actual
    }

    # Graficar las predicciones vs las ventas reales
    # plt.figure(figsize=(10, 6))
    # plt.plot(df.index[-len(y_test_actual):], y_test_actual, label='Ventas reales', color='blue')
    # plt.plot(df.index[-len(y_pred):], y_pred, label='Predicciones', color='red', linestyle='--')
    # plt.legend()
    # plt.title(f'Predicción de ventas con LSTM - {item}')
    # plt.xlabel('Fecha')
    # plt.ylabel('Ventas')
    # plt.xticks(rotation=45)
    # plt.show()


In [None]:
X_test

In [None]:
maes = [predictions[p]['mae'] for p in predictions]
np.mean(maes)

In [None]:
test_items = list(predictions.keys())
len(test_items)

In [None]:
len(predictions)

In [None]:
secuencias

In [None]:
predictions

In [None]:
predictions[5432]['y_pred']
predictions[5432]['y_test']

In [None]:
test_df['y_pred'] = test_df['y_pred'].apply(lambda x: round(x))
test_df['y_pred_xgb'] = test_df['y_pred_xgb'].apply(lambda x: round(x))

test_df

In [None]:
test_df['y_pred_lstm'] = y_pred