In [186]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import os
import logging
import time
from datetime import datetime

In [165]:
inference_data = pd.read_csv('../data/raw/stores_sales_forecasting_updated_v3.1.csv', 
                 sep=';', encoding='utf-8')

# Convertir fechas
inference_data['Order Date'] = pd.to_datetime(inference_data['Order Date'], dayfirst=True, errors='coerce')
inference_data['Ship Date'] = pd.to_datetime(inference_data['Ship Date'], dayfirst=True, errors='coerce')

# Variables derivadas
inference_data['Order_Month'] = inference_data['Order Date'].dt.month
inference_data['Order_Quarter'] = inference_data['Order Date'].dt.quarter
inference_data['Days to Ship'] = (inference_data['Ship Date'] - inference_data['Order Date']).dt.days

inference_data.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Order_Month,Order_Quarter,Days to Ship
0,849,CA-2017-107503,2017-01-01,2017-01-06,Standard Class,GA-14725,Guy Armstrong,Consumer,United States,Lorain,...,Furniture,Furnishings,"Linden 10"" Round Wall Clock, Black",48.896,4,0.2,8.5568,1,1,5
1,4010,CA-2017-144463,2017-01-01,2017-01-05,Standard Class,SC-20725,Steven Cartwright,Consumer,United States,Los Angeles,...,Furniture,Furnishings,"Howard Miller 11-1/2"" Diameter Brentwood Wall ...",474.43,11,0.0,199.2606,1,1,4
2,8071,CA-2017-151750,2017-01-01,2017-01-05,Standard Class,JM-15250,Janet Martin,Consumer,United States,Huntsville,...,Furniture,Furnishings,"Tenex Carpeted, Granite-Look or Clear Contempo...",141.42,5,0.6,-187.3815,1,1,4
3,8072,CA-2017-151750,2017-01-01,2017-01-05,Standard Class,JM-15250,Janet Martin,Consumer,United States,Huntsville,...,Furniture,Chairs,Office Star - Contemporary Task Swivel Chair,310.744,4,0.3,-26.6352,1,1,4
4,867,CA-2014-149020,2014-01-10,2014-01-15,Standard Class,AJ-10780,Anthony Jacobs,Corporate,United States,Springfield,...,Furniture,Furnishings,"Howard Miller 11-1/2"" Diameter Ridgewood Wall ...",51.94,1,0.0,21.2954,1,1,5


In [None]:
inference_data = pd.read_csv('../data/raw/stores_sales_forecasting_updated_v3.1.csv', 
                         sep=';',
                         encoding='utf-8')

X = inference_data.drop(['Sales'], axis=1)
y = inference_data['Sales']

# creamos un split temporal (80/20) - SIN shuffle
split_index = int(len(inference_data) * 0.8)

x_train = X.iloc[:split_index].copy()
x_test = X.iloc[split_index:].copy()
y_train = y.iloc[:split_index].copy()
y_test = y.iloc[split_index:].copy()

In [171]:
stores_sales_forecasting_pipeline= joblib.load('../models/stores_sales_forecasting_pipeline.pkl')
stores_sales_forecasting_pipeline

0,1,2
,steps,"[('drop_features', ...), ('cat_missing_imputation', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,features_to_drop,"['Customer Name', 'Ship Date', ...]"

0,1,2
,imputation_method,'missing'
,fill_value,'Missing'
,variables,['Segment']
,return_object,False
,ignore_format,False

0,1,2
,imputation_method,'frequent'
,fill_value,'Missing'
,variables,['Sub-Category']
,return_object,False
,ignore_format,False

0,1,2
,imputation_method,'mean'
,variables,"['Quantity', 'Discount']"

0,1,2
,variables,['Ship Mode']
,mappins,"{'First Class': 3, 'Second Class': 2, 'Standard Class': 1}"

0,1,2
,encoding_method,'count'
,variables,"['Segment', 'Sub-Category', ...]"
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'

0,1,2
,variables,['Quantity']
,base,'e'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [175]:
start_time = datetime.now()
predicciones = stores_sales_forecasting_pipeline.predict(x_test)
end_time = datetime.now()
inference_time = (end_time - start_time).total_seconds()

print(f"✅ Predicciones generadas exitosamente")
print(f"   Tiempo de inferencia: {inference_time:.2f} segundos")
print(f"   Número de predicciones: {len(predicciones )}")
print(f"   Tiempo por predicción: {inference_time/len(predicciones )*1000:.2f} ms")

✅ Predicciones generadas exitosamente
   Tiempo de inferencia: 0.07 segundos
   Número de predicciones: 425
   Tiempo por predicción: 0.16 ms




In [182]:
#Calculamos las métricas
rmse = np.sqrt(mean_squared_error(y_test, predicciones))
mae = mean_absolute_error(y_test, predicciones)
r2 = r2_score(y_test, predicciones)
mape = np.mean(np.abs((y_test - predicciones) / y_test)) * 100


print("MÉTRICAS DE EVALUACIÓN:")
print(f"   RMSE (Root Mean Squared Error): {rmse:.2f}")
print(f"   MAE (Mean Absolute Error):      {mae:.2f}")
print(f"   R² Score:                       {r2:.4f}")
print(f"   MAPE (Mean Absolute % Error):   {mape:.2f}%")


# Estadísticas descriptivas
print(f"\nESTADÍSTICAS DE PREDICCIONES:")
print(f"   Mínimo:  {predicciones.min():,.2f}")
print(f"   Máximo:  {predicciones.max():,.2f}")
print(f"   Media:   {predicciones.mean():,.2f}")
print(f"   Mediana: {np.median(predicciones):,.2f}")
print(f"   Std Dev: {predicciones.std():,.2f}")

print(f"\nSTADÍSTICAS DE VALORES REALES:")
print(f"   Mínimo:  {y_test.min():,.2f}")
print(f"   Máximo:  {y_test.max():,.2f}")
print(f"   Media:   {y_test.mean():,.2f}")
print(f"   Mediana: {y_test.median():,.2f}")
print(f"   Std Dev: {y_test.std():,.2f}")

MÉTRICAS DE EVALUACIÓN:
   RMSE (Root Mean Squared Error): 214.89
   MAE (Mean Absolute Error):      74.28
   R² Score:                       0.8535
   MAPE (Mean Absolute % Error):   25.41%

ESTADÍSTICAS DE PREDICCIONES:
   Mínimo:  4.75
   Máximo:  3,334.73
   Media:   357.31
   Mediana: 188.53
   Std Dev: 467.29

STADÍSTICAS DE VALORES REALES:
   Mínimo:  2.78
   Máximo:  4,416.17
   Media:   365.54
   Mediana: 172.76
   Std Dev: 562.03


# Creamos un dataframe con las predicciones y los valores reales

In [187]:
results_df = pd.DataFrame({
    'Index': y_test.index,
    'Actual_Sales': y_test.values,
    'Predicted_Sales': predicciones,
    'Absolute_Error': np.abs(y_test.values - predicciones),
    'Percentage_Error': np.abs((y_test.values - predicciones) / y_test.values) * 100,
    'Residual': y_test.values - predicciones
})

# incluimos las columnas originales
if 'Order Date' in x_test.columns:
    results_df['Order_Date'] = x_test['Order Date'].values
if 'Category' in x_test.columns:
    results_df['Category'] = x_test['Category'].values
if 'Region' in x_test.columns:
    results_df['Region'] = x_test['Region'].values

print("PRIMERAS 10 PREDICCIONES:")
print(results_df[['Actual_Sales', 'Predicted_Sales', 'Absolute_Error', 'Percentage_Error']].head(10).to_string(index=False))

# Guardamos los resultados
os.makedirs('../results/', exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_df.to_csv(f'../results/predicciones_{timestamp}.csv', index=False)

print(f"\nPredicciones guardadas en: ../results/predicciones_{timestamp}.csv")

PRIMERAS 10 PREDICCIONES:
 Actual_Sales  Predicted_Sales  Absolute_Error  Percentage_Error
       69.008        68.535930        0.472070          0.684080
      215.650       287.109560       71.459560         33.136824
       60.288        58.738120        1.549880          2.570794
      253.372       208.149764       45.222236         17.848158
      287.968       303.108240       15.140240          5.257612
       87.210       163.948745       76.738745         87.993057
       63.882       135.376195       71.494195        111.916025
      502.488       705.122950      202.634950         40.326326
      662.880       676.325020       13.445020          2.028274
      145.900       164.488980       18.588980         12.740905

Predicciones guardadas en: ../results/predicciones_20251121_200600.csv
