<center>
  <h2>Trabajo Laboratorio de Implementación III - MCD Virtual Cohorte 2022</h2>
  <h3>Forecasting para una Empresa de Consumo Masivo</h3>
  <h4>Evaluación</h4>
</center>

In [1]:
# Importación de librerías

# !pip install awswrangler
# !pip install boto3
import time
from datetime import datetime

import awswrangler as wr
import boto3
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta

In [34]:
# Date definitions: bounds of the sales history and the forecast horizon.

date_start=datetime.strptime('2017-01-01', '%Y-%m-%d')
date_end=datetime.strptime('2019-12-01', '%Y-%m-%d')
# Horizon in months; it is added to date_end below to obtain date_future
forecast_length = 2
date_future=date_end+relativedelta(months=forecast_length)
# Covers 2018 and 2019 (range end is exclusive); not referenced in the visible part of the notebook
years=range(2018, 2020)

print(f"Fecha inicio historia {date_start}")
print(f"Fecha fin historia {date_end}")
print(f"Fecha futura (related time series) {date_future}")

# Model version; it is interpolated into the S3 paths, ARNs and job names below
data_version = '2'
# Columns that identify one forecasted series
grouped_columns = ["product_id", "cluster_id"]
grouped_cols_name = "product_cluster"

# Backtest parameters
dataset_frequency='M'
# Forecast columns to evaluate; each name must exist as a column in the exported backtest values
quantiles=["mean"]

Fecha inicio historia 2017-01-01 00:00:00
Fecha fin historia 2019-12-01 00:00:00
Fecha futura (related time series) 2020-02-01 00:00:00


In [3]:
# AWS connections

# Account id of the caller, used to build the role and resource ARNs
account_id=boto3.client('sts').get_caller_identity().get('Account')
session = boto3.Session() 
forecast = session.client(service_name='forecast')
# Region the Forecast client is configured for
region = forecast.meta.region_name
forecastquery = session.client(service_name='forecastquery')
# IAM role ARN handed to the export job as its S3 RoleArn
role = f'arn:aws:iam::{account_id}:role/ForecastRole'

In [4]:
# S3 locations; every path lives under
# s3://datasets-forecast/modelo-{data_version}-forecast-{grouped_cols_name}/

# Input data
target_s3_path=f's3://datasets-forecast/modelo-{data_version}-forecast-{grouped_cols_name}/target/df_ventas_{grouped_cols_name}.csv'
items_s3_path=f's3://datasets-forecast/modelo-{data_version}-forecast-{grouped_cols_name}/product/df_products.csv'
related_s3_path=f's3://datasets-forecast/modelo-{data_version}-forecast-{grouped_cols_name}/related/df_related_{grouped_cols_name}.csv'

# Outputs: destination prefix of the backtest export job
backtest_export_path=f's3://datasets-forecast/modelo-{data_version}-forecast-{grouped_cols_name}/output/'

In [5]:
# Build the Amazon Forecast ARNs.
# The region is taken from the Forecast client instead of being hard-coded, so the ARNs
# always point at the same region the API calls are sent to.

dataset_group_arn=f'arn:aws:forecast:{region}:{account_id}:dataset-group/modelo_{data_version}_{grouped_cols_name}'
# Predictor code, copied by hand from notebook 3
codigo_predictor = '01HZQRGFBR9PMEB0RNENW4X9BQ'
automl_predictor_arn = f'arn:aws:forecast:{region}:{account_id}:predictor/modelo_{data_version}_predictor_{codigo_predictor}'

In [6]:
# Verify communication with Amazon Forecast by issuing a list call.
# NOTE(review): presumably the response is a non-empty dict, so the assert itself would
# always pass and the real check is that the call does not raise — confirm.
assert forecast.list_predictors()

### Backtest export job

In [7]:
# Name of the backtest export job, derived from the model version
backtest_export_job_name = f"modelo_{data_version}_predictor_export"
backtest_export_job_name

'modelo_2_predictor_export'

In [8]:
# Create the backtest export job. The job name is fixed, so re-running this cell used to
# fail with ResourceAlreadyExistsException and leave backtest_export_job_arn undefined;
# in that case the already existing job is looked up and reused instead.
try:
    response = forecast.create_predictor_backtest_export_job(
        PredictorBacktestExportJobName=backtest_export_job_name,
        PredictorArn=automl_predictor_arn,
        Destination={
            'S3Config': {
                'Path': backtest_export_path,
                'RoleArn': role
            }
        },
        Format='CSV'
    )
except forecast.exceptions.ResourceAlreadyExistsException:
    # Only the first page of results is inspected; enough while the predictor has few export jobs
    existing_export_jobs = forecast.list_predictor_backtest_export_jobs(
        Filters=[{'Key': 'PredictorArn', 'Value': automl_predictor_arn, 'Condition': 'IS'}]
    )['PredictorBacktestExportJobs']
    matching_export_jobs = [
        job for job in existing_export_jobs
        if job['PredictorBacktestExportJobName'] == backtest_export_job_name
    ]
    if not matching_export_jobs:
        # The name clashes with a job we cannot find: surface the original error
        raise
    response = matching_export_jobs[0]
backtest_export_job_arn = response['PredictorBacktestExportJobArn']

In [10]:
# Poll the export job until it reaches a terminal status, so the S3 read in the next
# section cannot run before the export has finished (e.g. under "Run All").
print(f"Waiting for Backtest export job with ARN {backtest_export_job_arn} to be completed. \nCurrent Status:")
backtest_export_poll_seconds = 30
backtest_export_terminal_statuses = {'ACTIVE', 'CREATE_FAILED', 'CREATE_STOPPED'}
while True:
    backtest_export_status = forecast.describe_predictor_backtest_export_job(
        PredictorBacktestExportJobArn=backtest_export_job_arn)['Status']
    if backtest_export_status in backtest_export_terminal_statuses:
        break
    time.sleep(backtest_export_poll_seconds)
if backtest_export_status != 'ACTIVE':
    # Anything other than ACTIVE means there is no usable export to evaluate
    raise RuntimeError(f"Backtest export job ended with status {backtest_export_status}")
backtest_export_status

Waiting for Backtest export job with ARN arn:aws:forecast:us-east-1:637423651905:predictor-backtest-export-job/modelo_2_predictor/modelo_2_predictor_export to be completed. 
Current Status:


'ACTIVE'

### Evaluate

In [35]:
# Read the exported backtest forecasts and keep one row per period / series with the
# evaluated forecast columns. Built in a single chain so no column is assigned on a
# sliced frame (avoids SettingWithCopyWarning) and no throw-away columns are dropped by hand.
forecast_export = wr.s3.read_csv(backtest_export_path + 'forecasted-values/')
df_forecasts = (
    forecast_export
    .assign(
        # Timestamps arrive as e.g. 2019-11-01T00:00:00Z; keep the date part as datetime64
        periodo=pd.to_datetime(forecast_export['timestamp'], format='%Y-%m-%dT%H:%M:%SZ').dt.normalize(),
        # Forecast calls the product "item_id"; ids are handled as strings everywhere else
        product_id=forecast_export['item_id'].astype(str),
    )
    .loc[:, ["periodo"] + grouped_columns + quantiles]
    .astype({quantile_column: float for quantile_column in quantiles})
)
df_forecasts.head(5)

Unnamed: 0,periodo,product_id,cluster_id,mean
0,2019-11-01,20824,B,0.009447
1,2019-12-01,20824,B,0.008175
2,2019-11-01,20638,F,0.162267
3,2019-12-01,20638,F,0.133266
4,2019-11-01,21142,E,0.021945


In [36]:
# Inspect the column dtypes of the forecast frame
df_forecasts.dtypes

periodo       datetime64[ns]
product_id            object
cluster_id            object
mean                 float64
dtype: object

- Backtest timing

In [37]:
# First and last period in the backtest; the dates are compared as strings
print('Start backtest: ', df_forecasts['periodo'].astype(str).min())
print('End backtest: ', df_forecasts['periodo'].astype(str).max())

Start backtest:  2019-11-01
End backtest:  2019-12-01


In [38]:
# First period of the backtest window; taken directly from the datetime column instead of
# converting to strings and parsing the minimum back
date_min_forecast = df_forecasts['periodo'].min()

- Backtest items check

In [39]:
# Number of distinct products present in the backtest forecasts
len(df_forecasts.product_id.unique())

771

- Read historic sales data

In [40]:
# Load the monthly sales history; the CSV has no header row, so column names are set by hand
column_names = ["periodo", "cluster_id", "product_id", "venta"]
df_ventas_mensuales = pd.read_csv(
    "C:/Users/usuario/otros/Desktop/MCD/6 Labo III/Forecasting Problem/Datasets/df_ventas_product_cluster.csv",
    header=None,
    index_col=False,
)
df_ventas_mensuales.columns = column_names
df_ventas_mensuales = df_ventas_mensuales.assign(
    periodo=lambda ventas: pd.to_datetime(ventas['periodo']),
    product_id=lambda ventas: ventas['product_id'].astype(str),
)
df_ventas_mensuales.head(5)

Unnamed: 0,periodo,cluster_id,product_id,venta
0,2017-01-01,A,20001,0.170742
1,2017-01-01,A,20003,0.64812
2,2017-01-01,A,20004,0.652958
3,2017-01-01,A,20005,0.781871
4,2017-01-01,A,20006,0.021172


In [41]:
# Inspect the column dtypes of the sales frame
df_ventas_mensuales.dtypes

periodo       datetime64[ns]
cluster_id            object
product_id            object
venta                float64
dtype: object

In [42]:
# Number of distinct products present in the sales history
len(df_ventas_mensuales.product_id.unique())

780

- Productos sin forecast

In [43]:
# Products that appear in only one of the two sources (sales history vs. forecasts)
total_productos = df_ventas_mensuales.product_id.drop_duplicates().to_list()
productos_forecast = df_forecasts.product_id.drop_duplicates().to_list()
set1 = set(total_productos)
set2 = set(productos_forecast)
# Elements that are in set1 but not in set2
elementos_en_set1_no_en_set2 = set1 - set2
# Elements that are in set2 but not in set1
elementos_en_set2_no_en_set1 = set2 - set1
# Union of both differences
elementos_no_comunes = elementos_en_set1_no_en_set2.union(elementos_en_set2_no_en_set1)
# Sorted so the displayed list is the same on every run (set iteration order is not stable)
sorted(elementos_no_comunes)

['21214',
 '20127',
 '21087',
 '20962',
 '20210',
 '20703',
 '20686',
 '20995',
 '20975']

In [44]:
# Earliest sales period among the products that are not common to both sources
df_ventas_mensuales[df_ventas_mensuales.product_id.isin(list(elementos_no_comunes))].periodo.min()
# They only have history since September, which does not reach the amount of history required by AWS Forecast.
# It requires forecast horizon (2 months) * 2 = 4 months of history before the backtest period

Timestamp('2019-09-01 00:00:00')

In [45]:
# First and last period of the sales history
print('Start historical sales: ', df_ventas_mensuales['periodo'].min())
print('End historical sales: ', df_ventas_mensuales['periodo'].max())

Start historical sales:  2017-01-01 00:00:00
End historical sales:  2019-12-01 00:00:00


- Unión info histórica y forecast

In [46]:
# Dtypes of the forecast frame, shown before merging it with the sales frame
df_forecasts.dtypes

periodo       datetime64[ns]
product_id            object
cluster_id            object
mean                 float64
dtype: object

In [47]:
# Dtypes of the sales frame, shown before merging it with the forecast frame
df_ventas_mensuales.dtypes

periodo       datetime64[ns]
cluster_id            object
product_id            object
venta                float64
dtype: object

In [48]:
# For the months where actual sales and forecasts coexist, outer-merge both sources on
# period + series keys so rows present in only one of them are kept (with NaN on the other side)
backtest_forecasts = df_forecasts[df_forecasts['periodo'] >= date_min_forecast]
backtest_sales = df_ventas_mensuales[df_ventas_mensuales['periodo'] >= date_min_forecast]
df_forecasts_historic = pd.merge(backtest_forecasts, backtest_sales, how='outer', on=['periodo'] + grouped_columns)
df_forecasts_historic = df_forecasts_historic[['periodo'] + grouped_columns + ['venta'] + quantiles]
# For the historical months, before the forecast window, the sales rows are concatenated.
# ignore_index=True gives the result a fresh unique index instead of repeating the labels
# of the two inputs.
df_forecasts_historic = pd.concat(
    [df_ventas_mensuales[df_ventas_mensuales['periodo'] < date_min_forecast], df_forecasts_historic],
    ignore_index=True,
)
df_forecasts_historic.head(5)

Unnamed: 0,periodo,cluster_id,product_id,venta,mean
0,2017-01-01,A,20001,0.170742,
1,2017-01-01,A,20003,0.64812,
2,2017-01-01,A,20004,0.652958,
3,2017-01-01,A,20005,0.781871,
4,2017-01-01,A,20006,0.021172,


In [49]:
# Row-count check of the concatenation: products that are not common to both sources are
# removed first, because they make the counts differ and that is expected.
# NOTE(review): the recorded output of this cell is False, so the equality does not hold
# even after that exclusion. Presumably the outer merge also keeps product/cluster
# combinations that exist in only one source, and filtering by product_id alone does not
# remove them — confirm.
df_forecasts_historic_check = df_forecasts_historic[~df_forecasts_historic.product_id.isin(list(elementos_no_comunes))]
df_ventas_mensuales_check = df_ventas_mensuales[~df_ventas_mensuales.product_id.isin(list(elementos_no_comunes))]
len(df_forecasts_historic_check) == len(df_forecasts[df_forecasts['periodo']>=date_min_forecast]) + len(df_ventas_mensuales_check[df_ventas_mensuales_check['periodo']<date_min_forecast])

False

In [50]:
# Cast actuals and forecasts to float, add the monthly period and fix the column order
value_columns = ['venta'] + quantiles
df_forecasts_historic = df_forecasts_historic.astype({column: float for column in value_columns})
df_forecasts_historic = df_forecasts_historic.assign(
    month=pd.to_datetime(df_forecasts_historic['periodo']).dt.to_period('M')
)
df_forecasts_historic = df_forecasts_historic[['periodo', 'month'] + grouped_columns + value_columns]

In [51]:
# Preview the first row to confirm the new column layout
df_forecasts_historic.head(1)

Unnamed: 0,periodo,month,product_id,cluster_id,venta,mean
0,2017-01-01,2017-01,20001,A,0.170742,


- Agregamos fuentes adicionales

In [52]:
# Load the product master data. The path uses forward slashes throughout: the previous
# "Desktop\MCD" contained the invalid escape sequence "\M".
df_productos = pd.read_csv("C:/Users/usuario/otros/Desktop/MCD/6 Labo III/Forecasting Problem/Datasets/df_products.csv")
df_productos.columns = ["product_id", "cat1", "cat2", "cat3", "brand", "sku_size"]
# Product ids are strings in the sales and forecast frames, so match that type for the join
df_productos["product_id"] = df_productos["product_id"].astype(str)
df_productos.head(5)

Unnamed: 0,product_id,cat1,cat2,cat3,brand,sku_size
0,20180,HC,ROPA LAVADO,Liquido,LIMPIEX,450
1,20332,HC,ROPA LAVADO,Liquido,LIMPIEX,120
2,20222,HC,ROPA LAVADO,Liquido,LIMPIEX,450
3,20288,HC,ROPA LAVADO,Liquido,LIMPIEX,900
4,20082,HC,ROPA MANCHAS,Ladrillo,LIMPIEX,200


In [53]:
# Attach the product attributes to every sales/forecast row.
# The lookup is reduced to one row per product_id first: a repeated id in the product
# master would otherwise duplicate sales/forecast rows in the left join and distort the
# error sums computed later. When ids are already unique this is a no-op.
df_forecasts_historic = pd.merge(
    df_forecasts_historic,
    df_productos.drop_duplicates(subset="product_id"),
    how="left",
    on="product_id",
)
df_forecasts_historic.head(5)

Unnamed: 0,periodo,month,product_id,cluster_id,venta,mean,cat1,cat2,cat3,brand,sku_size
0,2017-01-01,2017-01,20001,A,0.170742,,HC,ROPA LAVADO,Liquido,ARIEL,3000.0
1,2017-01-01,2017-01,20003,A,0.64812,,FOODS,ADEREZOS,Mayonesa,NATURA,475.0
2,2017-01-01,2017-01,20004,A,0.652958,,FOODS,ADEREZOS,Mayonesa,NATURA,240.0
3,2017-01-01,2017-01,20005,A,0.781871,,FOODS,ADEREZOS,Mayonesa,NATURA,120.0
4,2017-01-01,2017-01,20006,A,0.021172,,HC,VAJILLA,Cristalino,LIMPIEX,750.0


- Definición funciones de medición de error

In [54]:
def text_format(val, threshold=0.3):
    """Return the CSS colour rule used to highlight an error value in a styled table.

    Parameters
    ----------
    val : number
        Error value of one table cell.
    threshold : float, default 0.3
        Values strictly above it are highlighted as bad.

    Returns
    -------
    str
        'color: salmon' when ``val`` is above ``threshold``, 'color: green' otherwise,
        and 'color: black' for missing values, so that an error that could not be
        computed is not shown as a good result.
    """
    if pd.isna(val):
        return 'color: black'
    color = 'salmon' if val > threshold else 'green'
    return 'color: %s' % color

In [55]:
def format_zero(val):
    """Return the CSS colour rule for a cell according to the sign of its value.

    Text cells are rendered black, negative numbers salmon and every other number green.
    ``isinstance`` is used so that ``str`` subclasses (e.g. ``numpy.str_``) are also
    treated as text instead of reaching the numeric comparison.
    """
    if isinstance(val, str):
        return 'color: black'
    color = 'salmon' if val < 0 else 'green'
    return 'color: %s' % color

In [56]:
def forecasts_error_total(df_forecasts, quantile, target_value='venta', time_column='month',
                          min_date=None, group_cols=None):
    """Compute the overall error ratio per period: sum(|actual - forecast|) / sum(actual).

    Rows with ``periodo`` earlier than ``min_date`` are discarded. Actuals and forecasts
    are summed per period and series, the absolute error is taken per series, and the
    ratio of the summed errors to the summed actuals is reported for every period.

    Parameters
    ----------
    df_forecasts : DataFrame
        Must contain 'periodo', ``time_column``, the series columns, ``target_value``
        and ``quantile``.
    quantile : str
        Name of the forecast column to evaluate.
    target_value : str, default 'venta'
        Name of the column with the actual values.
    time_column : str, default 'month'
        Column that identifies the reporting period.
    min_date : datetime-like, optional
        First period to evaluate. Defaults to the notebook-level ``date_min_forecast``.
    group_cols : list of str, optional
        Columns that identify one series. Defaults to the notebook-level ``grouped_columns``.

    Returns
    -------
    DataFrame
        Columns ``[time_column, 'error']``. Infinite ratios (actuals summing to zero)
        are reported as NaN.

    Notes
    -----
    ``sum`` skips missing values, so a series with actuals but no forecast (or the other
    way round) contributes its whole value to the error.
    """
    if min_date is None:
        min_date = date_min_forecast
    if group_cols is None:
        group_cols = grouped_columns

    backtest = df_forecasts[df_forecasts['periodo'] >= min_date]
    per_series = (
        backtest.groupby([time_column] + list(group_cols))[[target_value, quantile]]
        .sum()
        .reset_index()
    )
    per_series['error_absoluto'] = (per_series[target_value] - per_series[quantile]).abs()

    per_period = per_series.groupby([time_column])[['error_absoluto', target_value]].sum()
    error_ratio = per_period['error_absoluto'] / per_period[target_value]
    # Division by zero actuals yields +/-inf; report it as missing instead
    return error_ratio.replace([np.inf, -np.inf], np.nan).reset_index(name='error')

In [57]:
def forecasts_error_grouped(df_forecasts, feature, time_column, target_value, quantile,
                            min_date=None, group_cols=None):
    """Compute the error ratio sum(|actual - forecast|) / sum(actual) per feature value and period.

    Rows with ``periodo`` earlier than ``min_date`` are discarded. Actuals and forecasts
    are summed per period, series and ``feature``; the absolute error is taken per series
    and then aggregated per period and feature value.

    Parameters
    ----------
    df_forecasts : DataFrame
        Must contain 'periodo', ``time_column``, the series columns, ``feature``,
        ``target_value`` and ``quantile``.
    feature : str
        Column whose values become the rows of the result. It may also be one of the
        series columns.
    time_column : str
        Column that identifies the reporting period; its values become the result columns.
    target_value : str
        Name of the column with the actual values.
    quantile : str
        Name of the forecast column to evaluate.
    min_date : datetime-like, optional
        First period to evaluate. Defaults to the notebook-level ``date_min_forecast``.
    group_cols : list of str, optional
        Columns that identify one series. Defaults to the notebook-level ``grouped_columns``.

    Returns
    -------
    DataFrame
        Indexed by ``feature``, one column per period plus 'mean_error' (row mean over
        the periods, ignoring NaN). Infinite ratios are reported as NaN.
    """
    if min_date is None:
        min_date = date_min_forecast
    if group_cols is None:
        group_cols = grouped_columns

    backtest = df_forecasts[df_forecasts['periodo'] >= min_date]
    # De-duplicate the keys (order preserved) so that a feature which is itself a series
    # column does not appear twice in the groupby
    series_keys = list(dict.fromkeys([time_column] + list(group_cols) + [feature]))
    per_series = backtest.groupby(series_keys)[[target_value, quantile]].sum().reset_index()
    per_series['error_absoluto'] = (per_series[target_value] - per_series[quantile]).abs()

    totals = per_series.groupby([time_column, feature])[['error_absoluto', target_value]].sum()
    error_ratio = (totals['error_absoluto'] / totals[target_value]).reset_index(name='error')

    error_table = error_ratio.pivot(index=[feature], columns=[time_column], values='error')
    # Division by zero actuals yields +/-inf; report it as missing instead
    error_table = error_table.replace([np.inf, -np.inf], np.nan)
    error_table['mean_error'] = error_table.mean(axis=1, skipna=True)
    return error_table

In [58]:
def forecasts_error_item(df_forecasts, features, time_column, quantile, target_value, min_date=None):
    """Compute the error ratio |actual - forecast| / actual per product and period.

    Rows with ``periodo`` earlier than ``min_date`` are discarded. Actuals and forecasts
    are first summed per period, product and ``features`` (i.e. across every other
    dimension such as the cluster), and the error is computed on those totals, so
    deviations of opposite sign inside one product offset each other.

    Parameters
    ----------
    df_forecasts : DataFrame
        Must contain 'periodo', ``time_column``, 'product_id', the ``features`` columns,
        ``target_value`` and ``quantile``.
    features : list of str
        Extra columns kept next to 'product_id' in the result index (may be empty).
    time_column : str
        Column that identifies the reporting period; its values become the result columns.
    quantile : str
        Name of the forecast column to evaluate.
    target_value : str
        Name of the column with the actual values.
    min_date : datetime-like, optional
        First period to evaluate. Defaults to the notebook-level ``date_min_forecast``.

    Returns
    -------
    DataFrame
        Indexed by 'product_id' (+ ``features``), one column per period plus
        'mean_error' (row mean over the periods, ignoring NaN). Infinite ratios are
        reported as NaN.
    """
    if min_date is None:
        min_date = date_min_forecast

    backtest = df_forecasts[df_forecasts['periodo'] >= min_date]
    # De-duplicate the keys (order preserved) in case features repeats 'product_id'
    item_keys = list(dict.fromkeys([time_column, 'product_id'] + list(features)))
    per_item = backtest.groupby(item_keys)[[target_value, quantile]].sum().reset_index()
    # Keys are unique after the aggregation above, so the ratio can be taken row by row
    per_item['error'] = (per_item[target_value] - per_item[quantile]).abs() / per_item[target_value]

    index_keys = [key for key in item_keys if key != time_column]
    error_table = per_item.pivot(index=index_keys, columns=[time_column], values='error')
    # Division by zero actuals yields +/-inf; report it as missing instead
    error_table = error_table.replace([np.inf, -np.inf], np.nan)
    error_table['mean_error'] = error_table.mean(axis=1, skipna=True)
    return error_table

- Evaluación total general

In [59]:
# Overall error per month for the "mean" forecast; only the error column is coloured
forecasts_error_total(df_forecasts_historic, "mean").style.applymap(text_format, subset=['error'])

Unnamed: 0,month,error
0,2019-11,0.305302
1,2019-12,0.371828


- Evaluación por categorías

In [61]:
# Error per month broken down by first-level category (cat1)
forecasts_error_grouped(
    df_forecasts_historic,
    feature='cat1',
    time_column='month',
    target_value='venta',
    quantile='mean',
).style.applymap(text_format)

month,2019-11,2019-12,mean_error
cat1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FOODS,0.332182,0.416481,0.374331
HC,0.302516,0.344003,0.323259
PC,0.291148,0.415579,0.353364
REF,3.093701,4.017806,3.555753


In [62]:
# Error per month broken down by second-level category (cat2)
forecasts_error_grouped(
    df_forecasts_historic,
    feature='cat2',
    time_column='month',
    target_value='venta',
    quantile='mean',
).style.applymap(text_format)

month,2019-11,2019-12,mean_error
cat2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ADEREZOS,0.3189,0.35889,0.338895
CABELLO,0.310943,0.493116,0.402029
DENTAL,0.682319,0.407414,0.544866
DEOS,0.268307,0.315467,0.291887
HOGAR,0.251878,0.367797,0.309837
OTROS,0.359995,0.709885,0.53494
PIEL1,0.437722,0.428517,0.43312
PIEL2,0.20562,0.275279,0.240449
PROFESIONAL,0.478804,0.501063,0.489934
ROPA ACONDICIONADOR,0.384644,0.542472,0.463558


- Evaluación por items principales

In [63]:
# Product ids singled out for the item-level evaluation below
items_relevantes = ["20001", "20002", "20003", "20004", "20005", "20006", "20007", "20008", "20009", "20010"]

In [64]:
# forecast_mean: per-product error for the relevant items only
errores_items_relevantes = forecasts_error_item(
    df_forecasts_historic[df_forecasts_historic.product_id.isin(items_relevantes)],
    features=[],
    time_column='month',
    target_value='venta',
    quantile='mean',
)
(
    errores_items_relevantes
    .reset_index()
    .rename_axis(None, axis=1)
    .set_index("product_id")
    .style.applymap(text_format)
)

Unnamed: 0_level_0,2019-11,2019-12,mean_error
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20001,0.088894,0.020439,0.054666
20002,0.091255,0.390945,0.2411
20003,0.173704,0.063053,0.118378
20004,0.234496,0.194027,0.214262
20005,0.159404,0.060226,0.109815
20006,0.289173,0.177205,0.233189
20007,0.183968,0.045133,0.114551
20008,0.117741,1.211405,0.664573
20009,0.226116,0.118164,0.17214
20010,0.075013,0.330316,0.202664


In [65]:
# Last six rows (by period) of product 20008, to inspect actuals vs. forecast per cluster
df_forecasts_historic[df_forecasts_historic.product_id == "20008"].sort_values(by = "periodo").tail(6)

Unnamed: 0,periodo,month,product_id,cluster_id,venta,mean,cat1,cat2,cat3,brand,sku_size
198611,2019-12-01,2019-12,20008,A,,0.220247,HC,VAJILLA,Opaco,LIMPIEX,750.0
195643,2019-12-01,2019-12,20008,J,144.92924,326.576143,HC,VAJILLA,Opaco,LIMPIEX,750.0
194093,2019-12-01,2019-12,20008,F,2.73946,6.048355,HC,VAJILLA,Opaco,LIMPIEX,750.0
193827,2019-12-01,2019-12,20008,G,2.6558,11.201052,HC,VAJILLA,Opaco,LIMPIEX,750.0
200967,2019-12-01,2019-12,20008,E,1.63113,3.875645,HC,VAJILLA,Opaco,LIMPIEX,750.0
207973,2019-12-01,2019-12,20008,D,1.91344,2.291615,HC,VAJILLA,Opaco,LIMPIEX,750.0
