<a href="https://colab.research.google.com/github/lmontaldo/meli_prueba/blob/Jupyter-Colabs/Series_de_tiempo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Series de tiempo

Problema: pronosticar la cantidad de unidades diarias que van a
vender 3 categorías distintas de MELI

El objetivo de este desafío es construir un modelo de forecast que permita estimar las
ventas de 3 semanas a nivel diario utilizando la historia de ventas de la categoría. Es
decir, predecir las ventas de los siguientes 21 días. 


Las métricas y la medición de la
performance del forecast son un punto clave de este desafío.


TIP: Dividir el dataset en entrenamiento, testing y validación correctamente es muy
importante en problemas de forecasting!

In [None]:
! pip install pycaret

In [None]:
# import the regression module
from pycaret.regression import *

In [None]:
import numpy as np
import pandas as pd, datetime
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
from time import time
import os
from math import sqrt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import itertools
import statsmodels.api as sm
from statsmodels.tsa.stattools import acf,pacf
from statsmodels.tsa.arima_model import  ARIMA
from sklearn import model_selection
from sklearn.metrics import mean_squared_error, r2_score
from pandas import DataFrame
import xgboost as xgb
#from fbprophet import Prophet
import warnings
warnings.filterwarnings('ignore')

In [None]:
import datetime
import plotly
import plotly.express as px

In [None]:
# Clone the project repository into the runtime's working directory.
# NOTE(review): the data below is read straight from the raw GitHub URL, so
# this clone looks unnecessary — confirm before removing.
! git clone https://github.com/lmontaldo/meli_prueba.git


In [None]:
# Raw URL of the CSV holding the sales series.
s='https://raw.githubusercontent.com/lmontaldo/meli_prueba/Datos/series.csv'

In [None]:
# Read the CSV: first row is the header, fields are comma-separated.
df=pd.read_csv(s,low_memory=False, header=[0],sep=',')

In [None]:
df.head()

In [None]:
# Use the DATE column as the index.
df=df.set_index('DATE')

In [None]:
# Parse the index into datetimes.
df.index = pd.to_datetime(df.index)

In [None]:
# Copy of df with DATE back as a regular column.
# NOTE(review): df_si is reassigned further down before it is ever read, so this assignment looks redundant.
df_si=df.reset_index()

In [None]:
df.shape

In [None]:
# First and last dates present in the data.
df.index.min(), df.index.max()

In [None]:
# Calendar features read from the datetime index
# (pandas dayofweek convention: Monday=0 ... Sunday=6).
for field in ('year', 'month', 'day', 'dayofweek'):
    df[field] = getattr(df.index, field)

In [None]:
# Column names after adding the calendar features.
df.columns

In [None]:
# Earliest three rows; sort_index returns a sorted copy, df itself is not reordered here.
df.sort_index().head(3)

## Análisis de las series

### Visualizaciones e insights preliminares

In [None]:
# Flat copy of df (DATE as a column), now including the calendar features.
df_si=df.reset_index()

In [None]:
# Total units sold per date and category.
gp_df=df_si.groupby(['DATE', 'CATEGORY']).agg({'UNITS_SOLD':'sum'}).reset_index()

In [None]:
gp_df.head()

In [None]:
# Interactive line chart, one trace per category.
fig = px.line(gp_df, x='DATE', y='UNITS_SOLD', color='CATEGORY', title='Unidades vendidas por categoría')
fig.show()

In [None]:
# Sort df chronologically. df_si above was taken before this sort and keeps the original row order.
df=df.sort_index(ascending=True)

In [None]:
df.columns

In [None]:
# Most recent rows with zero units sold.
df[df['UNITS_SOLD']==0].tail()

In [None]:
# Missing values per column.
df_si.isnull().sum()

In [None]:
# Units sold per month, year and category.
# NOTE(review): gp_t is not referenced by any later cell shown here.
gp_t=df_si.groupby(["month", 'year','CATEGORY']).agg({'UNITS_SOLD':'sum'}).reset_index()

In [None]:
# Point plot of units sold by year, one line per category.
# catplot(kind="point") is the replacement for the deprecated factorplot.
sns.catplot(data = df_si, x = "year", y = "UNITS_SOLD", hue = "CATEGORY", kind = "point").set(title='Ventas según año por categoría')
plt.show()

In [None]:
# Point plot of units sold by month, one line per category.
# catplot(kind="point") is the replacement for the deprecated factorplot.
sns.catplot(data = df_si, x = "month", y = "UNITS_SOLD", hue = "CATEGORY", kind = "point").set(title='Ventas según mes por categoría')
plt.show()

In [None]:
# pandas dayofweek convention: Monday=0 ... Sunday=6.
# Observation from the original analysis: sales decrease along the axis,
# which under this convention means from Monday to Sunday.
# catplot(kind="point") is the replacement for the deprecated factorplot.
sns.catplot(data = df_si, x = "dayofweek", y = "UNITS_SOLD", hue = "CATEGORY", kind = "point").set(title='Ventas según dia de la semana por categoría')
plt.show()

### Estacionariedad y estacionalidad de las series

In [None]:
# Units sold per date and category, carrying the calendar columns along as group keys.
long_vtas=df_si.groupby(['DATE','CATEGORY','year', 'month', 'day']).agg({'UNITS_SOLD':'sum'}).reset_index()
long_vtas.head(2)

In [None]:
# Long -> wide: one row per date, one column per category.
wide_vtas=pd.pivot(long_vtas, index=['DATE'], columns = 'CATEGORY',values = 'UNITS_SOLD')
wide_vtas_si=wide_vtas.reset_index()
wide_vtas_si.head()

In [None]:
# Column-wise maximum over the rows where CATEG-1 is missing
# (its DATE entry is the latest date without CATEG-1 data).
wide_vtas_si[wide_vtas_si['CATEG-1'].isnull()].max()

In [None]:
# Keep only the dates after 2018-11-11, so the series are complete.
# NOTE(review): presumably 2018-11-11 is the last date with a missing category, per the check above — confirm.
wide=wide_vtas_si[wide_vtas_si['DATE']>'2018-11-11']

In [None]:
# Index the filtered frame by date.
wide=wide.set_index('DATE')

In [None]:
# Calendar features read from the datetime index of the wide frame
# (pandas dayofweek convention: Monday=0 ... Sunday=6).
for field in ('year', 'month', 'day', 'dayofweek'):
    wide[field] = getattr(wide.index, field)

In [None]:
wide.head()

In [None]:
# Earlier per-category log transform taken from the long frame; left disabled.
#sales_1 = np.log2(df[df.CATEGORY == 'CATEG-1']['UNITS_SOLD'])
#sales_2 = np.log2(df[df.CATEGORY == 'CATEG-2']['UNITS_SOLD'])
#sales_3 = np.log2(df[df.CATEGORY == 'CATEG-3']['UNITS_SOLD'])

In [None]:
# log2 of the daily units of each category, stored next to the raw columns.
# NOTE(review): log2 yields -inf for days with zero units — confirm none remain after the date filter.
logs=wide.copy()
logs['v_c1'] = np.log2(logs['CATEG-1'])
logs['v_c2'] = np.log2(logs['CATEG-2'])
logs['v_c3'] = np.log2(logs['CATEG-3'])
logs.head()

In [None]:
logs.columns

In [None]:
# Flat copy with DATE as a column, used as the x axis when plotting.
logs_si=logs.reset_index()

In [None]:
# The three log series on a single set of axes.
logs_si.plot(x='DATE', y=['v_c1', 'v_c2', 'v_c3'], kind="line", figsize=(9, 8))
plt.title('series en log')
plt.show()

In [None]:
# First differences of the log2 series: each dv_cN is the row-to-row change
# of its own category's v_cN column.
logs_si['dv_c1'] = logs_si['v_c1'].diff()
logs_si['dv_c2'] = logs_si['v_c2'].diff()
logs_si['dv_c3'] = logs_si['v_c3'].diff()

In [None]:
logs_si.head()

In [None]:
# Drop the first row, whose differences are NaN because diff() has no previous value there.
# NOTE(review): not idempotent — re-running this cell removes one more row each time.
logs_si=logs_si.iloc[1:]

In [None]:
# One panel per category showing the first-differenced log sales.
fig, axs = plt.subplots(ncols=3, figsize=(12,5))
for ax, col in zip(axs, ['dv_c1', 'dv_c2', 'dv_c3']):
    logs_si.plot(x='DATE', y=[col], kind="line", ax=ax)
# Figure-level title: plt.title would only label the last panel.
fig.suptitle('Ventas de las categorias en primeras diferencias')
plt.show()

In [None]:
# Index the differenced frame by DATE. This rebinds `logs`, replacing the frame built before differencing.
logs=logs_si.set_index('DATE')

In [None]:
def _run_adf(timeseries, regression):
    """Run adfuller with autolag='AIC' for one deterministic-term specification.

    The "no constant, no trend" model is spelled 'n' in current statsmodels
    and 'nc' in older releases; when 'n' is rejected, retry with 'nc'. If the
    retry fails as well, the original error is re-raised so that data problems
    are not masked by an option-name error.
    """
    try:
        return adfuller(timeseries, autolag='AIC', regression=regression)
    except ValueError as err:
        if regression != 'n':
            raise
        try:
            return adfuller(timeseries, autolag='AIC', regression='nc')
        except ValueError:
            raise err


def test_stationarity(timeseries):
    """Plot rolling statistics and print Augmented Dickey-Fuller results.

    Plots the weekly means of the series, of its 7-observation rolling mean
    and of its 7-observation rolling standard deviation. It then prints the
    ADF statistic, p-value and critical values for three models: constant
    only, constant plus trend, and neither constant nor trend.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to test. It is resampled with 'W', so it needs a
        datetime-like index.

    Returns
    -------
    None
        Output is the figure and the printed report.
    """
    # Rolling statistics over a 7-observation window.
    roll_mean = timeseries.rolling(window=7).mean()
    roll_std = timeseries.rolling(window=7).std()

    # Weekly means keep the three curves readable.
    plt.plot(timeseries.resample('W').mean(), color='blue',label='Original')
    plt.plot(roll_mean.resample('W').mean(), color='red', label='Rolling Mean')
    plt.plot(roll_std.resample('W').mean(), color='green', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Descomposición de las series')
    plt.show(block=False)

    # (regression option, report header) for each ADF variant.
    adf_specs = [
        ('c', 'Results of Dickey-Fuller Test, model constant only:'),
        ('ct', 'Results of Dickey-Fuller Test, model constant and trend:'),
        ('n', 'Results of Dickey-Fuller Test, model no constant and trend:'),
    ]
    for regression, header in adf_specs:
        print(header)
        result = _run_adf(timeseries, regression)
        print('ADF Statistic: %f' % result[0])
        print('p-value: %f' % result[1])
        print('Critical Values:')
        for key, value in result[4].items():
            print(key, value)


# Stationarity check on dv_c1 (first-differenced log sales of category 1).
test_stationarity(logs.dv_c1)

In [None]:
# Same check on the dv_c2 column.
test_stationarity(logs.dv_c2)

In [None]:
# Same check on the dv_c3 column.
test_stationarity(logs.dv_c3)

$\text{H0) Raíz unitaria, H1) No raíz unitaria}$. Cuando el p-valor está por debajo del nivel de significación elegido (0.05), se rechaza $H_0$. Para el modelo con constante solamente y para el modelo con constante y tendencia los p-valores son inferiores a 0.05, por lo que se rechaza $H_0$ (no hay raíz unitaria).
En primeras diferencias, las series son estacionarias. Con esto en mente se podría pensar en modelizar las series con modelos SARIMA, detectando patrones estacionales, y probar distintos ajustes para compararlos con los resultados que siguen a continuación.

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
from matplotlib import pyplot
result1=seasonal_decompose(logs.dv_c1, model='additive', period=36)
result1.plot()
pyplot.show()

In [None]:
result2=seasonal_decompose(logs.dv_c2, model='additive', period=24)
result2.plot()
pyplot.show()

In [None]:
result3=seasonal_decompose(logs.dv_c3, model='additive',period=12)
result3.plot()
pyplot.show()

## Modelos para las series temporales univariadas

In [None]:
# NOTE(review): leftover inspection of df; the modelling cells below work from `wide`.
df.columns

#### Defino las muestras de train y test 

In [None]:
# Number of rows that corresponds to an 80% training share.
wide.shape[0]*0.8

In [None]:
# Move DATE back to a column so rows can be sliced by position.
# NOTE(review): not idempotent — running this cell twice adds an extra 'index' column.
wide=wide.reset_index()

In [None]:
# First 707 rows form the training set.
# NOTE(review): 707 is hard-coded, presumably from the 80% figure computed above; it will not follow changes in the data.
train=wide.iloc[:707,:]
train.tail()

In [None]:
# Remaining rows form the hold-out test set.
test=wide.iloc[707:,:]
test.head()

In [None]:
train.shape, test.shape

In [None]:
train.columns

#### Análisis para la categoría 1

In [None]:
train1=train[['year',	'month'	,'day', 'CATEG-1']]

In [None]:
test1=test[['year',	'month'	,'day', 'CATEG-1']]

In [None]:
wide.columns

In [None]:
wide1=wide[['year',	'month'	,'day', 'CATEG-1']]

In [None]:
# initialize setup
s = setup(data = train1, test_data = test1, target = 'CATEG-1', fold_strategy = 'timeseries', numeric_features = ['year', 'month','day'],fold = 3, transform_target = True, session_id = 123)


In [None]:
best = compare_models(sort = 'MAE')

The best model using 3 fold cross-validation based on Mean Absolute Error (MAE) is AdaBoost Regressor.

In [None]:
train1.columns

In [None]:
# Last training rows, to see where the training window ends.
train.tail()# 2021-05-19 (date noted by the original author)

In [None]:
# Calendar features for the forecast horizon: 21 consecutive days starting on
# 2021-11-13. `periods` fixes the horizon length directly; an inclusive
# start/end pair of 2021-11-13 .. 2021-12-04 would span 22 days.
future_dates = pd.date_range(start = '2021-11-13', periods = 21, freq = 'D')
future_df = pd.DataFrame({
    'month': [d.month for d in future_dates],
    'year': [d.year for d in future_dates],
    'day': [d.day for d in future_dates],
})
future_df.head()

In [None]:
# Forecast category 1 over the future calendar rows with the selected model.
predictions_future = predict_model(best, data=future_df)
predictions_future.head()

In [None]:
wide1.tail()

In [None]:
# Stack the full history and the forecast rows; the plot below reads history
# from 'CATEG-1' and the forecast from 'Label'.
concat_df = pd.concat([wide1,predictions_future], axis=0)
concat_df.head()

In [None]:
# Rebuild a datetime from the year/month/day columns.
concat_df['fechas']=pd.to_datetime(concat_df[['year', 'month', 'day']])

In [None]:
# NOTE(review): in-place and not idempotent — re-running this cell on its own fails because 'fechas' is no longer a column.
concat_df.set_index('fechas', inplace=True)

In [None]:
# History and forecast on one chart.
fig = px.line(concat_df, x=concat_df.index, y=['CATEG-1', 'Label'], template = 'plotly_dark',title='proyecciones de ventas categoría 1')
fig.show()

#### Análisis para la categoría 2

In [None]:
train2=train[['year',	'month'	,'day', 'CATEG-2']]
test2=test[['year',	'month'	,'day', 'CATEG-2']]

In [None]:
wide2=wide[['year',	'month'	,'day', 'CATEG-2']]

In [None]:
s = setup(data = train2, test_data = test2, target = 'CATEG-2', fold_strategy = 'timeseries', numeric_features = ['year', 'month','day'],fold = 3, transform_target = True, session_id = 123)

In [None]:
best2 = compare_models(sort = 'MAE')

In [None]:
predictions_future2 = predict_model(best2, data=future_df)
predictions_future2.head()

In [None]:
# History of category 2 followed by the forecast rows, indexed by calendar date.
concat_df2 = (
    pd.concat([wide2, predictions_future2], axis=0)
    .assign(fechas=lambda frame: pd.to_datetime(frame[['year', 'month', 'day']]))
    .set_index('fechas')
)
fig = px.line(
    concat_df2,
    x=concat_df2.index,
    y=['CATEG-2', 'Label'],
    template='plotly_dark',
    title='proyecciones de ventas categoría 2',
)
fig.show()

#### Análisis para la categoría 3

In [None]:
# cat 3
train3=train[['year',	'month'	,'day', 'CATEG-3']]


In [None]:
test3=test[['year',	'month'	,'day', 'CATEG-3']]
wide3=wide[['year',	'month'	,'day', 'CATEG-3']]


In [None]:
s = setup(data = train3, test_data = test3, target = 'CATEG-3', fold_strategy = 'timeseries', numeric_features = ['year', 'month','day'],fold = 3, transform_target = True, session_id = 123)


In [None]:
best3 = compare_models(sort = 'MAE')

In [None]:
predictions_future3 = predict_model(best3, data=future_df)
predictions_future3.head()

In [None]:
# History of category 3 followed by the forecast rows, indexed by calendar date.
concat_df3 = (
    pd.concat([wide3, predictions_future3], axis=0)
    .assign(fechas=lambda frame: pd.to_datetime(frame[['year', 'month', 'day']]))
    .set_index('fechas')
)
fig = px.line(
    concat_df3,
    x=concat_df3.index,
    y=['CATEG-3', 'Label'],
    template='plotly_dark',
    title='proyecciones de ventas categoría 3',
)
fig.show()

## Fuentes

* https://builtin.com/data-science/time-series-forecasting-python

* https://towardsdatascience.com/train-test-split-and-cross-validation-in-python-80b61beca4b6

* https://www.datacamp.com/tutorial/tutorial-time-series-forecasting

* https://towardsdatascience.com/time-series-forecasting-based-on-the-trend-and-seasonal-components-26b92866e548