In [1]:
!pip install kaggle
!pip install permetrics

Collecting urllib3<1.25,>=1.21.1
  Downloading urllib3-1.24.3-py2.py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 4.4 MB/s eta 0:00:01
Installing collected packages: urllib3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.25.10
    Uninstalling urllib3-1.25.10:
      Successfully uninstalled urllib3-1.25.10
[31mERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

pyppeteer 0.2.2 requires urllib3<2.0.0,>=1.25.8, but you'll have urllib3 1.24.3 which is incompatible.
influxdb 5.3.0 requires msgpack==0.6.1, but you'll have msgpack 1.0.0 which is incompatible.[0m
Successfully installed urllib3-1.24.3
Collecting permetrics
  Downloading permetrics-1.1.3-py3-none-any.whl (63 kB)
[K     |███████████

In [2]:
! mkdir ~/.kaggle

In [3]:
! cp kaggle.json ~/.kaggle/

In [4]:
!kaggle competitions download -c m5-forecasting-accuracy

Downloading m5-forecasting-accuracy.zip to /workspace
 98%|█████████████████████████████████████▎| 45.0M/45.8M [00:02<00:00, 20.6MB/s]
100%|██████████████████████████████████████| 45.8M/45.8M [00:02<00:00, 18.4MB/s]


In [5]:
!unzip m5-forecasting-accuracy.zip

Archive:  m5-forecasting-accuracy.zip
  inflating: calendar.csv            
  inflating: sales_train_evaluation.csv  
  inflating: sales_train_validation.csv  
  inflating: sample_submission.csv   
  inflating: sell_prices.csv         


In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [7]:
path_m5 = "./"
calendar = pd.read_csv(path_m5+"calendar.csv")
prices = pd.read_csv(path_m5+"sell_prices.csv")
sales = pd.read_csv(path_m5+"sales_train_evaluation.csv")

In [8]:
calendar = calendar.rename(columns={"d":"day"})

In [9]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns: #columns毎に処理
        col_type = df[col].dtypes
        if col_type in numerics: #numericsのデータ型の範囲内のときに処理を実行. データの最大最小値を元にデータ型を効率的なものに変更
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [10]:
def encode_categorical(df, cols):
    
    for col in cols:
        # Leave NaN as it is.
        le = LabelEncoder()
        #not_null = df[col][df[col].notnull()]
        df[col] = df[col].fillna('nan')
        df[col] = pd.Series(le.fit_transform(df[col]), index=df.index)

    return df

In [11]:
calendar = encode_categorical(
    calendar, ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]
).pipe(reduce_mem_usage)

sales = encode_categorical(
    sales, ["item_id", "dept_id", "cat_id", "store_id", "state_id"],
).pipe(reduce_mem_usage)

prices = encode_categorical(prices, ["item_id", "store_id"]).pipe(
    reduce_mem_usage
)

Mem. usage decreased to  0.07 Mb (66.9% reduction)
Mem. usage decreased to 95.14 Mb (79.0% reduction)
Mem. usage decreased to 45.67 Mb (78.1% reduction)


In [12]:
prices_dates = pd.merge(prices, calendar[["wm_yr_wk", "date", "day", "weekday", "wday", "month"]],
                  on="wm_yr_wk", how="left")

In [13]:
sales_ts = pd.melt(sales,
                 id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], 
                 var_name = 'day', value_name = 'demand')

In [14]:
cols_group = ['item_id', 'store_id']
prices_dates['full_price'] = prices_dates.groupby(cols_group)['sell_price'].transform('max')

In [15]:
prices_dates['discount_pct'] = (prices_dates['full_price']-prices_dates['sell_price'])/prices_dates['full_price']*100

In [16]:
sales_ts = pd.merge(sales_ts, prices_dates[["day", "sell_price", "full_price", "discount_pct"]], on="day")
df_all = sales_ts.groupby("day").agg({"demand":"sum", 
                                      "sell_price":"mean", 
                                      "full_price":"mean", 
                                      "discount_pct":"mean"}).reset_index()
df_all = pd.merge(df_all, prices_dates[["day", "date", "weekday", "wday", "month"]], on="day")
df_all = df_all.rename(columns={"date":"ds", "demand":"y"})
df_all = df_all[["ds", "y"]]
df_all['ds'] = df_all['ds'].astype(np.datetime64)

MemoryError: Unable to allocate 10.4 TiB for an array with shape (1429422331730,) and data type int64

In [None]:
df_cat = sales_ts.groupby(["cat_id", "day"]).agg({"demand":"sum"}).reset_index()
df_cat = pd.merge(df_cat, calendar[["day", "date"]], on="day")
df_cat = df_cat.rename(columns={"date":"ds", "demand":"y"})
df_cat = df_cat[["ds", "cat_id", "y"]]
df_cat['ds'] = df_cat['ds'].astype(np.datetime64)

In [None]:
len(df_all)

In [33]:
df_train = df_all[:-365]
df_test = df_all[-365:]

NameError: ignored

In [34]:
print(f"Train rows: {len(df_train)}")
print(f"Test rows: {len(df_test)}") 

NameError: ignored

In [156]:
m =Prophet(
                     seasonality_mode = 'multiplicative',
                     daily_seasonality=True,
                     weekly_seasonality=True,
                     yearly_seasonality = True)
m.fit(df_train)
future = m.make_future_dataframe(periods=len(df_test))
forecast = m.predict(future)

{'changepoint_prior_scale': 0.1, 'holidays_prior_scale': 0.1, 'n_changepoints': 100}


In [160]:
len(forecast)

1941

In [157]:
forecast['ds'] = forecast['ds'].astype(np.datetime64)
df_all['ds'] = df_all['ds'].astype(np.datetime64)
forecast = forecast[["ds", "yhat"]]
df_all = pd.merge(df_all, forecast, on=["ds"], how="left")

In [163]:
df_all = df_all.sort_values("ds")

In [164]:
len(df_all)

1941

In [165]:

y_true = df_all['y'].values
y_pred = df_all['yhat'].values
print("Forecast Daily error Prophet:")
print_metris(y_true, y_pred)

Forecast Daily error Prophet:
MAE 2413.94
MSE 11257520.295
RMSE 3355.223
MAPE 5.145
SMAPE 0.075
MAAPE 0.074
R2 0.8


In [166]:
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(go.Scatter(x=df_all["ds"], y=df_all["y"],
                          mode='lines',
                          name='Sales',
                          marker_color='rgb(121,121,121)'))
fig.add_trace(go.Scatter(x=df_all["ds"], y=df_all["yhat"],
                          mode='lines+markers',
                          name='Prediccion',
                          marker_color='rgb(16,52,166)'))

fig.update_traces(mode='markers+lines')
fig.update_xaxes(rangeslider_visible=True)
fig.update_layout({
    'height': 800,
    'template': 'plotly_white',
    'plot_bgcolor': 'rgba(0,0,0,0)',
    'paper_bgcolor': 'rgba(0,0,0,0)',
    'xaxis_title': 'Fecha',
    'yaxis_title': 'Unidades',
    'yaxis_tickformat': ',.0f',
    'yaxis.rangemode': 'tozero',
    'legend': {'orientation': 'h',
                'yanchor': 'bottom',
                'y': 1.02,
                'xanchor': 'right',
                'x': 1
                }
})

In [104]:
df_all.head()

Unnamed: 0,ds,y,yhat
0,2011-01-29,32631.0,34119.750979
1,2011-01-30,25572.0,34168.583774
2,2011-01-31,23688.0,34512.811513
3,2011-02-01,29260.0,34340.737009
4,2011-02-02,33877.0,34510.630665


In [105]:
df_all["error"] = df_all["y"]-df_all["yhat"]

In [None]:
df_all["error"]

In [106]:
prices.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,0,1437,11325,9.578125
1,0,1437,11326,9.578125
2,0,1437,11327,8.257812
3,0,1437,11328,8.257812
4,0,1437,11329,8.257812


In [127]:
calendar.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,30,4,4,2,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,30,4,4,2,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,30,4,4,2,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,30,4,4,2,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,30,4,4,2,1,0,1


In [175]:
df_predictions = pd.DataFrame()
for category in df_cat["cat_id"].unique():
  print(category)
  df_tmp = df_cat[df_cat["cat_id"]==category].reset_index(drop=True)
  df_train = df_tmp[:-365]
  df_test = df_tmp[-365:]
  m =Prophet(
                     seasonality_mode = 'multiplicative',
                     daily_seasonality=True,
                     weekly_seasonality=True,
                     yearly_seasonality = True)
  m.fit(df_train)
  future = m.make_future_dataframe(periods=len(df_train))
  forecast = m.predict(future)
  forecast['ds'] = forecast['ds'].astype(np.datetime64)
  df_all['ds'] = df_all['ds'].astype(np.datetime64)
  forecast = forecast[["ds", "yhat"]]
  forecast["cat_id"] = category
  df_predictions = pd.concat([df_predictions, forecast])

0
1
2


In [176]:
df_cat = pd.merge(df_cat,df_predictions, on=["ds", "cat_id"])


In [177]:
df_cat_all = df_cat.groupby("ds").agg({"y":"sum","yhat":"sum"}).reset_index()

In [178]:
y_true = df_all['y'].values
y_pred = df_all['yhat'].values
print("Forecast Daily error Prophet:")
print_metris(y_true, y_pred)

Forecast Daily error Prophet:
MAE 2413.94
MSE 11257520.295
RMSE 3355.223
MAPE 5.145
SMAPE 0.075
MAAPE 0.074
R2 0.8


In [179]:
y_true = df_cat_all['y'].values
y_pred = df_cat_all['yhat'].values
print("Forecast Daily error Prophet:")
print_metris(y_true, y_pred)

Forecast Daily error Prophet:
MAE 2406.782
MSE 11217336.123
RMSE 3349.229
MAPE 5.152
SMAPE 0.074
MAAPE 0.073
R2 0.801


In [180]:
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(go.Scatter(x=df_cat_all["ds"], y=df_cat_all["y"],
                          mode='lines',
                          name='Sales',
                          marker_color='rgb(121,121,121)'))
fig.add_trace(go.Scatter(x=df_cat_all["ds"], y=df_cat_all["yhat"],
                          mode='lines+markers',
                          name='Prediccion',
                          marker_color='rgb(16,52,166)'))

fig.update_traces(mode='markers+lines')
fig.update_xaxes(rangeslider_visible=True)
fig.update_layout({
    'height': 800,
    'template': 'plotly_white',
    'plot_bgcolor': 'rgba(0,0,0,0)',
    'paper_bgcolor': 'rgba(0,0,0,0)',
    'xaxis_title': 'Fecha',
    'yaxis_title': 'Unidades',
    'yaxis_tickformat': ',.0f',
    'yaxis.rangemode': 'tozero',
    'legend': {'orientation': 'h',
                'yanchor': 'bottom',
                'y': 1.02,
                'xanchor': 'right',
                'x': 1
                }
})