In [1]:
import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    import numpy as np
    import pandas as pd
    from collections import OrderedDict
    import re
    import statsmodels
    from statsmodels.nonparametric.smoothers_lowess import lowess
    import statsmodels.api as sm
    import fbprophet
    from fbprophet import Prophet
    import os
    import datetime as dt
    import matplotlib.pyplot as plt
    import bokeh
    from bokeh.models import CustomJS, ColumnDataSource, Slider, Label, Div, HoverTool, Band, Span, BoxAnnotation
    from bokeh.plotting import figure
    from bokeh.palettes import Spectral11
    import ipywidgets as widgets
    from IPython.display import display
    from typing import Union, Dict, List, Callable
    from contextlib import contextmanager
    import sys, os
    import datetime as dt

print('numpy version: ', np.__version__)
print('pandas version: ', pd.__version__)
print('statsmodels version: ', statsmodels.__version__)
print('prophet version: ', fbprophet.__version__)
print('ipywidgets version: ', widgets.__version__)
warnings.filterwarnings('ignore', module='matplotlib')
bokeh.io.output_notebook()

numpy version:  1.19.5
pandas version:  1.1.5
statsmodels version:  0.10.2
prophet version:  0.7.1
ipywidgets version:  7.6.3


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd /content/drive/MyDrive/CZ4041 Machine Learning Project/Colab/data

/content/drive/.shortcut-targets-by-id/1ldppAeVJfaoI4cvwZyTCju9jXbD4LI-H/CZ4041 Machine Learning Project/Colab/data


In [4]:
import pandas as pd
df_train = pd.read_csv('train.csv',parse_dates=['date'])
df_test = pd.read_csv('test.csv',parse_dates=['date'])

df_train.sales = df_train.sales.astype(np.float)

print('Entries (Train / Test) : {} / {}'.format(len(df_train), len(df_test)))
s_train, s_test = df_train.store.unique(), df_test.store.unique()
print('Stores (Train / Test) : {} - {} / {} - {}'.format(s_train[0], s_train[-1], s_test[0], s_test[-1]))
s_train, s_test = df_train.item.unique(), df_test.item.unique()
print('Items (Train / Test) : {} - {} / {} - {}'.format(s_train[0], s_train[-1], s_test[0], s_test[-1]))
dates_train, dates_test = df_train.date.unique(), df_test.date.unique()
print('Dates (Train / Test) : {:.10} - {:.10} / {:.10} - {:.10}'.format(dates_train[0], dates_train[-1], dates_test[0], dates_test[-1]))
display(pd.concat([df_train.isnull().sum().rename('Training NaNs'),
                   df_train.isnull().sum().rename('Test NaNs')], axis=1))

Entries (Train / Test) : 913000 / 45000
Stores (Train / Test) : 1 - 10 / 1 - 10
Items (Train / Test) : 1 - 50 / 1 - 50
Dates (Train / Test) : 2013-01-01 - 2017-12-31 / 2018-01-01 - 2018-03-31


Unnamed: 0,Training NaNs,Test NaNs
date,0,0
store,0,0
item,0,0
sales,0,0


In [8]:
# SMAPE - the official metric for the submission

def smape(y: Union[np.ndarray, float], yhat: Union[np.ndarray, float], average=True, signed=False) -> float:
    """SMAPE evaluation metric"""
    
    if signed:
        result = 2. * (yhat - y) / (np.abs(y) + np.abs(yhat)) * 100
    else:
        result = 2. * np.abs(yhat - y) / (np.abs(y) + np.abs(yhat)) * 100
    if average: return np.mean(result)
    return result

def smape_df(df: pd.DataFrame, average=True, signed=False) -> pd.DataFrame:
    return smape(df.y, df.yhat, average=average, signed=signed)

In [138]:
def prophet_show(i, item, store, cutoff_train, cutoff_eval, prophet_kwargs, title, plot_components=False, display_df=False):
    ts = (result.query('item == @item & store == @store & date <= @cutoff_eval')[['date', 'sales']].rename(columns={'date':'ds', 'sales':'y'})).reset_index(drop=True)
    ind_train = pd.eval('ts.ds < cutoff_train')
    ind_eval = ~ ind_train
    len_train, len_eval = ind_train.sum(), ind_eval.sum()
    ts_train = ts.loc[ind_train]
    m = Prophet(**prophet_kwargs)
    m.fit(ts_train)
    ts_hat = m.predict(ts).merge(ts[['ds', 'y']], on='ds', how='left')
    if display_df: display(ts_hat.tail(3))

    df_combined = ts_hat.assign(smape=0, smape_smooth=0)
    df_combined.smape = smape_df(df_combined, average=False)
    df_combined.loc[ind_train, 'smape_smooth'] = lowess(df_combined.loc[ind_train, 'smape'], range(len_train), frac=0.03, return_sorted=False)
    df_combined.loc[ind_eval, 'smape_smooth'] = lowess(df_combined.loc[ind_eval, 'smape'], range(len_eval), frac=0.35, return_sorted=False)
    smape_in = df_combined.loc[ind_train].smape.mean()
    smape_oos = df_combined.loc[ind_eval].smape.mean()
    
    source = ColumnDataSource(data=df_combined)
    p = figure(plot_width=750, plot_height=200, title=("**{}**     item = {} store = {}     train / test = ..{} / ..{}"
                                                       .format(title, item, store, cutoff_train, cutoff_eval)), 
               x_axis_type='datetime', tools="pan,wheel_zoom,reset")
    _ = p.line(x='ds', y='yhat', source=source)
    _ = p.line(x='ds', y='yhat_lower', source=source, line_alpha=0.4)
    _ = p.line(x='ds', y='yhat_upper', source=source, line_alpha=0.4)
    _ = p.scatter(x='ds', y='y', source=source, color='black', radius=0.2, radius_dimension='y', alpha=0.4)
    _ = p.scatter(x='ds', y='y', source=source, color='black', radius=0.2, radius_dimension='y', alpha=0.4)
       
    deltas = np.abs(m.params['delta'][0])
    delta_max = np.max(deltas)
    df_deltas = pd.DataFrame({'ds': m.changepoints.values, 'delta':deltas, 'delta_scaled':ts_hat.yhat.mean() * deltas / delta_max})
    source2 = ColumnDataSource(df_deltas)
    cp1 = p.vbar(x='ds', source=source2, width=1, top=ts_hat.yhat.mean(), color='red', alpha=0.2, hover_color='red', hover_alpha=1)
    cp2 = p.vbar(x='ds', source=source2, width=1.5e+9, top='delta_scaled', color='red', alpha=0.5)
    p.add_tools(HoverTool(tooltips=[('trend delta', '@delta{.000}')], renderers=[cp2], mode='mouse'))
    p.add_layout(Label(x=1e+10, y=10, text='xasfdfsdfsd'))
    p.add_layout(BoxAnnotation(left=ts_train.ds.iloc[-1], right=ts.ds.iloc[-1]))
    
    p2 = figure(plot_width=750, plot_height=100, title="SMAPE IS / OOS = {:.3f} / {:.3f}".format(smape_in, smape_oos), x_axis_type='datetime', tools="",
                x_range=p.x_range)
    sm1 = p2.line(x='ds', y='smape_smooth', source=source, color='green')
    p2.add_tools(HoverTool(tooltips=[('smape', '@smape')], renderers=[sm1], mode='vline', line_policy='interp'))
    p2.add_layout(BoxAnnotation(left=ts_train.ds.iloc[-1], right=ts.ds.iloc[-1]))
    p2.yaxis[0].ticker.desired_num_ticks = 2
    bokeh.io.show(bokeh.layouts.column(p, p2))
    
    if plot_components:
        _ = m.plot_components(ts_hat, uncertainty=True)
        fig, ax = plt.subplots(1, 1, figsize=(12, 2))
        res = ts_hat.query('ds < @cutoff_train').yhat - ts_train.y
        res = (df_combined['y'] - df_combined['yhat'])
      #  adfuller_stat = statsmodels.tsa.stattools.adfuller(res.values)
        ljungbox_stat = statsmodels.stats.diagnostic.acorr_ljungbox(res.values)
        _ = statsmodels.graphics.tsaplots.plot_pacf(res, lags=40, ax=ax,
                                                    title='residuals pacf; ljung-box p-value = {:.2E} / {:.2E}'.format(ljungbox_stat[1][6], 
                                                                                                                      ljungbox_stat[1][30]))
    #df_final=df_final.append(df_combined['yhat'])
    result.loc[:,'pred'+str(1)+str(i)] = df_combined['yhat']
    result.loc[:,'date'+str(1)+str(i)] = df_combined['ds']


for i in range(1,51):   
    prophet_show(i=i, item=i, store=10, cutoff_train="2018-01-01", cutoff_eval="2018-03-31",
                prophet_kwargs={'yearly_seasonality':True, 'daily_seasonality':True, 'weekly_seasonality':True,
                                'uncertainty_samples':500},
                title='Prophet')

Output hidden; open in https://colab.research.google.com to view.

In [72]:
frames = [df_train, df_test]
result = pd.concat(frames)
result

Unnamed: 0,date,store,item,sales,id
0,2013-01-01,1,1,13.0,
1,2013-01-02,1,1,11.0,
2,2013-01-03,1,1,14.0,
3,2013-01-04,1,1,13.0,
4,2013-01-05,1,1,10.0,
...,...,...,...,...,...
44995,2018-03-27,10,50,,44995.0
44996,2018-03-28,10,50,,44996.0
44997,2018-03-29,10,50,,44997.0
44998,2018-03-30,10,50,,44998.0


In [140]:
x = result
x =x.drop(columns=['date','store','item','sales','id'], axis=0)
x = x[x.date12 > '2017-12-31']
x = x[x.columns.drop(list(x.filter(regex='date')))]
x=x[:90]
y = x.unstack().reset_index(drop=True)

In [141]:
a.append(y)

In [145]:
b=np.concatenate((a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7],a[8],a[9]))

In [148]:
def createSubmission(predictions,fileName="data/submissionPR.csv"):
  df=pd.DataFrame(predictions)
  df=df.reset_index()
  df.columns=['id','sales']
  df.to_csv(fileName,index=False)

In [149]:
%cd ..

/content/drive/.shortcut-targets-by-id/1ldppAeVJfaoI4cvwZyTCju9jXbD4LI-H/CZ4041 Machine Learning Project/Colab


In [150]:
createSubmission(b)