<a href="https://colab.research.google.com/github/mrcruz117/linkedin-silverkite-forecast-templates/blob/main/LinkedIn_Silverkite_Forecast_Template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Silverkite Forecast

In [1]:
# Change the working directory to the project folder on Google Drive so that
# the relative CSV paths used by later cells resolve against it.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount
# call is visible in this notebook; confirm before running on a fresh runtime.
%cd /content/drive/MyDrive/Time Series Forecasting Product

/content/drive/MyDrive/Time Series Forecasting Product


In [2]:
# Uncomment and run once on a fresh runtime where the greykite package is not
# yet installed; the greykite imports in this notebook require it.
# !pip install greykite

In [3]:
#libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from greykite.framework.templates.autogen.forecast_config import *
from greykite.framework.templates.forecaster import Forecaster
from greykite.framework.templates.model_templates import ModelTemplateEnum
from greykite.common.features.timeseries_features import *
from greykite.common.evaluation import EvaluationMetricEnum
from greykite.framework.utils.result_summary import summarize_grid_search_results
from plotly.offline import iplot

In [4]:
# Load the historical data (nyc_data.csv) and the rows from future.csv.
# NOTE(review): the original note on this cell said YYYY-MM-DD, but the preview
# of nyc_data.csv shows dates written as M/D/YYYY (e.g. 1/1/2015) — confirm
# which format the files are expected to use.
df = pd.read_csv('nyc_data.csv')
future_df = pd.read_csv('future.csv')

# Only the last expression of a cell is rendered automatically, so the first
# preview has to be displayed explicitly or it is silently discarded.
display(future_df.head())
df.head()

Unnamed: 0,Date,Demand,Easter,Thanksgiving,Christmas,Temperature,Marketing
0,1/1/2015,720.000885,0,0,0,3.68,41.305
1,1/2/2015,581.276773,0,0,0,4.73,131.574
2,1/3/2015,754.117039,0,0,0,7.23,162.7
3,1/4/2015,622.252774,0,0,0,10.96,160.281
4,1/5/2015,785.373319,0,0,0,6.92,51.077


In [5]:
# Append the future rows below the historical rows and renumber the index
# 0..n-1 in the same step.
# NOTE(review): `df` is reassigned in place, so re-running this cell without
# re-running the load cell appends future_df a second time.
df = pd.concat([df, future_df], ignore_index=True)
df.tail()

Unnamed: 0,Date,Demand,Easter,Thanksgiving,Christmas,Temperature,Marketing
2218,1/27/2021,,0,0,0,3.33,39.664
2219,1/28/2021,,0,0,0,1.67,195.314
2220,1/29/2021,,0,0,0,-2.78,235.894
2221,1/30/2021,,0,0,0,1.11,152.752
2222,1/31/2021,,0,0,0,4.44,158.62


In [6]:
# Rename the target column to 'y', the value column name used by the
# forecasting configuration further down.
df = df.rename({'Demand': 'y'}, axis="columns")
# Zero-row preview: shows only the resulting column names.
df.head(0)

Unnamed: 0,Date,y,Easter,Thanksgiving,Christmas,Temperature,Marketing


## Silverkite Prep

In [7]:
# Read the tuned Silverkite parameters saved earlier; the first CSV column
# becomes the row index so individual parameters can be looked up by name.
parameters = pd.read_csv(
    "Forecasting Product/best_params_silverkite.csv",
    index_col=0,
)
parameters

Unnamed: 0,"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]"
mean_test_RMSE,47.4
param_estimator__fit_algorithm_dict,{'fit_algorithm': 'linear'}
param_estimator__growth_term,quadratic


In [8]:
# isolate tuned params
# Each .loc[...] lookup returns the row for that parameter as a Series keyed by
# the column label. Take its first entry by position with .iloc[0]: a plain
# [0] on a label-indexed Series relies on deprecated positional fallback.
growth_term_param = parameters.loc["param_estimator__growth_term"].iloc[0]
fit_algorithm_param = parameters.loc["param_estimator__fit_algorithm_dict"].iloc[0]

In [9]:
# Describe the input series: which columns hold the timestamps and the target,
# the sampling frequency, and the last date used for training.
metadata = MetadataParam(
    train_end_date=pd.to_datetime("2020-12-31"),
    freq="D",
    value_col="y",
    time_col="Date",
)
metadata

MetadataParam(anomaly_info=None, date_format=None, freq='D', time_col='Date', train_end_date=Timestamp('2020-12-31 00:00:00'), value_col='y')

In [10]:
# Growth configuration: reuse the growth term selected during tuning.
growth = {"growth_term": growth_term_param}
growth

{'growth_term': 'quadratic'}

In [11]:
# Seasonality configuration: every seasonality setting is left on "auto".
seasonality = dict.fromkeys(
    [
        "yearly_seasonality",
        "quarterly_seasonality",
        "monthly_seasonality",
        "weekly_seasonality",
        "daily_seasonality",
    ],
    "auto",
)
seasonality

{'yearly_seasonality': 'auto',
 'quarterly_seasonality': 'auto',
 'monthly_seasonality': 'auto',
 'weekly_seasonality': 'auto',
 'daily_seasonality': 'auto'}

In [12]:
# holidays modeling
# Only the last expression of a cell is rendered automatically, so the country
# lookup result is displayed explicitly instead of being silently discarded.
display(get_available_holiday_lookup_countries(["US"]))

# List the holiday names available for the US with year_start=2015 and
# year_end=2021.
get_available_holidays_across_countries(
    countries=["US"],
    year_start=2015,
    year_end=2021,
)

['Christmas Day',
 'Christmas Day (Observed)',
 'Columbus Day',
 'Halloween',
 'Independence Day',
 'Independence Day (Observed)',
 'Juneteenth National Independence Day',
 'Juneteenth National Independence Day (Observed)',
 'Labor Day',
 'Martin Luther King Jr. Day',
 'Memorial Day',
 "New Year's Day",
 "New Year's Day (Observed)",
 'Thanksgiving',
 'Veterans Day',
 'Veterans Day (Observed)',
 "Washington's Birthday"]

In [13]:
# Event / holiday configuration passed to the model components.
events = {
    "holidays_to_model_separately": ["New Year's Day"],
    "holiday_pre_num_days": 2,
    "holiday_post_num_days": 2,
    "holiday_pre_post_num_dict": {"New Year's Day": (3, 1)},
    # Example (disabled): adding a custom event that is not in the built-in
    # holiday list.
    # "daily_event_df_dict": {"elections": pd.DataFrame({
    #     "date": ["2016-11-03", "2020-11-03"],
    #     "event_name": ["elections"] * 2
    # })},
}
events

{'holidays_to_model_separately': ["New Year's Day"],
 'holiday_pre_num_days': 2,
 'holiday_post_num_days': 2,
 'holiday_pre_post_num_dict': {"New Year's Day": (3, 1)}}

In [14]:
# Changepoints capture changes in the trend; the detection method is "auto".
changepoints = {"changepoints_dict": {"method": "auto"}}

In [15]:
# External regressor columns of `df` supplied to the model.
regressors = {"regressor_cols": ["Easter", "Temperature", "Marketing"]}
regressors

{'regressor_cols': ['Easter', 'Temperature', 'Marketing']}

In [16]:
# Lagged regressors: each listed column gets the "auto" lag setting.
lagged_regressors = {
    "lagged_regressor_dict": dict.fromkeys(
        ("Temperature", "Easter", "Marketing"),
        "auto",
    ),
}

In [17]:
# Autoregression: dependent on the forecasting horizon, so left on "auto".
autoregression = {"autoreg_dict": "auto"}

### Model notes
- A ridge regression model might be a better choice.
- It carries less risk of overfitting.
- It penalizes extreme slopes (large coefficients),
  - which makes more sense when we have less data.

- XGBoost
  - widely applicable
  - might work as well

In [18]:
# Fitting algorithm: the tuned value was read from CSV as the text of a dict,
# so parse it back into a real dict with the safe YAML loader.
import yaml

custom = {"fit_algorithm_dict": yaml.safe_load(fit_algorithm_param)}
custom

{'fit_algorithm_dict': {'fit_algorithm': 'linear'}}

## Building the Model

In [19]:
# Bundle every component dictionary defined in the preceding cells into a
# single ModelComponentsParam (keyword order does not matter).
model_components = ModelComponentsParam(
    custom=custom,
    autoregression=autoregression,
    lagged_regressors=lagged_regressors,
    regressors=regressors,
    changepoints=changepoints,
    events=events,
    seasonality=seasonality,
    growth=growth,
)

In [20]:
# Cross-validation setup: expanding training window, at most 30 splits,
# 16 periods between consecutive splits.
# NOTE(review): `df` still contains the appended future rows (their y is NaN),
# so the row count below includes them — confirm the 180 and 31 offsets are
# the intended ones.
evaluation_period = EvaluationPeriodParam(
    cv_min_train_periods=len(df) - (180 + 31),
    cv_expanding_window=True,
    cv_max_splits=30,
    cv_periods_between_splits=16,
)

In [21]:
# evaluation metric
# Use root mean squared error as the cross-validation selection metric; the
# enum member's name (a string) is what gets passed.

evaluation_metric = EvaluationMetricParam(
    cv_selection_metric=EvaluationMetricEnum.RootMeanSquaredError.name
)

In [22]:
# Assemble the full forecast configuration: Silverkite template, 31-period
# horizon, plus the metadata, model components and evaluation settings
# defined in the preceding cells.
config = ForecastConfig(
    metadata_param=metadata,
    model_components_param=model_components,
    evaluation_period_param=evaluation_period,
    evaluation_metric_param=evaluation_metric,
    model_template=ModelTemplateEnum.SILVERKITE.name,
    forecast_horizon=31,
)

In [23]:
# fit with tuned params
# Run the forecast described by `config` on the merged DataFrame; `result` is
# what the plotting and summary cells below read from.
forecaster = Forecaster()
result = forecaster.run_forecast_config(df=df, config=config)

Fitting 8 folds for each of 1 candidates, totalling 8 fits



The following Fourier series terms are removed due to collinearity:
['sin1_toq_quarterly', 'cos1_toq_quarterly', 'sin2_toq_quarterly', 'cos2_toq_quarterly', 'sin3_toq_quarterly', 'cos3_toq_quarterly', 'cos4_tow_weekly']


The following Fourier series terms are removed due to collinearity:
['sin1_toq_quarterly', 'cos1_toq_quarterly', 'sin2_toq_quarterly', 'cos2_toq_quarterly', 'sin3_toq_quarterly', 'cos3_toq_quarterly', 'cos4_tow_weekly']


The following Fourier series terms are removed due to collinearity:
['sin1_toq_quarterly', 'cos1_toq_quarterly', 'sin2_toq_quarterly', 'cos2_toq_quarterly', 'sin3_toq_quarterly', 'cos3_toq_quarterly', 'cos4_tow_weekly']


The following Fourier series terms are removed due to collinearity:
['sin1_toq_quarterly', 'cos1_toq_quarterly', 'sin2_toq_quarterly', 'cos2_toq_quarterly', 'sin3_toq_quarterly', 'cos3_toq_quarterly', 'cos4_tow_weekly']


The following Fourier series terms are removed due to collinearity:
['sin1_toq_quarterly', 'cos1_toq_quarterly'

In [24]:
# visualization
# Build the backtest plot from the forecast result and render it inline with
# plotly's iplot.

fig = result.backtest.plot()
iplot(fig)

In [34]:
# look at summary
# Show the fitted model's summary (coefficient estimates and significance).
# NOTE(review): result.model[-1] presumably selects the final estimator step of
# the fitted pipeline — confirm.
result.model[-1].summary()




Number of observations: 2192,   Number of features: 181
Method: Ordinary least squares
Number of nonzero features: 181

Residuals:
         Min           1Q       Median           3Q          Max
      -163.1       -32.35      -0.4705        30.28        236.6

            Pred_col Estimate Std. Err   t value Pr(>|t|) sig. code              95%CI
           Intercept    67.11    3.006     22.32   <2e-16       ***      (61.21, 73.0)
events_New Years Day   -11.49    22.78   -0.5043    0.614              (-56.15, 33.18)
 events_N...rs Day-1   -32.34    22.97    -1.408    0.159               (-77.39, 12.7)
 events_N...rs Day-2   -3.502    22.88   -0.1531    0.878              (-48.37, 41.37)
 events_N...rs Day-3   -35.43    22.74    -1.558    0.119              (-80.03, 9.171)
 events_N...rs Day+1    4.535    22.63    0.2004    0.841              (-39.85, 48.92)
        events_Other   -1.448    3.401   -0.4258    0.670              (-8.118, 5.222)
      events_Other-1   -4.077    3.387   

In [36]:
# Plot the forecast components, grouped by the detailed seasonality regex
# patterns that Greykite ships as a constant.
from greykite.common.constants import DETAILED_SEASONALITY_COMPONENTS_REGEX_DICT

fig = result.forecast.plot_components(
    grouping_regex_patterns_dict=DETAILED_SEASONALITY_COMPONENTS_REGEX_DICT,
)
iplot(fig)