# Parameter Tuning - LinkedIn Silverkite

## Libraries and Data

In [None]:
# Install Greykite - wrapper for Silverkite, Prophet, and ARIMA
!pip install greykite

In [57]:
# Libraries
import numpy as np
import pandas as pd
from greykite.framework.templates.autogen.forecast_config import *
from greykite.framework.templates.forecaster import Forecaster
from greykite.framework.templates.model_templates import ModelTemplateEnum
from greykite.common.features.timeseries_features import *
from greykite.common.evaluation import EvaluationMetricEnum
from greykite.framework.utils.result_summary import summarize_grid_search_results
from plotly.offline import iplot

In [58]:
# Data: the historical series and the rows for the period to forecast,
# both read from paths relative to the notebook's working directory
history_path, future_path = '../Data/nyc-data.csv', '../Data/future.csv'
df = pd.read_csv(history_path)
future_df = pd.read_csv(future_path)
df

Unnamed: 0,Date,Demand,Easter,Thanksgiving,Christmas,Temperature,Marketing
0,1/1/2015,720.000885,0,0,0,3.68,41.305
1,1/2/2015,581.276773,0,0,0,4.73,131.574
2,1/3/2015,754.117039,0,0,0,7.23,162.700
3,1/4/2015,622.252774,0,0,0,10.96,160.281
4,1/5/2015,785.373319,0,0,0,6.92,51.077
...,...,...,...,...,...,...,...
2187,12/27/2020,685.915026,0,0,0,2.89,38.674
2188,12/28/2020,998.051170,0,0,0,8.83,166.712
2189,12/29/2020,847.123399,0,0,0,3.48,161.865
2190,12/30/2020,857.521043,0,0,0,5.97,179.634


In [59]:
future_df

Unnamed: 0,Date,Demand,Easter,Thanksgiving,Christmas,Temperature,Marketing
0,1/1/2021,,0,0,0,5.0,154.221
1,1/2/2021,,0,0,0,11.11,264.805
2,1/3/2021,,0,0,0,3.89,115.499
3,1/4/2021,,0,0,0,6.67,124.65
4,1/5/2021,,0,0,0,5.56,77.968
5,1/6/2021,,0,0,0,5.56,234.2
6,1/7/2021,,0,0,0,6.11,142.041
7,1/8/2021,,0,0,0,3.89,252.094
8,1/9/2021,,0,0,0,2.78,100.483
9,1/10/2021,,0,0,0,6.11,71.6


In [60]:
# Merging dataframes: stack the future rows under the history and
# renumber the rows 0..n-1 in the same step
df = pd.concat([df, future_df], ignore_index=True)
df

Unnamed: 0,Date,Demand,Easter,Thanksgiving,Christmas,Temperature,Marketing
0,1/1/2015,720.000885,0,0,0,3.68,41.305
1,1/2/2015,581.276773,0,0,0,4.73,131.574
2,1/3/2015,754.117039,0,0,0,7.23,162.700
3,1/4/2015,622.252774,0,0,0,10.96,160.281
4,1/5/2015,785.373319,0,0,0,6.92,51.077
...,...,...,...,...,...,...,...
2218,1/27/2021,,0,0,0,3.33,39.664
2219,1/28/2021,,0,0,0,1.67,195.314
2220,1/29/2021,,0,0,0,-2.78,235.894
2221,1/30/2021,,0,0,0,1.11,152.752


In [61]:
# Rename the target column from 'Demand' to 'y'
df = df.rename({'Demand': 'y'}, axis='columns')
df.head(1)

Unnamed: 0,Date,y,Easter,Thanksgiving,Christmas,Temperature,Marketing
0,1/1/2015,720.000885,0,0,0,3.68,41.305


## Silverkite Preparations

In [62]:
# Specifying time-series names (variables): timestamp column, target column,
# daily frequency, and the end date of the training data
train_end = pd.to_datetime('2020-12-31')
metadata = MetadataParam(
    time_col='Date',
    value_col='y',
    freq='D',
    train_end_date=train_end,
)
metadata

MetadataParam(anomaly_info=None, date_format=None, freq='D', time_col='Date', train_end_date=Timestamp('2020-12-31 00:00:00'), value_col='y')

## Model Components

* Growth terms: linear, quadratic, square root
  * Reflect the shape of the trend curve (tune)
* Seasonalities: yearly, quarterly, monthly, weekly, daily
  * Daily data can support the four longer-period seasonalities: yearly, quarterly, monthly, and weekly (auto)
* Holidays/events: country holidays/other events
  * Provided by model or user-generated (input)
* Changepoints: When should the trend change
  * Inflection points (auto)
* Regressors: other influencing factors
  * Independent variables (IVs) that help explain variation the other components would leave as error (input)
* Lagged Regressors: lagged effect of regressors
  * Whether a regressor affects the series on a different day than the one on which it occurs (auto)
* Auto-regression: using the time-series itself
  * Let past values predict the future (auto)

In [63]:
# Growth terms possibilities: the three candidate trend shapes to try
growth = {'growth_term': ['linear', 'quadratic', 'sqrt']}
growth

{'growth_term': ['linear', 'quadratic', 'sqrt']}

In [65]:
# Seasonalities
# Every period is set to 'auto', which lets the model figure it out;
# each value can also be set to None or True.
# NOTE(review): the original note says an hourly_seasonality entry can be added
# for hourly data - confirm that key is supported before relying on it.
seasonality = {
    'yearly_seasonality': 'auto',
    'quarterly_seasonality': 'auto',
    'monthly_seasonality': 'auto',
    'weekly_seasonality': 'auto',
    'daily_seasonality': 'auto',
}
seasonality

{'yearly_seasonality': 'auto',
 'quarterly_seasonality': 'auto',
 'monthly_seasonality': 'auto',
 'weekly_seasonality': 'auto',
 'daily_seasonality': 'auto'}

In [66]:
# Checking which countries are available for holidays
# (a different format for the country may be needed)
countries_to_check = ['US']
get_available_holiday_lookup_countries(countries_to_check)

['US']

In [67]:
# List the holiday names available for the US between 2015 and 2021
holiday_query = {'countries': ['US'], 'year_start': 2015, 'year_end': 2021}
get_available_holidays_across_countries(**holiday_query)

['Christmas Day',
 'Christmas Day (Observed)',
 'Columbus Day',
 'Halloween',
 'Independence Day',
 'Independence Day (Observed)',
 'Juneteenth National Independence Day',
 'Juneteenth National Independence Day (Observed)',
 'Labor Day',
 'Martin Luther King Jr. Day',
 'Memorial Day',
 "New Year's Day",
 "New Year's Day (Observed)",
 'Thanksgiving',
 'Veterans Day',
 'Veterans Day (Observed)',
 "Washington's Birthday"]

In [68]:
# Specifying events

# Adding your own event: one row per occurrence, with its date and its label
elections_df = pd.DataFrame({
    'date': ['2016-11-08', '2020-11-03'],
    'event_name': ['elections'] * 2,
})

events = {
    # Can include others in the list if desired
    'holidays_to_model_separately': ["New Year's Day"],
    'holiday_lookup_countries': ['US'],
    'holiday_pre_num_days': 2,
    'holiday_post_num_days': 2,
    # Include a term for all separate holidays
    # (presumably (days before, days after) - confirm against the Greykite docs)
    'holiday_pre_post_num_dict': {"New Year's Day": (3, 1)},
    'daily_event_df_dict': {"elections": elections_df},
}
events

{'holidays_to_model_separately': ["New Year's Day"],
 'holiday_lookup_countries': ['US'],
 'holiday_pre_num_days': 2,
 'holiday_post_num_days': 2,
 'holiday_pre_post_num_dict': {"New Year's Day": (3, 1)},
 'daily_event_df_dict': {'elections':          date event_name
  0  2016-11-08  elections
  1  2020-11-03  elections}}

In [69]:
# Changepoints -> reflects changes in trend; detection method is left on 'auto'
changepoints = {'changepoints_dict': {'method': 'auto'}}
changepoints

{'changepoints_dict': {'method': 'auto'}}

In [70]:
# Regressors (don't include them unless you know they have an impact)
regressors = {'regressor_cols': ['Easter', 'Temperature', 'Marketing']}
regressors

{'regressor_cols': ['Easter', 'Temperature', 'Marketing']}

In [71]:
# Lagged Regressors (depends on forecasting horizon)
# Each regressor column is mapped to 'auto'
lagged_regressors = {
    'lagged_regressor_dict': {
        'Temperature': 'auto',
        'Easter': 'auto',
        'Marketing': 'auto',
    },
}
lagged_regressors

{'lagged_regressor_dict': {'Temperature': 'auto',
  'Easter': 'auto',
  'Marketing': 'auto'}}

In [72]:
# Auto-regression (depends on forecasting horizon); configuration left on 'auto'
autoregression = {'autoreg_dict': 'auto'}
autoregression

{'autoreg_dict': 'auto'}

## Fitting Algorithms

9 possibilities:

1. Linear regression* - poor with collinearity
2. Elastic Net
3. Ridge*
4. Lasso
5. Stochastic Gradient Descent - unstable
6. Lars - outlier/noise sensitivity
7. Lasso Lars
8. Random Forest* - tree models don't model growth well
9. Gradient Boosting* - tree models don't model growth well

We'll try the starred* ones.

In [73]:
# Fitting algorithms: one single-key dict per candidate algorithm
algorithm_names = ('linear', 'ridge', 'rf', 'gradient_boosting')
custom = {'fit_algorithm_dict': [{'fit_algorithm': name} for name in algorithm_names]}
custom

{'fit_algorithm_dict': [{'fit_algorithm': 'linear'},
  {'fit_algorithm': 'ridge'},
  {'fit_algorithm': 'rf'},
  {'fit_algorithm': 'gradient_boosting'}]}

## Silverkite Model

In [74]:
# Build the model: bundle every component dictionary defined above
# into a single ModelComponentsParam
component_settings = {
    'growth': growth,
    'seasonality': seasonality,
    'events': events,
    'changepoints': changepoints,
    'regressors': regressors,
    'lagged_regressors': lagged_regressors,
    'autoregression': autoregression,
    'custom': custom,
}
model_components = ModelComponentsParam(**component_settings)
model_components

ModelComponentsParam(autoregression={'autoreg_dict': 'auto'}, changepoints={'changepoints_dict': {'method': 'auto'}}, custom={'fit_algorithm_dict': [{'fit_algorithm': 'linear'}, {'fit_algorithm': 'ridge'}, {'fit_algorithm': 'rf'}, {'fit_algorithm': 'gradient_boosting'}]}, events={'holidays_to_model_separately': ["New Year's Day"], 'holiday_lookup_countries': ['US'], 'holiday_pre_num_days': 2, 'holiday_post_num_days': 2, 'holiday_pre_post_num_dict': {"New Year's Day": (3, 1)}, 'daily_event_df_dict': {'elections':          date event_name
0  2016-11-08  elections
1  2020-11-03  elections}}, growth={'growth_term': ['linear', 'quadratic', 'sqrt']}, hyperparameter_override=None, regressors={'regressor_cols': ['Easter', 'Temperature', 'Marketing']}, lagged_regressors={'lagged_regressor_dict': {'Temperature': 'auto', 'Easter': 'auto', 'Marketing': 'auto'}}, seasonality={'yearly_seasonality': 'auto', 'quarterly_seasonality': 'auto', 'monthly_seasonality': 'auto', 'weekly_seasonality': 'auto', 

In [75]:
# Cross-validation
# NOTE(review): df also contains the appended future rows, so 31 is presumably
# subtracted to exclude them (it equals the forecast horizon used in the
# configuration), and 180 is presumably the recent history kept for CV - confirm.
total_rows = len(df)
evaluation_period = EvaluationPeriodParam(
    cv_min_train_periods=total_rows - 180 - 31,
    cv_expanding_window=True,
    cv_periods_between_splits=16,
    cv_max_splits=50,  # Ideally should be None, but there were problems with that
)

In [76]:
# Evaluation metric: candidates are selected during CV by root mean squared error
selection_metric = EvaluationMetricEnum.RootMeanSquaredError.name
evaluation_metric = EvaluationMetricParam(cv_selection_metric=selection_metric)

In [77]:
# Configuration: Silverkite template, 31-step forecast horizon, and the
# metadata / component / evaluation parameter objects built earlier
template_name = ModelTemplateEnum.SILVERKITE.name
config = ForecastConfig(
    model_template=template_name,
    forecast_horizon=31,
    metadata_param=metadata,
    model_components_param=model_components,
    evaluation_period_param=evaluation_period,
    evaluation_metric_param=evaluation_metric,
)

In [None]:
# Forecasting: run the configuration against the merged data frame
forecaster = Forecaster()
result = forecaster.run_forecast_config(config=config, df=df)

In [79]:
# Visualization: render the backtest plot inline
fig = result.backtest.plot()
iplot(figure_or_data=fig)

## Parameter Tuning Results

In [80]:
# CV results: tabulate the grid search, scored by RMSE and rounded to one decimal
rmse_name = EvaluationMetricEnum.RootMeanSquaredError.name
cv_results = summarize_grid_search_results(
    grid_search=result.grid_search,
    score_func=rmse_name,
    decimals=1,
)

In [81]:
# Set the CV results index
# Use the stringified parameter combination as the row label.
# The guard keeps this cell re-runnable: after the first run 'params' is the
# index and no longer a column, so an unconditional cv_results['params'] lookup
# would raise KeyError on a second execution. Rebinding instead of using
# inplace=True also avoids mutating the frame produced by the previous cell.
if 'params' in cv_results.columns:
    cv_results = (
        cv_results
        .assign(params=cv_results['params'].astype(str))
        .set_index('params', drop=True)
    )
cv_results

Unnamed: 0_level_0,rank_test_CORR,rank_test_R2,rank_test_MSE,rank_test_RMSE,rank_test_MAE,rank_test_MedAE,rank_test_MAPE,rank_test_MedAPE,rank_test_sMAPE,rank_test_Q80,...,std_test_OutsideTolerance5p,split0_train_OutsideTolerance5p,split1_train_OutsideTolerance5p,split2_train_OutsideTolerance5p,split3_train_OutsideTolerance5p,split4_train_OutsideTolerance5p,split5_train_OutsideTolerance5p,split6_train_OutsideTolerance5p,split7_train_OutsideTolerance5p,std_train_OutsideTolerance5p
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",2,1,1,1,1,2,1,1,1,6,...,0.1,0.5,0.5,0.4,0.4,0.4,0.4,0.4,0.4,0.0
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",1,2,2,2,3,1,3,3,3,5,...,0.1,0.5,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.0
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",3,3,3,3,2,3,2,2,2,7,...,0.1,0.5,0.5,0.4,0.4,0.4,0.4,0.4,0.4,0.0
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",5,6,6,6,6,6,5,6,6,12,...,0.0,0.4,0.5,0.4,0.4,0.4,0.4,0.4,0.4,0.0
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",4,4,4,4,4,4,4,4,4,10,...,0.1,0.4,0.5,0.4,0.4,0.4,0.4,0.4,0.4,0.0
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",6,5,5,5,5,5,6,5,5,11,...,0.0,0.4,0.5,0.4,0.4,0.4,0.4,0.4,0.4,0.0
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'rf'})]",12,12,12,12,12,12,12,12,12,8,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.0
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'rf'})]",11,11,11,11,11,11,11,10,11,4,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.0
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'rf'})]",10,10,10,10,10,10,10,11,10,9,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.0
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'gradient_boosting'})]",9,7,7,7,7,7,7,7,7,1,...,0.1,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.0


In [82]:
# Looking at the best results: RMSE rank and mean next to the two tuned parameters
summary_columns = [
    'rank_test_RMSE',
    'mean_test_RMSE',
    'param_estimator__fit_algorithm_dict',
    'param_estimator__growth_term',
]
cv_results.loc[:, summary_columns]

Unnamed: 0_level_0,rank_test_RMSE,mean_test_RMSE,param_estimator__fit_algorithm_dict,param_estimator__growth_term
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",1,47.8,{'fit_algorithm': 'linear'},linear
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",2,47.8,{'fit_algorithm': 'linear'},quadratic
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",3,47.9,{'fit_algorithm': 'linear'},sqrt
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",6,48.9,{'fit_algorithm': 'ridge'},linear
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",4,48.5,{'fit_algorithm': 'ridge'},quadratic
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",5,48.9,{'fit_algorithm': 'ridge'},sqrt
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'rf'})]",12,64.8,{'fit_algorithm': 'rf'},linear
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'rf'})]",11,64.3,{'fit_algorithm': 'rf'},quadratic
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'rf'})]",10,63.6,{'fit_algorithm': 'rf'},sqrt
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'gradient_boosting'})]",7,56.6,{'fit_algorithm': 'gradient_boosting'},linear


In [83]:
# Get best parameters: keep the row(s) ranked first on RMSE and
# transpose so each parameter becomes a row
is_top_ranked = cv_results['rank_test_RMSE'] == 1
best_columns = [
    'mean_test_RMSE',
    'param_estimator__fit_algorithm_dict',
    'param_estimator__growth_term',
]
best_params = cv_results.loc[is_top_ranked, best_columns].T
best_params

params,"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]"
mean_test_RMSE,47.8
param_estimator__fit_algorithm_dict,{'fit_algorithm': 'linear'}
param_estimator__growth_term,linear


In [84]:
# Write the best parameter combination to CSV (path is relative to the working directory)
best_params_path = '../Forecasting-Product/best-params-silverkite.csv'
best_params.to_csv(path_or_buf=best_params_path)