<a href="https://colab.research.google.com/github/mrcruz117/linkedin-silverkite-forecast-templates/blob/main/LinkedIn_Silverkite_Parameter_Tuning_Template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Silverkite Parameter Tuning

In [4]:
# Switch the working directory to the project folder so that the relative
# paths used by later cells (e.g. 'nyc_data.csv') resolve.
# NOTE(review): this is an absolute Google Drive path — presumably Drive must be
# mounted in the runtime before this cell runs; confirm and adjust per user.
%cd /content/drive/MyDrive/Time Series Forecasting Product

/content/drive/MyDrive/Time Series Forecasting Product


In [5]:
# %pip install greykite  # uncomment when greykite is not yet installed in this runtime

In [6]:
#libraries
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from greykite.framework.templates.autogen.forecast_config import *
from greykite.framework.templates.forecaster import Forecaster
from greykite.framework.templates.model_templates import ModelTemplateEnum
from greykite.common.features.timeseries_features import *
from greykite.common.evaluation import EvaluationMetricEnum
from greykite.framework.utils.result_summary import summarize_grid_search_results
from plotly.offline import iplot

In [7]:
# Load the historical observations and the frame holding the future period.
# NOTE: the previews show dates formatted as M/D/YYYY (e.g. 1/1/2015), not
# YYYY-MM-DD.
df = pd.read_csv('nyc_data.csv')
future_df = pd.read_csv('future.csv')
# Only the last expression of a cell is rendered automatically, so the first
# preview has to be displayed explicitly or it is silently discarded.
display(future_df.head())
df.head()

Unnamed: 0,Date,Demand,Easter,Thanksgiving,Christmas,Temperature,Marketing
0,1/1/2015,720.000885,0,0,0,3.68,41.305
1,1/2/2015,581.276773,0,0,0,4.73,131.574
2,1/3/2015,754.117039,0,0,0,7.23,162.7
3,1/4/2015,622.252774,0,0,0,10.96,160.281
4,1/5/2015,785.373319,0,0,0,6.92,51.077


In [8]:
# Stack the future rows underneath the history and renumber the index 0..n-1
# in a single step.
# NOTE: `df` is rebound here, so re-running this cell without first re-running
# the load cell appends `future_df` a second time.
df = pd.concat([df, future_df], ignore_index=True)
df.tail()

Unnamed: 0,Date,Demand,Easter,Thanksgiving,Christmas,Temperature,Marketing
2218,1/27/2021,,0,0,0,3.33,39.664
2219,1/28/2021,,0,0,0,1.67,195.314
2220,1/29/2021,,0,0,0,-2.78,235.894
2221,1/30/2021,,0,0,0,1.11,152.752
2222,1/31/2021,,0,0,0,4.44,158.62


In [9]:
# The Silverkite metadata below uses value_col="y", so give the target column
# that name.
df = df.rename({'Demand': 'y'}, axis="columns")
df.head(0)  # header-only view to confirm the new column names

Unnamed: 0,Date,y,Easter,Thanksgiving,Christmas,Temperature,Marketing


## Silverkite Prep

In [10]:
# Describe the input series: which columns hold the timestamp and the target,
# the sampling frequency (daily), and the last date used for training.
# NOTE(review): rows after train_end_date (the 2021 rows with missing y) are
# presumably treated as the forecast period — confirm against Greykite docs.
metadata = MetadataParam(
    freq="D",
    time_col="Date",
    value_col="y",
    train_end_date=pd.Timestamp("2020-12-31"),
)
metadata

MetadataParam(anomaly_info=None, date_format=None, freq='D', time_col='Date', train_end_date=Timestamp('2020-12-31 00:00:00'), value_col='y')

In [11]:
# Candidate trend shapes; listing several values makes each one a separate
# entry in the tuning grid.
growth = {"growth_term": ["linear", "quadratic", "sqrt"]}
growth

{'growth_term': ['linear', 'quadratic', 'sqrt']}

In [12]:
# Seasonal components: every period is left on "auto".
seasonality = dict.fromkeys(
    [
        "yearly_seasonality",
        "quarterly_seasonality",
        "monthly_seasonality",
        "weekly_seasonality",
        "daily_seasonality",
    ],
    "auto",
)
seasonality

{'yearly_seasonality': 'auto',
 'quarterly_seasonality': 'auto',
 'monthly_seasonality': 'auto',
 'weekly_seasonality': 'auto',
 'daily_seasonality': 'auto'}

In [13]:
# Holiday modeling: inspect what the holiday lookup offers for the US.
# The first call is not the cell's last expression, so its result must be
# displayed explicitly; otherwise it is computed and silently thrown away.
display(get_available_holiday_lookup_countries(["US"]))
# Holiday names available for the US across the years covered by the data.
get_available_holidays_across_countries(
    countries=["US"],
    year_start=2015,
    year_end=2021,
)

['Christmas Day',
 'Christmas Day (Observed)',
 'Columbus Day',
 'Halloween',
 'Independence Day',
 'Independence Day (Observed)',
 'Juneteenth National Independence Day',
 'Juneteenth National Independence Day (Observed)',
 'Labor Day',
 'Martin Luther King Jr. Day',
 'Memorial Day',
 "New Year's Day",
 "New Year's Day (Observed)",
 'Thanksgiving',
 'Veterans Day',
 'Veterans Day (Observed)',
 "Washington's Birthday"]

In [14]:
# Event / holiday settings: New Year's Day is modeled separately, with a
# default window of 2 days before and after each holiday and a custom
# (3 before, 1 after) window for New Year's Day.
events = {
    "holidays_to_model_separately": ["New Year's Day"],
    "holiday_pre_num_days": 2,
    "holiday_post_num_days": 2,
    "holiday_pre_post_num_dict": {"New Year's Day": (3, 1)},
    # Example of adding an event that is not in the built-in list:
    # "daily_event_df_dict": {"elections": pd.DataFrame({
    #     "date": ["2016-11-03", "2020-11-03"],
    #     "event_name": ["elections"] * 2
    # })},
    # NOTE(review): the 2016 US general election was held on 2016-11-08;
    # verify the dates above before enabling this example.
}
events

{'holidays_to_model_separately': ["New Year's Day"],
 'holiday_pre_num_days': 2,
 'holiday_post_num_days': 2,
 'holiday_pre_post_num_dict': {"New Year's Day": (3, 1)}}

In [15]:
# Changepoints capture changes in the trend; detection is left on "auto".
changepoints = {"changepoints_dict": {"method": "auto"}}

In [16]:
# External regressor columns taken from `df`.
regressors = {"regressor_cols": ["Easter", "Temperature", "Marketing"]}
regressors

{'regressor_cols': ['Easter', 'Temperature', 'Marketing']}

In [17]:
# Lagged versions of the same three regressors, each with "auto" lag selection.
lagged_regressors = {
    "lagged_regressor_dict": dict.fromkeys(
        ["Temperature", "Easter", "Marketing"], "auto"
    )
}

In [18]:
# Autoregression: dependent on the forecasting horizon, so left on "auto".
autoregression = {"autoreg_dict": "auto"}

### Model notes
- A ridge regression model might be the better choice:
- less risk of overfitting
- penalizes extreme slopes
  - makes more sense when we have less data.

- XGBoost-style gradient boosting (the grid below uses the `gradient_boosting` fit algorithm)
  - widely applicable
  - might work as well

In [19]:
# Fitting algorithms to compare; each entry becomes one option in the
# tuning grid.
custom = {
    "fit_algorithm_dict": [
        {"fit_algorithm": algorithm}
        for algorithm in ("linear", "ridge", "rf", "gradient_boosting")
    ]
}
custom

{'fit_algorithm_dict': [{'fit_algorithm': 'linear'},
  {'fit_algorithm': 'ridge'},
  {'fit_algorithm': 'rf'},
  {'fit_algorithm': 'gradient_boosting'}]}

## Building the Model

In [20]:
# Bundle every component dictionary defined above into a single parameter
# object for the forecast config.
model_components = ModelComponentsParam(
    # trend
    growth=growth,
    changepoints=changepoints,
    # periodic and calendar effects
    seasonality=seasonality,
    events=events,
    # explanatory and lag-based inputs
    regressors=regressors,
    lagged_regressors=lagged_regressors,
    autoregression=autoregression,
    # fit algorithms to compare
    custom=custom,
)

In [21]:
# Cross-validation layout: expanding training window, at most 30 splits,
# 16 periods between consecutive splits.
# NOTE(review): 31 matches forecast_horizon and presumably accounts for the
# future rows appended to `df`; 180 presumably reserves that many periods of
# history for the CV folds — confirm both.
evaluation_period = EvaluationPeriodParam(
    cv_expanding_window=True,
    cv_min_train_periods=len(df) - 180 - 31,
    cv_periods_between_splits=16,
    cv_max_splits=30,
)

In [22]:
# Select the best parameter combination during CV by root mean squared error.
evaluation_metric = EvaluationMetricParam(cv_selection_metric=EvaluationMetricEnum.RootMeanSquaredError.name)

In [23]:
# Full forecast configuration: Silverkite template, 31-period horizon, and
# the metadata / component / evaluation objects built above.
config = ForecastConfig(
    metadata_param=metadata,
    model_template=ModelTemplateEnum.SILVERKITE.name,
    model_components_param=model_components,
    forecast_horizon=31,
    evaluation_metric_param=evaluation_metric,
    evaluation_period_param=evaluation_period,
)

In [24]:
# Forecasting — VERY LONG running cell.
# Fits multiple models (one per parameter combination in the grid defined
# above) in order to tune the parameters.
forecaster = Forecaster()
result = forecaster.run_forecast_config(df=df, config=config)


The following Fourier series terms are removed due to collinearity:
['sin1_toq_quarterly', 'cos1_toq_quarterly', 'sin2_toq_quarterly', 'cos2_toq_quarterly', 'sin3_toq_quarterly', 'cos3_toq_quarterly', 'cos4_tow_weekly']


The following Fourier series terms are removed due to collinearity:
['sin1_toq_quarterly', 'cos1_toq_quarterly', 'sin2_toq_quarterly', 'cos2_toq_quarterly', 'sin3_toq_quarterly', 'cos3_toq_quarterly', 'cos4_tow_weekly']



In [25]:
# Visualization: build the backtest figure from the forecast result and
# render it inline with plotly's offline `iplot`.

fig = result.backtest.plot()
iplot(fig)

In [29]:
# Summarize the grid search as a table scored by RMSE, rounded to one decimal.
cv_results = summarize_grid_search_results(
    result.grid_search,
    score_func=EvaluationMetricEnum.RootMeanSquaredError.name,
    decimals=1,
)

In [31]:
# Use the stringified parameter combination as the row label of the CV summary.
# Guarded so the cell can be re-run safely: once "params" has become the index
# it is no longer a column, and the previous in-place version then raised
# KeyError: 'params'.
if "params" in cv_results.columns:
    cv_results = cv_results.assign(
        params=cv_results["params"].astype(str)
    ).set_index("params", drop=True)

In [33]:
# RMSE rank and mean test RMSE for every combination of fit algorithm and
# growth term.
cv_results.loc[
    :,
    [
        "rank_test_RMSE",
        "mean_test_RMSE",
        "param_estimator__fit_algorithm_dict",
        "param_estimator__growth_term",
    ],
]


Unnamed: 0_level_0,rank_test_RMSE,mean_test_RMSE,param_estimator__fit_algorithm_dict,param_estimator__growth_term
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",3,47.5,{'fit_algorithm': 'linear'},linear
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",1,47.4,{'fit_algorithm': 'linear'},quadratic
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",2,47.5,{'fit_algorithm': 'linear'},sqrt
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",6,48.5,{'fit_algorithm': 'ridge'},linear
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",4,48.0,{'fit_algorithm': 'ridge'},quadratic
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",5,48.4,{'fit_algorithm': 'ridge'},sqrt
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'rf'})]",11,64.5,{'fit_algorithm': 'rf'},linear
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'rf'})]",12,65.4,{'fit_algorithm': 'rf'},quadratic
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'rf'})]",10,64.0,{'fit_algorithm': 'rf'},sqrt
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'gradient_boosting'})]",9,57.7,{'fit_algorithm': 'gradient_boosting'},linear


In [36]:
# Keep only the top-ranked (rank 1 by RMSE) row(s) and transpose so that each
# parameter becomes a row.
best_params = cv_results.loc[
    cv_results["rank_test_RMSE"] == 1,
    [
        "mean_test_RMSE",
        "param_estimator__fit_algorithm_dict",
        "param_estimator__growth_term",
    ],
].T

In [38]:
# Export the best tuned parameters.
# The target sub-directory is created first (relative to the current working
# directory) because `to_csv` does not create missing parent directories.
best_params_path = Path("Forecasting Product/best_params_silverkite.csv")
best_params_path.parent.mkdir(parents=True, exist_ok=True)
best_params.to_csv(best_params_path)