[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ourownstory/test-of-time/blob/main/tutorials/BenchmarkingTemplates.ipynb)

# Running benchmarking experiments
Note: The benchmarking framework does not yet properly support lagged covariates in combination with multi-step-ahead forecasts.

In [2]:
if 'google.colab' in str(get_ipython()):
    !pip install git+https://github.com/ourownstory/test-of-time.git # may take a while
    #!pip install neuralprophet # much faster, but may not have the latest upgrades/bugfixes

# we also need prophet for this notebook
# !pip install prophet

import pandas as pd
from neuralprophet import NeuralProphet, set_log_level
from tot import Dataset, NeuralProphetModel, ProphetModel
from tot.benchmark import SimpleBenchmark, CrossValidationBenchmark
set_log_level("ERROR")

## Load data

In [3]:
# Base URL of the example CSV files; later cells reuse `data_location`.
data_location = "https://raw.githubusercontent.com/ourownstory/neuralprophet-data/main/datasets/"

# Download both example series, in order, into one DataFrame each.
air_passengers_df, peyton_manning_df = (
    pd.read_csv(data_location + csv_name)
    for csv_name in ("air_passengers.csv", "wp_log_peyton_manning.csv")
)

## 0. Configure Datasets and Model Parameters
First, we define the datasets that we would like to benchmark on.
Next, we define the models that we want to evaluate and set their hyperparameters.

In [4]:
# Datasets to benchmark on. Each wraps one of the loaded DataFrames together
# with a name and a frequency string ("MS" / "D" — presumably pandas offset
# aliases for month-start and daily data; confirm against the Dataset class).
air_passengers_data = Dataset(df=air_passengers_df, name="air_passengers", freq="MS")
peyton_manning_data = Dataset(df=peyton_manning_df, name="peyton_manning", freq="D")
dataset_list = [air_passengers_data, peyton_manning_data]

# Models to evaluate, as (model class, hyperparameter dict) tuples.
# Each model gets its own dict; the two dicts are intentionally not shared.
neuralprophet_params = {"seasonality_mode": "multiplicative", "learning_rate": 0.1}
prophet_params = {"seasonality_mode": "multiplicative"}
model_classes_and_params = [
    (NeuralProphetModel, neuralprophet_params),
    (ProphetModel, prophet_params),
]

Note: Since all classes used in the benchmark framework are dataclasses,
they come with a readable printed representation, so we can inspect them whenever we like:

In [5]:
# Echo the configured (model class, params) pairs to inspect them.
model_classes_and_params

[(tot.models_neuralprophet.NeuralProphetModel,
  {'seasonality_mode': 'multiplicative', 'learning_rate': 0.1}),
 (tot.models_simple.ProphetModel, {'seasonality_mode': 'multiplicative'})]

## 1. SimpleBenchmark
Setting up a series of train-test experiments is quick:

In [6]:
# Metrics to compute for every experiment of the simple benchmark.
simple_metrics = ["MAE", "MSE", "MASE", "RMSE"]

# The benchmark iterates over every dataset in `dataset_list` and every
# (model class, params) tuple in `model_classes_and_params`.
benchmark = SimpleBenchmark(
    datasets=dataset_list,
    model_classes_and_params=model_classes_and_params,
    test_percentage=0.25,  # presumably the share of each series held out for testing
    metrics=simple_metrics,
)

# run() returns two result tables: one for the train split, one for the test split.
simple_results = benchmark.run()
results_train, results_test = simple_results

19:37:44 - cmdstanpy - INFO - Chain [1] start processing
19:37:44 - cmdstanpy - INFO - Chain [1] done processing
19:38:23 - cmdstanpy - INFO - Chain [1] start processing
19:38:23 - cmdstanpy - INFO - Chain [1] done processing


In [7]:
# Show the test-split metrics returned by the SimpleBenchmark run (one row per dataset/model pair).
results_test

Unnamed: 0,data,model,params,experiment,MAE,MSE,MASE,RMSE
0,air_passengers,NeuralProphet,"{'seasonality_mode': 'multiplicative', 'learni...",air_passengers_NeuralProphet_seasonality_mode_...,25.270485,865.87146,1.243764,29.425694
1,air_passengers,Prophet,"{'seasonality_mode': 'multiplicative', '_data_...",air_passengers_Prophet_seasonality_mode_multip...,29.798283,1141.566895,1.466613,33.787083
2,peyton_manning,NeuralProphet,"{'seasonality_mode': 'multiplicative', 'learni...",peyton_manning_NeuralProphet_seasonality_mode_...,0.388522,0.357514,1.277334,0.597925
3,peyton_manning,Prophet,"{'seasonality_mode': 'multiplicative', '_data_...",peyton_manning_Prophet_seasonality_mode_multip...,0.435576,0.327547,1.432033,0.572317


## 2. CrossValidationBenchmark
Setting up a series of cross-validated experiments is just as simple:

In [8]:
# Metrics to compute for every fold of the cross-validation benchmark.
cv_metrics = ["MASE", "RMSE"]

# Same datasets and models as before, now evaluated over several folds.
benchmark_cv = CrossValidationBenchmark(
    datasets=dataset_list,
    model_classes_and_params=model_classes_and_params,
    metrics=cv_metrics,
    num_folds=3,
    fold_overlap_pct=0,
    test_percentage=0.10,
)

# run() returns three tables: a summary over folds, plus train and test results.
cv_results = benchmark_cv.run()
results_summary, results_train, results_test = cv_results

19:39:06 - cmdstanpy - INFO - Chain [1] start processing
19:39:06 - cmdstanpy - INFO - Chain [1] done processing
19:39:07 - cmdstanpy - INFO - Chain [1] start processing
19:39:07 - cmdstanpy - INFO - Chain [1] done processing
19:39:08 - cmdstanpy - INFO - Chain [1] start processing
19:39:08 - cmdstanpy - INFO - Chain [1] done processing
19:40:53 - cmdstanpy - INFO - Chain [1] start processing
19:40:54 - cmdstanpy - INFO - Chain [1] done processing
19:40:56 - cmdstanpy - INFO - Chain [1] start processing
19:40:56 - cmdstanpy - INFO - Chain [1] done processing
19:40:58 - cmdstanpy - INFO - Chain [1] start processing
19:40:59 - cmdstanpy - INFO - Chain [1] done processing


We now also get a summary DataFrame showing the metrics' mean and standard deviation over all folds.

In [9]:
# Show the cross-validation summary: metric values with matching *_std columns, labelled by split.
results_summary

Unnamed: 0,data,model,params,experiment,MASE,RMSE,MASE_std,RMSE_std,split
0,air_passengers,NeuralProphet,"{'seasonality_mode': 'multiplicative', 'learni...",air_passengers_NeuralProphet_seasonality_mode_...,0.281459,7.611626,0.015282,0.766766,train
1,air_passengers,Prophet,"{'seasonality_mode': 'multiplicative', '_data_...",air_passengers_Prophet_seasonality_mode_multip...,0.311314,8.628308,0.020758,1.253572,train
2,peyton_manning,NeuralProphet,"{'seasonality_mode': 'multiplicative', 'learni...",peyton_manning_NeuralProphet_seasonality_mode_...,1.131617,0.492711,0.011406,0.013287,train
3,peyton_manning,Prophet,"{'seasonality_mode': 'multiplicative', '_data_...",peyton_manning_Prophet_seasonality_mode_multip...,1.117091,0.48272,0.017666,0.009143,train
0,air_passengers,NeuralProphet,"{'seasonality_mode': 'multiplicative', 'learni...",air_passengers_NeuralProphet_seasonality_mode_...,0.919185,23.011145,0.366786,8.233665,test
1,air_passengers,Prophet,"{'seasonality_mode': 'multiplicative', '_data_...",air_passengers_Prophet_seasonality_mode_multip...,0.898355,22.903313,0.157018,4.120811,test
2,peyton_manning,NeuralProphet,"{'seasonality_mode': 'multiplicative', 'learni...",peyton_manning_NeuralProphet_seasonality_mode_...,1.724643,0.688672,0.613771,0.234687,test
3,peyton_manning,Prophet,"{'seasonality_mode': 'multiplicative', '_data_...",peyton_manning_Prophet_seasonality_mode_multip...,1.656257,0.67554,0.582417,0.187746,test


The metrics for each fold are also recorded individually:

In [11]:
# Show the test-split results of the cross-validation run; each metric cell holds one value per fold.
results_test

Unnamed: 0,data,model,params,experiment,MASE,RMSE
0,air_passengers,NeuralProphet,"{'seasonality_mode': 'multiplicative', 'learni...",air_passengers_NeuralProphet_seasonality_mode_...,"[1.0804276, 0.4115997, 1.2655276]","[26.26348, 11.702176, 31.067776]"
1,air_passengers,Prophet,"{'seasonality_mode': 'multiplicative', '_data_...",air_passengers_Prophet_seasonality_mode_multip...,"[1.0324912, 0.67802984, 0.9845448]","[24.708176, 17.20208, 26.79968]"
2,peyton_manning,NeuralProphet,"{'seasonality_mode': 'multiplicative', 'learni...",peyton_manning_NeuralProphet_seasonality_mode_...,"[2.5805354, 1.4218292, 1.1715646]","[1.0201913, 0.53664005, 0.5091859]"
3,peyton_manning,Prophet,"{'seasonality_mode': 'multiplicative', '_data_...",peyton_manning_Prophet_seasonality_mode_multip...,"[2.3569856, 1.6807915, 0.9309951]","[0.9267114, 0.6245031, 0.47540453]"


## 3. Manual Benchmark
If you need more control over the individual Experiments, you can set them up manually:

In [13]:
from tot.experiment import SimpleExperiment, CrossValidationExperiment
from tot.benchmark import ManualBenchmark, ManualCVBenchmark

### 3.1 ManualBenchmark: Manual SimpleExperiment Benchmark

In [14]:
# Metrics computed for every manually configured experiment (reused by later cells).
metrics = ["MAE", "MSE", "RMSE", "MASE", "RMSSE", "MAPE", "SMAPE"]

# One row per experiment:
# (model class, model params, DataFrame, dataset name, frequency, test percentage).
# Every row carries its own params dict, and a fresh Dataset is built per
# experiment below, so no params dict or Dataset object is shared between experiments.
experiment_specs = [
    (
        NeuralProphetModel,
        {"seasonality_mode": "multiplicative", "learning_rate": 0.1},
        air_passengers_df,
        "air_passengers",
        "MS",
        0.25,
    ),
    (
        ProphetModel,
        {"seasonality_mode": "multiplicative"},
        air_passengers_df,
        "air_passengers",
        "MS",
        0.25,
    ),
    (
        NeuralProphetModel,
        {"learning_rate": 0.1},
        peyton_manning_df,
        "peyton_manning",
        "D",
        0.15,
    ),
    (
        ProphetModel,
        {},
        peyton_manning_df,
        "peyton_manning",
        "D",
        0.15,
    ),
]

# Build the experiments in the order listed above.
experiments = [
    SimpleExperiment(
        model_class=model_class,
        params=model_params,
        data=Dataset(df=frame, name=dataset_name, freq=freq),
        metrics=metrics,
        test_percentage=test_share,
    )
    for model_class, model_params, frame, dataset_name, freq, test_share in experiment_specs
]

# Run all experiments; run() returns the train and test result tables.
benchmark = ManualBenchmark(experiments=experiments, metrics=metrics)
results_train, results_test = benchmark.run()

19:47:10 - cmdstanpy - INFO - Chain [1] start processing
19:47:10 - cmdstanpy - INFO - Chain [1] done processing
19:47:51 - cmdstanpy - INFO - Chain [1] start processing
19:47:52 - cmdstanpy - INFO - Chain [1] done processing


In [15]:
# Show the test-split metrics of the four manually configured experiments.
results_test

Unnamed: 0,data,model,params,experiment,MAE,MSE,RMSE,MASE,RMSSE,MAPE,SMAPE
0,air_passengers,NeuralProphet,"{'seasonality_mode': 'multiplicative', 'learni...",air_passengers_NeuralProphet_seasonality_mode_...,25.270485,865.87146,29.425694,1.243764,1.138945,6.051496,2.906882
1,air_passengers,Prophet,"{'seasonality_mode': 'multiplicative', '_data_...",air_passengers_Prophet_seasonality_mode_multip...,29.798283,1141.566895,33.787083,1.466613,1.307756,7.474031,3.55934
2,peyton_manning,NeuralProphet,"{'learning_rate': 0.1, '_data_params': {'freq'...",peyton_manning_NeuralProphet_learning_rate_0.1...,0.684821,0.592419,0.769687,2.226078,1.624652,8.813568,4.197721
3,peyton_manning,Prophet,{'_data_params': {'freq': 'D'}},peyton_manning_Prophet__data_params_{freq_ D},0.602923,0.472586,0.687449,1.959859,1.451063,7.778729,3.730783


### 3.2 ManualCVBenchmark: Manual CrossValidationExperiment Benchmark

In [16]:
# Re-read the air passengers CSV so this cell works on a freshly loaded DataFrame.
# NOTE(review): presumably guards against earlier cells having modified the frame — confirm.
air_passengers_df = pd.read_csv(data_location + "air_passengers.csv")

# (model class, model params) for each cross-validated experiment; one params dict per model.
cv_model_specs = [
    (NeuralProphetModel, {"seasonality_mode": "multiplicative", "learning_rate": 0.1}),
    (ProphetModel, {"seasonality_mode": "multiplicative"}),
]

# Both experiments use the same data and fold settings; a fresh Dataset is built for each.
# `metrics` is the list defined in the ManualBenchmark cell.
experiments = [
    CrossValidationExperiment(
        model_class=model_class,
        params=model_params,
        data=Dataset(df=air_passengers_df, name="air_passengers", freq="MS"),
        metrics=metrics,
        test_percentage=0.10,
        num_folds=3,
        fold_overlap_pct=0,
    )
    for model_class, model_params in cv_model_specs
]

# Run the experiments; run() returns a summary table plus train and test results.
benchmark_cv = ManualCVBenchmark(experiments=experiments, metrics=metrics)
results_summary, results_train, results_test = benchmark_cv.run()

19:48:29 - cmdstanpy - INFO - Chain [1] start processing
19:48:29 - cmdstanpy - INFO - Chain [1] done processing
19:48:30 - cmdstanpy - INFO - Chain [1] start processing
19:48:30 - cmdstanpy - INFO - Chain [1] done processing
19:48:31 - cmdstanpy - INFO - Chain [1] start processing
19:48:32 - cmdstanpy - INFO - Chain [1] done processing


In [17]:
# Show the summary of the manual cross-validation run: metric values with matching *_std columns, labelled by split.
results_summary

Unnamed: 0,data,model,params,experiment,MAE,MSE,RMSE,MASE,RMSSE,MAPE,SMAPE,MAE_std,MSE_std,RMSE_std,MASE_std,RMSSE_std,MAPE_std,SMAPE_std,split
0,air_passengers,NeuralProphet,"{'seasonality_mode': 'multiplicative', 'learni...",air_passengers_NeuralProphet_seasonality_mode_...,6.005144,58.52478,7.611626,0.281459,0.277852,3.026435,1.50169,0.659361,11.457359,0.766766,0.015282,0.00942,0.111288,0.05542,train
1,air_passengers,Prophet,"{'seasonality_mode': 'multiplicative', '_data_...",air_passengers_Prophet_seasonality_mode_multip...,6.664346,76.019157,8.628308,0.311314,0.31376,3.095889,1.556583,0.9444,20.777409,1.253572,0.020758,0.023332,0.253554,0.128683,train
0,air_passengers,NeuralProphet,"{'seasonality_mode': 'multiplicative', 'learni...",air_passengers_NeuralProphet_seasonality_mode_...,19.813911,597.305969,23.011145,0.919185,0.838727,4.712596,2.361495,8.777608,344.401245,8.233665,0.366786,0.286632,1.946631,1.008415,test
1,air_passengers,Prophet,"{'seasonality_mode': 'multiplicative', '_data_...",air_passengers_Prophet_seasonality_mode_multip...,19.145828,541.542786,22.903313,0.898355,0.839961,4.633598,2.285497,3.770983,179.16922,4.120811,0.157018,0.154204,0.681187,0.339349,test
