# Evaluation

## Imports

In [1]:
import pandas as pd
import numpy as np

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from xgboost import XGBRegressor

from tqdm.notebook import tqdm
import json
import pickle
from typing import List, Dict, Union, Tuple

# One seed value for the notebook, kept as a named constant so it can be reused.
RANDOM_STATE = 42
# Seed NumPy's legacy global generator so np.random.* draws repeat across runs.
np.random.seed(seed=RANDOM_STATE)

In [2]:
import sys
import os

# Make the project root importable so the `scripts` imports that follow resolve.
# A notebook kernel defines no `__file__`, so the current working directory
# stands in for the notebook's folder (assumes the kernel was started there).
notebook_dir = os.getcwd()
# Two levels up from the notebook, collapsed to a canonical absolute path
# instead of leaving literal '..' segments on sys.path.
scripts_dir = os.path.abspath(os.path.join(notebook_dir, os.pardir, os.pardir))
# Guard the append so re-running this cell does not stack duplicate entries.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

from scripts.utils import *
from scripts.optimization import BayesianOptimization, BayesianOptimizationConfig

## Reporting the Model Families and our Best Model

Overall results reporting.
- State and justify the evaluation metrics you chose.
- Provide at least one overall summary of results that compares the best model from each family you used, in a clear, concise table.
- If comparing an evaluation metric between model families (e.g. comparing accuracy of support vector machines vs logistic regression), do not use just the result of a single training/test split: you must report the mean metric across multiple cross-validation folds (typically 5-fold CV), along with the standard deviation of the metric.

In [3]:
# Load the ARIMA parameter triples stored in ARIMA_top3.json.
# NOTE(review): presumably these are (p, d, q) orders — confirm against the
# notebook that wrote the file.
# The encoding is pinned because open() otherwise falls back to the platform
# locale, which can differ between machines.
with open("ARIMA_top3.json", 'r', encoding="utf-8") as f:
    arima_params = json.load(f)
arima_params

[[4, 2, 3], [7, 2, 5], [9, 2, 4]]

In [6]:
# Load the stored ARIMA summary from ARIMA_params.json.
# NOTE(review): the 'mean'/'std' values look like cross-validation score
# statistics, but the metric is not identified here — confirm at the source.
# The encoding is pinned because open() otherwise falls back to the platform
# locale, which can differ between machines.
with open("ARIMA_params.json", 'r', encoding="utf-8") as f:
    d = json.load(f)
d

{'params': [7, 1, 3], 'mean': 255283624.6019246, 'std': 134643144.14765683}

In [5]:
# Restore the object pickled in xgboostforecaster.pkl and show what its
# best_params() method returns.
# SECURITY: pickle.load executes arbitrary code embedded in the file. Only load
# an artefact produced by this project; never one from an untrusted source.
# NOTE(review): unpickling needs the object's class to be importable; presumably
# that is the BayesianOptimization class imported from scripts.optimization —
# confirm.
with open('xgboostforecaster.pkl', 'rb') as picklefile:
    bayesian_opt = pickle.load(picklefile)
bayesian_opt.best_params()

{'params': {'n_estimators': 584,
  'max_depth': 6,
  'learning_rate': 0.545,
  'subsample': 0.776,
  'colsample_bytree': 0.549,
  'gamma': 0.386,
  'reg_alpha': 0.833,
  'reg_lambda': 0.973},
 'lag': 6,
 'differentiation': None,
 'mean': 187443293.5457216,
 'std': 97598006.91018973}

## Deep Dive on the Best-Performing Model
### Feature Importance and Ablation Analysis

### Sensitivity Analysis

### Trade-Offs Between Metrics

## Failure Analysis