# Forecasting with Machine Learning

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from utilsforecast.plotting import plot_series
from utilsforecast.evaluation import evaluate
from utilsforecast.losses import *

from functools import partial

import warnings
warnings.filterwarnings("ignore")

In [None]:
def plot_metrics_bar(eval_df):
    """Draw a bar chart from the first row of an evaluation table.

    The first column of ``eval_df`` is skipped; every remaining column is
    treated as one method and its value in the first row as that method's
    score. Bars are ordered from the largest value to the smallest and each
    bar is annotated with its value formatted to three decimals. The y-axis
    label is fixed to "Mean absolute error (MAE)".

    Args:
        eval_df: DataFrame whose first column is a label column and whose
            other columns each hold one numeric score per row.
    """
    method_names = eval_df.columns[1:]
    first_row_scores = eval_df.iloc[0].values[1:]

    # Largest value first.
    ranked = sorted(
        zip(method_names, first_row_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    methods_sorted, values_sorted = zip(*ranked)

    plt.figure(figsize=(10, 6))
    bars = plt.bar(methods_sorted, values_sorted)

    # Write each value slightly above its bar, centred horizontally.
    for rect, score in zip(bars, values_sorted):
        x_center = rect.get_x() + rect.get_width()/2
        y_text = rect.get_height() + 0.05
        plt.text(x_center, y_text, f'{score:.3f}',
                 ha='center', va='bottom', fontweight='bold')

    plt.xlabel('Methods')
    plt.ylabel('Mean absolute error (MAE)')
    plt.tight_layout()

    plt.show()

In [None]:
# Read the bakery sales file, parsing "ds" as dates, and keep only the
# series (grouped by unique_id) that have at least 28 rows.
data_url = "https://raw.githubusercontent.com/marcopeix/youtube_tutorials/refs/heads/main/data/daily_sales_french_bakery.csv"
df = (
    pd.read_csv(data_url, parse_dates=["ds"])
    .groupby('unique_id')
    .filter(lambda series_rows: len(series_rows) >= 28)
)
df.head()

In [None]:
# Plot the BAGUETTE and CROISSANT series from the loaded data.
plot_series(
    df=df,
    ids=["BAGUETTE", "CROISSANT"],
    palette="viridis",
)

In [None]:
# Same two series, with the in-sample length capped at 56 observations.
plot_series(
    df=df,
    ids=["BAGUETTE", "CROISSANT"],
    max_insample_length=56,
    palette="viridis",
)

## Baseline model

In [None]:
from statsforecast import StatsForecast
from statsforecast.models import SeasonalNaive

In [None]:
# Forecast horizon: passed as `h` (and as `step_size`) to every
# cross-validation call in this notebook.
horizon = 7

In [None]:
# Work on two series only.
unique_ids = ["BAGUETTE", "CROISSANT"]
small_df = df[df["unique_id"].isin(unique_ids)]

# Name the seasonal period and the number of windows once, so the model,
# the cross-validation call, the hold-out split and the MASE scaling below
# cannot drift apart.
season_length = 7
n_windows = 8

models = [
    SeasonalNaive(season_length=season_length),
]

sf = StatsForecast(models=models, freq="D")
baseline_cv_df = sf.cross_validation(
    h=horizon,
    df=small_df,
    n_windows=n_windows,
    step_size=horizon,
    refit=True,
)

# step_size equals h, so the windows do not overlap and together cover the
# last horizon * n_windows rows of each series. Those rows are held out;
# what remains is the training portion passed to `evaluate` as `train_df`.
temp_test = small_df.groupby("unique_id").tail(horizon * n_windows)
eval_train_df = small_df.drop(temp_test.index).reset_index(drop=True)

evaluation = evaluate(
    baseline_cv_df.drop(["cutoff"], axis=1),
    metrics=[mae, partial(mase, seasonality=season_length)],
    train_df=eval_train_df,
)
# Average each metric over the series.
evaluation = evaluation.drop(['unique_id'], axis=1).groupby('metric').mean().reset_index()
evaluation

## Machine learning models

In [None]:
from mlforecast import MLForecast

import lightgbm as lgb
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Candidate regressors, keyed by the short names later cells use to refer
# to them (e.g. "lgbm", "knn").
models = {
    "lgbm": lgb.LGBMRegressor(verbosity=-1),
    "lasso": Lasso(),
    "lin_reg": LinearRegression(),
    "ridge": Ridge(),
    "knn": KNeighborsRegressor(),
    "gbr": GradientBoostingRegressor(),
}

### Building features
#### Lags

In [None]:
# NOTE(review): `full_eval_df` is first assigned further down in the visible
# cells; presumably the cell that builds it for this section is missing from
# this export — confirm before running top to bottom.
plot_series(
    df=small_df,
    forecasts_df=full_eval_df.drop(columns=["y", "cutoff"]),
    ids=["BAGUETTE", "CROISSANT"],
    models=["lgbm", "knn"],
    max_insample_length=140,
    palette="viridis",
)

#### Lag transformations

In [None]:
from mlforecast.lag_transforms import RollingMean, ExpandingMean

In [None]:
# NOTE(review): `mlf` is not assigned in the visible cells before this
# point; presumably the cell that builds it is missing from this export —
# confirm.
ml_cv_df = mlf.cross_validation(
    df=small_df,
    h=horizon,
    n_windows=8,
    step_size=horizon,
    refit=False,
    static_features=[],
)

# Attach the ML forecasts to the baseline rows, matching on series and date.
full_eval_df = baseline_cv_df.merge(
    ml_cv_df.drop(columns=["cutoff", "y"]),
    how="left",
    on=["unique_id", "ds"],
)

full_evaluation = evaluate(
    full_eval_df.drop(columns=["cutoff"]),
    metrics=[mae, partial(mase, seasonality=7)],
    train_df=eval_train_df,
)
# Average each metric over the series.
full_evaluation = (
    full_evaluation
    .drop(columns=['unique_id'])
    .groupby('metric')
    .mean()
    .reset_index()
)
full_evaluation

In [None]:
# Bar chart of the first row of the evaluation table, one bar per method.
plot_metrics_bar(eval_df=full_evaluation)

In [None]:
# Overlay the "gbr" and "lasso" cross-validation forecasts on the two series.
plot_series(
    df=small_df,
    forecasts_df=full_eval_df.drop(columns=["y", "cutoff"]),
    ids=["BAGUETTE", "CROISSANT"],
    models=["gbr", "lasso"],
    max_insample_length=140,
    palette="viridis",
)

#### Date features

In [None]:
# NOTE(review): `mlf` is not reassigned in the visible cells between the
# previous cross-validation call and this one; unless a cell missing from
# this export rebuilds it, this repeats the previous run — confirm.
ml_cv_df = mlf.cross_validation(
    df=small_df,
    h=horizon,
    n_windows=8,
    step_size=horizon,
    refit=False,
    static_features=[],
)

# Attach the ML forecasts to the baseline rows, matching on series and date.
full_eval_df = baseline_cv_df.merge(
    ml_cv_df.drop(columns=["cutoff", "y"]),
    how="left",
    on=["unique_id", "ds"],
)

full_evaluation = evaluate(
    full_eval_df.drop(columns=["cutoff"]),
    metrics=[mae, partial(mase, seasonality=7)],
    train_df=eval_train_df,
)
# Average each metric over the series.
full_evaluation = (
    full_evaluation
    .drop(columns=['unique_id'])
    .groupby('metric')
    .mean()
    .reset_index()
)
full_evaluation

In [None]:
# Bar chart of the first row of the evaluation table, one bar per method.
plot_metrics_bar(eval_df=full_evaluation)

#### Target transformations

In [None]:
from mlforecast.target_transforms import Differences

In [None]:
# NOTE(review): `mlf` is not reassigned in the visible cells between the
# previous cross-validation call and this one; unless a cell missing from
# this export rebuilds it, this repeats the previous run — confirm.
ml_cv_df = mlf.cross_validation(
    df=small_df,
    h=horizon,
    n_windows=8,
    step_size=horizon,
    refit=False,
    static_features=[],
)

# Attach the ML forecasts to the baseline rows, matching on series and date.
full_eval_df = baseline_cv_df.merge(
    ml_cv_df.drop(columns=["cutoff", "y"]),
    how="left",
    on=["unique_id", "ds"],
)

full_evaluation = evaluate(
    full_eval_df.drop(columns=["cutoff"]),
    metrics=[mae, partial(mase, seasonality=7)],
    train_df=eval_train_df,
)
# Average each metric over the series.
full_evaluation = (
    full_evaluation
    .drop(columns=['unique_id'])
    .groupby('metric')
    .mean()
    .reset_index()
)
full_evaluation

In [None]:
# Bar chart of the first row of the evaluation table, one bar per method.
plot_metrics_bar(eval_df=full_evaluation)

In [None]:
from scipy.fftpack import fft, fftfreq

def analyze_frequencies(signal, sampling_rate, top_n=5):
    """Return the strongest positive-frequency components of a 1-D signal.

    The signal is transformed with an FFT. The zero-frequency (mean) bin and
    the negative-frequency half are discarded, and the remaining bins are
    ranked by amplitude.

    Args:
        signal: 1-D sequence of samples. Any array-like is accepted
            (pandas Series, NumPy array, list).
        sampling_rate: Number of samples per unit of time. The returned
            frequencies are expressed in cycles per that same unit.
        top_n: How many components to return. Defaults to 5.

    Returns:
        tuple: ``(top_frequencies, top_amplitudes)``, two NumPy arrays
        ordered from the largest amplitude to the smallest. They hold fewer
        than ``top_n`` entries when the signal has fewer positive-frequency
        bins than that.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # np.asarray instead of `.values` so plain arrays and lists work too.
    samples = np.asarray(signal)
    n = len(samples)
    fft_values = fft(samples)

    freq = fftfreq(n, 1/sampling_rate)

    # Bins 1 .. n//2 - 1: skip bin 0 (the mean) and the negative half.
    positive_freq_idx = np.arange(1, n//2)
    frequencies = freq[positive_freq_idx]
    # 2/n scaling turns the one-sided FFT magnitude into a sinusoid amplitude.
    amplitudes = 2.0/n * np.abs(fft_values[positive_freq_idx])

    # Descending by amplitude, then keep the first top_n.
    top_indices = np.argsort(amplitudes)[::-1][:top_n]

    top_frequencies = frequencies[top_indices]
    top_amplitudes = amplitudes[top_indices]

    return top_frequencies, top_amplitudes

### Prediction intervals

In [None]:
from mlforecast.utils import PredictionIntervals

In [None]:
# Select ML models


# Initialize MLForecast


# Run cross-validation


# NOTE(review): the three steps above are empty in this export, so
# `ml_prob_cv_df` is never assigned in the visible cells — confirm the
# cross-validation call that is meant to produce it.
ml_prob_cv_df.head()

In [None]:
# Score the "ridge" and "lasso" forecasts at the 80 level with scaled CRPS.
models = ["ridge", "lasso"]
metrics = [scaled_crps]

evaluation = evaluate(
    ml_prob_cv_df.drop(columns=["ds", "cutoff"]),
    metrics=metrics,
    models=models,
    level=[80],
)
# Average each metric over the series.
evaluation = (
    evaluation
    .drop(columns=['unique_id'])
    .groupby('metric')
    .mean()
    .reset_index()
)
evaluation

In [None]:
# Plot the "ridge" forecasts together with their 80 level interval.
plot_series(
    df=small_df,
    forecasts_df=ml_prob_cv_df.drop(columns=["y", "cutoff"]),
    ids=["BAGUETTE", "CROISSANT"],
    models=["ridge"],
    max_insample_length=140,
    level=[80],
    palette="viridis",
)

### One model per step in the horizon

In [None]:
# Run cross-validation
# NOTE(review): no cross-validation call is present in this cell, so
# `ml_cv_df` below is whatever an earlier cell assigned — confirm.


# Attach the ML forecasts to the baseline rows, matching on series and date.
full_eval_df = baseline_cv_df.merge(
    ml_cv_df.drop(columns=["cutoff", "y"]),
    how="left",
    on=["unique_id", "ds"],
)

full_evaluation = evaluate(
    full_eval_df.drop(columns=["cutoff"]),
    metrics=[mae, partial(mase, seasonality=7)],
    train_df=eval_train_df,
)
# Average each metric over the series.
full_evaluation = (
    full_evaluation
    .drop(columns=['unique_id'])
    .groupby('metric')
    .mean()
    .reset_index()
)
full_evaluation

In [None]:
# Bar chart of the first row of the evaluation table, one bar per method.
plot_metrics_bar(eval_df=full_evaluation)

## Hyperparameter optimization
### Tuning the model

In [None]:
import optuna
from mlforecast.auto import AutoMLForecast, AutoLasso

In [None]:
# init config


# fit config


# Initialize AutoMLForecast


# Fit
# NOTE(review): the three steps above are empty in this export, so
# `auto_mlf` is not assigned before this call in the visible cells — confirm.
auto_mlf.fit(
    df=small_df,
    n_windows=8,
    h=horizon,
    step_size=horizon,
    num_samples=10
)

In [None]:
# Model hyperparameters stored on the best trial of the "auto_lasso" search.
optimized_lasso_config = (
    auto_mlf.results_['auto_lasso']
    .best_trial
    .user_attrs['config']['model_params']
)

# Compare a default Lasso with the tuned one on the same feature set.
mlf = MLForecast(
    models={
        "default_lasso": Lasso(),
        "optimized_lasso": Lasso(**optimized_lasso_config),
    },
    freq='D',
    lags=range(1, 8),
    lag_transforms={
        1: [ExpandingMean()],
        7: [RollingMean(window_size=7)],
    },
)

ml_cv_df = mlf.cross_validation(
    df=small_df,
    h=horizon,
    n_windows=8,
    step_size=horizon,
    refit=False,
    static_features=[],
)

# Attach the ML forecasts to the baseline rows, matching on series and date.
full_eval_df = baseline_cv_df.merge(
    ml_cv_df.drop(columns=["cutoff", "y"]),
    how="left",
    on=["unique_id", "ds"],
)

full_evaluation = evaluate(
    full_eval_df.drop(columns=["cutoff"]),
    metrics=[mae, partial(mase, seasonality=7)],
    train_df=eval_train_df,
)
# Average each metric over the series.
full_evaluation = (
    full_evaluation
    .drop(columns=['unique_id'])
    .groupby('metric')
    .mean()
    .reset_index()
)
full_evaluation

### Tuning the features

In [None]:
from mlforecast.lag_transforms import ExpandingStd

In [None]:
# init config


def fit_config(trial):
    """Return the fit-time settings, which are the same for every trial.

    Args:
        trial: Accepted to match the expected callback signature; not used.

    Returns:
        dict: ``{"static_features": []}``, with a new empty list on each call.
    """
    no_static_features = []
    return {"static_features": no_static_features}

# NOTE(review): `tune_init_config` is not defined in the visible cells (the
# "init config" step of this cell is empty in this export) — confirm where
# it comes from.
auto_mlf = AutoMLForecast(
    models={"auto_lasso": AutoLasso()},
    freq='D',
    init_config=tune_init_config,
    fit_config=fit_config
)

# Run the search with num_samples=15, using the same 8-window layout as the
# other cross-validation calls in this notebook.
auto_mlf.fit(
    df=small_df,
    n_windows=8,
    h=horizon,
    step_size=horizon,
    num_samples=15
)

In [None]:
# Display the full 'config' entry stored on the best trial of the
# "auto_lasso" search.
auto_mlf.results_['auto_lasso'].best_trial.user_attrs['config']

In [None]:
# From the best trial, take the tuned model hyperparameters and the tuned
# MLForecast constructor arguments.
optimized_lasso_config = (
    auto_mlf.results_['auto_lasso']
    .best_trial
    .user_attrs['config']['model_params']
)
optimized_feats = (
    auto_mlf.results_['auto_lasso']
    .best_trial
    .user_attrs['config']['mlf_init_params']
)

# Compare a default Lasso with the tuned one, both on the tuned features.
mlf = MLForecast(
    models={
        "default_lasso": Lasso(),
        "optimized_lasso": Lasso(**optimized_lasso_config),
    },
    freq='D',
    **optimized_feats,
)

ml_cv_df = mlf.cross_validation(
    df=small_df,
    h=horizon,
    n_windows=8,
    step_size=horizon,
    refit=False,
    static_features=[],
)

# Attach the ML forecasts to the baseline rows, matching on series and date.
full_eval_df = baseline_cv_df.merge(
    ml_cv_df.drop(columns=["cutoff", "y"]),
    how="left",
    on=["unique_id", "ds"],
)

full_evaluation = evaluate(
    full_eval_df.drop(columns=["cutoff"]),
    metrics=[mae, partial(mase, seasonality=7)],
    train_df=eval_train_df,
)
# Average each metric over the series.
full_evaluation = (
    full_evaluation
    .drop(columns=['unique_id'])
    .groupby('metric')
    .mean()
    .reset_index()
)
full_evaluation