# Baseline Methods Benchmarking for M4 Dataset

## Setup

In [2]:
import sys
import os

# Go up one level to project root
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

In [3]:
from data_provider.data_factory import data_provider
from types import SimpleNamespace
from utils.tools import visual
from data_provider.m4 import M4Meta
from utils.m4_summary import M4Summary
import numpy as np
import pandas as pd
import torch

DATA_PATH = '../../Time-Series-Library/dataset/m4'     # Replace with actual dataset path

In [4]:
def get_args(seasonal_pattern: str='Hourly') -> SimpleNamespace:
    return SimpleNamespace(
        data='m4',
        root_path=DATA_PATH,
        data_path=f'{seasonal_pattern}-train.csv',
        features='M',
        target='value',
        freq='h',                                         # Inferred from 'Hourly'
        seq_len=36,                                       # Typical M4 short-term setting
        label_len=18,                                     # Used for decoder input, not needed for naive
        pred_len=M4Meta.horizons_map[seasonal_pattern],   # Forecast horizon
        embed='timeF',
        seasonal_patterns=seasonal_pattern,
        batch_size=1,
        num_workers=0,
    )

def get_data(args: SimpleNamespace):
    _, train_loader = data_provider(args, flag='train')
    _, test_loader = data_provider(args, flag='test')

    x = train_loader.dataset.timeseries                 # list of all time series
    # True future values from test set
    y = test_loader.dataset.timeseries                  # shape: [B, pred_len]

    # x = torch.tensor(x, dtype=torch.float32)
    x = [torch.tensor(s, dtype=torch.float32) for s in x]
    y = torch.tensor(y, dtype=torch.float32)
    return x, y, test_loader

def evaluate(model: str, args: SimpleNamespace, x: list[torch.Tensor], y: torch.Tensor, test_loader, preds: torch.Tensor):
    x_np = [s.numpy() for s in x]
    preds_np = preds.numpy()
    test_path = f'./test_results/m4_{model}_{args.seasonal_patterns}/'
    os.makedirs(test_path, exist_ok=True)

    for i in range(0, preds.shape[0], preds.shape[0] // 10):
        gt = np.concatenate((x_np[i], y[i].numpy()), axis=0)
        prd = np.concatenate((x_np[i], preds_np[i]), axis=0)
        visual(gt, prd, os.path.join(f'./test_results/m4_{model}_{args.seasonal_patterns}/', f'{i}.pdf'))
    
    folder_path = './m4_results/' + model + '/'
    os.makedirs(folder_path, exist_ok=True)

    # Save as DataFrame with proper ID indexing
    forecasts_df = pd.DataFrame(preds_np, columns=[f'V{i + 1}' for i in range(args.pred_len)])
    forecasts_df.to_csv(os.path.join(folder_path, args.seasonal_patterns + '_forecast.csv'), index=False)

    required_files = {
        'Weekly_forecast.csv',
        'Monthly_forecast.csv',
        'Yearly_forecast.csv',
        'Daily_forecast.csv',
        'Hourly_forecast.csv',
        'Quarterly_forecast.csv',
    }

    if required_files.issubset(set(os.listdir(folder_path))):
        m4_summary = M4Summary(folder_path, args.root_path)
        smape_results, owa_results, mape, mase = m4_summary.evaluate()
        print('smape:', smape_results)
        print('mape:', mape)
        print('mase:', mase)
        print('owa:', owa_results)
    else:
        print('After all 6 tasks are finished, you can calculate the averaged index.')


## Naive Predictions

We have 3 types: last value, mean value, and seasonal (predicting using points from the last cycle)

In [4]:
def naive_last(x, pred_len):
    return torch.stack([
        torch.full((pred_len,), s[-1].item()) for s in x
    ])

def naive_mean(x, pred_len):
    return torch.stack([
        torch.full((pred_len,), s.mean().item()) for s in x
    ])

def naive_seasonal(x, pred_len, season_len):
    preds = []
    for s in x:
        if len(s) < season_len:
            raise ValueError("Time series too short for given season length")
        # Repeat the last season values to fill pred_len
        last_season = s[-season_len:]
        num_repeats = (pred_len + season_len - 1) // season_len  # ceil division
        repeated = last_season.repeat(num_repeats)[:pred_len]
        preds.append(repeated)
    return torch.stack(preds)

In [5]:
print('Naive Last')
for pattern in ["Monthly", "Yearly", "Quarterly", "Weekly", "Daily", "Hourly"]:
    args = get_args(pattern)
    x, y, test_loader = get_data(args)
    last_preds = naive_last(x, args.pred_len)
    evaluate('last_naive', args, x, y, test_loader, last_preds)

Naive Last
train 48000
test 48000
smape: {'Yearly': 16.342, 'Quarterly': 11.61, 'Monthly': 15.256, 'Others': 6.793, 'Average': 14.208}
mape: {'Yearly': 17.507, 'Quarterly': 13.182, 'Monthly': 19.014, 'Others': 7.33, 'Average': 16.684}
mase: {'Yearly': 3.974, 'Quarterly': 1.477, 'Monthly': 1.205, 'Others': 3.932, 'Average': 2.044}
owa: {'Yearly': 1.0, 'Quarterly': 1.066, 'Monthly': 1.095, 'Others': 1.335, 'Average': 1.058}
train 23000
test 23000
smape: {'Yearly': 16.342, 'Quarterly': 11.61, 'Monthly': 15.256, 'Others': 6.793, 'Average': 14.208}
mape: {'Yearly': 17.507, 'Quarterly': 13.182, 'Monthly': 19.014, 'Others': 7.33, 'Average': 16.684}
mase: {'Yearly': 3.974, 'Quarterly': 1.477, 'Monthly': 1.205, 'Others': 3.932, 'Average': 2.044}
owa: {'Yearly': 1.0, 'Quarterly': 1.066, 'Monthly': 1.095, 'Others': 1.335, 'Average': 1.058}
train 24000
test 24000
smape: {'Yearly': 16.342, 'Quarterly': 11.61, 'Monthly': 15.256, 'Others': 6.793, 'Average': 14.208}
mape: {'Yearly': 17.507, 'Quarterly

In [6]:
print('Naive Mean')
for pattern in ["Monthly", "Yearly", "Quarterly", "Weekly", "Daily", "Hourly"]:
    args = get_args(pattern)
    x, y, test_loader = get_data(args)
    mean_preds = naive_mean(x, args.pred_len)
    evaluate('mean_naive', args, x, y, test_loader, mean_preds)

Naive Mean


train 48000
test 48000
smape: {'Yearly': 56.095, 'Quarterly': 40.006, 'Monthly': 35.153, 'Others': 32.123, 'Average': 40.983}
mape: {'Yearly': 48.357, 'Quarterly': 38.6, 'Monthly': 42.625, 'Others': 39.2, 'Average': 42.806}
mase: {'Yearly': 13.421, 'Quarterly': 6.502, 'Monthly': 4.109, 'Others': 36.742, 'Average': 8.457}
owa: {'Yearly': 3.405, 'Quarterly': 4.187, 'Monthly': 3.15, 'Others': 9.175, 'Average': 3.722}
train 23000
test 23000
smape: {'Yearly': 56.095, 'Quarterly': 40.006, 'Monthly': 35.153, 'Others': 32.123, 'Average': 40.983}
mape: {'Yearly': 48.357, 'Quarterly': 38.6, 'Monthly': 42.625, 'Others': 39.2, 'Average': 42.806}
mase: {'Yearly': 13.421, 'Quarterly': 6.502, 'Monthly': 4.109, 'Others': 36.742, 'Average': 8.457}
owa: {'Yearly': 3.405, 'Quarterly': 4.187, 'Monthly': 3.15, 'Others': 9.175, 'Average': 3.722}
train 24000
test 24000
smape: {'Yearly': 56.095, 'Quarterly': 40.006, 'Monthly': 35.153, 'Others': 32.123, 'Average': 40.983}
mape: {'Yearly': 48.357, 'Quarterly': 

In [7]:
print('Naive Seasonal')
for pattern in ["Monthly", "Yearly", "Quarterly", "Weekly", "Daily", "Hourly"]:
    args = get_args(pattern)
    x, y, test_loader = get_data(args)
    seasonal_preds = naive_seasonal(x, args.pred_len, M4Meta.frequency_map[args.seasonal_patterns])
    evaluate('seasonal_naive', args, x, y, test_loader, seasonal_preds)

Naive Seasonal
train 48000
test 48000
smape: {'Yearly': 16.342, 'Quarterly': 12.521, 'Monthly': 15.988, 'Others': 4.384, 'Average': 14.657}
mape: {'Yearly': 17.507, 'Quarterly': 14.193, 'Monthly': 19.223, 'Others': 5.5, 'Average': 16.935}
mase: {'Yearly': 3.974, 'Quarterly': 1.602, 'Monthly': 1.26, 'Others': 3.07, 'Average': 2.057}
owa: {'Yearly': 1.0, 'Quarterly': 1.153, 'Monthly': 1.146, 'Others': 0.945, 'Average': 1.078}
train 23000
test 23000
smape: {'Yearly': 16.342, 'Quarterly': 12.521, 'Monthly': 15.988, 'Others': 4.384, 'Average': 14.657}
mape: {'Yearly': 17.507, 'Quarterly': 14.193, 'Monthly': 19.223, 'Others': 5.5, 'Average': 16.935}
mase: {'Yearly': 3.974, 'Quarterly': 1.602, 'Monthly': 1.26, 'Others': 3.07, 'Average': 2.057}
owa: {'Yearly': 1.0, 'Quarterly': 1.153, 'Monthly': 1.146, 'Others': 0.945, 'Average': 1.078}
train 24000
test 24000
smape: {'Yearly': 16.342, 'Quarterly': 12.521, 'Monthly': 15.988, 'Others': 4.384, 'Average': 14.657}
mape: {'Yearly': 17.507, 'Quarterl

In [5]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [18]:
def ets_forecast(x, pred_len, season_len):
    return torch.stack([
        torch.tensor(
            ExponentialSmoothing(
                s.numpy(),
                trend='add',
                seasonal='add' if season_len > 1 else None,
                seasonal_periods=season_len if season_len > 1 else None
            ).fit().forecast(pred_len),
            dtype=torch.float32
        )
        for s in x
    ])


In [19]:
print('ETS - Exponential Smoothing')
for pattern in ["Monthly", "Yearly", "Quarterly", "Weekly", "Daily", "Hourly"]:
    args = get_args(pattern)
    x, y, test_loader = get_data(args)
    season_len = M4Meta.frequency_map[args.seasonal_patterns]  # e.g., 12 for Monthly
    preds = ets_forecast(x, args.pred_len, season_len)
    evaluate('ets', args, x, y, test_loader, preds)

ETS - Exponential Smoothing
train 48000
test 48000
smape: {'Yearly': 16.391, 'Quarterly': 11.089, 'Monthly': 15.656, 'Others': 6.777, 'Average': 14.285}
mape: {'Yearly': 17.738, 'Quarterly': 12.68, 'Monthly': 18.293, 'Others': 7.28, 'Average': 16.267}
mase: {'Yearly': 3.982, 'Quarterly': 1.417, 'Monthly': 1.002, 'Others': 3.928, 'Average': 1.934}
owa: {'Yearly': 1.002, 'Quarterly': 1.02, 'Monthly': 1.014, 'Others': 1.332, 'Average': 1.032}
train 23000
test 23000
smape: {'Yearly': 16.7, 'Quarterly': 11.089, 'Monthly': 15.656, 'Others': 6.777, 'Average': 14.356}
mape: {'Yearly': 19.47, 'Quarterly': 12.68, 'Monthly': 18.293, 'Others': 7.28, 'Average': 16.666}
mase: {'Yearly': 3.619, 'Quarterly': 1.417, 'Monthly': 1.002, 'Others': 3.928, 'Average': 1.85}
owa: {'Yearly': 0.966, 'Quarterly': 1.02, 'Monthly': 1.014, 'Others': 1.332, 'Average': 1.013}
train 24000
test 24000
smape: {'Yearly': 16.7, 'Quarterly': 11.1, 'Monthly': 15.656, 'Others': 6.777, 'Average': 14.359}
mape: {'Yearly': 19.47,

In [20]:
from pmdarima import auto_arima

def naive2_forecast(x, pred_len, season_len):
    from pmdarima import auto_arima
    preds = []
    for s in x:
        arr = s.numpy()
        if season_len > 1 and len(arr) >= season_len:
            # Seasonal naive
            repeated = arr[-season_len:].repeat((pred_len + season_len - 1) // season_len)[:pred_len]
            preds.append(torch.tensor(repeated, dtype=torch.float32))
        else:
            # ARIMA fallback
            try:
                model = auto_arima(
                    arr,
                    start_p=0, start_q=0,
                    max_p=2, max_q=2,
                    seasonal=False,
                    stepwise=True,
                    suppress_warnings=True,
                    error_action='ignore',
                    maxiter=10
                )
                preds.append(torch.tensor(model.predict(n_periods=pred_len), dtype=torch.float32))
            except:
                # Fallback to naive last if ARIMA fails
                preds.append(torch.full((pred_len,), arr[-1], dtype=torch.float32))
    return torch.stack(preds)

In [21]:
print('Naive2 Forecast')
for pattern in ["Monthly", "Yearly", "Quarterly", "Weekly", "Daily", "Hourly"]:
    args = get_args(pattern)
    x, y, test_loader = get_data(args)
    season_len = M4Meta.frequency_map[args.seasonal_patterns]  # e.g., 12 for Monthly
    preds = naive2_forecast(x, args.pred_len, season_len)
    evaluate('naive2', args, x, y, test_loader, preds)

Naive2 Forecast
train 48000
test 48000
After all 6 tasks are finished, you can calculate the averaged index.
train 23000
test 23000
After all 6 tasks are finished, you can calculate the averaged index.
train 24000
test 24000
After all 6 tasks are finished, you can calculate the averaged index.
train 359
test 359
After all 6 tasks are finished, you can calculate the averaged index.
train 4227
test 4227
After all 6 tasks are finished, you can calculate the averaged index.
train 414
test 414
smape: {'Yearly': 15.339, 'Quarterly': 13.592, 'Monthly': 17.721, 'Others': 6.536, 'Average': 15.623}
mape: {'Yearly': 18.295, 'Quarterly': 15.629, 'Monthly': 21.239, 'Others': 11.477, 'Average': 18.727}
mase: {'Yearly': 3.422, 'Quarterly': 1.748, 'Monthly': 1.478, 'Others': 4.081, 'Average': 2.12}
owa: {'Yearly': 0.9, 'Quarterly': 1.254, 'Monthly': 1.309, 'Others': 1.331, 'Average': 1.13}


In [28]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.deterministic import Fourier
import numpy as np
import torch

def dhr_arima_forecast(x, pred_len, season_len):
    preds = []
    for s in x:
        arr = s.numpy()
        t_idx = np.arange(len(arr))
        t_future = np.arange(len(arr), len(arr) + pred_len)

        # Dynamically set order to ensure 2*order <= season_len
        order = max(1, min(3, season_len // 2))

        try:
            fourier = Fourier(period=season_len, order=order)
            X = fourier.in_sample(index=t_idx)
            X_fore = fourier.in_sample(index=t_future)

            model = SARIMAX(
                arr,
                exog=X,
                order=(1, 0, 0),
                seasonal_order=(0, 0, 0, 0),
                enforce_stationarity=False,
                enforce_invertibility=False
            )
            fit = model.fit(disp=False)
            forecast = fit.forecast(steps=pred_len, exog=X_fore)
            preds.append(torch.tensor(forecast, dtype=torch.float32))
        except:
            preds.append(torch.full((pred_len,), arr[-1], dtype=torch.float32))  # fallback
    return torch.stack(preds)


In [29]:
print('DHR-ARIMA')
for pattern in ["Monthly", "Yearly", "Quarterly", "Weekly", "Daily", "Hourly"]:
    args = get_args(pattern)
    x, y, test_loader = get_data(args)
    season_len = M4Meta.frequency_map[args.seasonal_patterns]  # e.g., 12 for Monthly
    preds = dhr_arima_forecast(x, args.pred_len, season_len)
    evaluate('dhr-arima', args, x, y, test_loader, preds)

DHR-ARIMA
train 48000
test 48000
After all 6 tasks are finished, you can calculate the averaged index.
train 23000
test 23000
After all 6 tasks are finished, you can calculate the averaged index.
train 24000
test 24000
After all 6 tasks are finished, you can calculate the averaged index.
train 359
test 359
After all 6 tasks are finished, you can calculate the averaged index.
train 4227
test 4227
After all 6 tasks are finished, you can calculate the averaged index.
train 414
test 414
smape: {'Yearly': 16.342, 'Quarterly': 11.61, 'Monthly': 15.256, 'Others': 6.793, 'Average': 14.208}
mape: {'Yearly': 17.507, 'Quarterly': 13.182, 'Monthly': 19.014, 'Others': 7.33, 'Average': 16.684}
mase: {'Yearly': 3.974, 'Quarterly': 1.477, 'Monthly': 1.205, 'Others': 3.932, 'Average': 2.044}
owa: {'Yearly': 1.0, 'Quarterly': 1.066, 'Monthly': 1.095, 'Others': 1.335, 'Average': 1.058}
