# Minimalistic GIFT-Eval Evaluation

Simple windowed evaluation following the TempoPFN approach.

In [1]:
import numpy as np
import jax.numpy as jnp
from pathlib import Path
import sys

sys.path.insert(0, str(Path.cwd().parent))

from src.config import load_config, cfg_to_training_config
from src.tsf import TimeSeriesForecaster
from src.data.time_features import compute_batch_time_features
from src.data.frequency import parse_frequency

## 1. Load Dataset and Extract Metadata

In [2]:
from gluonts.dataset.repository import get_dataset

# Load the benchmark from the GluonTS dataset repository; evaluation uses its test split.
dataset = get_dataset("electricity")
test_data = dataset.test

# Sampling frequency and forecast horizon are read from the dataset metadata
# instead of being hard-coded in the notebook.
freq, prediction_length = dataset.metadata.freq, dataset.metadata.prediction_length

print(f"Frequency: {freq}")
print(f"Prediction length: {prediction_length}")

Frequency: 1H
Prediction length: 24


## 2. Load Model

In [3]:
# Number of history points handed to the model for every window.
context_length = 512

# Build the forecaster from the training configuration file.
cfg = load_config("../conf/training.yaml")
training_config = cfg_to_training_config(cfg)
forecaster = TimeSeriesForecaster(config=training_config)
# forecaster.load("../checkpoints/model.pkl")  # Uncomment when you have a checkpoint

# Quantile levels exposed by the forecaster; reused by the metric cells below.
quantiles = forecaster.quantiles

print(f"Context length: {context_length}")
print(f"Quantiles: {quantiles}")

Context length: 512
Quantiles: [0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]


## 3. Create Windowed Test Dataset

Split each time series into windows: [context | target]

In [4]:
class TestDataset:
    """Slice each time series into rolling evaluation windows taken from its end.

    Every window consists of ``context_length`` history points followed by
    ``prediction_length`` target points. Consecutive windows step back by
    ``prediction_length``, so the *targets* of different windows never overlap,
    while their contexts do overlap whenever
    ``context_length > prediction_length``.
    """

    def __init__(self, data, context_length, prediction_length, max_windows=20):
        """Store the series and the window geometry.

        Args:
            data: Iterable of series entries. Each entry is indexed with
                ``'target'`` and ``'start'`` and queried with
                ``.get('item_id', '')``. It is materialised into a list.
            context_length: Number of history points per window.
            prediction_length: Number of target points per window; also the
                step between consecutive windows.
            max_windows: Upper bound on the number of windows per series.
        """
        self.data = list(data)
        self.context_length = context_length
        self.prediction_length = prediction_length
        self.max_windows = max_windows
        self.window_size = context_length + prediction_length

    def create_windows(self):
        """Build the evaluation windows for every stored series.

        Windows are emitted per series, most recent window first. Series that
        are shorter than one full window contribute nothing.

        Returns:
            list[dict]: One dict per window with the keys
                ``'context'``: array slice of length ``context_length``;
                ``'target'``: array slice of length ``prediction_length``;
                ``'start'``: the series start shifted by the window's offset,
                    i.e. the start of the first context point. This assumes
                    ``ts['start']`` supports integer offsets in units of the
                    series frequency (``pandas.Period``, as GluonTS provides);
                ``'item_id'``: the series id, or ``''`` when absent.
        """
        windows = []

        for ts in self.data:
            target = np.array(ts['target'])
            series_length = len(target)

            # Number of windows that fit when stepping back by prediction_length.
            # The value is <= 0 for series shorter than one window, which makes
            # the range() below empty.
            max_possible = (series_length - self.window_size) // self.prediction_length + 1
            num_windows = min(max_possible, self.max_windows)

            for i in range(num_windows):
                end_idx = series_length - i * self.prediction_length
                start_idx = end_idx - self.window_size  # >= 0 by construction of num_windows
                split_idx = start_idx + self.context_length

                windows.append({
                    'context': target[start_idx:split_idx],
                    'target': target[split_idx:end_idx],
                    # Shift the series start to the first context point so that
                    # time features computed from it line up with this window.
                    'start': ts['start'] + start_idx,
                    'item_id': ts.get('item_id', '')
                })

        return windows

# Build the windowed evaluation set from the test split and materialise its windows.
test_dataset = TestDataset(test_data, context_length, prediction_length, max_windows=20)
windows = test_dataset.create_windows()

print(f"Created {len(windows)} windows")

  return pd.Period(val, freq)


Created 44940 windows


## 4. Run Predictions on All Windows

In [5]:
# Parse frequency once; it is the same for every window.
frequency = parse_frequency(freq)

# Seeded generator so the placeholder forecasts (and every metric computed from
# them) are identical on each Restart & Run All.
rng = np.random.default_rng(0)

predictions = []
targets = []

for window in windows[:5]:  # Start with first 5 for testing
    # Prepare context
    context = jnp.array(window['context'][None, ..., None, None])  # (1, context_len, 1, 1)

    # Compute time features
    # NOTE(review): `start` should be the timestamp of the first context point
    # for the features to line up with this window -- verify what
    # `create_windows` stores under 'start'.
    history_tf, future_tf = compute_batch_time_features(
        start=[np.datetime64(window['start'])],
        history_length=context_length,
        future_length=prediction_length,
        batch_size=1,
        frequency=[frequency],
        K_max=training_config.time_dim,
        include_extra=False,
    )

    # Predict (uncomment when you have a trained model)
    # preds = forecaster.predict(context, history_tf, future_tf)
    # preds = np.array(preds[0, :, 0, :])  # (pred_len, num_quantiles)

    # For now, use (reproducible) standard-normal predictions for testing
    preds = rng.standard_normal((prediction_length, len(quantiles)))

    predictions.append(preds)
    targets.append(window['target'])

print(f"Generated {len(predictions)} predictions")
print(f"Prediction shape: {predictions[0].shape}  # (pred_len, num_quantiles)")

Generated 5 predictions
Prediction shape: (24, 9)  # (pred_len, num_quantiles)


  offset = pd.tseries.frequencies.to_offset(freq_str)


## 5. Compute Metrics

In [6]:
from gluonts.ev.metrics import MeanWeightedSumQuantileLoss, MSE

# Stack predictions and targets
all_preds = np.stack(predictions)  # (num_windows, pred_len, num_quantiles)
all_targets = np.stack(targets)    # (num_windows, pred_len)

print(f"Predictions shape: {all_preds.shape}")
print(f"Targets shape: {all_targets.shape}")

# GluonTS metric definitions (not evaluated yet, see note below).
# The weighted-quantile-loss object is deliberately NOT called `quantile_loss`:
# that name belongs to the manual loss function defined in the next section,
# and sharing it makes the notebook depend on cell execution order.
wql_metric = MeanWeightedSumQuantileLoss(
    quantile_levels=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
)
mse_mean = MSE(forecast_type="mean")
mse_median = MSE(forecast_type=0.5)

# Compute metrics
# Note: GluonTS metrics expect specific input format
# For now, let's compute simple metrics manually

# Locate the median (0.5 quantile) with a tolerance instead of exact float
# equality, since the model's levels may be stored in reduced precision.
quantile_levels = np.asarray(quantiles, dtype=float)
median_idx = int(np.argmin(np.abs(quantile_levels - 0.5)))
if not np.isclose(quantile_levels[median_idx], 0.5):
    raise ValueError("0.5 is not among the forecaster's quantile levels")
median_preds = all_preds[:, :, median_idx]

# Compute MSE for median
mse_value = np.mean((median_preds - all_targets) ** 2)

# Compute mean prediction (average across quantiles; this only approximates
# the mean of the predictive distribution)
mean_preds = np.mean(all_preds, axis=2)
mse_mean_value = np.mean((mean_preds - all_targets) ** 2)

print("\n=== Metrics ===")
print(f"MSE (median): {mse_value:.4f}")
print(f"MSE (mean): {mse_mean_value:.4f}")

Predictions shape: (5, 24, 9)
Targets shape: (5, 24)

=== Metrics ===
MSE (median): 191.6480
MSE (mean): 188.5279


## 6. Compute Quantile Loss

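Manual implementation of the quantile (pinball) loss, computed per quantile level and then averaged over all levels. Note that this is a plain unweighted mean; it is not normalized by the magnitude of the targets.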

In [9]:
def quantile_loss(y_true, y_pred, quantile):
    """Return the mean pinball loss of ``y_pred`` against ``y_true`` at one quantile level."""
    residual = y_true - y_pred
    # Under-prediction (residual > 0) is weighted by `quantile`,
    # over-prediction (residual < 0) by `1 - quantile`.
    under_penalty = quantile * residual
    over_penalty = (quantile - 1) * residual
    return np.maximum(under_penalty, over_penalty).mean()

# Compute loss for each quantile.
# The levels may be stored in reduced precision (e.g. 0.1 held as
# 0.10000000149...), so normalise each one to a clean Python float before using
# it as the dictionary key and as the loss weight. `float()` also accepts plain
# Python floats, unlike `.item()`.
quantile_losses = {}
for i, q in enumerate(quantiles):
    level = round(float(q), 6)
    quantile_losses[level] = quantile_loss(all_targets, all_preds[:, :, i], level)

print("\n=== Quantile Losses ===")
for q, loss in quantile_losses.items():
    print(f"Q{q}: {loss:.4f}")

# Average quantile loss (unweighted mean over the quantile levels)
avg_ql = np.mean(list(quantile_losses.values()))
print(f"\nAverage Quantile Loss: {avg_ql:.4f}")


=== Quantile Losses ===
Q0.10000000149011612: 0.8942
Q0.20000000298023224: 1.8204
Q0.30000001192092896: 2.7448
Q0.4000000059604645: 3.6181
Q0.5: 4.5030
Q0.6000000238418579: 5.4280
Q0.699999988079071: 6.2855
Q0.800000011920929: 7.1625
Q0.8999999761581421: 8.1268

Average Quantile Loss: 4.5092


## Summary

This notebook demonstrates:
1. Loading dataset and extracting metadata (frequency from `dataset.metadata.freq`)
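2. Creating a windowed test dataset (rolling windows taken from the end of each series; the forecast targets of different windows do not overlap, but their contexts do)
3. Running predictions on the windows (currently only the first 5, using random placeholder forecasts until a trained checkpoint is loaded)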
4. Computing metrics: MSE (mean, median) and Quantile Loss

Next: Port this to `eval.py` script.