# Time Series Forecasting - Getting Started

This notebook demonstrates how to use the time series forecasting models for the Hull Tactical Market Prediction competition.

## 1. Setup and Imports

In [None]:
import os
import sys
sys.path.append('..')

import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from utils import TimeSeriesPreprocessor, split_time_series
from models import (
    XGBoostTimeSeriesModel,
    LightGBMTimeSeriesModel,
    CatBoostTimeSeriesModel,
    ProphetTimeSeriesModel,
    ChronosTimeSeriesModel
)

# Notebook-wide plotting defaults: seaborn's whitegrid theme and a wide
# default figure size for every figure that does not pass its own figsize
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (15, 6)})

print("Setup complete!")

## 2. Load and Explore Data

In [None]:
# Read the training CSV, convert the 'date' column to datetimes, and order
# the rows chronologically with a fresh 0..n-1 index
train_df = (
    pd.read_csv('../data/train.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
    .reset_index(drop=True)
)

# Quick summary: frame dimensions and the covered date span
print(
    f"Data shape: {train_df.shape}",
    f"\nDate range: {train_df['date'].min()} to {train_df['date'].max()}",
    "\nFirst few rows:",
    sep="\n",
)
train_df.head()

In [None]:
# Line plot of the target column against date, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(train_df['date'], train_df['target'])
ax.set_title('Time Series Plot', fontsize=16)
ax.set_xlabel('Date')
ax.set_ylabel('Target Value')
ax.grid(True, alpha=0.3)
plt.show()

## 3. Feature Engineering

In [None]:
# Preprocessor configured with the 'standard' scaler type
preprocessor = TimeSeriesPreprocessor(scaler_type='standard')

# Arguments forwarded to create_all_features: the target column plus the
# lag offsets and window sizes to use
# NOTE(review): presumably these drive lag / rolling-window features —
# confirm against TimeSeriesPreprocessor in utils
feature_config = dict(
    target_col='target',
    lags=[1, 2, 3, 5, 7, 14, 21, 30],
    windows=[7, 14, 30, 60],
)
df_features = preprocessor.create_all_features(train_df, **feature_config)

# Report the resulting frame size and list every column it contains
print(f"Features shape: {df_features.shape}")
print("\nFeature columns:", df_features.columns.tolist(), sep="\n")

## 4. Train-Test Split

In [None]:
# Split the engineered frame into train and validation parts (test_size=0.2)
train_data, val_data = split_time_series(df_features, test_size=0.2)

print(
    f"Train set size: {len(train_data)}",
    f"Validation set size: {len(val_data)}",
    sep="\n",
)

# Every column except the date and the target is used as a model input
non_feature_cols = {'date', 'target'}
feature_cols = [name for name in df_features.columns if name not in non_feature_cols]

# Scale the inputs: the preprocessor is fitted on the training rows only and
# then applied unchanged to the validation rows
X_train = preprocessor.fit_transform(train_data[feature_cols].values)
X_val = preprocessor.transform(val_data[feature_cols].values)

# Targets are taken as plain arrays and left unscaled
y_train = train_data['target'].values
y_val = val_data['target'].values

print(f"\nX_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")

## 5. Train Models

### 5.1 XGBoost

In [None]:
# Fit the XGBoost wrapper on the training arrays; the validation arrays are
# passed alongside, and the returned dict supplies the metrics printed below
xgb_model = XGBoostTimeSeriesModel()
xgb_metrics = xgb_model.train(X_train, y_train, X_val, y_val)

print(
    "\nXGBoost Results:",
    f"  Train RMSE: {xgb_metrics['train_rmse']:.6f}",
    f"  Val RMSE: {xgb_metrics['val_rmse']:.6f}",
    f"  Val MAE: {xgb_metrics['val_mae']:.6f}",
    sep="\n",
)

### 5.2 LightGBM

In [None]:
# Fit the LightGBM wrapper on the training arrays; the validation arrays are
# passed alongside, and the returned dict supplies the metrics printed below
lgb_model = LightGBMTimeSeriesModel()
lgb_metrics = lgb_model.train(X_train, y_train, X_val, y_val)

print(
    "\nLightGBM Results:",
    f"  Train RMSE: {lgb_metrics['train_rmse']:.6f}",
    f"  Val RMSE: {lgb_metrics['val_rmse']:.6f}",
    f"  Val MAE: {lgb_metrics['val_mae']:.6f}",
    sep="\n",
)

### 5.3 CatBoost

In [None]:
# Fit the CatBoost wrapper on the training arrays; the validation arrays are
# passed alongside, and the returned dict supplies the metrics printed below
cat_model = CatBoostTimeSeriesModel()
cat_metrics = cat_model.train(X_train, y_train, X_val, y_val)

print(
    "\nCatBoost Results:",
    f"  Train RMSE: {cat_metrics['train_rmse']:.6f}",
    f"  Val RMSE: {cat_metrics['val_rmse']:.6f}",
    f"  Val MAE: {cat_metrics['val_mae']:.6f}",
    sep="\n",
)

### 5.4 Prophet

In [None]:
# Prophet works on its own frame layout: prepare_data converts each split,
# and the 'ds' / 'y' columns of the result are read below
prophet_model = ProphetTimeSeriesModel()
prophet_train = prophet_model.prepare_data(train_data, 'date', 'target')
prophet_val = prophet_model.prepare_data(val_data, 'date', 'target')

# Fit on the training split only
prophet_metrics = prophet_model.train(prophet_train, verbose=False)

# Forecast the validation dates and score against the held-out targets.
# The residual vector is computed once and reused for both metrics.
val_forecast = prophet_model.predict(prophet_val[['ds']])
prophet_residuals = val_forecast['yhat'].values - prophet_val['y'].values
val_rmse = np.sqrt(np.mean(prophet_residuals**2))
val_mae = np.mean(np.abs(prophet_residuals))

print(
    "\nProphet Results:",
    f"  Train RMSE: {prophet_metrics['train_rmse']:.6f}",
    f"  Val RMSE: {val_rmse:.6f}",
    f"  Val MAE: {val_mae:.6f}",
    sep="\n",
)

## 6. Model Comparison

In [None]:
# Validation metrics per model, in display order. Prophet's metrics were
# computed by hand in its own cell, so they are wrapped in a dict with the
# same keys the other models' metric dicts expose.
val_scores = {
    'XGBoost': xgb_metrics,
    'LightGBM': lgb_metrics,
    'CatBoost': cat_metrics,
    'Prophet': {'val_rmse': val_rmse, 'val_mae': val_mae},
}

# One row per model, ordered from lowest to highest validation RMSE
comparison_df = pd.DataFrame({
    'Model': list(val_scores),
    'Val RMSE': [scores['val_rmse'] for scores in val_scores.values()],
    'Val MAE': [scores['val_mae'] for scores in val_scores.values()],
}).sort_values('Val RMSE')

print("\nModel Comparison:", comparison_df.to_string(index=False), sep="\n")

In [None]:
# Side-by-side bar charts of the two validation metrics, one panel each
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# (column to plot, panel title, y-axis label, extra bar styling)
panels = [
    ('Val RMSE', 'Validation RMSE Comparison', 'RMSE', {}),
    ('Val MAE', 'Validation MAE Comparison', 'MAE', {'color': 'orange'}),
]
for ax, (metric_col, panel_title, y_label, bar_style) in zip(axes, panels):
    comparison_df.plot(x='Model', y=metric_col, kind='bar', ax=ax, legend=False, **bar_style)
    ax.set_title(panel_title, fontsize=14)
    ax.set_ylabel(y_label)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Feature Importance (XGBoost)

In [None]:
# Ask the XGBoost wrapper for its 15 most important features
importance_df = xgb_model.get_feature_importance(top_n=15)

# Translate each 'feature' entry into a readable column name.
# NOTE(review): assumes 'feature' holds integer positions into feature_cols
# (the model was trained on a bare array) — confirm against
# get_feature_importance
importance_df['feature_name'] = list(
    map(feature_cols.__getitem__, importance_df['feature'])
)

# Horizontal bars, one per feature, in the order the frame lists them
bar_positions = range(len(importance_df))
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(bar_positions, importance_df['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(importance_df['feature_name'])
ax.set_xlabel('Importance')
ax.set_title('Top 15 Feature Importance (XGBoost)', fontsize=14)
ax.invert_yaxis()  # first row of the frame ends up at the top
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

## 8. Predictions Visualization

In [None]:
# Validation-set predictions from each gradient-boosting model
xgb_pred = xgb_model.predict(X_val)
lgb_pred = lgb_model.predict(X_val)
cat_pred = cat_model.predict(X_val)

# Equal-weight ensemble: plain average of the three prediction vectors
ensemble_pred = (xgb_pred + lgb_pred + cat_pred) / 3

# Each entry is (values to draw, line styling); drawn in this order so the
# colour cycle assigns colours in the same sequence
val_dates = val_data['date'].values
plot_specs = [
    (y_val, dict(label='Actual', linewidth=2, alpha=0.8)),
    (xgb_pred, dict(label='XGBoost', alpha=0.6)),
    (lgb_pred, dict(label='LightGBM', alpha=0.6)),
    (cat_pred, dict(label='CatBoost', alpha=0.6)),
    (ensemble_pred, dict(label='Ensemble', linewidth=2, linestyle='--')),
]

plt.figure(figsize=(15, 6))
for series, line_style in plot_specs:
    plt.plot(val_dates, series, **line_style)
plt.title('Model Predictions vs Actual', fontsize=16)
plt.xlabel('Date')
plt.ylabel('Target Value')
plt.legend(loc='best')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Root-mean-squared error of the averaged prediction on the validation set
ensemble_rmse = np.sqrt(np.mean((ensemble_pred - y_val)**2))
print(f"\nEnsemble RMSE: {ensemble_rmse:.6f}")

## 9. Save Models

In [None]:
# `os` and `joblib` are imported in the notebook's top import cell so that
# all dependencies are visible in one place.

# Single location for every serialized artifact; created on first run and
# left alone if it already exists
MODEL_DIR = '../trained_models'
os.makedirs(MODEL_DIR, exist_ok=True)

# Each model wrapper writes itself through its own save_model method; the
# file extension differs per model wrapper
xgb_model.save_model(f'{MODEL_DIR}/xgboost_model.json')
lgb_model.save_model(f'{MODEL_DIR}/lightgbm_model.txt')
cat_model.save_model(f'{MODEL_DIR}/catboost_model.cbm')
prophet_model.save_model(f'{MODEL_DIR}/prophet_model.pkl')

# Persist the fitted preprocessor as well, so the same scaling can be
# re-applied to new data when the models are loaded later
joblib.dump(preprocessor, f'{MODEL_DIR}/preprocessor.pkl')

print("All models saved successfully!")

## 10. Next Steps

1. **Hyperparameter Tuning**: Use the `optimize_hyperparameters()` method to find better parameters
2. **Chronos Model**: Try the Chronos-2 foundation model for zero-shot forecasting
3. **Ensemble Optimization**: Optimize ensemble weights using validation data
4. **Feature Engineering**: Add domain-specific features (holidays, events, etc.)
5. **Cross-validation**: Implement time series cross-validation for robust evaluation

Good luck with the competition! 🚀