# ARIMA Weather Forecasting

This notebook trains ARIMA (AutoRegressive Integrated Moving Average) models for weather forecasting.

- **Model Type**: Univariate time series
- **Approach**: Train 3 separate ARIMA models (one per target variable)
- **Target Variables**: `temperature`, `relative_humidity`, `wind_speed_10m (km/h)`
- **Prediction Window**: 72 hours (3 days)
- **Training**: CPU-friendly, completes in ~10-20 minutes

## 1. Setup & Installation

In [None]:
# Install the third-party packages this notebook relies on.
# Every package gets its own quiet `pip install` call, executed with the
# interpreter that is running this cell (sys.executable); a failing install
# raises CalledProcessError and stops the cell.
import subprocess
import sys

packages = [
    'pandas', 'numpy', 'matplotlib',
    'scikit-learn', 'statsmodels',
    'pmdarima',  # For auto_arima
    'joblib', 'tqdm',
]

pip_install = [sys.executable, "-m", "pip", "install", "-q"]

print("Installing packages...")
for package in packages:
    subprocess.check_call(pip_install + [package])
print("✅ All packages installed!")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from pmdarima import auto_arima
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
import json
import warnings
from tqdm import tqdm
import time
from pathlib import Path

# Ignore every warning category from here on so the cell output stays short.
# NOTE(review): this blanket filter also hides any warning emitted while the
# ARIMA models are being fitted further down — confirm that is intended.
warnings.filterwarnings('ignore')

# Simple marker that the import cell ran to completion.
print("✅ Imports successful!")

## 2. Configuration

In [None]:
# Central configuration for the notebook, assembled key by key so that the
# insertion order (and therefore the printed / saved JSON order) is explicit.
config = {}

# Data paths
config['data_path'] = '../data/Weather_Data_1980_2024(hourly).csv'
config['models_dir'] = '../models'

# Target variables to predict
config['target_variables'] = [
    'temperature',
    'relative_humidity',
    'wind_speed_10m (km/h)',
]

# All input features (for context)
config['input_features'] = [
    'temperature',
    'relative_humidity',
    'dew_point',
    'wind_speed_10m (km/h)',
    'pressure_msl (hPa)',
    'cloud_cover (%)',
    'vapour_pressure_deficit (kPa)',
]

# Data split: fractions of the cleaned data; whatever is left is the test set
config['train_split'] = 0.8
config['val_split'] = 0.1

# Forecast settings: 72 hourly steps = 3 days
config['forecast_horizon'] = 72

# Keyword arguments forwarded unchanged to auto_arima
config['auto_arima_params'] = dict(
    max_p=5,
    max_d=2,
    max_q=5,
    seasonal=False,
    stepwise=True,
    suppress_warnings=True,
    error_action='ignore',
    trace=True,
)

# Echo everything except the auto_arima options
printable_config = {
    key: value for key, value in config.items() if key != 'auto_arima_params'
}
print("Configuration:")
print(json.dumps(printable_config, indent=2))

## 3. Load and Prepare Data

In [None]:
# Read the CSV named in config['data_path'] and print a quick overview of it:
# shape, column names, the leading rows and the null count of each input feature.
print("Loading data...")
df = pd.read_csv(config['data_path'])

column_names = df.columns.tolist()
print(f"Dataset shape: {df.shape}")
print(f"Columns: {column_names}")

print("\nFirst few rows:")
print(df.head())

# Check for missing values, restricted to the configured input features
null_counts = df[config['input_features']].isnull().sum()
print("\nMissing values:")
print(null_counts)

In [None]:
# Keep only the configured input features, drop every row that has a missing
# value in any of them, and renumber the index from zero.
feature_columns = config['input_features']
df_clean = df[feature_columns].dropna().reset_index(drop=True)

rows_removed = len(df) - len(df_clean)
print(f"Clean dataset shape: {df_clean.shape}")
print(f"Removed {rows_removed} rows with missing values")

# Positional split boundaries:
#   [0, train_idx) -> train, [train_idx, val_idx) -> validation,
#   [val_idx, total_len) -> test
total_len = len(df_clean)
train_idx = int(config['train_split'] * total_len)
val_idx = train_idx + int(config['val_split'] * total_len)

n_val = val_idx - train_idx
n_test = total_len - val_idx

print("\nData split:")
print(f"  Train: 0 to {train_idx} ({train_idx:,} samples)")
print(f"  Val: {train_idx} to {val_idx} ({n_val:,} samples)")
print(f"  Test: {val_idx} to {total_len} ({n_test:,} samples)")

## 4. Train ARIMA Models

We'll train separate ARIMA models for each target variable.
This will take ~10-20 minutes depending on your CPU.

In [None]:
# Train one univariate ARIMA model per target variable and score it.
#
# For every entry in config['target_variables'] the loop:
#   1. slices the column positionally into train / validation / test using
#      train_idx and val_idx,
#   2. lets auto_arima search for an order (p, d, q) on the training slice,
#   3. fits a statsmodels ARIMA of that order on the training slice and scores
#      a forecast covering the whole validation slice,
#   4. refits the same order on train + validation and scores a forecast
#      covering the whole test slice.
#
# NOTE(review): both scores come from a single forecast that spans the entire
# validation / test slice, not from config['forecast_horizon']-step forecasts —
# confirm this is the evaluation that is wanted.
#
# Outputs:
#   trained_models[target]   -> {'model', 'order', 'scaler'}
#   training_results[target] -> order, AIC/BIC, timings, val/test metrics and
#                               the full test forecast as a list
trained_models = {}
training_results = {}

for target_var in config['target_variables']:
    print("\n" + "="*80)
    print(f"Training ARIMA for: {target_var}")
    print("="*80)

    # Get the time series for this variable
    series = df_clean[target_var].values

    # Split data by position (no shuffling)
    train_series = series[:train_idx]
    val_series = series[train_idx:val_idx]
    test_series = series[val_idx:]

    print(f"Train samples: {len(train_series):,}")
    print(f"Val samples: {len(val_series):,}")
    print(f"Test samples: {len(test_series):,}")

    # Auto ARIMA to find best parameters
    print("\n🔍 Running auto_arima to find optimal (p,d,q)...")
    start_time = time.time()

    auto_model = auto_arima(
        train_series,
        **config['auto_arima_params']
    )

    auto_time = time.time() - start_time
    print(f"\n✅ Best model found in {auto_time:.1f}s")
    print(f"Order: {auto_model.order}")
    print(f"AIC: {auto_model.aic():.2f}")
    print(f"BIC: {auto_model.bic():.2f}")

    # Train final ARIMA model with best parameters (training slice only)
    print("\n🎯 Training final ARIMA model...")
    start_time = time.time()

    arima_model = ARIMA(train_series, order=auto_model.order)
    fitted_model = arima_model.fit()

    train_time = time.time() - start_time
    print(f"✅ Model trained in {train_time:.1f}s")

    # Validation forecast: the train-only model predicts the whole validation
    # slice, starting right after the last training sample.
    print("\n📊 Validating on validation set...")
    val_forecast = fitted_model.forecast(steps=len(val_series))
    val_mse = mean_squared_error(val_series, val_forecast)
    val_mae = mean_absolute_error(val_series, val_forecast)
    val_rmse = np.sqrt(val_mse)

    print(f"Validation MSE: {val_mse:.6f}")
    print(f"Validation MAE: {val_mae:.6f}")
    print(f"Validation RMSE: {val_rmse:.6f}")

    # Test forecast: refit the same order on train + validation so the
    # forecast starts right after the last validation sample.
    print("\n📈 Testing on test set...")
    train_val_series = series[:val_idx]
    test_model = ARIMA(train_val_series, order=auto_model.order).fit()
    test_forecast = test_model.forecast(steps=len(test_series))

    test_mse = mean_squared_error(test_series, test_forecast)
    test_mae = mean_absolute_error(test_series, test_forecast)
    test_rmse = np.sqrt(test_mse)

    print(f"Test MSE: {test_mse:.6f}")
    print(f"Test MAE: {test_mae:.6f}")
    print(f"Test RMSE: {test_rmse:.6f}")

    # Store model and results.
    # The stored model is the train + validation refit: it is the one that
    # produced test_forecast and the test metrics recorded below. Storing the
    # train-only fit instead would leave the stored model out of step with
    # those reported numbers.
    trained_models[target_var] = {
        'model': test_model,
        'order': auto_model.order,
        'scaler': None  # ARIMA works on raw data
    }

    training_results[target_var] = {
        'order': auto_model.order,
        'aic': float(auto_model.aic()),
        'bic': float(auto_model.bic()),
        'auto_arima_time': auto_time,
        'training_time': train_time,  # duration of the train-only fit
        'val_mse': float(val_mse),
        'val_mae': float(val_mae),
        'val_rmse': float(val_rmse),
        'test_mse': float(test_mse),
        'test_mae': float(test_mae),
        'test_rmse': float(test_rmse),
        'test_forecast': test_forecast.tolist()
    }

print("\n" + "="*80)
print("✅ All ARIMA models trained successfully!")
print("="*80)

## 5. Visualize Training Results

In [None]:
# Print a fixed-width table with one row per target variable showing the
# selected order, the AIC and the validation / test MSE.
rule = "-" * 100
header = f"{'Variable':<30} {'Order':<15} {'AIC':<12} {'Val MSE':<12} {'Test MSE':<12}"

print("\n📊 ARIMA Models Summary:")
print(rule)
print(header)
print(rule)

for var in config['target_variables']:
    results = training_results[var]
    order_text = str(results['order'])
    row = (
        f"{var:<30} {order_text:<15} {results['aic']:<12.2f} "
        f"{results['val_mse']:<12.6f} {results['test_mse']:<12.6f}"
    )
    print(row)

print(rule)

In [None]:
# Plot the opening stretch of the test segment against the ARIMA forecast,
# one panel per target variable, then save the figure into the models folder.
max_plot_hours = 500  # upper bound on the number of test points drawn per panel

fig, axes = plt.subplots(len(config['target_variables']), 1,
                         figsize=(15, 5*len(config['target_variables'])))

# With a single target plt.subplots returns a bare Axes, not an array
if len(config['target_variables']) == 1:
    axes = [axes]

for idx, target_var in enumerate(config['target_variables']):
    ax = axes[idx]

    # Get data: actual test values and the forecast stored during training
    series = df_clean[target_var].values
    test_series = series[val_idx:]
    test_forecast = np.array(training_results[target_var]['test_forecast'])

    # Plot at most max_plot_hours points; fewer if the test set is shorter
    plot_len = min(max_plot_hours, len(test_series))
    time_steps = np.arange(plot_len)

    ax.plot(time_steps, test_series[:plot_len], 'b-', label='Actual', alpha=0.7, linewidth=2)
    ax.plot(time_steps, test_forecast[:plot_len], 'r--', label='ARIMA Forecast', linewidth=2)

    ax.set_xlabel('Hours')
    ax.set_ylabel('Value')
    # Report the number of points actually drawn rather than a fixed 500
    ax.set_title(f'{target_var} - ARIMA Forecast vs Actual (First {plot_len} hours of test set)')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()

# The output folder may not exist yet when this cell runs on a fresh checkout
# (it is otherwise only created by the model-saving cell), so create it here.
predictions_plot_dir = Path(config['models_dir'])
predictions_plot_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(predictions_plot_dir / 'arima_predictions.png', dpi=150, bbox_inches='tight')
print("📊 Saved predictions plot to models/arima_predictions.png")
plt.show()

In [None]:
# Plot a forecast-horizon example: the last week of data before the test set,
# followed by the actual values and the ARIMA forecast for the first
# config['forecast_horizon'] hours of the test set (which starts at val_idx).
fig, axes = plt.subplots(len(config['target_variables']), 1,
                         figsize=(15, 5*len(config['target_variables'])))

# With a single target plt.subplots returns a bare Axes, not an array
if len(config['target_variables']) == 1:
    axes = [axes]

history_len = 168  # Show 7 days of history
horizon = config['forecast_horizon']

for idx, target_var in enumerate(config['target_variables']):
    ax = axes[idx]

    # Get data. The history start is clamped at 0 so a short dataset cannot
    # turn it into a negative (wrap-around) slice index.
    series = df_clean[target_var].values
    history = series[max(0, val_idx - history_len):val_idx]
    actual = series[val_idx:val_idx + horizon]
    forecast = np.array(training_results[target_var]['test_forecast'][:horizon])

    # Both curves share one x-axis, so trim them to a common length in case
    # the test set is shorter than the requested horizon.
    n_points = min(len(actual), len(forecast))

    # Time arrays: history occupies negative hours, the forecast starts at 0
    hist_time = np.arange(-len(history), 0)
    pred_time = np.arange(0, n_points)

    # Plot
    ax.plot(hist_time, history, 'b-', label='Historical', alpha=0.7, linewidth=2)
    ax.plot(pred_time, actual[:n_points], 'g-', label='Actual', linewidth=2)
    ax.plot(pred_time, forecast[:n_points], 'r--', label='ARIMA Forecast', linewidth=2)

    ax.axvline(x=0, color='k', linestyle='--', alpha=0.3, label='Forecast Start')
    ax.set_xlabel('Hours')
    ax.set_ylabel('Value')
    # Title follows the configured horizon instead of a hard-coded 72
    ax.set_title(f'{target_var} - {horizon}-Hour ARIMA Forecast')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()

# The output folder may not exist yet when this cell runs on a fresh checkout
# (it is otherwise only created by the model-saving cell), so create it here.
forecast_plot_dir = Path(config['models_dir'])
forecast_plot_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(forecast_plot_dir / 'arima_72hour_forecast.png', dpi=150, bbox_inches='tight')
print("📊 Saved 72-hour forecast plot to models/arima_72hour_forecast.png")
plt.show()

## 6. Save Models

In [None]:
# Persist everything produced by the notebook into config['models_dir']:
# one pickled model per target variable, the metrics as JSON and the
# configuration as JSON.
models_dir = Path(config['models_dir'])
models_dir.mkdir(exist_ok=True)

print("💾 Saving ARIMA models...")

# Column names contain spaces, brackets and slashes; map them to
# filename-friendly text (applied in this order).
filename_substitutions = ((' ', '_'), ('(', ''), (')', ''), ('/', '_'))

for target_var in config['target_variables']:
    safe_name = target_var
    for old_text, new_text in filename_substitutions:
        safe_name = safe_name.replace(old_text, new_text)

    model_filename = f"arima_{safe_name}.pkl"
    model_path = models_dir / model_filename

    joblib.dump(trained_models[target_var]['model'], model_path)
    print(f"  ✓ Saved {model_filename}")

# Save training results, leaving out the long 'test_forecast' lists to keep
# the file small
results_path = models_dir / 'arima_results.json'
with open(results_path, 'w') as f:
    results_to_save = {
        var: {
            metric: value
            for metric, value in results.items()
            if metric != 'test_forecast'
        }
        for var, results in training_results.items()
    }
    json.dump(results_to_save, f, indent=2)
print("  ✓ Saved arima_results.json")

# Save configuration
config_path = models_dir / 'arima_config.json'
with open(config_path, 'w') as f:
    json.dump(config, f, indent=2)
print("  ✓ Saved arima_config.json")

print("\n✅ All ARIMA models saved successfully!")
print(f"📁 Location: {models_dir.absolute()}")

## 7. Summary & Next Steps

In [None]:
# Closing report: per-variable test metrics, the files written to models_dir
# and pointers to the follow-up steps.
banner = "=" * 80

print("\n" + banner)
print("🎉 ARIMA Training Complete!")
print(banner)

print("\n📊 Performance Summary:")
for var in config['target_variables']:
    results = training_results[var]
    # Reported time = auto_arima order search + the final fit on the training slice
    total_time = results['auto_arima_time'] + results['training_time']
    print(f"\n{var}:")
    print(f"  Order (p,d,q): {results['order']}")
    print(f"  Test MSE: {results['test_mse']:.6f}")
    print(f"  Test MAE: {results['test_mae']:.6f}")
    print(f"  Test RMSE: {results['test_rmse']:.6f}")
    print(f"  Training time: {total_time:.1f}s")

# One model file is written per target variable, so derive the count from the
# configuration instead of hard-coding it.
n_model_files = len(config['target_variables'])

print("\n📁 Saved Files:")
print(f"  - ARIMA models ({n_model_files} files): {models_dir}/arima_*.pkl")
print(f"  - Results: {models_dir}/arima_results.json")
print(f"  - Configuration: {models_dir}/arima_config.json")
print(f"  - Plots: {models_dir}/arima_*.png")

print("\n🔜 Next Steps:")
print("  1. Train VARIMA model: Run train_varima.py")
print("  2. Train LSTM model: Run train_lstm.py (on Colab recommended)")
print("  3. Compare all models: Run model_comparison.py")

print("\n" + banner)