# üß™ V3 Multivariate Transformer Training

Train a **Multivariate Transformer model** that uses Temperature, Humidity, Pressure, Wind, and Cloud Cover.

**Goal:**
- Break the 2.05¬∞C MAE plateau by learning physical interactions between weather variables.

**New Features:**
- `humidity`
- `pressure_mb`
- `wind_kph`
- `cloud`
- `precip_mm`
- `uv_index`

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
import joblib
import json
import math
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"‚úÖ Using device: {device}")

‚úÖ Using device: cuda


## 1. Load Data & Define Features

In [2]:
df = pd.read_csv('../../data/processed/weather_cleaned.csv', parse_dates=['date'])
location_stats = pd.read_csv('../models/location_stats.csv')

df = df.merge(
    location_stats[['country', 'hemisphere_encoded', 'climate_zone_encoded', 'abs_latitude', 'latitude_normalized']],
    on='country', how='left'
).dropna()

# Define Combined Feature Set (Static + Dynamic + Multivariate)
FEATURE_COLS = [
    # Static / Semi-static
    'latitude', 'longitude', 'abs_latitude', 'latitude_normalized',
    'hemisphere_encoded', 'climate_zone_encoded',
    # Time
    'month', 'day_of_month', 'day_of_week', 'day_of_year', 'quarter', 'is_weekend',
    'month_sin', 'month_cos', 'day_sin', 'day_cos', 'day_of_year_sin', 'day_of_year_cos',
    # Dynamic Weather Drivers (The New Stuff)
    'temperature_celsius', 'humidity', 'pressure_mb', 'wind_kph', 'precip_mm', 'cloud', 'uv_index'
]

SEQ_LEN = 30
PRED_LEN = 7

print(f"üìä Features: {len(FEATURE_COLS)}, Seq: {SEQ_LEN} ‚Üí Pred: {PRED_LEN}")

üìä Features: 25, Seq: 30 ‚Üí Pred: 7


## 2. Create Sequences (Multivariate)

In [3]:
def create_sequences(df, seq_len=30, pred_len=7):
    sequences, targets = [], []
    for country in tqdm(df['country'].unique(), desc="Creating sequences"):
        country_df = df[df['country'] == country].sort_values('date')
        if len(country_df) < seq_len + pred_len:
            continue
        
        # Select ALL feature columns
        data = country_df[FEATURE_COLS].values
        temps = country_df['temperature_celsius'].values
        
        for i in range(len(data) - seq_len - pred_len + 1):
            sequences.append(data[i:i+seq_len])
            targets.append(temps[i+seq_len:i+seq_len+pred_len])
            
    return np.array(sequences), np.array(targets)

X, y = create_sequences(df, SEQ_LEN, PRED_LEN)
print(f"üìä Sequences: {X.shape}, Targets: {y.shape}")

Creating sequences: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 186/186 [00:00<00:00, 244.68it/s]


üìä Sequences: (95956, 30, 25), Targets: (95956, 7)


## 3. Train/Test Split & Scaling

In [4]:
split_idx = int(len(X) * 0.8)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

scaler = StandardScaler()
# Reshape to 2D for scaling: (Samples * SeqLen, Features)
scaler.fit(X_train.reshape(-1, X_train.shape[-1]))

X_train_scaled = scaler.transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
X_test_scaled = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)

print(f"üìä Train: {X_train.shape[0]:,}, Test: {X_test.shape[0]:,}")

üìä Train: 76,764, Test: 19,192


In [5]:
class WeatherDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        self.y = torch.FloatTensor(y)
    def __len__(self): return len(self.X)
    def __getitem__(self, idx): return self.X[idx], self.y[idx]

BATCH_SIZE = 256
train_loader = DataLoader(WeatherDataset(X_train_scaled, y_train), batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(WeatherDataset(X_test_scaled, y_test), batch_size=BATCH_SIZE, shuffle=False)

## 4. Model Definition (Previous V2.3 Architecture)

In [6]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=100):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))
    
    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

class WeatherTransformer(nn.Module):
    def __init__(self, input_dim, d_model=64, nhead=8, num_layers=4, dropout=0.2, seq_len=30, pred_len=7):
        super().__init__()
        self.d_model = d_model
        self.input_projection = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model, seq_len)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=d_model*4, dropout=dropout, batch_first=True, activation='gelu')
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.output_head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, d_model // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model // 2, pred_len)
        )
    
    def forward(self, x):
        x = self.input_projection(x) * math.sqrt(self.d_model)
        x = self.pos_encoder(x)
        x = self.transformer(x)
        return self.output_head(x[:, -1, :])

model = WeatherTransformer(
    input_dim=len(FEATURE_COLS),
    d_model=64,
    nhead=8,
    num_layers=4,
    dropout=0.2
).to(device)
print(f"üìä Model parameters: {sum(p.numel() for p in model.parameters()):,}")

üìä Model parameters: 204,039


## 5. Training Loop

In [7]:
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.01)

best_val_loss = float('inf')
best_state = None
patience = 15
patience_counter = 0

print("üöÄ Starting training...")
for epoch in range(100):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        pred = model(X_batch)
        loss = criterion(pred, y_batch)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        train_loss += loss.item()
    
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            val_loss += criterion(model(X_batch), y_batch).item()
    
    train_loss /= len(train_loader)
    val_loss /= len(test_loader)
    
    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1:3d} | Train: {train_loss:.4f} | Val: {val_loss:.4f}")
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_state = model.state_dict().copy()
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"\n‚èπÔ∏è Early stopping at epoch {epoch+1}")
            break

model.load_state_dict(best_state)
print(f"\n‚úÖ Best validation loss: {best_val_loss:.4f}")

üöÄ Starting training...
Epoch   5 | Train: 15.4704 | Val: 8.5593
Epoch  10 | Train: 14.3770 | Val: 8.4413
Epoch  15 | Train: 13.1428 | Val: 8.3926
Epoch  20 | Train: 12.0722 | Val: 7.9842
Epoch  25 | Train: 11.2357 | Val: 8.0473
Epoch  30 | Train: 10.6299 | Val: 8.0825
Epoch  35 | Train: 9.9828 | Val: 8.0768

‚èπÔ∏è Early stopping at epoch 38

‚úÖ Best validation loss: 7.9469


## 6. Evaluation

In [8]:
model.eval()
all_preds, all_targets = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        preds = model(X_batch).cpu().numpy()
        all_preds.extend(preds)
        all_targets.extend(y_batch.numpy())

all_preds = np.array(all_preds)
all_targets = np.array(all_targets)

mae = np.mean(np.abs(all_preds - all_targets))
rmse = np.sqrt(np.mean((all_preds - all_targets) ** 2))

print(f"üìä Test Results:")
print(f"   MAE:  {mae:.2f}¬∞C")
print(f"   RMSE: {rmse:.2f}¬∞C")

# Per-day MAE
print(f"\nüìä MAE per forecast day:")
for i in range(PRED_LEN):
    day_mae = np.mean(np.abs(all_preds[:, i] - all_targets[:, i]))
    print(f"   Day {i+1}: {day_mae:.2f}¬∞C")

üìä Test Results:
   MAE:  2.07¬∞C
   RMSE: 2.84¬∞C

üìä MAE per forecast day:
   Day 1: 2.03¬∞C
   Day 2: 2.03¬∞C
   Day 3: 2.04¬∞C
   Day 4: 2.05¬∞C
   Day 5: 2.07¬∞C
   Day 6: 2.12¬∞C
   Day 7: 2.17¬∞C


## 7. Save Model Artifacts

In [9]:
# Save to v2/models/
save_path = '../models/multivariate_transformer.pt'
checkpoint = {
    'model_state_dict': model.state_dict(),
    'input_dim': len(FEATURE_COLS),
    'd_model': 64,
    'nhead': 8,
    'num_layers': 4,
    'dropout': 0.2,
    'seq_len': SEQ_LEN,
    'pred_len': PRED_LEN,
    'feature_cols': FEATURE_COLS,
    'mae': mae
}
torch.save(checkpoint, save_path)
joblib.dump(scaler, '../models/multivariate_scaler.joblib')
print(f"‚úÖ Saved multivariate model to {save_path}")
print(f"   MAE: {mae:.2f}¬∞C")

‚úÖ Saved multivariate model to ../models/multivariate_transformer.pt
   MAE: 2.07¬∞C
