# ü§ñ V2 Transformer Model Training

Train a **Transformer model** for weather temperature forecasting.

**Why Transformer?**
- Attention mechanism can focus on ANY past day equally
- Parallel processing (faster training)
- State-of-the-art for sequence modeling

**Architecture:**
```
Input: 30 days of features ‚Üí Positional Encoding ‚Üí Transformer Encoder ‚Üí Predict 7 days
```

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
import joblib
import json
import math
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"‚úÖ Using device: {device}")

‚úÖ Using device: cuda


## 1. Load Data (Same as LSTM)

In [2]:
df = pd.read_csv('../../data/processed/weather_cleaned.csv', parse_dates=['date'])
location_stats = pd.read_csv('../models/location_stats.csv')

df = df.merge(
    location_stats[['country', 'hemisphere_encoded', 'climate_zone_encoded', 'abs_latitude', 'latitude_normalized']],
    on='country', how='left'
).dropna()

print(f"üìä Loaded {len(df):,} rows")

üìä Loaded 102,652 rows


In [3]:
FEATURE_COLS = [
    'latitude', 'longitude', 'abs_latitude', 'latitude_normalized',
    'hemisphere_encoded', 'climate_zone_encoded',
    'month', 'day_of_month', 'day_of_week', 'day_of_year', 'quarter', 'is_weekend',
    'month_sin', 'month_cos', 'day_sin', 'day_cos', 'day_of_year_sin', 'day_of_year_cos',
    'temperature_celsius'
]

SEQ_LEN = 30
PRED_LEN = 7

print(f"üìä Features: {len(FEATURE_COLS)}, Seq: {SEQ_LEN} ‚Üí Pred: {PRED_LEN}")

üìä Features: 19, Seq: 30 ‚Üí Pred: 7


## 2. Create Sequences

In [4]:
def create_sequences(df, seq_len=30, pred_len=7):
    sequences, targets = [], []
    for country in tqdm(df['country'].unique(), desc="Creating sequences"):
        country_df = df[df['country'] == country].sort_values('date')
        if len(country_df) < seq_len + pred_len:
            continue
        data = country_df[FEATURE_COLS].values
        temps = country_df['temperature_celsius'].values
        for i in range(len(data) - seq_len - pred_len + 1):
            sequences.append(data[i:i+seq_len])
            targets.append(temps[i+seq_len:i+seq_len+pred_len])
    return np.array(sequences), np.array(targets)

X, y = create_sequences(df, SEQ_LEN, PRED_LEN)
print(f"üìä Sequences: {X.shape}, Targets: {y.shape}")

Creating sequences: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 186/186 [00:00<00:00, 237.41it/s]


üìä Sequences: (95956, 30, 19), Targets: (95956, 7)


## 3. Train/Test Split & Scaling

In [5]:
split_idx = int(len(X) * 0.8)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

scaler = StandardScaler()
scaler.fit(X_train.reshape(-1, X_train.shape[-1]))
X_train_scaled = scaler.transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
X_test_scaled = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)

print(f"üìä Train: {X_train.shape[0]:,}, Test: {X_test.shape[0]:,}")

üìä Train: 76,764, Test: 19,192


In [6]:
class WeatherDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        self.y = torch.FloatTensor(y)
    def __len__(self): return len(self.X)
    def __getitem__(self, idx): return self.X[idx], self.y[idx]

BATCH_SIZE = 256  # Larger batch for Transformer
train_loader = DataLoader(WeatherDataset(X_train_scaled, y_train), batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(WeatherDataset(X_test_scaled, y_test), batch_size=BATCH_SIZE, shuffle=False)
print(f"‚úÖ DataLoaders created")

‚úÖ DataLoaders created


## 4. Transformer Model Definition

In [7]:
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding for Transformer."""
    def __init__(self, d_model, max_len=100):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))
    
    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]


class WeatherTransformer(nn.Module):
    """Transformer model for weather temperature prediction."""
    
    def __init__(self, input_dim, d_model=64, nhead=8, num_layers=4, 
                 dropout=0.2, seq_len=30, pred_len=7):
        super().__init__()
        self.d_model = d_model
        
        # Input projection
        self.input_projection = nn.Linear(input_dim, d_model)
        
        # Positional encoding
        self.pos_encoder = PositionalEncoding(d_model, seq_len)
        
        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=d_model * 4,
            dropout=dropout,
            batch_first=True,
            activation='gelu'
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        
        # Output head
        self.output_head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, d_model // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model // 2, pred_len)
        )
    
    def forward(self, x):
        # x: [batch, seq_len, input_dim]
        x = self.input_projection(x) * math.sqrt(self.d_model)
        x = self.pos_encoder(x)
        x = self.transformer(x)
        x = x[:, -1, :]  # Use last timestep
        return self.output_head(x)

# Initialize model
model = WeatherTransformer(
    input_dim=len(FEATURE_COLS),
    d_model=64,
    nhead=8,
    num_layers=4,
    dropout=0.2,
    seq_len=SEQ_LEN,
    pred_len=PRED_LEN
).to(device)

print(f"üìä Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(model)

üìä Model parameters: 203,655
WeatherTransformer(
  (input_projection): Linear(in_features=19, out_features=64, bias=True)
  (pos_encoder): PositionalEncoding()
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=256, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=256, out_features=64, bias=True)
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (output_head): Sequential(
    (0): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (1): Linear(in_features=64, out_features

## 5. Training with Warmup Scheduler

In [8]:
class WarmupScheduler:
    """Linear warmup then cosine decay."""
    def __init__(self, optimizer, warmup_steps, total_steps):
        self.optimizer = optimizer
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        self.current_step = 0
        self.base_lr = optimizer.param_groups[0]['lr']
    
    def step(self):
        self.current_step += 1
        if self.current_step < self.warmup_steps:
            lr = self.base_lr * (self.current_step / self.warmup_steps)
        else:
            progress = (self.current_step - self.warmup_steps) / (self.total_steps - self.warmup_steps)
            lr = self.base_lr * 0.5 * (1 + math.cos(math.pi * progress))
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
        return lr

In [9]:
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.01)

epochs = 100
warmup_epochs = 10
total_steps = epochs * len(train_loader)
warmup_steps = warmup_epochs * len(train_loader)
scheduler = WarmupScheduler(optimizer, warmup_steps, total_steps)

best_val_loss = float('inf')
patience, patience_counter = 20, 0
history = {'train_loss': [], 'val_loss': [], 'lr': []}

print("üöÄ Starting training with warmup...")
for epoch in range(epochs):
    # Train
    model.train()
    train_losses = []
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        pred = model(X_batch)
        loss = criterion(pred, y_batch)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        lr = scheduler.step()
        train_losses.append(loss.item())
    
    # Validate
    model.eval()
    val_losses = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            val_losses.append(criterion(model(X_batch), y_batch).item())
    
    train_loss, val_loss = np.mean(train_losses), np.mean(val_losses)
    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_loss)
    history['lr'].append(lr)
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        best_state = model.state_dict().copy()
    else:
        patience_counter += 1
    
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1:3d} | Train: {train_loss:.4f} | Val: {val_loss:.4f} | LR: {lr:.6f}")
    
    if patience_counter >= patience:
        print(f"\n‚èπÔ∏è Early stopping at epoch {epoch+1}")
        break

model.load_state_dict(best_state)
print(f"\n‚úÖ Best validation loss: {best_val_loss:.4f}")

üöÄ Starting training with warmup...
Epoch  10 | Train: 19.4336 | Val: 8.6745 | LR: 0.000500
Epoch  20 | Train: 16.0140 | Val: 8.3312 | LR: 0.000485
Epoch  30 | Train: 13.0986 | Val: 8.1870 | LR: 0.000442
Epoch  40 | Train: 11.2121 | Val: 7.8478 | LR: 0.000375
Epoch  50 | Train: 10.0735 | Val: 7.8681 | LR: 0.000293
Epoch  60 | Train: 9.4638 | Val: 7.6342 | LR: 0.000207
Epoch  70 | Train: 9.1078 | Val: 7.7817 | LR: 0.000125
Epoch  80 | Train: 8.9301 | Val: 7.8461 | LR: 0.000058

‚èπÔ∏è Early stopping at epoch 80

‚úÖ Best validation loss: 7.6342


In [10]:
fig = go.Figure()
fig.add_trace(go.Scatter(y=history['train_loss'], name='Train'))
fig.add_trace(go.Scatter(y=history['val_loss'], name='Validation'))
fig.update_layout(title='üìâ Transformer Training History', xaxis_title='Epoch', yaxis_title='MSE Loss',
                  paper_bgcolor='#0f0f1a', plot_bgcolor='#0f0f1a', font_color='white')
fig.show()

## 6. Evaluation

In [11]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

model.eval()
all_preds, all_targets = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        all_preds.append(model(X_batch.to(device)).cpu().numpy())
        all_targets.append(y_batch.numpy())

y_pred = np.concatenate(all_preds)
y_true = np.concatenate(all_targets)

mae = mean_absolute_error(y_true.flatten(), y_pred.flatten())
rmse = np.sqrt(mean_squared_error(y_true.flatten(), y_pred.flatten()))
r2 = r2_score(y_true.flatten(), y_pred.flatten())

print("="*50)
print("üìä Transformer Model - Test Results")
print("="*50)
print(f"MAE:  {mae:.2f}¬∞C")
print(f"RMSE: {rmse:.2f}¬∞C")
print(f"R¬≤:   {r2:.4f}")

üìä Transformer Model - Test Results
MAE:  2.05¬∞C
RMSE: 2.80¬∞C
R¬≤:   0.9045


In [12]:
print("\nüìä MAE by Forecast Day:")
for day in range(PRED_LEN):
    day_mae = mean_absolute_error(y_true[:, day], y_pred[:, day])
    print(f"  Day {day+1}: {day_mae:.2f}¬∞C")


üìä MAE by Forecast Day:
  Day 1: 2.01¬∞C
  Day 2: 2.01¬∞C
  Day 3: 2.00¬∞C
  Day 4: 2.01¬∞C
  Day 5: 2.04¬∞C
  Day 6: 2.09¬∞C
  Day 7: 2.15¬∞C


In [13]:
fig = px.scatter(x=y_true[:, 0], y=y_pred[:, 0], opacity=0.3,
                 labels={'x': 'Actual (¬∞C)', 'y': 'Predicted (¬∞C)'},
                 title=f'üéØ Transformer: Day 1 Actual vs Predicted (MAE={mean_absolute_error(y_true[:, 0], y_pred[:, 0]):.2f}¬∞C)')
fig.add_trace(go.Scatter(x=[-20, 50], y=[-20, 50], mode='lines', name='Perfect', line=dict(color='red', dash='dash')))
fig.update_layout(paper_bgcolor='#0f0f1a', plot_bgcolor='#0f0f1a', font_color='white')
fig.show()

## 7. Compare with LSTM

In [14]:
try:
    with open('../models/lstm_config.json', 'r') as f:
        lstm_config = json.load(f)
    lstm_mae = lstm_config.get('metrics', {}).get('mae', 'N/A')
    
    print("\n" + "="*50)
    print("üìä Model Comparison")
    print("="*50)
    print(f"LSTM MAE:        {lstm_mae}¬∞C")
    print(f"Transformer MAE: {mae:.2f}¬∞C")
    
    if isinstance(lstm_mae, (int, float)) and mae < lstm_mae:
        improvement = ((lstm_mae - mae) / lstm_mae) * 100
        print(f"\nüéâ Transformer is {improvement:.1f}% better!")
    elif isinstance(lstm_mae, (int, float)):
        print(f"\n‚ö†Ô∏è LSTM performs better on this run")
except:
    print("Could not load LSTM config for comparison")


üìä Model Comparison
LSTM MAE:        2.05¬∞C
Transformer MAE: 2.05¬∞C

üéâ Transformer is 0.2% better!


## 8. Save Model (if better than LSTM)

In [15]:
save_model = True  # Set to True to save regardless of comparison

try:
    with open('../models/lstm_config.json', 'r') as f:
        lstm_mae = json.load(f).get('metrics', {}).get('mae', float('inf'))
    if mae < lstm_mae:
        save_model = True
        print(f"‚úÖ Transformer ({mae:.2f}¬∞C) beats LSTM ({lstm_mae}¬∞C)!")
except:
    pass

if save_model:
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'model_type': 'transformer',
        'input_dim': len(FEATURE_COLS),
        'd_model': 64,
        'nhead': 8,
        'num_layers': 4,
        'dropout': 0.2,
        'seq_len': SEQ_LEN,
        'pred_len': PRED_LEN,
        'feature_cols': FEATURE_COLS,
        'metrics': {'mae': round(mae, 2), 'rmse': round(rmse, 2), 'r2': round(r2, 4)}
    }
    torch.save(checkpoint, '../models/transformer_model.pt')
    joblib.dump(scaler, '../models/transformer_scaler.joblib')
    
    config = {
        'version': '2.3-transformer',
        'model_type': 'transformer',
        'feature_cols': FEATURE_COLS,
        **{k: v for k, v in checkpoint.items() if k not in ['model_state_dict', 'feature_cols']}
    }
    with open('../models/transformer_config.json', 'w') as f:
        json.dump(config, f, indent=2)
    
    print("‚úÖ Saved transformer_model.pt")
    print("‚úÖ Saved transformer_scaler.joblib")
    print("‚úÖ Saved transformer_config.json")
else:
    print("‚ö†Ô∏è Model not saved (LSTM is better)")

‚úÖ Transformer (2.05¬∞C) beats LSTM (2.05¬∞C)!
‚úÖ Saved transformer_model.pt
‚úÖ Saved transformer_scaler.joblib
‚úÖ Saved transformer_config.json


## 9. Summary

In [16]:
print("="*60)
print("üéâ Transformer Model Training Complete!")
print("="*60)
print(f"\nüìä Model Performance:")
print(f"   ‚Ä¢ MAE:  {mae:.2f}¬∞C")
print(f"   ‚Ä¢ RMSE: {rmse:.2f}¬∞C")
print(f"   ‚Ä¢ R¬≤:   {r2:.4f}")
print(f"\nüì¶ Artifacts:")
print(f"   ‚Ä¢ v2/models/transformer_model.pt")
print(f"   ‚Ä¢ v2/models/transformer_scaler.joblib")
print(f"   ‚Ä¢ v2/models/transformer_config.json")
print(f"\nüöÄ Next: Update web app to use Transformer if it's better!")

üéâ Transformer Model Training Complete!

üìä Model Performance:
   ‚Ä¢ MAE:  2.05¬∞C
   ‚Ä¢ RMSE: 2.80¬∞C
   ‚Ä¢ R¬≤:   0.9045

üì¶ Artifacts:
   ‚Ä¢ v2/models/transformer_model.pt
   ‚Ä¢ v2/models/transformer_scaler.joblib
   ‚Ä¢ v2/models/transformer_config.json

üöÄ Next: Update web app to use Transformer if it's better!
