# üß† V2 LSTM Model Training

Train a **sequence-aware LSTM model** for better temperature forecasting.

**Why LSTM?**
- MLP treats each day independently
- LSTM understands temporal patterns and sequences
- Better at learning trends and patterns over time

**Architecture:**
```
Input: 30 days of features ‚Üí LSTM ‚Üí Predict next 7 days
```

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
import joblib
import json
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"‚úÖ Using device: {device}")

‚úÖ Using device: cuda


## 1. Load Data

In [2]:
# Load processed weather data
df = pd.read_csv('../../data/processed/weather_cleaned.csv', parse_dates=['date'])
print(f"üìä Loaded {len(df):,} rows")

# Load V2 location stats
location_stats = pd.read_csv('../models/location_stats.csv')
print(f"üåç Location stats: {len(location_stats)} countries")

# Merge climate zone info
df = df.merge(
    location_stats[['country', 'hemisphere_encoded', 'climate_zone_encoded', 'abs_latitude', 'latitude_normalized']],
    on='country', how='left'
).dropna()
print(f"üìä After merge: {len(df):,} rows")

üìä Loaded 102,652 rows
üåç Location stats: 186 countries
üìä After merge: 102,652 rows


## 2. Feature Definition

In [3]:
# Features for LSTM (per timestep)
FEATURE_COLS = [
    # Geographic (static)
    'latitude', 'longitude', 'abs_latitude', 'latitude_normalized',
    'hemisphere_encoded', 'climate_zone_encoded',
    
    # Temporal
    'month', 'day_of_month', 'day_of_week', 'day_of_year', 'quarter', 'is_weekend',
    
    # Cyclical
    'month_sin', 'month_cos', 'day_sin', 'day_cos', 'day_of_year_sin', 'day_of_year_cos',
    
    # Temperature (key feature)
    'temperature_celsius'
]

# Sequence parameters
SEQ_LEN = 30  # Input: 30 days
PRED_LEN = 7  # Output: predict 7 days

print(f"üìä Features per timestep: {len(FEATURE_COLS)}")
print(f"üìä Sequence length: {SEQ_LEN} ‚Üí Predict: {PRED_LEN}")

üìä Features per timestep: 19
üìä Sequence length: 30 ‚Üí Predict: 7


## 3. Create Sequences

In [4]:
def create_sequences(df, seq_len=30, pred_len=7):
    """Create input sequences and targets for each country."""
    sequences = []
    targets = []
    
    for country in tqdm(df['country'].unique(), desc="Creating sequences"):
        country_df = df[df['country'] == country].sort_values('date')
        
        if len(country_df) < seq_len + pred_len:
            continue
        
        data = country_df[FEATURE_COLS].values
        temps = country_df['temperature_celsius'].values
        
        for i in range(len(data) - seq_len - pred_len + 1):
            seq = data[i:i+seq_len]
            target = temps[i+seq_len:i+seq_len+pred_len]
            sequences.append(seq)
            targets.append(target)
    
    return np.array(sequences), np.array(targets)

print("Creating sequences...")
X, y = create_sequences(df, SEQ_LEN, PRED_LEN)
print(f"üìä Sequences: {X.shape}")
print(f"üìä Targets: {y.shape}")

Creating sequences...


Creating sequences: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 186/186 [00:00<00:00, 260.51it/s]


üìä Sequences: (95956, 30, 19)
üìä Targets: (95956, 7)


## 4. Train/Test Split & Scaling

In [5]:
# 80/20 split
split_idx = int(len(X) * 0.8)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

print(f"üìä Train: {X_train.shape[0]:,} sequences")
print(f"üìä Test: {X_test.shape[0]:,} sequences")

üìä Train: 76,764 sequences
üìä Test: 19,192 sequences


In [6]:
# Scale features (fit on train only)
scaler = StandardScaler()
X_train_flat = X_train.reshape(-1, X_train.shape[-1])
scaler.fit(X_train_flat)

# Transform
X_train_scaled = scaler.transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
X_test_scaled = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)

print("‚úÖ Features scaled")

‚úÖ Features scaled


In [7]:
# PyTorch Dataset
class WeatherDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        self.y = torch.FloatTensor(y)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = WeatherDataset(X_train_scaled, y_train)
test_ds = WeatherDataset(X_test_scaled, y_test)

train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=128, shuffle=False)

print(f"‚úÖ DataLoaders created")

‚úÖ DataLoaders created


## 5. LSTM Model Definition

In [8]:
class WeatherLSTM(nn.Module):
    """LSTM model for weather temperature prediction."""
    
    def __init__(self, input_dim, hidden_dim=128, num_layers=2, dropout=0.3, pred_len=7):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.pred_len = pred_len
        
        # LSTM layers
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        
        # Output layers
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, pred_len)
        )
    
    def forward(self, x):
        # x: [batch, seq_len, input_dim]
        lstm_out, _ = self.lstm(x)
        # Use last hidden state
        last_hidden = lstm_out[:, -1, :]  # [batch, hidden_dim]
        out = self.fc(last_hidden)  # [batch, pred_len]
        return out

# Initialize model
model = WeatherLSTM(
    input_dim=len(FEATURE_COLS),
    hidden_dim=128,
    num_layers=2,
    dropout=0.3,
    pred_len=PRED_LEN
).to(device)

print(f"üìä Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(model)

üìä Model parameters: 217,095
WeatherLSTM(
  (lstm): LSTM(19, 128, num_layers=2, batch_first=True, dropout=0.3)
  (fc): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=64, out_features=7, bias=True)
  )
)


## 6. Training

In [9]:
# Training setup
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)

# Training loop
epochs = 100
best_val_loss = float('inf')
patience = 15
patience_counter = 0
history = {'train_loss': [], 'val_loss': []}

print("üöÄ Starting training...")
for epoch in range(epochs):
    # Train
    model.train()
    train_losses = []
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        optimizer.zero_grad()
        pred = model(X_batch)
        loss = criterion(pred, y_batch)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
        optimizer.step()
        train_losses.append(loss.item())
    
    # Validate
    model.eval()
    val_losses = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            pred = model(X_batch)
            val_losses.append(criterion(pred, y_batch).item())
    
    train_loss = np.mean(train_losses)
    val_loss = np.mean(val_losses)
    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_loss)
    
    scheduler.step(val_loss)
    
    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        best_state = model.state_dict().copy()
    else:
        patience_counter += 1
    
    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1:3d} | Train: {train_loss:.4f} | Val: {val_loss:.4f} | LR: {optimizer.param_groups[0]['lr']:.6f}")
    
    if patience_counter >= patience:
        print(f"\n‚èπÔ∏è Early stopping at epoch {epoch+1}")
        break

# Restore best model
model.load_state_dict(best_state)
print(f"\n‚úÖ Best validation loss: {best_val_loss:.4f}")

üöÄ Starting training...
Epoch   5 | Train: 13.7887 | Val: 8.3908 | LR: 0.001000
Epoch  10 | Train: 12.4658 | Val: 8.2094 | LR: 0.001000
Epoch  15 | Train: 11.8365 | Val: 7.8440 | LR: 0.001000
Epoch  20 | Train: 11.4422 | Val: 7.8118 | LR: 0.001000
Epoch  25 | Train: 11.0968 | Val: 7.6983 | LR: 0.000500
Epoch  30 | Train: 10.7815 | Val: 7.7671 | LR: 0.000250
Epoch  35 | Train: 10.6631 | Val: 7.7222 | LR: 0.000250
Epoch  40 | Train: 10.6651 | Val: 7.7482 | LR: 0.000125
Epoch  45 | Train: 10.5686 | Val: 7.8122 | LR: 0.000125

‚èπÔ∏è Early stopping at epoch 49

‚úÖ Best validation loss: 7.6613


In [10]:
# Plot training history
fig = go.Figure()
fig.add_trace(go.Scatter(y=history['train_loss'], name='Train'))
fig.add_trace(go.Scatter(y=history['val_loss'], name='Validation'))
fig.update_layout(title='üìâ LSTM Training History', xaxis_title='Epoch', yaxis_title='MSE Loss',
                  paper_bgcolor='#0f0f1a', plot_bgcolor='#0f0f1a', font_color='white')
fig.show()

## 7. Evaluation

In [11]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

model.eval()
all_preds = []
all_targets = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        pred = model(X_batch).cpu().numpy()
        all_preds.append(pred)
        all_targets.append(y_batch.numpy())

y_pred = np.concatenate(all_preds)
y_true = np.concatenate(all_targets)

# Overall metrics
mae = mean_absolute_error(y_true.flatten(), y_pred.flatten())
rmse = np.sqrt(mean_squared_error(y_true.flatten(), y_pred.flatten()))
r2 = r2_score(y_true.flatten(), y_pred.flatten())

print("="*50)
print("üìä LSTM Model - Test Results (All Days)")
print("="*50)
print(f"MAE:  {mae:.2f}¬∞C")
print(f"RMSE: {rmse:.2f}¬∞C")
print(f"R¬≤:   {r2:.4f}")

üìä LSTM Model - Test Results (All Days)
MAE:  2.05¬∞C
RMSE: 2.79¬∞C
R¬≤:   0.9051


In [12]:
# MAE by forecast day
print("\nüìä MAE by Forecast Day:")
for day in range(PRED_LEN):
    day_mae = mean_absolute_error(y_true[:, day], y_pred[:, day])
    print(f"  Day {day+1}: {day_mae:.2f}¬∞C")


üìä MAE by Forecast Day:
  Day 1: 2.06¬∞C
  Day 2: 2.02¬∞C
  Day 3: 1.99¬∞C
  Day 4: 2.00¬∞C
  Day 5: 2.03¬∞C
  Day 6: 2.09¬∞C
  Day 7: 2.19¬∞C


In [13]:
# Actual vs Predicted scatter (Day 1)
fig = px.scatter(x=y_true[:, 0], y=y_pred[:, 0], opacity=0.3,
                 labels={'x': 'Actual (¬∞C)', 'y': 'Predicted (¬∞C)'},
                 title=f'üéØ LSTM: Day 1 Actual vs Predicted (MAE={mean_absolute_error(y_true[:, 0], y_pred[:, 0]):.2f}¬∞C)')
fig.add_trace(go.Scatter(x=[-20, 50], y=[-20, 50], mode='lines', name='Perfect',
                         line=dict(color='red', dash='dash')))
fig.update_layout(paper_bgcolor='#0f0f1a', plot_bgcolor='#0f0f1a', font_color='white')
fig.show()

## 8. Save Model Artifacts

In [14]:
# Save LSTM model
checkpoint = {
    'model_state_dict': model.state_dict(),
    'model_type': 'lstm',
    'input_dim': len(FEATURE_COLS),
    'hidden_dim': 128,
    'num_layers': 2,
    'dropout': 0.3,
    'seq_len': SEQ_LEN,
    'pred_len': PRED_LEN,
    'feature_cols': FEATURE_COLS,
    'metrics': {'mae': round(mae, 2), 'rmse': round(rmse, 2), 'r2': round(r2, 4)}
}
torch.save(checkpoint, '../models/lstm_model.pt')
print("‚úÖ Saved lstm_model.pt")

# Save scaler
joblib.dump(scaler, '../models/lstm_scaler.joblib')
print("‚úÖ Saved lstm_scaler.joblib")

# Save model config
config = {
    'version': '2.2-lstm',
    'model_type': 'lstm',
    'feature_cols': FEATURE_COLS,
    'input_dim': len(FEATURE_COLS),
    'hidden_dim': 128,
    'num_layers': 2,
    'dropout': 0.3,
    'seq_len': SEQ_LEN,
    'pred_len': PRED_LEN,
    'metrics': {'mae': round(mae, 2), 'rmse': round(rmse, 2), 'r2': round(r2, 4)}
}
with open('../models/lstm_config.json', 'w') as f:
    json.dump(config, f, indent=2)
print("‚úÖ Saved lstm_config.json")

‚úÖ Saved lstm_model.pt
‚úÖ Saved lstm_scaler.joblib
‚úÖ Saved lstm_config.json


## 9. Compare with MLP

In [15]:
# Load MLP config for comparison
try:
    with open('../models/model_config.json', 'r') as f:
        mlp_config = json.load(f)
    mlp_mae = mlp_config.get('metrics', {}).get('mae', 'N/A')
    
    print("\n" + "="*50)
    print("üìä Model Comparison")
    print("="*50)
    print(f"MLP MAE:  {mlp_mae}¬∞C")
    print(f"LSTM MAE: {mae:.2f}¬∞C")
    
    if isinstance(mlp_mae, (int, float)):
        improvement = ((mlp_mae - mae) / mlp_mae) * 100
        print(f"\nüéâ Improvement: {improvement:.1f}%")
except:
    print("Could not load MLP config for comparison")


üìä Model Comparison
MLP MAE:  1.6¬∞C
LSTM MAE: 2.05¬∞C

üéâ Improvement: -28.3%


## 10. Summary

In [16]:
print("="*60)
print("üéâ LSTM Model Training Complete!")
print("="*60)
print(f"\nüìä Model Performance:")
print(f"   ‚Ä¢ MAE:  {mae:.2f}¬∞C")
print(f"   ‚Ä¢ RMSE: {rmse:.2f}¬∞C")
print(f"   ‚Ä¢ R¬≤:   {r2:.4f}")
print(f"\nüì¶ Artifacts Saved:")
print(f"   ‚Ä¢ v2/models/lstm_model.pt")
print(f"   ‚Ä¢ v2/models/lstm_scaler.joblib")
print(f"   ‚Ä¢ v2/models/lstm_config.json")
print(f"\nüöÄ Next: Update web app to use LSTM model!")

üéâ LSTM Model Training Complete!

üìä Model Performance:
   ‚Ä¢ MAE:  2.05¬∞C
   ‚Ä¢ RMSE: 2.79¬∞C
   ‚Ä¢ R¬≤:   0.9051

üì¶ Artifacts Saved:
   ‚Ä¢ v2/models/lstm_model.pt
   ‚Ä¢ v2/models/lstm_scaler.joblib
   ‚Ä¢ v2/models/lstm_config.json

üöÄ Next: Update web app to use LSTM model!
