# Hyperparameter Search for CNN Model

This notebook implements a staged hyperparameter search for the `model_cnn_wss` architecture:

1. **Data Preparation**: Load data, split into train/validation/test, then apply subsampling within each split.
2. **Stage 1**: Tune learning rate and batch size.
3. **Stage 2**: Tune model capacity (number of convolutional blocks and base filter size).
4. **Stage 3**: Tune regularization (dropout, weight decay).
5. **Stage 4**: Final tuning (learning-rate scheduler, minor kernel tweaks, optional augmentation).

Each stage builds on the best settings from previous stages.  
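Make sure the module that defines `Model_CNN` is in the same directory as this notebook. Note that the import cell below loads it from `model_cnn_old.py`; update either the import or this note if `model_cnn_wss.py` is the intended file.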


In [5]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from model_cnn_old import Model_CNN

# Reproducibility: seed the Python, NumPy and PyTorch RNGs with one shared value.
# SEED is also reused as random_state for the train/val/test splits further down.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Ask cuDNN for deterministic kernels and switch off its benchmark auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')


Using device: cpu


In [7]:
# Instantiate the Model_CNN to access data loading methods.
# The ERA5 directory can be overridden with the ERA5_DATA_PATH environment variable so the
# notebook is not tied to one machine; the original location remains the default.
data_path = os.environ.get(
    'ERA5_DATA_PATH',
    '/Users/lauritsfauli/PycharmProjects/Final_project_APML/era5',
)
mc = Model_CNN(data_path=data_path)  # adjust args as needed

# Load raw data
mc.load_data()

# Prepare splits without subsampling
X = mc.X  # shape (N, C, H, W)
y = mc.target  # shape (N,)
assert len(X) == len(y), "inputs and targets must contain the same number of samples"

# NOTE(review): the samples look like consecutive daily fields, so a shuffled split may put
# strongly correlated neighbouring days into different splits — confirm this is intended.

# 1) First split into train (70 %) and a temporary hold-out (30 %)
X_train_full, X_temp_full, y_train_full, y_temp_full = train_test_split(
    X, y, test_size=0.3, random_state=SEED, shuffle=True
)

# 2) Split the hold-out evenly into validation and test (15 % / 15 % of the total)
X_val_full, X_test_full, y_val_full, y_test_full = train_test_split(
    X_temp_full, y_temp_full, test_size=0.5, random_state=SEED, shuffle=True
)

print('Full shapes:')
print(f'  X_train_full: {X_train_full.shape}, y_train_full: {y_train_full.shape}')
print(f'  X_val_full:   {X_val_full.shape},   y_val_full:   {y_val_full.shape}')
print(f'  X_test_full:  {X_test_full.shape},   y_test_full:   {y_test_full.shape}')

# Scale targets with statistics from the training split only, so no information from the
# validation/test targets leaks into the scaler. Values outside the training range can
# therefore fall outside [0, 1] after the transform.
scaler = MinMaxScaler()
y_train_full = scaler.fit_transform(y_train_full.reshape(-1, 1)).flatten()
y_val_full   = scaler.transform(y_val_full.reshape(-1, 1)).flatten()
y_test_full  = scaler.transform(y_test_full.reshape(-1, 1)).flatten()


Msl input shape: (1827, 120, 408)
input_file_names: ['era5_1940_mean_sea_level_pressure.nc', 'era5_1941_mean_sea_level_pressure.nc', 'era5_1942_mean_sea_level_pressure.nc', 'era5_1943_mean_sea_level_pressure.nc', 'era5_1944_mean_sea_level_pressure.nc']
T850 input shape: (1827, 120, 408)
input_file_names_t850: ['era5_1940_temperature_850hPa.nc', 'era5_1941_temperature_850hPa.nc', 'era5_1942_temperature_850hPa.nc', 'era5_1943_temperature_850hPa.nc', 'era5_1944_temperature_850hPa.nc']
Precipitation target shape: (1827,)
target_file_names: ['era5_1940_total_precipitation.nc', 'era5_1941_total_precipitation.nc', 'era5_1942_total_precipitation.nc', 'era5_1943_total_precipitation.nc', 'era5_1944_total_precipitation.nc']
No sub-sampling applied.
X shape: (1827, 2, 120, 408)
Precipitation target shape: (1827,)
Full shapes:
  X_train_full: (1278, 2, 120, 408), y_train_full: (1278,)
  X_val_full:   (274, 2, 120, 408),   y_val_full:   (274,)
  X_test_full:  (275, 2, 120, 408),   y_test_full:   (27

In [8]:
# Define subsampling function (spatial patches)
def sample_data(X_array, n):
    """Cut every sample into ``n`` equal bands along H and stack the bands as new samples.

    Parameters
    ----------
    X_array : array of shape (M, C, H, W)
        Input samples; H must be divisible by ``n``.
    n : int
        Number of bands per sample.

    Returns
    -------
    Array of shape (n * M, C, H // n, W). The ordering is band-major: band 0 of all M
    samples comes first, then band 1 of all M samples, and so on.
    """
    _, _, height, _ = X_array.shape
    assert height % n == 0, "H must be divisible by subsample_dim"
    band = height // n
    # One slice per band; each slice still holds all M samples.
    bands = [X_array[:, :, k * band:(k + 1) * band, :] for k in range(n)]
    return np.concatenate(bands, axis=0)

# Example: verify shapes
# Splits the training inputs into 4 bands along H and prints the resulting shape.
# sample_data builds its result with np.concatenate, so this holds a second full-size
# copy of the training inputs in memory.
X_train_sub_example = sample_data(X_train_full, n=4)
print(f'After subsampling train_full with n=4: {X_train_sub_example.shape}')


After subsampling train_full with n=4: (5112, 2, 30, 408)


In [9]:
def prepare_datasets(X_train, y_train, X_val, y_val, X_test, y_test, sub_sampling=False, sub_sample_dim=4):
    """Turn the three NumPy splits into float tensors, optionally after spatial subsampling.

    Parameters
    ----------
    X_train, X_val, X_test : np.ndarray of shape (M, C, H, W)
        Inputs of each split.
    y_train, y_val, y_test : np.ndarray of shape (M,)
        Targets of each split.
    sub_sampling : bool, optional
        When True, every input is cut into ``sub_sample_dim`` patches with ``sample_data``
        and the targets are replicated so that each patch keeps its sample's target.
    sub_sample_dim : int, optional
        Number of patches per sample; only used when ``sub_sampling`` is True.

    Returns
    -------
    ((X_train_t, y_train_t), (X_val_t, y_val_t), (X_test_t, y_test_t))
        float32 tensors; every target tensor has shape (num_samples, 1).
    """
    def expand(X_split, y_split):
        # Leave the split untouched unless subsampling was requested.
        if not sub_sampling:
            return X_split, y_split
        # sample_data stacks patch 0 of every sample, then patch 1 of every sample, ...
        # The targets must follow the same order (y0..yM, y0..yM, ...), which is np.tile.
        # np.repeat would give y0, y0, ..., y1, y1, ... and pair patches with wrong targets.
        return sample_data(X_split, sub_sample_dim), np.tile(y_split, sub_sample_dim)

    def to_tensors(X_split, y_split):
        # Targets get a trailing axis so they are stored as a column, shape (num_samples, 1).
        X_t = torch.from_numpy(X_split).float()
        y_t = torch.from_numpy(y_split).float().unsqueeze(1)
        return X_t, y_t

    splits = ((X_train, y_train), (X_val, y_val), (X_test, y_test))
    return tuple(to_tensors(*expand(X_split, y_split)) for X_split, y_split in splits)


In [10]:
def train_and_evaluate(hparams, train_data, val_data, test_data, epochs=20):
    """Train a freshly built CNN with Adam and report validation and test MSE.

    Parameters
    ----------
    hparams : dict
        Must contain 'lr', 'batch_size', 'conv_cfg' (list of tuples), 'fc_cfg' (list of
        ints), 'dropout' and 'weight_decay'.
    train_data, val_data, test_data : tuple
        (X, y) tensor pairs, e.g. as returned by ``prepare_datasets``.
    epochs : int, optional
        Number of training epochs (default 20).

    Returns
    -------
    (best_val_loss, test_loss) : tuple of float
        Lowest per-epoch validation MSE and the test MSE of the weights that achieved it.
        If no epoch improved on ``inf`` (e.g. every validation loss was NaN), the weights
        from the last epoch are evaluated and ``best_val_loss`` stays ``inf``.

    Side effects
    ------------
    Writes the best weights to 'best_model.pth' in the working directory.
    """
    import copy  # local import: only needed to snapshot the best weights

    # Unpack hyperparameters
    lr = hparams['lr']
    # Values read back from a DataFrame row arrive as floats; DataLoader requires a Python int.
    batch_size = int(hparams['batch_size'])
    conv_cfg = hparams['conv_cfg']   # list of tuples
    fc_cfg = hparams['fc_cfg']       # list of ints
    drop = hparams['dropout']
    weight_decay = hparams['weight_decay']

    (X_tr, y_tr), (X_v, y_v), (X_te, y_te) = train_data, val_data, test_data

    # Instantiate a fresh Model_CNN and build the model.
    # NOTE(review): the `(...)` range arguments pass a literal Ellipsis and look like
    # placeholders — confirm the intended latitude/longitude ranges.
    mc = Model_CNN(data_path='./data', latitude_range=(...), longitude_range=(...))
    # Stub the inputs with the data that is actually trained on (not the global full-size
    # array), so the model matches the tensors it receives when subsampling is enabled.
    # Presumably build_model only inspects mc.X for its shape — TODO confirm.
    mc.X = X_tr.detach().cpu().numpy()
    mc.build_model(dropout_rate=drop, conv_layers=conv_cfg, fc_layers=fc_cfg)
    model = mc.model.to(device)

    # DataLoaders
    train_loader = DataLoader(TensorDataset(X_tr, y_tr), batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(TensorDataset(X_v, y_v), batch_size=batch_size, shuffle=False)
    test_loader  = DataLoader(TensorDataset(X_te, y_te), batch_size=batch_size, shuffle=False)

    # Loss & optimizer
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    def mean_loss(loader):
        """Per-sample mean of `criterion` over `loader`, with the model in eval mode."""
        model.eval()
        total, count = 0.0, 0
        with torch.no_grad():
            for xb, yb in loader:
                xb, yb = xb.to(device), yb.to(device)
                # Weight each batch by its size so a short final batch is not over-counted
                # and results stay comparable across different batch sizes.
                total += criterion(model(xb), yb).item() * xb.size(0)
                count += xb.size(0)
        return total / count if count else float('nan')

    best_val_loss = float('inf')
    best_state = None
    for _ in range(epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            optimizer.step()

        avg_val_loss = mean_loss(val_loader)
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # Keep the snapshot in memory: re-reading a fixed file could silently pick up a
            # stale checkpoint from an earlier configuration when no epoch improves.
            best_state = copy.deepcopy(model.state_dict())
            torch.save(best_state, 'best_model.pth')

    # Restore the best weights (if any epoch improved) and evaluate on the test split
    if best_state is not None:
        model.load_state_dict(best_state)
    avg_test_loss = mean_loss(test_loader)
    return best_val_loss, avg_test_loss


## Stage 1: Learning Rate & Batch Size

We keep a fixed architecture (3 conv blocks with filters [32, 64, 128], kernels=3, pooling=2; FC sizes [256, 128]) and vary:
- Learning rate: [1e-4, 3e-4, 1e-3, 3e-3]
- Batch size: [16, 32, 64]


In [11]:
# Define fixed architecture for Stage 1
# Per the stage description, each conv tuple is (filters, kernel size, pooling).
base_conv_cfg = [(32, 3, 2), (64, 3, 2), (128, 3, 2)]
base_fc_cfg   = [256, 128]

lrs = [1e-4, 3e-4, 1e-3, 3e-3]
batch_sizes = [16, 32, 64]

# The tensors do not depend on the hyperparameters, so build them once instead of
# re-converting the full arrays for every configuration (subsampling disabled here).
train_data, val_data, test_data = prepare_datasets(
    X_train_full, y_train_full,
    X_val_full,   y_val_full,
    X_test_full,  y_test_full,
    sub_sampling=False, sub_sample_dim=4
)

results_stage1 = []

# Full grid over learning rate x batch size; regularization is held fixed.
for lr in lrs:
    for bs in batch_sizes:
        hparams = {
            'lr': lr,
            'batch_size': bs,
            'conv_cfg': base_conv_cfg,
            'fc_cfg': base_fc_cfg,
            'dropout': 0.2,
            'weight_decay': 1e-5
        }
        val_loss, test_loss = train_and_evaluate(hparams, train_data, val_data, test_data)
        results_stage1.append((lr, bs, val_loss, test_loss))
        print(f"LR {lr}, BS {bs} -> Val: {val_loss:.4f}, Test: {test_loss:.4f}")

# Save to DataFrame (model selection in the next stage sorts on val_loss)
df_stage1 = pd.DataFrame(results_stage1, columns=['lr', 'batch_size', 'val_loss', 'test_loss'])
df_stage1

Layer (type:depth-idx)                   Output Shape              Param #
FlexibleCNN                              [1, 1]                    --
├─Sequential: 1-1                        [1, 128, 15, 51]          --
│    └─Conv2d: 2-1                       [1, 32, 120, 408]         608
│    └─BatchNorm2d: 2-2                  [1, 32, 120, 408]         64
│    └─ReLU: 2-3                         [1, 32, 120, 408]         --
│    └─MaxPool2d: 2-4                    [1, 32, 60, 204]          --
│    └─Dropout2d: 2-5                    [1, 32, 60, 204]          --
│    └─Conv2d: 2-6                       [1, 64, 60, 204]          18,496
│    └─BatchNorm2d: 2-7                  [1, 64, 60, 204]          128
│    └─ReLU: 2-8                         [1, 64, 60, 204]          --
│    └─MaxPool2d: 2-9                    [1, 64, 30, 102]          --
│    └─Dropout2d: 2-10                   [1, 64, 30, 102]          --
│    └─Conv2d: 2-11                      [1, 128, 30, 102]         73,856
│    

KeyboardInterrupt: 

## Stage 2: Model Capacity (Depth & Width)

Using best LR & BS from Stage 1, vary:
- Number of conv blocks: [2, 3, 4]
- Base filter size: [16, 32]


In [None]:
# Pick best LR & BS from df_stage1 (lowest validation loss)
best = df_stage1.sort_values('val_loss').iloc[0]
# .iloc[0] returns a single-dtype row, so batch_size comes back as a float (e.g. 32.0);
# DataLoader only accepts a Python int, hence the explicit casts.
best_lr = float(best['lr'])
best_bs = int(best['batch_size'])
print(f"Best from Stage1: lr={best_lr}, bs={best_bs}")

# Stage 2 search
depths = [2, 3, 4]
base_filters = [16, 32]
results_stage2 = []

# The data is identical for every architecture, so prepare it once.
train_data, val_data, test_data = prepare_datasets(
    X_train_full, y_train_full,
    X_val_full,   y_val_full,
    X_test_full,  y_test_full,
    sub_sampling=False, sub_sample_dim=4
)

for depth in depths:
    for bf in base_filters:
        # Build conv config: the filter count doubles with every block (bf, 2*bf, 4*bf, ...)
        conv_cfg = [(bf * (2**i), 3, 2) for i in range(depth)]
        hparams = {
            'lr': best_lr,
            'batch_size': best_bs,
            'conv_cfg': conv_cfg,
            'fc_cfg': [256, 128],
            'dropout': 0.2,
            'weight_decay': 1e-5
        }
        val_loss, test_loss = train_and_evaluate(hparams, train_data, val_data, test_data)
        results_stage2.append((depth, bf, val_loss, test_loss))
        print(f"Depth {depth}, BaseFilt {bf} -> Val: {val_loss:.4f}, Test: {test_loss:.4f}")

df_stage2 = pd.DataFrame(results_stage2, columns=['depth', 'base_filters', 'val_loss', 'test_loss'])
df_stage2

## Stage 3: Regularization (Dropout & Weight Decay)

Using best architecture from Stage 2, vary:
- Dropout: [0.1, 0.2, 0.3]
- Weight decay: [1e-4, 1e-5, 1e-6]


In [None]:
# Pick best architecture from Stage2 (lowest validation loss)
best2 = df_stage2.sort_values('val_loss').iloc[0]
# The row comes back as floats, so cast the integer-valued settings explicitly.
best_depth = int(best2['depth'])
best_bf = int(best2['base_filters'])
print(f"Best from Stage2: depth={best_depth}, base_filters={best_bf}")

# Reconstruct conv_cfg for Stage 3 with the same doubling rule used in Stage 2
conv_cfg_best = [(best_bf * (2**i), 3, 2) for i in range(best_depth)]
fc_cfg_best = [256, 128]

dropouts = [0.1, 0.2, 0.3]
wds = [1e-4, 1e-5, 1e-6]

results_stage3 = []

# The data is identical for every regularization setting, so prepare it once.
train_data, val_data, test_data = prepare_datasets(
    X_train_full, y_train_full,
    X_val_full,   y_val_full,
    X_test_full,  y_test_full,
    sub_sampling=False, sub_sample_dim=4
)

for drop in dropouts:
    for wd in wds:
        hparams = {
            'lr': best_lr,
            # best_bs may still be a float taken from a DataFrame row; DataLoader needs an int.
            'batch_size': int(best_bs),
            'conv_cfg': conv_cfg_best,
            'fc_cfg': fc_cfg_best,
            'dropout': drop,
            'weight_decay': wd
        }
        val_loss, test_loss = train_and_evaluate(hparams, train_data, val_data, test_data)
        results_stage3.append((drop, wd, val_loss, test_loss))
        print(f"Drop {drop}, WD {wd} -> Val: {val_loss:.4f}, Test: {test_loss:.4f}")

df_stage3 = pd.DataFrame(results_stage3, columns=['dropout', 'weight_decay', 'val_loss', 'test_loss'])
df_stage3

## Stage 4: Final Tuning – Learning Rate Scheduler

Take the best config from Stage 3 and try a OneCycle learning-rate scheduler with peak learning rates of 10×, 5× and 2× the best learning rate from Stage 1.


In [None]:
# Pick best reg settings from Stage3 (lowest validation loss)
best3 = df_stage3.sort_values('val_loss').iloc[0]
# Convert to plain Python floats so the values are not tied to the DataFrame row.
best_drop = float(best3['dropout'])
best_wd = float(best3['weight_decay'])
print(f"Best from Stage3: dropout={best_drop}, weight_decay={best_wd}")

# Final configuration: Stage 1 optimizer settings, Stage 2 architecture, Stage 3 regularization
hparams = {
    'lr': best_lr,  # We'll modify with scheduler
    # best_bs may still be a float taken from a DataFrame row; DataLoader needs an int.
    'batch_size': int(best_bs),
    'conv_cfg': conv_cfg_best,
    'fc_cfg': fc_cfg_best,
    'dropout': best_drop,
    'weight_decay': best_wd
}

# Define training with OneCycleLR
def train_with_onecycle(hparams, train_data, val_data, test_data, max_lr, epochs=20):
    """Train a freshly built CNN with Adam + OneCycleLR and report validation and test MSE.

    Parameters
    ----------
    hparams : dict
        Must contain 'lr', 'batch_size', 'conv_cfg', 'fc_cfg', 'dropout' and 'weight_decay'.
    train_data, val_data, test_data : tuple
        (X, y) tensor pairs, e.g. as returned by ``prepare_datasets``.
    max_lr : float
        Peak learning rate handed to ``OneCycleLR``.
    epochs : int, optional
        Number of training epochs; also used to size the scheduler (default 20).

    Returns
    -------
    (best_val_loss, test_loss) : tuple of float
        Lowest per-epoch validation MSE and the test MSE of the weights that achieved it.
        If no epoch improved on ``inf`` (e.g. every validation loss was NaN), the weights
        from the last epoch are evaluated and ``best_val_loss`` stays ``inf``.

    Side effects
    ------------
    Writes the best weights to 'best_onecycle.pth' in the working directory.
    """
    import copy  # local import: only needed to snapshot the best weights

    # Values read back from a DataFrame row arrive as floats; DataLoader requires a Python int.
    batch_size = int(hparams['batch_size'])

    (X_tr, y_tr), (X_v, y_v), (X_te, y_te) = train_data, val_data, test_data

    # Build model.
    # NOTE(review): the `(...)` range arguments pass a literal Ellipsis and look like
    # placeholders — confirm the intended latitude/longitude ranges.
    mc = Model_CNN(data_path='./data', latitude_range=(...), longitude_range=(...))
    # Stub the inputs with the data that is actually trained on (not the global full-size
    # array), so the model matches the tensors it receives when subsampling is enabled.
    # Presumably build_model only inspects mc.X for its shape — TODO confirm.
    mc.X = X_tr.detach().cpu().numpy()
    mc.build_model(dropout_rate=hparams['dropout'],
                   conv_layers=hparams['conv_cfg'],
                   fc_layers=hparams['fc_cfg'])
    model = mc.model.to(device)

    train_loader = DataLoader(TensorDataset(X_tr, y_tr), batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(TensorDataset(X_v, y_v), batch_size=batch_size, shuffle=False)
    test_loader  = DataLoader(TensorDataset(X_te, y_te), batch_size=batch_size, shuffle=False)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=hparams['lr'], weight_decay=hparams['weight_decay'])
    # The schedule spans epochs * len(train_loader) steps, so it must be stepped once per batch.
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=max_lr, epochs=epochs, steps_per_epoch=len(train_loader)
    )

    def mean_loss(loader):
        """Per-sample mean of `criterion` over `loader`, with the model in eval mode."""
        model.eval()
        total, count = 0.0, 0
        with torch.no_grad():
            for xb, yb in loader:
                xb, yb = xb.to(device), yb.to(device)
                # Weight each batch by its size so a short final batch is not over-counted.
                total += criterion(model(xb), yb).item() * xb.size(0)
                count += xb.size(0)
        return total / count if count else float('nan')

    best_val_loss = float('inf')
    best_state = None
    for _ in range(epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_val_loss = mean_loss(val_loader)
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # Keep the snapshot in memory: re-reading a fixed file could silently pick up a
            # stale checkpoint from an earlier run when no epoch improves.
            best_state = copy.deepcopy(model.state_dict())
            torch.save(best_state, 'best_onecycle.pth')

    # Restore the best weights (if any epoch improved) and evaluate on the test split
    if best_state is not None:
        model.load_state_dict(best_state)
    return best_val_loss, mean_loss(test_loader)

results_stage4 = []

# The data is identical for every peak learning rate, so prepare it once.
train_data, val_data, test_data = prepare_datasets(
    X_train_full, y_train_full,
    X_val_full,   y_val_full,
    X_test_full,  y_test_full,
    sub_sampling=False, sub_sample_dim=4
)

# Try peak learning rates of 10x, 5x and 2x the best fixed learning rate from Stage 1.
for max_lr in [best_lr * 10, best_lr * 5, best_lr * 2]:
    val_loss, test_loss = train_with_onecycle(hparams, train_data, val_data, test_data, max_lr)
    results_stage4.append((max_lr, val_loss, test_loss))
    print(f"max_lr {max_lr} -> Val: {val_loss:.4f}, Test: {test_loss:.4f}")

df_stage4 = pd.DataFrame(results_stage4, columns=['max_lr', 'val_loss', 'test_loss'])
df_stage4