This notebook compares model architectures, hyperparameters, inputs, and the resulting training/validation error.

In [4]:
import warnings

import numpy as np
import xarray as xr

from src.crossval import run_crossval

In [5]:
# Locations read by the cross-validation sweep.
# Zarr input stores (one per data level) live under input_dir.
input_dir = "/glade/work/milesep/convective_outlook_ml"
# Target and statistics netCDF files share one (relative) directory.
target_dir = stats_dir = "data/processed_data"

In [3]:
# Sweep definition: one cross-validation run per entry. Each name is paired
# position-by-position with a data level, learning rate and batch size, so an
# architecture is listed once for every hyperparameter combination it is run with.
# Rows below are grouped per data level: 26 small, 28 slgt_small, 8 full.
_small_models = [
    "linear_regression", "cnn3d_dropout_0_5", "cnn3d_dropout_5_5", "cnn3d_dropout_5_0", "cnn3d",
    *["cnn3d_gelu_0_5"] * 4,
    *["cnn3d_gelu_0_5"] * 3,
    *["cnn3d_gelu_0_5"] * 3,
    "cnn3d_gelu_0_5", "cnn3d_dropout_5_5", "cnn3d_dropout_5_0",
    *["cnn3d_gelu_2_6"] * 2,
    "cnn3d_big_kernal", "cnn3d_huge_kernal",
    "cnn3d_3_layer", "cnn3d_3_layer_big", "cnn3d_fewer_channels", "cnn3d_3_layer_fewer_channels",
]

_slgt_small_models = [
    "cnn3d_gelu_2_6", "cnn3d_gelu_0_5", "cnn3d_3_layer", "cnn3d_fewer_channels", "cnn3d_3_layer_big",
    *["cnn3d_gelu_2_6"] * 4,
    "cnn3d_3_layer_fewer_channels", "cnn3d_3_layer_big", "cnn3d_3_layer", "cnn3d_3_layer_1_3",
    "cnn3d_3_layer_fewer_channels_1_3",
    "cnn3d_3_layer_fewer_channels", "cnn3d_3_layer_big", "cnn3d_3_layer", "cnn3d_3_layer_1_3",
    "cnn3d_3_layer_big_1_3", "cnn3d_3_layer_fewer_channels_1_3",
    "cnn3d_4_layer", "cnn3d_3_layer_big_0", "cnn3d_3_layer_big_1_3",
    "cnn3d_4_layer_1_3", "cnn3d_4_layer_big", "cnn3d_4_layer_big_1_3", "predict_zero", "predict_mean",
]

_full_models = [
    "cnn3d_gelu_0_5", "cnn3d_gelu_2_6", "cnn3d_big_kernal", "cnn3d_huge_kernal",
    "cnn3d_3_layer", "cnn3d_3_layer_big", "cnn3d_fewer_channels", "cnn3d_3_layer_fewer_channels",
]

# Flat list consumed by the sweep loop; order is small, then slgt_small, then full.
model_names = [*_small_models, *_slgt_small_models, *_full_models]

# Data level for each run, paired by position with model_names.
# The counts must equal the number of names in each section of model_names
# (26 small, 28 slgt_small, 8 full). The previous 26/27/9 split had the same
# total, so zip() raised no suspicion, but it paired the last slgt_small entry
# ("predict_mean") with the "full" inputs/stats and the non-slgt targets.
levels = [
    # small
    *["small"]*26,
    # slgt_small
    *["slgt_small"]*28,
    # full
    *["full"]*8
]

# Learning rate for each run, paired by position with model_names
# (26 small, 28 slgt_small, 8 full); rows mirror the rows of model_names.
# The "full" section used to hold nine values for eight model names; the
# surplus trailing value was never consumed by zip() and has been removed.
# NOTE(review): if a ninth full-level run was intended, add its name to
# model_names (and a matching entry here) rather than restoring the extra value.
lrs = [
    # small
    1e-3, 1e-3, 1e-3, 1e-3, 1e-3,
    1e-3, 1e-3, 1e-3, 1e-3,
    1e-4, 1e-4, 1e-4,
    4e-4, 4e-4, 4e-4,
    1e-3, 1e-3, 1e-3,
    1e-3, 1e-3,
    1e-3, 1e-3,
    1e-3, 1e-3, 1e-3, 1e-3,
    # slgt_small
    1e-3, 1e-3, 1e-3, 1e-3, 1e-3,
    1e-4, 1e-4, 5e-3, 5e-3,
    5e-3, 5e-3, 5e-3, 5e-3,
    5e-3,
    1e-2, 1e-2, 1e-2, 1e-2,
    1e-2, 1e-2,
    5e-3, 5e-3, 5e-3,
    5e-3, 5e-3, 5e-3, 5e-3, 5e-3,
    # full
    1e-3, 1e-3, 1e-3, 1e-3,
    1e-3, 1e-3, 1e-3, 1e-3
]

# Batch size for each run, paired by position with model_names
# (26 small, 28 slgt_small, 8 full); rows mirror the rows of model_names.
# The "full" section used to hold nine values for eight model names; the
# surplus trailing value was never consumed by zip() and has been removed.
# NOTE(review): if a ninth full-level run was intended, add its name to
# model_names (and a matching entry here) rather than restoring the extra value.
batch_sizes = [
    # small
    32, 32, 32, 32, 32,
    32, 4, 8, 32,
    4, 8, 32,
    4, 8, 32,
    64, 64, 64,
    64, 8,
    8, 8,
    8, 8, 8, 8,
    # slgt_small
    8, 8, 8, 8, 8,
    4, 32, 4, 32,
    32, 32, 32, 32,
    32,
    64, 64, 64, 64,
    64, 64,
    32, 32, 32,
    32, 32, 32, 32, 32,
    # full
    8, 8, 8, 8,
    8, 8, 8, 8
]

# Every run trains for the same number of epochs.
epochs = [50] * len(model_names)

# zip() stops at the shortest list, so a miscounted list silently drops runs
# (or leaves stray hyperparameters unused). Report any length mismatch up front,
# before the long-running sweep starts. Equal lengths do not prove the sections
# line up, so the per-level counts in the lists still need to agree.
sweep_lengths = {
    "model_names": len(model_names),
    "levels": len(levels),
    "lrs": len(lrs),
    "batch_sizes": len(batch_sizes),
    "epochs": len(epochs),
}
if len(set(sweep_lengths.values())) != 1:
    warnings.warn(
        f"Sweep lists differ in length ({sweep_lengths}); "
        "zip() will truncate to the shortest list."
    )

# Don't do linear regression with full dataset--too many parameters
for name, level, lr, batch_size, epoch in zip(model_names, levels, lrs, batch_sizes, epochs):
    # Levels whose name starts with 'slgt' read the '_slgt' variant of the targets file.
    slgt_mod_str = '_slgt' if level.startswith('slgt') else ''
    print(name, level, lr, batch_size)
    # Open the three datasets in a context manager so their file handles are
    # released after each run instead of accumulating across the whole sweep.
    with xr.open_zarr(f"{input_dir}/train_inputs_{level}.zarr") as inputs, \
         xr.open_dataset(f"{target_dir}/train_targets{slgt_mod_str}.nc") as targets, \
         xr.open_dataset(f"{stats_dir}/daily_input_stats_{level}.nc") as stats:
        # restart=True is passed for every run (see src.crossval for its exact meaning).
        scores, train_counts, val_counts = run_crossval(
            inputs, targets, stats, name,
            batch_size = batch_size, lr = lr, epochs = epoch, level = level, restart = True,
        )
        # Mean is weighted by val_counts; the ± term is the plain (unweighted) std of the scores.
        print(f"{name}: {np.average(scores, weights = val_counts):.3f} ± {np.std(scores):.3f}")

predict_mean small 0.005 32
Using device: cuda

Fold 0:
Loading data...
Standardizing data...
Setting up datasets...
Model has no parameters — skipping device check
Starting training from scratch


Training: 100%|██████████| 50/50 [01:22<00:00,  1.64s/epoch, train_loss=1.0004, val_loss=1.0019]



Fold 1:
Loading data...
Standardizing data...
Setting up datasets...
Model has no parameters — skipping device check
Starting training from scratch


Training: 100%|██████████| 50/50 [01:21<00:00,  1.63s/epoch, train_loss=0.9406, val_loss=1.2400]



Fold 2:
Loading data...
Standardizing data...
Setting up datasets...
Model has no parameters — skipping device check
Starting training from scratch


Training: 100%|██████████| 50/50 [01:24<00:00,  1.68s/epoch, train_loss=0.9996, val_loss=1.0048]



Fold 3:
Loading data...
Standardizing data...
Setting up datasets...
Model has no parameters — skipping device check
Starting training from scratch


Training: 100%|██████████| 50/50 [01:22<00:00,  1.65s/epoch, train_loss=1.0149, val_loss=0.9478]



Fold 4:
Loading data...
Standardizing data...
Setting up datasets...
Model has no parameters — skipping device check
Starting training from scratch


Training: 100%|██████████| 50/50 [01:23<00:00,  1.68s/epoch, train_loss=1.0397, val_loss=0.8430]


Logging average losses...
Reading fold 0
Reading fold 1
Reading fold 2
Reading fold 3
Reading fold 4
predict_mean: 1.008 ± 0.130
predict_zero small 0.005 32
Using device: cuda

Fold 0:
Loading data...
Standardizing data...
Setting up datasets...
Model has no parameters — skipping device check
Starting training from scratch


Training: 100%|██████████| 50/50 [01:24<00:00,  1.69s/epoch, train_loss=1.0011, val_loss=0.9956]



Fold 1:
Loading data...
Standardizing data...
Setting up datasets...
Model has no parameters — skipping device check
Starting training from scratch


Training: 100%|██████████| 50/50 [01:22<00:00,  1.65s/epoch, train_loss=0.9417, val_loss=1.2307]



Fold 2:
Loading data...
Standardizing data...
Setting up datasets...
Model has no parameters — skipping device check
Starting training from scratch


Training: 100%|██████████| 50/50 [01:23<00:00,  1.67s/epoch, train_loss=1.0002, val_loss=0.9991]



Fold 3:
Loading data...
Standardizing data...
Setting up datasets...
Model has no parameters — skipping device check
Starting training from scratch


Training: 100%|██████████| 50/50 [01:25<00:00,  1.70s/epoch, train_loss=1.0164, val_loss=0.9339]



Fold 4:
Loading data...
Standardizing data...
Setting up datasets...
Model has no parameters — skipping device check
Starting training from scratch


Training: 100%|██████████| 50/50 [01:23<00:00,  1.67s/epoch, train_loss=1.0404, val_loss=0.8373]


Logging average losses...
Reading fold 0
Reading fold 1
Reading fold 2
Reading fold 3
Reading fold 4
predict_zero: 1.000 ± 0.130
predict_mean slgt_small 0.005 32
Using device: cuda

Fold 0:
Loading data...
Standardizing data...
Setting up datasets...
Model has no parameters — skipping device check
Starting training from scratch


Training:   2%|▏         | 1/50 [00:22<18:09, 22.23s/epoch, train_loss=0.9574, val_loss=1.1708]


KeyboardInterrupt: 