# Hyperparam Tuning

## 3 Edge Config

In [1]:
# IPython extension that automatically reloads imported modules, so edits to them take effect before any code in this notebook runs
%load_ext autoreload 
%autoreload 2

In [2]:
from utils.jraph_training import train_and_evaluate_with_data, create_dataset
# from utils.jraph_models import MLPGraphNetwork
from utils.jraph_data import print_graph_fts
from utils.jraph_vis import plot_predictions
from utils.hyperparam_tuning import remove_bad_trials, get_best_trial_config, get_best_trial_workdir
import ml_collections
import optuna 
from flax import linen as nn
from functools import partial
from datetime import datetime
import os 

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# set up logging: getLogger() with no name returns the root logger, and the
# WARNING level means only WARNING-or-more-severe records pass through it
import logging
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

In [4]:
# Root directory under which each study gets its own sub-directory (trial
# checkpoints and the Optuna SQLite database live there).
# The location can be overridden with the LORENZGNN_CHECKPOINT_PATH environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original hard-coded location is used.
CHECKPOINT_PATH = os.environ.get(
    "LORENZGNN_CHECKPOINT_PATH",
    "/Users/miamirabelli/Desktop/GNN Research/lorenzGNN/experiments/tuning",
)

In [5]:
def objective(trial, study_name, datasets):
    """Optuna objective: train one model for this trial and return its validation loss.

        Args:
            trial: object which characterizes the current run; hyperparameters
                are drawn from it and it is forwarded to the training routine.
            study_name: name of the study; used as the sub-directory of
                CHECKPOINT_PATH in which this trial's workdir is placed.
            datasets: dictionary of data. Passed in explicitly so the same
                dataset is not regenerated for every trial.

        Returns:
            The value of `eval_metrics_dict['val'].compute()['loss']`
            (validation loss; MSE according to the original note).
    """
    cfg = ml_collections.ConfigDict()

    # --- Optimizer -------------------------------------------------------
    # SGD is used here ("adam" was the alternative). The learning-rate and
    # momentum ranges have low == high, so they are effectively fixed while
    # still being recorded as trial parameters.
    cfg.optimizer = 'sgd'
    fixed_lr = 0.00045346796177033903
    cfg.learning_rate = trial.suggest_float(
        'learning_rate', fixed_lr, fixed_lr, log=True)
    fixed_momentum = 0.8712873602503628
    cfg.momentum = trial.suggest_float(
        'momentum', fixed_momentum, fixed_momentum)

    # --- Data params that are used in training ----------------------------
    cfg.output_steps = 6

    # --- Training hyperparameters ------------------------------------------
    cfg.epochs = 50
    cfg.log_every_epochs = 5
    cfg.eval_every_epochs = 5
    cfg.checkpoint_every_epochs = 10
    cfg.max_checkpts_to_keep = 2

    # --- GNN hyperparameters -----------------------------------------------
    cfg.model = 'MLPBlock'
    cfg.dropout_rate = 0.013287043114620523
    # Setting this to True was throwing a broadcast error in
    # add_graphs_tuples_nodes.
    cfg.skip_connections = False
    # TODO perhaps we want to turn on later
    cfg.layer_norm = False
    cfg.activation = 'relu'
    # Range is a single point, so the number of blocks is fixed at 1.
    cfg.n_blocks = trial.suggest_int('n_blocks', 1, 1)

    # Hidden layer feature sizes are searched as powers of 2; suggest_int's
    # upper bound is inclusive.
    edge_pow_1 = trial.suggest_int("edge_mlp_1_power", 1, 3)  # size 2 - 8
    edge_pow_2 = trial.suggest_int("edge_mlp_2_power", 1, 3)  # size 2 - 8
    cfg.edge_features = (2**edge_pow_1, 2**edge_pow_2)

    node_pow_1 = trial.suggest_int("node_mlp_1_power", 1, 8)  # size 2 - 256
    node_pow_2 = trial.suggest_int("node_mlp_2_power", 1, 8)  # size 2 - 256
    # The last feature size is the number of features that the graph predicts.
    cfg.node_features = (2**node_pow_1, 2**node_pow_2, 2)
    cfg.global_features = None

    # Per-trial working directory.
    # TODO: check if we actually care about referencing this in the future or
    # if we can just create a temp dir
    trial_dir = os.path.join(
        CHECKPOINT_PATH, study_name, f"trial_{trial.number}")

    # Run training; only the evaluation metrics are needed from the result.
    _, _, eval_metrics_dict, _ = train_and_evaluate_with_data(
        config=cfg,
        workdir=trial_dir,
        datasets=datasets,
        trial=trial,
    )

    # Retrieve and return the validation loss.
    val_metrics = eval_metrics_dict['val'].compute()
    return val_metrics['loss']

In [6]:
def get_data_config(edge_connection=None):
    """Build the configuration used to generate the dataset.

    Args:
        edge_connection: stored unchanged as `fully_connected_edges`
            (defaults to None).

    Returns:
        An ml_collections.ConfigDict holding the data-generation settings.
    """
    data_cfg = ml_collections.ConfigDict()

    # --- sample layout ---
    data_cfg.n_samples = 5000
    data_cfg.input_steps = 1
    # NOTE(review): an earlier comment on this line said "predict 24 hrs into
    # the future", which does not obviously match a delay of 0 — confirm.
    data_cfg.output_delay = 0
    data_cfg.output_steps = 6
    # Equivalent to 3 hours. A 3 hour timestep resolution would be
    # 5*24/3=40; with a time_resolution of 120, a sampling frequency of 3
    # achieves a 3 hour timestep.
    data_cfg.timestep_duration = 3

    # sample_buffer is the number of timesteps strictly between the end of one
    # full sample and the start of the next. It is negative here so that the
    # sample inputs are continuous (i.e. consecutive samples overlap a bit).
    steps_per_sample = (
        data_cfg.input_steps + data_cfg.output_delay + data_cfg.output_steps
    )
    data_cfg.sample_buffer = -1 * (steps_per_sample - 1)

    # Number of raw data points generated per time unit, equivalent to the
    # number of data points generated per 5 days in the simulation.
    data_cfg.time_resolution = 120
    data_cfg.init_buffer_samples = 0

    # --- train / val / test split ---
    data_cfg.train_pct = 0.7
    data_cfg.val_pct = 0.2
    data_cfg.test_pct = 0.1

    # --- simulation parameters ---
    # presumably the parameters of the simulated system — TODO confirm
    data_cfg.K = 36
    data_cfg.F = 8
    data_cfg.c = 10
    data_cfg.b = 10
    data_cfg.h = 1

    # --- misc ---
    data_cfg.seed = 65
    data_cfg.normalize = True
    data_cfg.fully_connected_edges = edge_connection

    return data_cfg

In [8]:
def prepare_study(study_name, edge_connection=None):
    """Generate the dataset and create (or reload) the Optuna study for it.

    Args:
        study_name: name of the Optuna study; also the name of the
            sub-directory of CHECKPOINT_PATH that holds the study's SQLite
            database.
        edge_connection: forwarded to `get_data_config` to configure the
            dataset's edges (defaults to None).

    Returns:
        A `(study, objective_partial)` tuple: the Optuna study, and the
        `objective` function with `study_name` and the generated `datasets`
        already bound, so it can be passed directly to `study.optimize`.
    """
    # generate dataset once and print the features of the first training graph
    dataset_config = get_data_config(edge_connection=edge_connection)
    datasets = create_dataset(dataset_config)
    print_graph_fts(datasets['train']['inputs'][0][0])

    # get the objective function that reuses the pre-generated datasets
    objective_partial = partial(objective, study_name=study_name,
                                datasets=datasets)

    # make sure the study directory exists; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test
    study_dir = os.path.join(CHECKPOINT_PATH, study_name)
    os.makedirs(study_dir, exist_ok=True)
    db_path = os.path.join(study_dir, "optuna_hparam_search.db")

    # create the study (it is not run here), or reload it if a study with this
    # name already exists in the database
    study = optuna.create_study(
        study_name=study_name,
        storage=f'sqlite:///{db_path}', # generates a new db if it doesn't exist
        direction='minimize',
        # NOTE(review): `objective` sets config.epochs = 50; if the step
        # reported to Optuna is the epoch index, a 50-step warm-up may leave
        # the median pruner with little or nothing to prune — confirm.
        pruner=optuna.pruners.MedianPruner(
            n_startup_trials=5,
            n_warmup_steps=50,
            ),
        load_if_exists=True,
    )

    return study, objective_partial

In [9]:
# Generate the dataset with edge_connection=3 and create (or reload) the
# "connected3_optimized" study; also get the objective bound to that dataset
connected3, objective_partial3 = prepare_study(study_name="connected3_optimized", edge_connection=3)

[I 2025-02-06 16:20:37,519] A new study created in RDB with name: connected3_optimized


Number of nodes: 36
Number of edges: 108
Node features shape: (36, 2)
Edge features shape: (108, 1)
Global features shape: (1, 1)


In [10]:
# Top the study up to 20 trials in total: trials already recorded in the study
# count toward the total, so re-running this cell only runs what is missing.
connected3.optimize(objective_partial3, 
                n_trials=20-len(connected3.trials), 
                n_jobs=1)

[I 2025-02-06 16:20:45,648] Trial 0 pruned. 
[I 2025-02-06 16:21:53,101] Trial 1 finished with value: 1.246018409729004 and parameters: {'learning_rate': 0.00045346796177033903, 'momentum': 0.8712873602503628, 'n_blocks': 1, 'edge_mlp_1_power': 2, 'edge_mlp_2_power': 1, 'node_mlp_1_power': 5, 'node_mlp_2_power': 6}. Best is trial 1 with value: 1.246018409729004.
[I 2025-02-06 16:23:18,120] Trial 2 finished with value: 1.2149707078933716 and parameters: {'learning_rate': 0.00045346796177033903, 'momentum': 0.8712873602503628, 'n_blocks': 1, 'edge_mlp_1_power': 3, 'edge_mlp_2_power': 2, 'node_mlp_1_power': 4, 'node_mlp_2_power': 7}. Best is trial 2 with value: 1.2149707078933716.
[I 2025-02-06 16:24:27,011] Trial 3 finished with value: 1.371066927909851 and parameters: {'learning_rate': 0.00045346796177033903, 'momentum': 0.8712873602503628, 'n_blocks': 1, 'edge_mlp_1_power': 1, 'edge_mlp_2_power': 2, 'node_mlp_1_power': 6, 'node_mlp_2_power': 5}. Best is trial 2 with value: 1.2149707078

In [12]:
# Show the edge/node feature sizes of the best trial in the connected3 study
current = get_best_trial_config(study=connected3)
print("edge features:", current.edge_features)
print("node features:", current.node_features)

edge features: (4, 4)
node features: (64, 2)


## 7 Edge Config

In [13]:
# Generate the dataset with edge_connection=7 and create (or reload) the
# "connected7_optimized" study; also get the objective bound to that dataset
connected7, objective_partial7 = prepare_study(study_name="connected7_optimized", edge_connection=7)

[I 2025-02-11 14:52:23,807] A new study created in RDB with name: connected7_optimized


Number of nodes: 36
Number of edges: 252
Node features shape: (36, 2)
Edge features shape: (252, 1)
Global features shape: (1, 1)


In [14]:
# Top the study up to 15 trials in total: trials already recorded in the study
# count toward the total, so re-running this cell only runs what is missing.
connected7.optimize(objective_partial7, 
                n_trials=15-len(connected7.trials), 
                n_jobs=1)

[I 2025-02-11 14:53:48,511] Trial 0 finished with value: 1.6976850032806396 and parameters: {'learning_rate': 0.00045346796177033903, 'momentum': 0.8712873602503628, 'n_blocks': 1, 'edge_mlp_1_power': 3, 'edge_mlp_2_power': 3, 'node_mlp_1_power': 1, 'node_mlp_2_power': 1}. Best is trial 0 with value: 1.6976850032806396.
[I 2025-02-11 14:55:16,488] Trial 1 finished with value: 3.0314218997955322 and parameters: {'learning_rate': 0.00045346796177033903, 'momentum': 0.8712873602503628, 'n_blocks': 1, 'edge_mlp_1_power': 2, 'edge_mlp_2_power': 3, 'node_mlp_1_power': 6, 'node_mlp_2_power': 1}. Best is trial 0 with value: 1.6976850032806396.
[I 2025-02-11 14:57:44,473] Trial 2 finished with value: 3.0314218997955322 and parameters: {'learning_rate': 0.00045346796177033903, 'momentum': 0.8712873602503628, 'n_blocks': 1, 'edge_mlp_1_power': 2, 'edge_mlp_2_power': 3, 'node_mlp_1_power': 8, 'node_mlp_2_power': 1}. Best is trial 0 with value: 1.6976850032806396.
[I 2025-02-11 14:57:47,807] Trial 

In [15]:
# Show the edge/node feature sizes of the best trial in the connected7 study
current = get_best_trial_config(study=connected7)
print("edge features:", current.edge_features)
print("node features:", current.node_features)

edge features: (4, 2)
node features: (16, 2)


## Solo Edge Config

In [16]:
# Generate the dataset with edge_connection=1 and create (or reload) the
# "connected1_optimized" study; also get the objective bound to that dataset
connected1, objective_partial1 = prepare_study(study_name="connected1_optimized", edge_connection=1)

[I 2025-02-11 15:15:05,525] A new study created in RDB with name: connected1_optimized


Number of nodes: 36
Number of edges: 36
Node features shape: (36, 2)
Edge features shape: (36, 1)
Global features shape: (1, 1)


In [17]:
# Top the study up to 15 trials in total: trials already recorded in the study
# count toward the total, so re-running this cell only runs what is missing.
connected1.optimize(objective_partial1, 
                n_trials=15-len(connected1.trials), 
                n_jobs=1)

[I 2025-02-11 15:16:00,425] Trial 0 finished with value: 1.603869915008545 and parameters: {'learning_rate': 0.00045346796177033903, 'momentum': 0.8712873602503628, 'n_blocks': 1, 'edge_mlp_1_power': 1, 'edge_mlp_2_power': 2, 'node_mlp_1_power': 5, 'node_mlp_2_power': 3}. Best is trial 0 with value: 1.603869915008545.
[I 2025-02-11 15:17:44,197] Trial 1 finished with value: 1.5971708297729492 and parameters: {'learning_rate': 0.00045346796177033903, 'momentum': 0.8712873602503628, 'n_blocks': 1, 'edge_mlp_1_power': 1, 'edge_mlp_2_power': 2, 'node_mlp_1_power': 3, 'node_mlp_2_power': 8}. Best is trial 1 with value: 1.5971708297729492.
[I 2025-02-11 15:18:36,540] Trial 2 finished with value: 1.6543980836868286 and parameters: {'learning_rate': 0.00045346796177033903, 'momentum': 0.8712873602503628, 'n_blocks': 1, 'edge_mlp_1_power': 2, 'edge_mlp_2_power': 1, 'node_mlp_1_power': 1, 'node_mlp_2_power': 1}. Best is trial 1 with value: 1.5971708297729492.
[I 2025-02-11 15:19:38,249] Trial 3 

In [18]:
# Show the edge/node feature sizes of the best trial in the connected1 study
current = get_best_trial_config(study=connected1)
print("edge features:", current.edge_features)
print("node features:", current.node_features)

edge features: (8, 2)
node features: (16, 2)
