In [3]:
import matplotlib.pyplot as plt
from typing import List, Dict, Tuple, Any
from pathlib import Path
from tqdm import tqdm as tqdm
from ray import tune
from ray.air import Checkpoint, session
from ray.tune.schedulers import ASHAScheduler
from numpy.typing import NDArray
from lib import *
import torch, os, pickle, time, json

In [2]:
# Make cuda:0 the default torch device for everything created below.
torch.set_default_device("cuda:0")

# Read the pickled (input, output) constraint pair, then rebuild each one as a
# fresh InputConstraints / OutputConstraints from its stored means and stds.
with open("constraints.p", "rb") as f:
    input_constraints, output_constraints = pickle.load(f)
input_constraints = InputConstraints(
    input_constraints.means,
    input_constraints.stds,
)
output_constraints = OutputConstraints(
    output_constraints.means,
    output_constraints.stds,
)

# Train and test sets, both loaded with in_memory=True.
train_dataset = SimData("./train.bin", in_memory=True)
test_dataset = SimData("./test.bin", in_memory=True)

In [5]:
###################
# Train/Test Loop #
###################

def train_test_loop(
        epochs = 500, k = 5, batch_size = 4096,
        num_layers = 2, num_neurons = 256, learning_rate = 6e-3,
        verbose = False, filename = None, save_files = True, show_plots = False):
    n = epochs // k
    # 'filename' : to load a pre-trained model
    # 'k' : for validation loss plots


    config_dict = {
        'epochs' : epochs,
        'k' : k,
        'batch_size' : batch_size,
        'num_layers' : num_layers,
        'num_neurons' : num_neurons,
        'learning_rate' : learning_rate,
        'verbose' : verbose,
        'filename' : filename,
        'save_files' : save_files,
        'show_plots' : show_plots
    }

    # timestamp at start of training
    timestr = time.strftime("%d-%m-%Y-%H:%M:%S")
    output_directory = Path("./") / f"training-{timestr}"
    if save_files:
        os.mkdir(output_directory)
        os.mkdir(output_directory / "train")
        os.mkdir(output_directory / "test")

    # construct or load the model
    if filename is None:
        model = VehicleModel(
            input_constraints,
            output_constraints,
            num_neurons,
            num_layers,
            verbose=verbose
        )
    else:
        model = torch.load(filename)

    # initialise the optimiser
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr = learning_rate    
    )

    # train the model
    training_loss = []
    validation_loss = []

    model.train()

    for i in tqdm(range(n), "Training", disable=False):
        training_loss += model.train_loop(
            train_dataset,
            batch_size,
            k,
            optimizer
        )
        vl = model.test_loop(
            test_dataset,
            batch_size
        )
        validation_loss += [vl]

    print(f"Final Validation Loss: {validation_loss[-1]}")
    print(f"Final Training Loss: {training_loss[-1]}")
    plt.plot(
        list(range(1, epochs + 1, 1)),
        training_loss,
        color="blue"
    )
    plt.plot(
        list(range(k, epochs + 1, k)),
        validation_loss,
        color="red"
    )
    plt.yscale("log")
    if save_files:
        plt.savefig(output_directory / "loss.png")
        torch.save(model, output_directory / "model.pt")
        with open(output_directory / "meta.json", "w") as f: json.dump(config_dict, f, indent=1)
    if show_plots == True:
        plt.show()
    model.plot_predictions(
        train_dataset,
        output_directory / "train" if save_files else None,
        show = show_plots
    )
    model.plot_predictions(
        test_dataset,
        output_directory / "test" if save_files else None,
        show = show_plots
    )

In [6]:
# Single long run: 5000 epochs with k = 5, two layers of 256 neurons.
# Outputs are written to disk (save_files=True) without showing plots.
train_test_loop(
    epochs=5000, k=5, batch_size=4096,
    num_layers=2, num_neurons=256, learning_rate=6e-3,
    verbose=False, save_files=True, show_plots=False,
)

Training:   0%|          | 1/1000 [00:25<7:12:21, 25.97s/it]

In [3]:
########################
# Parameter Experiment #
########################

def train(config):
    torch.set_default_device("cuda:0")
    train_dataset = SimData("/nfs/vehicle-model/train.bin", in_memory=True)
    test_dataset = SimData("/nfs/vehicle-model/test.bin", in_memory=True)

    epochs = 1000
    model = VehicleModel(
        input_constraints,
        output_constraints,
        config["num_neurons"],
        config["num_layers"],
        False
    )
    model.cuda()

    optimizer = torch.optim.Adam(
        model.parameters(),
        lr = config["lr"]
    )

    model.train()

    train_err = model.train_loop(
        train_dataset,
        config["batch_size"],
        epochs,
        optimizer
    )

    test_err = model.test_loop(
        test_dataset,
        config["batch_size"]
    )

    session.report({
        "final_test_rmse" : test_err,
        "final_train_rmse" : train_err[-1],
        "train_rmse" : train_err
    })

# Hyper-parameter search space sampled by Ray Tune for each trial.
config = {
    "num_layers" : tune.choice([1, 2]),
    # Powers of two from 16 to 4096, passed as a concrete list rather than a
    # one-shot generator so the categories can be inspected and re-used.
    "num_neurons" : tune.choice([2**i for i in range(4, 13)]),
    "lr" : tune.loguniform(1e-5, 1e-1),
    "batch_size" : tune.choice([4096])
}

# NOTE(review): trials are ranked on the *training* error; confirm that
# "final_test_rmse" is not the metric that was actually intended.
scheduler = ASHAScheduler(
    metric="final_train_rmse",
    mode="min",
)

result = tune.run(
    train,
    config=config,
    scheduler=scheduler,
    num_samples=20,
    resources_per_trial={'cpu' : 1, 'gpu' : 1}
)

# Persist the results table; the context manager guarantees the file is
# flushed and closed (the bare open(...) passed to pickle.dump never was).
results_path = f"parameter-experiment-{time.strftime('%d-%m-%Y-%H:%M:%S')}.p"
with open(results_path, "wb") as results_file:
    pickle.dump(result.dataframe(), results_file)

2023-08-15 07:21:34,350	INFO worker.py:1621 -- Started a local Ray instance.
2023-08-15 07:21:35,037	INFO tune.py:226 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2023-08-15 07:21:35,039	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2023-08-15 07:24:41
Running for:,00:03:06.32
Memory:,97.3/503.3 GiB

Trial name,status,loc,batch_size,lr,num_layers,num_neurons
train_5daa1_00000,RUNNING,10.129.0.171:422,4096,2.8464e-05,1,4096
train_5daa1_00001,PENDING,,4096,0.0015949,2,128
train_5daa1_00002,PENDING,,4096,0.000810846,1,2048
train_5daa1_00003,PENDING,,4096,0.000248641,2,2048
train_5daa1_00004,PENDING,,4096,0.00120552,2,128
train_5daa1_00005,PENDING,,4096,1.46102e-05,1,512
train_5daa1_00006,PENDING,,4096,0.00014993,1,512
train_5daa1_00007,PENDING,,4096,9.74832e-05,1,512
train_5daa1_00008,PENDING,,4096,0.0118121,2,128
train_5daa1_00009,PENDING,,4096,9.31523e-05,1,4096


In [None]:
# Settings shared by every model trained in this cell.
epochs = 1000
k = 5
batch_size = 4096
verbose = False
save_files = True
filename = None
show_plots = False

shared_params = {
    'epochs' : epochs,
    'k' : k,
    'batch_size' : batch_size,
    'verbose' : verbose,
    'filename' : filename,
    'save_files' : save_files,
    # Previously defined but never passed on; False matches the default of
    # train_test_loop, so behaviour is unchanged.
    'show_plots' : show_plots,
}

# (num_layers, num_neurons, learning_rate) of each model to train.
architectures = [
    (3, 128, 1.2e-3),
    (3, 256, 1.5e-3),
    (3, 256, 2.9e-3),
    (1, 512, 1e-2),
]

# One keyword-argument dict per model: shared settings plus its own overrides.
models = [
    {
        **shared_params,
        'num_layers' : num_layers,
        'num_neurons' : num_neurons,
        'learning_rate' : learning_rate,
    }
    for num_layers, num_neurons, learning_rate in architectures
]

for model_params in models:
    train_test_loop(**model_params)

In [None]:
# Display the results table of the tune.run(...) call stored in `result`.
result.dataframe()