In [1]:
import json
import os
import neptune

In [2]:
from networks.dataset import get_loader, calculate_hash
from networks.model import MyModel
from utils.neptune import NeptuneMonitor

# Training configuration

Load the configuration from a JSON file. Alternatively, you can define the configuration inline in the cell below.

In [6]:
# config = {
#     "neptune": {
#         "project_name": "inn/Hippocampus"
#     },
#     "setup": {
#         "dataset_dir": "/home/paperspace/datasets/processed/hippocampus_coronal_mindboggle_176x256_inn",
#         "checkpoint_dir": "output/models",
#         "struct": "hippocampus",
#         "epochs": 100,
#         "batch_size": 16,
#         "train_ds_limit": None,
#         "valid_ds_limit": None,
#         "input_shape": [
#             256,
#             176
#         ],
#         "augment": True,
#         "seed": 5
#     },
#     "model": {
#             "arch": "Unet",
#             "filters": 16,
#             "loss_fn": "boundary_gdl",
#             "optimizer_fn": "RAdam",
#             "checkpoint": "hippocampus_coronal_boundary_gdl_unet_aug_radam"
#     },
#     "tags": ['hippocampus', 'coronal']
# }

In [7]:
# Read the run configuration from 'neptune.config.json' (path is relative to the
# kernel's current working directory). The encoding is passed explicitly so the
# file is decoded as UTF-8 instead of the platform's default locale encoding.
with open('neptune.config.json', 'r', encoding='utf-8') as cfg_file:
    config = json.load(cfg_file)

In [8]:
# Split the loaded configuration into the sections the rest of the notebook reads.
# 'setup' and 'model' are required: check for both up front so a missing section
# is reported with a descriptive message instead of a bare KeyError on one key.
missing_sections = [name for name in ('setup', 'model') if name not in config]
if missing_sections:
    raise KeyError(
        f"configuration is missing required section(s): {', '.join(missing_sections)}")

setup = config['setup']
model = config['model']
# 'tags' is optional: a missing key, null, or an empty list all mean "no tags".
tags = config.get('tags') or []
dataset_dir = setup['dataset_dir']

KeyError: 'model'

# Dataset configuration

#### Get training dataset generator

In [5]:
# Loader for the 'train' split of dataset_dir. Shuffling is always on; the
# augmentation flag and the limit value are taken from the 'setup' section.
# NOTE(review): `limit` presumably caps the number of samples — confirm in networks.dataset.
train_loader = get_loader(
    dataset_dir,
    'train',
    augment=setup['augment'],
    shuffle=True,
    limit=setup['train_ds_limit'],
)

ValueError: not enough values to unpack (expected 2, got 0)

#### Get validation dataset generator

In [None]:
# Loader for the 'valid' split of dataset_dir. Only the limit is passed, so
# augmentation and shuffling stay at get_loader's defaults.
valid_loader = get_loader(
    dataset_dir,
    'valid',
    limit=setup['valid_ds_limit'],
)

#### Calculate training dataset md5 hash

In [None]:
# Hash the 'train' split so the data version can be logged with the experiment
# (it is stored in the hyperparameters as 'train_data_version').
train_hash = calculate_hash(dataset_dir, 'train', verbose=1)

#### Calculate validation dataset md5 hash

In [None]:
# Hash the 'valid' split so the data version can be logged with the experiment
# (it is stored in the hyperparameters as 'valid_data_version').
valid_hash = calculate_hash(dataset_dir, 'valid', verbose=1)

# Neptune.ai configuration

#### Initialize project

In [None]:
# Select the Neptune project named in the configuration ('neptune' -> 'project_name').
# NOTE(review): no API token is passed here, so it presumably comes from the
# environment (e.g. NEPTUNE_API_TOKEN) — confirm before running on a new machine.
neptune.init(config['neptune']['project_name'])

# Neptune.ai experiment

#### Grab model hyperparameters
You can save any parameter you like; just add it to the dictionary.

In [None]:
# Hyperparameters and dataset versions recorded with the experiment.
# Add further keyword entries here for anything else worth tracking.
# NOTE(review): 'seed' is only recorded here; no visible cell applies it to an
# RNG — confirm it is consumed inside the project modules.
params = dict(
    arch=model['arch'],
    batch_size=setup['batch_size'],
    filters=model['filters'],
    loss_fn=model['loss_fn'],
    optimizer_fn=model['optimizer_fn'],
    data_augment=setup['augment'],
    seed=setup['seed'],
    train_data_version=train_hash,
    valid_data_version=valid_hash,
)

#### Create experiment

In [None]:
# Start a new experiment named after the configured structure ('struct') and
# attach the hyperparameters collected in `params`.
experiment = neptune.create_experiment(name=setup['struct'], params=params)

#### Add experiment tags

In [None]:
# Attach the tags from the configuration to the experiment.
# `tags` is expected to be a list of strings; it is an empty list when the
# configuration provides no tags.
experiment.append_tag(tags)

# Training loop
This is where we integrate neptune.ai with our codebase.

In [None]:
# Create the project's model wrapper, pointing it at the configured checkpoint directory.
# NOTE(review): presumably where checkpoints are written and later read back — confirm in networks.model.
my_model = MyModel(checkpoint_dir=setup['checkpoint_dir'])

In [None]:
# Hand the train/validation loaders and the checkpoint name from the 'model'
# section over to the model wrapper.
my_model.setup_model(train_generator=train_loader,
                     valid_generator=valid_loader,
                     checkpoint=model['checkpoint'])

In [None]:
# Build the network from the configured architecture, optimizer, loss and
# filter count. The JSON config stores input_shape as a list, so it is
# converted to a tuple before being passed on.
my_model.create_model(
    arch=model['arch'],
    optimizer_fn=model['optimizer_fn'],
    loss_fn=model['loss_fn'],
    n_filters=model['filters'],
    input_shape=tuple(setup['input_shape']),
    verbose=1,
)

#### Neptune Callback for training
We are using TensorFlow 2.0 with Keras helpers, so we can use callbacks to save the monitored metrics during training. The metrics from each epoch are saved as a chart log.

In [None]:
# Train for the configured number of epochs. The NeptuneMonitor callback is
# created in training mode (evaluation=False) and bound to the open experiment.
my_model.start_train(
    epochs=setup['epochs'],
    custom_callbacks=[
        NeptuneMonitor(experiment=experiment, evaluation=False),
    ],
)

In [None]:
# Load the model back after training, before running the evaluation.
# NOTE(review): presumably restores the best checkpoint saved during training — confirm in networks.model.
my_model.load_model(verbose=1)

#### Neptune Callback for evaluation
Similarly to training, we use a callback to save metrics. This time it is for model evaluation, so only the final metrics are saved, without an epoch-by-epoch chart log.

In [None]:
# Evaluate the model. The NeptuneMonitor callback is created in evaluation
# mode (evaluation=True) and bound to the same experiment.
my_model.start_evaluate(
    custom_callbacks=[
        NeptuneMonitor(experiment=experiment, evaluation=True),
    ],
)

#### Upload saved model
After training, we upload the best saved model checkpoint, as specified in the configuration.

In [None]:
# Upload the checkpoint file: '<checkpoint_dir>/<checkpoint name>.h5', both
# parts taken from the configuration.
checkpoint_path = os.path.join(setup['checkpoint_dir'], f"{model['checkpoint']}.h5")
experiment.log_artifact(checkpoint_path)

#### Stop experiment
It is very important to stop the experiment after execution, both to free unused resources and to avoid leaving it in an unfinished status.

In [None]:
# Close the experiment once everything has been logged.
# NOTE(review): if an earlier cell fails, this cell is never reached and the
# experiment stays open — run it manually in that case.
experiment.stop()