In [1]:
import argparse
import logging
import os

import numpy as np
import torch
import torch.optim as optim
from tqdm import trange

import utils
import model.net as net
from model.data_loader import DataLoader


data_dir='data/small'
model_dir='experiments/base_model1'

In [5]:
def train(model, optimizer, loss_fn, data_iterator, metrics, params, num_steps):
    """Train the model on `num_steps` batches

    Args:
        model: (torch.nn.Module) the neural network
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        data_iterator: (generator) a generator that generates batches of data and labels
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        num_steps: (int) number of batches to train on, each of size params.batch_size
    """

    # set model to training mode
    model.train()

    # summary for current training loop and a running average object for loss
    summ = []
    loss_avg = utils.RunningAverage()
    
    # Use tqdm for progress bar
    t = trange(num_steps) 
    for i in t:
        # fetch the next training batch
        train_batch, labels_batch = next(data_iterator)
        print(train_batch)
        print(labels_batch)

        # compute model output and loss
        output_batch = model(train_batch)
        loss = loss_fn(output_batch, labels_batch)

        # clear previous gradients, compute gradients of all variables wrt loss
        optimizer.zero_grad()
        loss.backward()

        # performs updates using calculated gradients
        optimizer.step()

        # Evaluate summaries only once in a while
        if i % params.save_summary_steps == 0:
            # extract data from torch Variable, move to cpu, convert to numpy arrays
            output_batch = output_batch.data.cpu().numpy()
            labels_batch = labels_batch.data.cpu().numpy()

            # compute all metrics on this batch
            summary_batch = {metric:metrics[metric](output_batch, labels_batch)
                             for metric in metrics}
            summary_batch['loss'] = loss.data.item()
            summ.append(summary_batch)

        # update the average loss
        loss_avg.update(loss.data.item())
        t.set_postfix(loss='{:05.3f}'.format(loss_avg()))

    # compute mean of all metrics in summary
    metrics_mean = {metric:np.mean([x[metric] for x in summ]) for metric in summ[0]} 
    metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) for k, v in metrics_mean.items())
    

def evaluate(model, loss_fn, data_iterator, metrics, params, num_steps):
    """Evaluate the model on `num_steps` batches.

    Args:
        model: (torch.nn.Module) the neural network
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        data_iterator: (generator) a generator that generates batches of data and labels
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        num_steps: (int) number of batches to train on, each of size params.batch_size
    """

    # set model to evaluation mode
    model.eval()

    # summary for current eval loop
    summ = []

    # compute metrics over the dataset
    for _ in range(num_steps):
        # fetch the next evaluation batch
        data_batch, labels_batch = next(data_iterator)
        
        # compute model output
        output_batch = model(data_batch)
        loss = loss_fn(output_batch, labels_batch)

        # extract data from torch Variable, move to cpu, convert to numpy arrays
        output_batch = output_batch.data.cpu().numpy()
        labels_batch = labels_batch.data.cpu().numpy()

        # compute all metrics on this batch
        summary_batch = {metric: metrics[metric](output_batch, labels_batch)
                         for metric in metrics}
        summary_batch['loss'] = loss.data.item()
        summ.append(summary_batch)

    # compute mean of all metrics in summary
    metrics_mean = {metric:np.mean([x[metric] for x in summ]) for metric in summ[0]} 
    metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) for k, v in metrics_mean.items())
    return metrics_mean

def train_and_evaluate(model, train_data, val_data, optimizer, loss_fn, metrics, params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        train_data: (dict) training data with keys 'data' and 'labels'
        val_data: (dict) validaion data with keys 'data' and 'labels'
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(args.model_dir, args.restore_file + '.pth.tar')
        utils.load_checkpoint(restore_path, model, optimizer)
        
    best_val_acc = 0.0

    for epoch in range(params.num_epochs):
        # Run one epoch
        # compute number of batches in one epoch (one full pass over the training set)
        num_steps = (params.train_size + 1) // params.batch_size
        train_data_iterator = data_loader.data_iterator(train_data, params, shuffle=True)
        train(model, optimizer, loss_fn, train_data_iterator, metrics, params, num_steps)
            
        # Evaluate for one epoch on validation set
        num_steps = (params.val_size + 1) // params.batch_size
        val_data_iterator = data_loader.data_iterator(val_data, params, shuffle=False)
        val_metrics = evaluate(model, loss_fn, val_data_iterator, metrics, params, num_steps)
        
        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict' : optimizer.state_dict()}, 
                               is_best=is_best,
                               checkpoint=model_dir)
            
        # If best_eval, best_save_path        
        if is_best:
            best_val_acc = val_acc
            
            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)

In [6]:
restore_file=None

# Load the parameters from json file
json_path = os.path.join(model_dir, 'params.json')
assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)
params = utils.Params(json_path)

# use GPU if available
params.cuda = torch.cuda.is_available()
    
# Set the random seed for reproducible experiments
torch.manual_seed(230)
if params.cuda: torch.cuda.manual_seed(230)
        
# Set the logger
utils.set_logger(os.path.join(model_dir, 'train.log'))

# Create the input data pipeline
logging.info("Loading the datasets...")
    
# load data
data_loader = DataLoader(data_dir, params)
data = data_loader.load_data(['train', 'val'], data_dir)
train_data = data['train']
val_data = data['val']

# specify the train and val dataset sizes
params.train_size = train_data['size']
params.val_size = val_data['size']

logging.info("- done.")

# Define the model and optimizer
model = net.Net(params).cuda() if params.cuda else net.Net(params)
optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)
    
# fetch loss function and metrics
loss_fn = net.loss_fn
metrics = net.metrics

# Train the model
logging.info("Starting training for {} epoch(s)".format(params.num_epochs))
train_and_evaluate(model, train_data, val_data, optimizer, loss_fn, metrics, params, model_dir,
                    restore_file)

Loading the datasets...
- done.
Starting training for 10 epoch(s)
100%|██████████| 2/2 [00:00<00:00, 26.33it/s, loss=2.117]
100%|██████████| 2/2 [00:00<00:00, 25.24it/s, loss=2.083]
  0%|          | 0/2 [00:00<?, ?it/s]

tensor([[110, 115, 116, 117, 118,   1,   9, 114, 119,  53, 120, 121, 122, 123,
          21, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366,
         366, 366],
        [ 22,   1,  23,  24,  11,   9,  25,  26,   9,  27,  28,  29,  30,  31,
          32,  33,  34,  35,  36,  37,  38,  39,  35,  13,  35,  40,   9,  41,
          21,  35],
        [124, 125, 126, 127, 128,   7, 129, 130,   7, 131, 132, 118,   1,   9,
         123, 107,  93, 133, 134, 135, 136, 137, 138, 139,  21, 366, 366, 366,
         366, 366],
        [ 61,   6,  85,  86,  87,   1,  88,  89,  90,  11,  91,  92,  93,  94,
          95,  93,  96,  93,  13,  97,  21, 366, 366, 366, 366, 366, 366, 366,
         366, 366],
        [ 61,  98,  99, 100, 101,  78,   7, 102, 103, 104,   1, 105,  11, 106,
         107,  63, 108,   7, 109,   7, 110,  68, 111,   1, 112, 113, 114,  21,
         366, 366]], device='cuda:0')
tensor([[ 2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0, -1, -1, -1,
         -1

100%|██████████| 2/2 [00:00<00:00, 28.79it/s, loss=2.050]
  0%|          | 0/2 [00:00<?, ?it/s, loss=2.024]

tensor([[ 2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  7,  0,  0,
          0,  5,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1],
        [ 0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  1,  0,
          0,  1,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1],
        [ 0,  5,  6,  6,  6,  0,  0,  0,  0,  0,  0,  0,  0,  1,  7,  0,  0,  0,
          0,  0,  2,  0,  0,  0,  0,  0,  0,  0, -1, -1]], device='cuda:0')
tensor([[ 49,  50,   9,  51,   1,  52,  53,  54,  55,  56,  57,  58,  59,  60,
          21, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366],
        [ 61,  77,  78,  79,  80,  67,  68,  81,  11,   9,  12,  25,  13,   9,
          82,  83,   1,  84,  16,  17,  11,  19,  20,  21, 366],

100%|██████████| 2/2 [00:00<00:00, 28.20it/s, loss=2.016]
100%|██████████| 2/2 [00:00<00:00, 28.21it/s, loss=1.981]


Checkpoint Directory exists! 
tensor([[110, 115, 116, 117, 118,   1,   9, 114, 119,  53, 120, 121, 122, 123,
          21, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366,
         366, 366],
        [ 22,   1,  23,  24,  11,   9,  25,  26,   9,  27,  28,  29,  30,  31,
          32,  33,  34,  35,  36,  37,  38,  39,  35,  13,  35,  40,   9,  41,
          21,  35],
        [124, 125, 126, 127, 128,   7, 129, 130,   7, 131, 132, 118,   1,   9,
         123, 107,  93, 133, 134, 135, 136, 137, 138, 139,  21, 366, 366, 366,
         366, 366],
        [ 61,   6,  85,  86,  87,   1,  88,  89,  90,  11,  91,  92,  93,  94,
          95,  93,  96,  93,  13,  97,  21, 366, 366, 366, 366, 366, 366, 366,
         366, 366],
        [ 61,  98,  99, 100, 101,  78,   7, 102, 103, 104,   1, 105,  11, 106,
         107,  63, 108,   7, 109,   7, 110,  68, 111,   1, 112, 113, 114,  21,
         366, 366]], device='cuda:0')
tensor([[ 2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  

100%|██████████| 2/2 [00:00<00:00, 28.42it/s, loss=1.943]
100%|██████████| 2/2 [00:00<00:00, 28.84it/s, loss=1.901]


tensor([[110, 115, 116, 117, 118,   1,   9, 114, 119,  53, 120, 121, 122, 123,
          21, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366,
         366, 366],
        [ 22,   1,  23,  24,  11,   9,  25,  26,   9,  27,  28,  29,  30,  31,
          32,  33,  34,  35,  36,  37,  38,  39,  35,  13,  35,  40,   9,  41,
          21,  35],
        [124, 125, 126, 127, 128,   7, 129, 130,   7, 131, 132, 118,   1,   9,
         123, 107,  93, 133, 134, 135, 136, 137, 138, 139,  21, 366, 366, 366,
         366, 366],
        [ 61,   6,  85,  86,  87,   1,  88,  89,  90,  11,  91,  92,  93,  94,
          95,  93,  96,  93,  13,  97,  21, 366, 366, 366, 366, 366, 366, 366,
         366, 366],
        [ 61,  98,  99, 100, 101,  78,   7, 102, 103, 104,   1, 105,  11, 106,
         107,  63, 108,   7, 109,   7, 110,  68, 111,   1, 112, 113, 114,  21,
         366, 366]], device='cuda:0')
tensor([[ 2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0, -1, -1, -1,
         -1

100%|██████████| 2/2 [00:00<00:00, 29.11it/s, loss=1.855]
100%|██████████| 2/2 [00:00<00:00, 29.10it/s, loss=1.803]
  0%|          | 0/2 [00:00<?, ?it/s]

tensor([[110, 115, 116, 117, 118,   1,   9, 114, 119,  53, 120, 121, 122, 123,
          21, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366,
         366, 366],
        [ 22,   1,  23,  24,  11,   9,  25,  26,   9,  27,  28,  29,  30,  31,
          32,  33,  34,  35,  36,  37,  38,  39,  35,  13,  35,  40,   9,  41,
          21,  35],
        [124, 125, 126, 127, 128,   7, 129, 130,   7, 131, 132, 118,   1,   9,
         123, 107,  93, 133, 134, 135, 136, 137, 138, 139,  21, 366, 366, 366,
         366, 366],
        [ 61,   6,  85,  86,  87,   1,  88,  89,  90,  11,  91,  92,  93,  94,
          95,  93,  96,  93,  13,  97,  21, 366, 366, 366, 366, 366, 366, 366,
         366, 366],
        [ 61,  98,  99, 100, 101,  78,   7, 102, 103, 104,   1, 105,  11, 106,
         107,  63, 108,   7, 109,   7, 110,  68, 111,   1, 112, 113, 114,  21,
         366, 366]], device='cuda:0')
tensor([[ 2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0, -1, -1, -1,
         -1

100%|██████████| 2/2 [00:00<00:00, 29.43it/s, loss=1.744]


tensor([[ 49,  50,   9,  51,   1,  52,  53,  54,  55,  56,  57,  58,  59,  60,
          21, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366],
        [ 61,  77,  78,  79,  80,  67,  68,  81,  11,   9,  12,  25,  13,   9,
          82,  83,   1,  84,  16,  17,  11,  19,  20,  21, 366],
        [  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
          14,   9,  15,   1,  16,  17,  18,  19,  20,  21, 366],
        [ 61,   8,  62,  63,   9,  64,   1,   9,  65,  66,   1,  67,  68,  69,
          70,  71,  11,   9,  72,  73,  74,  75,   1,  76,  21],
        [ 42,   4,  18,   9,  43,   1,  44,   7,  45,  46,  11,  47,  48,  21,
         366, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366]],
       device='cuda:0')
tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1],
        [ 0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,
          2,  0,  0,  0,  0,  0, -1],
        [ 0,  0, 

In [4]:
train_data

{'data': [[0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   9,
   15,
   1,
   16,
   17,
   18,
   19,
   20,
   21],
  [22,
   1,
   23,
   24,
   11,
   9,
   25,
   26,
   9,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   35,
   13,
   35,
   40,
   9,
   41,
   21,
   35],
  [42, 4, 18, 9, 43, 1, 44, 7, 45, 46, 11, 47, 48, 21],
  [49, 50, 9, 51, 1, 52, 53, 54, 55, 56, 57, 58, 59, 60, 21],
  [61,
   8,
   62,
   63,
   9,
   64,
   1,
   9,
   65,
   66,
   1,
   67,
   68,
   69,
   70,
   71,
   11,
   9,
   72,
   73,
   74,
   75,
   1,
   76,
   21],
  [61,
   77,
   78,
   79,
   80,
   67,
   68,
   81,
   11,
   9,
   12,
   25,
   13,
   9,
   82,
   83,
   1,
   84,
   16,
   17,
   11,
   19,
   20,
   21],
  [61,
   6,
   85,
   86,
   87,
   1,
   88,
   89,
   90,
   11,
   91,
   92,
   93,
   94,
   95,
   93,
   96,
   93,
   13,
   97,
   21],
  [61,
   98,
   99,
   100,


# Evaluation

In [5]:
"""
    Evaluate the model on the test set.
"""

restore_file='best'

# Load the parameters
json_path = os.path.join(model_dir, 'params.json')
assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)
params = utils.Params(json_path)

# use GPU if available
params.cuda = torch.cuda.is_available()     # use GPU is available

# Set the random seed for reproducible experiments
torch.manual_seed(230)
if params.cuda: torch.cuda.manual_seed(230)

# load data
data_loader = DataLoader(data_dir, params)
data = data_loader.load_data(['test'], data_dir)
test_data = data['test']

# specify the test set size
params.test_size = test_data['size']
test_data_iterator = data_loader.data_iterator(test_data, params)

# Define the model
model = net.Net(params).cuda() if params.cuda else net.Net(params)
    
loss_fn = net.loss_fn
metrics = net.metrics

# Reload weights from the saved file
utils.load_checkpoint(os.path.join(model_dir, restore_file + '.pth.tar'), model)

# Evaluate
num_steps = (params.test_size + 1) // params.batch_size
test_metrics = evaluate(model, loss_fn, test_data_iterator, metrics, params, num_steps)
save_path = os.path.join(model_dir, "metrics_test_{}.json".format(restore_file))
utils.save_dict_to_json(test_metrics, save_path)
print(test_metrics)

{'accuracy': 0.7427669902912621, 'loss': 1.8864148259162903}
