In [None]:
import copy
import time
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim
import torchvision.transforms as transforms

from utils import dataset
from models import nvidia
from models import transformer

# Tqdm progress bar
from tqdm import tqdm_notebook, tqdm

# Hyperparameters shared by the training cells of this notebook.
# Step size handed to the optimizer constructed in main().
LEARNING_RATE = 1e-8
# Only forwarded to plots(); the Adam optimizer built in main() is not given a momentum argument.
MOMENTUM = 0.4
# Passed as `weight_decay` to the optimizer constructed in main().
WEIGHT_DECAY_REGULARIZATION_TERM = 1e-6
# Forwarded to dataset.load_nvidia_dataset(batch_size=...) in main().
BATCH_SIZE = 8
# Number of train/validate epochs run by both main() and evaluate().
NUM_EPOCHS = 10


# Citation:
# - AverageMeter taken verbatim from the Assignment 2 training code.
# - Remainder of code in this file based on Assignment 2 training code.

class AverageMeter:
    """Tracks the most recent value of a metric together with its weighted running mean."""

    def __init__(self):
        # reset() is the single place that defines every attribute.
        self.reset()

    def reset(self):
        """Return the meter to its initial, empty state."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as having been observed ``n`` times and refresh the mean."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count

def train(epoch, data_loader, model, optimizer, criterion, scaler=None):
    """Run one training epoch and return the sample-weighted mean batch loss.

    Args:
        epoch: Epoch index; only used to label the progress bar.
        data_loader: Iterable yielding ``(data, target)`` batches.
        model: ``nn.Module`` to optimise; its output is reshaped to ``target.shape``.
        optimizer: Optimizer that steps the model parameters.
        criterion: Loss callable invoked as ``criterion(out, target)``.
        scaler: Optional gradient scaler exposing ``scale``/``step``/``update``
            (e.g. ``torch.cuda.amp.GradScaler``). When ``None`` a plain
            backward pass and ``optimizer.step()`` are used instead.

    Returns:
        ``losses.avg``: mean of the per-batch criterion values weighted by
        ``out.shape[0]``. It is a detached tensor once at least one batch has
        been processed and the integer ``0`` for an empty loader.
    """
    losses = AverageMeter()
    # Query CUDA once per epoch instead of once per batch.
    use_cuda = torch.cuda.is_available()

    # Make sure the module is in training mode before any forward pass.
    model.train()

    # Get the progress bar for later modification
    progress_bar = tqdm_notebook(data_loader, ascii=True)

    for idx, (data, target) in enumerate(progress_bar):
        if use_cuda:
            data = data.cuda()
            target = target.cuda()

        # Forward pass and loss under automatic mixed precision. Autocast is
        # only enabled when CUDA is present, which avoids a warning on CPU hosts.
        with torch.autocast("cuda", enabled=use_cuda):
            out = model(data).reshape(target.shape)
            loss = criterion(out, target)

        # Clear gradients by setting them to None, then back-propagate and step.
        model.zero_grad(set_to_none=True)
        if scaler is None:
            loss.backward()
            optimizer.step()
        else:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        # Detach before accumulating so the running sum does not carry
        # autograd history from one iteration to the next.
        losses.update(loss.detach(), out.shape[0])

        progress_bar.set_description_str(f"Epoch {epoch}, Batch: {idx+1}, Loss: {loss.item():.4f}")

    return losses.avg

def validate(epoch, validation_loader, model, criterion):
    """Evaluate ``model`` on the validation data without updating it.

    Each batch loss is ``sqrt(criterion(out, target))``, so with a squared-error
    criterion the reported value is a per-batch RMSE. The result is the mean of
    those per-batch values weighted by batch size.

    Args:
        epoch: Accepted for signature parity with ``train``; not used here.
        validation_loader: Iterable yielding ``(data, target)`` batches.
        model: ``nn.Module`` to evaluate. It is switched to eval mode for the
            duration of the call and its previous mode is restored afterwards.
        criterion: Loss callable invoked as ``criterion(out, target)``.

    Returns:
        ``losses.avg``: a tensor once at least one batch has been processed,
        the integer ``0`` for an empty loader.
    """
    losses = AverageMeter()
    use_cuda = torch.cuda.is_available()

    # Get the progress bar for later modification
    progress_bar = tqdm_notebook(validation_loader, ascii=True)

    # Evaluate in eval mode, but hand the model back in whatever mode it had
    # so that the caller's training loop is unaffected.
    was_training = model.training
    model.eval()
    try:
        for idx, (data, target) in enumerate(progress_bar):
            if use_cuda:
                data = data.cuda()
                target = target.cuda()

            with torch.no_grad():
                out = model(data)
                batch_size = out.shape[0]
                # Match the target's shape (as train() does) so the criterion
                # cannot silently broadcast mismatched shapes.
                out = out.reshape(target.shape)
                loss = torch.sqrt(criterion(out, target))

            losses.update(loss, batch_size)

            progress_bar.set_description_str(f"Batch: {idx+1}, Loss: {loss.item():.4f}")
    finally:
        model.train(was_training)

    print("* Average Loss @1: {loss.avg:.4f}".format(loss=losses))
    return losses.avg

def test(testing_loader, model, criterion):
    """Evaluate ``model`` on the test data and return the square root of the mean loss.

    The per-batch criterion values are averaged (weighted by batch size), the
    average is printed, and its square root is returned. With a squared-error
    criterion that return value is the RMSE.

    Args:
        testing_loader: Iterable yielding ``(data, target)`` batches.
        model: ``nn.Module`` to evaluate. It is switched to eval mode for the
            duration of the call and its previous mode is restored afterwards.
        criterion: Loss callable invoked as ``criterion(out, target)``.

    Returns:
        ``losses.avg ** 0.5``.
    """
    losses = AverageMeter()
    use_cuda = torch.cuda.is_available()

    # Get the progress bar for later modification
    progress_bar = tqdm_notebook(testing_loader, ascii=True)

    # Evaluate in eval mode and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        for idx, (data, target) in enumerate(progress_bar):
            if use_cuda:
                data = data.cuda()
                target = target.cuda()

            with torch.no_grad():
                out = model(data)
                batch_size = out.shape[0]
                # Match the target's shape (as train() does) so the criterion
                # cannot silently broadcast mismatched shapes.
                out = out.reshape(target.shape)
                loss = criterion(out, target)

            losses.update(loss, batch_size)

            progress_bar.set_description_str(f"Batch: {idx+1}, Loss: {loss.item():.4f}")
    finally:
        model.train(was_training)

    print("* Average Loss @1: {loss.avg:.4f}".format(loss=losses))
    # Square root of the averaged criterion value.
    return losses.avg ** 0.5

# RMSE loss (the function keeps its existing "RSME" spelling). eps keeps the
# square root away from zero, which prevents [nan] in the backward pass.
def RSMELoss(yhat, y, eps=1e-6):
    """Return ``sqrt(mean((yhat - y) ** 2) + eps)``."""
    residual = yhat - y
    mean_squared_error = (residual ** 2).mean()
    return (mean_squared_error + eps).sqrt()

def plots(losses, lr=LEARNING_RATE, reg=WEIGHT_DECAY_REGULARIZATION_TERM, batch=BATCH_SIZE, momentum=MOMENTUM):
    """Draw the training and validation loss curves on a single figure.

    Args:
        losses: Indexable pair; ``losses[0]`` is plotted as the training curve
            and ``losses[1]`` as the validation curve.
        lr: Learning rate echoed in the title.
        reg: Regularisation term echoed in the title.
        batch: Batch size echoed in the title.
        momentum: Accepted by the signature but not used in the figure.
    """
    _, axes = plt.subplots(figsize=(8, 10))
    for position, legend_label in enumerate(('Training Loss', 'Validation Loss')):
        axes.plot(losses[position], label=legend_label)
    axes.set_xlabel('#Epochs')
    axes.set_ylabel('Loss')
    axes.set_title(f'L-Curves -> Train & Valid LR={lr} , Reg_Term={reg}, Batch={batch}')
    axes.legend(loc="best")
    plt.grid()
    plt.show()

In [None]:
# Selects the architecture trained by main():
# True -> transformer.VisionTransformer, False -> nvidia.NvidiaDaveCNN.
TRAIN_TRANSFORMER = True

def main():
    """Train the selected model, keep the best-validating copy, test it and plot the curves.

    For ``NUM_EPOCHS`` epochs the model is trained with ``train`` and scored
    with ``validate``. A deep copy of the model is kept whenever the validation
    loss improves. The best copy's ``state_dict`` is saved under
    ``./checkpoints``, evaluated with ``test``, and the loss curves are plotted.

    Raises:
        RuntimeError: If no epoch produced a validation loss below infinity
            (for example ``NUM_EPOCHS`` is 0 or every loss was NaN), so there
            is no model to save or test.
    """
    # Function-scope import: `os` is otherwise only imported by a later cell.
    import os

    # Normalisation and a small random rotation, followed by a resize.
    transformations = [
        # Citation:
        # https://pytorch.org/vision/stable/transforms.html#scriptable-transforms
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        transforms.RandomRotation(2),
    ]

    if TRAIN_TRANSFORMER:
        # The transformer is built with image_size=128 below, so inputs are resized to 128 x 128.
        transformations.append(transforms.Resize((128, 128)))
    else:
        # NOTE(review): this branch was documented as resizing to 66 x 200 (the
        # size used in the original Nvidia paper) but the code resizes to
        # 128 x 128. Confirm which input size NvidiaDaveCNN expects.
        transformations.append(transforms.Resize((128, 128)))

    transform = transforms.Compose(transformations)

    # Loading the three data splits with the transform applied.
    training_set, validation_set, test_set = dataset.load_nvidia_dataset(transform=transform, batch_size=BATCH_SIZE)
    torch.cuda.empty_cache()

    if TRAIN_TRANSFORMER:
        # Loading in the vision transformer model.
        model = transformer.VisionTransformer(image_size=128, patch_size=32, num_layers=1,
                                              num_heads=8, hidden_dim=128, mlp_dim=1024)
    else:
        # Loading in the NVIDIA DAVE-2 model.
        model = nvidia.NvidiaDaveCNN()

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.to(torch.device("cuda"))

    # Mean Squared Error as the criterion since this is a regression task.
    # validate() takes the square root per batch and test() of the average.
    criterion = nn.MSELoss()

    # Adam optimizer. MOMENTUM is not passed to it; it is only forwarded to plots().
    optimizer = torch.optim.Adam(model.parameters(),
                                 LEARNING_RATE,
                                 weight_decay=WEIGHT_DECAY_REGULARIZATION_TERM)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=1, verbose=True)

    best = float('inf')
    best_model = None
    best_epoch = None
    train_losses = []
    valid_losses = []
    # Gradient scaling is only meaningful with CUDA; disabling it elsewhere
    # turns the scaler into a pass-through instead of emitting a warning.
    scaler = torch.cuda.amp.GradScaler(enabled=use_cuda)
    for epoch in range(NUM_EPOCHS):

        # Training. float() works for tensors on any device, so no .cpu() round trip is needed.
        train_loss = float(train(epoch, training_set, model, optimizer, criterion, scaler=scaler))
        train_losses.append(train_loss)  # average losses
        print(f"Average training loss this epoch: {train_loss:.4f}")

        # Validation.
        valid_loss = float(validate(epoch, validation_set, model, criterion))
        valid_losses.append(valid_loss)

        # NOTE(review): the scheduler monitors the training loss; the PyTorch
        # examples step ReduceLROnPlateau on the validation loss. Confirm this is intended.
        scheduler.step(train_loss)

        if valid_loss < best:
            best = valid_loss
            best_model = copy.deepcopy(model)
            best_epoch = epoch
        print(f"Best performing model so far average validation loss: {best:.4f} on epoch {best_epoch}\n")

    # `best` tracks the validation loss, so label it as such.
    print('Best Validation Loss @1: {:.4f}'.format(best))

    if best_model is None:
        raise RuntimeError("No epoch improved on the initial validation loss; nothing to save or test.")

    # NOTE(review): the same file name is used for both architectures.
    checkpoint_path = './checkpoints/vision_transformer.pth'
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    torch.save(best_model.state_dict(), checkpoint_path)

    # Testing the best model.
    test_loss = test(test_set, best_model, criterion)

    print('Test Loss @1: {:.4f}'.format(test_loss))

    losses_to_plot = train_losses, valid_losses
    plots(losses_to_plot, lr=LEARNING_RATE, reg=WEIGHT_DECAY_REGULARIZATION_TERM,
          batch=BATCH_SIZE, momentum=MOMENTUM)



if __name__ == '__main__':
    main()

In [None]:
import sigopt

def evaluate(args):
    """Train a VisionTransformer for one SigOpt run and report its best validation loss.

    Args:
        args: Mapping with the keys ``learning_rate``, ``momentum``, ``reg``,
            ``patch_size``, ``num_layers``, ``num_heads``, ``hidden_dim`` and
            ``batch_size``. All but ``batch_size`` are registered as defaults
            on ``sigopt.params``; ``batch_size`` is read from ``args`` directly.

    Returns:
        The best validation loss as a 0-dimensional float32 NumPy array.

    Raises:
        RuntimeError: If no epoch produced a validation loss below infinity,
            so there is no model to save.
    """
    # Function-scope import: `os` is otherwise only imported by a later cell.
    import os

    # log source of hyperparameter suggestion
    sigopt.log_metadata('optimizer', "Adam")
    sigopt.log_model("Pre-Built Vision Transformer")
    sigopt.log_dataset("Udacity self-driving dataset ")

    sigopt.params.setdefault("learning_rate", args['learning_rate'])
    # NOTE(review): momentum is registered but the Adam optimizer below never receives it.
    sigopt.params.setdefault("momentum", args['momentum'])
    sigopt.params.setdefault("reg", args['reg'])
    sigopt.params.setdefault("patch_size", int(args['patch_size']))
    sigopt.params.setdefault("num_layers", int(args['num_layers']))
    sigopt.params.setdefault("num_heads", int(args['num_heads']))
    sigopt.params.setdefault("hidden_dim", int(args['hidden_dim']))

    # Only a resize to 128 x 128 is applied here (no normalisation or rotation, unlike main()).
    transform = transforms.Compose([
        # Citation:
        # https://pytorch.org/vision/stable/transforms.html#scriptable-transforms
        transforms.Resize((128, 128)),
    ])

    # NOTE(review): the batch size comes from `args`, not from sigopt.params, so a
    # batch_size suggested by the experiment is not used. Confirm this is intended.
    training_set, validation_set, _ = dataset.load_nvidia_dataset(transform=transform, batch_size=int(args['batch_size']))
    torch.cuda.empty_cache()

    # Loading in the vision transformer model.
    model = transformer.VisionTransformer(image_size=128, patch_size=int(sigopt.params.patch_size), num_layers=int(sigopt.params.num_layers),
                                          num_heads=int(sigopt.params.num_heads), hidden_dim=int(sigopt.params.hidden_dim), mlp_dim=1024)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.to(torch.device("cuda"))

    # Mean Squared Error as the criterion since this is a regression task.
    # validate() takes the square root of each batch loss, so validation values are RMSE.
    criterion = nn.MSELoss()

    # Adam optimizer driven by the SigOpt-provided learning rate and weight decay.
    optimizer = torch.optim.Adam(model.parameters(), sigopt.params.learning_rate,
                                 weight_decay=sigopt.params.reg)

    # No learning-rate scheduler is used in this function.

    best = float('inf')
    best_model = None
    best_epoch = None
    # Gradient scaling is only meaningful with CUDA; disabling it elsewhere
    # turns the scaler into a pass-through instead of emitting a warning.
    scaler = torch.cuda.amp.GradScaler(enabled=use_cuda)
    for epoch in range(NUM_EPOCHS):

        # Training. float() works for tensors on any device, so no .cpu() round trip is needed.
        train_loss = float(train(epoch, training_set, model, optimizer, criterion, scaler=scaler))
        print(f"Average training loss this epoch: {train_loss:.4f}")

        # Validation.
        valid_loss = float(validate(epoch, validation_set, model, criterion))

        if valid_loss < best:
            best = valid_loss
            best_model = copy.deepcopy(model)
            best_epoch = epoch
        print(f"Best performing model so far average validation loss: {best:.4f} on epoch {best_epoch}\n")

    # `best` tracks the validation loss, so label it as such.
    print('Best Validation Loss @1: {:.4f}'.format(best))

    if best_model is None:
        raise RuntimeError("No epoch improved on the initial validation loss; nothing to save or report.")

    checkpoint_path = './checkpoints/prebuilt_vision_transformer.pth'
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    torch.save(best_model.state_dict(), checkpoint_path)

    # NOTE(review): the metric is named 'MSE' but the value is the square-rooted
    # validation loss. The name is kept because the experiment definition refers to it.
    sigopt.log_metric(name='MSE', value=best)
    # Preserve the original return shape/dtype: a 0-d float32 NumPy array.
    return torch.tensor(best, dtype=torch.float32).numpy()

In [None]:
import os
os.environ["SIGOPT_API_TOKEN"] = "XWBIVDWCVQXALUZQFDHNGOELLLKDJBMOJALEPCNQXQGBNIGC"
os.environ['SIGOPT_PROJECT'] = 'prebuilt_vit'
%reload_ext sigopt
args = {
    'batch_size': 1,
    'learning_rate': 0.035,
    'momentum': 0.9,
    'reg': 0.0068,
    'patch_size': 8,
    'num_layers': 2,
    'num_heads': 2,
    'hidden_dim': 32
}

In [None]:
%%experiment
# Experiment definition passed to the sigopt `%%experiment` cell magic.
# The metric name must match the one logged by evaluate() via sigopt.log_metric(name='MSE', ...).
# NOTE(review): evaluate() logs its best validation loss, which validate() computes as
# torch.sqrt(criterion(...)); with nn.MSELoss that value is an RMSE despite the 'MSE' name.
# Categorical values are strings; evaluate() converts the ones it reads with int().
# NOTE(review): evaluate() reads the batch size from args['batch_size'] rather than from
# sigopt.params, so the 'batch_size' parameter below does not appear to be consumed.
{
    'name': 'Pre-Built Vision Transformer Optimization',
    'metrics': [
        {
            'name': 'MSE',
            'strategy': 'optimize',
            'objective': 'minimize',
        }
    ],
    'parameters': [
        {
            'name': 'reg',
            'type': 'double',
            'bounds': {'min': 0.0001, 'max': 0.5},
            'transformation': 'log'
        },
        {
            'name': 'learning_rate',
            'type': 'double',
            'bounds': {'min': 0.0001, 'max': 0.9},
            'transformation': 'log'
        },
        {
            'name': 'batch_size',
            'type': 'categorical',
            'categorical_values': ['32', '64']
        },
        {
            'name': 'patch_size',
            'type': 'categorical',
            'categorical_values': ['1', '2', '4', '8', '16', '32', '64']
        },
        {
            'name': 'num_heads',
            'type': 'categorical',
            'categorical_values': ['2', '4', '8', '16', '32']
        },
        {
            'name': 'hidden_dim',
            'type': 'categorical',
            'categorical_values': ['512', '1024', '2048', '4096', '8192']
        },
        {
            'name': 'num_layers',
            'type': 'categorical',
            'categorical_values': ['2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
        }
    ],
    'budget': 30
}

In [None]:
%%optimize gcp_prebuilt_vision_transformer_run
# Calls evaluate() with the default `args` dict under the sigopt `%%optimize` cell magic.
evaluate(args)