# Import

In [None]:
import os
import struct
import time
from collections import namedtuple, OrderedDict
from datetime import datetime
from itertools import product

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.utils.data.sampler import SubsetRandomSampler
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(0)

import torchvision
from torchvision import datasets, models, transforms

from torch.utils.tensorboard import SummaryWriter
tensorboard_log_dir = os.path.join(os.environ["HOME"],"workspace","tensorboard_logdir")

from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, explained_variance_score, mean_squared_log_error, mean_absolute_error, median_absolute_error, mean_squared_error, r2_score, confusion_matrix, roc_curve, accuracy_score, roc_auc_score, homogeneity_score, completeness_score, classification_report, silhouette_samples

%load_ext autoreload
%autoreload 2

In [5]:
!python run_executor.py

Traceback (most recent call last):
  File "run_executor.py", line 750, in <module>
    executor.execute()
  File "run_executor.py", line 578, in execute
    self.end_run()
  File "run_executor.py", line 191, in end_run
    self.dump_metrics_to_csv()
  File "run_executor.py", line 292, in dump_metrics_to_csv
    _ = pd.read_csv(os.path.join(results_file, "results.csv"))
  File "/Users/petersontylerd/.pyenv/versions/main37/lib/python3.7/site-packages/pandas/io/parsers.py", line 676, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/Users/petersontylerd/.pyenv/versions/main37/lib/python3.7/site-packages/pandas/io/parsers.py", line 448, in _read
    parser = TextFileReader(fp_or_buf, **kwds)
  File "/Users/petersontylerd/.pyenv/versions/main37/lib/python3.7/site-packages/pandas/io/parsers.py", line 880, in __init__
    self._make_engine(self.engine)
  File "/Users/petersontylerd/.pyenv/versions/main37/lib/python3.7/site-packages/pandas/io/parsers.py", line 1114, in _make_engi

- pytorch_runs
    - YYYYMMDD_HHMMS_NAME
        - model
            - best_model.pkl
        - tensorboard
        - images
        - logs
            - results_csv

In [None]:
# top 30 recorded runs, best training accuracy first (ties broken by validation accuracy)
(
    pd.read_csv(os.path.join(os.environ["HOME"], "workspace", "pytorch_runs", "results.csv"))
    .sort_values(by=["train_accuracy", "validation_accuracy"], ascending=[False, False])
    .head(30)
)

In [None]:
# scratch: opening in append mode creates ./results.csv when it is missing; nothing is written
with open('./results.csv', mode='a') as f:
    print("hi")

In [None]:
# scratch: check whether ./results.csv can be parsed.
# Only the two expected failure modes are handled (file missing, or file present
# but empty); anything else should surface instead of being silently swallowed.
try:
    _ = pd.read_csv("./results.csv")
except (FileNotFoundError, pd.errors.EmptyDataError):
    print("hi")

# Utility functions

## Data load functions

## Imaging functions

In [None]:
def image_sample(inp, figsize=(20,20)):
    """Show a channels-first image tensor in a new matplotlib figure.

    Parameters
    ----------
    inp : torch.Tensor
        Tensor whose first axis is moved last before plotting (the transpose
        below is ``(1, 2, 0)``), e.g. the output of
        ``torchvision.utils.make_grid``.
    figsize : tuple
        Figure size forwarded to ``plt.figure``.

    Values are clipped to ``[0, 1]`` for display; normalized inputs are not
    un-normalized first.
    """
    # detach + cpu so tensors that live on the GPU or track gradients can be converted to numpy
    pixels = inp.detach().cpu().numpy().transpose((1, 2, 0))
    pixels = np.clip(pixels, 0, 1)
    plt.figure(figsize=figsize)
    plt.imshow(
        pixels,
        interpolation="nearest"
    )

In [None]:
# draw one image with a grayscale colormap
def plot_sample(image):
    """Render ``image`` on the current matplotlib axes using the "gray" colormap."""
    colormap = "gray"
    plt.imshow(image, cmap=colormap)

# Workflow

## Load training data

In [None]:
# load source files
X_train, y_train = load_mnist(
    path=os.path.join(os.environ["HOME"], "s3buckets", "mnist"),
    kind="train"
)

# transformation instructions
# MNIST pixel statistics after scaling to [0, 1]: mean 0.1307, std 0.3081
# (the std was previously mistyped as 0.3801)
norm_mean = [0.1307]
norm_std = [0.3081]

train_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        norm_mean,
        norm_std
    ),
])

# load data into Pytorch Dataset
# only the first 6000 samples are used here
train_data = MNISTDataset(
    images=X_train[:6000,:],
    targets=y_train[:6000],
    transform=train_transform,
)

# create Pytorch DataLoader
train_data_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=32,
    shuffle=False,
)

### Review samples

In [None]:
# take the first training batch and display it as one tiled image
inputs, classes = next(iter(train_data_loader))
out = torchvision.utils.make_grid(tensor=inputs)
image_sample(inp=out)

In [None]:
# show the first raw image stored on the training dataset
sample = iter(train_data_loader.dataset.images)
plot_sample(image=next(sample))

## Load validation data

In [None]:
# load source files
X_valid, y_valid = load_mnist(
    path=os.path.join(os.environ["HOME"], "s3buckets", "mnist"),
    kind="t10k"
)

# transformation instructions
# MNIST pixel statistics after scaling to [0, 1]: mean 0.1307, std 0.3081
# (the std was previously mistyped as 0.3801)
norm_mean = [0.1307]
norm_std = [0.3081]

validation_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        norm_mean,
        norm_std
    )
])

# load data into Pytorch dataset
validation_data = MNISTDataset(
    images=X_valid,
    targets=y_valid,
    transform=validation_transform,
)

# create Pytorch DataLoader
validation_data_loader = torch.utils.data.DataLoader(
    validation_data,
    batch_size=32,
    shuffle=False,
    # sampler=weighted_sampler
)

### Review samples

In [None]:
# take the first validation batch and display it as one tiled image
inputs, classes = next(iter(validation_data_loader))
out = torchvision.utils.make_grid(tensor=inputs)
image_sample(inp=out)

In [None]:
# show the first raw image stored on the validation dataset
sample = iter(validation_data_loader.dataset.images)
plot_sample(image=next(sample))

### RunExecutor

## Training

### Parameter setup

In [None]:
# expose arbitrary keyword arguments as attributes of the instance
class ParamConfig:
    """Plain settings container: every keyword argument becomes an attribute."""

    def __init__(self, **kwargs):
        for name in kwargs:
            setattr(self, name, kwargs[name])

# configure all necessary parameters
# NOTE: the trainer reads `validation_data_loader`, and the validation dataset built
# above is named `validation_data` (the previous `valid_data` name was never defined).
model_params = ParamConfig(
    model = FCNet,
    model_name = "FCNet",
#     model_object_dir = "/content/drive/model_objects/20191202_1622_VGG16",
    model_object_dir = None,
    optimizer = torch.optim.Adam,
    criterion = F.cross_entropy,
#     criterion = F.nll_loss,
    train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=128, shuffle=True),
    validation_data_loader = torch.utils.data.DataLoader(validation_data, batch_size=128, shuffle=True),
    cuda = torch.cuda.is_available(),
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    seed = 0,
    lr = 0.001,
    epochs = 50,
    tensorboard_files = False,
    verbose = True,
    save_model_objects=True,
)
# keep the old attribute name available for any code still reading it
model_params.valid_data_loader = model_params.validation_data_loader

### TensorBoard

In [None]:
class ClassificationBoard():
    """Small wrapper around a TensorBoard ``SummaryWriter`` for one experiment.

    Event files are written to
    ``<root_log_dir>/<experiment_name>/<YYYYmmdd_HHMM>``.
    """

    def __init__(self, root_log_dir, experiment_name):
        """Create the timestamped log directory path and open a writer on it.

        Parameters
        ----------
        root_log_dir : str
            Directory under which all experiments are stored.
        experiment_name : str
            Sub-directory name for this experiment.
        """
        self.experiment_name = experiment_name
        self.root_log_dir = root_log_dir
        self.log_dir = os.path.join(self.root_log_dir, self.experiment_name, datetime.today().strftime('%Y%m%d_%H%M'))
        self.summary_writer = SummaryWriter(self.log_dir)
        # 0-based step used when log_scalars is called without an explicit step
        self._next_step = 0

    def log_scalars(self, scalars, step=None):
        """Write every ``tag -> value`` pair in ``scalars`` at one global step.

        Parameters
        ----------
        scalars : dict
            Mapping of scalar tag to value.
        step : int, optional
            0-based step index; values are recorded at ``step + 1``. When
            omitted, an internal counter that advances by one per call is
            used (previously this method read an undefined global ``step``).
        """
        if step is None:
            step = self._next_step
        self._next_step = step + 1

        for tag, value in scalars.items():
            self.summary_writer.add_scalar(tag, value, step + 1)

# smoke test: the constructor takes `root_log_dir` and `experiment_name` (there is no `log_dir` argument)
test = ClassificationBoard(root_log_dir=tensorboard_log_dir, experiment_name="test")

In [None]:
# True only when a "tensorboard_logdir" directory exists relative to the current working directory.
# NOTE(review): `tensorboard_log_dir` from the import cell points under $HOME/workspace — confirm which path is meant.
os.path.isdir("tensorboard_logdir")

In [None]:
# Periodic logging snippet: on every 100th step, print loss/accuracy and send
# scalars, parameter histograms and a few images to `logger`.
# NOTE(review): `step`, `total_step`, `loss`, `accuracy`, `logger`, `model` and `images`
# are not defined anywhere in the visible notebook — presumably this belongs inside a
# training loop; confirm before running.
if (step+1) % 100 == 0:
    print ('Step [{}/{}], Loss: {:.4f}, Acc: {:.2f}' 
           .format(step+1, total_step, loss.item(), accuracy.item()))

    # ================================================================== #
    #                        Tensorboard Logging                         #
    # ================================================================== #

    # 1. Log scalar values (scalar summary)
    info = { 'loss': loss.item(), 'accuracy': accuracy.item() }

    for tag, value in info.items():
        logger.scalar_summary(tag, value, step+1)

    # 2. Log values and gradients of the parameters (histogram summary)
    for tag, value in model.named_parameters():
        # parameter names use '.' separators; '/' is substituted before logging
        tag = tag.replace('.', '/')
        logger.histo_summary(tag, value.data.cpu().numpy(), step+1)
        # assumes every parameter has a populated .grad at this point — TODO confirm
        logger.histo_summary(tag+'/grad', value.grad.data.cpu().numpy(), step+1)

    # 3. Log training images (image summary)
    # first 10 images after reshaping to 28x28
    info = { 'images': images.view(-1, 28, 28)[:10].cpu().numpy() }

    # NOTE: the loop variable below rebinds the outer `images` name
    for tag, images in info.items():
        logger.image_summary(tag, images, step+1)

### Training procedure

In [None]:
class PyTorchTrainer:
    """Train and validate a PyTorch classifier with metric history, checkpoints and early stopping.

    ``config`` is any object exposing the attributes read in ``__init__``:
    ``seed``, ``verbose``, ``train_data_loader``, ``validation_data_loader``
    (the older name ``valid_data_loader`` is accepted as a fallback),
    ``device``, ``model`` (a class or an instance), ``model_name``, ``lr``,
    ``epochs``, ``optimizer``, ``criterion``, ``model_object_dir`` and
    ``tensorboard_files``. An optional ``metric_average`` attribute selects the
    scikit-learn averaging mode for F1/precision/recall; it defaults to
    ``"macro"`` because the scikit-learn default (``"binary"``) raises on
    multiclass targets. Set it to ``"binary"`` for two-class problems that
    relied on the previous behaviour.

    Per-epoch averages are kept in ``running_avg_<phase>_<metric>`` lists
    (phase: ``train`` / ``validation``; metric: ``f1``, ``precision``,
    ``recall``, ``accuracy``, ``loss``) and written to ``object_dir`` after
    every epoch. ``early_stop`` becomes ``True`` once the validation loss has
    not improved for ``n_epochs_stop`` consecutive epochs; callers are
    expected to check it and leave their epoch loop.
    """

    # (statistic key, TensorBoard title) in the order they are stored and logged
    _STATS = (
        ("f1", "F1"),
        ("precision", "Precision"),
        ("recall", "Recall"),
        ("accuracy", "Accuracy"),
        ("loss", "Loss"),
    )
    _PHASES = ("train", "validation")

    def __init__(self, config):
        """Build model, optimizer, output directories and (optionally) TensorBoard writers.

        When ``config.model_object_dir`` is set, the first file listed in its
        ``models`` folder is loaded as the model state and the saved metric
        histories / ``globaliter`` are restored from its ``objects`` folder.
        Otherwise a new ``model_objects/<timestamp>_<model_name>`` tree is
        created under the current working directory.
        """
        # random seed settings
        self.seed = config.seed
        torch.manual_seed(self.seed)
        self.verbose = config.verbose

        # data loaders (accept the older `valid_data_loader` attribute name as well)
        self.train_data_loader = config.train_data_loader
        if hasattr(config, "validation_data_loader"):
            self.validation_data_loader = config.validation_data_loader
        else:
            self.validation_data_loader = config.valid_data_loader

        # averaging mode passed to f1_score / precision_score / recall_score
        self.metric_average = getattr(config, "metric_average", "macro")

        ## model object creation and device assignment
        self.device = config.device

        # if passing in the name of model Class object
        if isinstance(config.model, type):
            self.model = config.model().to(self.device)
        # if model is already instantiated, or if transfer learning model is used
        else:
            self.model = config.model.to(self.device)

        # name to use when saving model state
        if config.model_name is not None:
            self.model_name = config.model_name
        else:
            self.model_name = "untitled"

        # model training settings
        self.lr = config.lr
        self.epochs = config.epochs
        self.optimizer = config.optimizer(self.model.parameters(), lr=self.lr)
        self.criterion = config.criterion

        # early stopping state
        self.n_epochs_stop = 5
        self.min_val_loss = np.inf
        self.epochs_no_improve = 0
        self.early_stop = False

        # root directory: reuse the given one when resuming, otherwise create a timestamped one
        resuming = config.model_object_dir is not None
        if resuming:
            print(">>> Resuming training...")
            self.model_object_dir = config.model_object_dir
        else:
            current = datetime.today().strftime('%Y%m%d_%H%M') + "_" + self.model_name
            self.model_object_dir = os.path.join(os.getcwd(), "model_objects", current)

        self.model_dir = os.path.join(self.model_object_dir, "models")
        self.object_dir = os.path.join(self.model_object_dir, "objects")
        self.log_dir = os.path.join(self.model_object_dir, "logs")
        self.log_train_dir = os.path.join(self.model_object_dir, "logs","train")
        self.log_validation_dir = os.path.join(self.model_object_dir, "logs","validation")

        if resuming:
            # load model; map_location lets a checkpoint saved on another device load here
            checkpoint_file = os.listdir(self.model_dir)[0]
            state_dict = torch.load(os.path.join(self.model_dir, checkpoint_file), map_location=self.device)
            self.model.load_state_dict(state_dict)
            self.model = self.model.to(self.device)
            self.model_name = checkpoint_file.split(".")[0]

            # load statistics objects
            for phase in self._PHASES:
                for key, _ in self._STATS:
                    setattr(self, self._stat_attr(phase, key), torch.load(self._stat_path(phase, key)))

            self.globaliter = torch.load(os.path.join(self.object_dir, "globaliter.pt"))
        else:
            os.makedirs(self.model_object_dir, exist_ok=True)
            os.makedirs(self.model_dir, exist_ok=True)
            os.makedirs(self.object_dir, exist_ok=True)
            os.makedirs(self.log_dir, exist_ok=True)
            os.makedirs(self.log_train_dir, exist_ok=True)
            os.makedirs(self.log_validation_dir, exist_ok=True)

            # start with empty histories so the first epoch is saved and logged like any other
            self._init_stats()
            self.globaliter = 0

        # tensorboard
        self.tensorboard_files = config.tensorboard_files
        if self.tensorboard_files:
            self.train_summary_writer = SummaryWriter(self.log_train_dir)
            self.validation_summary_writer = SummaryWriter(self.log_validation_dir)
        else:
            self.train_summary_writer = None
            self.validation_summary_writer = None

        self.beginning_time = time.time()

    @staticmethod
    def _stat_attr(phase, key):
        """Return the attribute name holding the per-epoch history of one statistic."""
        return "running_avg_{}_{}".format(phase, key)

    def _stat_path(self, phase, key):
        """Return the ``.pt`` file path under ``object_dir`` for one statistic history."""
        return os.path.join(self.object_dir, self._stat_attr(phase, key) + ".pt")

    def _init_stats(self):
        """Create an empty history list for every phase/statistic pair."""
        for phase in self._PHASES:
            for key, _ in self._STATS:
                setattr(self, self._stat_attr(phase, key), [])

    @staticmethod
    def _mean(values):
        """Arithmetic mean of ``values``; NaN for an empty sequence (e.g. an empty loader)."""
        return sum(values) / len(values) if values else float("nan")

    def _running_metrics(self, targets, preds):
        """Compute F1, precision, recall and accuracy over everything seen so far this epoch."""
        return {
            "f1": f1_score(targets, preds, average=self.metric_average),
            "precision": precision_score(targets, preds, average=self.metric_average),
            "recall": recall_score(targets, preds, average=self.metric_average),
            "accuracy": accuracy_score(targets, preds),
        }

    def _record_epoch(self, phase, epoch_stats, writer):
        """Append epoch averages to the histories, persist them and log to TensorBoard if enabled."""
        for key, _ in self._STATS:
            history = getattr(self, self._stat_attr(phase, key))
            history.append(self._mean(epoch_stats[key]))
            torch.save(history, self._stat_path(phase, key))

        if self.tensorboard_files:
            # first the phase-prefixed tags (e.g. 'train/F1'), then the shared per-metric tags ('F1')
            for prefix in (phase + "/", ""):
                for key, title in self._STATS:
                    latest = getattr(self, self._stat_attr(phase, key))[-1]
                    writer.add_scalar(prefix + title, latest, global_step=self.globaliter)
            writer.flush()

    def train(self, epoch):
        """Run one optimisation pass over ``train_data_loader`` and record its averaged statistics.

        ``epoch`` is only used in progress messages. The classification
        metrics are recomputed after every batch over all predictions
        collected so far in the epoch, and the epoch value is the mean of
        those running values.
        """
        epoch_preds = []
        epoch_targets = []
        epoch_stats = {key: [] for key, _ in self._STATS}

        self.globaliter += 1
        epoch_beginning_time = time.time()

        self.model.train()
        print("*" * 100)
        for batch_idx, (data, target) in enumerate(self.train_data_loader):
            batch_beginning_time = time.time()

            data = data.to(self.device)
            target = target.to(self.device)

            output = self.model(data)
            train_loss = self.criterion(output, target)
            epoch_stats["loss"].append(train_loss.item())

            self.optimizer.zero_grad()
            train_loss.backward()
            self.optimizer.step()

            # metrics on everything predicted so far in this epoch
            _, pred = torch.max(output, dim=1)
            epoch_preds.extend(pred.detach().cpu().numpy().tolist())
            epoch_targets.extend(target.detach().cpu().numpy().tolist())

            batch_metrics = self._running_metrics(epoch_targets, epoch_preds)
            for key, value in batch_metrics.items():
                epoch_stats[key].append(value)

            # print progress report
            if self.verbose:
                if batch_idx % 50 == 0 and batch_idx > 0:
                    print("\nTrain epoch: {} | Batch: {} | [Processed {}/{} ({:.0f}%)]\n\tLoss: {:.6f} | F1: {:.6f} | Precision: {:.6f} | Recall: {:.6f} | Accuracy: {:.6f}".format(
                        epoch, batch_idx, len(epoch_preds), len(self.train_data_loader.dataset),
                        100. * len(epoch_preds) / len(self.train_data_loader.dataset), train_loss.item(), batch_metrics["f1"],
                        batch_metrics["precision"], batch_metrics["recall"], batch_metrics["accuracy"]))
                    print("\tBatch time elapsed: {}\n".format(self.train_timer(batch_beginning_time, time.time())))
                    print("\n" + "*" * 10)

        # mark epoch end timestamp
        epoch_ending_time = time.time()

        self._record_epoch("train", epoch_stats, self.train_summary_writer)

        # print progress report
        if self.verbose:
            print("*" * 10 + "\n")
            print("Train epoch: {} \n\tLoss: {:.6f} | F1: {:.6f} | Precision: {:.6f} | Recall: {:.6f} | Accuracy: {:.6f}\n".format(
                        epoch, self.running_avg_train_loss[-1], self.running_avg_train_f1[-1],
                        self.running_avg_train_precision[-1], self.running_avg_train_recall[-1], self.running_avg_train_accuracy[-1]))
            print("\tEpoch time elapsed: {}".format(self.train_timer(epoch_beginning_time, epoch_ending_time)))
            print("\tTotal time elapsed: {}".format(self.train_timer(self.beginning_time, time.time())))

        # capture globaliter
        torch.save(self.globaliter, os.path.join(self.object_dir, "globaliter.pt"))

    def validation(self, epoch):
        """Evaluate on ``validation_data_loader``, record statistics and update early-stopping state.

        The model state is saved to ``model_dir`` whenever the epoch's average
        validation loss improves on the best value seen by this instance.
        After ``n_epochs_stop`` consecutive epochs without improvement
        ``early_stop`` is set to ``True``.
        """
        epoch_preds = []
        epoch_targets = []
        epoch_stats = {key: [] for key, _ in self._STATS}

        # turn off gradients
        self.model.eval()
        with torch.no_grad():

            for data, target in self.validation_data_loader:
                # send data to the configured device
                data = data.to(self.device)
                target = target.to(self.device)

                # generate predictions
                output = self.model(data)

                validation_loss = self.criterion(output, target)
                epoch_stats["loss"].append(validation_loss.item())

                # metrics on everything predicted so far in this epoch
                _, pred = torch.max(output, dim=1)
                epoch_preds.extend(pred.detach().cpu().numpy().tolist())
                epoch_targets.extend(target.detach().cpu().numpy().tolist())

                for key, value in self._running_metrics(epoch_targets, epoch_preds).items():
                    epoch_stats[key].append(value)

        self._record_epoch("validation", epoch_stats, self.validation_summary_writer)

        # print progress report
        if self.verbose:
            print("\nValidation epoch: {} \n\tLoss: {:.6f} | F1: {:.6f} | Precision: {:.6f} | Recall: {:.6f} | Accuracy: {:.6f}\n".format(
                    epoch, self.running_avg_validation_loss[-1], self.running_avg_validation_f1[-1],
                    self.running_avg_validation_precision[-1], self.running_avg_validation_recall[-1], self.running_avg_validation_accuracy[-1]))

        # early stopping
        latest_loss = self.running_avg_validation_loss[-1]
        if latest_loss < self.min_val_loss:
            # Save the model checkpoint
            torch.save(self.model.state_dict(), os.path.join(self.model_dir, "{}.pt".format(self.model_name)))
            self.epochs_no_improve = 0
            self.min_val_loss = latest_loss

            if self.verbose:
                print(">>> Improved - saving model\n\n\n")

        else:
            self.epochs_no_improve += 1
            if self.verbose:
                print(">>> No improvement - {} consecutive epochs\n\n\n".format(self.epochs_no_improve))
            if self.epochs_no_improve >= self.n_epochs_stop:
                # signal the caller; the loss history is kept intact so it can still be resumed/inspected
                self.early_stop = True
                if self.verbose:
                    print("\n!!! Early stopping - {} epochs without improvement\n".format(self.n_epochs_stop))

    def train_timer(self, start, end):
        """Format the interval ``end - start`` (seconds) as ``HH:MM:SS.ss``."""
        hours, rem = divmod(end-start, 3600)
        minutes, seconds = divmod(rem, 60)
        return "{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds)

### Execute

In [None]:
### fit model
# instantiate model object
trainer = PyTorchTrainer(config=model_params)

# iterate fitting procedure over specified epoch count
try:
    for epoch in range(1, trainer.epochs + 1):
        trainer.train(epoch)
        trainer.validation(epoch)
        # leave the loop when the trainer reports that early stopping was triggered
        if getattr(trainer, "early_stop", False):
            break
finally:
    # the writers are None when tensorboard_files is False, so only close real ones
    for summary_writer in (trainer.train_summary_writer, trainer.validation_summary_writer):
        if summary_writer is not None:
            summary_writer.close()

# scratch

In [None]:
class PyTorchTrainer:
    """Train and validate a PyTorch classifier with checkpointing and metric history.

    The trainer reads the following attributes from ``config``: ``seed``,
    ``verbose``, ``train_data_loader``, ``validation_data_loader``, ``device``,
    ``model`` (a class or an instantiated module), ``model_name``, ``lr``,
    ``epochs``, ``optimizer`` (a callable taking parameters and ``lr``),
    ``criterion``, ``model_object_dir`` (``None`` for a fresh run, otherwise the
    directory of a previous run to resume) and ``tensorboard_files``.

    Per-epoch metric histories are kept in ``running_avg_<phase>_<metric>``
    lists (phase: ``train``/``validation``; metric: ``f1``, ``precision``,
    ``recall``, ``accuracy``, ``loss``) and persisted under ``object_dir``.
    The best model (lowest validation loss) is saved under ``model_dir``.
    ``early_stop`` becomes ``True`` once ``n_epochs_stop`` consecutive
    validation epochs failed to improve; the caller decides whether to stop.
    """

    # (metric key, TensorBoard tag) in the order scalars are written
    _METRIC_TAGS = (
        ("f1", "F1"),
        ("precision", "Precision"),
        ("recall", "Recall"),
        ("accuracy", "Accuracy"),
        ("loss", "Loss"),
    )
    _PHASES = ("train", "validation")

    def __init__(self, config):
        """Build the model, optimizer, directory tree and metric histories.

        Args:
            config: Object exposing the attributes listed in the class docstring.

        Raises:
            FileNotFoundError: When resuming and ``models`` holds no checkpoint.
        """
        # imported locally so the class does not rely on a notebook-level import
        from datetime import datetime

        # random seed settings
        self.seed = config.seed
        torch.manual_seed(self.seed)
        self.verbose = config.verbose

        # data loaders
        self.train_data_loader = config.train_data_loader
        self.validation_data_loader = config.validation_data_loader

        ## model object creation and device assignment
        self.device = config.device

        # if passing in the name of model Class object
        if isinstance(config.model, type):
            self.model = config.model().to(self.device)
        # if model is already instantiated, or if transfer learning model is used
        else:
            self.model = config.model.to(self.device)

        # name to use when saving model state
        if config.model_name is not None:
            self.model_name = config.model_name
        else:
            self.model_name = "untitled"

        # model training settings
        self.lr = config.lr
        self.epochs = config.epochs
        self.optimizer = config.optimizer(self.model.parameters(), lr=self.lr)
        self.criterion = config.criterion

        # early-stopping bookkeeping
        self.n_epochs_stop = 5
        self.min_val_loss = np.inf
        self.epochs_no_improve = 0
        self.early_stop = False

        ## load previous state
        if config.model_object_dir is not None:
            print(">>> Resuming training...")
            self.model_object_dir = config.model_object_dir
            self._set_directories()

            # pick the checkpoint deterministically, preferring the ".pt"
            # files this class writes over anything else in the directory
            entries = sorted(os.listdir(self.model_dir))
            checkpoints = [name for name in entries if name.endswith(".pt")] or entries
            if not checkpoints:
                raise FileNotFoundError(
                    "No model checkpoint found in {}".format(self.model_dir))
            checkpoint_file = checkpoints[0]

            # NOTE(review): torch.load unpickles data - only resume from trusted directories.
            # map_location lets a checkpoint written on GPU be resumed on CPU (and vice versa)
            self.model.load_state_dict(
                torch.load(os.path.join(self.model_dir, checkpoint_file), map_location=self.device))
            # splitext keeps model names that themselves contain dots intact
            self.model_name = os.path.splitext(checkpoint_file)[0]

            # load statistics objects (missing files start an empty history)
            for phase in self._PHASES:
                for metric, _ in self._METRIC_TAGS:
                    setattr(self, self._history_name(phase, metric), self._load_history(phase, metric))

            self.globaliter = torch.load(os.path.join(self.object_dir, "globaliter.pt"))

            # the checkpoint is only written on improvement, so the best loss
            # seen so far is the floor a resumed run has to beat
            # NOTE(review): histories written by older runs may be truncated - confirm
            if self.running_avg_validation_loss:
                self.min_val_loss = min(self.running_avg_validation_loss)

        else:
            # directory tree for storing model attributes
            current = datetime.today().strftime('%Y%m%d_%H%M') + "_" + self.model_name

            self.model_object_dir = os.path.join(os.getcwd(), "model_objects", current)
            self._set_directories()

            for directory in (self.model_object_dir, self.model_dir, self.object_dir,
                              self.log_dir, self.log_train_dir, self.log_validation_dir):
                os.makedirs(directory, exist_ok=True)

            # start with empty histories so the first epoch is recorded,
            # saved and logged exactly like every later epoch
            for phase in self._PHASES:
                for metric, _ in self._METRIC_TAGS:
                    setattr(self, self._history_name(phase, metric), [])

            self.globaliter = 0

        # tensorboard
        self.tensorboard_files = config.tensorboard_files
        if self.tensorboard_files:
            self.train_summary_writer = SummaryWriter(self.log_train_dir)
            self.validation_summary_writer = SummaryWriter(self.log_validation_dir)
        else:
            self.train_summary_writer = None
            self.validation_summary_writer = None

        self.beginning_time = time.time()

    def _set_directories(self):
        """Derive the model/object/log sub-directories from ``model_object_dir``."""
        self.model_dir = os.path.join(self.model_object_dir, "models")
        self.object_dir = os.path.join(self.model_object_dir, "objects")
        self.log_dir = os.path.join(self.model_object_dir, "logs")
        self.log_train_dir = os.path.join(self.model_object_dir, "logs", "train")
        self.log_validation_dir = os.path.join(self.model_object_dir, "logs", "validation")

    @staticmethod
    def _history_name(phase, metric):
        """Return the attribute name of the history list for ``phase``/``metric``."""
        return "running_avg_{}_{}".format(phase, metric)

    def _history_path(self, phase, metric):
        """Return the file in ``object_dir`` where a history list is persisted."""
        return os.path.join(self.object_dir, self._history_name(phase, metric) + ".pt")

    def _load_history(self, phase, metric):
        """Load a persisted history list, or an empty list if none was saved."""
        path = self._history_path(phase, metric)
        if not os.path.exists(path):
            return []
        return torch.load(path)

    @staticmethod
    def _batch_metrics(targets, preds):
        """Score all predictions gathered so far in the epoch.

        Uses the sklearn scorers with their default arguments.
        NOTE(review): those defaults assume binary labels - confirm for multi-class data.
        """
        return {
            "f1": f1_score(targets, preds),
            "precision": precision_score(targets, preds),
            "recall": recall_score(targets, preds),
            "accuracy": accuracy_score(targets, preds),
        }

    def _record_epoch(self, phase, epoch_scores, writer):
        """Append the epoch means to the histories, persist them and log to TensorBoard.

        Args:
            phase: ``"train"`` or ``"validation"``.
            epoch_scores: Mapping of metric key to the list of per-batch values.
            writer: Summary writer for the phase (unused when TensorBoard is off).
        """
        for metric, _ in self._METRIC_TAGS:
            values = epoch_scores[metric]
            history = getattr(self, self._history_name(phase, metric))
            history.append(sum(values) / len(values))
            torch.save(history, self._history_path(phase, metric))

        if self.tensorboard_files:
            # first the phase-specific panel, then the shared per-metric plots
            for prefix in (phase + "/", ""):
                for metric, tag in self._METRIC_TAGS:
                    latest = getattr(self, self._history_name(phase, metric))[-1]
                    writer.add_scalar(prefix + tag, latest, global_step=self.globaliter)
            writer.flush()

    def _update_early_stopping(self):
        """Checkpoint on improvement, otherwise count the epoch against patience."""
        latest_loss = self.running_avg_validation_loss[-1]

        if latest_loss < self.min_val_loss:
            # Save the model checkpoint
            torch.save(self.model.state_dict(), os.path.join(self.model_dir, "{}.pt".format(self.model_name)))
            self.epochs_no_improve = 0
            self.min_val_loss = latest_loss
            self.early_stop = False

            if self.verbose:
                print(">>> Improved - saving model\n\n\n")

        else:
            self.epochs_no_improve += 1
            if self.verbose:
                print(">>> No improvement - {} consecutive epochs\n\n\n".format(self.epochs_no_improve))
            if self.epochs_no_improve == self.n_epochs_stop:
                if self.verbose:
                    print("\n!!! Early stopping - {} epochs without improvement\n".format(self.n_epochs_stop))
            # expose the decision so the caller's epoch loop can break on it
            self.early_stop = self.epochs_no_improve >= self.n_epochs_stop

    def train(self, epoch):
        """Run one training epoch and record its averaged metrics.

        Args:
            epoch: Epoch number, used only in progress messages.
        """
        epoch_preds = []
        epoch_targets = []
        epoch_scores = {metric: [] for metric, _ in self._METRIC_TAGS}

        self.globaliter += 1
        epoch_beginning_time = time.time()

        self.model.train()
        print("*" * 100)
        for batch_idx, (data, target) in enumerate(self.train_data_loader):
            batch_beginning_time = time.time()

            data = data.to(self.device)
            target = target.to(self.device)

            output = self.model(data)
            train_loss = self.criterion(output, target)
            epoch_scores["loss"].append(train_loss.item())

            self.optimizer.zero_grad()
            train_loss.backward()
            self.optimizer.step()

            # metrics are computed over every sample seen so far this epoch
            _, pred = torch.max(output, dim=1)
            epoch_preds.extend(pred.detach().cpu().numpy().tolist())
            epoch_targets.extend(target.detach().cpu().numpy().tolist())

            batch_metrics = self._batch_metrics(epoch_targets, epoch_preds)
            for metric, value in batch_metrics.items():
                epoch_scores[metric].append(value)

            # print progress report
            if self.verbose and batch_idx % 50 == 0 and batch_idx > 0:
                print("\nTrain epoch: {} | Batch: {} | [Processed {}/{} ({:.0f}%)]\n\tLoss: {:.6f} | F1: {:.6f} | Precision: {:.6f} | Recall: {:.6f} | Accuracy: {:.6f}".format(
                    epoch, batch_idx, len(epoch_preds), len(self.train_data_loader.dataset),
                    100. * len(epoch_preds) / len(self.train_data_loader.dataset), train_loss.item(), batch_metrics["f1"],
                    batch_metrics["precision"], batch_metrics["recall"], batch_metrics["accuracy"]))
                print("\tBatch time elapsed: {}\n".format(self.train_timer(batch_beginning_time, time.time())))
                print("\n" + "*" * 10)

        # mark epoch end timestamp
        epoch_ending_time = time.time()

        self._record_epoch("train", epoch_scores, self.train_summary_writer)

        # print progress report
        if self.verbose:
            print("*" * 10 + "\n")
            print("Train epoch: {} \n\tLoss: {:.6f} | F1: {:.6f} | Precision: {:.6f} | Recall: {:.6f} | Accuracy: {:.6f}\n".format(
                        epoch, self.running_avg_train_loss[-1], self.running_avg_train_f1[-1],
                        self.running_avg_train_precision[-1], self.running_avg_train_recall[-1], self.running_avg_train_accuracy[-1]))
            print("\tEpoch time elapsed: {}".format(self.train_timer(epoch_beginning_time, epoch_ending_time)))
            print("\tTotal time elapsed: {}".format(self.train_timer(self.beginning_time, time.time())))

        # capture globaliter
        torch.save(self.globaliter, os.path.join(self.object_dir, "globaliter.pt"))

    def validation(self, epoch):
        """Evaluate on the validation loader, record metrics and update early stopping.

        Args:
            epoch: Epoch number, used only in progress messages.
        """
        epoch_preds = []
        epoch_targets = []
        epoch_scores = {metric: [] for metric, _ in self._METRIC_TAGS}

        # turn off gradients
        self.model.eval()
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(self.validation_data_loader):
                # send data to the configured device
                data = data.to(self.device)
                target = target.to(self.device)

                # generate predictions
                output = self.model(data)

                validation_loss = self.criterion(output, target)
                epoch_scores["loss"].append(validation_loss.item())

                # metrics are computed over every sample seen so far this epoch
                _, pred = torch.max(output, dim=1)
                epoch_preds.extend(pred.detach().cpu().numpy().tolist())
                epoch_targets.extend(target.detach().cpu().numpy().tolist())

                for metric, value in self._batch_metrics(epoch_targets, epoch_preds).items():
                    epoch_scores[metric].append(value)

        self._record_epoch("validation", epoch_scores, self.validation_summary_writer)

        # print progress report
        if self.verbose:
            print("\nValidation epoch: {} \n\tLoss: {:.6f} | F1: {:.6f} | Precision: {:.6f} | Recall: {:.6f} | Accuracy: {:.6f}\n".format(
                    epoch, self.running_avg_validation_loss[-1], self.running_avg_validation_f1[-1],
                    self.running_avg_validation_precision[-1], self.running_avg_validation_recall[-1], self.running_avg_validation_accuracy[-1]))

        # early stopping
        self._update_early_stopping()

    def train_timer(self, start, end):
        """Format the time between ``start`` and ``end`` (seconds) as ``HH:MM:SS.ss``."""
        hours, rem = divmod(end-start, 3600)
        minutes, seconds = divmod(rem, 60)
        return "{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds)

# References

https://www.kaggle.com/xinruizhuang/skin-lesion-classification-acc-90-pytorch

https://pytorch.org/tutorials/intermediate/tensorboard_tutorial.html

https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/04-utils/tensorboard

https://pytorch.org/docs/stable/tensorboard.html

https://towardsdatascience.com/transfer-learning-with-convolutional-neural-networks-in-pytorch-dd09190245ce

https://towardsdatascience.com/https-medium-com-dinber19-take-a-deeper-look-at-your-pytorch-model-with-the-new-tensorboard-built-in-513969cf6a72

https://github.com/andyhahaha/Uncertainty-Mnist-with-Pytorch

https://discuss.pytorch.org/t/using-nn-dropout2d-at-eval-time-for-modelling-uncertainty/45274

https://xuwd11.github.io/Dropout_Tutorial_in_PyTorch/

https://towardsdatascience.com/making-your-neural-network-say-i-dont-know-bayesian-nns-using-pyro-and-pytorch-b1c24e6ab8cd

