## Notebook Description
This notebook contains the code for fine-tuning a pretrained ResNet18 model and comparing hyperparameter choices (optimizer, batch size, number of epochs, learning rate) <br>
**Date of Last Update**: November 27th 2020

---

**INPUTS**: Data/train, Data/val, Data/validation <br>
directories containing the image files, sorted by class within each split (train / val / validation)

**OUTPUTS**: Hyperparameter Choices<br>

---

### Previous Steps
- Base Model Selection

### Next Steps
- Final Model Training

---

## Import Packages

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim 
from torch.optim import lr_scheduler
import torchvision
from torchvision import models, transforms, datasets
from torch.utils.tensorboard import SummaryWriter
from tensorboard import notebook

import numpy as np
import pandas as pd

import time 
import os
import copy

import itertools

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [None]:
%load_ext tensorboard

## Import Pretrained Model

In [None]:
# build the torchvision ResNet18 architecture and request its pretrained weights
resnet18 = models.resnet18(pretrained = True)

In [None]:
# bare expression: the notebook renders the model's repr (its layer listing) as the cell output
resnet18

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [None]:
# TensorBoard writer created with default arguments; used further down to log the model graph
writer = SummaryWriter()

In [None]:
%tensorboard --logdir=runs

Reusing TensorBoard on port 6006 (pid 4173), started 1:03:06 ago. (Use '!kill 4173' to kill it.)

## Load Data 
`path = 'Data/train/..'` <br>
`path = 'Data/val..'` <br>
the final directory will be used as a final performance indicator

Considerations for data augmentation:
- technique to create synthetic data
- applied in the dataloader ONLY to the train data 
- will randomly create a variant in each epoch (n epochs = n variants seen by model)

> from torchvision documentation - used as all pre-trained models expect images normalized in the same way : <br>
`normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])` 

In [None]:
# Colab-only step: mount Google Drive so the image folders are reachable from the notebook
# NOTE(review): this import only resolves inside Google Colab - confirm before running elsewhere
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# folder that holds one sub-folder per split ('train', 'val')
###
base_path = 'gdrive/MyDrive/Data'

# preprocessing pipeline per split
# train: resize, random crop + random horizontal flip (augmentation), tensor, normalize
# val:   resize, center crop (no augmentation), tensor, normalize
###
data_transforms = dict(
    train = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ]),
    val = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ]),
)

# one ImageFolder dataset per split, each read through its own pipeline
###
image_datasets = {
    split : datasets.ImageFolder(os.path.join(base_path, split),
                                 transform = data_transforms[split])
    for split in ('train', 'val')
}

# one shuffling DataLoader per split
###
dataloaders = {
    split : torch.utils.data.DataLoader(image_datasets[split],
                                        batch_size = 2,
                                        shuffle = True,
                                        num_workers = 4)
    for split in ('train', 'val')
}

# number of samples in each split (used to average loss / accuracy per epoch)
datasizes = { split : len(image_datasets[split]) for split in ('train', 'val') }

# class names as discovered by ImageFolder on the train split
###
class_labels = image_datasets['train'].classes

In [None]:
## pick the compute device: the first CUDA GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

***Visualize Model with TensorBoard***

In [None]:
# pull one batch from the training loader to serve as the example input for the graph.
# The builtin next() is used instead of dataiter.next(): DataLoader iterators in
# recent PyTorch releases no longer provide a .next() method.
dataiter = iter(dataloaders['train'])
inputs, labels = next(dataiter)

# log the model graph for TensorBoard, then close the writer
writer.add_graph(resnet18, inputs)
writer.close()

## Define a Training Function

In [None]:
def train_model(model, criterion, optimizer, num_epochs,
                loaders=None, sizes=None, run_device=None):
    """Trains the Model and evaluates it on the validation data after every epoch
    INPUTS - model, criterion, optimizer, num_epochs:
           - loaders (optional) = dict with a 'train' and a 'val' iterable of
             (inputs, labels) batches; defaults to the notebook-level `dataloaders`
           - sizes (optional) = dict with the number of samples per phase;
             defaults to the notebook-level `datasizes`, or to
             len(loader.dataset) per phase when `loaders` is passed explicitly
           - run_device (optional) = device every batch is moved to;
             defaults to the notebook-level `device`
    RETURNS - trainloss = dict {epoch : average training loss per sample}
            - valloss = dict {epoch : average validation loss per sample}
            - trainaccbyepoch = dict {epoch : training accuracy}
            - valaccbyepoch = dict {epoch : validation accuracy}
            returned in exactly this order (train before val), which is the
            order the tuning cells unpack; epochs are numbered from 0"""

    # resolve the optional arguments; sizes must be resolved while we still
    # know whether the caller supplied their own loaders
    if sizes is None:
        if loaders is None:
            sizes = datasizes
        else:
            sizes = {phase: len(loaders[phase].dataset)
                     for phase in ('train', 'val')}
    if loaders is None:
        loaders = dataloaders
    if run_device is None:
        run_device = device

    # per-epoch statistics, keyed by epoch index
    trainloss = {}
    valloss = {}
    trainaccbyepoch = {}
    valaccbyepoch = {}

    # create loop for no. of epochs
    for epoch in range(num_epochs):

        for phase in ['train', 'val']:
            # switch layers such as dropout / batchnorm to the right mode
            if phase == 'train':
                model.train()
            else:
                model.eval()

            #initialize epoch tracking
            running_loss = 0.0
            no_correct = 0

            #loop for data
            for inputs, labels in loaders[phase]:
                inputs = inputs.to(run_device)
                labels = labels.to(run_device)

                #zero the gradients
                optimizer.zero_grad()

                #forward (gradients are only tracked in the train phase)
                with torch.set_grad_enabled(phase == "train"):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    #backward (only for train)
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                #update tracking
                # loss is scaled by the batch size so the epoch average is per sample
                # (assumes the criterion returns the batch mean - TODO confirm for custom criteria)
                running_loss += loss.item() * inputs.size(0)
                # kept as a python int so the bookkeeping also works when a loader is empty
                no_correct += torch.sum(preds == labels).item()

            #get epoch stats
            epoch_loss = running_loss / sizes[phase]
            epoch_acc = no_correct / sizes[phase]

            if phase == 'train':
                trainloss[epoch] = epoch_loss
                trainaccbyepoch[epoch] = epoch_acc
            else:
                valloss[epoch] = epoch_loss
                valaccbyepoch[epoch] = epoch_acc

    # train statistics first, then validation - the order the callers unpack
    return trainloss, valloss, trainaccbyepoch, valaccbyepoch

---
## Transfer Learning Approach (aside)
> *as feature extractors:* <br>
freeze the model's layers with the exception of the layers completing the classification (in the case of ResNet18 this is a fully connected (fc) layer). <br>
*finetuning approach:* <br> 
in addition to training the fc layer as addressed above we will also selectively train previous layers <br>
we will adjust the learning rate to a lower than typical rate to prevent over adjusting the model weights <br>
additionally different layers can be "frozen" or left as "trainable" to get an approach somewhere in the middle of the two approaches

---

## Fine Tune the Model

### Investigate the Model
- observe the layers and params 
- to understand the state of the layers
    - `.requires_grad == True` model layer is trainable
    - `.requires_grad == False` model layer is frozen

In [None]:
def get_children_info(model):
    """prints every direct child module of a model together with its position
    only the first level is walked - grandchildren are not listed separately
    INPUTS: model
    RETURNS: number of direct children that were printed"""
    total = 0
    for position, module in enumerate(model.children()):
        print(" child", position, "is: ")
        print(module)
        # positions are 0-based, so the running total is always one ahead
        total = position + 1
    return total

In [None]:
# list the direct children of ResNet18 and remember how many there are
childcount = get_children_info(resnet18)
print("-" * 20)
print("the ResNet18 model has ", childcount, " children.")

 child 0 is: 
Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
 child 1 is: 
BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
 child 2 is: 
ReLU(inplace=True)
 child 3 is: 
MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
 child 4 is: 
Sequential(
  (0): BasicBlock(
    (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (1): BasicBlock(
    (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (conv2): Conv

---
## ResNet18 Architecture (Aside) 
[link to article understanding and visualizing ResNets](https://towardsdatascience.com/understanding-and-visualizing-resnets-442284831be8)
> used to assist in solving the vanishing gradient problem <br> 
layers are composed of blocks <br>
in the case of ResNet18 these are in PyTorch called "BasicBlock" and include 2 operations <br> 
>less pooling than typical CNNs (examples AlexNet or VGG), downsampling achieved more with increasing stride <br>
---

## Freezing Blocks
- For a first go around we will freeze all the layers up to child 7 <br>
- this will allow us to adjust the weights in 2 blocks and our fc layer

In [None]:
def block_freeze(model, child_stop_num):
    """freezes the leading children of a model
    INPUTS: model, child_stop_num - index of the first child that stays trainable;
            every child with a smaller index gets requires_grad = False on all its parameters
    RETURNS: the same model object, modified in place"""

    for position, module in enumerate(model.children()):
        if not position < child_stop_num:
            # reached the first child that must stay trainable - nothing left to freeze
            break
        for weight in module.parameters():
            weight.requires_grad = False

    return model

## Adjusting FC Layer
- need to adjust the fc layer to reflect the number of classes we are predicting

In [None]:
# number of input features of the existing fully connected (classification) layer
original_class_len = resnet18.fc.in_features

# swap the classification layer for one with an output per class in our data;
# the assignment must target `resnet18` (the model loaded above) for the swap to take effect
resnet18.fc = nn.Linear(original_class_len, len(class_labels))

## Criterion
- we will use the CrossEntropyLoss as our criterion function
    - from [Machine Learning Mastery](https://machinelearningmastery.com/cross-entropy-for-machine-learning/): cross-entropy is a measure of the difference between two probability distributions for a given random variable or set of events.
- This is a commonly used loss function when dealing with multiclass classification

In [None]:
# loss function shared by every training run in this notebook
criterion = nn.CrossEntropyLoss()

## Optimizer 
- we need to define an optimizer to define how our model will update
- we will look at 2 optimizers
    - Adam : combines advantages of AdaGrad and RMSProp as an extension to stochastic gradient descent
    - SGD : approximation of classical gradient descent working with random samples of data
- need to adjust our optimizer to account for some layers being frozen vs not frozen

In [None]:
# example hyperparameters so this cell runs on a fresh kernel;
# the tuning cells further down set their own values
learning_rate = 0.01
momentum = 0.9 #SGD only

# only parameters that still require gradients (i.e. are not frozen) are handed to the optimizers
optimizer_adam = optim.Adam(filter(lambda p: p.requires_grad, resnet18.parameters()),
                            lr = learning_rate)
optimizer_sgd = optim.SGD(filter(lambda p: p.requires_grad, resnet18.parameters()),
                          lr = learning_rate,
                          momentum = momentum)

## Visualize Loss and Accuracy
- create a function that will output the line plots for the given training configuration

In [None]:
def _plot_train_vs_val(df, title, train_col, val_col, y_label):
  """draws one line chart comparing the training and validation series of a metric
  INPUTS df - dataframe with an "epoch" column plus the two metric columns,
         title - figure title, train_col / val_col - column names to plot,
         y_label - y axis title
  RETURNS None (the figure is shown)"""
  fig = go.Figure(layout=go.Layout(
        title=go.layout.Title(text=title)
    ))
  # same trace names and colours for every metric so the figures are comparable
  for column, trace_name, colour in ((train_col, 'training', '#1B848E'),
                                     (val_col, 'validation', '#33B8B5')):
    fig.add_trace(go.Scatter(x=df["epoch"], y=df[column],
                      mode='lines',
                      name=trace_name,
                      line_color = colour))
  # label the axes so each figure can be read on its own
  fig.update_layout(xaxis_title="Epoch", yaxis_title=y_label)

  fig.show()


def get_plots(df):
  """creates plots of train vs val loss and train vs val accuracy
  INPUTS df, dataframe of training stats by epoch with the columns
         "epoch", "training_loss", "val_loss", "training_acc" and "val_acc"
  RETURNS None"""

  # plot for loss
  _plot_train_vs_val(df, "Loss by Training Epoch",
                     "training_loss", "val_loss", "Loss")

  # plot for accuracy
  _plot_train_vs_val(df, "Accuracy by Training Epoch",
                     "training_acc", "val_acc", "Accuracy")

  return

## Tuning and Iteration
- we will iterate through several options to observe model performance with consideration for several potential setups

Parameters of Interest 
- 0.001 < `learning_rate` < 1
- `momentum` = [0, 0.5, 0.9, 0.99] (SGD)
- `batch_size` as powers of 2 (ie, 2, 4, 8...)
- `epochs` 

First looking at Adam vs SGD:
- comparing at 5 and 25 Epochs 
- with mini batch of 4 
- learning rate of 0.01 and 0.001

In [None]:
# experiment grid: every combination of epoch count, learning rate and optimizer
no_of_epochs = [5, 25]
optimize_opt = ["Adam", "SGD"]
learning_rates = [0.01, 0.001]
momentum = 0.9 #SGD
batch_size = 4

# the batch size is fixed for this experiment, so the loaders are built once;
# train_model picks up the notebook-level `dataloaders`
dataloaders = { x : torch.utils.data.DataLoader(image_datasets[x],
                                                batch_size = batch_size,
                                                shuffle = True,
                                                num_workers = 4)
              for x in ['train', 'val']}

# loop through epoch settings
for epochs in no_of_epochs:
    #loop through learning rates
    for learning_rate in learning_rates:
        #compare optimizers
        for optimize in optimize_opt:

            #reset model: fresh pretrained weights, new fc layer, children 0-6 frozen
            resnet18 = models.resnet18(pretrained = True)
            original_class_len = resnet18.fc.in_features
            resnet18.fc = nn.Linear(original_class_len, len(class_labels))
            model = block_freeze(resnet18, 7)
            model.to(device)

            #initalize optimizer on the parameters that are still trainable
            if optimize == "Adam":
                optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                              model.parameters()),
                                       lr = learning_rate)
            elif optimize == "SGD":
                optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                             model.parameters()),
                                      lr = learning_rate,
                                      momentum = momentum)

            train_loss, val_loss, train_acc, val_acc = train_model(model,
                                                                   criterion,
                                                                   optimizer,
                                                                   epochs)

            print("For ", epochs, "epochs")
            print("With a batch size of: ", batch_size)
            print("With learning rate of: ", learning_rate)
            print("Using the ", optimize, "optimizer")
            print('-' * 15)

            # one row per epoch; built directly from the four {epoch : value} dicts
            # instead of relying on the automatic "_x"/"_y" suffixes of merge
            df = pd.DataFrame({"training_loss": train_loss,
                               "val_loss": val_loss,
                               "training_acc": train_acc,
                               "val_acc": val_acc})
            df["epoch"] = df.index

            get_plots(df)

For  5 epochs
With a batch size of:  4
With learning rate of:  0.01
Using the  Adam optimizer
---------------


For  5 epochs
With a batch size of:  4
With learning rate of:  0.01
Using the  SGD optimizer
---------------


For  5 epochs
With a batch size of:  4
With learning rate of:  0.001
Using the  Adam optimizer
---------------


For  5 epochs
With a batch size of:  4
With learning rate of:  0.001
Using the  SGD optimizer
---------------


For  25 epochs
With a batch size of:  4
With learning rate of:  0.01
Using the  Adam optimizer
---------------


For  25 epochs
With a batch size of:  4
With learning rate of:  0.01
Using the  SGD optimizer
---------------


For  25 epochs
With a batch size of:  4
With learning rate of:  0.001
Using the  Adam optimizer
---------------


For  25 epochs
With a batch size of:  4
With learning rate of:  0.001
Using the  SGD optimizer
---------------


Moving forward we will use the Adam Optimizer 

Next we will look at the impact of mini batch size:
- `learning_rate` at 0.01
- `epochs` = [5,25,50]

In [None]:
# experiment grid: every combination of epoch count and mini batch size (Adam, fixed learning rate)
no_of_epochs = [5, 25, 50]
learning_rate = 0.01
batch_sizes = [2,4,8,16]

# loop through epoch settings
for epochs in no_of_epochs:
    #loop through batch sizes
    for batch_size in batch_sizes:
        #reset model: fresh pretrained weights, new fc layer, children 0-6 frozen
        resnet18 = models.resnet18(pretrained = True)
        original_class_len = resnet18.fc.in_features
        resnet18.fc = nn.Linear(original_class_len, len(class_labels))
        model = block_freeze(resnet18, 7)
        model.to(device)

        #initalize optimizer on the parameters that are still trainable
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr = learning_rate)

        # rebuild the notebook-level loaders with the batch size under test;
        # train_model reads `dataloaders` from the notebook namespace
        dataloaders = { x : torch.utils.data.DataLoader(image_datasets[x],
                                                        batch_size = batch_size,
                                                        shuffle = True,
                                                        num_workers = 4)
                      for x in ['train', 'val']}

        train_loss, val_loss, train_acc, val_acc = train_model(model,
                                                               criterion,
                                                               optimizer,
                                                               epochs)

        print("For ", epochs, "epochs")
        print("With a batch size of: ", batch_size)
        print("With learning rate of: ", learning_rate)
        print('-' * 15)

        # one row per epoch; built directly from the four {epoch : value} dicts
        # instead of relying on the automatic "_x"/"_y" suffixes of merge
        df = pd.DataFrame({"training_loss": train_loss,
                           "val_loss": val_loss,
                           "training_acc": train_acc,
                           "val_acc": val_acc})
        df["epoch"] = df.index

        get_plots(df)

For  5 epochs
With a batch size of:  2
With learning rate of:  0.01
---------------


For  5 epochs
With a batch size of:  4
With learning rate of:  0.01
---------------


For  5 epochs
With a batch size of:  8
With learning rate of:  0.01
---------------


For  5 epochs
With a batch size of:  16
With learning rate of:  0.01
---------------


For  25 epochs
With a batch size of:  2
With learning rate of:  0.01
---------------


For  25 epochs
With a batch size of:  4
With learning rate of:  0.01
---------------


For  25 epochs
With a batch size of:  8
With learning rate of:  0.01
---------------


For  25 epochs
With a batch size of:  16
With learning rate of:  0.01
---------------


For  50 epochs
With a batch size of:  2
With learning rate of:  0.01
---------------


For  50 epochs
With a batch size of:  4
With learning rate of:  0.01
---------------


For  50 epochs
With a batch size of:  8
With learning rate of:  0.01
---------------


For  50 epochs
With a batch size of:  16
With learning rate of:  0.01
---------------


Moving forward we will use the Adam Optimizer and a Batch Size of 2 

Next we will look at where the model starts overfitting; to find this point we will run for 500 epochs
- `learning_rate` at 0.01

In [None]:
# single long run used to see where training and validation curves diverge
batch_size = 2
learning_rate = 0.01
epochs = 500

#reset model: fresh pretrained weights, new fc layer, children 0-6 frozen
resnet18 = models.resnet18(pretrained = True)
original_class_len = resnet18.fc.in_features
resnet18.fc = nn.Linear(original_class_len, len(class_labels))
model = block_freeze(resnet18, 7)
model.to(device)

#initalize optimizer on the parameters that are still trainable
optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                              model.parameters()),
                       lr = learning_rate)

# rebuild the notebook-level loaders with this run's batch size;
# train_model reads `dataloaders` from the notebook namespace
dataloaders = { x : torch.utils.data.DataLoader(image_datasets[x],
                                                batch_size = batch_size,
                                                shuffle = True,
                                                num_workers = 4)
              for x in ['train', 'val']}

train_loss, val_loss, train_acc, val_acc = train_model(model,
                                                       criterion,
                                                       optimizer,
                                                       epochs)

print("For ", epochs, "epochs")
print("With a batch size of: ", batch_size)
print("With learning rate of: ", learning_rate)
print('-' * 15)

# one row per epoch; built directly from the four {epoch : value} dicts
# instead of relying on the automatic "_x"/"_y" suffixes of merge
df = pd.DataFrame({"training_loss": train_loss,
                   "val_loss": val_loss,
                   "training_acc": train_acc,
                   "val_acc": val_acc})
df["epoch"] = df.index

get_plots(df)

For  500 epochs
With a batch size of:  2
With learning rate of:  0.01
---------------


From running the 500 epochs we can see that by 40 epochs we have run into problems with overfitting as our training accuracy is improving while our validation accuracy is not. 

The loss function shows a slightly steeper than desirable curve so we will adjust the learning_rate of our model.

In [None]:
# learning rate sweep at a fixed batch size and epoch count (Adam)
batch_size = 2
learning_rates = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006,
                  0.007, 0.008, 0.009, 0.01]
epochs = 40

for learning_rate in learning_rates:
    #reset model: fresh pretrained weights, new fc layer, children 0-6 frozen
    resnet18 = models.resnet18(pretrained = True)
    original_class_len = resnet18.fc.in_features
    resnet18.fc = nn.Linear(original_class_len, len(class_labels))
    model = block_freeze(resnet18, 7)
    model.to(device)

    #initalize optimizer on the parameters that are still trainable
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr = learning_rate)

    # rebuild the notebook-level loaders with this sweep's batch size;
    # train_model reads `dataloaders` from the notebook namespace
    dataloaders = { x : torch.utils.data.DataLoader(image_datasets[x],
                                                    batch_size = batch_size,
                                                    shuffle = True,
                                                    num_workers = 4)
                  for x in ['train', 'val']}

    train_loss, val_loss, train_acc, val_acc = train_model(model,
                                                           criterion,
                                                           optimizer,
                                                           epochs)

    print("For ", epochs, "epochs")
    print("With a batch size of: ", batch_size)
    print("With learning rate of: ", learning_rate)
    print('-' * 15)

    # one row per epoch; built directly from the four {epoch : value} dicts
    # instead of relying on the automatic "_x"/"_y" suffixes of merge
    df = pd.DataFrame({"training_loss": train_loss,
                       "val_loss": val_loss,
                       "training_acc": train_acc,
                       "val_acc": val_acc})
    df["epoch"] = df.index

    get_plots(df)

For  40 epochs
With a batch size of:  2
With learning rate of:  0.001
---------------


For  40 epochs
With a batch size of:  2
With learning rate of:  0.002
---------------


For  40 epochs
With a batch size of:  2
With learning rate of:  0.003
---------------


For  40 epochs
With a batch size of:  2
With learning rate of:  0.004
---------------


For  40 epochs
With a batch size of:  2
With learning rate of:  0.005
---------------


For  40 epochs
With a batch size of:  2
With learning rate of:  0.006
---------------


For  40 epochs
With a batch size of:  2
With learning rate of:  0.007
---------------


For  40 epochs
With a batch size of:  2
With learning rate of:  0.008
---------------


For  40 epochs
With a batch size of:  2
With learning rate of:  0.009
---------------


For  40 epochs
With a batch size of:  2
With learning rate of:  0.01
---------------


The learning rate of 0.005 seemed to offer some improvements to our model 

Next we will look at the addition of a learning rate schedule using the `torch.optim.lr_scheduler`

We will create a scheduler and experiment with varying `step_size` while keeping our `gamma` constant

> note we will create a revised function for training our model to account for the addition of the schedule

In [None]:
def train_model_with_sched(model, criterion, optimizer, num_epochs, scheduler):
    """Trains and validates the model with a learning rate scheduler.

    Each epoch runs a 'train' pass followed by a 'val' pass. The scheduler is
    stepped once per epoch, right after the 'train' pass.

    NOTE: the batches, the per-phase normalisers and the target device are NOT
    arguments. They are read from the notebook-level globals `dataloaders`,
    `datasizes` and `device`, which must exist before this function is called
    (`dataloaders[phase]` yields (inputs, labels) pairs and `datasizes[phase]`
    is the divisor used for the epoch loss and accuracy).

    INPUTS - model = network to train (its parameters are updated in place)
           - criterion = loss function, called as criterion(outputs, labels)
           - optimizer = optimizer stepped after every training batch
           - num_epochs = number of train + val epochs to run
           - scheduler = learning rate scheduler, stepped once per epoch
    RETURNS - valloss = dict {epoch: validation loss}
            - trainloss = dict {epoch: training loss}
            - trainaccbyepoch = dict {epoch: training accuracy}
            - valaccbyepoch = dict {epoch: validation accuracy}
            Mind the order: the VALIDATION loss comes first, then the training
            loss, whereas the accuracies are returned training first."""

    # per-epoch history, keyed by epoch index
    trainloss = {}
    valloss = {}
    trainaccbyepoch = {}
    valaccbyepoch = {}

    # create loop for no. of epochs
    for epoch in range(num_epochs):

        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()
            else:
                model.eval()

            # initialize epoch tracking
            running_loss = 0.0
            # kept as a plain int so an epoch whose loader yields no batch
            # still produces a number instead of failing on a tensor-only call
            no_correct = 0

            # loop for data
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # gradients are only tracked during the train phase
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward (only for train)
                    if is_train:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                # update tracking: weight the batch loss by the batch size so
                # that a smaller final batch does not skew the epoch total
                running_loss += loss.item() * inputs.size(0)
                no_correct += torch.sum(preds == labels).item()

            # step for scheduler: once per epoch, after the optimizer updates
            if is_train:
                scheduler.step()

            # get epoch stats
            epoch_loss = running_loss / datasizes[phase]
            epoch_acc = no_correct / datasizes[phase]

            if is_train:
                trainloss[epoch] = epoch_loss
                trainaccbyepoch[epoch] = epoch_acc
            else:
                valloss[epoch] = epoch_loss
                valaccbyepoch[epoch] = epoch_acc

    return valloss, trainloss, trainaccbyepoch, valaccbyepoch

In [None]:
# Sweep the StepLR `step_size` over 40 epochs while `gamma`, the batch size and
# the base learning rate stay fixed. Every setting starts from a fresh model.
step_sizes = [1,5,7,10,15]
gamma = 0.1

batch_size = 2
# NOTE(review): the markdown above singles out 0.005 as the promising learning
# rate, but this sweep starts from 0.05 - confirm the larger value is intended
# as the pre-decay starting point for the schedule.
learning_rate = 0.05
epochs = 40

for step_size in step_sizes:
    # reset model: reload the pretrained weights and attach a new output layer
    # with one unit per class
    resnet18 = models.resnet18(pretrained = True)
    original_class_len = resnet18.fc.in_features
    resnet18.fc = nn.Linear(original_class_len, len(class_labels))
    # block_freeze is defined earlier in the notebook; only the parameters it
    # leaves with requires_grad=True are handed to the optimizer below
    model = block_freeze(resnet18, 7)
    model.to(device)

    # initialize optimizer and the step-decay schedule under test
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                           lr = learning_rate)
    new_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                           step_size = step_size,
                                           gamma = gamma)

    # `dataloaders` is a notebook-level global that train_model_with_sched reads
    dataloaders = {x : torch.utils.data.DataLoader(image_datasets[x],
                                                   batch_size = batch_size,
                                                   shuffle = True,
                                                   num_workers = 4)
                   for x in ['train', 'val']}

    # train_model_with_sched returns the VALIDATION loss first, then the
    # training loss; unpacking in that order keeps the plotted curves labelled
    # correctly
    val_loss, train_loss, train_acc, val_acc = train_model_with_sched(model,
                                                                      criterion,
                                                                      optimizer,
                                                                      epochs,
                                                                      new_lr_scheduler)

    print("For ", epochs, "epochs")
    print("With a batch size of: ", batch_size)
    print("With learning rate of: ", learning_rate)
    print("With a scheduler step size of: ", step_size)
    print('-' * 15)

    # one row per epoch; the columns are named directly instead of relying on
    # the suffixes pandas generates when merging identically named columns
    df = pd.DataFrame({"training_loss" : train_loss,
                       "val_loss" : val_loss,
                       "training_acc" : train_acc,
                       "val_acc" : val_acc})
    df.loc[:,"epoch"] = df.index

    get_plots(df)



For  40 epochs
With a batch size of:  2
With learning rate of:  0.05
With a scheduler step size of:  1
---------------


For  40 epochs
With a batch size of:  2
With learning rate of:  0.05
With a scheduler step size of:  5
---------------


For  40 epochs
With a batch size of:  2
With learning rate of:  0.05
With a scheduler step size of:  7
---------------


For  40 epochs
With a batch size of:  2
With learning rate of:  0.05
With a scheduler step size of:  10
---------------


For  40 epochs
With a batch size of:  2
With learning rate of:  0.05
With a scheduler step size of:  15
---------------


In [None]:
# Narrower sweep of the StepLR `step_size` (5 to 10) over 70 epochs; `gamma`,
# the batch size and the base learning rate stay fixed. Every setting starts
# from a fresh model.
step_sizes = [5,6,7,8,9,10]
gamma = 0.1

batch_size = 2
# NOTE(review): the notebook's earlier markdown singles out 0.005 as the
# promising learning rate, but this sweep starts from 0.05 - confirm the larger
# value is intended as the pre-decay starting point for the schedule.
learning_rate = 0.05
epochs = 70

for step_size in step_sizes:
    # reset model: reload the pretrained weights and attach a new output layer
    # with one unit per class
    resnet18 = models.resnet18(pretrained = True)
    original_class_len = resnet18.fc.in_features
    resnet18.fc = nn.Linear(original_class_len, len(class_labels))
    # block_freeze is defined earlier in the notebook; only the parameters it
    # leaves with requires_grad=True are handed to the optimizer below
    model = block_freeze(resnet18, 7)
    model.to(device)

    # initialize optimizer and the step-decay schedule under test
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                           lr = learning_rate)
    new_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                           step_size = step_size,
                                           gamma = gamma)

    # `dataloaders` is a notebook-level global that train_model_with_sched reads
    dataloaders = {x : torch.utils.data.DataLoader(image_datasets[x],
                                                   batch_size = batch_size,
                                                   shuffle = True,
                                                   num_workers = 4)
                   for x in ['train', 'val']}

    # train_model_with_sched returns the VALIDATION loss first, then the
    # training loss; unpacking in that order keeps the plotted curves labelled
    # correctly
    val_loss, train_loss, train_acc, val_acc = train_model_with_sched(model,
                                                                      criterion,
                                                                      optimizer,
                                                                      epochs,
                                                                      new_lr_scheduler)

    print("For ", epochs, "epochs")
    print("With a batch size of: ", batch_size)
    print("With learning rate of: ", learning_rate)
    print("With a scheduler step size of: ", step_size)
    print('-' * 15)

    # one row per epoch; the columns are named directly instead of relying on
    # the suffixes pandas generates when merging identically named columns
    df = pd.DataFrame({"training_loss" : train_loss,
                       "val_loss" : val_loss,
                       "training_acc" : train_acc,
                       "val_acc" : val_acc})
    df.loc[:,"epoch"] = df.index

    get_plots(df)



For  70 epochs
With a batch size of:  2
With learning rate of:  0.05
With a scheduler step size of:  5
---------------


For  70 epochs
With a batch size of:  2
With learning rate of:  0.05
With a scheduler step size of:  6
---------------


For  70 epochs
With a batch size of:  2
With learning rate of:  0.05
With a scheduler step size of:  7
---------------


For  70 epochs
With a batch size of:  2
With learning rate of:  0.05
With a scheduler step size of:  8
---------------


For  70 epochs
With a batch size of:  2
With learning rate of:  0.05
With a scheduler step size of:  9
---------------


For  70 epochs
With a batch size of:  2
With learning rate of:  0.05
With a scheduler step size of:  10
---------------


In [None]:
# Longest sweep of the StepLR `step_size` (7 to 10) over 150 epochs; `gamma`,
# the batch size and the base learning rate stay fixed. Every setting starts
# from a fresh model.
step_sizes = [7,8,9,10]
gamma = 0.1

batch_size = 2
# NOTE(review): the notebook's earlier markdown singles out 0.005 as the
# promising learning rate, but this sweep starts from 0.05 - confirm the larger
# value is intended as the pre-decay starting point for the schedule.
learning_rate = 0.05
epochs = 150

for step_size in step_sizes:
    # reset model: reload the pretrained weights and attach a new output layer
    # with one unit per class
    resnet18 = models.resnet18(pretrained = True)
    original_class_len = resnet18.fc.in_features
    resnet18.fc = nn.Linear(original_class_len, len(class_labels))
    # block_freeze is defined earlier in the notebook; only the parameters it
    # leaves with requires_grad=True are handed to the optimizer below
    model = block_freeze(resnet18, 7)
    model.to(device)

    # initialize optimizer and the step-decay schedule under test
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                           lr = learning_rate)
    new_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                           step_size = step_size,
                                           gamma = gamma)

    # `dataloaders` is a notebook-level global that train_model_with_sched reads
    dataloaders = {x : torch.utils.data.DataLoader(image_datasets[x],
                                                   batch_size = batch_size,
                                                   shuffle = True,
                                                   num_workers = 4)
                   for x in ['train', 'val']}

    # train_model_with_sched returns the VALIDATION loss first, then the
    # training loss; unpacking in that order keeps the plotted curves labelled
    # correctly
    val_loss, train_loss, train_acc, val_acc = train_model_with_sched(model,
                                                                      criterion,
                                                                      optimizer,
                                                                      epochs,
                                                                      new_lr_scheduler)

    print("For ", epochs, "epochs")
    print("With a batch size of: ", batch_size)
    print("With learning rate of: ", learning_rate)
    print("With a scheduler step size of: ", step_size)
    print('-' * 15)

    # one row per epoch; the columns are named directly instead of relying on
    # the suffixes pandas generates when merging identically named columns
    df = pd.DataFrame({"training_loss" : train_loss,
                       "val_loss" : val_loss,
                       "training_acc" : train_acc,
                       "val_acc" : val_acc})
    df.loc[:,"epoch"] = df.index

    get_plots(df)



For  150 epochs
With a batch size of:  2
With learning rate of:  0.05
With a scheduler step size of:  7
---------------


For  150 epochs
With a batch size of:  2
With learning rate of:  0.05
With a scheduler step size of:  8
---------------


For  150 epochs
With a batch size of:  2
With learning rate of:  0.05
With a scheduler step size of:  9
---------------


For  150 epochs
With a batch size of:  2
With learning rate of:  0.05
With a scheduler step size of:  10
---------------


Repeat for other model layouts
