
**Install requirements**

In [13]:
!pip3 install 'torch==1.3.1'
!pip3 install 'torchvision==0.4.2'
!pip3 install 'Pillow-SIMD'
!pip3 install 'tqdm'

!pip install --upgrade pillow

Requirement already up-to-date: pillow in /usr/local/lib/python3.6/dist-packages (6.2.1)


**Import libraries**

In [0]:
import os
import logging

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Subset, DataLoader
from torch.backends import cudnn

import torchvision
from torchvision import transforms
from torchvision.models import alexnet
from torchvision.models import vgg16
from torchvision.models import resnet50

from PIL import Image
from tqdm import tqdm

import copy

**Set Arguments**

In [0]:
DEVICE = 'cuda' # 'cuda' or 'cpu'

NUM_CLASSES = 101 # 101 + 1: There is an extra Background class that should be removed 

NETWORK = 'resnet'     # 'alexnet', 'vgg' or 'resnet'
BATCH_SIZE = 16      # Higher batch sizes allows for larger learning rates. An empirical heuristic suggests that, when changing
                     # the batch size, learning rate should change by the same factor to have comparable results

LR = 1e-3            # The initial Learning Rate
MOMENTUM = 0.9       # Hyperparameter for SGD, keep this at 0.9 when using SGD
NESTEROV = [False, True, False, False, False]
WEIGHT_DECAY = [5e-5, 1e-2, 5e-5, 5e-5, 5e-5]  # Regularization, you can keep this at the default

NUM_EPOCHS = [30, 30, 30, 30, 60]      # Total number of training epochs (iterations over dataset)
STEP_SIZE = [20, 20, 30, 10, 30]       # How many epochs before decreasing learning rate (if using a step-down policy)
GAMMA = [0.1, 0.1, 0.1, 0.01, 0.1]      # Multiplicative factor for learning rate step-down

LOG_FREQUENCY = 10

BESTNET = True
TRANSFER = True
FREEZING = 'no_freezing' # Can be 'no_freezing', 'freeze_conv', 'freeze_fc'
                         # 'freeze_conv' is suggested
DATA_AUGM = 'center' # Can be 'center', 'flip', 'random', 'random_flip'
                     # 'center' and 'random' refer to the crop
                     # definitions are cumulative wrt. center crop 
                     # It selects the hyperparameter set among the following ones:

hyperparams = 0     # 0. Default values
                    # 1. Nesterov momentum, weight decay = 0.01
                    # 2. step size = 30
                    # 3. step size = 10, gamma = 0.01 
                    # 4. epochs = 60

**Define Data Preprocessing**

In [0]:
if TRANSFER:
  mean = (0.485, 0.456, 0.406) # mean in ImageNet
  stdDev = (0.229, 0.224, 0.225) # standard deviation in ImageNet
else:
  mean = (0.5, 0.5, 0.5)
  stdDev = (0.5, 0.5, 0.5)  

# Define transforms for training phase
center_transform = transforms.Compose([transforms.Resize(256),      # Resizes short size of the PIL image to 256
                                      transforms.CenterCrop(224),  # Crops a central square patch of the image
                                                                   # 224 because torchvision's AlexNet needs a 224x224 input!
                                                                   # Remember this when applying different transformations, otherwise you get an error
                                      transforms.ToTensor(), # Turn PIL Image to torch.Tensor
                                      transforms.Normalize(mean, stdDev) # Normalizes tensor with mean and standard deviation
                                      ])

random_transform = transforms.Compose([transforms.Resize(256),      # Resizes short size of the PIL image to 256
                                      transforms.RandomCrop(224),  # Crops a random square patch of the image
                                      transforms.ToTensor(), # Turn PIL Image to torch.Tensor
                                      transforms.Normalize(mean, stdDev) # Normalizes tensor with mean and standard deviation
                                      ])

flip_transform = transforms.Compose([transforms.Resize(256),      # Resizes short size of the PIL image to 256
                                      transforms.RandomHorizontalFlip(1.0), 
                                      transforms.CenterCrop(224),  # Crops a random square patch of the image                                      
                                      transforms.ToTensor(), # Turn PIL Image to torch.Tensor
                                      transforms.Normalize(mean, stdDev) # Normalizes tensor with mean and standard deviation
                                      ])

random_flip_transform = transforms.Compose([transforms.Resize(256),      # Resizes short size of the PIL image to 256
                                      transforms.RandomCrop(224),  # Crops a random square patch of the image
                                      transforms.RandomHorizontalFlip(1.0), 
                                      transforms.ToTensor(), # Turn PIL Image to torch.Tensor
                                      transforms.Normalize(mean, stdDev) # Normalizes tensor with mean and standard deviation
                                      ])

# Define transforms for the evaluation phase
eval_transform = transforms.Compose([transforms.Resize(256),
                                      transforms.CenterCrop(224),
                                      transforms.ToTensor(),
                                      transforms.Normalize(mean, stdDev)                                  
])  

**Define Caltech**

In [0]:
from torchvision.datasets import VisionDataset

from PIL import Image

import os
import os.path
import sys

def pil_loader(path):
    # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
    with open(path, 'rb') as f:
        img = Image.open(f)
        return img.convert('RGB')

IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp')

def make_dataset(dir, class_to_idx, extensions=None, is_valid_file=None, split='train', transf='None'):
    images = []
    dir = os.path.expanduser(dir)
    if not ((extensions is None) ^ (is_valid_file is None)):
        raise ValueError("Both extensions and is_valid_file cannot be None or not None at the same time")
    if extensions is not None:
        def is_valid_file(x):
            return x.lower().endswith(extensions)

    inFile = os.path.join(dir, split + '.txt')
    with open(inFile, 'r') as f:      
      inImages = f.read().splitlines()

    root = os.path.join(dir, '101_ObjectCategories/')

    for fname in sorted(inImages):
      fpath = os.path.split(fname)
      target = fpath[0]
      path = os.path.join(root, fname)
      if is_valid_file(path) and target != 'BACKGROUND_Google':
        image = pil_loader(path)        
        if transf is not None:
            image = transf(image) # Applies preprocessing
        item = (image, class_to_idx[target])
        images.append(item)

    return images

class Caltech(VisionDataset):
    def __init__(self, root, split='train', transform=None, target_transform=None):
        super(Caltech, self).__init__(root, transform=transform, target_transform=target_transform)

        self.split = split # This defines the split you are going to use
                           # (split files are called 'train.txt' and 'test.txt')

        classes, class_to_idx = self._find_classes(self.root)
        samples = make_dataset(self.root, class_to_idx, IMG_EXTENSIONS, split=self.split, transf=transform)
        if len(samples) == 0:
            raise (RuntimeError("Found 0 files in subfolders of: " + self.root + "\n"
                                "Supported extensions are: " + ",".join(extensions)))


        self.classes = classes
        self.class_to_idx = class_to_idx
        self.samples = samples
        #self.targets = [s[1] for s in samples]

    def _find_classes(self, dir):        
        root = os.path.join(dir, '101_ObjectCategories/')
        if sys.version_info >= (3, 5):
            # Faster and available in Python 3.5 and above
            classes = [d.name for d in os.scandir(root) if d.is_dir()]
        else:          
            classes = [d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))]
        
        classes.remove('BACKGROUND_Google')
        classes.sort()                
        #print(classes)
        class_to_idx = {classes[i]: i for i in range(len(classes))}
        return classes, class_to_idx

    def __getitem__(self, index):
        '''
        __getitem__ should access an element through its index
        Args:
            index (int): Index

        Returns:
            tuple: (sample, target) where target is class_index of the target class.
        '''

        image, label = self.samples[index]        
        return image, label

    def __len__(self):        
        return len(self.samples)


**Prepare dataset**

In [18]:
# Clone github repository with data
if not os.path.isdir('./Homework2-Caltech101'):
  !git clone https://github.com/MachineLearning2020/Homework2-Caltech101.git
  
DATA_DIR = 'Homework2-Caltech101/'
train_dataset = Caltech(DATA_DIR, split='train', transform=center_transform)
test_dataset = Caltech(DATA_DIR, split='test', transform=center_transform)

if (DATA_AUGM == 'random' or DATA_AUGM == 'random_flip'):
  # use randomly cropped images, too
  augm_train_dataset = Caltech(DATA_DIR, split='train', transform=random_transform)
  augm_test_dataset = Caltech(DATA_DIR, split='test', transform=random_transform)
  train_dataset.samples += augm_train_dataset.samples

if (DATA_AUGM == 'flip' or DATA_AUGM == 'random_flip'):
  # use flipped images, too
  augm_train_dataset = Caltech(DATA_DIR, split='train', transform=flip_transform)
  augm_test_dataset = Caltech(DATA_DIR, split='test', transform=flip_transform)
  train_dataset.samples += augm_train_dataset.samples
  
print('Train Dataset: {}'.format(len(train_dataset)))
print('Test Dataset: {}'.format(len(test_dataset)))

Train Dataset: 5784
Test Dataset: 2893


**Create validation set**

In [19]:
val_dataset = Caltech(DATA_DIR, split='train', transform=center_transform)
val_dataset.samples = []
new_train_dataset = Caltech(DATA_DIR, split='train', transform=center_transform)
new_train_dataset.samples = []

lastLabel = 0
i = 0
for sample in train_dataset.samples:
  if (sample[1] == lastLabel):
    if (i%3 == 0):
      val_dataset.samples.append(sample)
    else:
      new_train_dataset.samples.append(sample)
    i += 1
  else:
    lastLabel = sample[1]
    val_dataset.samples.append(sample)
    i = 1 # we've already inserted a value into the validation set

print('Actual training set: {}'.format(len(new_train_dataset)))
print('Validation set: {}'.format(len(val_dataset)))


Actual training set: 3824
Validation set: 1960


**Prepare Dataloaders**

In [0]:
# Dataloaders iterate over pytorch datasets and transparently provide useful functions (e.g. parallelization and shuffling)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, drop_last=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, drop_last=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

**Prepare Network**

In [21]:
if (NETWORK == 'alexnet'):
  net = alexnet(pretrained=TRANSFER) # Loading AlexNet model
  # AlexNet has 1000 output neurons, corresponding to the 1000 ImageNet's classes
  # We need 101 outputs for Caltech-101
  net.classifier[6] = nn.Linear(4096, NUM_CLASSES) # nn.Linear in pytorch is a fully connected layer
                                                   # The convolutional layer is nn.Conv2d
elif (NETWORK == 'vgg'):
  net = vgg16(pretrained='imagenet')
  net.classifier[6] = nn.Linear(4096, NUM_CLASSES)

elif (NETWORK == 'resnet'):                                               
  net = resnet50(pretrained=TRANSFER)
  num_features=net.fc.in_features
  net.fc = nn.Linear(num_features, NUM_CLASSES)
  NUM_EPOCHS = [10, 10, 10, 10, 10]




Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /root/.cache/torch/checkpoints/resnet50-19c8e357.pth
100%|██████████| 97.8M/97.8M [00:02<00:00, 36.4MB/s]


**Prepare Training**

In [0]:
# Define loss function
criterion = nn.CrossEntropyLoss() # for classification, we use Cross Entropy

# Choose parameters to optimize
if (FREEZING == 'no_freezing'):
  parameters_to_optimize = net.parameters() # In this case we optimize over all the parameters of AlexNet
elif (FREEZING == 'freeze_conv'):
  parameters_to_optimize = net.classifier.parameters()
elif (FREEZING == 'freeze_fc'):
  parameters_to_optimize = net.features.parameters()

# Define optimizer
optimizer = optim.SGD(parameters_to_optimize, lr=LR, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY[hyperparams], nesterov=NESTEROV[hyperparams])
# Define scheduler
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE[hyperparams], gamma=GAMMA[hyperparams])

**Train**

In [23]:
print("You are using the following hyperparameters:")
print("Step size %d" % (scheduler.step_size))
print("Gamma %f" % (scheduler.gamma))
print("")

net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda
bestnet = 0
bestepoch = 0
bestaccuracy = 0

cudnn.benchmark # Calling this optimizes runtime

current_step = 0
# Start iterating over the epochs
for epoch in range(NUM_EPOCHS[hyperparams]):
  print('Starting epoch {}/{}, LR = {}'.format(epoch+1, NUM_EPOCHS[hyperparams], scheduler.get_lr()))
  
  # Iterate over the dataset
  for images, labels in train_dataloader:
    # Bring data over the device of choice
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)

    net.train() # Sets module in training mode

    # PyTorch, by default, accumulates gradients after each backward pass
    # We need to manually set the gradients to zero before starting a new iteration
    optimizer.zero_grad() # Zero-ing the gradients

    # Forward pass to the network
    outputs = net(images)

    # Compute loss based on output and ground truth
    loss = criterion(outputs, labels)

    # Log loss
    if current_step % LOG_FREQUENCY == 0:
      print('Step {}, Loss {}'.format(current_step, loss.item()))

    # Compute gradients for each layer and update weights
    loss.backward()  # backward pass: computes gradients
    optimizer.step() # update weights based on accumulated gradients

    current_step += 1

  # Evaluate accuracy on the validation set
  net.eval() # Set Network to evaluation mode

  running_corrects = 0
  for images, labels in val_dataloader:
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)

    # Forward Pass
    outputs = net(images)

    # Get predictions
    _, preds = torch.max(outputs.data, 1)

    # Update Corrects
    running_corrects += torch.sum(preds == labels.data).data.item()

  # Calculate Accuracy
  accuracy = running_corrects / float(len(val_dataset))
  
  print("Validation accuracy: %f (%d/%d)\n" % (accuracy, running_corrects, len(val_dataset)))
  print('Step {}, Loss {}'.format(current_step, loss.item()))
  if (accuracy > bestaccuracy):
    bestaccuracy = accuracy
    bestepoch = epoch
    bestnet = copy.deepcopy(net)
      
  # Step the scheduler
  scheduler.step()  

print("Best epoch: %d (%f accuracy)\n" % (bestepoch+1, bestaccuracy))      

You are using the following hyperparameters:
Step size 20
Gamma 0.100000

Starting epoch 1/10, LR = [0.001]
Step 0, Loss 4.674830913543701
Step 10, Loss 4.420759201049805
Step 20, Loss 3.8017258644104004
Step 30, Loss 3.6755781173706055
Step 40, Loss 3.170764207839966
Step 50, Loss 3.194246292114258
Step 60, Loss 3.2537007331848145
Step 70, Loss 3.3420164585113525
Step 80, Loss 2.756744861602783
Step 90, Loss 2.232487440109253
Step 100, Loss 2.5235977172851562
Step 110, Loss 2.60671067237854
Step 120, Loss 1.7183111906051636
Step 130, Loss 2.7336463928222656
Step 140, Loss 2.5290534496307373
Step 150, Loss 2.3971779346466064
Step 160, Loss 2.033663034439087
Step 170, Loss 3.055086851119995
Step 180, Loss 1.825255036354065
Step 190, Loss 1.8143584728240967
Step 200, Loss 1.4026204347610474
Step 210, Loss 1.5512109994888306
Step 220, Loss 1.1090075969696045
Step 230, Loss 1.419191837310791
Step 240, Loss 1.9981316328048706
Step 250, Loss 1.4559071063995361
Step 260, Loss 1.7639080286026


**Test**

In [24]:
if (BESTNET == False):
  bestnet = net # use the net as after the last epoch

bestnet = bestnet.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda
bestnet.train(False) # Set Network to evaluation mode

running_corrects = 0
for images, labels in tqdm(test_dataloader):
  images = images.to(DEVICE)
  labels = labels.to(DEVICE)

  # Forward Pass
  outputs = bestnet(images)

  # Get predictions
  _, preds = torch.max(outputs.data, 1)

  # Update Corrects
  running_corrects += torch.sum(preds == labels.data).data.item()

# Calculate Accuracy
accuracy = running_corrects / float(len(test_dataset))

print("Test Accuracy: %f (%d/%d)\n" % (accuracy, running_corrects, len(test_dataset)))

100%|██████████| 181/181 [00:06<00:00, 32.14it/s]

Test Accuracy: 0.959212 (2775/2893)




