# Incremental learning on image classification

## Libraries and packages


In [0]:
# TODO: explain the data transformations in more detail

In [1]:
!pip3 install 'torch==1.4.0'
!pip3 install 'torchvision==0.5.0'
!pip3 install 'Pillow-SIMD'
!pip3 install 'tqdm'

Collecting torch==1.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/24/19/4804aea17cd136f1705a5e98a00618cb8f6ccc375ad8bfa437408e09d058/torch-1.4.0-cp36-cp36m-manylinux1_x86_64.whl (753.4MB)
[K     |████████████████████████████████| 753.4MB 23kB/s 
[31mERROR: torchvision 0.6.0+cu101 has requirement torch==1.5.0, but you'll have torch 1.4.0 which is incompatible.[0m
[?25hInstalling collected packages: torch
  Found existing installation: torch 1.5.0+cu101
    Uninstalling torch-1.5.0+cu101:
      Successfully uninstalled torch-1.5.0+cu101
Successfully installed torch-1.4.0
Collecting torchvision==0.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/7e/90/6141bf41f5655c78e24f40f710fdd4f8a8aff6c8b7c6f0328240f649bdbe/torchvision-0.5.0-cp36-cp36m-manylinux1_x86_64.whl (4.0MB)
[K     |████████████████████████████████| 4.0MB 4.8MB/s 
Installing collected packages: torchvision
  Found existing installation: torchvision 0.6.0+cu101
    Uninstalling torchvision-0



In [0]:
import os
import urllib
import logging

import numpy as np

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import Subset, DataLoader, ConcatDataset
from torch.backends import cudnn

import torchvision
from torchvision import transforms
from torchvision.models import resnet34

from PIL import Image
from tqdm import tqdm

In [3]:
# GitHub credentials for cloning private repository
username = ''
password = ''

# Download packages from repository
password = urllib.parse.quote(password)
!git clone https://$username:$password@github.com/manuelemacchia/incremental-learning-image-classification.git
password = ''

!mv -v incremental-learning-image-classification/* .
!rm -rf incremental-learning-image-classification README.md

Cloning into 'incremental-learning-image-classification'...
remote: Enumerating objects: 231, done.[K
remote: Total 231 (delta 0), reused 0 (delta 0), pack-reused 231
Receiving objects: 100% (231/231), 1.51 MiB | 23.35 MiB/s, done.
Resolving deltas: 100% (113/113), done.
renamed 'incremental-learning-image-classification/data' -> './data'
renamed 'incremental-learning-image-classification/model' -> './model'
renamed 'incremental-learning-image-classification/notebook.ipynb' -> './notebook.ipynb'
renamed 'incremental-learning-image-classification/README.md' -> './README.md'
renamed 'incremental-learning-image-classification/utils' -> './utils'


In [4]:
from data.cifar100 import Cifar100
from model.resnet_cifar import resnet32
from model.manager import Manager
from utils import plot

  import pandas.util.testing as tm


## Arguments

In [0]:
# Directories
DATA_DIR = 'data'       # Directory where the dataset will be downloaded

# Settings
DEVICE = 'cuda'

# Dataset
RANDOM_STATE = 420      # For reproducibility of results
                        # Note: different random states give very different
                        # splits and therefore very different results.

NUM_CLASSES = 100       # Total number of classes
CLASS_BATCH_SIZE = 10   # Size of batch of classes for incremental learning

# The classes must split evenly into batches, otherwise some classes would
# never be presented to the network.
assert NUM_CLASSES % CLASS_BATCH_SIZE == 0, \
    'NUM_CLASSES must be a multiple of CLASS_BATCH_SIZE'

# Number of incremental steps; derived so it cannot drift out of sync with
# the two values above.
NUM_BATCHES = NUM_CLASSES // CLASS_BATCH_SIZE

VAL_SIZE = 0.1          # Proportion of validation set with respect to training set (between 0 and 1)

# Training
BATCH_SIZE = 64         # Batch size (iCaRL sets this to 128)
LR = 0.2                # Initial learning rate
                        # iCaRL sets LR = 2. Since they use BinaryCrossEntropy loss it is feasible,
                        # in our case it would diverge as we use CrossEntropy loss.
MOMENTUM = 0.9          # Momentum for stochastic gradient descent (SGD)
WEIGHT_DECAY = 1e-5     # Weight decay from iCaRL

NUM_RUNS = 3            # Number of runs of every method
                        # Note: this should be at least 3 to have a fair benchmark

NUM_EPOCHS = 70         # Total number of training epochs
MILESTONES = [49, 63]   # Step down policy from iCaRL (MultiStepLR)
                        # Decrease the learning rate by gamma at each milestone
GAMMA = 0.2             # Gamma factor from iCaRL

## Data preparation

In [0]:
# Training pipeline: random 32x32 crop taken after padding the image by
# 4 pixels, random horizontal flip, conversion to tensor, then per-channel
# normalization with mean 0.5 and std 0.5.
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),  # PIL Image -> torch.Tensor
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Evaluation pipeline: no augmentation, only tensor conversion and the same
# normalization used for training.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

In [7]:
# One list of dataloaders per run; each inner list gets one loader per split
train_dataloaders = [[] for _ in range(NUM_RUNS)]
val_dataloaders = [[] for _ in range(NUM_RUNS)]
test_dataloaders = [[] for _ in range(NUM_RUNS)]

for run_i in range(NUM_RUNS):

  # Iterate over the incremental steps (batches of classes), not over the
  # number of classes inside a batch
  for split_i in range(NUM_BATCHES):

    # Download dataset only at first instantiation
    download = (run_i == 0 and split_i == 0)

    # Create CIFAR100 dataset; every run uses its own random state
    train_dataset = Cifar100(DATA_DIR, train = True, download = download, random_state = RANDOM_STATE+run_i, transform=train_transform)
    test_dataset = Cifar100(DATA_DIR, train = False, download = False, random_state = RANDOM_STATE+run_i, transform=test_transform)

    # Training set: only the classes of the current split.
    # Test set: the class batches of every split up to and including this one.
    train_dataset.set_classes_batch(train_dataset.batch_splits[split_i])
    test_dataset.set_classes_batch([test_dataset.batch_splits[i] for i in range(0, split_i+1)])

    # Define train and validation indices
    train_indices, val_indices = train_dataset.train_val_split(VAL_SIZE, RANDOM_STATE)

    train_dataloaders[run_i].append(DataLoader(Subset(train_dataset, train_indices),
                               batch_size=BATCH_SIZE, shuffle=True, num_workers=4, drop_last=True))

    # NOTE(review): the validation subset is drawn from train_dataset, so it
    # goes through train_transform (random crop/flip), and drop_last=True
    # discards the final incomplete batch -- confirm both are intended.
    val_dataloaders[run_i].append(DataLoader(Subset(train_dataset, val_indices),
                                batch_size=BATCH_SIZE, shuffle=True, num_workers=4, drop_last=True))

    # Dataset with all seen classes. Kept in dataset order (no shuffling) so
    # that predictions collected batch by batch stay aligned with
    # test_dataset.targets, which the confusion-matrix cell relies on.
    test_dataloaders[run_i].append(DataLoader(test_dataset,
                               batch_size=BATCH_SIZE, shuffle=False, num_workers=4))

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to data/cifar-100-python.tar.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting data/cifar-100-python.tar.gz to data


In [0]:
# Sanity check: visualize a batch of images
dataiter = iter(test_dataloaders[0][5])
# Use the built-in next(): the iterator protocol only guarantees __next__,
# and a `.next()` method is not part of it.
images, labels = next(dataiter)

plot.image_grid(images, one_channel=False)

# Which labels appear in the batch, and how many times each
unique_labels = np.unique(labels.numpy(), return_counts=True)
unique_labels

## Fine tuning

In [0]:
# @todo try xavier initialization 

In [0]:
# Score containers: one entry per run, each a dict keyed by split name
train_loss_history = []
train_accuracy_history = []
val_loss_history = []
val_accuracy_history = []
test_accuracy_history = []

# Iterate over runs
for train_dataloader, val_dataloader, test_dataloader in zip(train_dataloaders,
                                                             val_dataloaders, test_dataloaders):

    # Start a fresh score dictionary for this run
    train_loss_history.append({})
    train_accuracy_history.append({})
    val_loss_history.append({})
    val_accuracy_history.append({})
    test_accuracy_history.append({})

    net = resnet32()  # Define the net

    criterion = nn.CrossEntropyLoss()  # Define the loss

    # In this case we optimize over all the parameters of Resnet
    parameters_to_optimize = net.parameters()
    # NOTE(review): optimizer and scheduler are created once per run and are
    # shared by all splits. Unless Manager resets them, the MultiStepLR
    # milestones are only crossed during the first split and later splits
    # train at the decayed learning rate -- confirm this is intended.
    optimizer = optim.SGD(parameters_to_optimize, lr=LR,
                          momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                               milestones=MILESTONES, gamma=GAMMA)

    # Iterate over the incremental splits of this run
    for i, (train_split, val_split, test_split) in enumerate(zip(train_dataloader,
                                                                 val_dataloader, test_dataloader)):

        current_split = "Split %i"%(i)
        print(current_split)

        # Define Manager Object
        manager = Manager(DEVICE, net, criterion, optimizer, scheduler,
                          train_split, val_split, test_split)

        scores = manager.train(NUM_EPOCHS)  # train the model

        # scores = (train loss, train accuracy, val loss, val accuracy)
        train_loss_history[-1][current_split] = scores[0]
        train_accuracy_history[-1][current_split] = scores[1]
        val_loss_history[-1][current_split] = scores[2]
        val_accuracy_history[-1][current_split] = scores[3]

        # Test the model on classes seen until now
        test_accuracy, all_preds = manager.test()

        test_accuracy_history[-1][current_split] = test_accuracy

        # Uncomment if default resnet has 10 node at last FC layer
        #manager.increment_classes(n=10)  # add 10 nodes to last FC layer

In [0]:
# Confusion matrix over the test predictions of the last split of the last run.
# `all_preds` and `test_dataset` are the values left behind by the training
# and data-preparation loops above.
preds = all_preds.cpu().numpy()

# NOTE(review): targets are taken in dataset order; they only line up with
# `preds` if the test loader iterated in that same order (i.e. it was not
# shuffled) -- confirm.
targets = test_dataset.targets

plot.heatmap_cm(targets, preds)

In [0]:
def mean_std_scores(train_loss_history, train_accuracy_history,
                   val_loss_history, val_accuracy_history, test_accuracy_history):
  '''
  Aggregate the scores of several runs, split by split.

  Args:
    train_loss_history, train_accuracy_history, val_loss_history,
    val_accuracy_history, test_accuracy_history: lists with one entry per
      run; every entry is a dict mapping a split name (e.g. 'Split 0') to
      the score(s) recorded for that split.

  Returns:
    Tuple (train_loss, train_accuracy, val_loss, val_accuracy, test_accuracy)
    of numpy arrays. Each array has one row per split, in the key order of
    the first run of train_loss_history, and two columns: [mean, std],
    computed over every value recorded for that split across all runs.

  The number of runs is taken from train_loss_history itself rather than from
  a notebook-level constant, so histories with any number of runs (for
  example ones loaded back from disk) can be aggregated.
  '''
  split_names = list(train_loss_history[0].keys())
  num_runs = len(train_loss_history)

  def aggregate(history):
    # One [mean, std] row per split, pooling the values of all runs
    rows = []
    for name in split_names:
      values = np.array([history[run][name] for run in range(num_runs)])
      rows.append([values.mean(), values.std()])
    return np.array(rows)

  return (aggregate(train_loss_history),
          aggregate(train_accuracy_history),
          aggregate(val_loss_history),
          aggregate(val_accuracy_history),
          aggregate(test_accuracy_history))

In [0]:
# Collapse the per-run histories into one [mean, std] row per split
(train_loss, train_accuracy, val_loss,
 val_accuracy, test_accuracy) = mean_std_scores(
    train_loss_history, train_accuracy_history,
    val_loss_history, val_accuracy_history, test_accuracy_history)

In [0]:
# Plot the aggregated training and validation loss/accuracy.
# NOTE(review): the trailing None is passed through unchanged; its meaning is
# defined in utils.plot -- confirm there.
plot.train_val_scores(train_loss, train_accuracy, val_loss, val_accuracy, None)

In [0]:
# Plot the aggregated test accuracy.
# NOTE(review): the trailing None is passed through unchanged; its meaning is
# defined in utils.plot -- confirm there.
plot.test_scores(test_accuracy, None)

In [0]:
# @todo: create utils package for functions

import ast

def load_json_scores(root):
  '''
  Load the five score histories stored as JSON files inside `root`.

  The files read are 'train_loss_history.json', 'train_accuracy_history.json',
  'val_loss_history.json', 'val_accuracy_history.json' and
  'test_accuracy_history.json'.

  Args:
    root: directory containing the JSON files.

  Returns:
    Tuple (train_loss_history, train_accuracy_history, val_loss_history,
    val_accuracy_history, test_accuracy_history).

  Raises:
    FileNotFoundError: if one of the files is missing.
    json.JSONDecodeError: if one of the files is not valid JSON.
  '''
  # Imported here so the function does not depend on another cell having
  # imported json first.
  import json

  def read(name):
    # Parse with the json module: unlike ast.literal_eval it understands
    # every token json.dump can emit (NaN, Infinity, true, false, null).
    with open(os.path.join(root, name + '.json')) as f:
      return json.load(f)

  return (read('train_loss_history'), read('train_accuracy_history'),
          read('val_loss_history'), read('val_accuracy_history'),
          read('test_accuracy_history'))

In [0]:
# @todo: create utils package for functions
import json

def save_json_scores(root, train_loss_history, train_accuracy_history,
                   val_loss_history, val_accuracy_history, test_accuracy_history):
    '''
    Write the five score histories as JSON files inside `root`.

    One file is written per history, named after the argument
    ('train_loss_history.json', 'train_accuracy_history.json', ...).
    Existing files with the same names are overwritten.

    Args:
        root: destination directory; it is created (including parents) when
            it does not exist yet.
        train_loss_history, train_accuracy_history, val_loss_history,
        val_accuracy_history, test_accuracy_history: JSON-serializable
            score containers.
    '''
    # Create the destination on demand so the first save on a fresh machine
    # does not fail. An empty root means "current directory".
    if root:
        os.makedirs(root, exist_ok=True)

    histories = {
        'train_loss_history': train_loss_history,
        'train_accuracy_history': train_accuracy_history,
        'val_loss_history': val_loss_history,
        'val_accuracy_history': val_accuracy_history,
        'test_accuracy_history': test_accuracy_history,
    }

    for name, history in histories.items():
        with open(os.path.join(root, name + '.json'), 'w') as fout:
            json.dump(history, fout)

In [0]:
# Persist the raw per-run score histories under the 'scores' directory
save_json_scores(
    'scores',
    train_loss_history,
    train_accuracy_history,
    val_loss_history,
    val_accuracy_history,
    test_accuracy_history,
)

In [0]:
# Bundle the saved score files into a single archive
!zip -r scores.zip scores
# NOTE(review): `files` is not imported anywhere in the visible notebook.
# This presumably relies on `from google.colab import files` -- confirm;
# without that import this line raises NameError.
files.download("scores.zip")

## Learning Without Forgetting

In [0]:
from torch.nn import functional as F
from torch.nn import BCEWithLogitsLoss

'''BCE formulation:
 let x = logits, z = labels. The logistic loss is

  z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
'''


class LWF():
  '''
  Work-in-progress trainer for Learning Without Forgetting.

  Holds the network being trained, the network of the previous incremental
  step (used as distillation teacher) and the training utilities. Only
  `to_onehot` and `do_batch` are implemented; the remaining methods are
  placeholders.
  '''

  def __init__(self, device, net, old_net, criterion, optimizer, scheduler,
               temperature, train_dataloader, val_dataloader, test_dataloader):
    '''
    Args:
      device: device the batches are moved to in do_batch.
      net: network being trained.
      old_net: network of the previous step; None for first ten classes.
      criterion: accepted for interface compatibility but NOT used; the
        classification loss is always BCEWithLogitsLoss.
      optimizer: optimizer whose gradients are reset in do_batch.
      scheduler: learning rate scheduler (stored only).
      temperature: distillation temperature (stored only).
      train_dataloader, val_dataloader, test_dataloader: stored only.
    '''
    self.device = device

    self.net = net
    self.best_net = self.net  # Same object as net: no copy is made here
    self.old_net = old_net  # None for first ten classes

    self.criterion = BCEWithLogitsLoss()  # Classifier criterion
    self.optimizer = optimizer
    self.scheduler = scheduler

    self.temperature = temperature

    self.train_dataloader = train_dataloader
    self.val_dataloader = val_dataloader
    self.test_dataloader = test_dataloader

  def to_onehot(self, targets, num_classes=100): #set = 10 for incremental net
    '''
    Turn class indices into one-hot float rows.

    Args:
      targets: class indices (tensor, list or int), e.g.
        dataloader.dataset.targets of the new task images.
      num_classes: width of every one-hot row.

    Returns:
      Float tensor with one row of length num_classes per target, placed on
      the same device as the targets.
    '''
    targets = torch.as_tensor(targets).long()

    # Build the identity on the targets' device: indexing a CPU tensor with
    # indices that live on another device is not supported.
    identity = torch.eye(num_classes, device=targets.device)

    return identity[targets]


  def distillation_loss(self, outputs, one_hot_targets):
    '''
    Distillation term of the loss. Not implemented yet.

    Args:
      outputs (tensor): predictions of the old network on the new data
      one_hot_targets (tensor): one-hot labels of the batch

    The temperature (T parameter in Hinton distillation loss) is available
    as self.temperature.

    Note: BCEWithLogitsLoss takes raw logits (network outputs before any
    sigmoid/softmax) and internally combines a Sigmoid layer + a BCE loss.

    Raises:
      NotImplementedError: always, so that a missing distillation term is
        reported explicitly instead of silently yielding None.
    '''
    raise NotImplementedError('LWF.distillation_loss is not implemented yet')

  def warm_up(self):
    '''Placeholder: not implemented yet, does nothing.'''
    pass


  def do_batch(self, images, labels):
    '''
    Forward and backward pass over a single batch.

    Gradients are reset, the classification loss (plus the distillation loss
    when an old network is available) is computed and back-propagated.
    The optimizer is NOT stepped here: the caller is responsible for calling
    optimizer.step().

    Args:
      images (tensor): batch of input images
      labels (tensor): class index of every image

    Returns:
      Tuple (loss value as float, number of correctly classified images).

    Raises:
      NotImplementedError: when old_net is set, because the distillation
        loss is not implemented yet.
    '''
    # @ TODO net.train and old_net.train in do_epoch

    # Tensor.to() returns the moved tensor, it does not move it in place
    images = images.to(self.device)
    labels = labels.to(self.device)

    # Zero-ing the gradients
    self.optimizer.zero_grad()

    # Training net forward pass
    outputs = self.net(images)

    # One-hot rows as wide as the network output (100 without incremental net)
    one_hot_labels = self.to_onehot(labels, num_classes=outputs.size(1))

    # Sigmoid embedded in the loss
    loss = self.criterion(outputs, one_hot_labels)

    # Old net forward pass: there is no old network for the first batch of
    # classes, hence nothing to distill from
    if self.old_net is not None:
      old_outputs = self.old_net(images)
      loss = loss + self.distillation_loss(old_outputs, one_hot_labels)

    # Get predictions
    _, preds = torch.max(outputs.detach(), 1)

    # Compute the number of correctly classified images
    running_corrects = torch.sum(preds == labels).item()

    # Backward pass: computes gradients. The gradient is linear, so a single
    # backward pass on the summed loss equals separate passes on each term.
    loss.backward()

    return loss.item(), running_corrects


  def do_epoch(self):
    '''Placeholder: not implemented yet, does nothing.'''
    pass


  def do_train(self):
    '''Placeholder: not implemented yet, does nothing.'''
    pass


  def validate(self):
    '''Placeholder: not implemented yet, does nothing.'''
    pass


  def test(self):
    '''Placeholder: not implemented yet, does nothing.'''
    pass

In [0]:
# Distillation temperature handed to LWF (T = 2 is the value reported in the
# Learning without Forgetting paper -- adjust as needed)
TEMPERATURE = 2

net = resnet32()

# Move the network to the device before the optimizer is built from its
# parameters
net.to(DEVICE)
net.train()

criterion = nn.CrossEntropyLoss()  # Define the loss

# In this case we optimize over all the parameters of Resnet
parameters_to_optimize = net.parameters()
optimizer = optim.SGD(parameters_to_optimize, lr=LR,
                      momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                            milestones=MILESTONES, gamma=GAMMA)

# Raw network outputs of every batch, kept for inspection in the cells below
out = []

# LWF expects the temperature between the scheduler and the dataloaders
lwf = LWF(DEVICE, net, None, criterion, optimizer, scheduler, TEMPERATURE,
          train_dataloaders[0][0], val_dataloaders[0][0], test_dataloaders[0][0])

# Number of correctly classified images accumulated over the whole pass
running_corrects = 0

for batch, labels in train_dataloaders[0][0]:
  batch = batch.to(DEVICE)
  labels = labels.to(DEVICE)

  # Zero-ing the gradients
  optimizer.zero_grad()

  # Forward pass
  outputs = net(batch)
  out.append(outputs)
  loss = criterion(outputs, labels)

  # One-hot encoding of the labels; built from a CPU copy of the labels so
  # the lookup does not depend on the device the labels live on
  one_hot = lwf.to_onehot(labels.cpu(), 10)

  # Get predictions
  _, preds = torch.max(outputs.data, 1)

  # Compute the number of correctly classified images
  running_corrects += torch.sum(preds == labels.data).data.item()

  # Backward pass: computes gradients
  loss.backward()

  # Update weights based on accumulated gradients
  optimizer.step()

In [29]:
# Shape of the one-hot labels built for the last training batch
one_hot.size()

torch.Size([64, 10])

In [12]:
# Raw network outputs stored for the first batch
print(out[0])


tensor([[-0.7995, -0.9830,  0.1291,  ...,  0.0597,  1.3356, -0.3800],
        [-1.0638, -0.5950, -0.0150,  ...,  0.3233,  1.5460, -0.3869],
        [-0.9680, -0.9738,  0.1185,  ...,  0.1212,  1.4894, -0.3766],
        ...,
        [-1.1317, -0.8696, -0.0582,  ...,  0.1562,  1.8088, -0.6589],
        [-0.9746, -0.9958,  0.1708,  ...,  0.1280,  1.7715, -0.5349],
        [-0.8641, -0.8057,  0.2574,  ...,  0.4364,  1.1935, -0.3431]],
       device='cuda:0', grad_fn=<AddmmBackward>)


In [13]:
# Same outputs accessed through .data
print(out[0].data)

tensor([[-0.7995, -0.9830,  0.1291,  ...,  0.0597,  1.3356, -0.3800],
        [-1.0638, -0.5950, -0.0150,  ...,  0.3233,  1.5460, -0.3869],
        [-0.9680, -0.9738,  0.1185,  ...,  0.1212,  1.4894, -0.3766],
        ...,
        [-1.1317, -0.8696, -0.0582,  ...,  0.1562,  1.8088, -0.6589],
        [-0.9746, -0.9958,  0.1708,  ...,  0.1280,  1.7715, -0.5349],
        [-0.8641, -0.8057,  0.2574,  ...,  0.4364,  1.1935, -0.3431]],
       device='cuda:0')


In [11]:
# Outputs of the last sample of the first batch
print(out[0][-1])

tensor([-8.6408e-01, -8.0572e-01,  2.5741e-01,  9.5915e-01,  1.2186e-01,
        -2.5116e-01,  3.5747e-01,  2.7579e-01,  2.0296e+00,  4.7976e-01,
         9.0585e-01,  2.7405e-01, -1.1492e-01,  4.8443e-02, -6.7335e-01,
         4.3757e-01,  2.6613e-01, -4.5360e-01, -2.4909e-01, -2.1453e-01,
         4.5871e-01,  5.6763e-01, -2.1225e-01, -2.0759e-01,  5.5339e-01,
        -1.0563e+00, -1.6482e-01, -2.2758e-01,  7.0833e-01,  6.5000e-02,
        -3.3487e-01,  6.3127e-01, -1.0974e+00,  2.6798e-01,  5.7368e-01,
         9.3862e-01,  4.0857e-01, -6.8902e-01,  1.0784e+00, -7.1559e-01,
         7.4092e-01, -6.0035e-02,  2.5247e-01,  9.7254e-01, -3.0541e-01,
         4.7372e-01, -7.1319e-02,  2.5955e-01,  8.5564e-01, -5.4694e-01,
        -7.1975e-03,  2.4284e-01,  1.1638e+00,  6.0530e-02, -1.2561e-01,
        -5.6042e-01,  3.1759e-01,  1.4026e-01,  1.0724e+00,  1.0589e+00,
        -1.9121e-01, -2.6533e-01, -5.1522e-01, -3.0642e-01,  2.7391e-01,
         4.6811e-01,  4.0740e-02, -2.7171e-01, -4.1

In [14]:
# Type of the object returned by .data
print(type(out[0].data))

<class 'torch.Tensor'>
