# Incremental learning on image classification
**Ablation studies**

## Libraries and packages


In [1]:
# Install the notebook's dependencies. torch and torchvision are pinned to
# exact versions; keep the two pins in sync when upgrading either of them.
!pip3 install 'torch==1.4.0'
!pip3 install 'torchvision==0.5.0'
# NOTE(review): Pillow-SIMD and tqdm are not pinned — consider pinning them
# as well so that a fresh environment reproduces the same setup.
!pip3 install 'Pillow-SIMD'
!pip3 install 'tqdm'

Collecting torch==1.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/24/19/4804aea17cd136f1705a5e98a00618cb8f6ccc375ad8bfa437408e09d058/torch-1.4.0-cp36-cp36m-manylinux1_x86_64.whl (753.4MB)
[K     |████████████████████████████████| 753.4MB 22kB/s 
[31mERROR: torchvision 0.6.0+cu101 has requirement torch==1.5.0, but you'll have torch 1.4.0 which is incompatible.[0m
[?25hInstalling collected packages: torch
  Found existing installation: torch 1.5.0+cu101
    Uninstalling torch-1.5.0+cu101:
      Successfully uninstalled torch-1.5.0+cu101
Successfully installed torch-1.4.0
Collecting torchvision==0.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/7e/90/6141bf41f5655c78e24f40f710fdd4f8a8aff6c8b7c6f0328240f649bdbe/torchvision-0.5.0-cp36-cp36m-manylinux1_x86_64.whl (4.0MB)
[K     |████████████████████████████████| 4.0MB 2.6MB/s 
Installing collected packages: torchvision
  Found existing installation: torchvision 0.6.0+cu101
    Uninstalling torchvision-0



In [1]:
import os
import urllib
import logging

import numpy as np

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import Dataset, Subset, DataLoader, ConcatDataset
from torch.backends import cudnn

import torchvision
from torchvision import transforms
from torchvision.models import resnet34

from PIL import Image
from tqdm import tqdm

from copy import deepcopy

from sklearn.metrics import confusion_matrix

In [3]:
# GitHub credentials for cloning private repository
username = 'LilMowgli'
password = '_Kora3030_'

# Download packages from repository
password = urllib.parse.quote(password)
!git clone https://$username:$password@github.com/manuelemacchia/incremental-learning-image-classification.git
password = ''

!mv -v incremental-learning-image-classification/* .
!rm -rf incremental-learning-image-classification README.md

Cloning into 'incremental-learning-image-classification'...
remote: Enumerating objects: 176, done.[K
remote: Counting objects: 100% (176/176), done.[K
remote: Compressing objects: 100% (78/78), done.[K
remote: Total 620 (delta 96), reused 154 (delta 85), pack-reused 444[K
Receiving objects: 100% (620/620), 2.61 MiB | 2.42 MiB/s, done.
Resolving deltas: 100% (325/325), done.
renamed 'incremental-learning-image-classification/data' -> './data'
renamed 'incremental-learning-image-classification/icarlSVM.ipynb' -> './icarlSVM.ipynb'
renamed 'incremental-learning-image-classification/joint_training.ipynb' -> './joint_training.ipynb'
renamed 'incremental-learning-image-classification/losses' -> './losses'
renamed 'incremental-learning-image-classification/model' -> './model'
renamed 'incremental-learning-image-classification/notebook.ipynb' -> './notebook.ipynb'
renamed 'incremental-learning-image-classification/README.md' -> './README.md'
renamed 'incremental-learning-image-classificat

In [2]:
from data.cifar100 import Cifar100
from model.resnet_cifar import resnet32
from model.manager import Manager
from model.icarl import Exemplars
from model.icarl import iCaRL
from utils import plot

  import pandas.util.testing as tm


## Arguments

In [15]:
# ---------------------------------------------------------------------------
# Paths and hardware
# ---------------------------------------------------------------------------
DATA_DIR = 'data'   # Download location of the dataset
DEVICE = 'cuda'

# ---------------------------------------------------------------------------
# Dataset and incremental protocol
# ---------------------------------------------------------------------------
RANDOM_STATE = None

# One random state per run, for reproducibility of results.
# Note: different random states give very different class splits and
# therefore very different results.
RANDOM_STATES = [658, 423, 422]

NUM_CLASSES = 100        # Total number of classes
NUM_BATCHES = 10
CLASS_BATCH_SIZE = 10    # Number of classes added at each incremental step

VAL_SIZE = 0.1           # Validation share of the training set (between 0 and 1)

# ---------------------------------------------------------------------------
# Benchmark
# ---------------------------------------------------------------------------
# Number of runs of every method.
# Note: this should be at least 3 to have a fair benchmark.
NUM_RUNS = 3

# ---------------------------------------------------------------------------
# Optimization
# ---------------------------------------------------------------------------
BATCH_SIZE = 64          # Mini-batch size (iCaRL sets this to 128)
LR = 2                   # Initial learning rate
MOMENTUM = 0.9           # Momentum for stochastic gradient descent (SGD)
WEIGHT_DECAY = 1e-5      # Weight decay from iCaRL

# Step-down learning rate policy from iCaRL (MultiStepLR): the learning rate
# is decreased by GAMMA at each milestone epoch.
NUM_EPOCHS = 70          # Total number of training epochs
MILESTONES = [49, 63]
GAMMA = 0.2              # Gamma factor from iCaRL

## Data preparation

In [4]:
# Image pipelines. Training images are augmented with a random 32x32 crop
# (after 4 pixels of padding) and a random horizontal flip; test images are
# only converted. Both pipelines normalize every channel with mean 0.5 and
# standard deviation 0.5.
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),  # PIL Image -> torch.Tensor
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

test_transform = transforms.Compose([
    transforms.ToTensor(),  # PIL Image -> torch.Tensor
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

In [5]:
# Per-run, per-split datasets, indexed as <name>[run_i][split_i].
train_subsets = [[] for _ in range(NUM_RUNS)]
val_subsets = [[] for _ in range(NUM_RUNS)]
test_subsets = [[] for _ in range(NUM_RUNS)]

for run_i in range(NUM_RUNS):
    # Iterate over the incremental splits (NUM_BATCHES of them), not over the
    # number of classes per split.
    for split_i in range(NUM_BATCHES):
        # Download the dataset only at the very first instantiation, so that
        # the notebook also works on a machine where DATA_DIR is still empty.
        download = (run_i == 0 and split_i == 0)

        # Create CIFAR100 dataset
        train_dataset = Cifar100(DATA_DIR, train=True, download=download, random_state=RANDOM_STATES[run_i], transform=train_transform)
        test_dataset = Cifar100(DATA_DIR, train=False, download=False, random_state=RANDOM_STATES[run_i], transform=test_transform)

        # Training data: only the classes of the current split.
        # Test data: the classes of every split seen so far (0..split_i).
        train_dataset.set_classes_batch(train_dataset.batch_splits[split_i])
        test_dataset.set_classes_batch([test_dataset.batch_splits[i] for i in range(0, split_i+1)])

        # Define train and validation indices
        train_indices, val_indices = train_dataset.train_val_split(VAL_SIZE, RANDOM_STATES[run_i])

        # Define subsets.
        # NOTE(review): the validation subset wraps train_dataset, so its
        # images go through train_transform (random crop/flip) — confirm this
        # is intended.
        train_subsets[run_i].append(Subset(train_dataset, train_indices))
        val_subsets[run_i].append(Subset(train_dataset, val_indices))
        test_subsets[run_i].append(test_dataset)

## Classifiers

### K-nearest neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

class iCaRLwithKNN(iCaRL):
    """iCaRL variant whose classifier is a k-nearest-neighbors model fitted on
    L2-normalized features extracted by the network."""

    def _normalized_features(self, X):
        """Extract features for a batch of images and L2-normalize each row.

        Args:
            X (np.ndarray): batch of images, as returned by dataset_to_numpy
        Returns:
            np.ndarray: one feature vector per image, scaled to unit L2 norm
        """

        features = self.extract_features(torch.tensor(X, dtype=torch.float))

        # Vectorized row-wise normalization. normalize() divides by
        # max(norm, eps), so an all-zero feature vector stays at zero instead
        # of turning into NaNs (which the KNN classifier would reject).
        features = nn.functional.normalize(features, p=2, dim=1)

        return features.detach().to('cpu').numpy()

    def classifier_fit(self, train_dataset, n_neighbors):
        """Fit classifier on the union of training dataset and exemplars.

        Args:
            train_dataset: training set of the current split
            n_neighbors (int): number of neighbors used by the KNN classifier
        """

        # Union of training dataset and exemplars
        exemplars_dataset = Exemplars(self.exemplars, self.train_transform)
        train_dataset_with_exemplars = ConcatDataset([exemplars_dataset, train_dataset])

        # Convert dataset to numpy format
        # X contains training samples, y contains labels
        X, y = self.dataset_to_numpy(train_dataset_with_exemplars)

        # Extract normalized features from the training dataset
        X_features = self._normalized_features(X)

        self.clf = KNeighborsClassifier(n_neighbors=n_neighbors)
        self.clf.fit(X_features, y)

    def classifier_predict(self, test_dataset):
        """Predict labels of test_dataset.

        Returns:
            y_test (np.ndarray): ground-truth labels of test_dataset
            y_pred (np.ndarray): labels predicted by the fitted classifier
        """

        X_test, y_test = self.dataset_to_numpy(test_dataset)

        # Extract normalized features from the test set
        X_test_features = self._normalized_features(X_test)

        y_pred = self.clf.predict(X_test_features)

        return y_test, y_pred

    def dataset_to_numpy(self, dataset):
        """Load a whole dataset in memory as numpy arrays.

        Args:
            dataset: dataset yielding (image, label) pairs, where every image
                is a tensor of shape (3, 32, 32)
        Returns:
            X (np.ndarray): images, shape (len(dataset), 3, 32, 32)
            y (np.ndarray): integer labels, shape (len(dataset),)
        """

        # Preallocate arrays
        X = np.zeros((len(dataset), 3, 32, 32))
        y = np.zeros(len(dataset), dtype=int)

        # Read the samples in order, many at a time instead of one by one
        dataloader = DataLoader(dataset, batch_size=256)

        start = 0
        for images, labels in dataloader:
            end = start + images.size(0)
            X[start:end] = images.numpy()
            y[start:end] = labels.numpy()
            start = end

        return X, y

    def test_knn(self, test_dataset, train_dataset, n_neighbors):
        """Test the model with the KNN classifier.

        Args:
            test_dataset: dataset on which to test the network
            train_dataset: training set used to train the last split
            n_neighbors (int): number of neighbors used by the KNN classifier
        Returns:
            accuracy (float): accuracy of the model on the test set
            y_pred (torch.tensor): predicted labels, in test_dataset order
        """

        self.net.train(False)
        if self.best_net is not None: self.best_net.train(False)  # Set Network to evaluation mode
        if self.old_net is not None: self.old_net.train(False)

        with torch.no_grad():
            self.classifier_fit(train_dataset, n_neighbors=n_neighbors)
            y_truth, y_pred = self.classifier_predict(test_dataset)

        # Calculate accuracy
        accuracy = accuracy_score(y_truth, y_pred)

        print(f"Test accuracy (iCaRL with KNN): {accuracy} ")

        return accuracy, torch.tensor(y_pred)

In [None]:
# Run the KNN ablation: for every run, train incrementally on each split and
# evaluate both the classic iCaRL classifier and the KNN classifier.
# logs[run_i][split_i] stores accuracies and confusion matrices.
logs = [[] for _ in range(NUM_RUNS)]

for run_i in range(NUM_RUNS):
    net = resnet32()
    icarl_knn = iCaRLwithKNN(DEVICE, net, LR, MOMENTUM, WEIGHT_DECAY, MILESTONES, GAMMA, NUM_EPOCHS, BATCH_SIZE, train_transform, test_transform)

    for split_i in range(NUM_BATCHES):
        print(f"## Split {split_i} of run {run_i} ##")

        icarl_knn.incremental_train(split_i, train_subsets[run_i][split_i], val_subsets[run_i][split_i])

        # Ground-truth labels of the test set, in dataset order (read in
        # mini-batches instead of one sample at a time)
        targets = torch.cat([labels for _, labels in DataLoader(test_subsets[run_i][split_i], batch_size=BATCH_SIZE)])

        logs[run_i].append({})

        # Test classic iCaRL classifier
        acc, preds = icarl_knn.test(test_subsets[run_i][split_i], train_subsets[run_i][split_i])
        logs[run_i][split_i]['accuracy'] = acc
        logs[run_i][split_i]['conf_mat'] = confusion_matrix(targets.to('cpu'), preds.to('cpu'))

        # Test KNN classifier
        acc, preds = icarl_knn.test_knn(test_subsets[run_i][split_i], train_subsets[run_i][split_i], n_neighbors=3)
        logs[run_i][split_i]['knn_accuracy'] = acc
        logs[run_i][split_i]['knn_conf_mat'] = confusion_matrix(targets.to('cpu'), preds.to('cpu'))

### Cosine linear layer

In [13]:
from model.resnet_cifar import resnet32cosine
from math import sqrt



def ang(a, b):
    """Return the cosine of the angle between two 1-D tensors.

    Both inputs are cast to torch.FloatTensor before the computation, so the
    result is a 0-dimensional float tensor.
    """
    a, b = (t.type(torch.FloatTensor) for t in (a, b))
    norm_product = a.norm() * b.norm()
    return a.dot(b) / norm_product


# distillation
class LFCLoss(nn.Module):
    """Cross-entropy classification loss combined with a feature-level
    distillation term (cosine embedding loss between the features of the new
    and of the old network)."""

    def __init__(self, weight = None, reduction = 'mean'):
        # weight and reduction are accepted for interface compatibility only;
        # they are not used by forward().
        super(LFCLoss, self).__init__()

    def forward(self, new_outputs, new_targets, old_features = None, new_features = None, num_classes = 10, num_new_classes = 10):
        '''Compute the loss for a batch.

        Args:
            new_outputs: torch.tensor of size [batch size, num_classes].
                Outputs (logits) of the network being trained
            new_targets: torch.tensor of size [batch size]. Class indices of
                the samples, as expected by the cross-entropy loss
            old_features: torch.tensor of size [batch size, feature size].
                Features computed by the old network; treated as constants
            new_features: torch.tensor of size [batch size, feature size].
                Features computed by the network being trained
            num_classes: number of classes seen until now, including the new
                ones
            num_new_classes: number of classes added at the current step
        Returns:
            The classification loss alone when there are no old classes (or no
            features are given); otherwise the weighted sum of classification
            and distillation losses.
        '''

        clf_loss = nn.functional.cross_entropy(new_outputs, new_targets)

        # First step (or no features given): there is nothing to distill from
        if old_features is None or new_features is None or num_classes <= num_new_classes:
            return clf_loss

        # Proportion of old and new classes among all classes seen until now
        dist = (num_classes - num_new_classes)/num_classes
        clf = num_new_classes/num_classes

        lambda_base = 5 # from paper
        # Adaptive weight of the distillation term. The parentheses matter:
        # without them only the 10/num_classes part is divided.
        # NOTE(review): this uses the ratio old classes / all classes, like
        # `dist` above — confirm against the exact formula of the paper.
        cur_lambda = lambda_base * sqrt(dist)

        # Target +1 asks for maximal cosine similarity between new and old
        # features. The target is sized and placed after the actual batch, so
        # any batch size and device work. No gradient is sent to the old
        # network.
        same = torch.ones(new_features.size(0), device=new_features.device)
        dist_loss = nn.functional.cosine_embedding_loss(new_features, old_features.detach(), same)

        loss = clf*clf_loss + dist*dist_loss*cur_lambda

        return loss




class iCaRLwithCosine(iCaRL):
    """iCaRL variant that classifies with the output layer of the network and
    is trained with a criterion receiving the features of the old and of the
    new network (see do_batch)."""

    def test(self, test_dataset):
        """Test the model using the outputs of the network.

        Args:
            test_dataset: dataset on which to test the network
        Returns:
            accuracy (float): accuracy of the model on the test set
            all_preds (torch.tensor): predicted labels, in test_dataset order
        """

        self.net.train(False)
        if self.best_net is not None: self.best_net.train(False) # Set Network to evaluation mode
        if self.old_net is not None: self.old_net.train(False)

        # No shuffling: the returned predictions must stay aligned with labels
        # read from test_dataset in its own order (e.g. for confusion matrices)
        self.test_dataloader = DataLoader(test_dataset, batch_size=self.BATCH_SIZE, shuffle=False, num_workers=4)

        net = self.best_net if self.VALIDATE else self.net

        running_corrects = 0
        total = 0
        batch_preds = [] # predictions of every batch

        with torch.no_grad():
            for images, labels in self.test_dataloader:
                images = images.to(self.device)
                labels = labels.to(self.device)
                total += labels.size(0)

                # Forward pass
                outputs = net(images)

                # Get predictions
                _, preds = torch.max(outputs, 1)

                # Update corrects
                running_corrects += torch.sum(preds == labels).item()

                batch_preds.append(preds)

        # Calculate accuracy
        accuracy = running_corrects / float(total)

        all_preds = torch.cat(batch_preds, dim=0) if batch_preds else torch.tensor([], dtype=torch.long)

        print(f"Test accuracy (Cosine): {accuracy}")

        return accuracy, all_preds

    def extract_features(self, sample, batch=True, transform=None, old = False):
        """Extract features from single sample or from batch.

        Note: with old=False this switches self.net to evaluation mode, so it
        must not be called in the middle of a training step.

        Args:
            sample (PIL image or torch.tensor): sample(s) from which to
                extract features
            batch (bool): if True, sample is a torch.tensor containing a batch
                of images with dimensions (batch_size, 3, 32, 32)
            transform: transformations to apply to the PIL image before
                processing
            old (bool): if True, use the old network instead of the current
                one (when self.VALIDATE is not set)
        Returns:
            features: torch.tensor, 1-D of dimension 64 for single samples or
                2-D of dimension (batch_size, 64) for batch
        """

        assert not (batch is False and transform is None), "if a PIL image is passed to extract_features, a transform must be defined"

        if not old: self.net.train(False)
        if self.best_net is not None: self.best_net.train(False)
        if self.old_net is not None: self.old_net.train(False)

        if batch is False: # Treat sample as single PIL image
            sample = transform(sample)
            sample = sample.unsqueeze(0) # https://stackoverflow.com/a/59566009/6486336

        sample = sample.to(self.device)

        # NOTE(review): when self.VALIDATE is set, the best network is used
        # regardless of `old` — confirm this is the intended behavior.
        if self.VALIDATE:
            net = self.best_net
        elif old:
            net = self.old_net
        else:
            net = self.net

        features = net.features(sample)

        if batch is False:
            features = features[0]

        return features

    def do_epoch(self, current_epoch):
        """Trains model for one epoch.
        
        Args:
            current_epoch (int): current epoch number (begins from 1)
        Returns:
            train_loss: average training loss over all batches of the
                current epoch.
            train_accuracy: training accuracy of the current epoch over
                all samples.
        """

        # Set the current network in training mode
        self.net.train()
        if self.old_net is not None: self.old_net.train(False)
        if self.best_net is not None: self.best_net.train(False)

        running_train_loss = 0
        running_corrects = 0
        total = 0
        batch_idx = 0

        print(f"Epoch: {current_epoch}, LR: {self.scheduler.get_last_lr()}")

        for images, labels in self.train_dataloader:
            loss, corrects = self.do_batch(images, labels)

            running_train_loss += loss.item()
            running_corrects += corrects
            total += labels.size(0)
            batch_idx += 1

        self.scheduler.step()

        # Calculate average scores
        train_loss = running_train_loss / batch_idx # Average over all batches
        train_accuracy = running_corrects / float(total) # Average over all samples

        print(f"Train loss: {train_loss}, Train accuracy: {train_accuracy}")

        return train_loss, train_accuracy

    def do_batch(self, batch, labels):
        """Train network for a batch. Loss is applied here.

        The caller is expected to have put self.net in training mode (as
        do_epoch does).

        Args:
            batch: batch of data used for training the network
            labels: targets of the batch
        Returns:
            loss: output of the criterion applied
            running_corrects: number of correctly classified elements
        """
        batch = batch.to(self.device)
        labels = labels.to(self.device)

        # Zero-ing the gradients
        self.optimizer.zero_grad()

        num_classes = self.output_neurons_count() # Number of classes seen until now, including new classes

        if self.old_net is None:
            # Network is training for the first time, so we only apply the
            # classification loss.

            # Forward pass
            outputs = self.net(batch)
            loss = self.criterion(outputs, labels)

        else:
            # Old net forward pass. The features of the old network are the
            # fixed reference of the distillation term, so no gradient is
            # needed for them.
            self.old_net.train(False)
            with torch.no_grad():
                old_net_batch_features = self.old_net.features(batch) # (batch size, 64)

            # Features of the network being trained. They are computed
            # directly rather than through extract_features, which would
            # switch self.net to evaluation mode in the middle of training and
            # could read the features from a different network.
            new_net_batch_features = self.net.features(batch)

            # Forward pass
            outputs = self.net(batch)
            loss = self.criterion(outputs, labels, old_net_batch_features, new_net_batch_features, num_classes)

        # Get predictions
        _, preds = torch.max(outputs.data, 1)

        # Accuracy over NEW IMAGES, not over all images
        running_corrects = torch.sum(preds == labels.data).data.item() 

        # Backward pass: computes gradients
        loss.backward()

        self.optimizer.step()

        return loss, running_corrects

    def validate(self):
        """Validate the model.
        
        Returns:
            val_loss: average loss function computed on the network outputs
                of the validation set (val_dataloader).
            val_accuracy: accuracy computed on the validation set.
        """

        self.net.train(False)
        if self.old_net is not None: self.old_net.train(False)
        if self.best_net is not None: self.best_net.train(False)

        running_val_loss = 0
        running_corrects = 0
        total = 0
        batch_idx = 0

        # No gradient is needed for validation
        with torch.no_grad():
            for images, labels in self.val_dataloader:
                images = images.to(self.device)
                labels = labels.to(self.device)
                total += labels.size(0)

                # New net forward pass. The criterion is called with outputs
                # and labels only (no old/new features).
                outputs = self.net(images)
                loss = self.criterion(outputs, labels)

                running_val_loss += loss.item()

                # Get predictions
                _, preds = torch.max(outputs, 1)

                # Update the number of correctly classified validation samples
                running_corrects += torch.sum(preds == labels).item()

                batch_idx += 1

        # Calculate scores
        val_loss = running_val_loss / batch_idx
        val_accuracy = running_corrects / float(total)

        print(f"Validation loss: {val_loss}, Validation accuracy: {val_accuracy}")

        return val_loss, val_accuracy

In [None]:
# Run the cosine-layer ablation: for every run, train incrementally on each
# split and evaluate the network outputs on the classes seen so far.
# logs[run_i][split_i] stores the accuracy and the confusion matrix.
# NOTE(review): this reuses the name `logs`, so results stored under that name
# by an earlier cell are overwritten.
logs = [[] for _ in range(NUM_RUNS)]

# Overrides of the global training settings for this experiment
LR = 0.01
# NOTE(review): both milestones are beyond NUM_EPOCHS (70), so the learning
# rate is never decreased unless NUM_EPOCHS is raised — confirm.
MILESTONES = [80, 120] # paper suggests number of epochs = 160

for run_i in range(NUM_RUNS):
    net = resnet32cosine()
    icarl_cosine = iCaRLwithCosine(DEVICE, net, LR, MOMENTUM, WEIGHT_DECAY, MILESTONES, GAMMA, NUM_EPOCHS, BATCH_SIZE, train_transform, test_transform)
    icarl_cosine.criterion = LFCLoss()

    for split_i in range(NUM_BATCHES):
        print(f"## Split {split_i} of run {run_i} ##")

        icarl_cosine.incremental_train(split_i, train_subsets[run_i][split_i], val_subsets[run_i][split_i])

        # Ground-truth labels of the test set, in dataset order (read in
        # mini-batches instead of one sample at a time)
        targets = torch.cat([labels for _, labels in DataLoader(test_subsets[run_i][split_i], batch_size=BATCH_SIZE)])

        logs[run_i].append({})

        # Test Cosine layer classifier
        acc, preds = icarl_cosine.test(test_subsets[run_i][split_i])
        logs[run_i][split_i]['cosine_accuracy'] = acc
        logs[run_i][split_i]['cosine_conf_mat'] = confusion_matrix(targets.to('cpu'), preds.to('cpu'))

## Split 0 of run 0 ##
Length of exemplars set: 0
Epoch: 1, LR: [0.01]
Train loss: 2.099417725631169, Train accuracy: 0.32276785714285716
Validation loss: 1.9866281066622054, Validation accuracy: 0.39285714285714285
Epoch: 2, LR: [0.01]
Train loss: 1.8952408330781119, Train accuracy: 0.4707589285714286
Validation loss: 1.941627059664045, Validation accuracy: 0.3794642857142857
Epoch: 3, LR: [0.01]
Train loss: 1.8203732456479753, Train accuracy: 0.5142857142857142
Validation loss: 1.7851165533065796, Validation accuracy: 0.5223214285714286
Epoch: 4, LR: [0.01]
Train loss: 1.7830747604370116, Train accuracy: 0.5430803571428572
Validation loss: 1.7691782712936401, Validation accuracy: 0.5223214285714286
Epoch: 5, LR: [0.01]
Train loss: 1.7564758726528713, Train accuracy: 0.5703125
Validation loss: 1.7453553676605225, Validation accuracy: 0.5669642857142857
Epoch: 6, LR: [0.01]
Train loss: 1.731564426422119, Train accuracy: 0.5875
Validation loss: 1.72082485471453, Validation accuracy: 0.6

In [13]:
# Sanity check: display the loss module currently assigned to icarl_cosine.
icarl_cosine.criterion

LFCLoss()