Note: If the below statement evaluates to false, please remove the .cuda() method calls on each Tensor. This statement determines if your GPU can be utilized for model training and evaluation.

In [75]:
print(torch.cuda.is_available())

True


In [76]:
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data.sampler import Sampler
import matplotlib.pyplot as plt
import numpy as np
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import Tensor
import sys
sys.path.append("./pytorch-classification")
sys.path.append("./pytorch-classification/models")
sys.path.append("./pytorch-classification/models/cifar/")
import os
import cbas
import torch.backends.cudnn as cudnn
import models.cifar as models
import torchvision
import torchvision.transforms as transforms
import numpy as np

<br />
<br />
Next, we load our dataset (CBAS-34) into ImageFolder and DataLoader classes.

In [77]:
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
])

testset = torchvision.datasets.ImageFolder(root='../images/cbas34_val', transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4, shuffle=False, num_workers=2)
trainset = torchvision.datasets.ImageFolder(root='../images/cbas34_train', transform=transform)
classes = trainset.classes
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, shuffle=True, num_workers=4)

<br />  

<br />  

In order to determine the true loss ordering associated with each epoch we need to override the Sampler class found in torch.utils.data.sampler. In particular, we need to be able to attain the order of indices sampled throughout training (note the self.random_list class variable).

In [78]:
class MySampler(Sampler):
    def __init__(self, data_source):
        self.data_source = data_source
        self.random_list = None

    def __iter__(self):
        self.random_list = torch.randperm(len(self.data_source)).tolist()
        return iter(self.random_list)

    def get_idx(self):
        return self.random_list

    def __len__(self):
        return len(self.data_source)


class MyWeightedSampler(Sampler):

    def __init__(self, weights, num_samples, replacement=True):
        self.weights = torch.DoubleTensor(weights)
        self.num_samples = num_samples
        self.replacement = replacement
        self.random_list = None

    def __iter__(self):
        ret = torch.multinomial(self.weights, self.num_samples, self.replacement)
        self.random_list = ret.numpy().tolist()
        return iter(ret)

    def get_idx(self):
        return self.random_list

    def __len__(self):
        return self.num_samples

  
  <br />  
    
<br />


Here we define our method of determining a difficulty metric based on loss from the previous epoch. This involves utilizing 'get_idx' from the Sampler classes defined above (overriding methods defined in torch.utils.data.sampler) to determine the true ordering of images (relative to the original dataset) and definining a set of weights for our 'MyWeightedSampler' class.

In [79]:
def adjust_lr(optimizer):
    for param_group in optimizer.param_groups:
        param_group['lr'] *= 0.1

def normal_weights(losses, mu=None):
    mu, var = mu if mu else np.mean(losses), np.var(losses)
    return (1/(np.sqrt(np.pi*2*var)))*np.exp(-((losses-mu)**2)/(2*var))


def real_time(training_set, model, loss_fn, optimizer, deviations):
    """
    training_set: class type 'torchvision.datasets.ImageFolder'

    deviations: a sequence of standard deviations scalars to be applied to the sampling distribution's
    mean to determine the probability of sampling and image with a given loss value. If set to [0...0],
    the probability of sampling each image (based on loss value) will be determined by the normal
    distribution's pdf. If deviation = -1, probability will be dictated by a normal with shifted mean
    mean(loss) -1*std(loss). This in effect allows us to shift the difficulty of training images over
    each epoch. Images are sampled with replacement, so we can shift the focus from easy to hard. For
    example: [-1, 0, 1] samples from a normal distribution centered at mean(loss) -1*std(loss),
    mean(loss), then mean(loss) + 1*std(loss) for the training epochs.

    Note: number of epochs == len(deviations) + 1 (+1 for the initial training epoch)
    """

    def real_time_curriculum(sampler, loader, net, criterion, optimizer):
        orderings = []
        running_loss = 0.0
        for i, data in enumerate(loader, 0):
            # get the inputs
            inputs, labels = data

            try:
                numpy_labels = labels.numpy()
            except:
                numpy_labels = labels.data.numpy()

            # wrap them in Variable
            inputs, labels = Variable(inputs).cuda(), Variable(labels).cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            try:
                numpy_outputs = outputs.cpu().numpy()
            except:
                numpy_outputs = outputs.cpu().data.numpy()
            log_probs = -np.log(np.exp(numpy_outputs)
                                / np.reshape(np.sum(np.exp(numpy_outputs), axis=1), (numpy_labels.shape[0], 1)))
            orderings += log_probs[:, numpy_labels].tolist()[0]

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # print statistics
            running_loss += loss.data[0]
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print('%5d loss: %.3f' %
                      (i + 1, running_loss / 2000))
                running_loss = 0.0
        idx = np.argsort(np.array(sampler.get_idx()))
        culmulative_orderings = np.array(orderings)[idx]
        return culmulative_orderings

    my_sampler = MySampler(training_set)
    trainloader = torch.utils.data.DataLoader(
        training_set, batch_size=4, shuffle=False, sampler=my_sampler, num_workers=4)

    print("epoch #1")
    real_time_curr = \
        real_time_curriculum(my_sampler, trainloader, model, loss_fn, optimizer)
    epoch = 1
    num_samples = real_time_curr.shape[0]

    for deviation in deviations:
        epoch += 1
        print("epoch #%d" % epoch)
        weights = normal_weights(real_time_curr, np.mean(real_time_curr) + deviation * np.std(real_time_curr))
        weight_denom = np.sum(weights)
        weight_denom = weight_denom if weight_denom > (1/1e30) else (1/1e30)
        weights = weights / weight_denom
        sampler = MyWeightedSampler(weights, num_samples, replacement=True)
        real_time_curriculum_loader = \
            torch.utils.data.DataLoader(training_set, batch_size=4, shuffle=False, sampler=sampler, num_workers=4)
        real_time_curr = \
            real_time_curriculum(sampler, real_time_curriculum_loader, model, loss_fn, optimizer)
        if epoch % 2 == 0:
            adjust_lr(optimizer)

<br />
<br />
Here we define a training loop for our comparison models (with no curriculum).

In [80]:
def no_curriculum(net, loss_fn, optimizer, epochs):
    for epoch in range(epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs
            inputs, labels = data

            # wrap them in Variable
            inputs, labels = Variable(inputs).cuda(), Variable(labels).cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.data[0]
            if i % 2000 == 1999:    # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
    print('Finished Training')

<br />
<br />
### Load pre-trained models: 
To test our curriculum strategy we employ a variety of models to see how it interacts with different network architectures.

In [81]:
loss_fn = nn.CrossEntropyLoss()

real_time_alex = models.alexnet(num_classes=34).cuda()
real_time_dile = models.dilenet(num_classes=34).cuda()
real_time_lenet = models.lenet(num_classes=34).cuda()

no_curriculum_alex = models.alexnet(num_classes=34).cuda()
no_curriculum_dile = models.dilenet(num_classes=34).cuda()
no_curriculum_lenet = models.lenet(num_classes=34).cuda()

opt_real_alex = optim.SGD(real_time_alex.parameters(), lr=0.001, momentum=0.9)
opt_real_dile = optim.SGD(real_time_dile.parameters(), lr=0.001, momentum=0.9)
opt_real_lenet = optim.SGD(real_time_lenet.parameters(), lr=0.001, momentum=0.9)

opt_alex = optim.SGD(no_curriculum_alex.parameters(), lr=0.001, momentum=0.9)
opt_dile = optim.SGD(no_curriculum_dile.parameters(), lr=0.001, momentum=0.9)
opt_lenet = optim.SGD(no_curriculum_lenet.parameters(), lr=0.001, momentum=0.9)

<br />
<br />
Here we train alexnet, dilenet and lenet using our real-time loss based curriculum.

In [82]:
print("AlexNet with real-time curriculum:")
real_time(trainset, real_time_alex, loss_fn, opt_real_alex, np.arange(-1.0, 2, 1))
print("\n\nDileNet with real-time curriculum:")
real_time(trainset, real_time_dile, loss_fn, opt_real_dile, np.arange(-1.0, 2, 1))
print("\n\nLeNet with real-time curriculum:")
real_time(trainset, real_time_lenet, loss_fn, opt_real_lenet, np.arange(-1.0, 2, 1))

AlexNet with real-time curriculum:
epoch #1
 2000 loss: 3.523
 4000 loss: 3.475
 6000 loss: 3.348
 8000 loss: 3.304
10000 loss: 3.267
12000 loss: 3.193
epoch #2
 2000 loss: 3.130
 4000 loss: 3.090
 6000 loss: 3.056
 8000 loss: 3.013
10000 loss: 2.971
12000 loss: 2.962
epoch #3
 2000 loss: 2.865
 4000 loss: 2.835
 6000 loss: 2.811
 8000 loss: 2.819
10000 loss: 2.807
12000 loss: 2.779
epoch #4
 2000 loss: 2.736
 4000 loss: 2.761
 6000 loss: 2.719
 8000 loss: 2.700
10000 loss: 2.687
12000 loss: 2.671


DileNet with real-time curriculum:
epoch #1
 2000 loss: 3.336
 4000 loss: 3.144
 6000 loss: 3.015
 8000 loss: 2.956
10000 loss: 2.914
12000 loss: 2.842
epoch #2
 2000 loss: 2.716
 4000 loss: 2.638
 6000 loss: 2.610
 8000 loss: 2.543
10000 loss: 2.516
12000 loss: 2.451
epoch #3
 2000 loss: 2.355
 4000 loss: 2.279
 6000 loss: 2.245
 8000 loss: 2.244
10000 loss: 2.187
12000 loss: 2.149
epoch #4
 2000 loss: 2.113
 4000 loss: 2.085
 6000 loss: 2.021
 8000 loss: 2.065
10000 loss: 2.003
12000 loss

For comparison, we train the same models using no curriculum strategy.

In [83]:
print("AlexNet with no curriculum:")
no_curriculum(no_curriculum_alex, loss_fn, opt_alex, 4)
print("\n\nDileNet with no curriculum:")
no_curriculum(no_curriculum_dile, loss_fn, opt_dile, 4)
print("\n\nLeNet with no curriculum:")
no_curriculum(no_curriculum_lenet, loss_fn, opt_lenet, 4)

AlexNet with no curriculum:
[1,  2000] loss: 3.526
[1,  4000] loss: 3.470
[1,  6000] loss: 3.344
[1,  8000] loss: 3.312
[1, 10000] loss: 3.246
[1, 12000] loss: 3.213
[2,  2000] loss: 3.137
[2,  4000] loss: 3.107
[2,  6000] loss: 3.061
[2,  8000] loss: 3.038
[2, 10000] loss: 3.006
[2, 12000] loss: 2.987
[3,  2000] loss: 2.913
[3,  4000] loss: 2.917
[3,  6000] loss: 2.914
[3,  8000] loss: 2.860
[3, 10000] loss: 2.868
[3, 12000] loss: 2.860
[4,  2000] loss: 2.772
[4,  4000] loss: 2.747
[4,  6000] loss: 2.769
[4,  8000] loss: 2.738
[4, 10000] loss: 2.754
[4, 12000] loss: 2.747
Finished Training


DileNet with no curriculum:
[1,  2000] loss: 3.339
[1,  4000] loss: 3.143
[1,  6000] loss: 3.033
[1,  8000] loss: 2.962
[1, 10000] loss: 2.907
[1, 12000] loss: 2.864
[2,  2000] loss: 2.750
[2,  4000] loss: 2.711
[2,  6000] loss: 2.691
[2,  8000] loss: 2.669
[2, 10000] loss: 2.633
[2, 12000] loss: 2.624
[3,  2000] loss: 2.442
[3,  4000] loss: 2.463
[3,  6000] loss: 2.440
[3,  8000] loss: 2.463
[3, 

In [84]:
model_names = ['real_time_alex', 'no_curriculum_alex', 'real_time_dile', 
               'no_curriculum_dile', 'real_time_lenet', 'no_curriculum_lenet']
models_trained = [real_time_alex, no_curriculum_alex, real_time_dile, no_curriculum_dile, real_time_lenet, no_curriculum_lenet]


<br />
<br />
We now output the results of both curriculum and non-curriculum based training strategies. Above the model appears to converge at a faster rate, though performance improvements were minimal. Given the right task this could certainly make a difference.

In [85]:
for name, model_trained in zip(model_names, models_trained):
    correct = 0
    total = 0
    for data in testloader:
        images, labels = data
        outputs = model_trained(Variable(images).cuda())
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels.cuda()).sum()

    print(name + ' Accuracy of the network on the 3744 test images: %d %%' % (
        100 * correct / total))

real_time_alex Accuracy of the network on the 3744 test images: 21 %
no_curriculum_alex Accuracy of the network on the 3744 test images: 22 %
real_time_dile Accuracy of the network on the 3744 test images: 30 %
no_curriculum_dile Accuracy of the network on the 3744 test images: 29 %
real_time_lenet Accuracy of the network on the 3744 test images: 21 %
no_curriculum_lenet Accuracy of the network on the 3744 test images: 20 %


In [86]:
for name, model_trained in zip(model_names, models_trained):
    class_correct = list(0. for i in range(34))
    class_total = list(0. for i in range(34))
    for data in testloader:
        images, labels = data
        outputs = model_trained(Variable(images).cuda())
        _, predicted = torch.max(outputs.data, 1)
        c = (predicted == labels.cuda()).squeeze()
        for i in range(4):
            label = labels[i]
            class_correct[label] += c[i]
            class_total[label] += 1


    for i in range(34):
        print(name + ' Accuracy of %5s : %2d %%' % (
            classes[i], 100 * class_correct[i] / class_total[i]))


real_time_alex Accuracy of airplane : 35 %
real_time_alex Accuracy of backpack : 13 %
real_time_alex Accuracy of banana : 70 %
real_time_alex Accuracy of bench :  6 %
real_time_alex Accuracy of bicycle : 16 %
real_time_alex Accuracy of  bird :  1 %
real_time_alex Accuracy of  boat : 10 %
real_time_alex Accuracy of  book : 11 %
real_time_alex Accuracy of bottle :  1 %
real_time_alex Accuracy of  bowl :  5 %
real_time_alex Accuracy of   car :  6 %
real_time_alex Accuracy of carrot : 65 %
real_time_alex Accuracy of chair :  4 %
real_time_alex Accuracy of clock : 43 %
real_time_alex Accuracy of   cow : 16 %
real_time_alex Accuracy of   cup :  1 %
real_time_alex Accuracy of donut : 21 %
real_time_alex Accuracy of  fork : 18 %
real_time_alex Accuracy of handbag : 24 %
real_time_alex Accuracy of horse : 25 %
real_time_alex Accuracy of  kite : 50 %
real_time_alex Accuracy of knife :  7 %
real_time_alex Accuracy of person :  0 %
real_time_alex Accuracy of pottedplant : 12 %
real_time_alex Accur