Note: If the below statement evaluates to false, please remove the .cuda() method calls on each Tensor. This statement determines if your GPU can be utilized for model training and evaluation.

In [55]:
print(torch.cuda.is_available())

True


In [56]:
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data.sampler import Sampler
import matplotlib.pyplot as plt
import numpy as np
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import Tensor
import sys
sys.path.append("./pytorch-classification")
sys.path.append("./pytorch-classification/models")
sys.path.append("./pytorch-classification/models/cifar/")
import os
import cbas
import torch.backends.cudnn as cudnn
import models.cifar as models
import torchvision
import torchvision.transforms as transforms
import numpy as np

<br />
<br />
Next, we load our dataset (CBAS-34) into ImageFolder and DataLoader classes.

In [57]:
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
])

testset = torchvision.datasets.ImageFolder(root='../images/cbas34_val', transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4, shuffle=False, num_workers=2)
trainset = torchvision.datasets.ImageFolder(root='../images/cbas34_train', transform=transform)
classes = trainset.classes
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, shuffle=True, num_workers=4)

<br />  

<br />  

In order to determine the true loss ordering associated with each epoch we need to override the Sampler class found in torch.utils.data.sampler. In particular, we need to be able to attain the order of indices sampled throughout training (note the self.random_list class variable).

In [58]:
class MySampler(Sampler):
    def __init__(self, data_source):
        self.data_source = data_source
        self.random_list = None

    def __iter__(self):
        self.random_list = torch.randperm(len(self.data_source)).tolist()
        return iter(self.random_list)

    def get_idx(self):
        return self.random_list

    def __len__(self):
        return len(self.data_source)


class MyWeightedSampler(Sampler):

    def __init__(self, weights, num_samples, replacement=True):
        self.weights = torch.DoubleTensor(weights)
        self.num_samples = num_samples
        self.replacement = replacement
        self.random_list = None

    def __iter__(self):
        ret = torch.multinomial(self.weights, self.num_samples, self.replacement)
        self.random_list = ret.numpy().tolist()
        return iter(ret)

    def get_idx(self):
        return self.random_list

    def __len__(self):
        return self.num_samples

  
  <br />  
    
<br />


Here we define our method of determining a difficulty metric based on loss from the previous epoch. This involves utilizing 'get_idx' from the Sampler classes defined above (overriding methods defined in torch.utils.data.sampler) to determine the true ordering of images (relative to the original dataset) and definining a set of weights for our 'MyWeightedSampler' class.

In [59]:

def normal_weights(losses, mu=None):
    mu, var = mu if mu else np.mean(losses), np.var(losses)
    return (1/(np.sqrt(np.pi*2*var)))*np.exp(-((losses-mu)**2)/(2*var))


def real_time(training_set, model, loss_fn, optimizer, deviations):
    """
    training_set: class type 'torchvision.datasets.ImageFolder'

    deviations: a sequence of standard deviations scalars to be applied to the sampling distribution's
    mean to determine the probability of sampling and image with a given loss value. If set to [0...0],
    the probability of sampling each image (based on loss value) will be determined by the normal
    distribution's pdf. If deviation = -1, probability will be dictated by a normal with shifted mean
    mean(loss) -1*std(loss). This in effect allows us to shift the difficulty of training images over
    each epoch. Images are sampled with replacement, so we can shift the focus from easy to hard. For
    example: [-1, 0, 1] samples from a normal distribution centered at mean(loss) -1*std(loss),
    mean(loss), then mean(loss) + 1*std(loss) for the training epochs.

    Note: number of epochs == len(deviations) + 1 (+1 for the initial training epoch)
    """

    def real_time_curriculum(sampler, loader, net, criterion, optimizer):
        orderings = []
        running_loss = 0.0
        for i, data in enumerate(loader, 0):
            # get the inputs
            inputs, labels = data

            try:
                numpy_labels = labels.numpy()
            except:
                numpy_labels = labels.data.numpy()

            # wrap them in Variable
            inputs, labels = Variable(inputs).cuda(), Variable(labels).cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            try:
                numpy_outputs = outputs.cpu().numpy()
            except:
                numpy_outputs = outputs.cpu().data.numpy()
            log_probs = -np.log(np.exp(numpy_outputs)
                                / np.reshape(np.sum(np.exp(numpy_outputs), axis=1), (numpy_labels.shape[0], 1)))
            orderings += log_probs[:, numpy_labels].tolist()[0]

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # print statistics
            running_loss += loss.data[0]
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print('%5d loss: %.3f' %
                      (i + 1, running_loss / 2000))
                running_loss = 0.0
        idx = np.argsort(np.array(sampler.get_idx()))
        culmulative_orderings = np.array(orderings)[idx]
        return culmulative_orderings

    my_sampler = MySampler(training_set)
    trainloader = torch.utils.data.DataLoader(
        training_set, batch_size=4, shuffle=False, sampler=my_sampler, num_workers=4)

    print("epoch #1")
    real_time_curr = \
        real_time_curriculum(my_sampler, trainloader, model, loss_fn, optimizer)
    epoch = 1
    num_samples = real_time_curr.shape[0]

    for deviation in deviations:
        epoch += 1
        print("epoch #%d" % epoch)
        weights = normal_weights(real_time_curr, np.mean(real_time_curr) + deviation * np.std(real_time_curr))
        weight_denom = np.sum(weights)
        weight_denom = weight_denom if weight_denom > (1/1e30) else (1/1e30)
        weights = weights / weight_denom
        sampler = MyWeightedSampler(weights, num_samples, replacement=True)
        real_time_curriculum_loader = \
            torch.utils.data.DataLoader(training_set, batch_size=4, shuffle=False, sampler=sampler, num_workers=4)
        real_time_curr = \
            real_time_curriculum(sampler, real_time_curriculum_loader, model, loss_fn, optimizer)

<br />
<br />
Here we define a training loop for our comparison models (with no curriculum).

In [62]:
def no_curriculum(net, loss_fn, optimizer, epochs):
    for epoch in range(epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs
            inputs, labels = data

            # wrap them in Variable
            inputs, labels = Variable(inputs).cuda(), Variable(labels).cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.data[0]
            if i % 2000 == 1999:    # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
    print('Finished Training')

<br />
<br />
### Load pre-trained models: 
To test our curriculum strategy we employ a variety of models to see how it interacts with different network architectures.

In [63]:
loss_fn = nn.CrossEntropyLoss()

real_time_alex = models.alexnet(num_classes=34).cuda()
real_time_dile = models.dilenet(num_classes=34).cuda()
real_time_lenet = models.lenet(num_classes=34).cuda()

no_curriculum_alex = models.alexnet(num_classes=34).cuda()
no_curriculum_dile = models.dilenet(num_classes=34).cuda()
no_curriculum_lenet = models.lenet(num_classes=34).cuda()

opt_real_alex = optim.SGD(real_time_alex.parameters(), lr=0.001, momentum=0.9)
opt_real_dile = optim.SGD(real_time_dile.parameters(), lr=0.001, momentum=0.9)
opt_real_lenet = optim.SGD(real_time_lenet.parameters(), lr=0.001, momentum=0.9)

opt_alex = optim.SGD(no_curriculum_alex.parameters(), lr=0.001, momentum=0.9)
opt_dile = optim.SGD(no_curriculum_dile.parameters(), lr=0.001, momentum=0.9)
opt_lenet = optim.SGD(no_curriculum_lenet.parameters(), lr=0.001, momentum=0.9)

<br />
<br />
Here we train alexnet, dilenet and lenet using our real-time loss based curriculum.

In [64]:
print("AlexNet with real-time curriculum:")
real_time(trainset, real_time_alex, loss_fn, opt_real_alex, np.arange(-2.0, 2, 1))
print("\n\nDileNet with real-time curriculum:")
real_time(trainset, real_time_dile, loss_fn, opt_real_dile, np.arange(-2.0, 2, 1))
print("\n\nLeNet with real-time curriculum:")
real_time(trainset, real_time_lenet, loss_fn, opt_real_lenet, np.arange(-2.0, 2, 1))

epoch #1
 2000 loss: 3.525
 4000 loss: 3.432
 6000 loss: 3.333
 8000 loss: 3.291
10000 loss: 3.266
12000 loss: 3.206
epoch #2
 2000 loss: 2.983
 4000 loss: 2.942
 6000 loss: 2.900
 8000 loss: 2.849
10000 loss: 2.822
12000 loss: 2.788
epoch #3
 2000 loss: 2.938
 4000 loss: 2.886
 6000 loss: 2.886
 8000 loss: 2.851
10000 loss: 2.795
12000 loss: 2.792
epoch #4
 2000 loss: 2.785
 4000 loss: 2.756
 6000 loss: 2.729
 8000 loss: 2.711
10000 loss: 2.677
12000 loss: 2.649
epoch #5
 2000 loss: 2.581
 4000 loss: 2.563
 6000 loss: 2.522
 8000 loss: 2.507
10000 loss: 2.483
12000 loss: 2.424
epoch #1
 2000 loss: 3.371
 4000 loss: 3.161
 6000 loss: 3.045
 8000 loss: 2.967
10000 loss: 2.909
12000 loss: 2.861
epoch #2
 2000 loss: 2.490
 4000 loss: 2.416
 6000 loss: 2.342
 8000 loss: 2.288
10000 loss: 2.199
12000 loss: 2.121
epoch #3
 2000 loss: 2.494
 4000 loss: 2.404
 6000 loss: 2.381
 8000 loss: 2.277
10000 loss: 2.246
12000 loss: 2.180
epoch #4
 2000 loss: 2.194
 4000 loss: 2.151
 6000 loss: 2.091
 

For comparison, we train the same models using no curriculum strategy.

In [65]:
no_curriculum(no_curriculum_alex, loss_fn, opt_alex, 5)
no_curriculum(no_curriculum_dile, loss_fn, opt_dile, 5)
no_curriculum(no_curriculum_lenet, loss_fn, opt_lenet, 5)

[1,  2000] loss: 3.525
[1,  4000] loss: 3.466
[1,  6000] loss: 3.333
[1,  8000] loss: 3.287
[1, 10000] loss: 3.244
[1, 12000] loss: 3.189
[2,  2000] loss: 3.125
[2,  4000] loss: 3.089
[2,  6000] loss: 3.042
[2,  8000] loss: 3.048
[2, 10000] loss: 2.994
[2, 12000] loss: 2.989
[3,  2000] loss: 2.931
[3,  4000] loss: 2.915
[3,  6000] loss: 2.890
[3,  8000] loss: 2.891
[3, 10000] loss: 2.861
[3, 12000] loss: 2.842
[4,  2000] loss: 2.780
[4,  4000] loss: 2.765
[4,  6000] loss: 2.759
[4,  8000] loss: 2.770
[4, 10000] loss: 2.759
[4, 12000] loss: 2.738
[5,  2000] loss: 2.650
[5,  4000] loss: 2.635
[5,  6000] loss: 2.645
[5,  8000] loss: 2.634
[5, 10000] loss: 2.649
[5, 12000] loss: 2.639
Finished Training
[1,  2000] loss: 3.338
[1,  4000] loss: 3.141
[1,  6000] loss: 3.006
[1,  8000] loss: 2.975
[1, 10000] loss: 2.877
[1, 12000] loss: 2.854
[2,  2000] loss: 2.716
[2,  4000] loss: 2.713
[2,  6000] loss: 2.681
[2,  8000] loss: 2.658
[2, 10000] loss: 2.641
[2, 12000] loss: 2.614
[3,  2000] loss:

In [69]:
model_names = ['real_time_alex', 'no_curriculum_alex', 'real_time_dile', 
               'no_curriculum_dile', 'real_time_lenet', 'no_curriculum_lenet']
models_trained = [real_time_alex, no_curriculum_alex, real_time_dile, no_curriculum_dile, real_time_lenet, no_curriculum_lenet]


<br />
<br />
We now output the results of both curriculum and non-curriculum based training strategies. As you can see, with limited training there are noticeable improvements to testing accuracy.

In [67]:
for name, model_trained in zip(model_names, models_trained):
    correct = 0
    total = 0
    for data in testloader:
        images, labels = data
        outputs = model_trained(Variable(images).cuda())
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels.cuda()).sum()

    print(name + ' Accuracy of the network on the 3744 test images: %d %%' % (
        100 * correct / total))

real_time_alex Accuracy of the network on the 3744 test images: 23 %
no_curriculum_alex Accuracy of the network on the 3744 test images: 23 %
real_time_dile Accuracy of the network on the 3744 test images: 25 %
no_curriculum_dile Accuracy of the network on the 3744 test images: 29 %
real_time_lenet Accuracy of the network on the 3744 test images: 18 %
no_curriculum_lenet Accuracy of the network on the 3744 test images: 20 %


In [68]:
for name, model_trained in zip(model_names, models_trained):
    class_correct = list(0. for i in range(34))
    class_total = list(0. for i in range(34))
    for data in testloader:
        images, labels = data
        outputs = model_trained(Variable(images).cuda())
        _, predicted = torch.max(outputs.data, 1)
        c = (predicted == labels.cuda()).squeeze()
        for i in range(4):
            label = labels[i]
            class_correct[label] += c[i]
            class_total[label] += 1


    for i in range(34):
        print(name + ' Accuracy of %5s : %2d %%' % (
            classes[i], 100 * class_correct[i] / class_total[i]))


real_time_alex Accuracy of airplane : 41 %
real_time_alex Accuracy of backpack : 10 %
real_time_alex Accuracy of banana : 46 %
real_time_alex Accuracy of bench :  9 %
real_time_alex Accuracy of bicycle : 21 %
real_time_alex Accuracy of  bird :  7 %
real_time_alex Accuracy of  boat : 26 %
real_time_alex Accuracy of  book : 20 %
real_time_alex Accuracy of bottle :  9 %
real_time_alex Accuracy of  bowl :  1 %
real_time_alex Accuracy of   car : 11 %
real_time_alex Accuracy of carrot : 56 %
real_time_alex Accuracy of chair : 25 %
real_time_alex Accuracy of clock : 49 %
real_time_alex Accuracy of   cow : 25 %
real_time_alex Accuracy of   cup :  1 %
real_time_alex Accuracy of donut : 29 %
real_time_alex Accuracy of  fork : 24 %
real_time_alex Accuracy of handbag : 13 %
real_time_alex Accuracy of horse : 22 %
real_time_alex Accuracy of  kite : 47 %
real_time_alex Accuracy of knife : 18 %
real_time_alex Accuracy of person : 11 %
real_time_alex Accuracy of pottedplant :  3 %
real_time_alex Accur