## Load in data

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data as Data
import torch.nn.functional as F
import torchvision
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import time
import progressbar
import importlib
# torch.manual_seed(1)    # reproducible
plt.style.use('default')
%matplotlib inline

import pdb

import Models_MNIST as mds

# Hyper Parameters
EPOCH = 10
BATCH_SIZE = 256
DOWNLOAD_MNIST = False

m1 = 64
m2 = 128
m3 = 50
cudaopt = True

EPS = 1e-4

# Mnist digits dataset
train_data = torchvision.datasets.MNIST(
    root='../data',
    train=True,                                     # this is training data
    transform=torchvision.transforms.ToTensor(),    # Converts a PIL.Image or numpy.ndarray to
                                                    # torch.FloatTensor of shape (C x H x W) and normalize in the range [0.0, 1.0]
    download=True,                        # download it if you don't have it
)

# LIMITING TRAINING DATA
Ntrain = int(60e3)
train_set = np.random.permutation(60000)[0:Ntrain]
train_data.train_data = train_data.train_data[torch.LongTensor(train_set),:,:]
train_data.train_labels = train_data.train_labels[torch.LongTensor(train_set)]

test_data = torchvision.datasets.MNIST(
    root='../data',
    train=False,                                     # this is testing data
    transform=torchvision.transforms.ToTensor(),    # Converts a PIL.Image or numpy.ndarray to
                                                    # torch.FloatTensor of shape (C x H x W) and normalize in the range [0.0, 1.0]
    download=True,                        # download it if you don't have it
)

# Data Loader for easy mini-batch return in training, the image batch shape will be (50, 1, 28, 28)
train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = Data.DataLoader(dataset=test_data, batch_size=BATCH_SIZE, shuffle=True)

##  Define models normal neural network model

In [2]:
class neural_net(nn.Module):
    def __init__(self,m1,m2,m3):
        super(neural_net, self).__init__()
        
        # Convolutional Filters
        self.W1 = nn.Parameter(torch.randn(m1,1,6,6), requires_grad=True)
        self.strd1 = 2;
        self.W2 = nn.Parameter(torch.randn(m2,m1,6,6), requires_grad=True)
        self.strd2 = 2;
        self.W3 = nn.Parameter(torch.randn(m3,m2,4,4), requires_grad=True)
        self.strd3 = 1;
        
        # Biases / Thresholds
        self.b1 = nn.Parameter(torch.zeros(1,m1,1,1), requires_grad=True)
        self.b2 = nn.Parameter(torch.zeros(1,m2,1,1), requires_grad=True)
        self.b3 = nn.Parameter(torch.zeros(1,m3,1,1), requires_grad=True)
        
        # Classifier
        self.Wclass = nn.Linear(m3, 10)
        
        # Initialization
        self.W1.data = 0.01 * self.W1.data
        self.W2.data = 0.01 * self.W2.data
        self.W3.data = 0.01 * self.W3.data
        
    def forward(self, x):    
        # Encoding
        gamma1 = F.relu(F.conv2d(x,self.W1, stride = self.strd1) + self.b1)       # first estimation
        gamma2 = F.relu(F.conv2d(gamma1,self.W2, stride = self.strd2) + self.b2) 
        gamma3 = F.relu(F.conv2d(gamma2,self.W3, stride = self.strd3) + self.b3)
        activation1 = gamma1.clone()
        activation1[activation1!=0]=1
        activation2 = gamma1.clone()
        activation2[activation2!=0]=1
        activation3 = gamma3.clone()
        activation3[activation3!=0]=1
        # classifier
        gamma = gamma3.view(gamma3.shape[0],gamma3.shape[1]*gamma3.shape[2]*gamma3.shape[3])
        out = self.Wclass(gamma)
        out = F.log_softmax(out,dim = 1)
        # calculate activations for each layer          
        
        # classifier
        gamma = gamma3.view(gamma3.shape[0],gamma3.shape[1]*gamma3.shape[2]*gamma3.shape[3])
        out = self.Wclass(gamma)
        scores = F.log_softmax(out, dim = 1)
    
        return gamma, scores, activation1, activation2, activation3

## Define joint neural network

In [3]:
class joint_neural_net(nn.Module):
    def __init__(self,m1,m2,m3):
        super(joint_neural_net, self).__init__()
        
        # Convolutional Filters
        self.W1 = nn.Parameter(torch.randn(m1,1,6,6), requires_grad=True)
        self.strd1 = 2;
        self.W2 = nn.Parameter(torch.randn(m2,m1,6,6), requires_grad=True)
        self.strd2 = 2;
        self.W3 = nn.Parameter(torch.randn(m3,m2,4,4), requires_grad=True)
        self.strd3 = 1;
        
        # Biases / Thresholds
        self.b1 = nn.Parameter(torch.zeros(1,m1,1,1), requires_grad=True)
        self.b2 = nn.Parameter(torch.zeros(1,m2,1,1), requires_grad=True)
        self.b3 = nn.Parameter(torch.zeros(1,m3,1,1), requires_grad=True)
        
        # Classifier
        self.Wclass = nn.Linear(m3, 10)
        
        # Initialization
        self.W1.data = 0.01 * self.W1.data
        self.W2.data = 0.01 * self.W2.data
        self.W3.data = 0.01 * self.W3.data
        
        
    def forward(self, x):    
        # Encoding
        gamma1 = F.relu(F.conv2d(x,self.W1, stride = self.strd1) + self.b1)       # first estimation
        gamma2 = F.relu(F.conv2d(gamma1,self.W2, stride = self.strd2) + self.b2) 
        gamma3 = F.relu(F.conv2d(gamma2,self.W3, stride = self.strd3) + self.b3)
        # classifier
        gamma = gamma3.view(gamma3.shape[0],gamma3.shape[1]*gamma3.shape[2]*gamma3.shape[3])
        out = self.Wclass(gamma)
        out = F.log_softmax(out,dim = 1)
        # calculate activations for each layer          
        activation1 = gamma1.clone()
        activation1[activation1!=0]=1
        activation2 = gamma1.clone()
        activation2[activation2!=0]=1
        activation3 = gamma3.clone()
        activation3[activation3!=0]=1
        
        # classifier
        gamma = gamma3.view(gamma3.shape[0],gamma3.shape[1]*gamma3.shape[2]*gamma3.shape[3])
        out = self.Wclass(gamma)
        out = F.log_softmax(out,dim = 1)
    
        return gamma, out, activation1, activation2, activation3
    
    def joint_train(self, x, labels):
        # print("Running joint training")
        # Initialise dics to contain sorted data
        label_bin_data = {"0":[], "1":[], "2":[], "3":[], "4":[], "5":[], "6":[], "7":[], "8":[], "9":[]} # Dictionary of lists of tensors
        data_by_class = {} # Dictionary of tensors
        encoded_by_class = {} # Dictionary of tensors
        scores_by_class = {} # Dictionary of lists
        sorted_labels = np.empty(labels.shape[0])
        index = 0
        # Sort data by its label class into a dictionary of lists which contain the data point tensors
        for i in range(labels.shape[0]):
            label_bin_data[str(int(labels[i].item()))].append(x[i,:,:,:])
        # Turn each list of tensors in the dictionary into a tensor
        first = True
        for key, tensor_list in label_bin_data.items():
            # print(key)
            # print(len(label_bin_data[key]))
            if len(label_bin_data[key]) > 0:
                sorted_labels[index:index+len(label_bin_data[key])] = int(key)*np.ones(len(label_bin_data[key]))
                index = index+len(label_bin_data[key])
                data_by_class[key] = torch.stack(label_bin_data[key], dim=0)
                encoded_by_class[key], scores_by_class[key] = self.joint_forward(data_by_class[key])
                if first == True:
                    scores = scores_by_class[key]
                    first = False
                else:
                    scores = torch.cat((scores, scores_by_class[key]), 0)
        return encoded_by_class, scores, torch.from_numpy(sorted_labels).type(torch.LongTensor)
    

    def joint_forward(self,x):   
        # Encoding
        gamma1 = F.relu(F.conv2d(x,self.W1, stride = self.strd1) + self.b1)       # first estimation
        gamma2 = F.relu(F.conv2d(gamma1,self.W2, stride = self.strd2) + self.b2)
        
        # Encourage joint sparisty in the final layer sparse layer encoding
        X1 = F.conv2d(gamma2,self.W3, stride = self.strd3)
        X1_dims = list(X1.shape)
        X1_mat = X1.view(-1, X1_dims[1])
        st_factors = 1-torch.squeeze(self.b3)*1/(torch.sum(X1_mat**2, dim=0))
        st_factors_mat = torch.diag(st_factors)
        X2_mat = torch.t(torch.mm(st_factors_mat, torch.t(X1_mat)))
        X2 = X2_mat.view(X1_dims[0], X1_dims[1], X1_dims[2], X1_dims[3])    
        gamma3 = F.relu(X2)
        
        # classifier
        gamma = gamma3.view(gamma3.shape[0],gamma3.shape[1]*gamma3.shape[2]*gamma3.shape[3])
        out = self.Wclass(gamma)
        out = F.log_softmax(out,dim = 1)
    
        return gamma, out
    

## Define testing function

In [4]:
def test(model, x, labels):
    # Initialise dics to contain sorted data
    label_bin_data = {"0":[], "1":[], "2":[], "3":[], "4":[], "5":[], "6":[], "7":[], "8":[], "9":[]} # Dictionary of lists of tensors
    data_by_class = {} # Dictionary of tensors
    encoded_by_class = {} # Dictionary of tensors
    scores_by_class = {} # Dictionary of lists
    sorted_labels = np.empty(labels.shape[0])
    index = 0
    
    activations_count1 = {}
    activations_count2 = {}
    activations_count3 = {}
    
    # Sort data by its label class into a dictionary of lists which contain the data point tensors
    for i in range(labels.shape[0]):
        label_bin_data[str(int(labels[i].item()))].append(x[i,:,:,:])
    # Turn each list of tensors in the dictionary into a tensor
    first = True
    for key, tensor_list in label_bin_data.items():
        # print(key)
        # print(len(label_bin_data[key]))
        if len(label_bin_data[key]) > 0:
            sorted_labels[index:index+len(label_bin_data[key])] = int(key)*np.ones(len(label_bin_data[key]))
            index = index+len(label_bin_data[key])
            data_by_class[key] = torch.stack(label_bin_data[key], dim=0)
            encoded_by_class[key], scores_by_class[key], activations1, activations2, activations3 = model(data_by_class[key])
            activations_count1[key] = torch.sum(activations1.view(-1, m1), dim=0)
            activations_count2[key] = torch.sum(activations2.view(-1, m2), dim=0)
            activations_count3[key] = torch.sum(activations3.view(-1, m3), dim=0)
            if first == True:
                scores = scores_by_class[key]
                first = False
            else:
                scores = torch.cat((scores, scores_by_class[key]), 0)
    return encoded_by_class, scores, torch.from_numpy(sorted_labels).type(torch.LongTensor), activations_count1, activations_count2, activations_count3

## Train baseline neural network

In [5]:
Loss_test_nn = np.zeros((EPOCH,))
Acc_test_nn = np.zeros((EPOCH,))

print('\n\t\t\t\t\tTraining Baseline\n')
    
model = neural_net(m1,m2,m3)

activations_count1_nn = {"0": torch.zeros(m1), "1":torch.zeros(m1), "2":torch.zeros(m1), "3":torch.zeros(m1), "4":torch.zeros(m1), "5":torch.zeros(m1), "6":torch.zeros(m1), "7":torch.zeros(m1), "8":torch.zeros(m1), "9":torch.zeros(m1)}
activations_count2_nn = {"0": torch.zeros(m2), "1":torch.zeros(m2), "2":torch.zeros(m2), "3":torch.zeros(m2), "4":torch.zeros(m2), "5":torch.zeros(m2), "6":torch.zeros(m2), "7":torch.zeros(m2), "8":torch.zeros(m2), "9":torch.zeros(m2)}
activations_count3_nn = {"0": torch.zeros(m3), "1":torch.zeros(m3), "2":torch.zeros(m3), "3":torch.zeros(m3), "4":torch.zeros(m3), "5":torch.zeros(m3), "6":torch.zeros(m3), "7":torch.zeros(m3), "8":torch.zeros(m3), "9":torch.zeros(m3)}

if cudaopt:
    model.cuda()

optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001, eps = EPS)
bar = progressbar.ProgressBar()

for epoch in range(EPOCH):

    bar.update((epoch+1)/EPOCH*100)
    # train 1 epoch
    model.train()
    for step, (x, y) in enumerate(train_loader):
        b_x = Variable(x)   # batch x, shape (batch, 28*28)
        b_y = Variable(y)               # batch label
        if cudaopt:
            b_y, b_x = b_y.cuda(), b_x.cuda()
        encoded, scores,_,_,_ = model(b_x)
        loss = F.nll_loss(scores, b_y)      # negative log likelyhood
        optimizer.zero_grad()               # clear gradients for this training step
        loss.backward()                     # backpropagation, compute gradients
        optimizer.step()                    # apply gradients      
            
    # testing
    model.eval()
    correct = 0
    test_loss = 0
    for step, (x, y) in enumerate(test_loader):
        b_x = Variable(x)   # batch x, shape (batch, 28*28)
        b_y = Variable(y)               # batch label
        if cudaopt:
            b_y, b_x = b_y.cuda(), b_x.cuda()
        encoded_by_class, scores, sorted_labels, activations1, activations2, activations3 = test(model, b_x, b_y)
        sorted_labels = sorted_labels.type(torch.cuda.LongTensor)
        for key, tensor_list in activations1.items():
            activations_count1_nn[key] += activations1[key].type(torch.FloatTensor)
            activations_count2_nn[key] += activations2[key].type(torch.FloatTensor)
            activations_count3_nn[key] += activations3[key].type(torch.FloatTensor)
                
        test_loss += F.nll_loss(scores, sorted_labels, size_average=False).data[0]
        pred = scores.data.max(1, keepdim=True)[1]
        correct += pred.eq(b_y.data.view_as(pred)).long().cpu().sum()
        
    test_loss /= len(test_loader.dataset)
    Loss_test_nn[epoch] = test_loss
    Acc_test_nn[epoch] =  100 * float(correct) /float(len(test_loader.dataset))
    
torch.save(model.state_dict(), 'cnn_model.pt')


					Training Baseline



/ |                         #                        | 50 Elapsed Time: 0:00:52

RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1524584710464/work/aten/src/THC/generic/THCStorage.cu:58

## Train Joint Neural Network

In [None]:
Loss_test_jnn = np.zeros((EPOCH,))
Acc_test_jnn = np.zeros((EPOCH,))

print('\n\t\t\t\t\tTraining Baseline\n')
    
model = joint_neural_net(m1,m2,m3)

activations_count1_jnn = {"0": torch.zeros(m1), "1":torch.zeros(m1), "2":torch.zeros(m1), "3":torch.zeros(m1), "4":torch.zeros(m1), "5":torch.zeros(m1), "6":torch.zeros(m1), "7":torch.zeros(m1), "8":torch.zeros(m1), "9":torch.zeros(m1)}
activations_count2_jnn = {"0": torch.zeros(m2), "1":torch.zeros(m2), "2":torch.zeros(m2), "3":torch.zeros(m2), "4":torch.zeros(m2), "5":torch.zeros(m2), "6":torch.zeros(m2), "7":torch.zeros(m2), "8":torch.zeros(m2), "9":torch.zeros(m2)}
activations_count3_jnn = {"0": torch.zeros(m3), "1":torch.zeros(m3), "2":torch.zeros(m3), "3":torch.zeros(m3), "4":torch.zeros(m3), "5":torch.zeros(m3), "6":torch.zeros(m3), "7":torch.zeros(m3), "8":torch.zeros(m3), "9":torch.zeros(m3)}

if cudaopt:
    model.cuda()

optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001, eps = EPS)
bar = progressbar.ProgressBar()

for epoch in range(EPOCH):

    bar.update((epoch+1)/EPOCH*100)
    # train 1 epoch
    model.train()
    for step, (x, y) in enumerate(train_loader):
        b_x = Variable(x)   # batch x, shape (batch, 28*28)
        b_y = Variable(y)               # batch label
        if cudaopt:
            b_y, b_x = b_y.cuda(), b_x.cuda()
        encoded, scores, sorted_labels = model.joint_train(b_x, b_y)
        sorted_labels = sorted_labels.type(torch.cuda.LongTensor)
        loss = F.nll_loss(scores, sorted_labels)      # negative log likelyhood
        optimizer.zero_grad()               # clear gradients for this training step
        loss.backward()                     # backpropagation, compute gradients
        optimizer.step()                    # apply gradients      
            
    # testing
    model.eval()
    correct = 0
    test_loss = 0
    for step, (x, y) in enumerate(test_loader):
        b_x = Variable(x)   # batch x, shape (batch, 28*28)
        b_y = Variable(y)               # batch label
        if cudaopt:
            b_y, b_x = b_y.cuda(), b_x.cuda()
        encoded_by_class, scores, sorted_labels, activations1, activations2, activations3 = test(model, b_x, b_y)
        sorted_labels = sorted_labels.type(torch.cuda.LongTensor)
        for key, tensor_list in activations1.items():
            activations_count1_jnn[key] += activations1[key].type(torch.FloatTensor)
            activations_count2_jnn[key] += activations2[key].type(torch.FloatTensor)
            activations_count3_jnn[key] += activations3[key].type(torch.FloatTensor)
                
        test_loss += F.nll_loss(scores, sorted_labels, size_average=False).data[0]
        pred = scores.data.max(1, keepdim=True)[1]
        correct += pred.eq(b_y.data.view_as(pred)).long().cpu().sum()
        
    test_loss /= len(test_loader.dataset)
    Loss_test_jnn[epoch] = test_loss
    Acc_test_jnn[epoch] =  100 * float(correct) /float(len(test_loader.dataset))
    
torch.save(model.state_dict(), 'cnn_model.pt')

## Accuracy and test performance

In [None]:
fig = plt.figure(figsize=(8,6))
plt.style.use('default')
plt.plot(Acc_test_nn, linewidth = 2,label='Normal')
plt.plot(Acc_test_jnn, linewidth = 2,label = 'Joint')
plt.grid('on')
plt.title('Test Accuracy')
plt.ylabel('Classification accuracy %')
plt.xlabel('# Epochs')
plt.legend()
plt.axis([0, EPOCH-1, 0, 100])
plt.show()

## Histrograms of activations

In [None]:
idx1 = 9
idx2 = 1

plt.figure(figsize=(16,12))
plt.subplot(2,2,1)
plt.bar(np.arange(m3), activations_count3_jnn[str(idx1)].detach().numpy())
plt.title('Joint NN histogram of layer 3 activations - number:' + str(idx1))
plt.xlabel('Node index')
plt.ylabel('# activations')
plt.subplot(2,2,2)
plt.bar(np.arange(m3), activations_count3_nn[str(idx1)].detach().numpy())
plt.title('NN histogram of layer 3 activations - number:' + str(idx1))
plt.xlabel('Node index')
plt.ylabel('# activations')
plt.subplot(2,2,3)
plt.bar(np.arange(m3), activations_count3_jnn[str(idx2)].detach().numpy())
plt.title('Joint NN histogram of layer 3 activations - number:' + str(idx2))
plt.xlabel('Node index')
plt.ylabel('# activations')
plt.subplot(2,2,4)
plt.bar(np.arange(m3), activations_count3_nn[str(idx2)].detach().numpy())
plt.title('NN histogram of layer 3 activations - number:' + str(idx2))
plt.xlabel('Node index')
plt.ylabel('# activations')
plt.show()



## Alternative visualisations of activations