In [207]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms

import numpy as np
import matplotlib.pyplot as plt

from scipy.spatial.distance import pdist
from sklearn.cluster import AgglomerativeClustering

import networkx as nx

from scipy.cluster.hierarchy import dendrogram, linkage

In [208]:
# Hyperparameters
cuda = True
epochs = 10
log_interval = 400
batchSize = 16

hidden_layer_sizes = [3 * 32 * 32, 256, 10]

lr = 1e-3

In [209]:
def softmax(input, axis=1):
    input_size = input.size()
    
    trans_input = input.transpose(axis, len(input_size)-1)
    trans_size = trans_input.size()

    input_2d = trans_input.contiguous().view(-1, trans_size[-1])
    
    soft_max_2d = F.softmax(input_2d)
    
    soft_max_nd = soft_max_2d.view(*trans_size)
    return soft_max_nd.transpose(axis, len(input_size)-1)

In [210]:
class Attention(nn.Module):
    def __init__(self, dim):
        super(Attention, self).__init__()
        self.coefs = nn.Parameter(torch.FloatTensor(dim, dim))
        self.coefs.data.normal_(0, 1.)
        self.normal_coefs = torch.exp(-self.coefs) / torch.exp(-self.coefs).sum(1).repeat(1, dim)
    
    def forward(self, X):
        return F.linear(X, weight=self.normal_coefs)
        
    

In [211]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(hidden_layer_sizes[i], hidden_layer_sizes[i+1]) for i in range(len(hidden_layer_sizes) - 1)])
        self.attentions = nn.ParameterList([nn.Parameter(torch.FloatTensor(hidden_layer_sizes[i], hidden_layer_sizes[i])) for i in range(1, len(hidden_layer_sizes) - 1)])
        for i, l in enumerate(self.attentions):
            self.attentions[i].data.copy_(torch.eye(self.attentions[i].size(0)))
    
    def clamp(self):
        for i, l in enumerate(self.attentions):
            self.attentions[i].data.clamp_(0., 1.)
            self.attentions[i].data.div_(self.attentions[i].sum(1).repeat(1, self.attentions[i].size(0)).data)
            
    def forward(self, x):
        activations = 0
        x = x.view(-1, 3 * 32 * 32)
        for i, l in enumerate(self.linears):
            x = F.relu(l(x))
            if i < len(self.attentions):
                x = F.linear(x, weight=self.attentions[i])
            if i + 2 == len(hidden_layer_sizes):
                break
            if type(activations) is np.ndarray:
                activations = np.append(activations, np.expand_dims(torch.sign(x).cpu().data.numpy(), axis=1), axis=1)
            else:
                activations = np.expand_dims(torch.sign(x).cpu().data.numpy(), axis=1)
        return x, activations
    
model = Net()
print(model)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD([
                {'params': model.linears.parameters()},
                {'params': model.attentions.parameters(), 'lr': 1e-2}
            ], lr=lr, momentum=0.9)

Net (
  (linears): ModuleList (
    (0): Linear (3072 -> 256)
    (1): Linear (256 -> 10)
  )
  (attentions): ParameterList (
  )
)


In [212]:
model.clamp()

In [213]:
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batchSize,
                                          shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batchSize,
                                         shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified
Files already downloaded and verified


In [214]:
if cuda:
    model.cuda()
    criterion.cuda()

In [215]:
def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(trainloader):
        if cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        output, activations = model(data)
        loss = criterion(output, target) + 1000 * model.attentions[0].abs().mean()
        loss.backward()
        optimizer.step()
        model.clamp()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(trainloader.dataset),
                100. * batch_idx / len(trainloader), loss.data[0]))

def test(epoch):
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in testloader:
        if cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data, volatile=True), Variable(target)
        output, activations = model(data)
        test_loss += criterion(output, target).data[0]
        pred = output.data.max(1)[1] # get the index of the max log-probability
        correct += pred.eq(target.data).cpu().sum()

    test_loss = test_loss
    test_loss /= len(testloader) # loss function already averages over batch size
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(testloader.dataset),
        100. * correct / len(testloader.dataset)))
    
def activation_metrics():
    metrics = Constellation()
    for batch_idx, (data, target) in enumerate(trainloader):
        if cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        output, activations = model(data)
        metrics.train_add(activations, target.cpu().data.numpy())
        if batch_idx > 1000:
            break
#     for batch_idx, (data, target) in enumerate(testloader):
#         if cuda:
#             data, target = data.cuda(), target.cuda()
#         data, target = Variable(data), Variable(target)
#         output, activations = model(data)
#         metrics.test_add(activations, target.cpu().data.numpy())
        
    metrics.print_metrics()

In [216]:
# activation_metrics()

for epoch in range(1, epochs + 1):
    train(epoch)
    test(epoch)


Test set: Average loss: 1.5744, Accuracy: 4443/10000 (44%)


Test set: Average loss: 1.4638, Accuracy: 4922/10000 (49%)


Test set: Average loss: 1.4223, Accuracy: 5020/10000 (50%)


Test set: Average loss: 1.4007, Accuracy: 5104/10000 (51%)


Test set: Average loss: 1.3555, Accuracy: 5273/10000 (53%)


Test set: Average loss: 1.3613, Accuracy: 5204/10000 (52%)


Test set: Average loss: 1.3389, Accuracy: 5348/10000 (53%)


Test set: Average loss: 1.3453, Accuracy: 5318/10000 (53%)


Test set: Average loss: 1.3634, Accuracy: 5219/10000 (52%)


Test set: Average loss: 1.3611, Accuracy: 5270/10000 (53%)



In [217]:
plt.hist(model.attentions[0].squeeze().cpu().data.numpy(), bins=np.arange(0., 1., 0.01))
plt.show()

KeyboardInterrupt: 

In [218]:
model.attentions[0].squeeze().cpu().data.numpy()

array([[  9.32353318e-01,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   9.91017044e-01,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   9.99983966e-01, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   1.16491329e-03, ...,
          9.46410537e-01,   0.00000000e+00,   0.00000000e+00],
       [  6.58738194e-04,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   9.84207928e-01,   0.00000000e+00],
       [  1.33649027e-03,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   9.70637798e-01]], dtype=float32)