# Kervolution Again

Attempt to replicate results from: https://arxiv.org/abs/1904.03955

In [1]:
import torch
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
## eh, dude?


import torchvision
import torchvision.transforms as transforms

## KResNet: trying to get it working

Kernel layer code adapted from: https://github.com/gan3sh500/kervolution-pytorch/blob/master/layer.py

In [2]:
# The bias is reshaped to (1, out_channels, 1) before it is added; adding the
# raw 1-D bias broadcast it along the patch axis and failed.
class LinearKernel(nn.Module):
    """Linear kernel: inner product of every unfolded patch with every filter."""

    def __init__(self):
        super(LinearKernel, self).__init__()

    def forward(self, x_unf, w, b):
        """Apply the filters to unfolded input patches.

        Args:
            x_unf: tensor of shape (batch, patch_size, num_patches), the
                layout KernelConv2d obtains from ``nn.functional.unfold``.
            w: filter tensor whose first dimension is the number of output
                channels; the remaining dimensions are flattened and must
                contain ``patch_size`` elements.
            b: bias holding one value per output channel, or None.

        Returns:
            Tensor of shape (batch, out_channels, num_patches).
        """
        filters = w.view(w.size(0), -1)  # (out_channels, patch_size)
        t = x_unf.transpose(1, 2).matmul(filters.t()).transpose(1, 2)
        if b is not None:
            # One bias value per output channel, shared over batch and patches.
            return t + b.view(1, -1, 1)
        return t

In [3]:
class MyLinearKernel(nn.Module):
    """Plain affine map ``x_unf @ w.T + b`` for a 2-D weight matrix."""

    def __init__(self):
        # Must name this class: super(LinearKernel, self) raises TypeError
        # because MyLinearKernel does not derive from LinearKernel.
        super(MyLinearKernel, self).__init__()

    def forward(self, x_unf, w, b):
        """Compute the affine map.

        Args:
            x_unf: input tensor whose last dimension matches ``w.size(1)``.
            w: weight matrix with at most two dimensions (``Tensor.t()``
                rejects higher-rank tensors).
            b: bias broadcastable against the product, or None.

        Returns:
            ``x_unf.matmul(w.t())`` with ``b`` added when it is given.
        """
        output = x_unf.matmul(w.t())
        if b is not None:
            # Out-of-place add so broadcasting may enlarge the result.
            output = output + b
        return output

**Polynomial Kervolution**

K<sub>p</sub>(X,W) = (X<sup>T</sup> W + c<sub>p</sub>)<sup>d<sub>p</sub></sup>

In [4]:
class PolynomialKernel(LinearKernel):
    """Polynomial kernel: ``(linear_response + cp) ** dp``.

    Args:
        cp: initial value of the additive constant.
        dp: exponent applied to the shifted linear response.
        train_cp: whether ``cp`` is updated by the optimizer.
    """

    def __init__(self, cp=2.0, dp=3, train_cp=True):
        super(PolynomialKernel, self).__init__()
        # requires_grad has to be given to Parameter itself: Parameter defaults
        # to requires_grad=True and overrides the flag of the wrapped tensor,
        # so train_cp=False was silently ignored. float() keeps an integer cp
        # (e.g. cp=2) from producing an integer tensor, which cannot carry
        # gradients.
        self.cp = nn.parameter.Parameter(torch.tensor(float(cp)),
                                         requires_grad=train_cp)
        self.dp = dp

    def forward(self, x_unf, w, b):
        """Raise the shifted linear response to the power ``dp``.

        Arguments are passed unchanged to ``LinearKernel.forward``.
        """
        linear = super(PolynomialKernel, self).forward(x_unf, w, b)
        return (self.cp + linear) ** self.dp

In [5]:
class GaussianKernel(nn.Module):
    """Gaussian (RBF) kernel: ``exp(-gamma * ||patch - filter||^2)``.

    Args:
        gamma: initial value of the trainable bandwidth parameter.
    """

    def __init__(self, gamma=1.0):
        super(GaussianKernel, self).__init__()
        # float() keeps an integer gamma from producing an integer tensor,
        # which cannot be a trainable parameter.
        self.gamma = torch.nn.parameter.Parameter(torch.tensor(float(gamma)))

    def forward(self, x_unf, w, b):
        """Evaluate the kernel between every unfolded patch and every filter.

        Args:
            x_unf: tensor of shape (batch, patch_size, num_patches).
            w: filter tensor whose first dimension is the number of output
                channels; the remaining dimensions are flattened and must
                contain ``patch_size`` elements.
            b: bias holding one value per output channel, or None.

        Returns:
            Tensor of shape (batch, out_channels, num_patches).
        """
        filters = w.view(w.size(0), -1)  # (out_channels, patch_size)
        # ||x - w||^2 = ||x||^2 + ||w||^2 - 2 <x, w>. Expanding the square
        # avoids building the (batch, patches, patch_size, out) difference
        # tensor and yields the (batch, out, patches) layout directly.
        patch_sq = (x_unf ** 2).sum(dim=1, keepdim=True)   # (batch, 1, patches)
        filter_sq = (filters ** 2).sum(dim=1).view(1, -1, 1)  # (1, out, 1)
        cross = filters.matmul(x_unf)                      # (batch, out, patches)
        # Clamp: rounding can push a true zero distance slightly negative.
        sq_dist = (patch_sq + filter_sq - 2 * cross).clamp(min=0)
        t = torch.exp(-self.gamma * sq_dist)
        if b is not None:
            # One bias value per output channel, shared over batch and patches.
            return t + b.view(1, -1, 1)
        return t

In [6]:
class KernelConv2d(nn.Conv2d):
    def __init__(self, in_channels, out_channels, kernel_size, kernel_fn=PolynomialKernel,
                 stride=1, padding=0, dilation=1, groups=1, bias=None,
                 padding_mode='zeros'):
        '''
        Follows the same API as torch Conv2d except kernel_fn.

        kernel_fn may be a kernel class (or any zero-argument factory), which
        is called once to build the kernel module, or an already constructed
        nn.Module instance, which is used as is.

        Note: forward() extracts patches with nn.functional.unfold and
        flattens the whole weight tensor per output channel; it never reads
        `groups` or `padding_mode`, so only the defaults (groups=1, 'zeros')
        match what this layer actually computes.
        '''
        super(KernelConv2d, self).__init__(in_channels, out_channels,
                                           kernel_size, stride, padding,
                                           dilation, groups, bias, padding_mode)
        if isinstance(kernel_fn, nn.Module):
            self.kernel_fn = kernel_fn
        else:
            self.kernel_fn = kernel_fn()

    def compute_shape(self, x):
        '''
        Return the (height, width) of the output feature map for input x,
        which is indexed as (batch, channels, height, width).

        Uses the same formula as nn.functional.unfold, including dilation, so
        the result matches the number of patches unfold produces.
        '''
        h = (x.shape[2] + 2 * self.padding[0]
             - self.dilation[0] * (self.kernel_size[0] - 1) - 1) // self.stride[0] + 1
        w = (x.shape[3] + 2 * self.padding[1]
             - self.dilation[1] * (self.kernel_size[1] - 1) - 1) // self.stride[1] + 1
        return h, w

    def forward(self, x):
        '''
        Unfold x into patches, apply the kernel function against the layer's
        weight and bias, and fold the result back into a
        (batch, channels, height, width) feature map.
        '''
        x_unf = nn.functional.unfold(x, self.kernel_size, dilation=self.dilation,
                                     padding=self.padding, stride=self.stride)
        h, w = self.compute_shape(x)
        response = self.kernel_fn(x_unf, self.weight, self.bias)
        return response.view(x.shape[0], -1, h, w)

In [7]:
#temp_conv = KernelConv2d(3,32,3,PolynomialKernel)

In [8]:
# network architecture from: https://github.com/wang-chen/KervNets

class BasicKervBlock(nn.Module):
    """Residual block made of two 3x3 kervolution + batch-norm layers.

    The input is added back through a shortcut. The shortcut is an empty
    ``nn.Sequential`` unless the stride or the channel count changes, in
    which case it is a 1x1 kervolution followed by batch norm.
    """

    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        self.kerv1 = KernelConv2d(in_planes, planes, kernel_size=3,
                                  stride=stride, padding=1, bias=None)
        self.bn1 = nn.BatchNorm2d(planes)
        self.kerv2 = KernelConv2d(planes, planes, kernel_size=3,
                                  stride=1, padding=1, bias=None)
        self.bn2 = nn.BatchNorm2d(planes)

        shape_changes = stride != 1 or in_planes != out_planes
        if shape_changes:
            # Project the input so it can be added to the main path.
            self.shortcut = nn.Sequential(
                KernelConv2d(in_planes, out_planes, kernel_size=1,
                             stride=stride, bias=None),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        hidden = F.relu(self.bn1(self.kerv1(x)))
        summed = self.bn2(self.kerv2(hidden))
        summed += self.shortcut(x)
        return F.relu(summed)

In [9]:
class KResNet(nn.Module):
    """ResNet-style classifier with a kervolution stem and four block stages.

    Args:
        block: block class; called as ``block(in_planes, planes, stride)`` and
            expected to expose an ``expansion`` class attribute.
        num_blocks: sequence with the number of blocks in each of the four stages.
        num_classes: size of the final linear layer's output.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        # Channel count flowing into the next block; advanced by _make_layer.
        self.in_planes = 64

        self.kerv1 = KernelConv2d(3, 64, 3, stride=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage: the first block uses ``stride``, the rest stride 1."""
        stage = []
        for index in range(num_blocks):
            block_stride = stride if index == 0 else 1
            stage.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return the class logits for the image batch ``x``."""
        out = F.relu(self.bn1(self.kerv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = F.relu(stage(out))
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)  # flatten everything but the batch axis
        return self.linear(out)
    
    
def KResNet18(num_classes=10):
    """Build a KResNet with two BasicKervBlocks in each of its four stages."""
    blocks_per_stage = [2] * 4
    return KResNet(BasicKervBlock, blocks_per_stage, num_classes=num_classes)

In [10]:
## Test network: a random batch of 8 CIFAR-sized images should give 8 rows of 10 logits

net = KResNet18()
# Tensors can be passed directly; the Variable wrapper has been deprecated
# since PyTorch 0.4 and simply returns the tensor.
y = net(torch.randn(8, 3, 32, 32))
print(y.size())

torch.Size([8, 10])


## Training an image classifier

We will do the following steps in order:

1. Load and normalize the CIFAR10 training and test datasets using torchvision
2. Define a Neural Network
3. Define a loss function
4. Train the network on the training data
5. Test the network on the test data

## 1. Loading and normalizing CIFAR10

In [11]:


# Convert images to tensors, then shift/scale every channel with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

batch_size = 256
num_workers = 4

# Training split: reshuffled on every pass.
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

# Test split: kept in a fixed order.
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

# Human-readable class names, indexed by label id.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

Files already downloaded and verified
Files already downloaded and verified


Let us show some of the training images

In [12]:
import matplotlib.pyplot as plt
import numpy as np

# functions to show an image


def imshow(img):
    """Undo the 0.5/0.5 normalization and display a channel-first image tensor."""
    restored = img / 2 + 0.5  # unnormalize
    pixels = restored.numpy()
    # matplotlib expects channels last: (C, H, W) -> (H, W, C)
    plt.imshow(pixels.transpose(1, 2, 0))
    plt.show()


# get some random training images
dataiter = iter(trainloader)
# Use the built-in next(): DataLoader iterators only implement __next__, and
# the old Python-2-style .next() alias is gone in current PyTorch releases.
images, labels = next(dataiter)

# show images (the grid contains the whole batch)
imshow(torchvision.utils.make_grid(images))
# print labels of the first four images in the batch
print(' '.join('%5s' % classes[labels[j]] for j in range(4)))

<Figure size 640x480 with 1 Axes>

 ship plane truck  frog


## 2. Define a Neural Network

In [13]:
# Fresh, untrained network for training; rebinds `net` from the earlier smoke test.
net = KResNet18()

## 3. Define a Loss function and optimizer

In [14]:
import torch.optim as optim

# Cross-entropy loss on the network's class scores.
criterion = nn.CrossEntropyLoss()

# Hyperparameters from the Kervolutional Neural Networks article (5.2):
#
# The learning rate is set to 0.1, and is reduced every 30 epochs. Also, a weight decay of 10^-4
# and a momentum of 0.9 without dampening are employed.
#
# NOTE: no learning-rate scheduler is created in this cell, so the rate stays at 0.1.
optimizer = optim.SGD(
    net.parameters(),
    lr=0.1,
    momentum=0.9,
    weight_decay=1e-4,
)

## 4. Train the network (GPU, because very slow on CPU)

We have to loop over our data iterator, and feed the inputs to the network and optimize.

In [15]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:
print(device)

cuda:0


In [16]:
# Move the network to the selected device; the returned module is echoed as the cell output.
net.to(device)

KResNet(
  (kerv1): KernelConv2d(
    3, 64, kernel_size=(3, 3), stride=(1, 1), bias=False
    (kernel_fn): PolynomialKernel()
  )
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer1): Sequential(
    (0): BasicKervBlock(
      (kerv1): KernelConv2d(
        64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
        (kernel_fn): PolynomialKernel()
      )
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (kerv2): KernelConv2d(
        64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
        (kernel_fn): PolynomialKernel()
      )
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (shortcut): Sequential()
    )
    (1): BasicKervBlock(
      (kerv1): KernelConv2d(
        64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
        (kernel_fn): PolynomialKernel()
      )
      (bn1): Batch

In [17]:
for epoch in range(5):  # five full passes over the training set

    running_loss = 0.0
    for i, data in enumerate(trainloader):
        # unpack the batch and move both tensors to the training device
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)

        # clear gradients left over from the previous step
        optimizer.zero_grad()

        # forward pass, loss, backward pass, parameter update
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # report the mean loss of each completed window of 100 mini-batches
        running_loss += loss.item()
        if (i + 1) % 100 == 0:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0

print('Finished Training')

[1,   100] loss: 2.336
[2,   100] loss: 2.308
[3,   100] loss: 2.306
[4,   100] loss: 2.302
[5,   100] loss: 2.301
Finished Training


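**Network doesn't seem able to learn**

The loss hangs around 2.3, which is ln(10) ≈ 2.303: the cross-entropy of a uniform guess over the 10 CIFAR-10 classes. In other words, the network is doing no better than chance.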