## 5 Convolutional layers

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math

import torch
import torch.optim
import torch.functional as F

import torchvision
import torchvision.datasets as dset
import torchvision.transforms as transforms

from torch.nn.functional import conv2d, max_pool2d

In [83]:
mb_size = 100 # mini-batch size of 100


trans = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5,), (0.5,))])

dataset = dset.MNIST("./", download = True,
                     train = True,
                     transform = trans)


dataloader = torch.utils.data.DataLoader(dataset, batch_size=mb_size,
                                         shuffle=True, num_workers=1,
                                         pin_memory=True)


In [86]:
def dropout(X,p_drop=1):
    if p_drop<0 or p_drop>1:
        return X
    mask=np.random.binomial(1,p_drop,X.shape)#np.where(np.random.random(X.shape)<p_drop)
    X = X * torch.tensor(mask).type(torch.FloatTensor)
    return X/p_drop

def PRelu (X,a):
    z = torch.tensor(X, requires_grad=True)
    z = z*a[0]
    z[z >= 0] = X[z >= 0]
    return z

def init_weights(shape):
    # xavier initialization (a good initialization is important!)
    # http://andyljones.tumblr.com/post/110998971763/an-explanation-of-xavier-initialization
    fan_in = shape[0]
    fan_out = shape[1]
    variance = 2.0/(fan_in + fan_out)
    w = torch.randn(size=shape)*np.sqrt(variance)
    w.requires_grad = True
    return w

def rectify(X):
    return torch.max(torch.zeros_like(X), X)


# you can also use torch.nn.functional.softmax on future sheets
def softmax(X):
    c = torch.max(X, dim=1)[0].reshape(mb_size, 1)
    # this avoids a blow up of the exponentials
    # but calculates the same formula
    stabelized = X-c
    exp = torch.exp(stabelized)
    return exp/torch.sum(exp, dim=1).reshape(mb_size, 1)


# this is an example as a reduced version of the pytorch internal RMSprop optimizer
class RMSprop(torch.optim.Optimizer):
    def __init__(self, params, lr=1e-3, alpha=0.9, eps=1e-8):
        defaults = dict(lr=lr, alpha=alpha, eps=eps)
        super(RMSprop, self).__init__(params, defaults)

    def step(self):
        for group in self.param_groups:
            for p in group['params']:
                grad = p.grad.data
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['square_avg'] = torch.zeros_like(p.data)

                square_avg = state['square_avg']
                alpha = group['alpha']

                # update running averages
                square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad)
                avg = square_avg.sqrt().add_(group['eps'])

                # gradient update
                p.data.addcdiv_(-group['lr'], grad, avg)


def model(X, w_conv1, w_h2, w_o, a_2, p_drop_input, p_drop_hidden):
    X = X.reshape(-1, 1, 28, 28)
    X = dropout(X, p_drop_input)
    #h = rectify(X @ w_h)
    for i in range(len(w_conv1)):
        convolutional_layer = rectify(conv2d(X, w_conv1[i] ))
        subsample_layer = max_pool2d(convolutional_layer, (2, 2)) # reduces window 2x2 to 1 pixel
        X = dropout(subsample_layer, p_drop_input )
    
    X = X.reshape((128))
    #h2 = rectify(h @ w_h2)
    h2 = PRelu(X @ w_h2,a_2)
    h2 = dropout(h2, p_drop_hidden)
    pre_softmax = h2 @ w_o
    return pre_softmax

In [85]:
w_1 = init_weights((32, 1, 5, 5))
w_2 = init_weights((64, 32, 5, 5))
w_3 = init_weights((128, 64, 2, 2))
w_conv1 = [w_1, w_2, w_3]

w_h2 = init_weights((128, 625))
w_o = init_weights((625, 10))
#prelu constant
a = init_weights((1,128))
a_2 = init_weights((1,625))
optimizer = RMSprop([*w_conv1, w_h2, w_o, a_2])

# put this into a training loop over 100 epochs
for (_, (X, y)) in enumerate(dataloader, 0):
    optimizer.zero_grad()
    noise = torch.tensor([])
    for x in X:
        noise = torch.cat((noise,model( x.reshape(784), w_conv1, w_h2, w_o, a_2, 0.8, 0.7))) #TODO: use batches
    noise = noise.reshape((mb_size,10))
    cost = torch.nn.functional.cross_entropy(noise, y)
    cost.backward()
    print("Loss: {}".format(cost))
    optimizer.step()

  


Loss: 9.076483726501465
Loss: 21.814857482910156
Loss: 13.486246109008789
Loss: 7.855291366577148
Loss: 5.60434627532959
Loss: 5.221518516540527
Loss: 4.083083152770996
Loss: 3.5060172080993652
Loss: 3.0871341228485107
Loss: 2.9395751953125
Loss: 3.1969616413116455
Loss: 2.980170249938965
Loss: 2.7726821899414062
Loss: 2.466749668121338
Loss: 2.4642858505249023
Loss: 2.626936435699463
Loss: 2.359334707260132
Loss: 2.495028495788574
Loss: 2.4967434406280518
Loss: 2.5585427284240723
Loss: 2.4977221488952637
Loss: 2.146055221557617
Loss: 2.2822954654693604
Loss: 2.1539268493652344
Loss: 2.3975000381469727
Loss: 2.170358657836914
Loss: 1.8855329751968384
Loss: 2.340444803237915
Loss: 2.322434902191162
Loss: 2.27750301361084
Loss: 2.1462085247039795
Loss: 2.4409046173095703
Loss: 2.173182487487793
Loss: 2.0850517749786377
Loss: 2.035439968109131
Loss: 1.958567500114441
Loss: 2.4515392780303955
Loss: 2.1142237186431885
Loss: 1.8617095947265625
Loss: 1.9060800075531006
Loss: 1.865603685379028

KeyboardInterrupt: 