In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math

import torch
import torch.optim
import torch.functional as F

import torchvision
import torchvision.datasets as dset
import torchvision.transforms as transforms

from torch.nn.functional import conv2d, max_pool2d

In [2]:
mb_size = 100 # mini-batch size of 100


trans = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5,), (0.5,))])

dataset = dset.MNIST("./", download = True,
                     train = True,
                     transform = trans)


dataloader = torch.utils.data.DataLoader(dataset, batch_size=mb_size,
                                         shuffle=True, num_workers=1,
                                         pin_memory=True)


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Processing...
Done!


In [9]:
def init_weights(shape):
    # xavier initialization (a good initialization is important!)
    # http://andyljones.tumblr.com/post/110998971763/an-explanation-of-xavier-initialization
    fan_in = shape[0]
    fan_out = shape[1]
    variance = 2.0/(fan_in + fan_out)
    w = torch.randn(size=shape)*np.sqrt(variance)
    w.requires_grad = True
    return w

def rectify(X):
    return torch.max(torch.zeros_like(X), X)


# you can also use torch.nn.functional.softmax on future sheets
def softmax(X):
    c = torch.max(X, dim=1)[0].reshape(mb_size, 1)
    # this avoids a blow up of the exponentials
    # but calculates the same formula
    stabelized = X-c
    exp = torch.exp(stabelized)
    return exp/torch.sum(exp, dim=1).reshape(mb_size, 1)


# this is an example as a reduced version of the pytorch internal RMSprop optimizer
class RMSprop(torch.optim.Optimizer):
    def __init__(self, params, lr=1e-3, alpha=0.9, eps=1e-8):
        defaults = dict(lr=lr, alpha=alpha, eps=eps)
        super(RMSprop, self).__init__(params, defaults)

    def step(self):
        for group in self.param_groups:
            for p in group['params']:
                grad = p.grad.data
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['square_avg'] = torch.zeros_like(p.data)

                square_avg = state['square_avg']
                alpha = group['alpha']

                # update running averages
                square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad)
                avg = square_avg.sqrt().add_(group['eps'])

                # gradient update
                p.data.addcdiv_(-group['lr'], grad, avg)


def model(X, w_h, w_h2, w_o,a, p_drop_input, p_drop_hidden):
    X = dropout(X, p_drop_input)
    #h = rectify(X @ w_h)
    h = PRelu(X @ w_h,a)
    h = dropout(h, p_drop_hidden)
    #h2 = rectify(h @ w_h2)
    h2 = PRelu(h @ w_h2,a_2)
    h2 = dropout(h2, p_drop_hidden)
    pre_softmax = h2 @ w_o
    return pre_softmax

In [17]:
w_h = init_weights((784, 625))
w_h2 = init_weights((625, 625))
w_o = init_weights((625, 10))
#prelu constant
a = init_weights((1,625))
a_2 = init_weights((1,625))
optimizer = RMSprop([w_h, w_h2, w_o, a])

# put this into a training loop over 100 epochs
for (_, (X, y)) in enumerate(dataloader, 0):
    optimizer.zero_grad()
    noise_py_x = model(X.reshape(mb_size, 784), w_h, w_h2, w_o,a, 0.8, 0.7)
    cost = torch.nn.functional.cross_entropy(noise_py_x, y)
    cost.backward()
    #print("Loss: {}".format(cost))
    optimizer.step()

  


Loss: 3.3084473609924316
Loss: 20.16912078857422
Loss: 21.996244430541992
Loss: 10.791604995727539
Loss: 9.43410587310791
Loss: 7.064802646636963
Loss: 5.596531867980957
Loss: 2.4379665851593018
Loss: 2.483964681625366
Loss: 1.2743006944656372
Loss: 1.1285089254379272
Loss: 0.9836878776550293
Loss: 0.7834575176239014
Loss: 0.8933309316635132
Loss: 0.9158243536949158
Loss: 0.8274177312850952
Loss: 0.6991760730743408
Loss: 0.7600440979003906
Loss: 0.9970940351486206
Loss: 1.0137861967086792
Loss: 1.0761635303497314
Loss: 1.0505982637405396
Loss: 0.8884304165840149
Loss: 1.234202265739441
Loss: 0.9113507270812988
Loss: 0.9358523488044739
Loss: 0.880368709564209
Loss: 0.8387554287910461
Loss: 0.9210190773010254
Loss: 0.7196834683418274
Loss: 0.640656054019928
Loss: 0.7970659136772156
Loss: 1.0960174798965454
Loss: 1.3423253297805786
Loss: 1.5828328132629395
Loss: 0.810356080532074
Loss: 0.6107978820800781
Loss: 0.659767746925354
Loss: 0.7076742053031921
Loss: 0.8368184566497803
Loss: 1.233

KeyboardInterrupt: 

## 3 Dropout

In [5]:
def dropout(X,p_drop=1):
    if p_drop<0 or p_drop>1:
        return X
    mask=np.random.binomial(1,p_drop,X.shape)#np.where(np.random.random(X.shape)<p_drop)
    X[mask]=0
    return X/p_drop

the dropout method sets randomly a certain percentage of the weights to zero. This reduces overfitting as it prevents coadaptive learning. But the implementation here makes the code run way slower.

## 4 Parametric Relu

In [16]:
def PRelu (X,a):
    z = torch.tensor(X, requires_grad=True)
    z = z*a
    z[z >= 0] = X[z >= 0]
    return z
#TODO include a in params