In [1]:
%pylab inline
import scipy.optimize
import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_mldata
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

Populating the interactive namespace from numpy and matplotlib


In [22]:
import pickle

In [3]:
import theano
from theano import tensor as T
from lasagne.updates import sgd, apply_momentum, nesterov_momentum, adagrad, adadelta

In [4]:
# Seed NumPy's global RNG so that the row shuffle below (and hence the
# train/validation split taken from `data` later on) is identical on every
# Restart & Run All. Later draws from np.random (e.g. weight initialisation)
# become repeatable as well.
np.random.seed(0)

# Fetch the 'MNIST original' dataset, using ./data as scikit-learn's data home.
mnist = fetch_mldata('MNIST original', data_home='./data')

# Labels as an (n_samples, 1) column so they can be stacked next to the features.
y_all = mnist.target[:, np.newaxis]

# Constant column of ones with the same shape and dtype as the label column.
intercept = np.ones_like(y_all)

# One row per sample laid out as [intercept | features | label]. Shuffling the
# combined array in place permutes rows while keeping each label attached to
# its own features.
data = np.hstack([intercept, mnist.data, y_all])
np.random.shuffle(data)

In [5]:
def normalize_features(train, test, std_offset=0.1):
    """Standardize features using statistics computed on the train set only.

    Each column of ``train`` is shifted by its mean and divided by its
    standard deviation plus ``std_offset``. The very same shift and scale
    (i.e. the *train* statistics) are then applied to ``test``, so no
    information from the test set leaks into the transformation.

    Parameters
    ----------
    train : array-like with ``.mean(axis=0)`` / ``.std(axis=0)``
        Training samples, one row per sample.
    test : array-like broadcastable against the per-column train statistics
        Samples to transform with the train statistics.
    std_offset : float, optional
        Constant added to every per-column standard deviation before
        dividing. It keeps constant columns (std == 0) from causing a
        division by zero. Defaults to 0.1, the value that used to be
        hard-coded.

    Returns
    -------
    (train_normalized, test_normalized) : tuple
        New arrays; the inputs are not modified.
    """
    train_mean = train.mean(axis=0)
    # The offset guards against zero-variance columns; because of it the
    # resulting train columns have variance slightly below one.
    train_std = train.std(axis=0) + std_offset

    train = (train - train_mean) / train_std
    test = (test - train_mean) / train_std
    return train, test

In [6]:
# Number of (already shuffled) rows used for training; the rest is validation.
train_data_count = 60000

# The last column of `data` holds the label, everything before it is features.
Xt = data[:train_data_count, :-1]
yt = data[:train_data_count, -1:].astype(int)

Xv = data[train_data_count:, :-1]
yv = data[train_data_count:, -1:].astype(int)

# Column 0 is the constant intercept column of ones that was stacked in front
# of the features. Standardizing it would turn it into (1 - 1) / (0 + 0.1) = 0
# for every sample, silently erasing the intercept. So only the remaining
# feature columns are normalized and the untouched intercept column is put
# back in front afterwards.
Xt_scaled, Xv_scaled = normalize_features(Xt[:, 1:], Xv[:, 1:])
Xt = np.hstack([Xt[:, :1], Xt_scaled])
Xv = np.hstack([Xv[:, :1], Xv_scaled])

In [7]:
# Fit the label binarizer on the training labels (fit() returns the fitted
# estimator itself), then encode those same labels as indicator rows.
binarizer = preprocessing.LabelBinarizer().fit(yt)
ytb = binarizer.transform(yt)

In [55]:
def floatX(X):
    """Return ``X`` as a NumPy array whose dtype is ``theano.config.floatX``."""
    target_dtype = theano.config.floatX
    return np.asarray(X, dtype=target_dtype)

def init_weights(shape, m=0.01):
    """Create a Theano shared variable filled with scaled Gaussian noise.

    ``shape`` is unpacked into ``np.random.randn`` (standard-normal samples)
    and every sample is multiplied by ``m`` before being cast with
    ``floatX`` and wrapped in ``theano.shared``.
    """
    noise = np.random.randn(*shape)
    scaled = noise * m
    return theano.shared(floatX(scaled))

def model(X, W, B):
    """Build the symbolic forward pass of a fully connected network.

    ``W`` and ``B`` are parallel sequences of per-layer weights and biases.
    Every layer except the last applies ``max(h . w + b, 0)``; the last
    layer applies the affine map followed by a softmax, whose result is
    returned.
    """
    activation = X
    hidden_layers = zip(W[:-1], B[:-1])
    for weights, bias in hidden_layers:
        pre_activation = T.dot(activation, weights) + bias
        # Rectifier non-linearity. A hand-written tanh,
        # (exp(z) - exp(-z)) / (exp(z) + exp(-z)), was the alternative
        # previously left commented out here.
        activation = T.maximum(pre_activation, 0)
    logits = T.dot(activation, W[-1]) + B[-1]
    return T.nnet.softmax(logits)

In [65]:
# Layer widths: one input unit per feature column, 32 hidden units, 10 outputs.
Layers = [Xt.shape[1], 32, 10]

# Symbolic matrices for a batch of inputs and the matching target rows.
X = T.fmatrix()
Y = T.fmatrix()

# One weight matrix and one bias vector per pair of consecutive layers.
# Weights and biases are created alternately, layer by layer, so the random
# draws made inside init_weights happen in the order W0, B0, W1, B1, ...
W, B = [], []
for n_in, n_out in zip(Layers, Layers[1:]):
    W.append(init_weights((n_in, n_out)))
    B.append(init_weights((n_out,), m=1))

# Network output and the mean categorical cross-entropy against the targets.
hwx = model(X, W, B)
per_sample_loss = T.nnet.categorical_crossentropy(hwx, Y)
cost = T.mean(per_sample_loss)

# Everything the optimizer is allowed to change: all weights, then all biases.
params = W + B

# Nesterov-momentum update rules; plain `sgd(cost, params, learning_rate=0.05)`
# was the alternative tried here before.
updates = nesterov_momentum(cost, params, learning_rate=0.05)

# Compiled callables: `predict` returns the network output for a batch,
# `train` returns the cost and applies `updates` as a side effect.
predict = theano.function(
    inputs=[X],
    outputs=hwx,
    allow_input_downcast=True,
)
train = theano.function(
    inputs=[X, Y],
    outputs=cost,
    updates=updates,
    allow_input_downcast=True,
)


In [25]:
# Replace the update rules with plain SGD (learning rate 0.08) and rebuild
# both compiled functions so that `train` applies the new rules.
updates = sgd(cost, params, learning_rate=0.08)

predict = theano.function(
    inputs=[X],
    outputs=hwx,
    allow_input_downcast=True,
)
train = theano.function(
    inputs=[X, Y],
    outputs=cost,
    updates=updates,
    allow_input_downcast=True,
)

In [68]:
%%time
for i in range(1000):
    cost = train(Xt, ytb)
    if i % 10 == 0:
        ytp = binarizer.inverse_transform(predict(Xt))
        print(i, 'Train acc', accuracy_score(yt, ytp), cost)

0 Train acc 0.9747 0.09103312360619488
10 Train acc 0.974883333333 0.0905296149974597
20 Train acc 0.975033333333 0.09003130004774981
30 Train acc 0.97515 0.08953801985657879
40 Train acc 0.975233333333 0.08905127552207501
50 Train acc 0.975383333333 0.08857117892028628
60 Train acc 0.975566666667 0.08809817466539215
70 Train acc 0.975716666667 0.08763331761964013
80 Train acc 0.975816666667 0.08717531140552698
90 Train acc 0.975916666667 0.08672384808874312
100 Train acc 0.976083333333 0.08627693582980304
110 Train acc 0.97625 0.0858344724347057
120 Train acc 0.97635 0.08539592132309573
130 Train acc 0.976416666667 0.08496023318066206
140 Train acc 0.9765 0.08452987419289588
150 Train acc 0.976483333333 0.0841050824625499
160 Train acc 0.97655 0.08368513798432844
170 Train acc 0.976733333333 0.08327012713256803
180 Train acc 0.9768 0.08286021566763188
190 Train acc 0.976916666667 0.08245435581485278
200 Train acc 0.977083333333 0.08205273788096352
210 Train acc 0.977233333333 0.081656

In [52]:
# Persist the weight and bias lists together as one pickled tuple
# (pickle protocol 4), written in binary mode to weights.pickle.
with open('weights.pickle', 'wb') as weights_file:
    pickle.dump((W, B), weights_file, protocol=4)

In [69]:
# Accuracy on the training set: network output -> labels -> score.
train_output = predict(Xt)
ytp = binarizer.inverse_transform(train_output)
train_accuracy = accuracy_score(yt, ytp)
print('Train acc', train_accuracy)

# Same procedure on the held-out validation set.
valid_output = predict(Xv)
yvp = binarizer.inverse_transform(valid_output)
valid_accuracy = accuracy_score(yv, yvp)
print('Valid acc', valid_accuracy)

Train acc 0.9845
Valid acc 0.9529
