In [1]:
%pylab inline
import scipy.optimize
import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_mldata
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

Populating the interactive namespace from numpy and matplotlib


In [38]:
import theano
from theano import tensor as T
from lasagne.updates import sgd, apply_momentum, nesterov_momentum, adagrad, adadelta

In [3]:
# Load MNIST (using ./data as scikit-learn's data_home) and build one matrix
# laid out as [intercept | pixel features | label], so rows can be shuffled
# without separating the features from their labels.
# NOTE(review): fetch_mldata was deprecated and later removed from
# scikit-learn; on newer versions this call must be ported to fetch_openml.
mnist = fetch_mldata('MNIST original', data_home='./data')
y_all = mnist.target[:, np.newaxis]  # labels reshaped to a column vector
intercept = np.ones_like(y_all)      # constant column of ones, same shape as the labels
data = np.hstack([intercept, mnist.data, y_all])

# Seed NumPy's global RNG so the in-place row shuffle (and every later draw
# from np.random) is the same on each fresh run of the notebook.
np.random.seed(0)
np.random.shuffle(data)

In [4]:
def normalize_features(train, test, std_offset=0.1):
    """Standardize features column-wise using training-set statistics.

    Each column is shifted by the training mean and divided by the training
    standard deviation plus ``std_offset``. The very same shift and scale are
    then applied to ``test`` so both sets live in the same feature space.

    Because of the offset the result has zero mean on the training set but
    only *approximately* unit variance; a column that is constant in
    ``train`` is mapped to exactly 0 there instead of dividing by zero.

    Parameters
    ----------
    train, test : array-like supporting ``.mean(axis=0)`` / ``.std(axis=0)``
        Samples in rows, features in columns. Neither input is modified.
    std_offset : float, optional
        Constant added to every column's standard deviation before dividing.
        Defaults to 0.1, the value this notebook has always used.

    Returns
    -------
    (train_normalized, test_normalized)
    """
    train_mean = train.mean(axis=0)
    # The offset keeps the divisor strictly positive for zero-variance columns.
    train_scale = train.std(axis=0) + std_offset

    train_normalized = (train - train_mean) / train_scale
    test_normalized = (test - train_mean) / train_scale
    return train_normalized, test_normalized

In [5]:
# Hold out everything after the first `train_data_count` (already shuffled)
# rows for validation.
train_data_count = 60000
train_rows, valid_rows = data[:train_data_count], data[train_data_count:]

# The last column is the label (kept 2-D as a column, cast to int);
# every other column is a feature.
Xt, yt = train_rows[:, :-1], train_rows[:, -1:].astype(int)
Xv, yv = valid_rows[:, :-1], valid_rows[:, -1:].astype(int)

# Scale both sets with statistics computed on the training features only.
Xt, Xv = normalize_features(Xt, Xv)

In [9]:
# Learn the label set from the training targets, then encode those targets as
# one indicator column per class for the per-class sigmoid outputs.
binarizer = preprocessing.LabelBinarizer().fit(yt)
ytb = binarizer.transform(yt)

In [46]:
def floatX(X):
    """Return ``X`` as a NumPy array cast to Theano's configured float dtype."""
    target_dtype = theano.config.floatX
    return np.asarray(X, dtype=target_dtype)

def init_weights(shape, m=0.01):
    """Create a Theano shared variable of ``shape`` filled with small random values.

    Entries are drawn with ``np.random.rand`` and multiplied by ``m`` before
    being converted with ``floatX`` and wrapped in ``theano.shared``.
    """
    scaled_values = m * np.random.rand(*shape)
    return theano.shared(floatX(scaled_values))

In [99]:
# Symbolic inputs: a batch of feature rows and the matching indicator targets.
X = T.fmatrix()
Y = T.fmatrix()

# One weight column and one bias per output unit (10 digit classes).
w = init_weights((Xt.shape[1], 10))
b = init_weights((10,), m=0.01)
params = [w, b]

# Per-class sigmoid activations. The library op replaces the hand-written
# 1 / (1 + exp(-z)); it computes the same function without spelling out the
# overflow-prone exp explicitly.
hwx = T.nnet.sigmoid(T.dot(X, w) + b)

# Mean binary cross-entropy over every (sample, class) entry:
#   -mean(Y * log(hwx) + (1 - Y) * log(1 - hwx))
cost = T.mean(T.nnet.binary_crossentropy(hwx, Y))

# Plain full-batch gradient descent with a single named step size.
LEARNING_RATE = 0.05
grad_w, grad_b = T.grad(cost=cost, wrt=params)
updates = [(w, w - LEARNING_RATE * grad_w), (b, b - LEARNING_RATE * grad_b)]

# `predict` returns the per-class activations; `train` returns the cost and
# applies one gradient step to w and b as a side effect.
predict = theano.function(inputs=[X], outputs=hwx, allow_input_downcast=True)
train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)


In [100]:

# Run 100 full-batch updates, printing the cost after each one and the
# training accuracy on every 10th iteration (0, 10, 20, ...).
for i in range(100):
    cost_value = train(Xt, ytb)
    print("  ", cost_value)
    if i % 10 != 0:
        continue
    # Map the per-class activations back to digit labels before scoring.
    ytp = binarizer.inverse_transform(predict(Xt))
    print(i, 'Train acc', accuracy_score(yt, ytp))

   0.7312605381011963
0 Train acc 0.217816666667
   0.7238372564315796
   0.7168281078338623
   0.7102088332176208
   0.7039557695388794
   0.6980456709861755
   0.6924563646316528
   0.6871663331985474
   0.682155191898346
   0.6774033904075623
   0.6728926301002502
10 Train acc 0.727533333333
   0.6686056852340698
   0.6645264029502869
   0.660639762878418
   0.6569317579269409
   0.6533893942832947
   0.6500007510185242
   0.6467546820640564
   0.6436408758163452
   0.6406500339508057
   0.6377733945846558
20 Train acc 0.76265
   0.6350029706954956
   0.6323314309120178
   0.6297519207000732
   0.6272584199905396
   0.6248450875282288
   0.6225066781044006
   0.6202384233474731
   0.6180357336997986
   0.6158946752548218
   0.6138113737106323
30 Train acc 0.77825
   0.611782431602478
   0.6098045110702515
   0.6078747510910034
   0.605990469455719
   0.6041489839553833
   0.6023480892181396
   0.6005855202674866
   0.5988593101501465
   0.5971676111221313
   0.5955086350440979
40 Tr

In [101]:
# Decode the model outputs for both splits back to digit labels, then report
# accuracy on the training set and on the held-out validation set.
ytp, yvp = (
    binarizer.inverse_transform(predict(features)) for features in (Xt, Xv)
)
print('Train acc', accuracy_score(yt, ytp))
print('Valid acc', accuracy_score(yv, yvp))

Train acc 0.816416666667
Valid acc 0.813
