In [2]:
from __future__ import division
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import check_X_y
from sklearn.utils import check_random_state
import theano
import theano.tensor as tt
from theano import shared
from time import time
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 

In [3]:
from sklearn.datasets import fetch_mldata
from sklearn.utils import check_array
from sklearn.cross_validation import train_test_split


def load_data(dtype=np.float32, order='C', n_train=60000):
    """Load MNIST and split it into leading-train / trailing-test partitions.

    The dataset is obtained with ``fetch_mldata('MNIST original')``, the
    feature matrix is validated/converted with ``check_array`` and divided
    by 255, then the first ``n_train`` rows become the training set and the
    remaining rows the test set. No shuffling happens here, and this
    function itself neither caches nor memory-maps the split.

    Parameters
    ----------
    dtype : numpy dtype, default np.float32
        dtype forwarded to ``check_array`` for the feature matrix.
    order : str, default 'C'
        Memory layout forwarded to ``check_array``.
    n_train : int, default 60000
        Number of leading samples assigned to the training partition.

    Returns
    -------
    X_train, X_test, y_train, y_test : arrays
        Features and targets of the two partitions, in that order.
    """
    print("Loading dataset...")
    data = fetch_mldata('MNIST original')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]
    # Scale the features to [0, 1]. A float divisor keeps this a true
    # division even when ``from __future__ import division`` is not active.
    X = X / 255.
    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]
    return X_train, X_test, y_train, y_test

# Development data (split further below) and the held-out test data.
X_dev, X_test, y_dev, y_test = load_data()

# Carve a 10,000-sample training subset out of the development data so that
# experiments run faster; the other part of the split is kept for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_dev,
    y_dev,
    train_size=10000,
    random_state=0,
)

Loading dataset...
Creating train-test split...


## Various neural network utilities

Mostly taken from Alec Radford and Kyle Kastner's gists:

- https://gist.github.com/Newmu/a56d5446416f5ad2bbac
- https://gist.github.com/kastnerkyle/f3f67424adda343fef40

In [7]:
def sharedX(X, dtype=theano.config.floatX, name=None):
    """Wrap ``X`` in a Theano shared variable.

    ``X`` is first converted with ``np.asarray`` to ``dtype`` (the default is
    ``theano.config.floatX`` as read when this function is defined), and
    ``name`` is forwarded to ``theano.shared``.
    """
    values = np.asarray(X, dtype=dtype)
    shared_variable = theano.shared(values, name=name)
    return shared_variable


def softmax(x):
    """Apply Theano's softmax op (``tt.nnet.softmax``) to ``x``."""
    normalized = tt.nnet.softmax(x)
    return normalized


def relu(x):
    """Rectified linear unit, computed as ``(x + |x|) / 2``.

    Positive entries are returned unchanged (as floats) and negative
    entries become zero.
    """
    magnitude = abs(x)
    doubled_positive_part = x + magnitude
    return doubled_positive_part / 2.0


def tanh(x):
    """Hyperbolic tangent activation, delegated to ``tt.tanh``."""
    squashed = tt.tanh(x)
    return squashed


def sigmoid(x):
    """Logistic sigmoid activation, delegated to ``tt.nnet.sigmoid``."""
    squashed = tt.nnet.sigmoid(x)
    return squashed


def linear(x):
    """Identity activation: hand ``x`` back untouched."""
    unchanged = x
    return unchanged


def iter_data(*data, **kwargs):
    """Yield consecutive mini-batches sliced from the given sequences.

    Every positional argument is sliced with the same ``[start:stop]``
    window; the number of batches is derived from the length of the first
    argument, and the last batch may be shorter than the others.

    Parameters
    ----------
    *data : sliceable sequences
        One or more sequences to iterate over in lock-step.
    batch_size : int, keyword only, default 128
        Number of items per batch.

    Yields
    ------
    A single slice when exactly one sequence was given, otherwise a tuple
    holding one slice per sequence.
    """
    batch_size = kwargs.get('batch_size', 128)
    n_full, leftover = divmod(len(data[0]), batch_size)
    # A trailing partial batch counts as one extra batch.
    n_batches = n_full + 1 if leftover != 0 else n_full
    single_sequence = len(data) == 1
    for start in range(0, n_batches * batch_size, batch_size):
        stop = start + batch_size
        if single_sequence:
            yield data[0][start:stop]
        else:
            yield tuple(seq[start:stop] for seq in data)
            

class Adam(object):
    """Adam-style optimizer: no bias init correction and no b1 decay.

    Parameters
    ----------
    learning_rate : float
        Step size multiplying the normalised gradient.
    grad_momentum : float
        Weight ``b1`` given to the current gradient in the running mean
        ``m``; the previous mean is weighted by ``1 - b1``.
    sq_grad_momentum : float
        Weight ``b2`` given to the current squared gradient in the running
        mean ``v``; the previous mean is weighted by ``1 - b2``.
    eps : float
        Constant added to ``sqrt(v)`` in the denominator of the step.
    """

    def __init__(self, learning_rate=0.0001, grad_momentum=0.1,
                 sq_grad_momentum=0.001, eps=1e-8):
        self.learning_rate, self.eps = learning_rate, eps
        self.grad_momentum = grad_momentum
        self.sq_grad_momentum = sq_grad_momentum

    def make_updates(self, params, cost):
        """Build the Theano update pairs that minimise ``cost``.

        For every shared variable in ``params`` two zero-initialised shared
        accumulators are created (running gradient mean and running squared
        gradient mean). The returned list holds, per parameter and in this
        order, the ``(variable, new_value)`` pairs for the first
        accumulator, the second accumulator and the parameter itself.
        """
        grad_weight = self.grad_momentum
        sq_grad_weight = self.sq_grad_momentum
        gradients = tt.grad(cost, params)

        update_pairs = []
        for param, grad in zip(params, gradients):
            # Zero accumulators with the same shape/dtype as the parameter.
            mean = shared(param.get_value() * 0.)
            sq_mean = shared(param.get_value() * 0.)

            new_mean = (grad_weight * grad) + ((1. - grad_weight) * mean)
            new_sq_mean = ((sq_grad_weight * tt.sqr(grad))
                           + ((1. - sq_grad_weight) * sq_mean))
            step = new_mean / (tt.sqrt(new_sq_mean) + self.eps)
            new_param = param - (self.learning_rate * step)

            update_pairs.extend([
                (mean, new_mean),
                (sq_mean, new_sq_mean),
                (param, new_param),
            ])
        return update_pairs

In [17]:
class EarlyStoppingMonitor(object):
    """Patience-based early-stopping callback.

    ``monitor`` is meant to be called once per epoch. It prints the training
    cost (and the validation cost when validation data was supplied) and
    tells the caller whether training should go on.

    Parameters
    ----------
    X_val, y_val : array-like or None
        Optional validation data. When ``X_val`` is given, both are validated
        with ``check_X_y`` and the validation cost drives the stopping
        decision; otherwise the training cost is used.
    patience : int, default 3
        Number of consecutive non-improving calls tolerated before
        ``monitor`` returns False.
    data_log_filename : str or None
        Stored on the instance; not read anywhere in this class.
    """

    def __init__(self, X_val=None, y_val=None, patience=3, data_log_filename=None):
        if X_val is not None:
            X_val, y_val = check_X_y(X_val, y_val, dtype=theano.config.floatX)
        self.X_val = X_val
        self.y_val = y_val
        self.data_log_filename = data_log_filename

        self.best_cost = np.inf
        self.patience = patience
        self.patience_reset = patience

    def monitor(self, model, context):
        """Report the current costs and decide whether to keep training.

        Parameters
        ----------
        model : object
            Must expose ``_compute_cost(X, Y)`` and, when validation targets
            were supplied, a fitted ``label_binarizer_``.
        context : dict
            Read for the keys ``'epoch'``, ``'X'`` and ``'Y'`` (current epoch
            index, training inputs and binarized training targets).

        Returns
        -------
        bool
            True while there is patience left, False once it is exhausted.
        """
        epoch = context.get('epoch')
        X_train = context.get('X')
        Y_train = context.get('Y')

        X_val = self.X_val
        # The binarized validation targets are computed on first use and
        # cached on the instance as ``Y_val``.
        Y_val = getattr(self, 'Y_val', None)
        if Y_val is None and self.y_val is not None:
            Y_val = model.label_binarizer_.transform(self.y_val)
            # Cast like MLPClassifier.fit does for the training targets: the
            # binarizer emits integers, which the compiled cost function
            # refuses when floatX is float32.
            Y_val = Y_val.astype(theano.config.floatX)
            self.Y_val = Y_val

        training_cost = model._compute_cost(X_train, Y_train)
        if X_val is not None:
            validation_cost = model._compute_cost(X_val, Y_val)
            current_cost = validation_cost
        else:
            validation_cost = np.nan
            current_cost = training_cost

        print("Epoch #%03d: training cost: %0.5f, validation cost: %0.5f" % (
            epoch, training_cost, validation_cost))

        # Patience-based stopping condition: any improvement over the best
        # cost seen so far restores the full patience budget.
        if current_cost < self.best_cost:
            self.patience = self.patience_reset
            self.best_cost = current_cost
        else:
            self.patience -= 1
        return self.patience > 0


class MLPClassifier(BaseEstimator, ClassifierMixin):
    """Feed-forward neural network classifier with a scikit-learn interface.

    The network is a stack of fully connected layers with a softmax output
    layer, trained by mini-batch gradient updates compiled with Theano.

    Parameters
    ----------
    hidden : sequence of int, default (100, 100)
        Number of units of each hidden layer.
    activation : {'relu', 'tanh', 'sigmoid', 'linear'}, default 'relu'
        Activation applied after every hidden layer.
    init_gain : float or 'auto', default 'auto'
        Gain used when drawing the initial weights; 'auto' means
        ``sqrt(2)`` for 'relu' and 1 for the other activations.
    batch_size : int, default 128
        Number of samples per gradient update.
    optimizer : object or None, default None
        Object exposing ``make_updates(params, cost)``; ``Adam()`` is used
        when None.
    n_epochs : int, default 1000
        Maximum number of passes over the training data.
    warm_start : bool, default False
        When True and the model already has weights, ``fit`` continues from
        them instead of re-initialising.
    random_state : int, RandomState instance or None
        Passed to ``check_random_state`` for the weight initialisation.
    """

    def __init__(self, hidden=(100, 100), activation='relu', init_gain='auto',
                 batch_size=128, optimizer=None, n_epochs=1000,
                 warm_start=False, random_state=None):
        self.hidden = hidden
        self.activation = activation
        self.batch_size = batch_size
        self.optimizer = optimizer
        self.n_epochs = n_epochs
        self.init_gain = init_gain
        self.warm_start = warm_start
        self.random_state = random_state

    def _init_parameters(self, n_features, n_outputs):
        """Create the shared weight matrix and bias vector of every layer."""
        rng = check_random_state(self.random_state)
        # Accept any sequence (e.g. a list) for ``hidden``; the tuple
        # concatenations below would fail on a list otherwise.
        hidden = tuple(self.hidden)
        input_dims = (n_features,) + hidden
        output_dims = hidden + (n_outputs,)

        if self.init_gain == 'auto':
            g = np.sqrt(2) if self.activation == 'relu' else 1.
        else:
            g = self.init_gain

        self.weights_ = []
        self.biases_ = []
        for in_dim, out_dim in zip(input_dims, output_dims):
            # NOTE(review): Glorot-style schemes usually take
            # g * sqrt(2 / (in_dim + out_dim)); there is no square root here.
            # Left unchanged because existing ``init_gain`` values are tuned
            # to this formula -- confirm before altering.
            std = 2. * g / (in_dim + out_dim)
            W = sharedX(rng.normal(size=(in_dim, out_dim), scale=std))
            self.weights_.append(W)
            b = sharedX(np.zeros(out_dim))
            self.biases_.append(b)

    def _make_functions(self):
        """Build the symbolic graph and compile the Theano functions.

        Sets ``_fit`` (one optimisation step on a batch, returns the cost),
        ``_compute_cost`` (cost without any update) and ``_forward``
        (network output for a batch of inputs).

        Raises
        ------
        ValueError
            If ``activation`` is not one of the supported names.
        """
        x = tt.matrix()
        y = tt.matrix()

        # Define the computation graph of the model
        if self.activation == 'relu':
            sigma = relu
        elif self.activation == 'tanh':
            sigma = tanh
        elif self.activation == 'sigmoid':
            sigma = sigmoid
        elif self.activation == 'linear':
            sigma = linear
        else:
            raise ValueError('Unsupported activation: %s' % self.activation)

        # Hidden layers share the chosen activation; the last layer is softmax.
        activations = [sigma] * (len(self.weights_) - 1) + [softmax]
        output = x
        for w, b, s in zip(self.weights_, self.biases_, activations):
            output = s(tt.dot(output, w) + b)

        # Mean element-wise binary cross-entropy between the softmax output
        # and the binarized targets.
        cost = tt.nnet.binary_crossentropy(output, y).mean()

        # Use the optimizer to compute the parameter updates based
        # on the gradient of the cost function
        opt = self.optimizer
        if opt is None:
            opt = Adam()
        parameters = list(self.weights_) + list(self.biases_)
        fit_updates = opt.make_updates(parameters, cost)

        # Compile the functions themselves
        f = theano.function
        self._fit = f([x, y], cost, updates=fit_updates, name='_fit')
        self._compute_cost = f([x, y], cost, name="_compute_cost")
        self._forward = f([x], output, name="_forward")

    def fit(self, X, y, monitor=None):
        """Train the network on ``X`` / ``y``.

        Parameters
        ----------
        X : array-like
            Training inputs, validated with ``check_X_y``.
        y : array-like
            Class labels; binarized with a ``LabelBinarizer`` stored as
            ``label_binarizer_``.
        monitor : object or None
            Optional callback whose ``monitor(model, context)`` method is
            called at the start of every epoch; a falsy return value stops
            training.

        Returns
        -------
        self

        Notes
        -----
        ``training_costs_`` holds the cost on the full training set before
        the first update, followed by the cost of every mini-batch update.
        NOTE(review): with exactly two classes ``LabelBinarizer`` produces a
        single column, which a softmax output layer cannot learn from --
        confirm this class is only used for multi-class problems.
        """
        X, y = check_X_y(X, y, dtype=theano.config.floatX)
        self.label_binarizer_ = lb = LabelBinarizer()
        Y = lb.fit_transform(y).astype(theano.config.floatX)
        n_samples, n_features = X.shape
        _, n_outputs = Y.shape
        if not self.warm_start or not hasattr(self, 'weights_'):
            self._init_parameters(n_features, n_outputs)

        self._make_functions()
        self.training_costs_ = [self._compute_cost(X, Y)]
        for epoch in range(self.n_epochs):
            # The monitor receives this method's local namespace as context;
            # EarlyStoppingMonitor reads 'epoch', 'X' and 'Y' from it, so
            # those local names must not be renamed.
            if monitor is not None and not monitor.monitor(self, locals()):
                break
            # Honour the ``batch_size`` hyper-parameter instead of silently
            # falling back to iter_data's default.
            for X_batch, Y_batch in iter_data(X, Y, batch_size=self.batch_size):
                cost = self._fit(X_batch, Y_batch)
                self.training_costs_.append(cost)
        return self

    def predict_proba(self, X):
        """Return the softmax output of the network for every row of ``X``."""
        X = check_array(X, dtype=theano.config.floatX)
        return self._forward(X)

    def predict(self, X):
        """Return, for every row of ``X``, the class with the largest output."""
        indices = self.predict_proba(X).argmax(axis=1)
        return self.label_binarizer_.classes_[indices]

In [18]:
# Two hidden layers of 100 units each, trained on the small training subset;
# the early-stopping monitor tracks the cost on the validation split.
mlp = MLPClassifier(
    hidden=(100, 100),
    batch_size=128,
    init_gain=20.,
    random_state=0,
)
mlp.fit(
    X_train,
    y_train,
    monitor=EarlyStoppingMonitor(X_val, y_val),
)

Epoch #000: training cost: 0.41119, validation cost: 0.41400
Epoch #001: training cost: 0.06485, validation cost: 0.07053
Epoch #002: training cost: 0.04739, validation cost: 0.05612
Epoch #003: training cost: 0.03956, validation cost: 0.05027
Epoch #004: training cost: 0.03457, validation cost: 0.04674
Epoch #005: training cost: 0.03094, validation cost: 0.04433
Epoch #006: training cost: 0.02814, validation cost: 0.04256
Epoch #007: training cost: 0.02586, validation cost: 0.04111
Epoch #008: training cost: 0.02397, validation cost: 0.04000
Epoch #009: training cost: 0.02230, validation cost: 0.03904
Epoch #010: training cost: 0.02084, validation cost: 0.03819
Epoch #011: training cost: 0.01954, validation cost: 0.03747
Epoch #012: training cost: 0.01838, validation cost: 0.03685
Epoch #013: training cost: 0.01732, validation cost: 0.03627
Epoch #014: training cost: 0.01635, validation cost: 0.03578
Epoch #015: training cost: 0.01546, validation cost: 0.03530
Epoch #016: training cos

MLPClassifier(activation='relu', batch_size=128, hidden=(100, 100),
       init_gain=20.0, n_epochs=1000, optimizer=None, random_state=0,
       warm_start=False)

In [19]:
# Evaluate the trained network on the held-out test split returned by
# load_data (``score`` is inherited from ClassifierMixin, which reports
# mean accuracy).
mlp.score(X_test, y_test)

0.95269999999999999