In [1]:
from __future__ import division
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import check_X_y
from sklearn.utils import check_random_state
from sklearn.utils import shuffle
import theano
import theano.tensor as tt
from theano import shared
from time import time
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 
floatX =theano.config.floatX

In [2]:
from sklearn.datasets import fetch_mldata
from sklearn.utils import check_array
from sklearn.cross_validation import train_test_split


def load_data(dtype=np.float32, order='C', n_train=60000):
    """Fetch MNIST, scale the pixels and split positionally into train/test.

    The data is fetched through ``fetch_mldata('MNIST original')``. No
    shuffling, caching or memory-mapping is performed by this function: the
    split simply takes the first ``n_train`` rows for training and the
    remaining rows for testing.

    Parameters
    ----------
    dtype : numpy dtype, default np.float32
        dtype forwarded to ``check_array`` for the feature matrix.
    order : str, default 'C'
        Memory layout forwarded to ``check_array``.
    n_train : int, default 60000
        Number of leading rows assigned to the training part. The default
        reproduces the original hard-coded split (as [Joachims, 2006]).

    Returns
    -------
    X_train, X_test, y_train, y_test : arrays
        Features divided by 255 and the matching targets.
    """
    print("Loading dataset...")
    data = fetch_mldata('MNIST original')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]
    # Scale the features to [0, 1] (assumes raw pixel values in [0, 255])
    X = X / 255
    # Create the positional train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]
    return X_train, X_test, y_train, y_test

X_dev, X_test, y_dev, y_test = load_data()

# Hold out 1000 samples of the development set for validation; the remaining
# development samples are used for training. random_state=0 fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    X_dev, y_dev, test_size=int(1e3), random_state=0)

Loading dataset...
Creating train-test split...


## Various neural network utilities

Mostly taken from Alec Radford and Kyle Kastner's gists:

- https://gist.github.com/Newmu/a56d5446416f5ad2bbac
- https://gist.github.com/kastnerkyle/f3f67424adda343fef40

In [3]:
def sharedX(X, dtype=floatX, name=None):
    """Convert ``X`` to a numpy array of ``dtype`` and wrap it in a Theano
    shared variable carrying the optional ``name``.

    ``dtype`` defaults to the module-level ``floatX`` (``theano.config.floatX``).
    """
    return theano.shared(np.asarray(X, dtype=dtype), name=name)


def softmax(x):
    """Return ``tt.nnet.softmax`` applied to ``x``."""
    return tt.nnet.softmax(x)


def relu(x):
    """Rectified linear unit, computed as ``(x + |x|) / 2``.

    Only ``+``, ``abs`` and ``/`` are used, so the function works on anything
    that supports those operators.
    """
    twice_positive_part = x + abs(x)
    return twice_positive_part / 2.0


def tanh(x):
    """Return ``tt.tanh`` applied to ``x``."""
    return tt.tanh(x)


def sigmoid(x):
    """Return ``tt.nnet.sigmoid`` applied to ``x``."""
    return tt.nnet.sigmoid(x)


def linear(x):
    """Identity activation: return ``x`` unchanged."""
    return x


def iter_data(*data, **kwargs):
    """Yield consecutive mini-batches sliced from one or more sequences.

    The number of batches is derived from the length of the first positional
    argument; the last batch may be shorter than the others. The batch size
    is read from the ``batch_size`` keyword (128 when omitted).

    With a single sequence each item is a slice of it; with several
    sequences each item is a tuple holding the same slice of every sequence.
    """
    size = kwargs.get('batch_size', 128)
    # Ceil-division: a trailing partial batch counts as one more batch.
    n_batches = -(-len(data[0]) // size)
    only_one = len(data) == 1
    for index in range(n_batches):
        window = slice(index * size, (index + 1) * size)
        if only_one:
            yield data[0][window]
        else:
            yield tuple([seq[window] for seq in data])

## Optimizers 

In [4]:
class Adam(object):
    """Adam-style optimizer (no bias init correction and no b1 decay).

    ``grad_momentum`` and ``sq_grad_momentum`` are the weights given to the
    *current* gradient and squared gradient; the running averages are
    weighted by one minus these values.
    """

    def __init__(self, learning_rate=0.001, grad_momentum=0.1,
                 sq_grad_momentum=0.001, eps=1e-8):
        self.eps = eps
        self.sq_grad_momentum = sq_grad_momentum
        self.grad_momentum = grad_momentum
        self.learning_rate = learning_rate

    def make_updates(self, params, cost):
        """Build the Theano update pairs minimising ``cost`` w.r.t. ``params``.

        For every parameter two zero-initialised shared accumulators are
        created (first and second moment). The returned list contains, per
        parameter and in this order, the update of the first moment, of the
        second moment and of the parameter itself.
        """
        mix_first = self.grad_momentum
        mix_second = self.sq_grad_momentum
        updates = []
        gradients = tt.grad(cost, params)
        for param, grad in zip(params, gradients):
            # Accumulators start at zero with the same shape as the parameter
            first = shared(param.get_value() * 0.)
            second = shared(param.get_value() * 0.)
            first_next = (mix_first * grad) + ((1. - mix_first) * first)
            second_next = ((mix_second * tt.sqr(grad))
                           + ((1. - mix_second) * second))
            direction = first_next / (tt.sqrt(second_next) + self.eps)
            param_next = param - self.learning_rate * direction
            updates.extend([
                (first, first_next),
                (second, second_next),
                (param, param_next),
            ])
        return updates
    
    
class MomentumSGD(object):
    """Mini-batch SGD with classical momentum and step-wise rate decay.

    The learning rate lives in a Theano shared variable created by
    ``make_updates``. Each time ``on_stalled_convergence`` is called it is
    multiplied by ``learning_rate_scale``, at most ``max_adaptation_count``
    times.

    Parameters
    ----------
    learning_rate_init : float
        Initial learning rate.
    max_adaptation_count : int
        Maximum number of learning rate rescalings.
    learning_rate_scale : float
        Multiplicative factor applied at each rescaling.
    momentum : float
        Momentum coefficient; values <= 0 select plain SGD.
    """

    monitor = None

    def __init__(self, learning_rate_init=0.1, max_adaptation_count=3,
                 learning_rate_scale=0.2, momentum=0.9):
        self.learning_rate_init = learning_rate_init
        self.max_adaptation_count = max_adaptation_count
        self.learning_rate_scale = learning_rate_scale
        self.momentum = momentum

    def on_stalled_convergence(self):
        """React to a stalled monitored cost by shrinking the learning rate.

        Must be called after ``make_updates``, which creates the
        ``learning_rate`` shared variable and resets ``adaptation_count``.

        Returns
        -------
        bool
            True when the learning rate was rescaled, False when the
            adaptation budget is exhausted (the optimizer considers the
            model converged).
        """
        if self.adaptation_count >= self.max_adaptation_count:
            # Optimizer thinks model has converged
            return False

        # Rescale the learning rate
        old_lr = np.asarray(self.learning_rate.get_value())
        # Cast back to the stored dtype: multiplying by a Python float may
        # upcast the value, and a shared variable can refuse a wider dtype.
        new_lr = np.asarray(old_lr * self.learning_rate_scale,
                            dtype=old_lr.dtype)
        self.learning_rate.set_value(new_lr)
        self.adaptation_count += 1
        print("Updated learning rate from %f to %f"
              % (float(old_lr), float(new_lr)))
        return True

    def make_updates(self, params, cost):
        """Build the Theano update pairs minimising ``cost`` w.r.t. ``params``.

        Resets the adaptation counter and (re)creates the shared learning
        rate from ``learning_rate_init``. With positive momentum a
        zero-initialised velocity is kept per parameter and updated before
        the parameter itself; otherwise a plain gradient step is used.
        """
        self.adaptation_count = 0
        self.learning_rate = sharedX(self.learning_rate_init)

        updates = []
        grads = tt.grad(cost, params)
        for p, g in zip(params, grads):
            if self.momentum > 0:
                v = shared(p.get_value() * 0.)
                v_t = self.momentum * v - self.learning_rate * g
                p_t = p + v_t
                updates.append((v, v_t))
            else:
                # traditional SGD
                p_t = p - self.learning_rate * g
            updates.append((p, p_t))
        return updates

    
class NesterovMomentumSGD(MomentumSGD):
    """Variant of ``MomentumSGD`` using a Nesterov-style look-ahead step.

    The constructor and the learning rate adaptation are inherited; only the
    construction of the update rules differs.
    """

    def make_updates(self, params, cost):
        """Build the Theano update pairs minimising ``cost`` w.r.t. ``params``.

        Resets the adaptation counter and (re)creates the shared learning
        rate, then returns, per parameter, the velocity update followed by
        the parameter update.
        """
        self.adaptation_count = 0
        self.learning_rate = sharedX(self.learning_rate_init)

        updates = []
        gradients = tt.grad(cost, params)
        for param, grad in zip(params, gradients):
            # NOTE(review): gradients are clipped element-wise to
            # [-0.001, 0.001]; the bound is hard-coded - confirm it is intended.
            clipped = tt.clip(grad, -.001, .001)
            velocity = shared(param.get_value() * 0.)
            velocity_next = (self.momentum * velocity
                             - self.learning_rate * clipped)
            # Look-ahead: momentum applied to the *updated* velocity, plus
            # one more plain gradient step.
            param_next = (param + self.momentum * velocity_next
                          - self.learning_rate * clipped)
            updates.extend([(velocity, velocity_next), (param, param_next)])
        return updates

## Orchestration

In [5]:
class EarlyStoppingMonitor(object):
    """Report training/validation metrics and decide when to stop training.

    The monitor keeps (a subsample of) the training data and, optionally, of
    a validation set. Each call to ``monitor`` evaluates the model on them,
    prints one summary line and updates a patience counter based on the
    validation cost (or the training cost when no validation set is given).

    Parameters
    ----------
    X_train, y_train : arrays
        Training data; targets are used to fit the label binarizer.
    X_val, y_val : arrays or None
        Optional validation data.
    patience : int
        Number of non-improving evaluations tolerated before stopping.
    data_log_filename : str or None
        Stored on the instance; not used by the code in this class.
    subsample : int or None
        Maximum number of samples kept from each set (None keeps all).
    random_state : int, RandomState or None
        Forwarded to ``sklearn.utils.shuffle`` when subsampling.
    """

    def __init__(self, X_train, y_train, X_val=None, y_val=None, patience=3,
                 data_log_filename=None, subsample=int(5e3), random_state=None):

        X_train, y_train = check_X_y(X_train, y_train, dtype=floatX)
        self.label_binarizer_ = lb = LabelBinarizer().fit(y_train)

        if subsample is not None and subsample < X_train.shape[0]:
            X_train, y_train = shuffle(X_train, y_train, random_state=random_state)
            X_train = X_train[:subsample]
            y_train = y_train[:subsample]
        self.X_train = X_train
        self.y_train = y_train
        # Cast the binarized targets to floatX, as MLPClassifier.fit does, so
        # that they match the floatX matrix inputs of the compiled functions.
        self.Y_train = lb.transform(y_train).astype(floatX)

        self.Y_val = None
        if X_val is not None:
            X_val, y_val = check_X_y(X_val, y_val, dtype=floatX)
            if subsample is not None and subsample < X_val.shape[0]:
                X_val, y_val = shuffle(X_val, y_val,
                                       random_state=random_state)
                X_val = X_val[:subsample]
                y_val = y_val[:subsample]
            self.Y_val = lb.transform(y_val).astype(floatX)
        self.X_val = X_val
        self.y_val = y_val

        self.data_log_filename = data_log_filename
        self.patience_reset = patience
        self.reset()

    def reset(self):
        """Forget the best cost seen so far and restore full patience."""
        self.best_cost = np.inf
        self.patience = self.patience_reset

    def accuracy(self, y_true, output):
        """Fraction of rows of ``output`` whose argmax class equals ``y_true``."""
        indices = output.argmax(axis=1)
        return np.mean(self.label_binarizer_.classes_[indices] == y_true)

    def monitor(self, model, context):
        """Evaluate ``model``, print a summary and update the patience.

        Parameters
        ----------
        model : object
            Must expose ``_forward_and_cost(X, Y)`` returning
            ``(output, cost)``.
        context : dict
            Read for the keys ``'epoch'`` and ``'optimizer'``.

        Returns
        -------
        bool
            True while training should continue, False once patience has
            run out.
        """
        epoch = context.get('epoch')
        optimizer = context.get('optimizer')

        training_out, training_cost = model._forward_and_cost(
            self.X_train, self.Y_train)
        training_acc = self.accuracy(self.y_train, training_out)
        if self.X_val is not None:
            validation_out, validation_cost = model._forward_and_cost(
                self.X_val, self.Y_val)
            validation_acc = self.accuracy(self.y_val, validation_out)
            current_cost = validation_cost
        else:
            validation_cost = None
            validation_acc = None
            current_cost = training_cost

        msg = "Epoch #%03d, training cost: %0.6f, acc: %0.4f" % (
            epoch, training_cost, training_acc)

        # Test the cost, which is always bound, rather than the validation
        # output, which only exists when a validation set was provided.
        if validation_cost is not None:
            msg += ", validation cost: %0.6f, acc: %0.4f" % (
                validation_cost, validation_acc)
        print(msg)

        # Patience-based stopping condition
        if current_cost < self.best_cost - 1e-6:
            self.patience = self.patience_reset
            self.best_cost = current_cost
        else:
            # Give the optimizer, if it supports it, a chance to react to the
            # stalled convergence (e.g. by lowering its learning rate).
            notify = getattr(optimizer, 'on_stalled_convergence', None)
            adapted = notify() if notify is not None else False
            if not adapted:
                # Either the optimizer cannot adapt, or it has already used
                # up its opportunities to do so: start losing patience.
                self.patience -= 1
        return self.patience > 0

# Monitor shared by the experiments below: it evaluates on the training and
# validation splits (each subsampled to the default ``subsample`` size when
# larger); random_state=0 fixes the subsampling shuffle.
monitor = EarlyStoppingMonitor(X_train, y_train, X_val=X_val, y_val=y_val,
                               random_state=0)

In [6]:
class MLPClassifier(BaseEstimator, ClassifierMixin):
    """Feed-forward neural network classifier built with Theano.

    The network stacks ``len(hidden) + 1`` affine layers. Hidden layers use
    ``activation``; the output layer uses a softmax. Training minimises the
    mean binary cross-entropy between the network output and the binarized
    targets, plus an optional L2 penalty on the weight matrices.

    Parameters
    ----------
    hidden : sequence of int
        Number of units of each hidden layer.
    activation : {'relu', 'tanh', 'linear'}
        Activation of the hidden layers.
    init_gain : 'auto' or float
        Gain used when initialising the weights; 'auto' selects ``sqrt(2)``
        for 'relu' and 1 otherwise.
    batch_size : int
        Number of samples per mini-batch during ``fit``.
    optimizer : object or None
        Object exposing ``make_updates(params, cost)``; ``Adam()`` is used
        when None.
    n_epochs : int
        Maximum number of passes over the training data.
    weight_decay : float
        Coefficient of the L2 penalty (0 disables it).
    warm_start : bool
        When True and the model was fitted before, reuse the existing
        parameters instead of re-initialising them.
    random_state : int, RandomState or None
        Seed or generator for the weight initialisation.
    """

    def __init__(self, hidden=(100, 100), activation='relu', init_gain='auto',
                 batch_size=128, optimizer=None, n_epochs=1000,
                 weight_decay=1e-5, warm_start=False, random_state=None):
        # NOTE: the default weight decay used to be written ``int(1e-5)``,
        # which evaluates to 0 and silently disabled the penalty.
        self.hidden = hidden
        self.activation = activation
        self.batch_size = batch_size
        self.optimizer = optimizer
        self.n_epochs = n_epochs
        self.weight_decay = weight_decay
        self.init_gain = init_gain
        self.warm_start = warm_start
        self.random_state = random_state

    def _init_parameters(self, n_features, n_outputs):
        """Create the shared weight matrices and bias vectors of each layer.

        Weights are drawn from a zero-mean normal distribution; biases start
        at zero. The results are stored in ``weights_`` and ``biases_``.
        """
        rng = check_random_state(self.random_state)
        # Accept any sequence (e.g. a list) for ``hidden``
        hidden = tuple(self.hidden)
        input_dims = (n_features,) + hidden
        output_dims = hidden + (n_outputs,)

        if self.init_gain == 'auto':
            g = np.sqrt(2) if self.activation == 'relu' else 1.
        else:
            g = self.init_gain

        self.weights_ = []
        self.biases_ = []
        for in_dim, out_dim in zip(input_dims, output_dims):
            # NOTE(review): this is 2 * g / (fan_in + fan_out), not the more
            # common g * sqrt(2 / (fan_in + fan_out)) - confirm it is intended.
            std = 2 * g / (in_dim + out_dim)
            W = sharedX(rng.normal(size=(in_dim, out_dim), scale=std))
            self.weights_.append(W)
            b = sharedX(np.zeros(out_dim))
            self.biases_.append(b)

    def _make_functions(self):
        """Build the symbolic graph and compile the Theano functions.

        Compiles ``_fit`` (one optimisation step, returns the cost),
        ``_forward_and_cost`` and ``_forward``.

        Returns
        -------
        The optimizer instance actually used (``Adam()`` when
        ``self.optimizer`` is None).

        Raises
        ------
        ValueError
            If ``self.activation`` is not a supported name.
        """
        x = tt.matrix()
        y = tt.matrix()

        # Define the computation graph of the model
        supported = {'relu': relu, 'tanh': tanh, 'linear': linear}
        if self.activation not in supported:
            raise ValueError('Unsupported activation: %s' % self.activation)
        sigma = supported[self.activation]

        # Hidden layers share the same activation, the output is a softmax
        activations = [sigma] * (len(self.weights_) - 1) + [softmax]
        tmp = x
        for w, b, s in zip(self.weights_, self.biases_, activations):
            tmp = s(tt.dot(tmp, w) + b)

        output = tmp
        cost = tt.nnet.binary_crossentropy(output, y).mean()
        if self.weight_decay > 0:
            for w in self.weights_:
                cost += self.weight_decay * tt.sqr(w).sum()

        # Use the optimizer to compute the parameter updates based
        # on the gradient of the cost function
        opt = self.optimizer
        if opt is None:
            opt = Adam()

        parameters = []
        parameters += self.weights_
        parameters += self.biases_
        fit_updates = opt.make_updates(parameters, cost)

        # Compile the functions themselves
        f = theano.function
        self._fit = f([x, y], cost, updates=fit_updates, name='_fit')
        self._forward_and_cost = f([x, y], (output, cost), name="_forward_and_cost")
        self._forward = f([x], output, name="_forward")
        return opt

    def fit(self, X, y, monitor=None):
        """Train the network on ``X`` and ``y`` with mini-batch updates.

        Parameters
        ----------
        X : array of shape (n_samples, n_features)
        y : array of shape (n_samples,)
        monitor : object or None
            When given, ``monitor.reset()`` is called once, then
            ``monitor.monitor(self, context)`` is called before every epoch;
            a falsy return value stops the training. ``context`` is the dict
            of local variables of this method (including ``epoch`` and
            ``optimizer``).

        Returns
        -------
        self
        """
        X, y = check_X_y(X, y, dtype=floatX)
        self.label_binarizer_ = lb = LabelBinarizer()
        Y = lb.fit_transform(y).astype(floatX)
        n_samples, n_features = X.shape
        _, n_outputs = Y.shape
        if not self.warm_start or not hasattr(self, 'weights_'):
            self._init_parameters(n_features, n_outputs)

        optimizer = self._make_functions()
        self.training_costs_ = []
        if monitor is not None:
            monitor.reset()

        try:
            for epoch in range(self.n_epochs):
                if monitor is not None and not monitor.monitor(self, locals()):
                    break
                # Honour the configured batch size instead of iter_data's
                # built-in default.
                for X_batch, Y_batch in iter_data(
                        X, Y, batch_size=self.batch_size):
                    cost = self._fit(X_batch, Y_batch)
                    self.training_costs_.append(cost)
        except KeyboardInterrupt:
            # Allow stopping a long training run by hand while keeping the
            # parameters learned so far.
            print("Interruped by user")

        return self

    def predict_proba(self, X):
        """Return the network output (softmax activations) for ``X``."""
        X = check_array(X, dtype=theano.config.floatX)
        return self._forward(X)

    def predict(self, X):
        """Return, for each row of ``X``, the class with the largest output."""
        indices = self.predict_proba(X).argmax(axis=1)
        return self.label_binarizer_.classes_[indices]

In [15]:
# NOTE(review): scratch arithmetic; nothing in the visible cells uses this
# value - consider removing the cell.
1 / 1.5

0.6666666666666666

In [None]:
%%time
# Train a two-hidden-layer MLP with Nesterov momentum SGD. The learning rate
# starts at 0.1 and is multiplied by 0.2 whenever the monitor reports a
# stalled cost.
optimizer = NesterovMomentumSGD(learning_rate_init=.1, learning_rate_scale=0.2, momentum=0.9)
# optimizer = Adam()
mlp = MLPClassifier(hidden=(100, 100), batch_size=128, init_gain='auto', weight_decay=1e-6,
                    optimizer=optimizer, random_state=0)
# The shared early-stopping monitor prints one line per epoch and ends the
# training once its patience is exhausted.
mlp.fit(X_train, y_train, monitor=monitor)

Epoch #000, training cost: 0.325075, acc: 0.1302, validation cost: 0.325075, acc: 0.1460
Epoch #001, training cost: 0.084984, acc: 0.8498, validation cost: 0.089047, acc: 0.8380
Epoch #002, training cost: 0.052269, acc: 0.9106, validation cost: 0.058006, acc: 0.9070
Epoch #003, training cost: 0.038023, acc: 0.9386, validation cost: 0.041551, acc: 0.9340
Epoch #004, training cost: 0.029942, acc: 0.9510, validation cost: 0.031648, acc: 0.9480
Epoch #005, training cost: 0.025030, acc: 0.9618, validation cost: 0.026075, acc: 0.9570
Epoch #006, training cost: 0.021554, acc: 0.9650, validation cost: 0.023162, acc: 0.9610
Epoch #007, training cost: 0.018934, acc: 0.9692, validation cost: 0.021389, acc: 0.9650
Epoch #008, training cost: 0.016833, acc: 0.9718, validation cost: 0.020569, acc: 0.9670
Epoch #009, training cost: 0.014838, acc: 0.9740, validation cost: 0.019678, acc: 0.9690
Epoch #010, training cost: 0.013268, acc: 0.9762, validation cost: 0.018941, acc: 0.9710

In [8]:
# Accuracy on the held-out test split (``score`` is not defined on
# MLPClassifier itself; it comes from scikit-learn's ClassifierMixin).
mlp.score(X_test, y_test)

0.93110000000000004