In [8]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import check_X_y
import theano
import theano.tensor as tt
from theano import shared
from time import time
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 

In [9]:
from sklearn.datasets import fetch_mldata
from sklearn.utils import check_array
from sklearn.cross_validation import train_test_split


def load_data(dtype=np.float32, order='C', n_train=60000):
    """Load MNIST and split it by position into a train and a test part.

    The dataset is fetched with ``fetch_mldata('MNIST original')``, the
    features are validated/converted by ``check_array`` and divided by 255,
    and the first ``n_train`` rows become the training set while the
    remaining rows become the test set. Apart from whatever
    ``fetch_mldata`` does internally, nothing is cached or memory-mapped
    here.

    Parameters
    ----------
    dtype : numpy dtype, default np.float32
        dtype handed to ``check_array`` for the feature matrix.
    order : str, default 'C'
        Memory layout handed to ``check_array``.
    n_train : int, default 60000
        Number of leading samples used for training.

    Returns
    -------
    X_train, X_test, y_train, y_test : arrays
        Features and targets of the two parts, in that order.
    """
    print("Loading dataset...")
    data = fetch_mldata('MNIST original')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]
    # Scale the features to [0, 1]. A float divisor keeps this a true
    # division even for an integer ``dtype`` on Python 2.
    X = X / 255.
    ## Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]
    return X_train, X_test, y_train, y_test

# Load the data once: the "dev" part is used for model development, the
# "test" part is kept aside.
X_dev, X_test, y_dev, y_test = load_data()

# Draw a 10,000-sample training subset from the development data so that
# experiments run faster; the second element of each returned pair is used
# as validation data.
X_train, X_val, y_train, y_val = train_test_split(
    X_dev, y_dev, random_state=0, train_size=10000)

Loading dataset...
Creating train-test split...


## Various neural network utilities

Mostly taken from Alec Radford and Kyle Kastner's gists:

- https://gist.github.com/Newmu/a56d5446416f5ad2bbac
- https://gist.github.com/kastnerkyle/f3f67424adda343fef40

In [4]:
def sharedX(X, dtype=theano.config.floatX, name=None):
    """Wrap ``X`` in a Theano shared variable.

    ``X`` is first converted to a NumPy array of ``dtype`` (Theano's
    configured ``floatX`` unless overridden); ``name`` is forwarded to
    ``theano.shared``.
    """
    values = np.asarray(X, dtype=dtype)
    return theano.shared(values, name=name)


def softmax(x):
    """Softmax activation, delegated to ``tt.nnet.softmax``."""
    activated = tt.nnet.softmax(x)
    return activated


def relu(x):
    """Rectified linear activation computed as ``(x + |x|) / 2``.

    Only ``+``, ``abs`` and ``/`` are used, so any object supporting those
    operations (numbers, NumPy arrays, symbolic tensors) is accepted.
    """
    # x + |x| equals 2*x for positive entries and 0 for negative ones.
    twice_positive_part = x + abs(x)
    return twice_positive_part / 2.0


def tanh(x):
    """Hyperbolic-tangent activation, delegated to ``tt.tanh``."""
    activated = tt.tanh(x)
    return activated


def sigmoid(x):
    """Logistic-sigmoid activation, delegated to ``tt.nnet.sigmoid``."""
    activated = tt.nnet.sigmoid(x)
    return activated


def linear(x):
    """Identity activation: the input is handed back unchanged."""
    unchanged = x
    return unchanged


def iter_data(*data, **kwargs):
    """Yield consecutive mini-batches sliced from one or more sequences.

    Parameters
    ----------
    *data : sliceable sequences
        The number of samples is taken from ``len(data[0])``; every
        sequence is sliced with the same ``[start:end]`` bounds.
    batch_size : int, keyword-only, default 128
        Maximum number of samples per batch. The last batch is shorter when
        the number of samples is not a multiple of ``batch_size``.

    Yields
    ------
    A slice of ``data[0]`` when a single sequence is given, otherwise a
    tuple with one slice per sequence.
    """
    batch_size = kwargs.get('batch_size', 128)
    n_samples = len(data[0])
    # Stepping through the start offsets avoids computing a batch count with
    # ``/``, which is a float (and breaks ``range``) on Python 3.
    for start in range(0, n_samples, batch_size):
        end = start + batch_size
        if len(data) == 1:
            yield data[0][start:end]
        else:
            yield tuple(d[start:end] for d in data)
            

class Adam(object):
    """Adam-style optimizer with no bias init correction and no b1 decay.

    Parameters
    ----------
    learning_rate : float, default 0.0001
        Step size applied to the normalized gradient.
    grad_momentum : float, default 0.1
        Weight ``b1`` given to the *current* gradient in the running mean
        ``m_t = b1 * g + (1 - b1) * m``.
    sq_grad_momentum : float, default 0.001
        Weight ``b2`` given to the *current* squared gradient in the running
        mean ``v_t = b2 * g**2 + (1 - b2) * v``.
    eps : float, default 1e-8
        Constant added to ``sqrt(v_t)`` in the denominator of the step.
    """

    def __init__(self, learning_rate=0.0001, grad_momentum=0.1,
                 sq_grad_momentum=0.001, eps=1e-8):
        self.learning_rate = learning_rate
        self.grad_momentum = grad_momentum
        self.sq_grad_momentum = sq_grad_momentum
        # Must be kept: make_updates needs it for the denominator.
        self.eps = eps

    def make_updates(self, params, cost):
        """Build the list of ``(shared_variable, new_value)`` update pairs.

        Parameters
        ----------
        params : list of shared variables
            Parameters to optimize; each must offer ``get_value()``.
        cost : symbolic scalar
            Expression differentiated with respect to ``params``.

        Returns
        -------
        list of tuples
            For every parameter, updates for its two accumulators followed
            by the update of the parameter itself.
        """
        b1 = self.grad_momentum
        b2 = self.sq_grad_momentum
        eps = self.eps
        updates = []
        grads = tt.grad(cost, params)
        for p, g in zip(params, grads):
            # Zero-initialized accumulators with the parameter's shape.
            m = shared(p.get_value() * 0.)
            v = shared(p.get_value() * 0.)
            m_t = (b1 * g) + ((1. - b1) * m)
            v_t = (b2 * tt.sqr(g)) + ((1. - b2) * v)
            g_t = m_t / (tt.sqrt(v_t) + eps)
            p_t = p - (self.learning_rate * g_t)
            updates.append((m, m_t))
            updates.append((v, v_t))
            updates.append((p, p_t))
        return updates


In [None]:
class EarlyStoppingMonitor(object):
    """Holder for validation data and a log file name, with a stub check.

    The stopping logic itself is not implemented yet: ``monitor`` always
    answers ``True``.
    """

    def __init__(self, X_val=None, y_val=None, data_log_filename=None):
        """Remember the validation inputs/targets and the log file name."""
        self.X_val, self.y_val = X_val, y_val
        self.data_log_filename = data_log_filename

    def monitor(self, model):
        """Return ``True`` unconditionally; ``model`` is currently unused."""
        # TODO: implement the actual early-stopping check.
        return True


class MLPClassifier(BaseEstimator, ClassifierMixin):
    """Work-in-progress scikit-learn style classifier built on Theano.

    ``__init__`` only stores the hyper-parameters. ``fit`` binarizes the
    labels, asks ``_make_model`` for the parameters, the cost and the
    symbolic batch variables, builds optimizer updates (``Adam()`` when no
    optimizer was given) and compiles a Theano function.

    NOTE(review): the class looks unfinished in the lines visible here; see
    the notes in ``_make_model`` and ``fit``.
    """
    
    def __init__(self, hidden=(100, 100), activation='relu',  batch_size=128,
                 optimizer=None, n_epochs=1000):
        """Store the hyper-parameters unchanged on the instance."""
        self.hidden = hidden
        self.activation = activation
        self.batch_size = batch_size
        self.optimizer = optimizer
        self.n_epochs = n_epochs
        
    def _make_model(self, X, Y):
        """Create the symbolic input and target variables for the model.

        NOTE(review): ``fit`` unpacks four values from this method, but no
        return statement is present in the visible lines.
        """
        # NOTE(review): `T` is not bound by the imports visible in this
        # notebook (the Theano tensor module is imported as `tt`) — confirm.
        input_batch = T.fmatrix()
        target_batch = T.fmatrix()
        
        
        
    def fit(self, X, y, monitor=None):
        """Validate the data, build the model and compile the training step.

        ``monitor``, when given, is called with the estimator once per epoch
        before anything else; a falsy return value ends the epoch loop.
        """
        X, y = check_X_y(X, y)
        self.label_binarizer_ = lb = LabelBinarizer()
        Y = lb.fit_transform(y)
        self.model_params_, self.model_cost_, input_batch, output_batch = self._make_model(X, Y)
        # Fall back to an Adam optimizer with its default settings.
        opt = self.optimizer
        if opt is None:
            opt = Adam()
        updates = opt.make_updates(self.model_params_, self.model_cost_)
        
        
        # NOTE(review): `data_batch` is not defined in this method;
        # presumably `input_batch` / `output_batch` were meant — confirm.
        self._fit_batch = theano.function([data_batch], self.model_cost_,
                                          updates=updates)
        
        # NOTE(review): in the lines visible here the loop only consults the
        # monitor; `_fit_batch` is never called.
        for e in range(self.n_epochs):
            if monitor is not None and not monitor(self):
                break
            