# The classification task

This is where everything comes together: the actual classification task is carried out here.

In [116]:
# define a logger
import logging
# set the root logger's level to INFO so that the logging.info() progress
# messages emitted by the functions in this notebook are shown
logging.getLogger().setLevel(logging.INFO)

## Utility functions

In [117]:
class EarlyStopping(object):
    """Callback that aborts training once the validation loss stops improving.

    The instance tracks the lowest validation loss seen so far, the epoch it
    occurred in and the network parameters at that point. It is meant to be
    called with the network and its training history (a sequence of dicts
    holding at least 'valid_loss' and 'epoch').

    Attributes:
        patience: number of epochs without improvement that are tolerated.
        best_valid: lowest validation loss observed so far.
        best_valid_epoch: epoch in which `best_valid` was observed.
        best_weights: result of `nn.get_all_params_values()` at that epoch.
    """

    def __init__(self, patience=100):
        self.patience = patience
        self.best_valid = np.inf
        self.best_valid_epoch = 0
        self.best_weights = None

    def __call__(self, nn, train_history):
        """Inspect the latest history entry and stop training when out of patience.

        Raises:
            StopIteration: when more than `patience` epochs have passed since
                the best validation loss; the best parameters are handed back
                to the network via `nn.load_params_from` beforehand.
        """
        latest = train_history[-1]
        loss, epoch = latest['valid_loss'], latest['epoch']

        if loss < self.best_valid:
            # new optimum: remember it together with a snapshot of the weights
            self.best_valid, self.best_valid_epoch = loss, epoch
            self.best_weights = nn.get_all_params_values()
            return

        deadline = self.best_valid_epoch + self.patience
        if deadline < epoch:
            print("Early stopping.")
            summary = "Best valid loss was {:.6f} at epoch {}.".format(
                self.best_valid, self.best_valid_epoch)
            print(summary)
            # hand the best parameters back before aborting the training loop
            nn.load_params_from(self.best_weights)
            raise StopIteration()

In [118]:
# Read model from disk
from gensim.models import Doc2Vec
import os

def loadModel(dim=600):
    """Load the persisted Doc2Vec model of the given dimensionality.

    The model file is expected at ``<cwd>/cache/allDocs<dim>D.model``.

    Args:
        dim: vector dimensionality; one of 100, 300 or 600.

    Returns:
        The object returned by ``Doc2Vec.load``.

    Raises:
        ValueError: if `dim` is not one of the supported dimensionalities.
    """
    supportedDims = (100, 300, 600)
    if dim not in supportedDims:
        raise ValueError('dim must be 100, 300 or 600')

    fileName = ''.join(['allDocs', str(dim), 'D.model'])
    fullPath = os.path.join(os.getcwd(), 'cache', fileName)

    logging.info('start loading the model')
    loaded = Doc2Vec.load(fullPath)
    logging.info('loading completed')
    return loaded

In [119]:
# compile the corpus
import os
import json
from tqdm import tqdm
import numpy as np

def loadData(model=None, regression=False, jsonDir='data/json'):
    """Build the feature matrix and label vector for all usable documents.

    For every document tag known to ``model.docvecs`` the metadata file
    ``<jsonDir>/<tag>.json`` is read. Documents whose 'lang' is not 'en' or
    whose 'citedBy' is None are skipped. A document is labelled positive when
    ``int(citedBy) > 0``.

    Args:
        model: loaded Doc2Vec model providing ``docvecs.doctags`` and
            ``docvecs[tag]``. Required despite the ``None`` default.
        regression: if True, labels are two-element int32 rows
            (``[1, 0]`` for positive, ``[0, 1]`` for negative); otherwise
            they are int32 scalars (1 / 0).
        jsonDir: directory holding the per-document JSON metadata files.

    Returns:
        Tuple ``(X, y)`` of numpy arrays: document vectors and labels.

    Raises:
        ValueError: if no model is given.
    """
    if model is None:
        # fail early with a clear message instead of an AttributeError on None
        raise ValueError('loadData requires a loaded Doc2Vec model')

    if regression:
        zero = np.array([0, 1], dtype=np.int32)
        one = np.array([1, 0], dtype=np.int32)
    else:
        zero = np.int32(0)
        one = np.int32(1)

    X = []
    y = []

    logging.info('building corpus...')
    for docTag in tqdm(model.docvecs.doctags.keys()):
        with open(os.path.join(jsonDir, docTag + '.json')) as fh:
            meta = json.load(fh)

        if meta['lang'] != 'en' or meta['citedBy'] is None:
            logging.debug('{f} discarded from corpus.'.format(f=docTag))
            continue

        X.append(model.docvecs[docTag])
        y.append(one if int(meta['citedBy']) > 0 else zero)
        logging.debug('{f} absorbed into corpus.'.format(f=docTag))

    # transform to numpy arrays
    X = np.array(X)
    y = np.array(y)

    logging.info('corpus complete')
    return X, y

In [120]:
# classifying neural net
from lasagne import layers
from lasagne.updates import nesterov_momentum
from nolearn.lasagne import NeuralNet
from lasagne.nonlinearities import rectify, tanh, softmax, sigmoid

def makeLayers(depth=3):
    """Yield the (name, layer class) pairs describing a network of `depth` layers.

    The sequence is: 'input', 'hidden0', then one ('dropout<k-1>', 'hidden<k>')
    pair for every k in ``range(1, depth - 2)``, and finally 'output'.
    """
    yield ('input', layers.InputLayer)
    yield ('hidden0', layers.DenseLayer)

    # each additional hidden layer is preceded by its own dropout layer
    for hiddenIdx in range(1, depth - 2):
        pair = (
            ('dropout' + str(hiddenIdx - 1), layers.DropoutLayer),
            ('hidden' + str(hiddenIdx), layers.DenseLayer),
        )
        for entry in pair:
            yield entry

    yield ('output', layers.DenseLayer)
    
def makeParameters(arch, dropout_p, nonlinearity):
    """Build the per-layer keyword arguments for the layers named by makeLayers.

    Args:
        arch: sequence of layer sizes ``(input, hidden0, ..., output)``;
            must contain at least three entries.
        dropout_p: dropout probability assigned to every dropout layer.
        nonlinearity: output nonlinearity. Either None, a callable, or the
            name of a function in ``lasagne.nonlinearities``
            (e.g. ``'softmax'``), which is resolved to that function.

    Returns:
        dict mapping parameter names such as 'input_shape',
        'hidden0_num_units', 'dropout0_p', 'output_num_units' and
        'output_nonlinearity' to their values.

    Raises:
        ValueError: if `arch` has fewer than three entries or the
            nonlinearity name is unknown.
    """
    if len(arch) < 3:
        raise ValueError("The network must be at least 3 layers deep")
    depth = len(arch)

    if isinstance(nonlinearity, str):
        # look the name up explicitly instead of exec()-ing a formatted string
        from lasagne import nonlinearities
        try:
            nonlinearity = getattr(nonlinearities, nonlinearity)
        except AttributeError:
            raise ValueError(
                "unknown nonlinearity '{}'".format(nonlinearity))

    # the static ones
    params = {
        'input_shape': (None, arch[0]),
        'hidden0_num_units': arch[1],
    }

    # one dropout/hidden pair per additional hidden layer
    for i in range(1, depth - 2):
        params['dropout' + str(i - 1) + '_p'] = dropout_p
        params['hidden' + str(i) + '_num_units'] = arch[i + 1]

    # again some static ones
    params['output_num_units'] = arch[-1]
    params['output_nonlinearity'] = nonlinearity
    return params

def loadNN(arch=(100, 400, 160, 2), dropout_p=0.6, epochs=50,
           nonlinearity=None, regression=False, evalSize=.1):
    """Instantiate an (untrained) nolearn NeuralNet for the given architecture.

    Args:
        arch: layer sizes ``(input, hidden0, ..., output)``.
        dropout_p: dropout probability used for every dropout layer.
        epochs: maximum number of training epochs.
        nonlinearity: output nonlinearity; forced to 'softmax' unless
            `regression` is True.
        regression: passed through to NeuralNet.
        evalSize: fraction of the training data the net holds out for
            validation.

    Returns:
        A configured NeuralNet trained with Nesterov momentum and an
        EarlyStopping callback (patience 10).
    """
    if not regression and nonlinearity != 'softmax':
        nonlinearity = 'softmax'
        # adjacent literals instead of a backslash continuation, which would
        # embed the source indentation in the logged message
        logging.info("nonlinearity was set to 'softmax'. This is "
                     "the only non-linearity supported by classification")

    netArgs = makeParameters(arch, dropout_p, nonlinearity)

    # 'eval_size' is deprecated in newer nolearn releases in favour of
    # train_split=TrainSplit(...); fall back for versions without TrainSplit
    try:
        from nolearn.lasagne import TrainSplit
        netArgs['train_split'] = TrainSplit(eval_size=evalSize)
    except ImportError:
        netArgs['eval_size'] = evalSize

    return NeuralNet(
        # configuration
        layers=list(makeLayers(len(arch))),

        # optimization method:
        update=nesterov_momentum,
        update_learning_rate=0.01,
        update_momentum=0.9,

        regression=regression,
        max_epochs=epochs,
        verbose=1,
        on_epoch_finished=[EarlyStopping(patience=10)],
        **netArgs
        )

In [121]:
# NN creating factory
from sklearn.cross_validation import train_test_split
from collections import namedtuple
import numpy as np

# Notebook-level cache used by genNN. Values left behind by an earlier run
# are kept; otherwise each name starts out as an empty dict placeholder.
if 'model' not in globals():
    model = {}

# X and y belong together: if either is missing, both are reset
if 'X' not in globals() or 'y' not in globals():
    X, y = {}, {}

def genNN(inputDim=600, regression=False, arch=(400, 160, 2),
            dropout_p=.6, epochs=100, nonlinearity='softmax',
            evalSize=.1):
    """Train a network and return it together with its train/test data.

    The Doc2Vec model and the corpus built from it are cached in the
    notebook-level globals `model`, `X` and `y`. They are reloaded when the
    cached vectors do not match `inputDim` or when the cached labels do not
    match the label format required by `regression`.

    Args:
        inputDim: dimensionality of the document vectors (see loadModel).
        regression: train a regression net (two-column labels) instead of a
            classifier (scalar labels).
        arch: layer sizes without the input layer ``(hidden0, ..., output)``.
        dropout_p: dropout probability.
        epochs: maximum number of training epochs.
        nonlinearity: output nonlinearity, passed to loadNN.
        evalSize: used both as the test fraction of the train/test split and
            as the validation fraction inside the net.

    Returns:
        Tuple ``(net, data)`` where `net` is the fitted network and `data` is
        a namedtuple with fields X_train, X_test, y_train, y_test.
    """
    TrainTestData = namedtuple("TrainTest", ["X_train","X_test","y_train","y_test"])

    # get the persisted model, including the training
    # data associated with it. check if already loaded
    global model
    global X, y

    # cached vectors of a different dimensionality: drop model and data
    if isinstance(X, np.ndarray) and (X.ndim != 2 or X.shape[1] != inputDim):
        model, X, y = {}, {}, {}

    # cached labels built for the other mode (loadData produces 2-D labels
    # for regression, 1-D labels for classification): rebuild the corpus
    expectedLabelDims = 2 if regression else 1
    if isinstance(y, np.ndarray) and y.ndim != expectedLabelDims:
        X, y = {}, {}

    if isinstance(model, dict) and not model:
        model = loadModel(inputDim)

    # give the data in a convenient format
    if not (isinstance(X, np.ndarray) and isinstance(y, np.ndarray)):
        X, y = loadData(model=model, regression=regression)

    # add input layer to NN's arch
    myArch = [inputDim] + list(arch)

    # instantiate the network
    net = loadNN(arch=myArch, dropout_p=dropout_p, epochs=epochs,
                 nonlinearity=nonlinearity, regression=regression,
                 evalSize=evalSize)

    # carry out train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=evalSize, random_state=42)

    # train the network
    net.fit(X_train, y_train)

    return net, TrainTestData(X_train, X_test, y_train, y_test)

In [122]:
from collections import namedtuple

# Summary of one evaluation run:
#   All       - number of test samples
#   Pos / Neg - number of correct / incorrect predictions
#   SuccRate  - fraction of correct predictions
#   TrainLoss / ValidLoss - per-epoch losses taken from the training history
EvalResults = namedtuple('EvalResults', ['All','Pos','Neg', 'SuccRate',
                         'TrainLoss', 'ValidLoss'])

def evalPrediction(net, trainTestData, regression=False):
    """Score a trained network on the held-out test set.

    Args:
        net: trained network offering ``predict`` and ``train_history_``.
        trainTestData: object with ``X_test`` and ``y_test`` attributes.
        regression: if True, labels and predictions are compared on their
            first column; otherwise they are compared element-wise.

    Returns:
        EvalResults with the test size, the number of correct and incorrect
        predictions, the success rate (NaN for an empty test set) and the
        per-epoch train/validation losses.
    """
    # predict the test set
    predictions = net.predict(trainTestData.X_test)
    predictions = predictions.round().astype(np.int32)

    # dealing with the different data representations
    # of regression vs classification
    if regression:
        truthMatrix = trainTestData.y_test[:, 0] == predictions[:, 0]
    else:
        truthMatrix = trainTestData.y_test == predictions

    testSize = truthMatrix.shape[0]
    correct = truthMatrix.sum()
    incorrect = testSize - correct
    # an empty test set has no meaningful success rate
    succRate = correct / float(testSize) if testSize else float('nan')

    train_loss = np.array([i["train_loss"] for i in net.train_history_])
    valid_loss = np.array([i["valid_loss"] for i in net.train_history_])

    return EvalResults(testSize, correct, incorrect, succRate,
                       train_loss, valid_loss)

In [123]:
# automated nn testing
from collections import namedtuple
import pickle
import os

# one evaluated configuration: input dimensionality, dropout probability,
# architecture (without the input layer) and its EvalResults
NNCandidate = namedtuple('NNCandidate',
                         ['inputDim', 'dropout', 'arch', 'evalRes'])
NNcandidatesPath = os.path.join('cache', 'NNcandidates')

# predefined network architectures
archs = ((100, 2), (300, 2), (250, 80, 2), (500, 160, 2), (400, 150, 40, 2))

# make sure the target directory exists before the first result is written
os.makedirs(NNcandidatesPath, exist_ok=True)

# train and evaluate every combination, persisting each result on its own
for inputDim in (100, 300, 600):
    for dropout in (.4, .6):
        for arch in archs:
            net, trainTestData = genNN(inputDim=inputDim, arch=arch, dropout_p=dropout)
            evalRes = evalPrediction(net, trainTestData)
            candidate = NNCandidate(inputDim, dropout, arch, evalRes)
            candidateFile = str(inputDim) + str(dropout) + str(arch) + '.pickle'
            with open(os.path.join(NNcandidatesPath, candidateFile), 'wb') as fh:
                pickle.dump(candidate, fh)

# Neural Network with 10302 learnable parameters

## Layer information

  #  name       size
---  -------  ------
  0  input       100
  1  hidden0     100
  2  output        2

  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  -----
      1       [36m0.65171[0m       [32m0.63998[0m      1.01834      0.63967  0.18s
      2       [36m0.62694[0m       [32m0.63346[0m      0.98971      0.64444  0.20s
      3       [36m0.62147[0m       [32m0.63113[0m      0.98469      0.64577  0.19s
      4       [36m0.61808[0m       [32m0.62973[0m      0.98149      0.64691  0.18s
      5       [36m0.61515[0m       [32m0.62847[0m      0.97881      0.64996  0.18s
      6       [36m0.61243[0m       [32m0.62733[0m      0.97624      0.64996  0.18s
      7       [36m0.60979[0m       [32m0.62645[0m      0.97341      0.64996  0.18s
      8       [36m0.60721[0m       [32m0.62581[0m      0.97028      0.651

train_split=TrainSplit(eval_size=0.4)
  warn("The 'eval_size' argument has been deprecated, please use "
INFO:root:start loading the model
INFO:gensim.utils:loading Doc2Vec object from /home/user/projekte/econstorModelling/cache/allDocs300D.model
INFO:gensim.utils:loading docvecs recursively from /home/user/projekte/econstorModelling/cache/allDocs300D.model.docvecs.* with mmap=None
INFO:gensim.utils:loading doctag_syn0 from /home/user/projekte/econstorModelling/cache/allDocs300D.model.docvecs.doctag_syn0.npy with mmap=None
INFO:gensim.utils:loading syn0 from /home/user/projekte/econstorModelling/cache/allDocs300D.model.syn0.npy with mmap=None
INFO:gensim.utils:loading syn1 from /home/user/projekte/econstorModelling/cache/allDocs300D.model.syn1.npy with mmap=None
INFO:gensim.utils:setting ignored attribute syn0norm to None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:root:loading completed
INFO:root:building corpus...
INFO:root:corpus complete


Early stopping.
Best valid loss was 0.617407 at epoch 38.
Loaded parameters to layer 'hidden0' (shape 100x400).
Loaded parameters to layer 'hidden0' (shape 400).
Loaded parameters to layer 'hidden1' (shape 400x150).
Loaded parameters to layer 'hidden1' (shape 150).
Loaded parameters to layer 'hidden2' (shape 150x40).
Loaded parameters to layer 'hidden2' (shape 40).
Loaded parameters to layer 'output' (shape 40x2).
Loaded parameters to layer 'output' (shape 2).
# Neural Network with 30302 learnable parameters

## Layer information

  #  name       size
---  -------  ------
  0  input       300
  1  hidden0     100
  2  output        2

  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  -----
      1       [36m0.65630[0m       [32m0.62976[0m      1.04216      0.63720  0.41s
      2       [36m0.61977[0m       [32m0.61844[0m      1.00215      0.65015  0.33s
      3       [36m0.60920[0m       [32m0.615

INFO:root:start loading the model
INFO:gensim.utils:loading Doc2Vec object from /home/user/projekte/econstorModelling/cache/allDocs600D.model
INFO:gensim.utils:loading docvecs recursively from /home/user/projekte/econstorModelling/cache/allDocs600D.model.docvecs.* with mmap=None
INFO:gensim.utils:loading doctag_syn0 from /home/user/projekte/econstorModelling/cache/allDocs600D.model.docvecs.doctag_syn0.npy with mmap=None
INFO:gensim.utils:loading syn1 from /home/user/projekte/econstorModelling/cache/allDocs600D.model.syn1.npy with mmap=None
INFO:gensim.utils:loading syn0 from /home/user/projekte/econstorModelling/cache/allDocs600D.model.syn0.npy with mmap=None
INFO:gensim.utils:setting ignored attribute syn0norm to None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:root:loading completed
INFO:root:building corpus...
INFO:root:corpus complete


Early stopping.
Best valid loss was 0.613415 at epoch 26.
Loaded parameters to layer 'hidden0' (shape 300x400).
Loaded parameters to layer 'hidden0' (shape 400).
Loaded parameters to layer 'hidden1' (shape 400x150).
Loaded parameters to layer 'hidden1' (shape 150).
Loaded parameters to layer 'hidden2' (shape 150x40).
Loaded parameters to layer 'hidden2' (shape 40).
Loaded parameters to layer 'output' (shape 40x2).
Loaded parameters to layer 'output' (shape 2).
# Neural Network with 60302 learnable parameters

## Layer information

  #  name       size
---  -------  ------
  0  input       600
  1  hidden0     100
  2  output        2

  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  -----
      1       [36m0.65563[0m       [32m0.61905[0m      1.05909      0.64844  0.66s
      2       [36m0.61527[0m       [32m0.60395[0m      1.01874      0.66902  0.60s
      3       [36m0.60283[0m       [32m0.600

In [31]:
# pick the best performing NN
import os
import pickle
NNcandidatesPath = os.path.join('cache', 'NNcandidates')

results = []

# sorted() makes the load order independent of the file system
for fileName in sorted(os.listdir(NNcandidatesPath)):
    # skip anything that is not a persisted candidate (e.g. checkpoint dirs)
    if not fileName.endswith('.pickle'):
        continue
    with open(os.path.join(NNcandidatesPath, fileName), 'rb') as fh:
        # NOTE: unpickling executes arbitrary code; only load files written
        # by this notebook. The NNCandidate and EvalResults namedtuples must
        # already be defined in the running kernel.
        results.append(pickle.load(fh))

# ascending by success rate, so the best candidate comes last
results = sorted(results, key=lambda x: x.evalRes.SuccRate)
bestCandidate = results[-1] if results else None

In [None]:
# visualize the training progress
%pylab inline
import matplotlib.pyplot as plt

train_loss = np.array([i["train_loss"] for i in net.train_history_])
valid_loss = np.array([i["valid_loss"] for i in net.train_history_])

marginFactor = 1.2
train_min = min(train_loss)
train_max = max(train_loss)
valid_min = min(valid_loss)
valid_max = max(valid_loss)
y_min = min(train_min, valid_min)
y_max = min(train_max, valid_max)

plt.plot(train_loss, linewidth=3, label="train")
plt.plot(valid_loss, linewidth=3, label="valid")
plt.grid()
plt.legend()
plt.xlabel("epoch")
plt.ylabel("loss")
plt.ylim(y_min * 1/marginFactor, y_max * marginFactor)
plt.yscale("log")
plt.show()

In [None]:
from sklearn import manifold, datasets

# Embed the held-out document vectors of the last training run in 2-D and
# colour every point by its label.
X_test = trainTestData.X_test
y_test = trainTestData.y_test
# two-column (regression-style) labels carry the class indicator in column 0
color = y_test[:, 0] if y_test.ndim == 2 else y_test

tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
Y = tsne.fit_transform(X_test)
plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral)
plt.title("t-SNE")

plt.show()