# Training a Neural model with the new, simplified dataset

In [107]:
%matplotlib inline
# standard library
import itertools
import sys, os
import re
import glob
import shutil
import functools
import math

# pandas
import pandas as pd
import dask.dataframe as dd
import dask.array as da
import dask
from scipy import sparse

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import randint as sp_randint

# sklearn
from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, GroupKFold
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA, NMF
from sklearn.svm import LinearSVC
from sklearn.externals.joblib import Memory
from sklearn.utils import class_weight
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


# tensorflow
import tensorflow as tf
import keras
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import Adagrad, SGD
from keras import activations
from keras.layers.advanced_activations import LeakyReLU
from keras.callbacks import Callback

# local imports
sys.path.append(os.path.join(os.getcwd(), "../src"))

import tf_utils, tf_experiments
from utils import dict_combinations, RecDict, ItemSelector, MyKerasClassifier
from keras_utils import KerasSparseClassifier, create_model, sparse_generator

# this styling is purely my preference
# less chartjunk
# Everything goes through a single `sns.set` call: `sns.set` re-applies the
# plotting context itself, so a separate `set_context` call placed before it
# would be overridden. The matplotlib rc key for line width is
# `lines.linewidth` (plural).
sns.set(context='notebook', style='ticks', palette='Set2', font_scale=1.5,
        rc={'lines.linewidth': 2.5})

In [6]:
# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [7]:
# Only relevant for the TensorFlow backend: hand Keras a session whose GPU
# options have `allow_growth` switched on.
if K.backend() == 'tensorflow':
    from keras.backend.tensorflow_backend import set_session

    config = tf.ConfigProto()
    gpu_options = config.gpu_options
    gpu_options.allow_growth = True

    session = tf.Session(config=config)
    set_session(session)

In [8]:
# define some constants
# A fixed integer seed keeps the group shuffling and the weight initialisation
# repeatable across "Restart & Run All". (The previous time-based seed relied
# on the `time` module, which this notebook never imports, and passed a float
# where an integer seed is expected.)
RANDOM_SEED = 41
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)

In [163]:
def group_argsort(x, shuffle=True):
    """Sorting function that preserves the grouping of elements.

    Equal elements end up next to each other; within a group the original
    relative order is kept.

    Parameters
    ----------
    x : array-like, 1-D
        Group label of every element (anything ``np.unique`` can handle).
    shuffle : bool, default True
        If True the groups are emitted in a random order drawn from the global
        ``np.random`` state; if False they are emitted in sorted label order.

    Returns
    -------
    numpy.ndarray of int
        Indices into ``x`` such that ``x[indices]`` has all equal labels
        stored contiguously.
    """
    x = np.asarray(x)  # also accept plain lists / tuples

    # group_ids[i] is the rank of x[i] among the sorted unique labels
    uniq_elems, group_ids = np.unique(x, return_inverse=True)
    group_ids = group_ids.ravel()

    if shuffle:
        # emit_order[j] is the group that should appear j-th in the output;
        # its inverse permutation gives every group its output position
        emit_order = np.random.permutation(len(uniq_elems))
        group_pos = np.argsort(emit_order)
        group_ids = group_pos[group_ids]

    # a stable sort keeps the original order inside each group
    return np.argsort(group_ids, kind='mergesort')

vals = np.array([1, 0, 2, 1, 1, 0, 1])
vals[group_argsort(vals, shuffle=True)]

array([2, 0, 0, 1, 1, 1, 1])

## Loading the Dragnet dataset

In [9]:
# Load all Dragnet DOM csv shards matching the glob. Later cells index the
# result by key ('numeric', 'text', 'y', 'id', 'is_block').
dataset = tf_utils.get_numpy_dataset(
    '../data/final/dragnet/dom-full-*.csv',
    text_cols=['class_text', 'id_text', 'block_text'],
    categorize_id=False,
)

  args2 = [_execute_task(a, cache) for a in args]


In [10]:
# dataset['text'][dataset['text'] != dataset['text']] = ''  # to fix the nan problem

In [11]:
def camel_case_split(identifier):
    """Insert a space at every camel-case boundary of ``identifier``.

    A boundary is a lower-case letter followed by an upper-case one, or the
    end of an upper-case run that is followed by a capitalised word
    (``'HTMLParser'`` -> ``'HTML Parser'``).
    """
    # the pattern has no capturing groups, so findall yields the whole matches
    word_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    words = re.findall(word_pattern, identifier)
    return ' '.join(words)

def preprocess_text(text):
    """Normalise an attribute string before character n-gram extraction.

    Underscores and hyphens become spaces, camel-case words are split, the
    result is lower-cased and padded with one space on each side.
    """
    spaced = text.replace('_', ' ').replace('-', ' ')
    words = camel_case_split(spaced).lower()
    return ' {} '.format(words)

## Simple Keras model (no sklearn compatibility)

In [193]:
# keep only the rows flagged as blocks
is_block = dataset['is_block'].ravel()
groups = dataset['id'][is_block, 0]  # for the url
data_y = dataset['y'][is_block]
data_X = RecDict({
    'numeric': dataset['numeric'][is_block],
    'text': dataset['text'][is_block, 0],
})

# transform pipeline (the sklearn pipeline but without estimation)
numeric_branch = Pipeline(steps=[
    ('select', ItemSelector(key='numeric')),
])
text_branch = Pipeline(steps=[
    ('select', ItemSelector(key='text')),
    ('text', TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 3), use_idf=False,
                             preprocessor=preprocess_text)),
])
feature_union = FeatureUnion(
    transformer_list=[('numeric', numeric_branch), ('class', text_branch)],
    transformer_weights={'numeric': 1.0, 'class': 1.0},
)
tr_pipeline = Pipeline(steps=[
    ('union', feature_union),
    ('normalizer', MaxAbsScaler()),
])

# transform the data
# NOTE(review): the transformers are fitted on every block row here, before
# any train/validation/test split is taken - confirm this is intended.
data_X = tr_pipeline.fit_transform(data_X, data_y)

# order them by group, for consistency
order = group_argsort(groups)
groups, data_X, data_y = groups[order], data_X[order], data_y[order]

In [194]:
# # get the splits(train, validation, test)
# splitter = GroupKFold(10).split(data_X, data_y, groups=groups)
# split_slices = [0, 0, 0]
# split_slices[2] = next(splitter)[1]
# split_slices[1] = next(splitter)[1]

# # get the rest
# split_slices[0] = np.ones(data_X.shape[0], dtype=bool)
# split_slices[0][split_slices[2]] = False
# split_slices[0][split_slices[1]] = False

# split_slices

# split_slices = [train indices, validation indices, test indices]
# test: the held-out part of the first GroupKFold fold
splitter = GroupKFold(10).split(data_X, data_y, groups=groups)
split_slices = [0, 0, 0]
split_slices[2] = next(splitter)[1]

# get the rest: every row that is not part of the test fold
rest = np.ones(data_X.shape[0], dtype=bool)
rest[split_slices[2]] = False
rest_ind = np.nonzero(rest)[0]

# first 90% of the remaining rows -> train, last 10% -> validation
# NOTE(review): this cut is positional, not group-aware, so the group at the
# boundary may end up on both sides.
split_point = int(rest_ind.shape[0] * .9)
split_slices[1] = rest_ind[split_point:]
split_slices[0] = rest_ind[:split_point]

split_slices

[array([     0,      1,      2, ..., 114251, 114252, 114253]),
 array([114254, 114255, 114256, ..., 126251, 126252, 126253]),
 array([  2500,   2501,   2502, ..., 125806, 125807, 125808])]

In [195]:
# add the metrics
class Metrics(Callback):
    """Keras callback reporting precision, recall and F1 after every epoch.

    Parameters
    ----------
    validation_data : tuple
        ``(X, y)`` pair to score on. ``X`` is fed through ``sparse_generator``
        and must expose ``.shape[0]``; ``y`` holds the matching targets.
    batch_size : int
        Batch size used when predicting on ``X``.
    prefix : str, keyword-only
        Prepended to every metric name (e.g. ``'val_'`` gives ``'val_f1'``),
        both in the printed line and in the ``logs`` keys.
    """

    def __init__(self, validation_data, batch_size, *args,  prefix='', **kwargs):
        super().__init__(*args, **kwargs)
        self._validation_data = validation_data
        self._batch_size = batch_size
        self.prefix = prefix

    def on_epoch_end(self, epoch, logs=None):
        """Score the stored data and write the metrics into ``logs``."""
        # never mutate a shared default dict
        if logs is None:
            logs = {}

        features, target = self._validation_data
        n_batches = int(np.ceil(features.shape[0] / self._batch_size))
        preds = self.model.predict_generator(
            sparse_generator(features, None, self._batch_size, shuffle=False),
            steps=n_batches
        )

        # round the predicted scores to hard 0/1 labels
        predict = np.round(np.asarray(preds))
        results = {
            'precision': precision_score(target, predict),
            'recall': recall_score(target, predict),
            'f1': f1_score(target, predict)
        }
        print(' - '.join('{}{}: {}'.format(self.prefix, name, val) for name, val in results.items()))

        # expose the values under their prefixed names so that later callbacks
        # can monitor them (e.g. monitor='val_f1')
        for name, val in results.items():
            logs['{}{}'.format(self.prefix, name)] = val
        
# one callback per split: validation metrics get the 'val_' prefix,
# training metrics are logged without a prefix
val_pair = (data_X[split_slices[1]], data_y[split_slices[1]])
train_pair = (data_X[split_slices[0]], data_y[split_slices[0]])

metrics = Metrics(val_pair, 1024, prefix='val_')
metrics_train = Metrics(train_pair, 1024)

In [196]:
# compute the 'balanced' class weights from the training labels only
train_labels = data_y[split_slices[0]]
weights = class_weight.compute_class_weight(class_weight='balanced', classes=[0, 1], y=train_labels)
weights

array([0.83553384, 1.24508133])

In [197]:
# define the callbacks and the model
# both callbacks watch 'val_f1' and treat higher values as better
best_weights_path = '/home/nikitautiu/models/keras/weights.best.hdf5'
checkpoint = keras.callbacks.ModelCheckpoint(
    best_weights_path,
    monitor='val_f1',
    verbose=1,
    save_best_only=True,
    mode='max',
)
early_stopper = keras.callbacks.EarlyStopping(
    monitor='val_f1',
    min_delta=0.0001,
    patience=50,
    verbose=1,
    mode='max',
)

model = create_model(
    nb_features=data_X.shape[1],
    hidden_layers=[1000, 500, 100],
    activation='relu',
    dropout=.2,
    optimizer='rmsprop',
)

In [198]:
# train

# Keras documents `class_weight` as a dict {class index: weight}; the weights
# computed earlier come back as an array, so convert them if needed.
class_weight_map = weights if isinstance(weights, dict) else dict(enumerate(weights))

model.fit_generator(
    sparse_generator(data_X[split_slices[0]], data_y[split_slices[0]], 1024, shuffle=True),
    steps_per_epoch=int(np.ceil(data_X[split_slices[0]].shape[0] / 1024)),
    # the batch size is passed explicitly so that it matches the 1024 used to
    # compute `validation_steps` below
    validation_data=sparse_generator(data_X[split_slices[1]], data_y[split_slices[1]], 1024, shuffle=False),
    validation_steps=int(np.ceil(data_X[split_slices[1]].shape[0] / 1024)),
    class_weight=class_weight_map,
    # metrics callbacks come first: the checkpoint and the early stopper
    # monitor the 'val_f1' value that `metrics` writes into the logs
    callbacks=[metrics_train, metrics,  checkpoint, early_stopper],
    epochs=50
)

Epoch 1/50
recall: 0.892300574656667 - precision: 0.9292742303595882 - f1: 0.9104121636729523
val_recall: 0.8690361213057756 - val_precision: 0.930506721820062 - val_f1: 0.8987215341590091

Epoch 00001: val_f1 improved from -inf to 0.89872, saving model to /home/nikitautiu/models/keras/weights.best.hdf5
Epoch 2/50
recall: 0.9365686179020162 - precision: 0.9345206278244813 - f1: 0.935543502055311
val_recall: 0.9136565578520378 - val_precision: 0.9307359307359307 - val_f1: 0.9221171654157325

Epoch 00002: val_f1 improved from 0.89872 to 0.92212, saving model to /home/nikitautiu/models/keras/weights.best.hdf5
Epoch 3/50
recall: 0.9501314892373625 - precision: 0.9398333253046871 - f1: 0.9449543506163273
val_recall: 0.9200309059300753 - val_precision: 0.9342879560612005 - val_f1: 0.9271046228710463

Epoch 00003: val_f1 improved from 0.92212 to 0.92710, saving model to /home/nikitautiu/models/keras/weights.best.hdf5
Epoch 4/50
recall: 0.9497175416382585 - precision: 0.9518730933496035 - f1: 

recall: 0.9684425830330184 - precision: 0.9749950970778584 - f1: 0.9717077937942828
val_recall: 0.9464941085570794 - val_precision: 0.9712586719524281 - val_f1: 0.9587164938368227

Epoch 00022: val_f1 did not improve
Epoch 23/50
recall: 0.9742378494204733 - precision: 0.9733372257091423 - f1: 0.9737873293255774
val_recall: 0.9569248599575043 - val_precision: 0.9619417475728156 - val_f1: 0.9594267454246153

Epoch 00023: val_f1 did not improve
Epoch 24/50
recall: 0.9679799357163729 - precision: 0.9747204786190663 - f1: 0.9713385134144554
val_recall: 0.9534479428240293 - val_precision: 0.9633099141295862 - val_f1: 0.9583535579069993

Epoch 00024: val_f1 did not improve
Epoch 25/50
recall: 0.9758205902405767 - precision: 0.972505338769171 - f1: 0.9741601439058778
val_recall: 0.9658103148541627 - val_precision: 0.9567546880979717 - val_f1: 0.9612611746611555

Epoch 00025: val_f1 improved from 0.96119 to 0.96126, saving model to /home/nikitautiu/models/keras/weights.best.hdf5
Epoch 26/50
rec

recall: 0.9824437518262394 - precision: 0.9692274430671663 - f1: 0.9757908484086293
val_recall: 0.9708325284914043 - val_precision: 0.9506336296576509 - val_f1: 0.9606269113149847

Epoch 00045: val_f1 did not improve
Epoch 46/50
recall: 0.9783773254115126 - precision: 0.9776869357860671 - f1: 0.9780320087628553
val_recall: 0.9611744253428627 - val_precision: 0.9669646327244462 - val_f1: 0.964060835028577

Epoch 00046: val_f1 did not improve
Epoch 47/50
recall: 0.9853413850199669 - precision: 0.9691062362295239 - f1: 0.9771563797932966
val_recall: 0.9675487734209002 - val_precision: 0.958293476181366 - val_f1: 0.9628988850442136

Epoch 00047: val_f1 did not improve
Epoch 48/50
recall: 0.9822733028148437 - precision: 0.9737610736958988 - f1: 0.9779986665858538
val_recall: 0.9679350975468418 - val_precision: 0.9572110792741165 - val_f1: 0.9625432193622743

Epoch 00048: val_f1 did not improve
Epoch 49/50
recall: 0.9778659783773254 - precision: 0.9769857681547257 - f1: 0.9774256751000937
va

<keras.callbacks.History at 0x7f4e9b28d7b8>

In [199]:
# load the weights written to this path by the checkpoint callback
best_weights_path = '/home/nikitautiu/models/keras/weights.best.hdf5'
model.load_weights(best_weights_path)

In [200]:
# F1 on the training split (split_slices[0])
preds = model.predict_generator(
    sparse_generator(data_X[split_slices[0]], None, 1024, shuffle=False),
    steps=int(np.ceil(data_X[split_slices[0]].shape[0] / 1024))
)
preds = np.round(preds)  # scores -> hard 0/1 labels
# sklearn's signature is f1_score(y_true, y_pred)
f1_score(data_y[split_slices[0]], preds)

0.9749436049191064

In [201]:
# F1 on the validation split (split_slices[1])
preds = model.predict_generator(
    sparse_generator(data_X[split_slices[1]], None, 1024, shuffle=False),
    steps=int(np.ceil(data_X[split_slices[1]].shape[0] / 1024))
)
preds = np.round(preds)  # scores -> hard 0/1 labels
# sklearn's signature is f1_score(y_true, y_pred)
f1_score(data_y[split_slices[1]], preds)

0.9663354876049001

In [202]:
# F1 on the test split (split_slices[2])
preds = model.predict_generator(
    sparse_generator(data_X[split_slices[2]], None, 1024, shuffle=False),
    steps=int(np.ceil(data_X[split_slices[2]].shape[0] / 1024))
)
preds = np.round(preds)  # scores -> hard 0/1 labels
# sklearn's signature is f1_score(y_true, y_pred)
f1_score(data_y[split_slices[2]], preds)

0.9644556962025316

## sklearn-compatible model

In [224]:
# is_block = dataset['is_block'].ravel()
# data_X = RecDict({'numeric': dataset['numeric'][is_block], 'text': dataset['text'][is_block, 0]})
# data_y = dataset['y'][is_block]
# groups = dataset['id'][is_block, 0]  # for the url

# This cell takes every row of the dataset: `is_block` is still computed,
# but it is not used to filter anything here.
is_block = dataset['is_block'].ravel()
groups = dataset['id'][:, 0]  # for the url
data_y = dataset['y'][:]
data_X = RecDict({
    'numeric': dataset['numeric'][:],
    'text': dataset['text'][:, 0],
})

# order them by group, for consistency
order = group_argsort(groups)
groups, data_X, data_y = groups[order], data_X[order], data_y[order]

In [225]:
# get the split slices (just train, test - the validation is split in the estimator)
# split_slices = [train indices, test indices]
splitter = GroupKFold(10).split(data_X, data_y, groups)
_, test_indices = next(splitter)

# train = every row that is not in the test part of the first fold
train_mask = np.ones(data_X.shape[0], dtype=bool)
train_mask[test_indices] = False

split_slices = [np.nonzero(train_mask)[0], test_indices]

split_slices

[array([      0,       1,       2, ..., 1085425, 1085426, 1085427]),
 array([  12267,   12268,   12269, ..., 1082103, 1082104, 1082105])]

In [230]:
# define the classifier
estimator = MyKerasClassifier(
    create_model,
    shuffle=True,
    hidden_layers=[1000, 500, 100],
    optimizer='rmsprop',
    dropout=.2,
    activation='relu',
    class_weight='balanced',
    epochs=25,
    patience=200,
)

# define the pipeline: numeric columns and character-trigram text features
# are concatenated, scaled, then passed to the classifier
numeric_branch = Pipeline(steps=[
    ('select', ItemSelector(key='numeric')),
])
text_branch = Pipeline(steps=[
    ('select', ItemSelector(key='text')),
    ('text', TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 3), use_idf=False,
                             preprocessor=preprocess_text)),
])
feature_union = FeatureUnion(
    transformer_list=[('numeric', numeric_branch), ('class', text_branch)],
    transformer_weights={'numeric': 1.0, 'class': 1.0},
)
est = Pipeline(steps=[
    ('union', feature_union),
    ('normalizer', MaxAbsScaler()),
    ('classify', estimator),
])

In [231]:
# fit the whole pipeline on the training rows only
train_indices = split_slices[0]
est.fit(data_X[train_indices], data_y[train_indices])

Epoch 1/25
val_recall: 0.8694571073352673 - val_precision: 0.6218138707765264 - val_f1: 0.725073440470019

Epoch 00001: val_f1 improved from -inf to 0.72507, saving model to /tmp/tmpvib_42tl
Epoch 2/25
val_recall: 0.8547451305428927 - val_precision: 0.5920769341179848 - val_f1: 0.699567540066141

Epoch 00002: val_f1 did not improve
Epoch 3/25
val_recall: 0.9094488188976378 - val_precision: 0.657331136738056 - val_f1: 0.763105276884291

Epoch 00003: val_f1 improved from 0.72507 to 0.76311, saving model to /tmp/tmpvib_42tl
Epoch 4/25
val_recall: 0.8976377952755905 - val_precision: 0.7071498530852106 - val_f1: 0.7910883856829803

Epoch 00004: val_f1 improved from 0.76311 to 0.79109, saving model to /tmp/tmpvib_42tl
Epoch 5/25
val_recall: 0.8984666390385412 - val_precision: 0.6105322444381864 - val_f1: 0.727028839704896

Epoch 00005: val_f1 did not improve
Epoch 6/25
val_recall: 0.8818897637795275 - val_precision: 0.5969144460028051 - val_f1: 0.711943793911007

Epoch 00006: val_f1 did not 

Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('numeric', Pipeline(memory=None, steps=[('select', ItemSelector(key='numeric'))])), ('class', Pipeline(memory=None,
     steps=[('select', ItemSelector(key='text')), ('text', TfidfVectorizer(analyzer='char_wb', binary=False, decode_er...alizer', MaxAbsScaler(copy=True)), ('classify', <utils.MyKerasClassifier object at 0x7f4e232a5710>)])

In [232]:
# F1 on the training rows; sklearn's signature is f1_score(y_true, y_pred)
f1_score(data_y[split_slices[0]], est.predict(data_X[split_slices[0]]))

0.902261108529168

In [233]:
# F1 on the held-out test rows; sklearn's signature is f1_score(y_true, y_pred)
f1_score(data_y[split_slices[1]], est.predict(data_X[split_slices[1]]))

0.8903566710700132