In [1]:
"""
Created on Tue Jul 31 22:44:51 2018

Single-hidden-layer neural network with dropout regularization (rate 0.5 or 0.8) for the large data set.

@author: Shasha

"""

import pandas as pd
import numpy as np

from keras.wrappers.scikit_learn import KerasClassifier
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence 
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Flatten

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

import bcolz

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def single_nn_model(input_dim=None, input_length=None, embedding_dim=32,
                    hidden_units=100, dropout_rate=0.8):
    """Build and compile a single-hidden-layer binary classifier.

    The network is Embedding -> Flatten -> Dense(relu) -> Dropout ->
    Dense(1, sigmoid), compiled with binary cross-entropy, the Adam
    optimizer and accuracy as the reported metric.

    Called with no arguments it builds exactly the original network, so it
    can still be used directly as a zero-argument ``build_fn``.

    Parameters
    ----------
    input_dim : int, optional
        Number of rows in the embedding table. Defaults to the notebook
        global ``vocab_size + 1``.
    input_length : int, optional
        Length of every input sequence. Defaults to the notebook global
        ``max_len``.
    embedding_dim : int
        Size of each embedding vector.
    hidden_units : int
        Width of the hidden dense layer.
    dropout_rate : float
        Value handed to the ``Dropout`` layer that follows the hidden layer.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    # The globals are looked up at call time, not at definition time, because
    # this cell is defined before the cells that assign vocab_size / max_len.
    if input_dim is None:
        input_dim = vocab_size + 1
    if input_length is None:
        input_length = max_len

    model = Sequential([
        Embedding(input_dim, embedding_dim, input_length=input_length),
        Flatten(),
        Dense(hidden_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [3]:
# Manual model Evaluation with K-fold

def cv_evaluate_nn_model(build_fn, X, y, nb_epoch=5, n_splits=5, batch_size=64, **kwargs):
    """Score a Keras model with stratified k-fold cross-validation.

    Parameters
    ----------
    build_fn : callable
        Function that returns a compiled Keras model; a fresh model is built
        from it by the ``KerasClassifier`` wrapper.
    X, y : array-like
        Features and labels handed to ``cross_val_score``.
    nb_epoch : int
        Number of training epochs per fold.
    n_splits : int
        Number of stratified folds.
    batch_size : int
        Mini-batch size used while fitting.
    **kwargs
        Accepted for call compatibility; currently not used.

    Returns
    -------
    The array of per-fold scores produced by ``cross_val_score``. The mean is
    also printed.
    """
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    # The epoch count is forwarded under the name `epochs`, matching the
    # direct model.fit(..., epochs=5) call used elsewhere in this notebook.
    # NOTE(review): the wrapper appears to forward only keyword names that
    # fit() accepts, so the old `nb_epoch` spelling would not reach it.
    model = KerasClassifier(build_fn=build_fn, epochs=nb_epoch, batch_size=batch_size, verbose=1)
    results = cross_val_score(model, X, y, cv=kfold)
    print('\nModel average accuracy: {:.2f}'.format(results.mean()))
    # Hand the fold scores back so callers can inspect more than the mean.
    return results
    

In [4]:
# Locations of the large training and test data files, relative to the notebook.
train_big, test_big = (
    "./dataset/training-data-large.txt",
    "./dataset/test-data-large.txt",
)

In [5]:
# Read the training file into a data frame: tab-separated, with the columns
# named Label and Sample in that order.
train_big_df = pd.read_csv(
    train_big,
    sep='\t',
    names=["Label", "Sample"],
)

# Number of comma-separated tokens in each training sample.
train_big_df_sample_len = train_big_df["Sample"].str.split(",").apply(len)


In [6]:
# Raw sample strings and their labels as NumPy arrays.
X_train = np.array(train_big_df["Sample"])
y = np.array(train_big_df["Label"])

# The test file has a single Sample column and no labels.
# NOTE(review): the separator here is a space while the training file uses a
# tab - confirm this matches the test file's layout.
test_big_df = pd.read_csv(
    test_big,
    sep=' ',
    names=["Sample"],
)

In [7]:
# Number of comma-separated tokens in each test sample.
test_big_df_sample_len = test_big_df["Sample"].str.split(",").apply(len)

# Raw test sample strings as a NumPy array.
X_test = np.array(test_big_df["Sample"])

# Pool train and test text so the vocabulary is built from both.
X_all = np.concatenate([X_train, X_test])

In [8]:
# Tokenizer that splits on commas and keeps the original letter case,
# fitted on the pooled train + test text.
tknzr = Tokenizer(
    split=',',
    lower=False,
)
tknzr.fit_on_texts(X_all)

In [9]:
# Shape of the raw training sample array.
print(X_train.shape)

# Vocabulary size = number of entries in the fitted tokenizer's word_counts.
vocab_size = len(tknzr.word_counts)
print("Size of vocab : ", vocab_size)

(1000000,)
Size of vocab :  67311


In [10]:
# Feature extraction

# Encoding the full vocabulary is too costly, so the tokenizer is capped at 20K.
max_features = 20000
max_len = 400

# Re-create and re-fit the tokenizer with the num_words cap applied.
tknzr = Tokenizer(num_words=max_features, lower=False, split=',')
tknzr.fit_on_texts(X_all)

# Convert each sample to a sequence of integer ids, then pad it to max_len.
X_TrainSeqs = sequence.pad_sequences(
    tknzr.texts_to_sequences(X_train),
    maxlen=max_len,
)
X_TestSeqs = sequence.pad_sequences(
    tknzr.texts_to_sequences(X_test),
    maxlen=max_len,
)

print(X_TrainSeqs.shape, X_TestSeqs.shape)

(1000000, 400) (100000, 400)


In [11]:
# Helpers that save arrays to disk with bcolz so they can be loaded back into memory quickly
def save_array(fname, arr):
    """Write ``arr`` to a bcolz carray rooted at ``fname`` and flush it.

    The carray is opened with mode ``'w'``.
    """
    on_disk = bcolz.carray(arr, rootdir=fname, mode='w')
    on_disk.flush()

def load_array(fname):
    """Open the bcolz store at ``fname`` and return a full slice (``[:]``) of it."""
    stored = bcolz.open(fname)
    return stored[:]

In [12]:
# Persist the padded sequences and the labels so later sessions can skip tokenization.
save_array(fname='./dataset/train_large.dat', arr=X_TrainSeqs)
save_array(fname='./dataset/test_large.dat', arr=X_TestSeqs)
save_array(fname='./dataset/y_train.dat', arr=y)

In [13]:
# Reload the saved arrays from disk.
X_TrainSeqs = load_array(fname='./dataset/train_large.dat')
X_TestSeqs = load_array(fname='./dataset/test_large.dat')
y = load_array(fname='./dataset/y_train.dat')

In [14]:
# Cross-validation would be very time consuming on data this large, so the
# training data is split once into a training part and a validation part.
# X_test / y_test below are that validation part of the labelled data.
X_train, X_test, y_train, y_test = train_test_split(
    X_TrainSeqs,
    y,
    test_size=0.2,
    random_state=0,
)

In [15]:
# Build the network, then train it while scoring the held-out split each epoch.
model = single_nn_model()
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test, y_test),  # manual validation
)

Train on 800000 samples, validate on 200000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f11feb7c208>

In [17]:
# Model outputs for the padded test sequences.
y_preds = model.predict(x=X_TestSeqs)

In [20]:
# Show the fifth prediction from the end.
# NOTE(review): if the last five predictions were intended, this should be y_preds[-5:] - confirm.
y_preds[-5]

array([0.9264522], dtype=float32)