In [1]:
import time
import os
import re
import numpy as np
import json
from math import ceil
from pattern.en import tokenize
from gensim.models import Word2Vec, KeyedVectors
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout, Activation
from keras.models import load_model
from keras.backend import clear_session

# Seed NumPy's global random number generator.
np.random.seed(233)
# Word-vector dimensionality: Word2Vec `size` and the Embedding layer's `output_dim`.
vocab_dim = 100
# Sequences are padded/truncated to this many tokens before being fed to the LSTM.
maxlen = 200
# Passed to Word2Vec as both `iter` and the `epochs` argument of `train`.
n_iterations = 20
# Word2Vec `min_count`.
n_exposures = 10
# Word2Vec context `window`.
window_size = 7
# Number of samples per batch produced by `data_generator`.
batch_size = 2048
# Number of LSTM training epochs; one checkpoint file is saved per epoch.
n_epochs = 10
# Word2Vec `workers` (training threads).
cpu_count = 8

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
class Loader:
    """Re-iterable view over a JSON-lines file.

    Every line of ``file`` is parsed with ``json.loads`` and yielded in file
    order. A fresh file handle is opened for each pass and closed when the
    pass finishes (or when the iterator is closed early), so the object can
    be iterated any number of times without leaking handles.
    """

    def __init__(self, file):
        self.file = file
        # Cached line count; 0 means "not counted yet".
        self.length = 0

    def __len__(self):
        """Return the number of lines in the file, counting them at most once
        when the file is non-empty (an empty file is recounted on each call)."""
        if self.length > 0:
            return self.length
        # Count into a local first so a read error cannot leave a partial
        # total stored in the cache.
        with open(self.file) as handle:
            count = sum(1 for _ in handle)
        self.length = count
        return self.length

    def __iter__(self):
        """Yield the decoded JSON value of each line."""
        with open(self.file) as handle:
            for line in handle:
                yield json.loads(line)

In [3]:
def num_lines(file):
    """Return the number of lines in the text file at path ``file``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    counting finishes instead of being left for the garbage collector.
    """
    with open(file) as handle:
        return sum(1 for _ in handle)

In [4]:
def time_now():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    stamp_format = "%Y-%m-%d %H:%M:%S"
    current = time.localtime()
    return time.strftime(stamp_format, current)

In [5]:
def word2vec_train(file):
    """Return word vectors, training them from ``file`` unless a cache exists.

    When a path named 'Word2Vec' already exists the vectors are loaded from
    it and ``file`` is not read. Otherwise a Word2Vec model is built and
    trained on the sentences produced by ``Loader(file)``, and its vectors
    are saved to 'Word2Vec' before being returned.
    """
    if not os.path.exists('Word2Vec'):
        sentences = Loader(file)
        settings = {
            'size': vocab_dim,
            'min_count': n_exposures,
            'window': window_size,
            'workers': cpu_count,
            'iter': n_iterations,
        }
        w2v = Word2Vec(**settings)
        # Two passes over the corpus object: one to build the vocabulary,
        # then the actual training.
        w2v.build_vocab(sentences)
        w2v.train(sentences,
                  total_examples = w2v.corpus_count,
                  epochs = n_iterations)
        vectors = w2v.wv
        # Only the vectors are needed from here on; drop the full model first.
        del w2v
        vectors.save('Word2Vec')
        return vectors
    return KeyedVectors.load('Word2Vec')

In [2]:
def create_dictionary(wv = None):
    """Build the word -> index mapping and the matching embedding matrix.

    Words are numbered from 1 in the iteration order of ``wv.vocab``; row 0
    of the matrix is never written and therefore stays all zeros.

    Returns:
        A tuple ``(n_symbols, w2indx, embedding_weights)`` where ``n_symbols``
        is the vocabulary size plus one, ``w2indx`` maps each word to its row
        index, and ``embedding_weights`` has shape ``(n_symbols, vocab_dim)``.
    """
    vocabulary = wv.vocab.keys()
    n_symbols = len(vocabulary) + 1
    embedding_weights = np.zeros((n_symbols, vocab_dim))
    w2indx = {}
    for index, word in enumerate(vocabulary, start = 1):
        w2indx[word] = index
        embedding_weights[index, :] = wv[word]
    return n_symbols, w2indx, embedding_weights

In [7]:
def data_generator(file_x, file_y, w2indx):
    """Yield ``(x, y)`` training batches forever, re-reading the files each pass.

    ``file_y`` holds one integer label per line and ``file_x`` one JSON list
    of tokens per line, in the same order. Each batch contains up to
    ``batch_size`` samples (the last batch of a pass may be shorter).

    Args:
        file_x: path to the JSON-lines file of tokenised texts.
        file_y: path to the file of integer labels.
        w2indx: mapping from word to embedding index; unknown words map to 0.

    Yields:
        ``(x, y)`` where ``x`` is the array returned by
        ``sequence.pad_sequences`` (post-padded / post-truncated to ``maxlen``)
        and ``y`` is the list of integer labels for the same samples.

    Raises:
        ValueError: if a whole pass over the files yields no sample. Without
            this check an empty label file would loop forever without ever
            yielding.
    """
    while True:
        produced = False
        # The `with` block closes both files at the end of a pass and also
        # when the suspended generator is closed or garbage-collected, which
        # is how an endless generator normally ends its life.
        with open(file_x) as fx, open(file_y) as fy:
            while True:
                x = []
                y = []
                for _ in range(batch_size):
                    txt = fy.readline()
                    if not txt:
                        # End of the label file ends the current pass.
                        break
                    y.append(int(txt))
                    tokens = json.loads(fx.readline())
                    x.append([w2indx.get(word, 0) for word in tokens])
                if not x:
                    break
                x = sequence.pad_sequences(x, maxlen = maxlen, padding='post', truncating='post')
                produced = True
                yield (x, y)
        if not produced:
            raise ValueError('no training samples could be read from %r' % (file_y,))

In [8]:
def train_lstm(n_symbols, w2indx, embedding_weights, file_x, file_y):
    """Build, compile and train the LSTM classifier, checkpointing every epoch.

    The network is Embedding (initialised from ``embedding_weights``, with
    index 0 masked) -> LSTM(50) -> Dropout(0.5) -> Dense(1) -> sigmoid, trained
    with binary cross-entropy and Adam. After each of the ``n_epochs`` epochs
    the model is saved to ``lstm_epoch_<epoch>.h5``.

    Args:
        n_symbols: number of rows of the embedding matrix.
        w2indx: word -> index mapping handed to ``data_generator``.
        embedding_weights: initial weights for the Embedding layer.
        file_x: path to the tokenised training texts.
        file_y: path to the training labels.

    Returns:
        The trained Keras model.
    """
    print('[%s] Defining a Simple Keras Model...' % (time_now()))
    embedding = Embedding(input_dim = n_symbols,
                          output_dim = vocab_dim,
                          mask_zero = True,
                          weights = [embedding_weights],
                          input_length = maxlen)
    recurrent = LSTM(units = 50, activation = 'sigmoid', recurrent_activation = 'hard_sigmoid')
    model = Sequential()
    for layer in (embedding, recurrent, Dropout(0.5), Dense(1), Activation('sigmoid')):
        model.add(layer)

    print('[%s] Compiling the Model...' % (time_now()))
    model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

    # Number of batches needed to cover every label line once.
    steps = ceil(num_lines(file_y) / batch_size)

    for epoch in range(n_epochs):
        print("[%s] Training (epoch %d)..." % (time_now(), epoch))
        # A new generator per epoch, so each epoch starts at the top of the files.
        batches = data_generator(file_x, file_y, w2indx)
        model.fit_generator(batches, steps_per_epoch = steps, epochs = 1, verbose = 1)
        model.save('lstm_epoch_%d.h5' % epoch)

    return model

In [9]:
def train():
    """Run the full pipeline and return the trained LSTM model.

    Word vectors come from ``word2vec_train('text.json')``, the vocabulary
    index and embedding matrix from ``create_dictionary``, and the classifier
    is trained by ``train_lstm`` on 'data_x.json' / 'data_y.json'. Progress
    messages are printed with timestamps.
    """
    print('[%s] Training a Word2Vec model...' % (time_now()))
    vectors = word2vec_train('text.json')
    # (n_symbols, w2indx, embedding_weights), forwarded as-is to train_lstm.
    dictionary = create_dictionary(vectors)
    print('[%s] Setting up Arrays for Keras Embedding Layer...' % (time_now()))
    trained = train_lstm(*dictionary, 'data_x.json', 'data_y.json')
    print('[%s] Finished.' % (time_now()))
    return trained

In [10]:
# Run the whole pipeline: word vectors, vocabulary index, then LSTM training.
# train_lstm saves a checkpoint named lstm_epoch_<i>.h5 after every epoch.
model = train()

[2018-05-17 02:23:09] Training a Word2Vec model...
[2018-05-17 02:23:12] Setting up Arrays for Keras Embedding Layer...
[2018-05-17 02:23:12] Defining a Simple Keras Model...
[2018-05-17 02:23:16] Compiling the Model...
[2018-05-17 02:23:18] Training (epoch 0)...
Epoch 1/1
[2018-05-17 02:29:59] Training (epoch 1)...
Epoch 1/1
[2018-05-17 02:36:38] Training (epoch 2)...
Epoch 1/1
[2018-05-17 02:43:18] Training (epoch 3)...
Epoch 1/1
[2018-05-17 02:49:57] Training (epoch 4)...
Epoch 1/1
[2018-05-17 02:56:36] Training (epoch 5)...
Epoch 1/1
[2018-05-17 03:03:15] Training (epoch 6)...
Epoch 1/1
[2018-05-17 03:09:55] Training (epoch 7)...
Epoch 1/1
[2018-05-17 03:16:34] Training (epoch 8)...
Epoch 1/1
[2018-05-17 03:23:14] Training (epoch 9)...
Epoch 1/1
[2018-05-17 03:29:53] Finished.


In [3]:
# Evaluate every per-epoch checkpoint on the test set and print
# precision / recall / F1 for label 1 together with the raw count matrix.
wv = KeyedVectors.load('Word2Vec')
_, w2indx, _ = create_dictionary(wv)
with open('data_test_x.json') as f:
    x = json.loads(f.read())
with open('data_test_y.json') as f:
    y = json.loads(f.read())

# The encoded and padded test inputs do not depend on the checkpoint, so they
# are built once here instead of once per loop iteration.
xx = [[w2indx.get(word, 0) for word in x[j]] for j in range(len(y))]
xx = sequence.pad_sequences(xx, maxlen = maxlen, padding='post', truncating='post')

for i in range(n_epochs):
    model = load_model('lstm_epoch_%d.h5' % i)
    predict = model.predict_classes(xx)
    # Release the model and the backend session before loading the next one.
    del model
    clear_session()

    # n[label][prediction]: rows are the labels found in y (0, 1, 2), columns
    # the predicted class (0 or 1). Row 2 is printed but not used in the metrics.
    n = np.zeros((3, 2))
    for j in range(len(y)):
        n[y[j]][predict[j][0]] += 1

    # Guard the denominators so a checkpoint that never predicts (or never
    # sees) label 1 reports 0.0 instead of NaN plus a RuntimeWarning.
    predicted_positive = n[1][1] + n[0][1]
    actual_positive = n[1][1] + n[1][0]
    p = n[1][1] / predicted_positive if predicted_positive else 0.0
    r = n[1][1] / actual_positive if actual_positive else 0.0
    f1 = (2 * p * r) / (p + r) if (p + r) else 0.0
    print('[Model %d] Precision = %.3f, Recall = %.3f, F1 = %.3f' % (i, p, r, f1))
    print(n)
    print()
del wv
del w2indx

[Model 0] Precision = 0.917, Recall = 0.932, F1 = 0.925
[[458.  42.]
 [ 34. 466.]
 [121. 129.]]

[Model 1] Precision = 0.935, Recall = 0.916, F1 = 0.925
[[468.  32.]
 [ 42. 458.]
 [130. 120.]]

[Model 2] Precision = 0.935, Recall = 0.922, F1 = 0.928
[[468.  32.]
 [ 39. 461.]
 [128. 122.]]

[Model 3] Precision = 0.938, Recall = 0.934, F1 = 0.936
[[469.  31.]
 [ 33. 467.]
 [122. 128.]]

[Model 4] Precision = 0.943, Recall = 0.926, F1 = 0.934
[[472.  28.]
 [ 37. 463.]
 [129. 121.]]

[Model 5] Precision = 0.949, Recall = 0.924, F1 = 0.936
[[475.  25.]
 [ 38. 462.]
 [129. 121.]]

[Model 6] Precision = 0.952, Recall = 0.920, F1 = 0.936
[[477.  23.]
 [ 40. 460.]
 [129. 121.]]

[Model 7] Precision = 0.947, Recall = 0.926, F1 = 0.936
[[474.  26.]
 [ 37. 463.]
 [130. 120.]]

[Model 8] Precision = 0.946, Recall = 0.916, F1 = 0.931
[[474.  26.]
 [ 42. 458.]
 [132. 118.]]

[Model 9] Precision = 0.952, Recall = 0.920, F1 = 0.936
[[477.  23.]
 [ 40. 460.]
 [133. 117.]]



In [5]:
# Sanity-check a trained checkpoint on a few hand-written sentences.
#
# The evaluation cell deletes both `model` and `w2indx` when it finishes, so
# they are rebuilt here; otherwise this cell only works with leftover kernel
# state and fails with NameError after Restart & Run All.
wv = KeyedVectors.load('Word2Vec')
_, w2indx, _ = create_dictionary(wv)
del wv
# NOTE(review): the checkpoint behind the recorded output is not shown in the
# notebook; the last epoch's checkpoint is assumed here - confirm.
model = load_model('lstm_epoch_%d.h5' % (n_epochs - 1))

x = ["Good!", "Bad.", "I like this book!", "I like this book.", "I hate this book."]
# Insert a space where a sentence end is glued to the next capital letter,
# then tokenize and lower-case. Raw strings keep `\g<1>` a regex group
# reference without relying on Python passing the invalid `\g` escape through.
x = [' '.join(tokenize(re.sub(r'([a-z][.!?]+)([A-Z])', r'\g<1> \g<2>', st, count = 0))).lower().split() for st in x]
xx = []
for st in x:
    xx.append([w2indx.get(word, 0) for word in st])
xx = sequence.pad_sequences(xx, maxlen = maxlen, padding='post', truncating='post')
predict = model.predict_classes(xx)
print(predict[:, 0])

[1 0 1 1 0]
