In [1]:
import locale
import pickle
import random
import subprocess

import keras
import numpy as np
import pandas as pd
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

# NOTE(review): this Keras Tokenizer instance is not referenced by any other
# cell visible in this notebook (tokenisation below goes through nltk's
# word_tokenize) -- confirm whether it is still needed.
tokenizer = Tokenizer()

Using TensorFlow backend.


In [2]:
def extract_stems(line):
    """Turn one line of text into a list of Porter stems.

    The line is split with nltk's ``word_tokenize``; tokens found in nltk's
    English stop-word list are discarded and every remaining token is stemmed.

    Args:
        line: Text to process.

    Returns:
        List of stems, in the order the surviving tokens appear in ``line``.
    """
    words = word_tokenize(line)
    # Tokens are compared to the stop-word list exactly as tokenised
    # (no case folding is applied before the membership test).
    ignored = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in words if word not in ignored]

In [3]:
def get_one_hot(stems, vocab):
    """Translate stems into a sequence of integer vocabulary indices.

    Despite the name, the result is a list of indices, not one-hot vectors.

    Args:
        stems: Iterable of stem strings.
        vocab: Mapping from stem to an integer id.

    Returns:
        List with ``vocab[stem] - 1`` for every stem present in ``vocab``,
        in input order; stems missing from ``vocab`` are skipped.
    """
    return [vocab[stem] - 1 for stem in stems if stem in vocab]

In [4]:
def load_vocab(path='vocab.pickle'):
    """Load the pickled vocabulary object from disk.

    Args:
        path: Location of the pickle file. Defaults to ``'vocab.pickle'`` in
            the working directory, which is what this function always read
            before the parameter existed.

    Returns:
        Whatever object was pickled into ``path`` (the rest of the notebook
        uses it as a stem -> integer id mapping).

    Raises:
        OSError: If ``path`` cannot be opened.
        pickle.UnpicklingError: If the file is not a valid pickle.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only load
    # vocabulary files that come from a trusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)

In [5]:
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` producing balanced bot/general batches from two text files.

    Each batch takes ``batch_size // 2`` consecutive lines from the bot file
    (label 1) and the same number from the general file (label 0), converts
    every line to a padded index sequence and shuffles the samples inside the
    batch.

    Batches are addressed by ``index``: a line-offset table is built once per
    file so that ``__getitem__`` can seek straight to the requested lines.
    No file handle is kept open between calls, so the object holds no
    position state that depends on call order.
    """

    def __init__(self, vocab, maxlen, bot_file_name='tr-bot.txt', gen_file_name='tr-gen.txt', batch_size=64):
        """Index both input files and record the batching parameters.

        Args:
            vocab: Mapping from stem to integer id, passed to ``get_one_hot``.
            maxlen: Length every sequence is padded/truncated to.
            bot_file_name: Text file with one bot sample per line (label 1).
            gen_file_name: Text file with one general sample per line (label 0).
            batch_size: Samples per batch; half come from each file.

        Raises:
            ValueError: If ``batch_size`` is smaller than 2, since a batch
                needs at least one line from each file.
        """
        if batch_size < 2:
            raise ValueError('batch_size must be at least 2')
        self.bot_file_name = bot_file_name
        self.gen_file_name = gen_file_name
        self.vocab = vocab
        self.maxlen = maxlen
        self.batch_size = batch_size
        # Same codec the built-in open() selects for text mode when no
        # encoding is given, so decoded lines match a plain text-mode read.
        self._encoding = locale.getpreferredencoding(False)
        self._bot_offsets = self._index_lines(bot_file_name)
        self._gen_offsets = self._index_lines(gen_file_name)
        # Two samples (one per class) for every line pair available in BOTH
        # files, so a batch never has to read past the end of the shorter one.
        self.size = 2 * min(len(self._bot_offsets), len(self._gen_offsets))

    @staticmethod
    def _index_lines(file_name):
        """Return the byte offset at which each line of ``file_name`` starts."""
        offsets = []
        position = 0
        with open(file_name, 'rb') as handle:
            for raw_line in handle:
                offsets.append(position)
                position += len(raw_line)
        return offsets

    def _read_lines(self, file_name, offsets, first, count):
        """Read up to ``count`` lines of ``file_name`` starting at line ``first``."""
        last = min(first + count, len(offsets))
        if first >= last:
            return []
        with open(file_name, 'rb') as handle:
            handle.seek(offsets[first])
            raw_lines = [handle.readline() for _ in range(last - first)]
        # Undecodable bytes are dropped, mirroring open(..., errors='ignore').
        # assumes an ASCII-compatible encoding so that splitting on b'\n'
        # before decoding is safe -- TODO confirm for the data files in use.
        return [raw.decode(self._encoding, errors='ignore') for raw in raw_lines]

    def on_epoch_end(self):
        """Nothing to reset: batches are located by index, not by file position."""
        pass

    def __len__(self):
        """Return the number of full batches per epoch."""
        return self.size // self.batch_size

    def __getitem__(self, index):
        """Build batch number ``index``.

        Args:
            index: Batch position, ``0 <= index < len(self)``.

        Returns:
            Tuple ``(x, y)``: ``x`` is the array returned by
            ``sequence.pad_sequences`` for the batch, ``y`` a NumPy array of
            the matching 0/1 labels, both in the same shuffled order.

        Raises:
            IndexError: If ``index`` is outside ``range(len(self))``.
        """
        if not 0 <= index < len(self):
            raise IndexError('batch index out of range')

        half = self.batch_size // 2
        first = index * half
        bot_data = self._read_lines(self.bot_file_name, self._bot_offsets, first, half)
        gen_data = self._read_lines(self.gen_file_name, self._gen_offsets, first, half)

        # Use the vocabulary this instance was constructed with rather than
        # whatever notebook-level ``vocab`` happens to exist.
        x_data = [get_one_hot(extract_stems(line), self.vocab)
                  for line in bot_data + gen_data]
        y_data = [1] * len(bot_data) + [0] * len(gen_data)

        # Shuffle features and labels together so the two classes are mixed.
        train_set = list(zip(x_data, y_data))
        random.shuffle(train_set)
        x, y = zip(*train_set)
        # Hand pad_sequences a plain list of lists: building a NumPy array
        # from variable-length lists first is rejected by recent NumPy.
        x = sequence.pad_sequences(list(x), maxlen=self.maxlen)
        return x, np.array(y)

In [6]:
def get_test_data(maxlen, vocab=None, bot_file_name='test-bot.txt', gen_file_name='test-gen.txt'):
    """Load the whole test set into memory as padded index sequences.

    Args:
        maxlen: Length every sequence is padded/truncated to.
        vocab: Mapping from stem to integer id. When omitted, the
            notebook-level ``vocab`` variable is used, which is what this
            function always relied on before the parameter existed.
        bot_file_name: Text file with one bot sample per line (label 1).
        gen_file_name: Text file with one general sample per line (label 0).

    Returns:
        Tuple ``(x, y)``: ``x`` is the array returned by
        ``sequence.pad_sequences`` with all bot samples first followed by all
        general samples; ``y`` is a NumPy array of the matching 1/0 labels.
    """
    if vocab is None:
        # Backward-compatible fallback to the global defined in the config cell.
        vocab = globals()['vocab']

    with open(bot_file_name, 'r', errors='ignore') as file:
        bot_data = file.readlines()

    with open(gen_file_name, 'r', errors='ignore') as file:
        gen_data = file.readlines()

    x_data = [get_one_hot(extract_stems(line), vocab)
              for line in bot_data + gen_data]
    y_data = [1] * len(bot_data) + [0] * len(gen_data)

    # Pass the list of lists directly: building a NumPy array from
    # variable-length lists first is rejected by recent NumPy.
    x = sequence.pad_sequences(x_data, maxlen=maxlen)
    return x, np.array(y_data)

In [7]:
# Run configuration shared by the cells below.
# NOTE(review): max_features is not referenced by any other visible cell;
# the Embedding layer is sized from len(vocab) instead -- confirm it is needed.
max_features = 20000
# Sequence length handed to pad_sequences for training, validation and test data.
maxlen = 80
# Samples per batch for the generators and for model.evaluate.
batch_size = 64
# Vocabulary object read from vocab.pickle by load_vocab().
vocab = load_vocab()
print('vocab size:', len(vocab))

vocab size: 20000


In [12]:
# Training and validation batches come from separate bot/general file pairs,
# sharing the same vocabulary, padding length and batch size.
training_generator = DataGenerator(vocab, maxlen, 'tr-bot.txt', 'tr-gen.txt', batch_size)
validation_generator = DataGenerator(vocab, maxlen, 'val-bot.txt', 'val-gen.txt', batch_size)

In [13]:
# Binary classifier: embedding -> single LSTM layer -> one sigmoid unit.
model = Sequential([
    # One 128-dimensional vector per vocabulary entry.
    Embedding(len(vocab), 128),
    # Dropout on both the layer inputs and the recurrent connections.
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    # Single sigmoid output paired with the binary cross-entropy loss below.
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [14]:
# Load the full test set into memory, padded to the same length as the
# training batches.
x_test, y_test = get_test_data(maxlen)

In [15]:
# Train for 10 epochs from the training generator, validating against the
# validation generator; batches are produced by a single worker.
model.fit_generator(generator=training_generator,
                    validation_data=validation_generator,
                    workers=1,
                    epochs=10)

# Evaluate on the in-memory test arrays; with the loss and metrics compiled
# above, evaluate returns the loss followed by the accuracy.
score, acc = model.evaluate(x_test, y_test,
                           batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 0.6134560176481804
Test accuracy: 0.8054606119791666
