In [19]:
import random
import keras
import subprocess
import pickle
import numpy as np
import pandas as pd

from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

tokenizer = Tokenizer()

In [2]:
def extract_stems(line):
    tokens = word_tokenize(line)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    porter = PorterStemmer()
    stems = []
    for t in tokens:
        stems.append(porter.stem(t))
    return stems

In [50]:
def get_one_hot(stems, vocab):
    seq = []
    for stem in stems:
        one_hot = [0] * (len(vocab))
        if stem in vocab:
            one_hot[vocab[stem] - 1] = 1
        seq.append(one_hot)
    return seq

In [51]:
def load_vocab():
    with open('vocab.pickle', 'rb') as f:
        vocab = pickle.load(f)
        return vocab

In [79]:
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, vocab, maxlen, bot_file='tr-bot.txt', gen_file='tr-gen.txt', batch_size=64):
        'Initialization'
        self.bot_file = bot_file
        self.gen_file = gen_file
        self.vocab = vocab
        self.maxlen = maxlen
        self.size = int(subprocess.check_output(['wc', '-l', bot_file]).split()[0]) * 2
        self.batch_size = batch_size
        self.bot_file = open(self.bot_file, 'r', errors='ignore')
        self.gen_file = open(self.gen_file, 'r', errors='ignore')

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(self.size / self.batch_size)

    def __getitem__(self, index):
        'Generate one batch of data'
        bot_data = []
        for _ in range(self.batch_size):
            bot_data.append(self.bot_file.readline())
        gen_data = []
        for _ in range(self.batch_size):
            gen_data.append(self.gen_file.readline())

        x_data = []
        for line in bot_data:
            stems = extract_stems(line)
            x_data.append(get_one_hot(stems, vocab))
        for line in gen_data:
            stems = extract_stems(line)
            x_data.append(get_one_hot(stems, vocab))
        y_data = [1] * len(bot_data) + [0] * len(gen_data)

        train_set = list(zip(x_data, y_data))
        random.shuffle(train_set)
        x, y = zip(*train_set)
        x = sequence.pad_sequences(np.array(x), maxlen=self.maxlen)
        return x, np.array(y)

In [80]:
max_features = 20000
maxlen = 80
batch_size = 16
vocab = load_vocab()
print('vocab size:', len(vocab))

vocab size: 20000


In [81]:
training_generator = DataGenerator(vocab, maxlen, 'tr-bot.txt', 'tr-gen.txt', batch_size)
validation_generator = DataGenerator(vocab, maxlen, 'val-bot.txt', 'val-gen.txt', batch_size)

In [82]:
model = Sequential()
model.add(LSTM(128, input_shape=(80, len(vocab)), dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [83]:
model.fit_generator(generator=training_generator,
                    validation_data=validation_generator,
                    use_multiprocessing=True,
                    workers=4,
                    epochs=1)

Epoch 1/1
  371/16383 [..............................] - ETA: 1:27:34 - loss: 0.2655 - acc: 0.9181

Process ForkPoolWorker-86:
Process ForkPoolWorker-81:
Process ForkPoolWorker-83:
Process ForkPoolWorker-84:
Process ForkPoolWorker-85:
Process ForkPoolWorker-87:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/anaconda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/anaconda/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
Process ForkPoolWorker-88:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/anaconda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/anaconda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/anaconda/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda/lib/python3.7

KeyboardInterrupt: 

  File "/opt/anaconda/lib/python3.7/multiprocessing/queues.py", line 351, in get
    with self._rlock:
  File "/opt/anaconda/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/opt/anaconda/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda/lib/python3.7/multiprocessing/pool.py", line 127, in worker
    put((job, i, result))
  File "/opt/anaconda/lib/python3.7/multiprocessing/queues.py", line 363, in put
    with self._wlock:
  File "/opt/anaconda/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/opt/anaconda/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
KeyboardInterrupt
Traceback (most recent call last):
  File "/opt/anaconda/lib/python3.7/multiprocessing/pool.py", line 127, in worker
    put((job, i, result))
  File "/opt/anaconda/lib/python3.7/multipr