In [5]:
import pickle
import random
import subprocess

import keras
import numpy as np
import pandas as pd
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

# Module-level Keras Tokenizer instance.
# NOTE(review): no other cell visible in this notebook references `tokenizer`;
# confirm whether it is still needed.
tokenizer = Tokenizer()

In [2]:
def extract_stems(line):
    """Tokenize ``line``, drop English stop words and Porter-stem the rest.

    Args:
        line: A string of raw text.

    Returns:
        A list with one stem per remaining token, in their original order.
    """
    words = word_tokenize(line)
    english_stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    # The stop-word comparison is done on the tokens exactly as produced by
    # the tokenizer (no case folding is applied before filtering).
    return [stemmer.stem(word) for word in words if word not in english_stop_words]

In [14]:
def get_one_hot(stems, vocab):
    """Encode a list of stems as a sequence of one-hot vectors.

    Args:
        stems: Iterable of stem strings.
        vocab: Mapping from stem to integer index. Indices are used directly
            as positions in a vector of length ``len(vocab) + 1``
            (presumably indices run from 1 to ``len(vocab)`` so that slot 0
            stays free -- confirm against the vocabulary builder).

    Returns:
        A list with one vector (list of ints) per stem. Stems that are not in
        ``vocab`` are encoded as an all-zero vector.
    """
    width = len(vocab) + 1
    seq = []
    for stem in stems:
        one_hot = [0] * width
        if stem in vocab:
            # Look up the current stem; the earlier code indexed with an
            # undefined name `word`.
            one_hot[vocab[stem]] = 1
        seq.append(one_hot)
    return seq

In [4]:
def load_vocab(path='vocab.pickle'):
    """Load the pickled vocabulary object from disk.

    Args:
        path: Location of the pickle file. Defaults to ``'vocab.pickle'`` in
            the current working directory.

    Returns:
        Whatever object was pickled into the file (the other cells use it as
        a stem -> index mapping).
    """
    # NOTE: pickle.load can execute arbitrary code while deserialising, so
    # only point this at vocabulary files you created yourself.
    with open(path, 'rb') as f:
        return pickle.load(f)

In [None]:
class DataGenerator(keras.utils.Sequence):
    """Keras Sequence that builds one-hot batches from two text files.

    Every non-blank line of ``bot_file`` becomes a sample labelled 1 and every
    non-blank line of ``gen_file`` a sample labelled 0. Lines are stemmed with
    ``extract_stems`` and one-hot encoded with ``get_one_hot`` only when a
    batch is requested, so just the raw text is kept in memory.

    NOTE(review): ``get_dataset`` in this notebook labels bot samples 0 and
    gen samples 1, i.e. the opposite polarity -- confirm which is intended.
    NOTE(review): a later cell defines another class with this same name,
    which replaces this one once that cell is executed.
    """

    def __init__(self, bot_file='tr-bot.txt', gen_file='tr-gen.txt', batch_size=64,
                 vocab=None, maxlen=80):
        """Read both files and prepare a shuffled list of labelled lines.

        Args:
            bot_file: Path of the text file holding the label-1 lines.
            gen_file: Path of the text file holding the label-0 lines.
            batch_size: Number of samples returned per batch.
            vocab: Stem -> index mapping passed to ``get_one_hot``. When
                omitted it is loaded with ``load_vocab()``.
            maxlen: Sequence length every sample is padded/truncated to.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError('batch_size must be a positive integer')
        self.bot_file = bot_file
        self.gen_file = gen_file
        self.batch_size = batch_size
        self.maxlen = maxlen
        self.vocab = load_vocab() if vocab is None else vocab
        # Keep (line, label) pairs instead of open file handles: batches can
        # then be addressed by index, and no file descriptor is left open or
        # shared between worker processes.
        self.samples = ([(line, 1) for line in self._read_lines(bot_file)] +
                        [(line, 0) for line in self._read_lines(gen_file)])
        self.size = len(self.samples)
        random.shuffle(self.samples)

    @staticmethod
    def _read_lines(path):
        """Return the non-blank lines of the text file at ``path``."""
        with open(path, 'r') as f:
            return [line for line in f if line.strip()]

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return self.size // self.batch_size

    def __getitem__(self, index):
        """Build batch number ``index``.

        Returns:
            A tuple ``(x, y)`` where ``x`` is the output of
            ``sequence.pad_sequences`` applied to the one-hot sequences of the
            batch and ``y`` is a NumPy array with the matching 0/1 labels.
        """
        start = index * self.batch_size
        batch = self.samples[start:start + self.batch_size]
        x_data = [get_one_hot(extract_stems(line), self.vocab) for line, _ in batch]
        y_data = [label for _, label in batch]
        x = sequence.pad_sequences(x_data, maxlen=self.maxlen)
        return x, np.array(y_data)

    def on_epoch_end(self):
        """Reshuffle the samples so every epoch sees a different batch order."""
        random.shuffle(self.samples)

In [3]:
def get_dataset(bot_train_tokens, gen_train_tokens, vocab):
    """Build a shuffled, labelled dataset from bot and gen token collections.

    Both inputs are converted with ``get_sequence(tokens, vocab)``; bot
    samples receive label 0 and gen samples label 1.

    Args:
        bot_train_tokens: Tokens of the bot documents, as accepted by
            ``get_sequence``.
        gen_train_tokens: Tokens of the gen documents, as accepted by
            ``get_sequence``.
        vocab: Vocabulary forwarded to ``get_sequence``.

    Returns:
        A tuple ``(x, y)`` of NumPy arrays in matching shuffled order. When
        the sequences differ in length ``x`` is a 1-D object array holding one
        sequence per element; otherwise it is whatever ``np.array`` builds
        from the equally sized sequences. Empty input yields two empty arrays.
    """
    bot_train_seq = get_sequence(bot_train_tokens, vocab)
    gen_train_seq = get_sequence(gen_train_tokens, vocab)
    x_train = bot_train_seq + gen_train_seq
    y_train = [0] * len(bot_train_seq) + [1] * len(gen_train_seq)
    train_set = list(zip(x_train, y_train))
    if not train_set:
        # zip(*[]) cannot be unpacked into two names, so handle this up front.
        return np.empty(0, dtype=object), np.array([], dtype=int)
    random.shuffle(train_set)
    x, y = zip(*train_set)
    # Recent NumPy releases refuse to build an array from ragged nested
    # sequences unless dtype=object is requested explicitly.
    is_ragged = len({len(seq) for seq in x}) > 1
    x_array = np.array(x, dtype=object) if is_ragged else np.array(x)
    return x_array, np.array(y)

In [4]:
def dataset_preparation(max_features):
    """Load the four CSV dumps and turn them into train and test datasets.

    The vocabulary is generated from the two training files only (limited by
    ``max_features``) and then used to encode both the training and the test
    documents via ``get_dataset``.

    Args:
        max_features: Forwarded to ``generate_vocab``.

    Returns:
        ``((x_train, y_train), (x_test, y_test))``.
    """
    source_files = (
        'tr-small-bot.csv',
        'tr-small-gen.csv',
        'test-bot-dump.csv',
        'test-gen-dump.csv',
    )
    # Files are processed in the order listed above.
    train_bot, train_gen, test_bot, test_gen = [
        extract_tokens(load_doc(name)) for name in source_files
    ]
    vocab = generate_vocab(train_bot, train_gen, max_features)
    # The training split is built before the test split, so the shuffles
    # inside get_dataset consume the random state in that order.
    x_train, y_train = get_dataset(train_bot, train_gen, vocab)
    x_test, y_test = get_dataset(test_bot, test_gen, vocab)
    return (x_train, y_train), (x_test, y_test)

In [5]:
class DataGenerator(keras.utils.Sequence):
    """Keras Sequence serving padded batches from in-memory data.

    NOTE: ``__getitem__`` reads the module-level name ``maxlen`` at call
    time, so that global must be defined before batches are requested.
    """

    def __init__(self, x_data, y_data, batch_size=16):
        """Store the samples, their labels and the batch size."""
        self.x_data, self.y_data = x_data, y_data
        self.batch_size = batch_size
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is dropped."""
        return int(len(self.x_data) // self.batch_size)

    def __getitem__(self, index):
        """Return ``(x, y)`` for batch ``index``.

        ``x`` is the matching slice of ``x_data`` passed through
        ``sequence.pad_sequences`` with the global ``maxlen``; ``y`` is the
        same slice of ``y_data``, returned unchanged.
        """
        first = index * self.batch_size
        last = first + self.batch_size
        padded = sequence.pad_sequences(self.x_data[first:last], maxlen=maxlen)
        return padded, self.y_data[first:last]

In [6]:
# Notebook-wide settings: vocabulary cap passed to dataset_preparation, the
# padded sequence length read by DataGenerator, and the batch size.
max_features, maxlen, batch_size = 10000, 80, 32

# Build the train and test splits, then report how many sequences each holds.
(x_train, y_train), (x_test, y_test) = dataset_preparation(max_features)
print(f'{x_train.shape} train sequences')
print(f'{x_test.shape} test sequences')


vocab size: 7075
(8194,) train sequences
(2000,) test sequences


In [7]:
# Pass the configured batch_size explicitly; without it both generators
# silently fall back to the class default of 16 and the notebook-level
# `batch_size` setting has no effect.
training_generator = DataGenerator(x_train, y_train, batch_size=batch_size)
validation_generator = DataGenerator(x_test, y_test, batch_size=batch_size)

In [22]:
# Single LSTM layer followed by a sigmoid unit for binary classification.
# The input shape is (timesteps, features): 80 equals the `maxlen` setting,
# and 7076 is the reported vocab size of 7075 plus one extra slot.
model = Sequential([
    LSTM(128, input_shape=(80, 7076), dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [23]:
# Train for 15 epochs from the generators, validating on the test split
# after each epoch; batches are prepared by 4 worker processes.
model.fit_generator(
    training_generator,
    epochs=15,
    validation_data=validation_generator,
    workers=4,
    use_multiprocessing=True,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15

Process ForkPoolWorker-56:
Process ForkPoolWorker-55:
Process ForkPoolWorker-50:
Process ForkPoolWorker-53:
Process ForkPoolWorker-51:
Process ForkPoolWorker-52:
Process ForkPoolWorker-49:
Process ForkPoolWorker-54:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/anaconda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/anaconda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/anaconda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/anaconda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/opt/anaconda/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwar

KeyboardInterrupt: 