In [1]:
import random
from collections import Counter

import numpy as np
import pandas as pd
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

tokenizer = Tokenizer()

Using TensorFlow backend.


In [2]:
def prep_set(file):
    csv_file = pd.read_csv(file + '.csv')
    with open(file + '-dump.csv', 'w') as output:
        for line in csv_file.text.tolist():
            output.write(str(line) + '\n')


def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text.split('\n')


def extract_tokens(doc):
    total_stems = []
    for line in doc:
        tokens = word_tokenize(line)
        stop_words = set(stopwords.words('english'))
        tokens = [w for w in tokens if w not in stop_words]
        porter = PorterStemmer()
        stems = []
        for t in tokens:
            stems.append(porter.stem(t))
        total_stems.append(stems)
    return total_stems


def generate_vocab(bot_tokens, gen_tokens, max_features):
    vocab_counter = Counter()
    vocab_counter.update([t for row in bot_tokens for t in row])
    vocab_counter.update([t for row in gen_tokens for t in row])
    vocab = {key: val for key, val in vocab_counter.items() if val > 1}
    vocab_list = sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:max_features]
    vocab = {text: i + 1 for i, (text, _) in enumerate(vocab_list)}
    print('vocab size:', len(vocab))
    return vocab


def get_sequence(document, vocab):
    total_seq = []
    for tweet in document:
        seq = []
        for word in tweet:
            one_hot = [0] * (len(vocab) + 1)
            if word in vocab:
                one_hot[vocab[word]] += 1
            seq.append(one_hot)
        total_seq.append(seq)
    return total_seq


In [3]:
def get_dataset(bot_train_tokens, gen_train_tokens, vocab):
    bot_train_seq = get_sequence(bot_train_tokens, vocab)
    gen_train_seq = get_sequence(gen_train_tokens, vocab)
    x_train = bot_train_seq + gen_train_seq
    y_train = [0] * len(bot_train_seq) + [1] * len(gen_train_seq)
    train_set = list(zip(x_train, y_train))
    random.shuffle(train_set)
    x, y = zip(*train_set)
    return np.array(x), np.array(y)


In [4]:
def dataset_preparation(max_features):
    bot_train_tokens = extract_tokens(load_doc('tr-small-bot.csv'))
    gen_train_tokens = extract_tokens(load_doc('tr-small-gen.csv'))
    bot_test_tokens = extract_tokens(load_doc('test-bot-dump.csv'))
    gen_test_tokens = extract_tokens(load_doc('test-gen-dump.csv'))
    vocab = generate_vocab(bot_train_tokens, gen_train_tokens, max_features)
    x_train, y_train = get_dataset(bot_train_tokens, gen_train_tokens, vocab)
    x_test, y_test = get_dataset(bot_test_tokens, gen_test_tokens, vocab)
    return (x_train, y_train), (x_test, y_test)

In [5]:
import keras

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, x_data, y_data, batch_size=16):
        'Initialization'
        self.x_data = x_data
        self.y_data = y_data
        self.batch_size = batch_size
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.x_data) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        x = sequence.pad_sequences(self.x_data[index * self.batch_size: (index + 1) * self.batch_size], maxlen=maxlen)
        return x, self.y_data[index * self.batch_size: (index + 1) * self.batch_size]

In [6]:
max_features = 10000
maxlen = 80
batch_size = 32
(x_train, y_train), (x_test, y_test) = dataset_preparation(max_features)
print(x_train.shape, 'train sequences')
print(x_test.shape, 'test sequences')


vocab size: 7075
(8194,) train sequences
(2000,) test sequences


In [7]:
training_generator = DataGenerator(x_train, y_train)
validation_generator = DataGenerator(x_test, y_test)

In [22]:
model = Sequential()
model.add(LSTM(128, input_shape=(80, 7076), dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
model.fit_generator(generator=training_generator,
                    validation_data=validation_generator,
                    use_multiprocessing=True,
                    workers=4,
                    epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15