In [None]:
import pandas as pd
import itertools
import numpy as np
import os
import keras
import random
import sys
import io
from keras import Sequential
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.callbacks import LambdaCallback
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D, LSTM, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding, Activation, Bidirectional
from keras.models import Model
from keras.initializers import Constant
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.utils import to_categorical

In [None]:
# Number of consecutive words that make up one model input sequence
# (window size when slicing the corpus, and the width of every input batch).
MAX_SEQUENCE_LENGTH = 50
# Number of columns in the embedding matrix; the notebook loads the 300d GloVe file.
EMBEDDING_DIM = 300

In [None]:
def shuffle_and_split_training_set(sentences_original, labels_original, percentage_test=33):
    """Shuffle sequences and labels in unison, then split them into train/test parts.

    Args:
        sentences_original: indexable collection of input sequences.
        labels_original: labels aligned index-by-index with ``sentences_original``.
        percentage_test: share of the data, in percent, placed in the test part.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` of Python lists.
    """
    print('Shuffling sentences')

    # One permutation drives both lists, so every sequence keeps its own label.
    order = np.random.permutation(len(sentences_original))
    shuffled_x = [sentences_original[pos] for pos in order]
    shuffled_y = [labels_original[pos] for pos in order]

    # Everything before cut_index is training data, the remainder is test data.
    train_fraction = 1. - (percentage_test / 100.)
    cut_index = int(len(sentences_original) * train_fraction)

    x_train = shuffled_x[:cut_index]
    x_test = shuffled_x[cut_index:]
    y_train = shuffled_y[:cut_index]
    y_test = shuffled_y[cut_index:]

    print("Training set = %d\nTest set = %d" % (len(x_train), len(y_test)))
    return x_train, y_train, x_test, y_test

In [None]:
def generator(sentence_list, next_word_list, batch_size):
    """Yield ``(x, y)`` batches of word ids forever, cycling through the data.

    Reads the notebook-level ``word_indices`` mapping and ``MAX_SEQUENCE_LENGTH``.

    Args:
        sentence_list: list of word sequences (each a list of tokens).
        next_word_list: target word for each sequence, aligned with ``sentence_list``.
        batch_size: number of samples per yielded batch.

    Yields:
        ``x``: int32 array of shape ``(batch_size, MAX_SEQUENCE_LENGTH)`` with word ids.
        ``y``: int32 array of shape ``(batch_size,)`` with the id of each next word.
    """
    cursor = 0
    while True:
        batch_x = np.zeros((batch_size, MAX_SEQUENCE_LENGTH), dtype=np.int32)
        batch_y = np.zeros((batch_size), dtype=np.int32)
        for row in range(batch_size):
            # Wrap around so the generator never runs out of samples.
            pos = cursor % len(sentence_list)
            for col, token in enumerate(sentence_list[pos]):
                batch_x[row, col] = word_indices[token]
            batch_y[row] = word_indices[next_word_list[pos]]
            cursor += 1
        yield batch_x, batch_y

In [None]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability array after temperature rescaling.

    Args:
        preds: array-like of probabilities.
        temperature: divisor applied to the log-probabilities; values below 1
            sharpen the distribution, values above 1 flatten it.

    Returns:
        Index of the drawn element.
    """
    probabilities = np.asarray(preds).astype('float64')
    # Rescale in log space, then renormalise so the weights sum to one again.
    rescaled = np.exp(np.log(probabilities) / temperature)
    rescaled = rescaled / np.sum(rescaled)
    # A single multinomial trial gives a one-hot row; argmax recovers its index.
    one_hot_draw = np.random.multinomial(1, rescaled, 1)
    return np.argmax(one_hot_draw)

In [None]:
def on_epoch_end(epoch, logs):
    """Print text generated by the current model at the end of an epoch.

    Used as the ``on_epoch_end`` hook of a Keras ``LambdaCallback``. A seed
    sequence is drawn at random from the training and test sequences, and for
    each diversity value 25 words are generated one at a time, sliding the
    input window forward by one word after every prediction.

    Reads the notebook-level names ``sentences``, ``sentences_test``, ``model``,
    ``word_indices``, ``indices_word`` and ``MAX_SEQUENCE_LENGTH``.

    Args:
        epoch: index of the epoch that just finished (only printed).
        logs: metrics dict supplied by Keras; not used.
    """
    print('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence. Index across both lists instead of
    # concatenating them: the concatenation copies every sequence reference
    # (twice per epoch in the previous version) just to read one element.
    n_train = len(sentences)
    seed_index = np.random.randint(n_train + len(sentences_test))
    if seed_index < n_train:
        seed = sentences[seed_index]
    else:
        seed = sentences_test[seed_index - n_train]

    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        # Work on a copy so the sequence stored in the dataset is never mutated.
        sentence = list(seed)
        print('----- Diversity:' + str(diversity) + '\n')
        print('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
        sys.stdout.write(' '.join(sentence))

        for i in range(25):
            # Encode the current window as a single-row batch of word ids.
            x_pred = np.zeros((1, MAX_SEQUENCE_LENGTH))
            for t, word in enumerate(sentence):
                x_pred[0, t] = word_indices[word]

            preds = model.predict(x_pred, verbose=0)[0]
            # `diversity` is passed to sample() as its temperature.
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]

            # Slide the window: drop the oldest word, append the new one.
            sentence = sentence[1:]
            sentence.append(next_word)

            sys.stdout.write(" "+next_word)
            sys.stdout.flush()
        print()

In [None]:
# Load the scripts table from a CSV in the working directory.
# Later cells read its 'Script' column.
text_path = "script.csv"
data = pd.read_csv(text_path)

In [None]:
# Keep only scripts shorter than 30000 characters. `.str.len()` returns NaN for
# missing values, so rows without a script fail the comparison and are dropped
# instead of raising a TypeError from len().
data = data[data['Script'].str.len() < 30000]

In [None]:
# De-duplicate the scripts while keeping first-seen order. A set iterates
# strings in hash order, which changes between interpreter runs, so the joined
# corpus (and every sequence cut across a script boundary) was not reproducible.
scripts_raw = list(dict.fromkeys(data["Script"].values.tolist()))

# The DataFrame is no longer needed; drop the reference so it can be freed.
del data

In [None]:
# Concatenate all scripts into one text, separated by single spaces.
joint_script = " ".join(scripts_raw)

In [None]:
# Lower-case the whole corpus so the vocabulary ignores capitalisation.
joint_script = joint_script.lower()

# Tokenise on single spaces only. Whitespace-only tokens are discarded, except
# a lone newline, which is kept as a token of its own.
scripts_in_words = [
    token
    for token in joint_script.split(' ')
    if token == '\n' or token.strip() != ''
]

# The joined text is no longer needed once it has been tokenised.
del joint_script

In [None]:
# Count how many times each token occurs in the corpus.
word_freq = {}
for word in scripts_in_words:
    if word in word_freq:
        word_freq[word] += 1
    else:
        word_freq[word] = 1

# The vocabulary is the sorted set of distinct tokens; a token's position in
# that list is its integer id, with lookup tables in both directions.
words = sorted(set(scripts_in_words))
print('total words:', len(words))
word_indices = {token: idx for idx, token in enumerate(words)}
indices_word = dict(enumerate(words))

In [None]:
# Rebuild the frequency table ordered from the rarest to the most frequent token.
word_freq = dict(sorted(word_freq.items(), key=lambda pair: pair[1]))

In [None]:
# Slide a window of MAX_SEQUENCE_LENGTH words over the corpus, `step` words at
# a time. Each window is one input sequence and the word right after it is the
# prediction target.
step = 1
window_starts = range(0, len(scripts_in_words) - MAX_SEQUENCE_LENGTH, step)
sentences = [
    scripts_in_words[start: start + MAX_SEQUENCE_LENGTH]
    for start in window_starts
]
next_words = [
    scripts_in_words[start + MAX_SEQUENCE_LENGTH]
    for start in window_starts
]
print('total sequences: ', len(sentences))

In [None]:
# Shuffle the (sequence, next word) pairs and hold out 20% of them as a test split.
# NOTE: this rebinds `sentences` / `next_words` to the training part only, so
# re-running this cell without re-running the windowing cell first splits the
# already reduced training data again.
sentences, next_words, sentences_test, next_words_test = shuffle_and_split_training_set(sentences, next_words, percentage_test=20)

In [None]:
# Parse the pretrained GloVe vectors into a {word: float32 vector} dict.
# Each line of the file is a word followed by its vector components.
glove_path = "glove.6B/glove.6B.300d.txt"
embeddings_index = {}
# The published GloVe text files are UTF-8; decode explicitly rather than
# relying on the platform default encoding. The `with` block also guarantees
# the file is closed if parsing fails part-way through.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Build the Embedding weights: row i must hold the GloVe vector of the word
# whose id is i in `word_indices`, because generator() and on_epoch_end() feed
# `word_indices` ids into the model. (Enumerating the frequency-sorted
# `word_freq` here would assign vectors to the wrong rows.)
embedding_matrix = np.zeros((len(words), EMBEDDING_DIM))
for word, i in word_indices.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    # Words not found in the embedding index keep their all-zero row.

# Frozen embedding layer initialised with the pretrained vectors.
embedding_layer = Embedding(len(words),
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

# Two stacked LSTMs followed by a dense head that outputs a softmax
# distribution over the whole vocabulary (one class per word id).
model = Sequential()

model.add(embedding_layer)

model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(len(words), activation='softmax'))

optimizer = RMSprop(learning_rate=0.01)

In [None]:
# The sparse variant of categorical cross-entropy is used because generator()
# yields targets as integer word ids rather than one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)

In [None]:
batch_size = 128
file_path = "models/word-epoch{epoch:03d}-loss{loss:.4f}-val_loss{val_loss:.4f}"
# Create the checkpoint directory up front so saving cannot fail on a fresh
# checkout where "models/" does not exist yet.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
# Save a checkpoint only when the training loss improves on the best so far.
checkpoint = ModelCheckpoint(file_path, monitor='loss', save_best_only=True)
# The "+ 1" covers a final partial batch; the generators wrap around the data,
# so that extra step is simply filled with samples from the start of the list.
# NOTE(review): fit_generator is deprecated in newer Keras releases in favour
# of fit(); confirm against the installed version before upgrading.
model.fit_generator(generator(sentences, next_words, batch_size),
                    steps_per_epoch=int(len(sentences)/batch_size) + 1,
                    epochs=30,
                    callbacks=[checkpoint, LambdaCallback(on_epoch_end=on_epoch_end)],
                    validation_data=generator(sentences_test, next_words_test, batch_size),
                    validation_steps=int(len(sentences_test)/batch_size) + 1)