In [1]:
import pandas as pd
import itertools
import numpy as np
import os
import keras
from keras import Sequential
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.callbacks import LambdaCallback
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D, LSTM, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding, Activation, Bidirectional
from keras.models import Model
from keras.initializers import Constant
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.utils import to_categorical
import random
import sys
import io

Using TensorFlow backend.


In [2]:
# Number of words in each input window fed to the model (also the Embedding input_length).
MAX_SEQUENCE_LENGTH = 50
# Width of each word vector; the embedding matrix is filled from the 300-d GloVe file.
EMBEDDING_DIM = 300

In [3]:
def shuffle_and_split_training_set(sentences_original, labels_original, percentage_test=33):
    """Shuffle sentences and labels in unison, then split them into train/test parts.

    Args:
        sentences_original: indexable collection of samples.
        labels_original: indexable collection of labels, aligned with the samples.
        percentage_test: percentage (0-100) of the data placed in the test part.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` of Python lists.
    """
    print('Shuffling sentences')
    # A single permutation is applied to both collections so pairs stay aligned.
    order = np.random.permutation(len(sentences_original))
    shuffled_sentences = [sentences_original[k] for k in order]
    shuffled_labels = [labels_original[k] for k in order]

    train_fraction = 1. - (percentage_test / 100.)
    cut_index = int(len(sentences_original) * train_fraction)

    x_train = shuffled_sentences[:cut_index]
    x_test = shuffled_sentences[cut_index:]
    y_train = shuffled_labels[:cut_index]
    y_test = shuffled_labels[cut_index:]

    print("Training set = %d\nTest set = %d" % (len(x_train), len(y_test)))
    return x_train, y_train, x_test, y_test

In [4]:
def generator(sentence_list, next_word_list, batch_size):
    """Yield an endless stream of ``(x, y)`` batches for training.

    ``x`` is an int32 array of shape ``(batch_size, MAX_SEQUENCE_LENGTH)`` holding
    the vocabulary index of every word in each sentence; ``y`` is an int32 vector
    holding the vocabulary index of the word that follows each sentence.
    Samples are taken in order and wrap around once the list is exhausted.
    Word lookups use the module-level ``word_indices`` mapping.
    """
    cursor = 0
    while True:
        x = np.zeros((batch_size, MAX_SEQUENCE_LENGTH), dtype=np.int32)
        y = np.zeros((batch_size), dtype=np.int32)
        for row in range(batch_size):
            # Modulo makes the sample position wrap around the data set.
            pos = cursor % len(sentence_list)
            for col, token in enumerate(sentence_list[pos]):
                x[row, col] = word_indices[token]
            y[row] = word_indices[next_word_list[pos]]
            cursor += 1
        yield x, y

In [5]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    The probabilities are re-weighted as ``softmax(log(preds) / temperature)``
    and one index is drawn from the resulting distribution.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        temperature: values below 1 sharpen the distribution, values above 1
            flatten it.

    Returns:
        The sampled index (as returned by ``np.argmax``).

    Raises:
        ValueError: if ``preds`` contains no positive, finite probability.
    """
    preds = np.asarray(preds).astype('float64')
    # A zero probability legitimately maps to -inf here (it ends up with weight
    # 0 below), so silence the divide-by-zero RuntimeWarning np.log would emit.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    top = np.max(logits)
    if not np.isfinite(top):
        raise ValueError('preds must contain at least one positive, finite probability')
    # Subtracting the maximum keeps the largest term at exp(0) == 1, so a small
    # temperature cannot underflow every term to 0 and produce 0/0 == NaN.
    exp_preds = np.exp(logits - top)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [6]:
def on_epoch_end(epoch, logs):
    """Keras epoch-end callback: print text generated by the current model.

    A seed window is picked at random from the training and test sentences and,
    for each diversity (sampling temperature) value, 25 further words are
    generated by repeatedly predicting the next word and sliding the window.

    Args:
        epoch: epoch index passed by Keras; only used in the printed header.
        logs: metrics dict passed by Keras; unused.

    Reads the module-level ``sentences``, ``sentences_test``, ``word_indices``,
    ``indices_word`` and ``model``.
    """
    print('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence from train + test. Index into the two lists
    # directly instead of building `sentences + sentences_test`, which would
    # copy every element of both lists on each epoch.
    n_train = len(sentences)
    seed_index = np.random.randint(n_train + len(sentences_test))
    if seed_index < n_train:
        seed = sentences[seed_index]
    else:
        seed = sentences_test[seed_index - n_train]

    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        # Work on a copy so every diversity value starts from the same seed.
        sentence = list(seed)
        print('----- Diversity:' + str(diversity) + '\n')
        print('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
        sys.stdout.write(' '.join(sentence))

        for i in range(25):
            # Encode the current window as a (1, MAX_SEQUENCE_LENGTH) row of word indices.
            x_pred = np.zeros((1, MAX_SEQUENCE_LENGTH))
            for t, word in enumerate(sentence):
                x_pred[0, t] = word_indices[word]

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]

            # Slide the window: drop the oldest word, append the generated one.
            sentence = sentence[1:]
            sentence.append(next_word)

            sys.stdout.write(" "+next_word)
            sys.stdout.flush()
        print()

In [7]:
# Load the dataset from CSV; later cells read its "Script" column.
text_path = "script.csv"
data = pd.read_csv(text_path)

In [8]:
# Keep only scripts shorter than 30000 characters. `.str.len()` yields NaN for a
# missing script (where `map(len)` would raise TypeError), and NaN < 30000 is
# False, so rows without a script are dropped as well.
data = data[data['Script'].str.len() < 30000]

In [9]:
# Remove duplicate scripts while keeping first-appearance order. A plain `set`
# iterates strings in an order that can differ between kernel runs (string hash
# randomization), which would make the joined corpus non-reproducible.
scripts_raw = data["Script"].drop_duplicates().tolist()

# The DataFrame is not needed any more; free its memory.
del data

In [10]:
# Concatenate all scripts into one string, separated by single spaces.
joint_script = " ".join(scripts_raw)

In [11]:
# Lower-case the corpus and split it on single spaces only, so a token may still
# contain other whitespace such as an embedded newline.
raw_tokens = joint_script.lower().split(' ')

del joint_script

# Keep tokens that have non-whitespace content, plus tokens that are exactly a
# lone newline; every other whitespace-only or empty token is discarded.
scripts_in_words = [tok for tok in raw_tokens if tok.strip() or tok == '\n']

del raw_tokens

In [12]:
# Count how many times each token occurs in the corpus.
word_freq = {}
for word in scripts_in_words:
    if word in word_freq:
        word_freq[word] += 1
    else:
        word_freq[word] = 1

# Vocabulary: the distinct tokens (exactly the keys of word_freq) in sorted order.
words = sorted(word_freq)
print('total words:', len(words))
# Two-way lookup between a token and its position in the sorted vocabulary.
word_indices = {token: idx for idx, token in enumerate(words)}
indices_word = {idx: token for idx, token in enumerate(words)}

total words: 229481


In [13]:
# Rebuild the frequency dict with its entries ordered by ascending count.
word_freq = dict(sorted(word_freq.items(), key=lambda pair: pair[1]))

In [14]:
# Slide a MAX_SEQUENCE_LENGTH-word window over the corpus: each window is one
# input sample and the word immediately after it is the prediction target.
step = 1
window_starts = range(0, len(scripts_in_words) - MAX_SEQUENCE_LENGTH, step)
sentences = [scripts_in_words[start: start + MAX_SEQUENCE_LENGTH] for start in window_starts]
next_words = [scripts_in_words[start + MAX_SEQUENCE_LENGTH] for start in window_starts]
print('total sequences: ', len(sentences))

total sequences:  7044544


In [15]:
# Shuffle and hold out 20% of the windows as a test set.
# NOTE: this rebinds `sentences` / `next_words` to the training portion only, so
# re-running this cell alone splits the already-reduced training set again.
sentences, next_words, sentences_test, next_words_test = shuffle_and_split_training_set(sentences, next_words, percentage_test=20)

Shuffling sentences
Training set = 5635635
Test set = 1408909


In [16]:
# Load the pretrained GloVe vectors into a {word: float32 vector} dict.
glove_path = "glove.6B/glove.6B.300d.txt"
embeddings_index = {}
# Open with an explicit UTF-8 encoding so parsing does not depend on the
# platform's default codec, and use `with` so the file is closed even if a
# line fails to parse.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # Each line holds the word followed by its vector components.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [17]:
# Build the embedding matrix: row i holds the pretrained vector of the word whose
# vocabulary index is i. The rows must follow `word_indices`, because that is the
# mapping `generator` and `on_epoch_end` use to encode the model input;
# enumerating `word_freq` instead would assign vectors in frequency order and
# attach them to the wrong words.
embedding_matrix = np.zeros((len(words), EMBEDDING_DIM))
for word, i in word_indices.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# Frozen embedding layer initialised with the pretrained vectors.
embedding_layer = Embedding(len(words),
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

model = Sequential()

model.add(embedding_layer)

# Two stacked LSTMs; the first returns the full sequence so the second can consume it.
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
# One softmax output per vocabulary word (next-word prediction).
model.add(Dense(len(words), activation='softmax'))

optimizer = RMSprop(learning_rate=0.01)

In [18]:
# Sparse categorical cross-entropy: `generator` yields the targets as integer
# word indices rather than one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)

In [19]:
batch_size = 128
file_path = "models/word-epoch{epoch:03d}-loss{loss:.4f}-val_loss{val_loss:.4f}"
# Make sure the checkpoint directory exists up front; otherwise the first save,
# which only happens after a full epoch of training, may fail on the missing path.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
# Save a checkpoint only when the training loss improves.
checkpoint = ModelCheckpoint(file_path, monitor='loss', save_best_only=True)
# Steps use ceiling division (-(-n // b)) so every sample is visited once per
# epoch without an extra wrapped-around batch when n is a multiple of batch_size.
model.fit_generator(generator(sentences, next_words, batch_size),
                    steps_per_epoch=-(-len(sentences) // batch_size),
                    epochs=30,
                    callbacks=[checkpoint, LambdaCallback(on_epoch_end=on_epoch_end)],
                    validation_data=generator(sentences_test, next_words_test, batch_size),
                    validation_steps=-(-len(sentences_test) // batch_size))

Epoch 1/30

----- Generating text after Epoch: 0

----- Diversity:0.3

----- Generating with seed:
"to incorporate this kind of infrastructure. then the city grows around it. and of course this is just a glimpse of something which could be much better if we just create it, and it changes the way of life.
and the second ingredient, which would solve mobility, that very difficult challenge"

to incorporate this kind of infrastructure. then the city grows around it. and of course this is just a glimpse of something which could be much better if we just create it, and it changes the way of life.
and the second ingredient, which would solve mobility, that very difficult challenge the very time and the first time is the very case and and the the a is the two and the two and the a
----- Diversity:0.4

----- Generating with seed:
"to incorporate this kind of infrastructure. then the city grows around it. and of course this is just a glimpse of something which could be much better if we just cr

  after removing the cwd from sys.path.


 the to and that a the it of the
----- Diversity:0.5

----- Generating with seed:
"and because of other factors. and so someone has to really realize when the crisis is going to happen.
this is the situation in qatar, for those who don't know. we only have two days of water reserve. we import 90 percent of our food, and we only cultivate less than"

and because of other factors. and so someone has to really realize when the crisis is going to happen.
this is the situation in qatar, for those who don't know. we only have two days of water reserve. we import 90 percent of our food, and we only cultivate less than that of the about of a the and of the to for the of and is in a the of of in the of was
----- Diversity:0.6

----- Generating with seed:
"and because of other factors. and so someone has to really realize when the crisis is going to happen.
this is the situation in qatar, for those who don't know. we only have two days of water reserve. we import 90 percent of our food, and we o

<keras.callbacks.callbacks.History at 0x7f1d102d0d10>