In [1]:
import utils

import numpy as np
import tensorflow as tf

In [2]:
# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

Default GPU Device: /device:GPU:0


In [3]:
data_dir = 'moes_tavern_lines.txt'
text = utils.load_data(data_dir)

# Ignore notice, since we don't use it for analysing the data
text = text[81:]

In [4]:
view_sentence_range = (0, 10)

print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(len({word: None for word in text.split()})))

Dataset Stats
Roughly the number of unique words: 11492


In [5]:
scenes = text.split('\n\n')
print('Number of scenes: {}'.format(len(scenes)))

Number of scenes: 262


In [6]:
sentence_count_scene = [scene.count('\n') for scene in scenes]
print('Average number of sentences in each scene: {}'.format(np.average(sentence_count_scene)))

Average number of sentences in each scene: 15.248091603053435


In [7]:
sentences = [sentence for scene in scenes for sentence in scene.split('\n')]
print('Number of lines: {}'.format(len(sentences)))

Number of lines: 4257


In [8]:
word_count_sentence = [len(sentence.split()) for sentence in sentences]
print('Average number of words in each line: {}'.format(np.average(word_count_sentence)))

Average number of words in each line: 11.50434578341555


In [9]:
print('The sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))

The sentences 0 to 10:
Moe_Szyslak: (INTO PHONE) Moe's Tavern. Where the elite meet to drink.
Bart_Simpson: Eh, yeah, hello, is Mike there? Last name, Rotch.
Moe_Szyslak: (INTO PHONE) Hold on, I'll check. (TO BARFLIES) Mike Rotch. Mike Rotch. Hey, has anybody seen Mike Rotch, lately?
Moe_Szyslak: (INTO PHONE) Listen you little puke. One of these days I'm gonna catch you, and I'm gonna carve my name on your back with an ice pick.
Moe_Szyslak: What's the matter Homer? You're not your normal effervescent self.
Homer_Simpson: I got my problems, Moe. Give me another one.
Moe_Szyslak: Homer, hey, you should not drink to forget your problems.
Barney_Gumble: Yeah, you should only drink to enhance your social skills.




In [10]:
def create_lookup_tables(text):
    """
    Create lookup tables for vocabulary
    :param text: The text of tv scripts split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    vocab_to_int = {w:i for i, w in enumerate(set(text))}
    int_to_vocab = {i:w for i, w in enumerate(set(text))}
    return vocab_to_int, int_to_vocab

In [11]:
def token_lookup():
    """
    Generate a dict to turn punctuation into a token.
    :return: Tokenize dictionary where the key is the punctuation and the value is the token
    """
    return {
        '.': '||Period||',
        ',': '||Comma||',
        '"': '||Quotation_Mark||',
        ';': '||Semicolon||',
        '!': '||Exclamation_mark||',
        '?': '||Question_mark||',
        '(': '||Left_Parentheses||',
        ')': '||Right_Parentheses||',
        '--': '||Dash||',
        "\n": '||Return||'
    }

In [12]:
# Preprocess Training, Validation, and Testing Data
utils.preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)

In [13]:
# Check lookup tables
int_text, vocab_to_int, int_to_vocab, token_dict = utils.load_preprocess()

In [14]:
# Build Training Data

In [15]:
def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target
    :param int_text: Text with the words replaced by their ids
    :param batch_size: The size of batch
    :param seq_length: The length of sequence
    :return: Batches as a Numpy array
    """
    n_batches = len(int_text) // (batch_size * seq_length)
    result = []
    for i in range(n_batches):
        inputs = []
        targets = []
        for j in range(batch_size):
            idx = i * seq_length + j * seq_length
            inputs.append(int_text[idx:idx + seq_length])
            targets.append(int_text[idx + 1:idx + seq_length + 1])
        result.append([inputs, targets])
    return np.array(result)

In [16]:
# Params

# Number of Epochs
num_epochs = 100
# Batch Size
batch_size = 128
# Sequence Length
seq_length = 25
# Learning Rate
learning_rate = 0.01
# RNN Size
rnn_units = 256
# Embedding dimensions
embedding_dim = 50

In [17]:
batches = get_batches(int_text, batch_size, seq_length)
for inputs, target in batches:
    print(inputs.shape, target.shape)
    break

(128, 25) (128, 25)


In [18]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape = [batch_size, None]),
    tf.keras.layers.GRU(rnn_units,
                        return_sequences = True,
                        stateful = True,
                        recurrent_initializer = 'glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
  ])
  return model

In [20]:
model = build_model(
  vocab_size = len(int_to_vocab),
  embedding_dim = embedding_dim,
  rnn_units = rnn_units,
  batch_size = batch_size)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 50)           338950    
_________________________________________________________________
gru (GRU)                    (128, None, 256)          236544    
_________________________________________________________________
dense (Dense)                (128, None, 6779)         1742203   
Total params: 2,317,697
Trainable params: 2,317,697
Non-trainable params: 0
_________________________________________________________________


In [21]:
tf.keras.utils.plot_model(model, show_shapes = True)

Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.
