# quatrains.py (forked from srowen/quatrains-rnn)
# See also Chapters 6, 8 of "Deep Learning with Python" by Francois Chollet (Manning)
# https://www.manning.com/books/deep-learning-with-python
import math
import numpy as np
import random
import tensorflow as tf
from keras import backend as K
from keras.layers import Dense, Embedding, GRU
from keras.models import Sequential
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.regularizers import l2
# Configuration parameters.
# Whether or not to configure for a GPU
use_gpu = False
# Dimension of existing GloVe embedding to use; must be 50, 100, 200, or 300
embedding_dim = 300
# Dimension of recurrent GRU below
gru_dim = 128
# Dropout applied to GRU
dropout = 0.5
# Regularization factor
regularization = 0.02
# Training batch size
batch_size = 512
# RMSProp learning rate
learning_rate = 0.003
# Training will take sequences of this many words and learn to predict the following word
phrase_len = 4
# Load the text of Nostradamus's 942 quatrains, 1 per line
quatrains = []
quatrains_file = open('quatrains.txt')
for line in quatrains_file:
    quatrains.append(line)
quatrains_file.close()
# Shuffle them for good measure
random.shuffle(quatrains)
# Use a Keras Tokenizer to turn text into tokens, and assign tokens to unique indices
tokenizer = Tokenizer(filters='"#$%&()*+/:<=>@[\\]^_`{|}~\t\r\n', lower=True)
tokenizer.fit_on_texts(quatrains)
# Maps words to index, as you'd expect
word_index = tokenizer.word_index
# Inverse mapping:
index_word = {index: word for word, index in word_index.items()}
num_distinct_words = len(word_index)
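# For illustration (indices are hypothetical): a frequent word like 'de' might map to index 1,
# so word_index['de'] == 1 and index_word[1] == 'de'. Tokenizer reserves index 0, so real word
# indices run from 1 to num_distinct_words.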
# Weight words in the loss function such that more common words are given more importance.
# The weight here is, somewhat arbitrarily, the square root of the count.
# This _tends_ to make the output a little more normal-sounding.
word_index_weights = {word_index[word]: math.sqrt(count) for word, count in tokenizer.word_counts.items()}
word_index_weights[0] = 1.0
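# For example (hypothetical counts): a word seen 100 times gets weight sqrt(100) = 10.0 and a word
# seen 4 times gets weight sqrt(4) = 2.0, so common words count for more in the loss, but only by
# the square root of their frequency ratio.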
# + 1 embeddings because index 0 is unused by Tokenizer, and we use it as an extra embedding that
# represents the end of a quatrain. The matrix starts as random values; most rows are overwritten
# below with GloVe vectors, and words not found in GloVe keep their small random initial embeddings.
embeddings = np.random.rand(num_distinct_words + 1, embedding_dim)
# Read through embedding file and copy in any embedding for words that appear in the corpus
embeddings_file = open('glove.6B.{}d.txt'.format(embedding_dim))
for line in embeddings_file:
    tokens = line.split()
    word = tokens[0]
    if word in word_index:
        embeddings[word_index[word]] = np.asarray(tokens[1:], dtype='float32')
embeddings_file.close()
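# Each line of the GloVe file is a word followed by embedding_dim floats, e.g. (values illustrative):
#   the 0.04656 0.21318 -0.00744 ...
# so tokens[1:] parsed above are that word's embedding vector.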
phrases = []
next_word_encodings = []
for sequence in tokenizer.texts_to_sequences(quatrains):
    # Add an 'end' marker (index 0) at the start and end of every sequence
    sequence = [0] + sequence + [0]
    # Pick any location that can start a phrase of phrase_len words and still have an element after
    for start_index in range(0, len(sequence) - phrase_len):
        next_index = start_index + phrase_len
        # Append the subsequence as inputs
        phrases.append(sequence[start_index : next_index])
        # Append the next word, one-hot-encoded. Only one of these zeroes will be set to 1
        next_word_encoding = np.zeros(num_distinct_words + 1, dtype=bool)
        next_word_encoding[sequence[next_index]] = 1
        next_word_encodings.append(next_word_encoding)
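# Illustration with made-up indices and phrase_len = 4: a padded sequence [0, 12, 7, 3, 9, 0]
# yields two training pairs:
#   phrase [0, 12, 7, 3]  -> next word 9
#   phrase [12, 7, 3, 9]  -> next word 0 (the end-of-quatrain marker)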
if use_gpu:
    # Configure TensorFlow to not grab all available GPU memory
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    session = tf.Session(config=config)
    K.set_session(session)
    device = '/gpu:0'
else:
    device = '/cpu:0'
# Configure a TensorFlow model via Keras
model = Sequential()
# Use potentially different devices to compute one part of the model vs another
with tf.device(device):
    # Embedding layer converts words (indices) into embeddings
    # The actual embedding matrix is set below
    model.add(Embedding(num_distinct_words + 1, embedding_dim, input_length=phrase_len))
    # Learn a recurrent model of sequences of words with a Gated Recurrent Unit (GRU)
    model.add(GRU(gru_dim,
                  dropout=dropout,
                  recurrent_dropout=dropout,
                  kernel_regularizer=l2(regularization),
                  bias_regularizer=l2(regularization),
                  recurrent_regularizer=l2(regularization)))
    # Predict one of the possible words (or end) with standard dense layer plus softmax activation
    model.add(Dense(num_distinct_words + 1, activation='softmax'))
# Actually set the weights for the embedding
# Note that this is not 'frozen' and is left trainable
model.layers[0].set_weights([embeddings])
# Compile the model with appropriate loss for multiclass classification
model.compile(optimizer=RMSprop(lr=learning_rate), loss='categorical_crossentropy', metrics=['acc'])
# Print a summary
model.summary()
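# The compiled model maps integer phrases of shape (batch, phrase_len) to a softmax distribution of
# shape (batch, num_distinct_words + 1) over the next word, with index 0 standing for end-of-quatrain.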
# Need to work with these values as Numpy arrays
phrases = np.array(phrases)
# Shuffle the data, by computing a permutation of its indices
shuffled_indices = np.random.permutation(len(phrases))
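# e.g. np.random.permutation(5) might return array([3, 0, 4, 1, 2]); indexing inputs and outputs
# with the same permutation keeps each phrase paired with its next word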
# Most of the data will be used for training; take most of the shuffled indices as a random training set,
# and the remaining indices as the validation set
training_size = int(0.95 * len(phrases))
train_indices = shuffled_indices[:training_size]
val_indices = shuffled_indices[training_size:]
# Split input into train/validation
phrases_train = phrases[train_indices]
phrases_val = phrases[val_indices]
# Split outputs (one-hot next-word encodings) in exactly the same way
next_word_encodings = np.array(next_word_encodings)
next_word_encodings_train = next_word_encodings[train_indices]
next_word_encodings_val = next_word_encodings[val_indices]
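# At this point phrases_train has shape (training_size, phrase_len) and the matching one-hot next-word
# array has shape (training_size, num_distinct_words + 1); the validation arrays hold the remaining ~5% of rows.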
for run in range(0, 1000):
    print('Run {}'.format(run))
    # Train for just a few epochs
    model.fit(phrases_train,
              next_word_encodings_train,
              class_weight=word_index_weights,
              epochs=5,
              batch_size=batch_size,
              shuffle=True,
              validation_data=(phrases_val, next_word_encodings_val),
              verbose=2)
    # Generate a few random outputs from the model so far
    print()
    for i in range(0, 5):
        # Begin with a dummy 'phrase' of preceding text (all 0 / end markers)
        random_phrase = np.array([0] * phrase_len)
        # Build up a quatrain word by word by predicting the next word
        emitted_quatrain = []
        # Cap the size of the emitted quatrain in case it runs on too long
        while len(emitted_quatrain) < 32:
            # Need to reshape the input to use with model.predict
            random_phrase_t = np.copy(random_phrase)
            random_phrase_t.shape = (1, phrase_len)
            # Predict the next word, to get a distribution over all words
            pred_next_word = model.predict(random_phrase_t)[0]
            # Choose one word from this distribution; not always the most likely word!
            draw = np.random.uniform()
            for pred_next_word_index in range(0, len(pred_next_word)):
                draw -= pred_next_word[pred_next_word_index]
                if draw < 0.0:
                    break
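            # (This draw is roughly equivalent to np.random.choice(len(pred_next_word), p=pred_next_word),
            # assuming the softmax outputs sum to 1; sampling instead of taking the argmax keeps the
            # generated quatrains varied.)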
            if pred_next_word_index == 0:
                # End of quatrain
                break
            else:
                # Append the predicted next word
                emitted_quatrain.append(index_word[pred_next_word_index])
            # Update the phrase to drop first word, add next new word at the end
            random_phrase = np.append(random_phrase[1:], pred_next_word_index)
        # Print a generated snippet for this run
        print(" ".join(emitted_quatrain))
    print()