In [1]:
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

In [2]:
# Corpus to train on. The commented-out line is the alternative that fetches the
# Shakespeare corpus with tf.keras.utils.get_file instead of using the local file.
# path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
path_to_file = 'dictionary.txt'

In [3]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8 (py2 compat).
# The `with` block closes the file handle as soon as the read has finished.
with open(path_to_file , 'rb') as corpus_file :
    text = corpus_file.read().decode(encoding = 'utf-8')
len(text)  # length in characters

4478507

In [4]:
# preview the first 500 characters of the corpus
print(text[:500])

A-  prefix (also an- before a vowel sound) not, without (amoral). [greek]

Aa  abbr. 1 automobile association. 2 alcoholics anonymous. 3 anti-aircraft.

Aardvark  n. Mammal with a tubular snout and a long tongue, feeding on termites. [afrikaans]

Ab-  prefix off, away, from (abduct). [latin]

Aback  adv.  take aback surprise, disconcert. [old english: related to *a2]

Abacus  n. (pl. -cuses) 1 frame with wires along which beads are slid for calculating. 2 archit. Flat slab on top of a capital. 


In [5]:
vocab = sorted(set(text))  # sorted list of the distinct characters found in the corpus

# show each character next to its position in `vocab`
list(enumerate(vocab))

[(0, '\n'),
 (1, ' '),
 (2, '!'),
 (3, '"'),
 (4, '$'),
 (5, '%'),
 (6, '&'),
 (7, "'"),
 (8, '('),
 (9, ')'),
 (10, '*'),
 (11, '+'),
 (12, ','),
 (13, '-'),
 (14, '.'),
 (15, '/'),
 (16, '0'),
 (17, '1'),
 (18, '2'),
 (19, '3'),
 (20, '4'),
 (21, '5'),
 (22, '6'),
 (23, '7'),
 (24, '8'),
 (25, '9'),
 (26, ':'),
 (27, ';'),
 (28, '='),
 (29, '?'),
 (30, 'A'),
 (31, 'B'),
 (32, 'C'),
 (33, 'D'),
 (34, 'E'),
 (35, 'F'),
 (36, 'G'),
 (37, 'H'),
 (38, 'I'),
 (39, 'J'),
 (40, 'K'),
 (41, 'L'),
 (42, 'M'),
 (43, 'N'),
 (44, 'O'),
 (45, 'P'),
 (46, 'Q'),
 (47, 'R'),
 (48, 'S'),
 (49, 'T'),
 (50, 'U'),
 (51, 'V'),
 (52, 'W'),
 (53, 'X'),
 (54, 'Y'),
 (55, 'Z'),
 (56, '['),
 (57, ']'),
 (58, '^'),
 (59, '`'),
 (60, 'a'),
 (61, 'b'),
 (62, 'c'),
 (63, 'd'),
 (64, 'e'),
 (65, 'f'),
 (66, 'g'),
 (67, 'h'),
 (68, 'i'),
 (69, 'j'),
 (70, 'k'),
 (71, 'l'),
 (72, 'm'),
 (73, 'n'),
 (74, 'o'),
 (75, 'p'),
 (76, 'q'),
 (77, 'r'),
 (78, 's'),
 (79, 't'),
 (80, 'u'),
 (81, 'v'),
 (82, 'w'),
 (83, 'x'),
 

In [6]:
# Lookup tables between characters and integer ids (id = position in the sorted vocabulary)
vocab = sorted(set(text))
char_to_index = dict(zip(vocab, range(len(vocab))))  # character -> id
index_to_char = np.array(vocab)                      # id -> character

def text_to_int(text) :
    """Encode a string as a NumPy array holding one vocabulary id per character.

    A character that is not a key of char_to_index raises KeyError.
    """
    return np.array([char_to_index[ch] for ch in text])

# the whole corpus as integer ids
int_text = text_to_int(text)

In [7]:
# compare the first 20 characters with their integer encoding
print(text[:20])
print(int_text[:20])

A-  prefix (also an-
[30 13  1  1 75 77 64 65 68 83  1  8 60 71 78 74  1 60 73 13]


In [8]:
def int_to_text(ints) :
    """Decode vocabulary ids back into a string.

    ints : either an object exposing ``.numpy()`` (it is converted first), or
           something that can index the NumPy array ``index_to_char`` directly.
    Returns the characters for those ids joined into one string.
    """
    try :
        ints = ints.numpy()
    except AttributeError :
        # no .numpy() method: use the value as-is. Only this case is tolerated;
        # any other failure (or a KeyboardInterrupt) is allowed to propagate.
        pass
    return ''.join(index_to_char[ints])

print(int_to_text(int_text[:20]))

A-  prefix (also an-


In [9]:
SEQ_LENGTH = 100  # training example sequence length
# number of chunks of SEQ_LENGTH + 1 characters that fit in the corpus
examples_per_epoch = len(text) // (SEQ_LENGTH + 1) 

# dataset built from the encoded corpus, sliced along its first axis (one id per element)
char_dataset = tf.data.Dataset.from_tensor_slices(int_text) 

In [10]:
# group the id stream into chunks of SEQ_LENGTH + 1; drop_remainder = True
# discards a final chunk that would be shorter than that
sequences = char_dataset.batch(SEQ_LENGTH + 1, drop_remainder = True)

In [11]:
def split_input_target(chunk) :
    """Split one chunk into an (input, target) pair shifted by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, e.g. 'hello' -> ('hell', 'ello').
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)  # every chunk becomes an (input, target) pair

In [12]:
# show the first (input, target) pair decoded back into text
for x, y in dataset.take(1) :
    print('<Start>', '<Input>', int_to_text(x), '<Output>', int_to_text(y), sep = '\n')

<Start>
<Input>
A-  prefix (also an- before a vowel sound) not, without (amoral). [greek]

Aa  abbr. 1 automobile as
<Output>
-  prefix (also an- before a vowel sound) not, without (amoral). [greek]

Aa  abbr. 1 automobile ass


In [13]:
BATCH_SIZE = 64
VOCAB_SIZE = len(vocab)  # number of distinct characters in the corpus
EMBEDDING_DIM = 256
RNN_UNITS = 1024
# size of the shuffle buffer: only this many elements are held and shuffled at a
# time, so a possibly enormous set of sequences never has to be shuffled as a whole
BUFFER_SIZE = 10000

# shuffle the (input, target) pairs, then batch them; an incomplete final batch is dropped
data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder = True)

In [14]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size) :
    """Assemble the character model: Embedding -> stateful LSTM -> Dense.

    vocab_size    : number of distinct ids; sizes the embedding table and the output layer
    embedding_dim : length of each embedding vector
    rnn_units     : number of units in the LSTM layer
    batch_size    : fixed batch dimension of the input shape [batch_size, None]
    Returns the uncompiled tf.keras.Sequential model.
    """
    embedding_layer = tf.keras.layers.Embedding(vocab_size,
                                                embedding_dim,
                                                batch_input_shape = [batch_size, None])
    # return_sequences = True: one output per input position, not just the last
    recurrent_layer = tf.keras.layers.LSTM(rnn_units,
                                           return_sequences = True,
                                           stateful = True,
                                           recurrent_initializer = 'glorot_uniform')
    # no activation: one raw score per vocabulary entry
    output_layer = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding_layer, recurrent_layer, output_layer])

In [15]:
# model used for training, built with a fixed batch dimension of BATCH_SIZE
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)

In [16]:
# print the layers with their output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           34304     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 134)           137350    
Total params: 5,418,630
Trainable params: 5,418,630
Non-trainable params: 0
_________________________________________________________________


In [17]:
# sanity check before training: run the model on one batch and look at the output shape
for input_example_batch, target_example_batch in data.take(1) :
    example_batch_predictions = model(input_example_batch) # ask model for prediction on first batch
    print(example_batch_predictions.shape, '(batch_size, sequence_length, vocab_size)')

(64, 100, 134) (batch_size, sequence_length, vocab_size)


In [18]:
print(len(example_batch_predictions))  
# prediction is BATCH_SIZE sequences of SEQ_LENGTH positions, each holding
# VOCAB_SIZE raw scores (logits, not probabilities: the Dense layer has no activation)
print(example_batch_predictions[:3])

64
tf.Tensor(
[[[-4.7523309e-03 -2.6344325e-04 -2.3887385e-04 ...  4.9010678e-03
   -3.4105496e-03  3.4125438e-03]
  [-4.3406356e-03  3.5034172e-04 -1.1831223e-03 ...  8.9563290e-03
    7.2912691e-04  2.2232374e-03]
  [-2.4334961e-03  9.5190684e-04 -7.0881522e-03 ...  9.5284078e-03
   -6.0752919e-04  3.6943443e-03]
  ...
  [ 2.3143031e-03  5.8845007e-03 -8.2875378e-03 ...  4.7474117e-03
    4.0495303e-03 -1.0548484e-03]
  [-2.7498598e-03  3.6736934e-03 -6.2989444e-03 ...  8.0564441e-03
    5.3704524e-04  3.2046498e-03]
  [-2.6359521e-03  2.7193914e-03 -5.6223380e-03 ...  1.0985430e-02
    4.4924514e-03  2.6371062e-03]]

 [[-2.5919313e-03  3.8523525e-03  2.6125318e-04 ... -2.3118665e-03
    6.1819406e-04 -3.9751166e-03]
  [-1.9999431e-03  4.8030019e-03 -6.1301654e-03 ...  1.5240270e-03
   -7.0604717e-04 -4.7459180e-04]
  [ 4.0071756e-03  3.1684779e-03 -3.4246102e-03 ...  2.3305619e-03
    1.9699833e-03  4.1584144e-03]
  ...
  [-7.6922299e-03  3.1561996e-03 -1.0147181e-03 ...  1.2161552e

In [19]:
# predictions for the first sequence of the batch: one row of scores per position
pred = example_batch_predictions[0]
print(len(pred))
print(pred)

100
tf.Tensor(
[[-0.00475233 -0.00026344 -0.00023887 ...  0.00490107 -0.00341055
   0.00341254]
 [-0.00434064  0.00035034 -0.00118312 ...  0.00895633  0.00072913
   0.00222324]
 [-0.0024335   0.00095191 -0.00708815 ...  0.00952841 -0.00060753
   0.00369434]
 ...
 [ 0.0023143   0.0058845  -0.00828754 ...  0.00474741  0.00404953
  -0.00105485]
 [-0.00274986  0.00367369 -0.00629894 ...  0.00805644  0.00053705
   0.00320465]
 [-0.00263595  0.00271939 -0.00562234 ...  0.01098543  0.00449245
   0.00263711]], shape=(100, 134), dtype=float32)


In [20]:
# scores for the first position of that sequence
time_pred = pred[0]
# NOTE(review): this prints len(pred) (the number of positions) again;
# len(time_pred) may have been intended here — confirm
print(len(pred))
print(time_pred)  # one score per vocabulary entry (VOCAB_SIZE values)

100
tf.Tensor(
[-4.7523309e-03 -2.6344325e-04 -2.3887385e-04  8.0332253e-03
 -4.8806812e-03 -7.2714505e-03 -1.5418846e-03  1.6831834e-04
  1.1657855e-03  3.6409583e-03  4.5705698e-03  1.8087268e-03
 -2.5830022e-03  2.6997674e-05 -1.1343315e-03 -1.0301720e-03
  6.0899311e-04  5.3959647e-03 -1.5546158e-03  1.0036178e-03
  1.5036254e-03 -1.1157946e-03  5.0069438e-03 -2.3691957e-03
 -5.4780608e-03  3.1098283e-03 -7.5391395e-04 -1.9625307e-03
  6.0769161e-03  2.6672373e-03 -2.3814826e-04  2.5046158e-03
  5.5898293e-03 -1.4186408e-03  4.2247954e-03 -3.1643626e-04
 -1.1529793e-03  4.5659118e-03  5.9198178e-03 -1.3465105e-03
 -5.8210311e-03 -1.6400469e-03 -3.4029961e-03  1.9173344e-03
  2.1601305e-04  9.6979673e-04 -2.5303019e-03  4.5640289e-04
 -2.2754961e-04  2.9184441e-03  5.6587736e-04 -9.3462918e-04
  2.6601120e-03  4.8241829e-03  8.0416712e-04  2.4302937e-03
  7.6320139e-04 -4.4557150e-03  6.4576562e-03 -1.7228216e-03
 -2.0587606e-05  6.8564457e-03 -3.8646389e-04  2.4021361e-03
 -1.76848

In [21]:
# sample output distribution, turn array of character probabilities into single character prediction
sampled_integers = tf.random.categorical(pred, num_samples = 1)
print(sampled_integers[:10])
# reshape array, convert arrays of integers to integer array for character processing
sampled_integers = np.reshape(sampled_integers, (1, -1))[0]
print(sampled_integers)
predicted_chars = int_to_text(sampled_integers)

predicted_chars

tf.Tensor(
[[ 34]
 [ 93]
 [129]
 [109]
 [ 42]
 [  0]
 [ 88]
 [ 78]
 [ 86]
 [ 22]], shape=(10, 1), dtype=int64)
[ 34  93 129 109  42   0  88  78  86  22   1   7  34  13 125   5  92  33
  57   1  84  29   1  94  91  73  83  26  30  67  76  54  86  17 116   6
  24   5  67  17  17  49  13 122  79 102   5  41 126  41 115 111  70 130
 116 107  68  86 105  14 107  82  35  14  38  51  91  27  53 128 131   1
  51  96  96  69  66  95   8 127  34 112 120  33  96 107  48  62  17  55
  56  42  11  37  68  57  11  11  26  80]


"E®’êM\n}s{6 'E-˜%©D] y? °¨nx:AhqY{1ö&8%h11T-Œtâ%L–Lôîk“öèi{æ.èwF.IV¨;X‘” VÀÀjg´(—EïûDÀèSc1Z[M+Hi]++:u"

In [22]:
def loss(labels, logits) :
    """Sparse categorical cross-entropy between integer labels and raw model scores.

    from_logits = True because the model's final Dense layer applies no softmax.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy(labels,
                                                                   logits,
                                                                   from_logits = True)
    return crossentropy

In [23]:
# train with the Adam optimizer and the logits-based loss defined above
model.compile(optimizer = 'adam', loss = loss)

In [26]:
# directory the training checkpoints are written to
checkpoint_dir = './training_checkpoints'
# checkpoint file name pattern; '{epoch}' is a placeholder in the path given to the callback
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')

# save_weights_only = True: store only the weights, not the full model
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath = checkpoint_prefix,
                                                         save_weights_only = True)

In [27]:
# train and write checkpoints through checkpoint_callback
# NOTE(review): the recorded output of this cell shows 3 epochs, so the epoch
# count was changed after the last run — re-run to refresh the output
history = model.fit(data, epochs = 10, callbacks = [checkpoint_callback])

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [28]:
# unused alternative: save / reload the whole model instead of rebuilding it
# NOTE(review): the two file names below do not match each other — confirm or remove
# model.save('shakespeare.h5')
# loaded_model = tf.keras.models.load_model('cats_dogs.h5')
# rebuild the same architecture with a batch dimension of 1 for text generation
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size = 1)

In [29]:
# restore the weights from the most recent checkpoint in checkpoint_dir
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None :
    # latest_checkpoint returns None when no checkpoint is found; fail here with
    # a clear message instead of an obscure error from load_weights(None)
    raise FileNotFoundError('no checkpoint found in ' + checkpoint_dir)
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

In [30]:
# model.save('dictionary.m5')
# model = tf.keras.models.load_model('dictionary.m5')

In [31]:
def generate_text(model, start_string, num_generate = 1000, temperature = 1.0) :
    """Generate text with the learned model, one character at a time.

    model        : stateful model that accepts a batch of size 1
    start_string : non-empty seed text; it is encoded with text_to_int, so a
                   character missing from the vocabulary raises KeyError there
    num_generate : number of characters to generate after the seed
    temperature  : positive divisor applied to the scores before sampling;
                   low temps = more predictable text, high temps = more surprising text
    Returns start_string followed by the generated characters.
    Raises ValueError for an empty start_string or a non-positive temperature.
    """
    if len(start_string) == 0 :
        raise ValueError('start_string must contain at least one character')
    if temperature <= 0 :
        raise ValueError('temperature must be positive')

    # convert start string to numbers (vectorizing)
    input_eval = text_to_int(start_string)
    input_eval = tf.expand_dims(input_eval, 0)  # add a leading batch dimension of 1

    text_generated = []  # generated characters, in order

    model.reset_states()  # get rid of LSTM memory

    for _ in range(num_generate) :
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        # sample an id for every position, keep only the one for the last position
        predicted_id = tf.random.categorical(predictions, num_samples = 1)[-1, 0].numpy()
        # the sampled character is the next input; the stateful LSTM keeps its hidden state
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(index_to_char[predicted_id])

    return (start_string + ''.join(text_generated))

In [32]:
# ask for the seed text interactively, then print the seed followed by the generated text
inp = input('Starting string: ')
print(generate_text(model, inp))  # enter seed text for text generation :)

Starting string: Beachelous -v
Beachelous -versted) 1 (as roman idly, metallic, or recroim, madeculary. 2 (of a greature's sheat) a person's rashed in a wall. 2 constable round (banking many; treatury etc.).  vehicle adj.

Unimp  v. 1 (also abble) coace from which the starbons, rain. 2 (often foll. By to + infin.) Be one's kind that price. 3 asstonic. (withdown).
WTy ears victure than below anmost sore for fueur englishmptur. [anglo-fresh too eggn]

Terminism  n. (pl. Symbodium or most) 1 treatment. [old english]

Whee-appear  after sleen who or is woman children lay  n. Bloof with a producing animal. 2 universibly shoet.  desiringly adv. [perhatic pictivise]

Events  adj. (fingr. Take happeniated) occernment to a yeud.

Wetter2  n. (pl. -ies) 1 tattland sion. [old english]

Speed  —n. 1 tim of penesual jack, ess or offence in a branch, news and indian.

Unread  v. (-ging) 1 person who years (safe, or family or persimal orge with a present knowledge.

Undour-hode  n. Colloq. Count of