In [1]:
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

In [2]:
# Path to the plain-text corpus, relative to the notebook's working directory.
path_to_file = 'dictionary.txt'

In [3]:
# Read the whole corpus as bytes and decode it as UTF-8. The context manager
# closes the file handle even if reading or decoding fails.
with open(path_to_file, 'rb') as corpus_file :
    text = corpus_file.read().decode(encoding = 'utf-8')
len(text)  # length in characters

4478507

In [4]:
# Preview the first 500 characters of the corpus.
print(text[:500])

A-  prefix (also an- before a vowel sound) not, without (amoral). [greek]

Aa  abbr. 1 automobile association. 2 alcoholics anonymous. 3 anti-aircraft.

Aardvark  n. Mammal with a tubular snout and a long tongue, feeding on termites. [afrikaans]

Ab-  prefix off, away, from (abduct). [latin]

Aback  adv.  take aback surprise, disconcert. [old english: related to *a2]

Abacus  n. (pl. -cuses) 1 frame with wires along which beads are slid for calculating. 2 archit. Flat slab on top of a capital. 


In [5]:
# Sorted list of the distinct characters in the corpus; a character's position
# in this list serves as its integer id.
vocab = sorted(set(text))

# Display every (id, character) pair.
list(enumerate(vocab))

[(0, '\n'),
 (1, ' '),
 (2, '!'),
 (3, '"'),
 (4, '$'),
 (5, '%'),
 (6, '&'),
 (7, "'"),
 (8, '('),
 (9, ')'),
 (10, '*'),
 (11, '+'),
 (12, ','),
 (13, '-'),
 (14, '.'),
 (15, '/'),
 (16, '0'),
 (17, '1'),
 (18, '2'),
 (19, '3'),
 (20, '4'),
 (21, '5'),
 (22, '6'),
 (23, '7'),
 (24, '8'),
 (25, '9'),
 (26, ':'),
 (27, ';'),
 (28, '='),
 (29, '?'),
 (30, 'A'),
 (31, 'B'),
 (32, 'C'),
 (33, 'D'),
 (34, 'E'),
 (35, 'F'),
 (36, 'G'),
 (37, 'H'),
 (38, 'I'),
 (39, 'J'),
 (40, 'K'),
 (41, 'L'),
 (42, 'M'),
 (43, 'N'),
 (44, 'O'),
 (45, 'P'),
 (46, 'Q'),
 (47, 'R'),
 (48, 'S'),
 (49, 'T'),
 (50, 'U'),
 (51, 'V'),
 (52, 'W'),
 (53, 'X'),
 (54, 'Y'),
 (55, 'Z'),
 (56, '['),
 (57, ']'),
 (58, '^'),
 (59, '`'),
 (60, 'a'),
 (61, 'b'),
 (62, 'c'),
 (63, 'd'),
 (64, 'e'),
 (65, 'f'),
 (66, 'g'),
 (67, 'h'),
 (68, 'i'),
 (69, 'j'),
 (70, 'k'),
 (71, 'l'),
 (72, 'm'),
 (73, 'n'),
 (74, 'o'),
 (75, 'p'),
 (76, 'q'),
 (77, 'r'),
 (78, 's'),
 (79, 't'),
 (80, 'u'),
 (81, 'v'),
 (82, 'w'),
 (83, 'x'),
 

In [6]:
# Character vocabulary plus the two lookup tables derived from it:
# character -> id (dict) and id -> character (NumPy array, supports array indexing).
vocab = sorted(set(text))
char_to_index = dict(zip(vocab, range(len(vocab))))
index_to_char = np.array(vocab)

def text_to_int(text, mapping = None) :
    """Encode a string as a 1-D integer array of character ids.

    Args:
        text: String to encode.
        mapping: Optional dict from character to integer id. Defaults to the
            notebook-level ``char_to_index`` table.

    Returns:
        ``np.ndarray`` of dtype int64 holding one id per character. An empty
        string yields an empty int64 array rather than NumPy's default
        float64 one, so the result is always usable as indices.

    Raises:
        KeyError: If ``text`` contains a character missing from the mapping.
    """
    if mapping is None :
        mapping = char_to_index
    return np.array([mapping[ch] for ch in text], dtype = np.int64)

# Encode the entire corpus once; later cells slice this array into training sequences.
int_text = text_to_int(text)

In [7]:
# Show the first 20 characters and, on the next line, the ids they were encoded to.
print(text[:20], int_text[:20], sep = '\n')

A-  prefix (also an-
[30 13  1  1 75 77 64 65 68 83  1  8 60 71 78 74  1 60 73 13]


In [8]:
def int_to_text(ints, chars = None) :
    """Decode a sequence of character ids back into a string.

    Args:
        ints: Integer ids as a NumPy array, a list, or any object exposing a
            ``.numpy()`` method (for example an eager TensorFlow tensor).
        chars: Optional NumPy array mapping id -> character. Defaults to the
            notebook-level ``index_to_char`` table.

    Returns:
        The decoded string.
    """
    if chars is None :
        chars = index_to_char
    # Convert tensor-like inputs explicitly instead of wrapping the call in a
    # bare ``except``, which also hid unrelated failures and KeyboardInterrupt.
    if hasattr(ints, 'numpy') :
        ints = ints.numpy()
    return ''.join(chars[ints])

# Round-trip check: decoding the first 20 ids should reproduce the first 20 characters.
print(int_to_text(int_text[:20]))

A-  prefix (also an-


In [9]:
SEQ_LENGTH = 100  # number of input characters per training example
# Number of non-overlapping (SEQ_LENGTH + 1)-character chunks in the corpus.
# NOTE(review): not referenced by any other cell visible in this notebook.
examples_per_epoch = len(text) // (SEQ_LENGTH + 1) 

# Dataset that yields the encoded corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(int_text) 

In [10]:
# Group consecutive ids into chunks of SEQ_LENGTH + 1 (one extra id so each
# chunk can be split into an input and a one-step-shifted target);
# drop_remainder discards a final chunk that would be shorter.
sequences = char_dataset.batch(SEQ_LENGTH + 1, drop_remainder = True)

In [11]:
def split_input_target(chunk) :
    """Split a chunk into an (input, target) pair offset by one position.

    The input is everything except the last element and the target is
    everything except the first, e.g. 'hello' -> ('hell', 'ello').
    """
    return chunk[:-1], chunk[1:]

# Turn every (SEQ_LENGTH + 1)-id chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [12]:
# Decode and print the first (input, target) pair to confirm the one-step shift.
for sample_input, sample_target in dataset.take(1) :
    print('<Start>')
    for tag, ids in (('<Input>', sample_input), ('<Output>', sample_target)) :
        print(tag)
        print(int_to_text(ids))

<Start>
<Input>
A-  prefix (also an- before a vowel sound) not, without (amoral). [greek]

Aa  abbr. 1 automobile as
<Output>
-  prefix (also an- before a vowel sound) not, without (amoral). [greek]

Aa  abbr. 1 automobile ass


In [13]:
BATCH_SIZE = 64  # sequences per training batch
VOCAB_SIZE = len(vocab)  # number of distinct characters in the corpus
EMBEDDING_DIM = 256
RNN_UNITS = 1024
# Size of the shuffle buffer: tf.data shuffles by sampling from a buffer of
# this many elements rather than shuffling the whole dataset in memory.
BUFFER_SIZE = 10000

# Shuffle the (input, target) pairs and group them into batches;
# drop_remainder keeps every batch at exactly BATCH_SIZE sequences.
data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder = True)

In [14]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size) :
    """Assemble the character model: Embedding -> stateful LSTM -> Dense.

    Args:
        vocab_size: Number of distinct character ids; sets both the embedding
            input size and the width of the final Dense layer.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of LSTM units.
        batch_size: Fixed batch size baked into the input shape; the sequence
            length is left unspecified (None).

    Returns:
        An uncompiled ``tf.keras.Sequential`` model whose Dense layer has no
        activation, so it outputs one raw score per vocabulary entry for
        every timestep.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape = [batch_size, None],
    )
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences = True,   # emit an output at every timestep
        stateful = True,           # keep the recurrent state between calls
        recurrent_initializer = 'glorot_uniform',
    )
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [15]:
# Training model, built for batches of exactly BATCH_SIZE sequences.
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)

In [16]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           34304     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 134)           137350    
Total params: 5,418,630
Trainable params: 5,418,630
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Run the still-untrained model on a single batch to check the output shape.
for input_example_batch, target_example_batch in data.take(1) :
    example_batch_predictions = model(input_example_batch)  # forward pass on the first batch
    print(example_batch_predictions.shape, '(batch_size, sequence_length, vocab_size)')

(64, 100, 134) (batch_size, sequence_length, vocab_size)


In [18]:
print(len(example_batch_predictions))  
# One entry per sequence in the batch; each entry holds SEQ_LENGTH timesteps,
# and each timestep holds one raw score (logit) per vocabulary character.
print(example_batch_predictions[:3])

64
tf.Tensor(
[[[-2.71745352e-03 -1.18277955e-03 -6.32086943e-04 ... -1.89054047e-03
    1.61831221e-03 -9.50910558e-04]
  [-6.21750439e-03 -5.92403114e-04 -7.63355708e-03 ... -4.10511019e-03
    2.29516299e-03 -2.09119497e-03]
  [-8.17770418e-03 -5.58275788e-04 -1.22253178e-02 ... -6.73481729e-03
    2.29908852e-03 -2.37179175e-03]
  ...
  [-1.49231181e-02  1.59795978e-03 -3.93226696e-03 ... -3.28733237e-03
    1.13530608e-03  7.39269378e-03]
  [-1.00096790e-02  6.71566417e-03 -1.74557976e-03 ... -6.36397395e-03
    4.32780152e-03  1.13998251e-02]
  [-6.20333105e-03  1.65863510e-03  5.19932387e-03 ... -8.58087372e-03
    7.91321415e-03  3.99046624e-03]]

 [[-4.34833230e-04 -4.25370922e-03  5.03918994e-03 ... -3.03444942e-03
    5.71694039e-03 -5.68087073e-03]
  [-3.55223683e-03 -3.37840407e-03 -2.67904042e-03 ... -5.54943690e-03
    4.45361668e-03 -5.76314190e-03]
  [-5.70843974e-03 -5.01964102e-03  2.16631731e-03 ...  9.28945665e-05
    2.13573733e-03 -6.30031666e-03]
  ...
  [-5.832

In [19]:
# Predictions for the first sequence in the batch: one row per timestep.
pred = example_batch_predictions[0]
print(len(pred))
print(pred)

100
tf.Tensor(
[[-0.00271745 -0.00118278 -0.00063209 ... -0.00189054  0.00161831
  -0.00095091]
 [-0.0062175  -0.0005924  -0.00763356 ... -0.00410511  0.00229516
  -0.00209119]
 [-0.0081777  -0.00055828 -0.01222532 ... -0.00673482  0.00229909
  -0.00237179]
 ...
 [-0.01492312  0.00159796 -0.00393227 ... -0.00328733  0.00113531
   0.00739269]
 [-0.01000968  0.00671566 -0.00174558 ... -0.00636397  0.0043278
   0.01139983]
 [-0.00620333  0.00165864  0.00519932 ... -0.00858087  0.00791321
   0.00399047]], shape=(100, 134), dtype=float32)


In [20]:
# Scores for the first timestep of the first sequence.
time_pred = pred[0]
# NOTE(review): this prints len(pred) (the number of timesteps) a second time;
# len(time_pred) may have been intended here — confirm.
print(len(pred))
print(time_pred)  # one raw score (logit) per vocabulary character

100
tf.Tensor(
[-2.7174535e-03 -1.1827796e-03 -6.3208694e-04  7.5204240e-04
  1.4312178e-03  3.6873259e-03 -1.7914982e-03 -8.1829894e-03
 -4.1323388e-03  4.2588916e-04 -1.6987163e-03  4.9392353e-03
  4.3502864e-03 -2.0654986e-03  1.2768402e-03  2.2119319e-03
 -3.2099709e-03 -2.7290205e-03 -3.3001979e-03  3.6570828e-03
  2.9047283e-03 -4.1215739e-04  2.9001255e-03 -5.4370724e-03
  1.4014631e-03 -9.8939671e-04 -5.5363611e-03 -1.7483592e-03
 -4.0845079e-03  2.3842275e-03  3.1268462e-03 -1.6331309e-03
 -4.2570938e-04  1.9226820e-03  2.1457628e-05 -1.4045604e-03
  3.7953602e-03  2.7737552e-03 -2.4732414e-03  2.9089756e-03
  6.0451124e-03  1.8080318e-05  2.8850909e-03 -2.7214163e-03
 -4.9044588e-03  3.5660891e-03  5.6260214e-03 -8.4308377e-03
 -4.1912752e-04  2.6489072e-04 -1.2152259e-03  2.1309834e-03
  2.8660423e-03 -3.9911104e-04 -3.3040373e-03  1.5886924e-03
  1.9148778e-03 -8.9831930e-03  1.1089224e-03  2.3875632e-03
  3.2780988e-03  4.4541014e-03 -3.3685673e-04 -2.8182822e-03
 -5.46268

In [21]:
# Sample one character id per timestep from the model's output scores.
# tf.random.categorical interprets each row of `pred` as unnormalized
# log-probabilities (logits), so no softmax is applied first.
sampled_integers = tf.random.categorical(pred, num_samples = 1)
print(sampled_integers[:10])
# Flatten the (timesteps, 1) result into a 1-D array of ids for decoding.
sampled_integers = np.reshape(sampled_integers, (1, -1))[0]
print(sampled_integers)
predicted_chars = int_to_text(sampled_integers)

predicted_chars

tf.Tensor(
[[109]
 [ 30]
 [ 79]
 [ 64]
 [ 83]
 [ 28]
 [109]
 [ 15]
 [ 74]
 [ 35]], shape=(10, 1), dtype=int64)
[109  30  79  64  83  28 109  15  74  35  47  51  93  31  39  56  25 110
 128  66  66  31  57 114   8  82   8  12 133  99 125   5  83  92  95  73
  47  67  60  72  36 103  18  77 124 122   0  87  75  22 106  95 127   1
   8  19  69  35  90  34  25 101  35  12 110  57  85 120 108 120   3 106
  69  78  52  78  24 109  22  66  98  10  99   5 123  73 104  34  82  87
  11   1  51  90 102   2  98  15  83   1]


'êAtex=ê/oFRV®BJ[9í‘ggB]ó(w(,…×˜%x©´nRhamGä2rˆŒ\n|p6ç´— (3jF£E9áF,í]zûéû"çjsWs8ê6gÉ*×%œnåEw|+ V£â!É/x '

In [22]:
def loss(labels, logits) :
    """Sparse categorical cross-entropy between integer labels and raw logits.

    ``from_logits = True`` tells Keras that ``logits`` are unnormalized scores
    rather than probabilities.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true = labels,
        y_pred = logits,
        from_logits = True,
    )

In [23]:
# Train with the Adam optimizer and the notebook's logits-based `loss` function.
model.compile(optimizer = 'adam', loss = loss)

In [24]:
# Directory that receives the training checkpoints.
checkpoint_dir = './training_checkpoints'
# Checkpoint filename pattern; the '{epoch}' placeholder is filled in by the callback.
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')

# Callback that saves only the model weights (not the full model) during training.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath = checkpoint_prefix,
                                                         save_weights_only = True)

In [25]:
# Train for 10 epochs, writing weight checkpoints through the callback.
history = model.fit(data, epochs = 10, callbacks = [checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
# Rebuild the same architecture with a batch size of 1 for text generation:
# build_model bakes the batch size into the stateful LSTM's input shape, and
# the training model was built for batches of BATCH_SIZE sequences.
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size = 1)

In [27]:
# Load the weights from the most recent checkpoint in checkpoint_dir into the
# batch-size-1 model, then build it for inputs of shape (1, any sequence length).
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

In [28]:
# model.save('dictionary.m5')
# model = tf.keras.models.load_model('dictionary.m5')

In [29]:
def generate_text(model, start_string, num_generate = 1000, temperature = 1.0) :
    """Generate text by sampling one character at a time from the model.

    Args:
        model: Stateful character model built with a batch size of 1.
        start_string: Non-empty seed text; every character must be present in
            the vocabulary used by ``text_to_int``.
        num_generate: Number of characters to generate after the seed.
        temperature: Positive divisor applied to the logits before sampling.
            Low values give more predictable text, high values more
            surprising text.

    Returns:
        ``start_string`` followed by the ``num_generate`` sampled characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
    """
    if not start_string :
        raise ValueError('start_string must contain at least one character')
    if temperature <= 0 :
        raise ValueError('temperature must be positive')

    # Vectorize the seed and add a leading batch dimension of size 1.
    input_eval = tf.expand_dims(text_to_int(start_string), 0)

    text_generated = []

    # Clear any recurrent state left over from earlier calls.
    model.reset_states()

    for _ in range(num_generate) :
        predictions = model(input_eval)
        # Drop the batch dimension -> (timesteps, vocab_size).
        predictions = tf.squeeze(predictions, 0)
        # Only the final timestep predicts the next character, so sample from
        # that row alone instead of sampling every row and discarding the rest.
        last_logits = predictions[-1:, :] / temperature
        predicted_id = tf.random.categorical(last_logits, num_samples = 1)[0, 0].numpy()
        # Feed the sampled character back in; the stateful LSTM carries the
        # context from everything seen so far.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(index_to_char[predicted_id])

    return start_string + ''.join(text_generated)

In [30]:
# Ask for the seed text, then print it followed by the generated continuation.
inp = input('Starting string: ')
print(generate_text(model, inp))

Starting string: Apple
Applep)Baching a person's life of oxbolize (looped has turn). —n. 1 watchbands from ntatric acid. [french]

Spanki  n. (pl. -s) grass term with a fipse rings structurally, e.g. The riot terms up. 13 (of a top player) act or treat (a state etc.) Fluff. 5 bid out of a motorwise substance used to small use. 2 a electromability aid etc. B (attrib.) Non-slasting group. B marriageable terminated buttle. 2 person who strikes orig. A weapon. —v. (-ting) 1 assk with a twig or rock (a country). 2 turn important. 9 (usu. Foll. By on) be trudged and lugged on. 3 constrain from another's action.

Undercust  colloq. —adv. Archaic or not soundworth interest or arrangement; reve with). 7 (foll. By on) depart by enemy of the impatiency or testedship. 2 the lustrots. 2 quantity of conditions or activities etc. In vapour. [origin unknown]

Unknown sterling, esp. Not reprinant.

Unworry  adj. (-ier, -iest). 3 hollow wordsake, royal, i during war as word-blue.  wear repertory  n. 1 