In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf




In [2]:
# Load the whole corpus into memory as one string. The context manager closes
# the file handle deterministically, and the explicit encoding keeps decoding
# independent of the platform default.
with open('shakespeare.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [3]:
# Preview the first 500 characters of the raw corpus.
text[:500]

"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou contracted to thine own bright eyes,\n  Feed'st thy light's flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thy self thy foe, to thy sweet self too cruel:\n  Thou that art now the world's fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bu"

In [4]:
# Character-level vocabulary: every distinct character in the corpus, sorted so
# that the index assigned to each character is deterministic across runs.
vocab = sorted(set(text))

In [5]:
# Inspect the vocabulary (one entry per distinct character).
vocab

['\n',
 ' ',
 '!',
 '"',
 '&',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '>',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '|',
 '}']

In [6]:
# Encoder lookup: character -> its position in the sorted vocabulary.
char_to_ind = dict(zip(vocab, range(len(vocab))))

In [7]:
# Inspect the character -> index mapping.
char_to_ind

{'\n': 0,
 ' ': 1,
 '!': 2,
 '"': 3,
 '&': 4,
 "'": 5,
 '(': 6,
 ')': 7,
 ',': 8,
 '-': 9,
 '.': 10,
 '0': 11,
 '1': 12,
 '2': 13,
 '3': 14,
 '4': 15,
 '5': 16,
 '6': 17,
 '7': 18,
 '8': 19,
 '9': 20,
 ':': 21,
 ';': 22,
 '<': 23,
 '>': 24,
 '?': 25,
 'A': 26,
 'B': 27,
 'C': 28,
 'D': 29,
 'E': 30,
 'F': 31,
 'G': 32,
 'H': 33,
 'I': 34,
 'J': 35,
 'K': 36,
 'L': 37,
 'M': 38,
 'N': 39,
 'O': 40,
 'P': 41,
 'Q': 42,
 'R': 43,
 'S': 44,
 'T': 45,
 'U': 46,
 'V': 47,
 'W': 48,
 'X': 49,
 'Y': 50,
 'Z': 51,
 '[': 52,
 ']': 53,
 '_': 54,
 '`': 55,
 'a': 56,
 'b': 57,
 'c': 58,
 'd': 59,
 'e': 60,
 'f': 61,
 'g': 62,
 'h': 63,
 'i': 64,
 'j': 65,
 'k': 66,
 'l': 67,
 'm': 68,
 'n': 69,
 'o': 70,
 'p': 71,
 'q': 72,
 'r': 73,
 's': 74,
 't': 75,
 'u': 76,
 'v': 77,
 'w': 78,
 'x': 79,
 'y': 80,
 'z': 81,
 '|': 82,
 '}': 83}

In [8]:
# Decoder lookup: a NumPy array of characters, so an array of indices can be
# turned back into characters with a single indexing operation.
ind_to_char = np.array(vocab)

In [9]:
# Inspect the index -> character array.
ind_to_char

array(['\n', ' ', '!', '"', '&', "'", '(', ')', ',', '-', '.', '0', '1',
       '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '>', '?',
       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
       '[', ']', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
       'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
       'w', 'x', 'y', 'z', '|', '}'], dtype='<U1')

In [10]:
# Encode the full corpus as an array of integer character ids.
# Raises KeyError if `text` contains a character missing from `char_to_ind`.
encoded_text = np.array([char_to_ind[c] for c in text])

In [11]:
# Preview the first 500 encoded ids (compare with the raw text preview).
encoded_text[:500]

array([ 0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1, 12,  0,  1,  1, 31, 73, 70, 68,  1, 61, 56, 64,
       73, 60, 74, 75,  1, 58, 73, 60, 56, 75, 76, 73, 60, 74,  1, 78, 60,
        1, 59, 60, 74, 64, 73, 60,  1, 64, 69, 58, 73, 60, 56, 74, 60,  8,
        0,  1,  1, 45, 63, 56, 75,  1, 75, 63, 60, 73, 60, 57, 80,  1, 57,
       60, 56, 76, 75, 80,  5, 74,  1, 73, 70, 74, 60,  1, 68, 64, 62, 63,
       75,  1, 69, 60, 77, 60, 73,  1, 59, 64, 60,  8,  0,  1,  1, 27, 76,
       75,  1, 56, 74,  1, 75, 63, 60,  1, 73, 64, 71, 60, 73,  1, 74, 63,
       70, 76, 67, 59,  1, 57, 80,  1, 75, 64, 68, 60,  1, 59, 60, 58, 60,
       56, 74, 60,  8,  0,  1,  1, 33, 64, 74,  1, 75, 60, 69, 59, 60, 73,
        1, 63, 60, 64, 73,  1, 68, 64, 62, 63, 75,  1, 57, 60, 56, 73,  1,
       63, 64, 74,  1, 68, 60, 68, 70, 73, 80, 21,  0,  1,  1, 27, 76, 75,
        1, 75, 63, 70, 76,  1, 58, 70, 69, 75, 73, 56, 58, 75, 60, 59,  1,
       75, 70,  1, 75, 63

In [13]:
# Same preview as earlier in the notebook; presumably repeated here to eyeball
# typical line lengths before choosing a sequence length.
text[:500]

"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou contracted to thine own bright eyes,\n  Feed'st thy light's flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thy self thy foe, to thy sweet self too cruel:\n  Thou that art now the world's fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bu"

In [14]:
# A single line of verse taken from the corpus, used to measure its length.
line = 'From fairest creatures we desire increase'

In [15]:
# Character count of one line of verse.
len(line)

41

In [18]:
# Three consecutive lines of verse (including newlines and indentation), used
# to measure how many characters a few lines span.
lines = "From fairest creatures we desire increase,\n  That thereby beauty's rose might never die,\n  But as the riper should by time decease"

In [19]:
# Character count of the three-line sample.
len(lines)

130

In [20]:
# Number of input characters per training example. Presumably chosen to cover
# roughly three lines of verse, based on the length checks above.
seq_length = 120

In [21]:
# Number of whole chunks of seq_length + 1 characters in the corpus. The extra
# character is needed because each chunk yields an input and a one-step-shifted target.
num_of_sequences = len(text) // (seq_length + 1)

In [22]:
# Show how many full-length chunks the corpus provides.
num_of_sequences

45005

In [23]:
# Dataset over the encoded corpus, one element per character id.
# NOTE(review): the name `dataset` is rebound to different pipelines in later
# cells, so out-of-order execution changes what it refers to.
dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [24]:
# Group consecutive character ids into chunks of seq_length + 1; the final
# short chunk is dropped so every chunk has the same length.
sequences = dataset.batch(batch_size=(seq_length+1), drop_remainder=True)

In [25]:
# Peek at the first chunk to confirm it holds seq_length + 1 ids.
for seq in sequences.take(1):
    print(seq)

tf.Tensor(
[ 0  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0
  1  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74
  1 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45
 63 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74
 60  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75
  1], shape=(121,), dtype=int32)


In [26]:
def create_seq_targets(seq):
    """Split one chunk into an (input, target) pair for next-character prediction.

    Args:
        seq: A sliceable sequence of length n (here a chunk of character ids).

    Returns:
        A tuple ``(source, target)``: ``source`` is every element of ``seq``
        except the last, and ``target`` is every element except the first, so
        ``target[i]`` is the element that follows ``source[i]``.
    """
    return seq[:-1], seq[1:]

In [28]:
# Turn every chunk into an (input, target) pair.
# NOTE(review): this rebinds `dataset`, which previously held the per-character dataset.
dataset = sequences.map(create_seq_targets)

In [30]:
# Show one (input, target) pair both as ids and as decoded text, to confirm the
# target is the input shifted by one character. The loop variables are named so
# they do not rebind the builtin `input` in the kernel namespace.
for input_seq, target_seq in dataset.take(1):
    print(input_seq)
    print("".join(ind_to_char[input_seq.numpy()]))
    print('\n')
    print(target_seq)
    print("".join(ind_to_char[target_seq.numpy()]))

tf.Tensor(
[ 0  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0
  1  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74
  1 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45
 63 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74
 60  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75], shape=(120,), dtype=int32)

                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But


tf.Tensor(
[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0  1
  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74  1
 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45 63
 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74 60
  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75  1], shape=(120,), dtype=int32)
                     1
  From fairest creatures we desire increase,

In [31]:
batch_size = 128  # sequences per training batch
buffer_size = 10000  # size of the shuffle buffer (in sequences)
# Built from `sequences` rather than from `dataset` itself, so re-running this
# cell does not batch an already-batched dataset. On a fresh top-to-bottom run
# the resulting pipeline is the same: map -> shuffle -> batch.
dataset = (
    sequences
    .map(create_seq_targets)
    .shuffle(buffer_size=buffer_size, seed=101)
    .batch(batch_size=batch_size, drop_remainder=True)
)

In [32]:
def sparse_cat_entropy(y_true, y_pred):
    """Sparse categorical cross-entropy for a model that outputs raw logits.

    Args:
        y_true: Integer class ids (here, target character ids).
        y_pred: Unnormalised scores (logits) for each class.

    Returns:
        The per-element loss computed by
        ``tf.keras.losses.sparse_categorical_crossentropy`` with
        ``from_logits=True``.
    """
    loss_fn = tf.keras.losses.sparse_categorical_crossentropy
    return loss_fn(y_true, y_pred, from_logits=True)

In [33]:
# Short alias for the Keras layers namespace used when building the model.
layers = tf.keras.layers

In [34]:
vocab_size = len(vocab)  # number of distinct characters; also the width of the output layer
embed_dim = 64  # output dimension of the character embedding
# NOTE(review): 1026 is an unusual width; possibly intended to be 1024. Confirm before changing.
rnn_neurons = 1026  # number of GRU units

In [39]:
def create_model(batch_size, vocab_size, embed_dim, rnn_neurons):
    """Build and compile the character-level GRU model.

    The network is Embedding -> GRU -> Dense. The Dense layer has no
    activation, so the model emits one logit per vocabulary entry at every
    timestep; this matches the ``from_logits=True`` loss it is compiled with.

    Args:
        batch_size: Fixed batch dimension baked into the input shape (the GRU
            is created with ``stateful=True``).
        vocab_size: Number of distinct characters; used as the embedding input
            dimension and the number of output logits.
        embed_dim: Output dimension of the embedding layer.
        rnn_neurons: Number of GRU units.

    Returns:
        A compiled ``tf.keras`` Sequential model (Adam optimizer,
        ``sparse_cat_entropy`` loss).
    """
    embedding = layers.Embedding(
        input_dim=vocab_size,
        output_dim=embed_dim,
        batch_input_shape=(batch_size, None),
    )
    recurrent = layers.GRU(
        units=rnn_neurons,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    logits = layers.Dense(vocab_size)

    model = tf.keras.models.Sequential([embedding, recurrent, logits])
    model.compile(optimizer='adam', loss=sparse_cat_entropy)
    return model

In [40]:
# Instantiate the model with the hyperparameters defined above.
model = create_model(batch_size=batch_size, vocab_size=vocab_size, embed_dim=embed_dim, rnn_neurons=rnn_neurons)

In [41]:
# Print layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (128, None, 64)           5376      
                                                                 
 gru_1 (GRU)                 (128, None, 1026)         3361176   
                                                                 
 dense_1 (Dense)             (128, None, 84)           86268     
                                                                 
Total params: 3452820 (13.17 MB)
Trainable params: 3452820 (13.17 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [53]:
# Run a single batch through the model as a shape/sanity check. `ex_pred` is
# kept for the sampling cells that follow. The loop variables are named so they
# do not rebind the builtin `input` in the kernel namespace.
for input_batch, target_batch in dataset.take(1):
    ex_pred = model(input_batch)
    print(ex_pred)

tf.Tensor(
[[[ 1.1356823e-02 -6.8091792e-03 -1.3093210e-02 ... -4.4259345e-03
   -1.0474450e-03  4.3458771e-03]
  [ 6.3339248e-03 -2.7564538e-03 -1.1633095e-02 ... -6.6018025e-03
   -3.3489789e-04 -9.7391481e-04]
  [ 4.0687453e-03 -9.6337888e-03 -1.1449478e-02 ... -2.8603892e-03
    6.2706182e-05 -4.7240686e-03]
  ...
  [ 1.7806839e-03 -9.5131202e-03 -9.3387328e-03 ...  2.4319869e-03
    2.9965825e-03 -2.1149984e-03]
  [ 2.3705550e-03 -4.8285453e-03 -9.6183782e-03 ... -1.8048999e-03
    8.9727418e-04 -3.8662336e-03]
  [-1.5498605e-03  1.0779328e-03  7.0592617e-03 ... -7.7892006e-03
    7.2490395e-04  2.3761154e-03]]

 [[ 4.8678638e-03  1.7698146e-03 -7.0908163e-03 ...  1.6764748e-03
    1.7887385e-03  2.0542056e-03]
  [ 6.3924971e-03 -1.5296359e-03 -1.3693569e-02 ...  3.9296802e-03
    4.3917629e-03  9.6447775e-03]
  [ 3.5645205e-03 -2.3485383e-03 -1.0698269e-02 ... -1.0777169e-03
    2.0696446e-03  1.6092276e-03]
  ...
  [ 1.3649311e-03  8.9955376e-04 -2.7852810e-03 ...  2.0196538e-03

In [54]:
# Draw one character id per timestep from the logits of the first sequence in
# the example batch.
sample = tf.random.categorical(ex_pred[0], num_samples=1, seed=101)

In [55]:
# Inspect the sampled ids (one column, one row per timestep).
sample

<tf.Tensor: shape=(120, 1), dtype=int64, numpy=
array([[ 1],
       [40],
       [78],
       [78],
       [33],
       [56],
       [45],
       [25],
       [81],
       [29],
       [ 2],
       [45],
       [34],
       [ 3],
       [26],
       [28],
       [ 0],
       [37],
       [72],
       [59],
       [12],
       [50],
       [15],
       [16],
       [42],
       [38],
       [ 6],
       [67],
       [ 9],
       [54],
       [58],
       [65],
       [20],
       [45],
       [56],
       [65],
       [16],
       [17],
       [83],
       [67],
       [75],
       [ 6],
       [ 2],
       [55],
       [ 6],
       [51],
       [ 8],
       [72],
       [12],
       [42],
       [36],
       [57],
       [63],
       [ 0],
       [23],
       [33],
       [12],
       [44],
       [80],
       [19],
       [40],
       [ 7],
       [67],
       [47],
       [49],
       [15],
       [25],
       [14],
       [64],
       [53],
       [47],
       [ 8],
       [79],
   

In [56]:
# Drop the trailing length-1 axis and convert the result to a NumPy array.
# NOTE(review): this rebinds `sample` to a different type and shape, so
# re-running this cell without first re-running the sampling cell is likely to
# fail inside tf.squeeze. Confirm, and consider a separate name.
sample = tf.squeeze(sample, axis=-1).numpy()

In [57]:
# Inspect the flattened array of sampled ids.
sample

array([ 1, 40, 78, 78, 33, 56, 45, 25, 81, 29,  2, 45, 34,  3, 26, 28,  0,
       37, 72, 59, 12, 50, 15, 16, 42, 38,  6, 67,  9, 54, 58, 65, 20, 45,
       56, 65, 16, 17, 83, 67, 75,  6,  2, 55,  6, 51,  8, 72, 12, 42, 36,
       57, 63,  0, 23, 33, 12, 44, 80, 19, 40,  7, 67, 47, 49, 15, 25, 14,
       64, 53, 47,  8, 79, 82, 57, 22, 71, 37,  7,  4, 52, 33, 73,  4, 14,
       32,  7, 81, 62, 47, 32, 52, 35, 21, 20,  2, 13, 50, 14,  1, 25,  5,
       53, 81, 65,  9, 83, 34, 56, 39, 67, 51, 38, 19, 45,  9, 70,  2, 26,
       65], dtype=int64)

In [59]:
# Decode the sampled ids back into text. No training step appears before this
# cell in the visible notebook, so the output is expected to be gibberish.
''.join(ind_to_char[sample])

' OwwHaT?zD!TI"AC\nLqd1Y45QM(l-_cj9Taj56}lt(!`(Z,q1QKbh\n<H1Sy8O)lVX4?3i]V,x|b;pL)&[Hr&3G)zgVG[J:9!2Y3 ?\']zj-}IaNlZM8T-o!Aj'