In [1]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

import numpy as np
import pandas as pd
import os
import time

In [2]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')


In [3]:
text = open(path_to_file, 'rb').read().decode('utf-8')
print("Get {} text".format(len(text)))

Get 1115394 text


In [4]:
text[:200]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you'

In [5]:
vocab = sorted(set(text))
print(len(vocab))

65


In [71]:
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

text_as_int = np.array([char2idx[c] for c in text])


In [72]:
print("--sample text: {}\n sample index: {}".format(text[:10], text_as_int[:10]))

--sample text: First Citi
 sample index: [18 47 56 57 58  1 15 47 58 47]


In [73]:
# make dataset
seq_length = 100
examles_per_batch = len(text) // seq_length

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

F
i
r
s
t


In [74]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

for it in sequences.take(5):
    print(''.join(idx2char[it.numpy()]))

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 
are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k
now Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us ki
ll him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be d
one: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citi


In [75]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text
dataset = sequences.map(split_input_target)

In [77]:
for input_example, target_example in dataset.take(1):
    print("input data: ", ''.join(idx2char[input_example.numpy()]))
    print("target data: ", ''.join(idx2char[target_example.numpy()]))

input data:  First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
target data:  irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


In [80]:
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("step: {:4d}".format(i))
    print("Input :{} ({:s})".format(input_idx, idx2char[input_idx]))
    print("Target :{} ({:s})".format(target_idx, idx2char[target_idx]))

step:    0
Input :18 (F)
Target :47 (i)
step:    1
Input :47 (i)
Target :56 (r)
step:    2
Input :56 (r)
Target :57 (s)
step:    3
Input :57 (s)
Target :58 (t)
step:    4
Input :58 (t)
Target :1 ( )


In [81]:
batch_size = 64

buffer_size = 10000

dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)
dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

In [84]:
vocab_size = len(vocab) + 1
em_dim = 256
rnn_units = 1024

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GRU

model = Sequential()
model.add(Embedding(vocab_size, em_dim, batch_input_shape=[batch_size, None]))
model.add(GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'))
model.add(Dense(vocab_size))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (64, None, 256)           16896     
_________________________________________________________________
gru_3 (GRU)                  (64, None, 1024)          3938304   
_________________________________________________________________
dense_3 (Dense)              (64, None, 66)            67650     
Total params: 4,022,850
Trainable params: 4,022,850
Non-trainable params: 0
_________________________________________________________________


In [85]:
for input_example_batch, target_example_batch in dataset.take(1):
    ex_pred = model(input_ex_batch)
    print(ex_pred.shape)

(64, 100, 66)


In [90]:
sample_indices = tf.random.categorical(ex_pred[0], num_samples=1)
sample_indices = tf.squeeze(sample_indices, axis=-1).numpy()
sample_indices

array([38, 14, 48, 17, 37, 17, 48, 62, 39, 44, 53, 47, 24, 32, 14, 18,  9,
       21, 54, 62,  4, 41, 48, 15,  9, 38,  7, 19, 36, 47, 11,  0, 57,  4,
       56, 42, 10, 45, 48, 56,  3, 50, 42,  3, 59, 37, 58, 21, 45, 14, 55,
       37,  7,  9, 15, 64, 40, 20, 15,  8, 55,  7, 39, 49, 28, 18, 39, 58,
       48,  8, 61, 29, 32,  1, 23, 20, 38, 56, 43,  1,  8, 53, 47, 11, 39,
       23,  7, 44, 44, 65, 62,  0, 38,  4, 35, 59, 10, 47, 35, 12],
      dtype=int64)

In [96]:
print('Input: {}'.format(''.join(idx2char[input_example_batch[0].numpy()]) ))
print('pred: {}'.format(''.join(idx2char[sample_indices])))

Input: thus. I am half through;
The one part suffer'd, the other will I do.
Here come more voices.
Your voi


IndexError: index 65 is out of bounds for axis 0 with size 65

In [103]:
target_example_batch.shape

TensorShape([64, 100])

In [102]:
ex_pred.shape

TensorShape([64, 100, 66])

In [104]:
import tensorflow as tf

In [106]:
tf.keras.losses.sparse_categorical_crossentropy(target_example_batch, ex_pred).numpy().mean()

9.068836

In [107]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

loss(target_example_batch, ex_pred).numpy().mean()

4.1894803

In [108]:
model.compile(optimizer='adam',  loss=loss)

In [109]:
his = model.fit(dataset, epochs=1)



In [6]:
example_texts = ['abcdefg', 'xyz']

chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [7]:
ids_from_chars =preprocessing.StringLookup(vocabulary=list(vocab), mask_token=None)

In [8]:
ids = ids_from_chars(chars)

In [9]:
ids

<tf.RaggedTensor [[40, 41, 42, 43, 44, 45, 46], [63, 64, 65]]>

In [10]:
chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

In [11]:
chars = chars_from_ids(ids)
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [12]:
tf.strings.reduce_join(chars, axis=-1).numpy()

array([b'abcdefg', b'xyz'], dtype=object)

In [13]:
def text_from_ids(ids):
    return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)

In [14]:
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
all_ids

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([19, 48, 57, ..., 46,  9,  1], dtype=int64)>

In [15]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))

F
i
r
s
t
 
C
i
t
i


In [16]:
sep_length = 100
examples_per_epoch = len(text) // (sep_length+1)

In [17]:
sequences = ids_dataset.batch(sep_length + 1, drop_remainder=True)
for se in sequences.take(1):
    print(chars_from_ids(se))

tf.Tensor(
[b'F' b'i' b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':'
 b'\n' b'B' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o'
 b'c' b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h'
 b'e' b'r' b',' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p'
 b'e' b'a' b'k' b'.' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'S' b'p' b'e'
 b'a' b'k' b',' b' ' b's' b'p' b'e' b'a' b'k' b'.' b'\n' b'\n' b'F' b'i'
 b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':' b'\n' b'Y'
 b'o' b'u' b' '], shape=(101,), dtype=string)


In [18]:
for se in sequences.take(1):
    print(text_from_ids(se))

tf.Tensor(b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou ', shape=(), dtype=string)


In [19]:
def split_input_target(se):
    input_text = se[:-1]
    target_text = se[1:]
    return input_text, target_text

In [20]:
split_input_target(list('tensorflow'))

(['t', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [21]:
dataset = sequences.map(split_input_target)

for inp, out in dataset.take(1):
    print("input: ", text_from_ids(inp).numpy())
    print("out: ", text_from_ids(out).numpy())

input:  b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
out:  b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [22]:
batch_size = 64

buffer_size = 10000

dataset = (dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [23]:
vocab_size =len(vocab)

em_size = 256

units = 1024

In [27]:
class MyModel(tf.keras.Model):
    def __init__(self, vocab_size, em_dim, units):
        super().__init__(self)
        self.embedding = tf.keras.layers.Embedding(vocab_size, em_dim)
        self.gru = tf.keras.layers.GRU(units, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)
    def call(self, inputs, states=None, return_state=False,training=False):
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)
        
        if return_state:
            return x, states
        return x

In [29]:
model = MyModel(vocab_size=len(ids_from_chars.get_vocabulary()), em_dim=em_size,units=units)

In [31]:
for input_ex_batch, target_ex_batch in dataset.take(1):
    ex_pred = model(input_ex_batch)
    print(ex_pred.shape)

(64, 100, 66)


In [33]:
vocab_size

65

In [34]:
model.summary()

Model: "my_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  16896     
_________________________________________________________________
gru_1 (GRU)                  multiple                  3938304   
_________________________________________________________________
dense_1 (Dense)              multiple                  67650     
Total params: 4,022,850
Trainable params: 4,022,850
Non-trainable params: 0
_________________________________________________________________


In [36]:
sample_indices = tf.random.categorical(ex_pred[0], num_samples=1)
sample_indices = tf.squeeze(sample_indices, axis=-1).numpy()

In [56]:
char2idx = {v:i for i, v in enumerate(vocab)}
idx2char = np.array(vocab)

In [65]:
idx2char.shape

(65,)

In [61]:
print("input: {}".join(''.join(idx2char[input_ex_batch[0]])))

pinput: {}vinput: {}!input: {}xinput: {}jinput: {}minput: {}uinput: {}-input: {}!input: {}ginput: {}pinput: {}sinput: {}!input: {}Jinput: {}!input: {}iinput: {}binput: {}winput: {}finput: {}!input: {}einput: {}pinput: {}oinput: {}finput: {}!input: {}xinput: {}jinput: {}uinput: {}iinput: {}!input: {}uinput: {}iinput: {}finput: {}finput: {}3input: {} input: {} input: {}Kinput: {}Vinput: {}Minput: {}Jinput: {}Finput: {}Uinput: {};input: {} input: {}Pinput: {}!input: {}Hinput: {}pinput: {}einput: {}$input: {}.input: {}.input: {}Pinput: {}!input: {}oinput: {}vinput: {}sinput: {}tinput: {}finput: {}-input: {}!input: {}iinput: {}pinput: {}xinput: {}!input: {}tinput: {}iinput: {}binput: {}minput: {}minput: {}!input: {}uinput: {}iinput: {}jinput: {}tinput: {}!input: {}cinput: {}finput: {}!input: {}qinput: {}sinput: {}finput: {}winput: {}finput: {}oinput: {}uinput: {}finput: {}einput: {}Ainput: {} input: {}Ninput: {}zinput: {}!input: {}iinput: {}vinput: {}tinput: {}cinput: {}binput: {}o


In [53]:
import helper as helper

In [55]:
helper.test3()

AttributeError: module 'helper' has no attribute 'test3'