In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

## Load Data

In [2]:
shakespeare_url = "https://homl.info/shakespeare" # shortcut URL
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

Downloading data from https://homl.info/shakespeare


In [3]:
print(shakespeare_text[:148])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?



## Process for Tensorflow

In [4]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakespeare_text)

In [5]:
tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])
tokenizer.texts_to_sequences(["Blurb"])
max_id = len(tokenizer.word_index) # number of distinct characters
dataset_size = tokenizer.document_count # total number of characters

In [6]:
dataset_size = tokenizer.document_count # total number of characters

In [7]:
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1

In [8]:
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [9]:
n_steps = 100
window_length = n_steps + 1 # target = input shifted 1 character ahead
dataset = dataset.window(window_length, shift=1, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(window_length))
dataset

<FlatMapDataset element_spec=TensorSpec(shape=(None,), dtype=tf.int64, name=None)>

In [10]:
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
dataset

<MapDataset element_spec=(TensorSpec(shape=(None, None), dtype=tf.int64, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

In [11]:
#One-hot encoding
dataset = dataset.map( 
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))

In [12]:
#Add prefetching?
dataset = dataset.prefetch(1)
dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(None, None, 39), dtype=tf.float32, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

In [13]:

for X_batch, Y_batch in dataset.take(1):
    print( X_batch.shape, Y_batch.shape)

(32, 100, 39) (32, 100)


## Train

In [14]:
from google.colab import drive
drive.mount('/drive')

checkpoint_path = "training_2/cp-{epoch:04d}.ckpt"
checkpoint_dir = "/drive/My Drive/Colab Notebooks/" + checkpoint_path

# Create a callback that saves the model's weights every 5 epochs
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_dir, 
    verbose=1, 
    save_weights_only=True,
    save_freq='epoch')




model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
        dropout=0.2,), #recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True,
        dropout=0.2,), #recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,
        activation="softmax"))
    ])


model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

# Save the weights using the `checkpoint_path` format
# Then, train the model with the new callback

#model.save_weights(checkpoint_dir.format(epoch=0))
#history = model.fit(dataset, epochs=10, callbacks=[cp_callback] )

#Load weights and start training from respective epoch
epoch  = 10
model.load_weights(checkpoint_dir.format(epoch = epoch))
history = model.fit(dataset, epochs=10, callbacks=[cp_callback], initial_epoch = epoch )


Mounted at /drive


## Predict characters using the model

In [15]:
#use Model for predictions
def preprocess(texts):
  X = np.array(tokenizer.texts_to_sequences(texts)) - 1
  return tf.one_hot(X, max_id)

X_new = preprocess(["How are yo"])
Y_pred = np.argmax(model(X_new), axis=-1)

tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]


'u'

In [16]:
def next_char(text, temperature=1):
  X_new = preprocess([text])
  y_proba = model.predict(X_new)[0, -1:, :]
  rescaled_logits = tf.math.log(y_proba) / temperature
  char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
  return tokenizer.sequences_to_texts(char_id.numpy())[0]

In [17]:
def complete_text(text, n_chars=50, temperature=1):
  for _ in range(n_chars):
    text += next_char(text, temperature)
  return text

In [18]:
print(complete_text("Why, o", temperature = 1))

Why, one do homad alone:
i know not this tuty, good firs


In [19]:
print(complete_text("t", temperature=0.2))
print(complete_text("w", temperature=1))
print(complete_text("w", temperature=2))

t thou art so beloved to her.

gremio:
what, are yo
warne.
i am percuaved, but in my needs indeed:
were
wstore instgicn?

grumio:
and weeps, gotdin us, diu


## Stateful RNN's

Left out

## Sentiment Analysis

In [20]:
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()
X_train[0][:10]

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [21]:
word_index = keras.datasets.imdb.get_word_index()
id_to_word = {id_ + 3: word for word, id_ in word_index.items()}

for id_, token in enumerate(("<pad>", "<sos>", "<unk>")):
  id_to_word[id_] = token

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [22]:
" ".join([id_to_word[id_] for id_ in X_train[5]])

"<sos> begins better than it ends funny that the russian submarine crew outperforms all other actors it's like those scenes where documentary shots br br spoiler part the message dechifered was contrary to the whole story it just does not mesh br br"

### Preprocessing with Tensor Flow

In [23]:
import tensorflow_datasets as tfds
datasets, info = tfds.load("imdb_reviews", as_supervised=True,
          with_info=True)
train_size = info.splits["train"].num_examples

[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete9LV9FI/imdb_reviews-train.tfrecord*...…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete9LV9FI/imdb_reviews-test.tfrecord*...:…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete9LV9FI/imdb_reviews-unsupervised.tfrec…

[1mDataset imdb_reviews downloaded and prepared to ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [24]:
def preprocess(X_batch, y_batch):
  X_batch = tf.strings.substr(X_batch, 0, 300)
  X_batch = tf.strings.regex_replace(X_batch, b"<br\\s*/?>", b" ")
  X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
  X_batch = tf.strings.split(X_batch)
  return X_batch.to_tensor(default_value=b"<pad>"), y_batch

In [25]:
from collections import Counter
vocabulary = Counter()
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess):
  for review in X_batch:
    vocabulary.update(list(review.numpy()))

In [26]:
vocabulary.most_common()[:3]

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]

In [27]:
vocab_size = 10000
truncated_vocabulary = [word for word, count in vocabulary.most_common()[:vocab_size]]

In [28]:
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [29]:
table.lookup(tf.constant([b"This movie was faaaaaantastic".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10053]])>

In [30]:
def encode_words(X_batch, y_batch):
  return table.lookup(X_batch), y_batch

train_set = datasets["train"].batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)

In [31]:
embed_size = 128
model = keras.models.Sequential([
  keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
  input_shape=[None]),
  keras.layers.GRU(128, return_sequences=True),
  keras.layers.GRU(128),
  keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam",
  metrics=["accuracy"])
history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [32]:
#Implement masking using functional API
K = keras.backend
inputs = keras.layers.Input(shape=[None])
mask = keras.layers.Lambda(lambda inputs: K.not_equal(inputs, 0))(inputs)
z = keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size)(inputs)
z = keras.layers.GRU(128, return_sequences=True)(z, mask=mask)
z = keras.layers.GRU(128)(z, mask=mask)
outputs = keras.layers.Dense(1, activation="sigmoid")(z)
model = keras.Model(inputs=[inputs], outputs=[outputs])

In [33]:
model.compile(loss="binary_crossentropy", optimizer="adam",
  metrics=["accuracy"])
history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Re-use existing embeddings

In [34]:
import tensorflow_hub as hub
model = keras.Sequential([
  hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1",
    dtype = tf.string, input_shape=[], output_shape=[50]),
  keras.layers.Dense(128, activation="relu"),
  keras.layers.Dense(1, activation="sigmoid")
])

In [35]:
datasets, info = tfds.load("imdb_reviews", as_supervised=True,
with_info=True)
train_size = info.splits["train"].num_examples
batch_size = 32
train_set = datasets["train"].batch(batch_size).prefetch(1)
model.compile(loss="binary_crossentropy", optimizer="adam",
  metrics=["accuracy"])
history = model.fit(train_set, epochs=5)

RuntimeError: ignored