# Chapter 16 Exercise 11

In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import shutil

import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [2]:
shakespeare_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

In [3]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakespeare_text)

In [4]:
max_id = len(tokenizer.word_index)
dataset_size = tokenizer.document_count

In [5]:
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [6]:
n_steps = 100
window_length = n_steps + 1 # target = input shifted 1 character ahead
dataset = dataset.window(window_length, shift=1, drop_remainder=True)

In [7]:
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [8]:
np.random.seed(42)
tf.random.set_seed(42)

In [9]:
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [10]:
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))

In [11]:
dataset = dataset.prefetch(1)

## GPT model

In [57]:
from transformers import TFOpenAIGPTLMHeadModel

model = TFOpenAIGPTLMHeadModel.from_pretrained("openai-gpt")

All model checkpoint layers were used when initializing TFOpenAIGPTLMHeadModel.

All the layers of TFOpenAIGPTLMHeadModel were initialized from the model checkpoint at openai-gpt.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFOpenAIGPTLMHeadModel for predictions without further training.


In [58]:
from transformers import OpenAIGPTTokenizer

tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [59]:
prompt_text = "This royal throne of kings, this sceptred isle"
encoded_prompt = tokenizer.encode(prompt_text,
                                  add_special_tokens=False,
                                  return_tensors="tf")
encoded_prompt

<tf.Tensor: shape=(1, 10), dtype=int32, numpy=
array([[  616,  5751,  6404,   498,  9606,   240,   616, 26271,  7428,
        16187]])>

In [60]:
num_sequences = 5
length = 40

generated_sequences = model.generate(
    input_ids=encoded_prompt,
    do_sample=True,
    max_length=length + len(encoded_prompt[0]),
    temperature=1.0,
    top_k=0,
    top_p=0.9,
    repetition_penalty=1.0,
    num_return_sequences=num_sequences,
)

generated_sequences

<tf.Tensor: shape=(5, 50), dtype=int32, numpy=
array([[  616,  5751,  6404,   498,  9606,   240,   616, 26271,  7428,
        16187,   636,   868,   580,  9818,   485,  5018,   715,   239,
          244, 40477,  1473,  1699,   239, 40477,   500,   481,  5332,
          240,    26,   518,  4616,  6212,   481, 10022,  1081,   562,
          481,  1541,   239,   524,  2600,   509,  1241,   557,  7816,
          557,   669,   487,   656,   694],
       [  616,  5751,  6404,   498,  9606,   240,   616, 26271,  7428,
        16187,   500,   481,  9458,   260, 16841,  5883,   498,   481,
         1926,    26, 27690,   239,   725,  2283,   815,   481,  6404,
          240,   557,   544,   481,  7613,   240,   544,   481,  1866,
          498,   524,  8140,   240,   618,    10,   837,   500,   239,
         2034,   481,  1866,   498,   618],
       [  616,  5751,  6404,   498,  9606,   240,   616, 26271,  7428,
        16187,   239,   481,  3445,   498,  2060, 24183,   240,   481,
         2595

In [61]:
for sequence in generated_sequences:
    text = tokenizer.decode(sequence, clean_up_tokenization_spaces=True)
    print(text)
    print("-" * 80)

this royal throne of kings, this sceptred isle would never be theirs to rule over. " 
 everyone laughed. 
 in the palace, llorel wandered the halls looking for the boy. his company was almost as soothing as when he 'd been
--------------------------------------------------------------------------------
this royal throne of kings, this sceptred isle in the pre - reign period of the pallbearers. more important than the throne, as is the title, is the kiss of his majesty, king dwalin. upon the kiss of king
--------------------------------------------------------------------------------
this royal throne of kings, this sceptred isle. the whisper of walking carpets, the smell of gilt and tile, the occasional unflattering digit or single word uttered like a groan in a government room of a hundred - years old. gods, he
--------------------------------------------------------------------------------
this royal throne of kings, this sceptred isle would become a castle to we d on. i am scared to

In [62]:
num_sequences = 5
length = 40

generated_sequences = model.generate(
    input_ids=encoded_prompt,
    do_sample=True,
    max_length=length + len(encoded_prompt[0]),
    temperature=0.2,
    top_k=0,
    top_p=0.9,
    repetition_penalty=3.0,
    num_return_sequences=num_sequences,
)

for sequence in generated_sequences:
    text = tokenizer.decode(sequence, clean_up_tokenization_spaces=True)
    print(text)
    print("-" * 80)

this royal throne of kings, this sceptred isle. " 
 the king's eyes narrowed and he looked at me with a look that was almost as cold as his voice when i told him about my dream in which we had been separated by water and
--------------------------------------------------------------------------------
this royal throne of kings, this sceptred isle is the home to a great many people. it has been said that king arthur was once one who ruled over all lands and peoples in his time ; but now he must be replaced by another ruler,
--------------------------------------------------------------------------------
this royal throne of kings, this sceptred isle is the most important place in all our lands. it's where we're going to be staying for a while and i'm sure you 'll want your rest as well before that happens.'
 sparhawk
--------------------------------------------------------------------------------
this royal throne of kings, this sceptred isle is the home to a great many people. it was o

## Bert model

In [49]:
from transformers import BertTokenizer, TFBertLMHeadModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertLMHeadModel.from_pretrained('bert-base-uncased')

If you want to use `TFBertLMHeadModel` as a standalone, add `is_decoder=True.`
All model checkpoint layers were used when initializing TFBertLMHeadModel.

All the layers of TFBertLMHeadModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertLMHeadModel for predictions without further training.


In [23]:
# Author links to transformers run_generation.py page, but that is no longer extant. Find it here:
# https://github.com/huggingface/transformers/blob/master/examples/pytorch/text-generation/run_generation.py

In [50]:
prompt_text = "This royal throne of kings, this sceptred isle"
encoded_prompt = tokenizer.encode(prompt_text,
                                  add_special_tokens=False,
                                  return_tensors="tf")
encoded_prompt

<tf.Tensor: shape=(1, 11), dtype=int32, numpy=
array([[ 2023,  2548,  6106,  1997,  5465,  1010,  2023,  8040, 23606,
         5596,  8842]])>

In [44]:
np_prompt = encoded_prompt.numpy()
np_prompt = np_prompt[...,np.newaxis]

In [51]:
num_sequences = 5
length = 40

generated_sequences = model.generate(
    input_ids=encoded_prompt,
    do_sample=True,
    max_length=length + len(encoded_prompt[0]),
    temperature=1.0,
    top_k=0,
    top_p=0.9,
    repetition_penalty=1.0,
    num_return_sequences=num_sequences,
)

generated_sequences

<tf.Tensor: shape=(5, 51), dtype=int32, numpy=
array([[ 2023,  2548,  6106,  1997,  5465,  1010,  2023,  8040, 23606,
         5596,  8842,  2829,  1010,  1010,  1010,  1010,  1998,  1998,
         1998,  1012,  1012,  1012,  1012,  1012,  1012,  1012,  1012,
         1012,  1012,  1012,  1012,  1012,  1012,  1012,  1012,  1012,
         1012,  1012,  1012,  1012,  1012,  1012,  1012,  1012,  1012,
         1012,  2133,  2133,  2090,  1997,  1998],
       [ 2023,  2548,  6106,  1997,  5465,  1010,  2023,  8040, 23606,
         5596,  8842,  2035,  2035,  2035,  2035,  2035,  2035,  2053,
         2053,  2053,  2053,  2053,  2035,  2035,  2035,  2035,  2035,
         2035,  2035,  1998,  1998,  1998,  1998,  1998,  1998,  1998,
         1998,  1998,  1998,  1998,  1998,  1998,  1998,  1998,  1998,
         1998,  1998,  1998,  1998,  1998,  1998],
       [ 2023,  2548,  6106,  1997,  5465,  1010,  2023,  8040, 23606,
         5596,  8842,  2521,  2521,  2521,  2521,  2053,  2053,  2053,

In [52]:
for sequence in generated_sequences:
    text = tokenizer.decode(sequence, clean_up_tokenization_spaces=True)
    print(text)
    print("-" * 80)

this royal throne of kings, this sceptred isle brown,,,, and and and................................. between of and
--------------------------------------------------------------------------------
this royal throne of kings, this sceptred isle all all all all all all no no no no no all all all all all all all and and and and and and and and and and and and and and and and and and and and and and
--------------------------------------------------------------------------------
this royal throne of kings, this sceptred isle far far far far no no no no no no!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
--------------------------------------------------------------------------------
this royal throne of kings, this sceptred isle so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so so
--------------------------------------------------------------------------------
this royal throne of kings, this sceptred isle all all all no all all all all a

In [53]:
num_sequences = 5
length = 40

generated_sequences = model.generate(
    input_ids=encoded_prompt,
    do_sample=True,
    max_length=length + len(encoded_prompt[0]),
    temperature=1.0,
    top_k=0,
    top_p=0.9,
    repetition_penalty=5.0,
    num_return_sequences=num_sequences,
)

generated_sequences

<tf.Tensor: shape=(5, 51), dtype=int32, numpy=
array([[ 2023,  2548,  6106,  1997,  5465,  1010,  2023,  8040, 23606,
         5596,  8842,  2042,  2054,  2001,  2020,  2025,  5053,  2009,
         2035,  1996,  1998,  2029,  2062,  2021,  2008,  2055,  2102,
         2002,  2045,  2016,  2383,  2044,  2013,  2032,  2058,  2010,
         2029,  2000,  2061,  2122,  2028,  2894,  1999,  1037,  2002,
         2173,  2178,  2005,  2005,  2004,  2253],
       [ 2023,  2548,  6106,  1997,  5465,  1010,  2023,  8040, 23606,
         5596,  8842,  5192,  7804, 11418,  6925,  2466,  2409,  8867,
         7122,  2030,  1998,  2823,  1011,  2774,  2195,  1000,  2005,
         2001,  2000, 11937,  4830,  2014,  2129,  2021,  2061,  2106,
         2070,  1005, 15536,  1051,  2035,  2045,  2205,  2008,  2083,
         2908,  2091,  2735,  3308,  2357,  4332],
       [ 2023,  2548,  6106,  1997,  5465,  1010,  2023,  8040, 23606,
         5596,  8842, 23384,  2485,  2958,  2379,  1012,  1029,  2133,

In [54]:
for sequence in generated_sequences:
    text = tokenizer.decode(sequence, clean_up_tokenization_spaces=True)
    print(text)
    print("-" * 80)

this royal throne of kings, this sceptred isle been what was were not beauty it all the and which more but that aboutt he there she having after from him over his which to so these one alone in a he place another for for as went
--------------------------------------------------------------------------------
this royal throne of kings, this sceptred isle shadow goddess rim tale story told fairy tales or and sometimes - songs several " for was to ta da her how but so did some'wi o all there too that through gone down turn wrong turned turns
--------------------------------------------------------------------------------
this royal throne of kings, this sceptred isle ness close bridge near.?... oh yes well uh er es'ya aye nay being al said spoken here to up safe in out all and but let that be is it what thing can there then
--------------------------------------------------------------------------------
this royal throne of kings, this sceptred isle later selfless and more less than from

In [56]:
num_sequences = 5
length = 40

generated_sequences = model.generate(
    input_ids=encoded_prompt,
    do_sample=True,
    max_length=length + len(encoded_prompt[0]),
    temperature=0.2,
    top_k=0,
    top_p=0.9,
    repetition_penalty=3.0,
    num_return_sequences=num_sequences,
)

for sequence in generated_sequences:
    text = tokenizer.decode(sequence, clean_up_tokenization_spaces=True)
    print(text)
    print("-" * 80)

this royal throne of kings, this sceptred isle island all over and or it something there just out for me myself like as such not no yes is still now so if i my mine you other others some more many yet gone away long far lost loss
--------------------------------------------------------------------------------
this royal throne of kings, this sceptred isle so not no it all over there yeah well ok yes oh ah still some bit fun lit out off and or as a an '. s d c t r & / etc... em -! #
--------------------------------------------------------------------------------
this royal throne of kings, this sceptred isle island all over and or it something out in for the me him his my no so i he still is will not do would could we us be our some a s many other okay yeah wells they
--------------------------------------------------------------------------------
this royal throne of kings, this sceptred isle island all like about just be me more some advice wisdom wise lost far many or so and out fort