# Installing required packages and downloading dataset

In [1]:
!pip install torch transformers datasets -q
!pip install kaggle -q



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ibis-framework 7.1.0 requires pyarrow<15,>=2, but you have pyarrow 15.0.0 which is incompatible.
pandas-gbq 0.19.2 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.[0m[31m
[0m

In [2]:
!mkdir -p ~/.kaggle && mv kaggle.json ~/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

In [3]:
!kaggle datasets download kingburrito666/shakespeare-plays


Downloading shakespeare-plays.zip to /content
  0% 0.00/4.55M [00:00<?, ?B/s]
100% 4.55M/4.55M [00:00<00:00, 72.6MB/s]


In [4]:
!unzip shakespeare-plays.zip -d shakespeare_dataset


Archive:  shakespeare-plays.zip
  inflating: shakespeare_dataset/Shakespeare_data.csv  
  inflating: shakespeare_dataset/alllines.txt  
  inflating: shakespeare_dataset/william-shakespeare-black-silhouette.jpg  


In [5]:
!pip install transformers[torch] -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m279.7/279.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h

# Preprocessing data

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('/content/shakespeare_dataset/Shakespeare_data.csv')



In [7]:
# Extract the 'PlayerLine' column
player_lines = df['PlayerLine'].dropna()

# Split the data into training and test sets
train_lines, test_lines = train_test_split(player_lines, test_size=0.1, random_state=42)




In [8]:
train_lines_to_save = train_lines[:10000]

In [9]:
test_lines_to_save = test_lines[:3000]

In [10]:
# Save the train and test sets to separate files
train_lines_to_save.to_csv('train_dataset.txt', index=False, header=False)
test_lines_to_save.to_csv('test_dataset.txt', index=False, header=False)

# Fine-Tune GPT-2

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch

def fine_tune_shakespeare(dataset_path, model_checkpoint='gpt2', epochs=40):
    tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint)
    model = GPT2LMHeadModel.from_pretrained(model_checkpoint)

    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=dataset_path,
        block_size=128)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )

    training_args = TrainingArguments(
        output_dir="./GPT2_shakespeare",
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=50,
        save_steps=10_000,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()
    return model, tokenizer

model, tokenizer = fine_tune_shakespeare('train_dataset.txt')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Save the model and tokenizer

In [33]:
 # Save the model and the tokenizer
model.save_pretrained("./gpt2_shakespeare")
tokenizer.save_pretrained("./gpt2_shakespeare")

('./gpt2_shakespeare/tokenizer_config.json',
 './gpt2_shakespeare/special_tokens_map.json',
 './gpt2_shakespeare/vocab.json',
 './gpt2_shakespeare/merges.txt',
 './gpt2_shakespeare/added_tokens.json')

# Inference

In [34]:
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2_shakespeare")
model = GPT2LMHeadModel.from_pretrained("./gpt2_shakespeare")


In [39]:
test_lines.iloc[0]

'That hath deprived me of your grace and favour,'

In [40]:
prompt = test_lines.iloc[0]
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Generate text
output = model.generate(input_ids, max_length=200, num_return_sequences=1)
print(tokenizer.decode(output[0], skip_special_tokens=True))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


That hath deprived me of your grace and favour, I will"
"And, with a flourish of your hand,"
"And, with a flourish of your tongue,"
"And, with a flourish of your tongue,"
"And, with a flourish of your hand,"
"And, with a flourish of your hand,"
"And, with a flourish of your hand,"
"And, with a flourish of your hand,"
"And, with a flourish of your hand,"
"And, with a flourish of your hand,"
"And, with a flourish of your hand,"
"And, with a flourish of your hand,"
"And, with a flourish of your hand,"
"And, with a flourish of your hand,"
"And, with a flourish of your hand,"
"And, with a flourish of your hand,"
"And, with a flourish of your hand,"
"And, with a flourish of your hand,"


# Second Approach : Using a character-based RNN

In [18]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

import numpy as np
import os
import time

In [19]:
vocab = sorted(set(" ".join(train_lines)))
print(f'{len(vocab)} unique characters')

75 unique characters


In [20]:
text = " ".join(train_lines)

Before training, we need to convert the strings to a numerical representation.

The `preprocessing.StringLookup` layer can convert each character into a numeric ID

In [21]:
ids_from_chars = preprocessing.StringLookup(
    vocabulary=list(vocab), mask_token=None)

Since the goal is to generate text, it will also be important to invert this representation and recover human-readable strings from it. For this you can use `preprocessing.StringLookup(..., invert=True)`.

In [22]:
chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

In [23]:
def text_from_ids(ids):
  return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)

In [24]:
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
all_ids

<tf.Tensor: shape=(3930395,), dtype=int64, numpy=array([44, 58, 69, ..., 52, 54, 10])>

Each input sequence will contain `seq_length` characters from the text.

For each input sequence, the corresponding targets contain the same length of text, except shifted one character to the right.

So we break the text into chunks of `seq_length+1`. For example, say `seq_length` is 4 and our text is "Hello". The input sequence would be "Hell", and the target sequence "ello".

To do this, we first use the `tf.data.Dataset.from_tensor_slices` function to convert the text vector into a stream of character indices.

In [25]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [26]:
ids_dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>

In [27]:
seq_length = 100
examples_per_epoch = len(text)//(seq_length+1)

In [28]:
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

For training we'll need a dataset of `(input, label)` pairs. Where input and label are sequences. At each time step the input is the current character and the label is the next character.

Here's a function that takes a sequence as input, duplicates, and shifts it to align the input and label for each timestep:

In [29]:
def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

In [30]:
dataset = sequences.map(split_input_target)

In [31]:
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b"Without the king's will or the state's allowance, And by me, had not our hap been bad. This tribute "
Target: b"ithout the king's will or the state's allowance, And by me, had not our hap been bad. This tribute f"


We used `tf.data` to split the text into manageable sequences. But before feeding this data into the model, we need to shuffle the data and pack it into batches.

In [32]:
BATCH_SIZE = 64

BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

###Build Model

This model has three layers:

* `tf.keras.layers.Embedding`: The input layer. A trainable lookup table that will map each character-ID to a vector with `embedding_dim` dimensions;
* `tf.keras.layers.GRU`: A type of RNN with size `units=rnn_units`.
* `tf.keras.layers.Dense`: The output layer, with `vocab_size` outputs. It outputs one logit for each character in the vocabulary. These are the log-likelihood of each character according to the model.

In [33]:
vocab_size = len(vocab)

embedding_dim = 256

rnn_units = 1024

In [35]:
class MyModel(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__(self)
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

For each character the model looks up the embedding, runs the GRU one timestep with the embedding as input, and applies the dense layer to generate logits predicting the log-likelihood of the next character:

In [38]:
model2 = MyModel(
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [40]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model2(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 76) # (batch_size, sequence_length, vocab_size)


In [41]:
model2.summary()

Model: "my_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     multiple                  19456     
                                                                 
 gru_1 (GRU)                 multiple                  3938304   
                                                                 
 dense_1 (Dense)             multiple                  77900     
                                                                 
Total params: 4035660 (15.39 MB)
Trainable params: 4035660 (15.39 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


###Train the model

At this point the problem can be treated as a standard classification problem. Given the previous RNN state, and the input this time step, predict the class of the next character.

Attaching an optimizer, and a loss function.

The standard `tf.keras.losses.sparse_categorical_crossentropy` loss function works in this case because it is applied across the last dimension of the predictions.

Because our model returns logits, we need to set the `from_logits` flag.

In [42]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [43]:
model2.compile(optimizer='adam', loss=loss)

In [44]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [45]:
EPOCHS = 20

In [46]:
history = model2.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


###Generate text


The simplest way to generate text with this model is to run it in a loop, and keep track of the model's internal state as we execute it

The following makes a single step prediction:

In [47]:
class OneStep(tf.keras.Model):
  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    print(skip_ids)
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [None]:
def beautiful_print(text):
    text_str = text.numpy()[0].decode("utf-8")
    words = text_str.split()

    lines = []
    current_line = []
    word_count = 0

    for word in words:
        current_line.append(word)
        word_count += 1
        if word.endswith(('.', '?', '!')) and 20 <= word_count <= 30:
            lines.append(" ".join(current_line))
            current_line = []
            word_count = 0
        elif word_count > 30:
            lines.append(" ".join(current_line))
            current_line = []
            word_count = 0

    if current_line:
        lines.append(" ".join(current_line))

    beautiful_text = "\n".join(lines)
    print(beautiful_text)


In [76]:
def generate_text(one_step_model, prompt):
  states = None
  next_char = tf.constant([prompt])
  result = [next_char]

  for n in range(1000):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

  result = tf.strings.join(result)

  beautiful_print(result)

In [73]:
model2_load = MyModel(
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [74]:
model2_load.load_weights("/content/training_checkpoints/ckpt_20")

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7b812e8f7310>

In [78]:
one_step_model = OneStep(model2_load, chars_from_ids, ids_from_chars)

tf.Tensor([[0]], shape=(1, 1), dtype=int64)
SparseTensor(indices=tf.Tensor([[0]], shape=(1, 1), dtype=int64), values=tf.Tensor([-inf], shape=(1,), dtype=float32), dense_shape=tf.Tensor([76], shape=(1,), dtype=int64))


In [79]:
generate_text(one_step_model, "To be or not to be, that's the ")

To be or not to be, that's the musto must be great: Who in prophric shall find him not: Be quite out to interprene her!
O hating whether me to thy love? Good uncle, I must needs to London with thee whatsoe'er! Wert thou dread or for my part, Lord York, or no, lord abused shoot
is inford! To lip his cave and master, where is great heasty To help, madness, I cannot sing in Argas' with close boys of trifles here.
Wilt have of you being 'Tell me, fly, fly, from that very hour! I'll have my judge mine honesty: DON Are more fear me my sooner-bodies.
The which you rush'd and to behold this nights themselves They may possession from the blood of his pohility. We will drink to me, fiend, my brother!
Escans, shepherd, while it is twit 'ord of him. sword in her clamours how it was By his that they have power upon a rich Fellow, Comwell, charge thy imposition: we
are breed True one on Brutus, let the infamy for thy pobe. Those that shall survice this That or our power were most needs one worst or 

Looking at the generated text, we'll see the model knows when to capitalize, make paragraphs and imitates a Shakespeare-like writing vocabulary. With the small number of training epochs, it has not yet learned to form coherent sentences.

In [None]:
#We can also save and restore the generator model as follows
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')