# Chapter 16

# Exercise 8

Embedded Reber grammars were used by Hochreiter and Schmidhuber in their paper about LSTMs. They are artificial grammars that produce strings such as “BPBTSXXVPSEPE.” Check out Jenny Orr’s nice introduction to this topic. Choose a particular embedded Reber grammar (such as the one represented on Jenny Orr’s page), then train an RNN to identify whether a string respects that grammar or not. You will first need to write a function capable of generating a training batch containing about 50% strings that respect the grammar, and 50% that don’t.

In [None]:
from collections import defaultdict
from random import choice, random, sample
from tensorflow import keras

import tensorflow as tf

In [None]:
from reber import *

## Reber

In [None]:
reber_edges = ((0,1,'B'), (1,2,'T'), (1,3,'P'), (2,2,'S'), (2,4,'X'), (3,3,'T'), (3,5,'V'), (4,3,'X'), (4,6,'S'), (5,4,'P'), (5,6,'V'), (6,None,'E'))

In [None]:
node_dict = dict_from_edges(reber_edges)

In [None]:
node_dict

In [None]:
sentence = generate_sentence(node_dict)

In [None]:
sentence

In [None]:
string_from_sentence(sentence)

In [None]:
unique_letters(sentence)

In [None]:
unique_letters(reber_edges)

In [None]:
sentence_edge = sentence[3]

In [None]:
sentence_edge

In [None]:
corrupted_sentence_edge = corrupt_edge(sentence_edge, reber_edges)

In [None]:
corrupted_sentence_edge

In [None]:
corrupted_sentence = corrupt_sentence(sentence, reber_edges, 2)

In [None]:
corrupted_sentence

## Embedded Reber Grammar

In [None]:
embedded_reber_edges = ((0,1,'B'), (1,2,'T'), (1,3,'P'), (2,4,reber_edges), (3,5,reber_edges), (4,6, 'T'), (5,6,'P'), (6,None,'E'))

In [None]:
embedded_reber_edges = flatten_embedded_edges(embedded_reber_edges)

In [None]:
embedded_reber_edges

In [None]:
node_dict = dict_from_edges(embedded_reber_edges)

In [None]:
node_dict

In [None]:
sentence = generate_sentence(node_dict)

In [None]:
sentence

In [None]:
string_from_sentence(sentence)

In [None]:
corrupt_sentence(sentence, embedded_reber_edges, 3)

## Generate Training Data

We will write a generator function that produces Reber sentences. With probability 0.5 a sentence is corrupted (label 0); otherwise it is left intact (label 1). If corrupted, the number of corruptions is randomly determined.

In [None]:
def generate_reber_training_sample(max_corruptions, edges, node_dict, allowed_chars):
    """Build one labelled training example.

    A sentence is drawn with ``generate_sentence(node_dict)``. With
    probability 0.5 it is passed through ``corrupt_sentence`` using a number
    of corruptions picked uniformly from ``1..max_corruptions`` and labelled
    0; otherwise it is left untouched and labelled 1.

    Args:
        max_corruptions: Upper bound (inclusive) on the number of corruptions.
        edges: Grammar edges handed to ``corrupt_sentence``.
        node_dict: Lookup handed to ``generate_sentence``.
        allowed_chars: Alphabet handed to ``string_to_ids``.

    Returns:
        A ``(x, y)`` tuple of int8 tensors: the encoded sentence and its label.
    """
    sentence = generate_sentence(node_dict)
    is_corrupted = random() < .5
    if is_corrupted:
        how_many = choice(range(1, max_corruptions + 1))
        sentence = corrupt_sentence(sentence, edges, how_many)
    token_ids = string_to_ids(string_from_sentence(sentence), allowed_chars)
    features = tf.ragged.constant(token_ids, dtype=tf.int8, ragged_rank=0)
    target = tf.constant(0 if is_corrupted else 1, dtype=tf.int8)
    return (features, target)

In [None]:
def training_data_generator(max_corruptions, edges, n=10000):
    """Yield ``n`` labelled samples for the grammar described by ``edges``.

    The node lookup and the alphabet are derived from ``edges`` once, then
    reused for every sample.

    Args:
        max_corruptions: Upper bound on corruptions per corrupted sample.
        edges: Grammar edges.
        n: Number of samples to produce.

    Yields:
        The ``(x, y)`` pairs returned by ``generate_reber_training_sample``.
    """
    lookup = dict_from_edges(edges)
    alphabet = unique_letters(edges)
    yield from (
        generate_reber_training_sample(max_corruptions, edges, lookup, alphabet)
        for _ in range(n)
    )

## Train a model

In [None]:
max_corruptions = 3
embedding_size = 5
input_dim = len(unique_letters(embedded_reber_edges)) + 1

In [None]:
# Wrap the sample generator in a tf.data.Dataset. Each element is declared as
# a variable-length int8 vector (the encoded sentence) and a scalar int8 label.
# The lambda gives from_generator a zero-argument callable.
data = tf.data.Dataset.from_generator(lambda: training_data_generator(max_corruptions, embedded_reber_edges),
                                     output_types=(tf.int8, tf.int8), output_shapes=(tf.TensorShape([None]), tf.TensorShape([])))

In [None]:
# Group samples into padded batches of 32 and prefetch one batch.
# NOTE(review): presumably sequences are padded with 0, which the Embedding
# layers below mask via mask_zero=True — confirm string_to_ids never emits 0.
data = data.padded_batch(32).prefetch(1)

In [None]:
# GRU classifier: embedded characters -> 30-unit GRU -> single sigmoid output.
model = keras.models.Sequential()
model.add(keras.layers.Embedding(input_dim=input_dim, output_dim=embedding_size, mask_zero=True))
model.add(keras.layers.GRU(30))
model.add(keras.layers.Dense(1, activation="sigmoid"))

In [None]:
# Nadam with an explicit learning rate; binary cross-entropy pairs with the
# single sigmoid output of the model.
optimizer = keras.optimizers.Nadam(learning_rate=0.01)
model.compile(
    optimizer=optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
history = model.fit(data, epochs=20)

Let's see how well an LSTM layer works

In [None]:
# Same architecture as the GRU classifier, with an LSTM as the recurrent layer.
model_lstm = keras.models.Sequential()
model_lstm.add(keras.layers.Embedding(input_dim=input_dim, output_dim=embedding_size, mask_zero=True))
model_lstm.add(keras.layers.LSTM(30))
model_lstm.add(keras.layers.Dense(1, activation="sigmoid"))

In [None]:
# Use a dedicated optimizer instance: an optimizer keeps state (step counter,
# moment estimates) bound to the variables of the model it first trained, so
# the instance already used for the GRU model must not be reused here.
model_lstm.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.Nadam(learning_rate=0.01),
    metrics=["accuracy"],
)

In [None]:
history_lstm = model_lstm.fit(data, epochs=20)

Finally, let's try a SimpleRNN

In [None]:
# Three stacked SimpleRNN layers; only the last one collapses the sequence
# (no return_sequences) before the sigmoid output.
model_rnn = keras.models.Sequential()
model_rnn.add(keras.layers.Embedding(input_dim=input_dim, output_dim=embedding_size, mask_zero=True))
model_rnn.add(keras.layers.SimpleRNN(30, return_sequences=True))
model_rnn.add(keras.layers.SimpleRNN(30, return_sequences=True))
model_rnn.add(keras.layers.SimpleRNN(30))
model_rnn.add(keras.layers.Dense(1, activation="sigmoid"))

In [None]:
# Use a dedicated optimizer instance: an optimizer keeps state (step counter,
# moment estimates) bound to the variables of the model it first trained, so
# an instance already used for another model must not be reused here.
model_rnn.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.Nadam(learning_rate=0.01),
    metrics=["accuracy"],
)

In [None]:
history_rnn = model_rnn.fit(data, epochs=20)

# Exercise 9
_Exercise: Train an Encoder–Decoder model that can convert a date string from one format to another (e.g., from "April 22, 2019" to "2019-04-22")._

First, we need a method to generate dates in different formats

In [None]:
from tensorflow import keras
import tensorflow as tf
import tensorflow_addons as tfa

## Character level seq-to-seq model

In [None]:
from date_translation import *

In [None]:
x,y = generate_training_dates(100)

In [None]:
CHARS = list(set(''.join(x)))

In [None]:
x[0]

In [None]:
CHARS.index(x[0][0])

In [None]:
CHARS

In [None]:
preprocess_dates(x)
preprocess_dates(y)

In [None]:
# Seed NumPy's global RNG before generating the three splits.
# NOTE(review): `np` is not imported anywhere in this notebook; presumably it
# leaks in through `from date_translation import *` — confirm or add
# `import numpy as np`.
np.random.seed(42)

# Train / validation / test sets of 20000 / 2000 / 2000 generated examples.
X_train, Y_train = generate_training_data(20000)
X_valid, Y_valid = generate_training_data(2000)
X_test,  Y_test  = generate_training_data(2000)

In [None]:
X_train[0]

In [None]:
embedding_size = 32
max_char_in = tf.math.reduce_max(X_train).numpy()
max_char_out = tf.math.reduce_max(Y_train).numpy()
input_length = X_train.shape[1]
output_length = Y_train.shape[1]

In [None]:
max_char_in

In [None]:
# Encoder: embed the input character ids and summarise the sequence with an
# LSTM. The vocabulary size is max_char_in + 1 because ids run up to
# max_char_in inclusive.
encoder = keras.models.Sequential()
encoder.add(
    keras.layers.Embedding(
        input_dim=max_char_in + 1,
        output_dim=embedding_size,
        input_shape=[input_length],
    )
)
encoder.add(keras.layers.LSTM(128))

In [None]:
# Decoder: an LSTM emitting one vector per output step, followed by a softmax
# over the max_char_out + 1 possible output character ids.
decoder = keras.models.Sequential()
decoder.add(keras.layers.LSTM(128, return_sequences=True))
decoder.add(keras.layers.Dense(max_char_out + 1, activation='softmax'))

In [None]:
# Full model: the encoder output is repeated once per output position
# (RepeatVector) and fed to the decoder.
model = keras.models.Sequential()
model.add(encoder)
model.add(keras.layers.RepeatVector(output_length))
model.add(decoder)

In [None]:
optimizer = keras.optimizers.Nadam()
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
history = model.fit(X_train, Y_train, epochs=20, validation_data=(X_valid, Y_valid))

In [None]:
X_train

# Exercise 11
_Use one of the recent language models (e.g., GPT) to generate more convincing Shakespearean text._

In [None]:
from random import choice, seed
from tensorflow import keras
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer, TFTrainer, TextDataset

## Load data

In [None]:
# Fetch the Tiny Shakespeare corpus through keras.utils.get_file and read the
# whole file into a single string.
shakespeare_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

## Tokenization
GPT-2 uses byte-pair encoding

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [None]:
print(tokenizer('Good morning all!'))

## Model

In [None]:
model = TFGPT2LMHeadModel.from_pretrained("gpt2")

## Text generation - No Fine Tuning

In [None]:
shakespeare_lines = shakespeare_text.split('\n')

In [None]:
seed(142)
prompt = choice(shakespeare_lines)
print(prompt)

In [None]:
encoded_prompt = tokenizer.encode(prompt, add_special_tokens=False, return_tensors='tf')
print(encoded_prompt)

In [None]:
num_sentences = 5
max_num_tokens = 50

In [None]:
# Sample several continuations of the prompt.
# encoded_prompt is a batch of one sequence, so len(encoded_prompt) is the
# batch size (1), not the prompt length. The number of prompt tokens is the
# size of the last axis; adding it to max_num_tokens lets max_length (which
# counts prompt + generated tokens) leave room for max_num_tokens new tokens.
generated_sequences = model.generate(
    input_ids=encoded_prompt,
    max_length=max_num_tokens + encoded_prompt.shape[-1],
    do_sample=True,  # sample from the distribution instead of greedy decoding
    temperature=1.0,
    top_k=0,  # no top-k cut-off; nucleus (top_p) sampling does the filtering
    top_p=0.9,
    repetition_penalty=1.0,
    num_return_sequences=num_sentences,
)

In [None]:
# Decode each sampled sequence back to text and print it followed by an
# 80-character dashed rule.
for sequence in generated_sequences:
    sentence = tokenizer.decode(sequence, clean_up_tokenization_spaces=True)
    print(sentence, "-" * 80, sep="\n")

## Fine tuning

The training script for fine-tuning language models with Hugging Face Transformers is published [here](https://github.com/huggingface/transformers/tree/master/examples/language-modeling).

In [1]:
from tensorflow import keras

In [2]:
shakespeare_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

In [10]:
shakespeare_sequences = shakespeare_text.split('\n')

In [11]:
train_size = len(shakespeare_sequences) * 90 // 100

In [16]:
with open("train_shakespeare.txt", 'w') as f:
    f.write('\n'.join(shakespeare_sequences[:train_size]))

In [17]:
with open("valid_shakespeare.txt", 'w') as f:
    f.write('\n'.join(shakespeare_sequences[train_size:]))

In [None]:
!python run_clm.py \
--model_type gpt2-medium \
--model_name_or_path gpt2-medium \
--train_file "train_shakespeare.txt" \
--do_train \
--validation_file "valid_shakespeare.txt" \
--do_eval \
--num_train_epochs 5 \
--output_dir /gpt2_shakepeare/

In [None]:
model = TFGPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [None]:
!conda install -y -c huggingface -c conda-forge datasets

In [None]:
# `datasets` is a standalone Hugging Face package (installed by the conda
# command in the previous cell); `transformers` does not export it.
import datasets

In [None]:
# NOTE(review): this cell cannot run as written. `data_args` and
# `column_names` are not defined anywhere in this notebook, and `.map()` is
# presumably meant to be called on a loaded dataset object rather than on the
# `datasets` name imported above — it looks lifted from run_clm.py; confirm
# and load the train/validation text files first.
tokenized_datasets = datasets.map(
        tokenizer,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
    )

In [None]:
# Build a TextDataset over the corpus using blocks of 128.
# NOTE(review): `filepath` is the full downloaded corpus, not
# train_shakespeare.txt — confirm this is intended given the 90/10 split
# written out earlier.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=filepath,
    block_size=128)