In [None]:
pip install keras_nlp

In [5]:
import os
import keras_nlp
import tensorflow as tf
from tensorflow import keras

Using TensorFlow backend


In [6]:
# ---- Data pipeline ----
BATCH_SIZE = 64  # Lines grouped into each dataset batch.
SEQ_LEN = 128  # Sequence length given to the tokenizer, start packer and embedding.
MIN_TRAINING_SEQ_LEN = 450  # Lines are kept only if tf.strings.length exceeds this.

# ---- Model architecture ----
EMBED_DIM = 256  # Embedding width.
FEED_FORWARD_DIM = 256  # intermediate_dim of each TransformerDecoder.
NUM_HEADS = 3  # Attention heads per decoder layer.
NUM_LAYERS = 2  # Number of stacked decoder layers.
VOCAB_SIZE = 5_000  # Limits parameters in model.

# ---- Training ----
EPOCHS = 6  # Epochs for the main training run.

# ---- Inference ----
NUM_TOKENS_TO_GENERATE = 80  # NOTE(review): not referenced by any cell visible here.

In [7]:
# Download the SimpleBooks archive and extract it.
keras.utils.get_file(
    origin="https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip",
    extract=True,
)
# The extracted files are read from this fixed location under the home directory.
dir = os.path.expanduser("~/.keras/datasets/simplebooks/")


def _exceeds_min_length(line):
    """Return True for lines whose string length is above MIN_TRAINING_SEQ_LEN."""
    return tf.strings.length(line) > MIN_TRAINING_SEQ_LEN


# simplebooks-92 training split: drop short lines, batch, then shuffle.
# Shuffling comes after batching, so it reorders whole batches.
raw_train_ds = (
    tf.data.TextLineDataset(dir + "simplebooks-92-raw/train.txt")
    .filter(_exceeds_min_length)
    .batch(BATCH_SIZE)
    .shuffle(buffer_size=256)
)

# simplebooks-92 validation split: same length filter and batching, no shuffle.
raw_val_ds = (
    tf.data.TextLineDataset(dir + "simplebooks-92-raw/valid.txt")
    .filter(_exceeds_min_length)
    .batch(BATCH_SIZE)
)

Downloading data from https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip


In [8]:
# Train tokenizer vocabulary
# A WordPiece vocabulary is computed from the raw (batched) training lines,
# with VOCAB_SIZE as the requested vocabulary size and lowercasing enabled.
# Three reserved tokens are supplied: "[PAD]", "[UNK]" and "[BOS]". The prompt
# tensor printed later in this notebook shows "[BOS]" mapped to id 2.
vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    raw_train_ds,
    vocabulary_size=VOCAB_SIZE,
    lowercase=True,
    reserved_tokens=["[PAD]", "[UNK]", "[BOS]"],
)

In [9]:
# Build the WordPiece tokenizer from the vocabulary trained above.
# `sequence_length=SEQ_LEN` fixes the length of every tokenized output, and
# `lowercase=True` matches the lowercasing used when the vocabulary was computed.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    sequence_length=SEQ_LEN,
    lowercase=True,
)

In [10]:
# packer adds a start token
# The start value is the id of "[BOS]" looked up in the tokenizer, and the
# packed result has length SEQ_LEN.
start_packer = keras_nlp.layers.StartEndPacker(
    sequence_length=SEQ_LEN,
    start_value=tokenizer.token_to_id("[BOS]"),
)


def preprocess(inputs):
    """Turn raw text into a (features, labels) pair for language-model training.

    The text is tokenized with the notebook-level `tokenizer`. The labels are
    the token ids themselves; the features are the same ids passed through
    `start_packer`, which adds the start token.

    Args:
        inputs: raw text as accepted by `tokenizer`.

    Returns:
        A tuple `(features, labels)`.
    """
    token_ids = tokenizer(inputs)
    return start_packer(token_ids), token_ids


# Tokenize both splits into (features, labels) pairs, running the map in
# parallel and prefetching results.
train_ds = raw_train_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)
val_ds = raw_val_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_ds.prefetch(tf.data.AUTOTUNE)

In [11]:
# Build the language model with the Keras functional API:
# token ids -> token+position embedding -> NUM_LAYERS decoder layers -> Dense.
# The input takes int32 token ids with an unspecified sequence length.
inputs = keras.layers.Input(shape=(None,), dtype=tf.int32)
# Embedding.
# Token and position embeddings of width EMBED_DIM for sequences up to SEQ_LEN.
# mask_zero=True — presumably so that positions holding id 0 (padding) are masked.
embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=SEQ_LEN,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)
x = embedding_layer(inputs)
# Transformer decoders.
# A fresh decoder layer is created on every iteration and applied to the
# running activations, giving a stack of NUM_LAYERS layers.
for _ in range(NUM_LAYERS):
    decoder_layer = keras_nlp.layers.TransformerDecoder(
        num_heads=NUM_HEADS,
        intermediate_dim=FEED_FORWARD_DIM,
    )
    x = decoder_layer(x)  # Giving one argument only skips cross-attention.
# Output.
# The Dense layer has no activation, so the model emits one raw score per
# vocabulary entry at each position; the loss and metric below are both
# configured with from_logits=True to match.
outputs = keras.layers.Dense(VOCAB_SIZE)(x)
model = keras.Model(inputs=inputs, outputs=outputs)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Perplexity is configured with mask_token_id=0, the same id that mask_zero targets.
perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0)
model.compile(optimizer="adam", loss=loss_fn, metrics=[perplexity])

In [12]:
# Show the layers of the model with their output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 token_and_position_embeddi  (None, None, 256)         1312768   
 ng (TokenAndPositionEmbedd                                      
 ing)                                                            
                                                                 
 transformer_decoder (Trans  (None, None, 256)         394749    
 formerDecoder)                                                  
                                                                 
 transformer_decoder_1 (Tra  (None, None, 256)         394749    
 nsformerDecoder)                                                
                                                                 
 dense_4 (Dense)             (None, None, 5000)        128500

In [13]:
# Train for EPOCHS epochs on the training set; val_ds is passed as
# `validation_data` so validation loss and perplexity are reported each epoch.
model.fit(train_ds, validation_data=val_ds, verbose=1, epochs=EPOCHS)

Epoch 1/6
3169/3169 - 432s - loss: 4.5635 - perplexity: 96.2995 - val_loss: 4.1293 - val_perplexity: 62.8110 - 432s/epoch - 136ms/step
Epoch 2/6
3169/3169 - 251s - loss: 4.0547 - perplexity: 57.8930 - val_loss: 3.9864 - val_perplexity: 54.3020 - 251s/epoch - 79ms/step
Epoch 3/6
3169/3169 - 248s - loss: 3.9415 - perplexity: 51.6923 - val_loss: 3.9496 - val_perplexity: 52.3990 - 248s/epoch - 78ms/step
Epoch 4/6
3169/3169 - 247s - loss: 3.8794 - perplexity: 48.5792 - val_loss: 3.8851 - val_perplexity: 49.0976 - 247s/epoch - 78ms/step
Epoch 5/6
3169/3169 - 249s - loss: 3.8345 - perplexity: 46.4447 - val_loss: 3.8583 - val_perplexity: 47.9225 - 249s/epoch - 78ms/step
Epoch 6/6
3169/3169 - 247s - loss: 3.8013 - perplexity: 44.9276 - val_loss: 3.8355 - val_perplexity: 46.7612 - 247s/epoch - 78ms/step


<keras.src.callbacks.History at 0x7c7bca940d30>

In [14]:
# The "packer" layer adds the [BOS] token for us.
# Tokenizing a single empty string and packing it yields the generation prompt.
prompt_tokens = start_packer(tokenizer([""]))
# Display the prompt tensor as the cell output.
prompt_tokens

<tf.Tensor: shape=(1, 128), dtype=int32, numpy=
array([[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int32)>

In [15]:

def next(prompt, cache, index):
    """Next-token callback handed to the keras_nlp samplers.

    Runs the notebook-level `model` on the whole `prompt` and selects the
    logits at position `index - 1`.

    Args:
        prompt: token ids fed to the model.
        cache: passed through unchanged; this function does not use it.
        index: position currently being generated.

    Returns:
        A tuple `(logits, hidden_states, cache)` where `hidden_states` is
        always None.
    """
    step_logits = model(prompt)[:, index - 1, :]
    # Hidden states are ignored for now; only contrastive search needs them.
    return step_logits, None, cache


In [16]:
# Generate text with the greedy sampler.
sampler = keras_nlp.samplers.GreedySampler()
# index=1: start sampling immediately after the [BOS] token.
output_tokens = sampler(next=next, prompt=prompt_tokens, index=1)
txt = tokenizer.detokenize(output_tokens)
print(f"Greedy search generated text: \n{txt}\n")

Greedy search generated text: 
[b'[BOS] " i have been thinking about the matter over , " he said , " i have been thinking of the matter over . i have been thinking about the matter over , and i have been thinking that i have been thinking of the matter . i have been thinking about the matter over , and i have been thinking that i have been thinking of the matter . i have been thinking of the matter over , and i have been thinking of the matter . i have been thinking of the matter over , and i have been thinking about the matter over . i have been thinking about the matter over . i have been thinking about the matter over . i have']



In [17]:
# Generate text with beam search using 10 beams.
sampler = keras_nlp.samplers.BeamSampler(num_beams=10)
# index=1: generation begins at the position right after the first prompt token.
output_tokens = sampler(next=next, prompt=prompt_tokens, index=1)
txt = tokenizer.detokenize(output_tokens)
print(f"Beam search generated text: \n{txt}\n")

Beam search generated text: 
[b'[BOS] " well , " he said , " i don \' t know , but i don \' t know anything about it . i don \' t know anything about it . i don \' t know anything about it . i don \' t know anything about it . i don \' t know anything about it . i don \' t know anything about it . i don \' t know anything about it . i don \' t know anything about it . i don \' t know anything about it . i don \' t know anything about it . i don \' t know anything about it . i don \' t know anything about it . i don \' t']



In [18]:
# Generate text with the random sampler.
# NOTE(review): no seed is passed, so the output presumably differs between runs.
sampler = keras_nlp.samplers.RandomSampler()
# index=1: generation begins at the position right after the first prompt token.
output_tokens = sampler(next=next, prompt=prompt_tokens, index=1)
txt = tokenizer.detokenize(output_tokens)
print(f"Random search generated text: \n{txt}\n")

Random search generated text: 
[b'[BOS] the bow feather , his stomach was no longer exposed to the precinctiest of notice . puzier was as a lagdiot . but the black creature was the only chance ; great claws being caught by the bows and claws taken from toadstool , that her crying , but gradually , except a fright which was instantly hurled against the stick . a strong pole is like a great run overhead , and the greater portion of her limbs pushed off again . it was only a good swim . hardy was green , the hair man very bare , and he was likely to grow hoisterous . [PAD] therefore he was']



In [19]:
# Generate text with top-k sampling, k=10.
sampler = keras_nlp.samplers.TopKSampler(k=10)
# index=1: generation begins at the position right after the first prompt token.
output_tokens = sampler(next=next, prompt=prompt_tokens, index=1)
txt = tokenizer.detokenize(output_tokens)
print(f"Top-K search generated text: \n{txt}\n")

Top-K search generated text: 
[b'[BOS] the men had already told the story of a man . the man who had been in this room , had told him to give him a great deal of time to learn the business of the men , and then he was sure that they were to learn what he wanted . the people had told them they had been so long in the forest , that he would have done something very well and could not be done to the end ; they told him that the men would be killed by the spaniards ; they would be killed , and the men would be killed . [PAD] also the people said that they would come back from their homes , and would be destroyed']



In [20]:
# Generate text with top-p sampling, p=0.5.
sampler = keras_nlp.samplers.TopPSampler(p=0.5)
# index=1: generation begins at the position right after the first prompt token.
output_tokens = sampler(next=next, prompt=prompt_tokens, index=1)
txt = tokenizer.detokenize(output_tokens)
print(f"Top-P search generated text: \n{txt}\n")

Top-P search generated text: 
[b'[BOS] " my dear sir , " he said , " i have no fear of your being associated , and i have not been able to see any other knight of the court . i have seen that , at least , and were killed , i have heard of the king \' s senor , but i have heard the king \' s words of my brother , i have seen him , and that , if he were here , i should have said , that i have been a very strong , strong , noble , and well , i should be glad to see him , as i have been ; but i will have been here']



In [21]:

class TopKTextGenerator(keras.callbacks.Callback):
    """A callback to generate text from a trained model using top-k.

    At the end of every epoch the callback samples a sequence with a
    `keras_nlp.samplers.TopKSampler` and prints the detokenized result.
    It relies on the notebook-level names `next`, `prompt_tokens` and
    `tokenizer` being defined before training starts.

    Args:
        k: forwarded to `keras_nlp.samplers.TopKSampler`.
    """

    def __init__(self, k):
        # Initialise the base Callback first so the attributes it manages are
        # set up before Keras touches this instance.
        super().__init__()
        self.sampler = keras_nlp.samplers.TopKSampler(k)

    def on_epoch_end(self, epoch, logs=None):
        """Sample from the model and print the generated text.

        Args:
            epoch: index of the epoch that just finished (unused).
            logs: metric values for the epoch (unused).
        """
        output_tokens = self.sampler(
            next=next,
            prompt=prompt_tokens,
            index=1,  # Same starting index as the standalone sampler cells.
        )
        txt = tokenizer.detokenize(output_tokens)
        print(f"Top-K search generated text: \n{txt}\n")


# Dummy training loop to demonstrate callback: two epochs over a single
# batch, with text generated at the end of each epoch.
text_generation_callback = TopKTextGenerator(k=10)
model.fit(
    train_ds.take(1),
    verbose=2,
    epochs=2,
    callbacks=[text_generation_callback],
)

Epoch 1/2
Top-K search generated text: 
[b'[BOS] the men were at work in a position of extreme age , who were to be found in the neighbourhood of the combination of the preserver in the country ; and the men were to be taken in the rear of their own . the men , however , was a large body of french cavalry , and were therefore ordered for the spaniards to enter the town ; and as soon as possible , the spaniards , as to their wives ; and men , in the neighbourhood of the village , were to retire ; while a party of natives , to a considerable distance , and to make a great resistance , to the spaniards , the portuguese']

1/1 - 14s - loss: 3.7706 - perplexity: 43.4882 - 14s/epoch - 14s/step
Epoch 2/2
Top-K search generated text: 
[b'[BOS] when the king saw that his majesty had been engaged in a great effort to escape , and then went to the palace of his brother . he took the greatest pains and , as soon as the princess got his arms to the palace , he gave her her permission to take his pl

<keras.src.callbacks.History at 0x7c7bdb9b9e10>