In [2]:
!pip install transformers
!pip install datasets
!pip install keras_nlp --upgrade

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.3 MB/s[0m eta [36m0:00:

In [3]:
import keras_nlp
import numpy as np
import tensorflow as tf
from keras.layers import Concatenate, TextVectorization
from tensorflow import keras
from keras import layers
from datasets import load_dataset
from transformers import BertTokenizerFast


class AttentionHead(layers.Layer):

    def __init__(self, embedding_space_dimension):
        super().__init__()
        self.embedding_space_dimension = embedding_space_dimension
        self.q_mapping = layers.Dense(embedding_space_dimension)
        self.k_mapping = layers.Dense(embedding_space_dimension)
        self.v_mapping = layers.Dense(embedding_space_dimension)

    def call(self, x):
        q = self.q_mapping(x)
        v = self.v_mapping(x)
        k = self.k_mapping(x)
        return self.scaled_dot_product_attention(q, k, v)

    def scaled_dot_product_attention(self, q, k, v):
        w = tf.matmul(q, k, transpose_b=True)
        d_k = tf.cast(tf.shape(k)[-1], tf.float32)
        w = w / tf.sqrt(d_k)
        w = self.mask_attn_weights(w)
        w = tf.nn.softmax(w)
        o = tf.matmul(w, v)
        return o

    def mask_attn_weights(self, w):
        shape = tf.shape(w)
        n = shape[1]
        attention_mask = self.attention_mask(n, w.dtype)
        attention_mask = tf.reshape(attention_mask, [1, n, n])
        m = tf.reshape(attention_mask, [1, n, n])
        w = w * m - tf.cast(1e11, w.dtype) * (1 - m)
        return w

    def attention_mask(self, n, dtype):
        """
        1's positioned in the lower triangular part, starting from the bottom-right corner.
        example:
          M =  [ 1 0 0
                1 1 0
                1 1 1]
        this means you can do something like new_value = old* m - infinity * (1-m), when m equals 1 you keep the value else
        send it to - infinity
        I think OpenAI (https://github.com/openai/gpt-2/blob/master/src/model.py) did this masking instead of a simple addition/substraction because you don't control the norm of the weights and probably it leads to better training if we get completely rid of the weights when the mask applies, I am not sure
        """
        i = tf.range(n)[:, None]
        j = tf.range(n)
        m = i >= j
        return tf.cast(m, dtype)


class MultiAttentionHead(layers.Layer):

    def __init__(self, embedding_space_dimension, numb_heads):
        super().__init__()
        self.attention_heads = [AttentionHead(embedding_space_dimension) for _ in range(numb_heads)]
        self.linear = layers.Dense(embedding_space_dimension)

    def call(self, x):
        heads = Concatenate()([attention_head(x) for attention_head in self.attention_heads])
        return self.linear(heads)


class TransformerDecoderBlock(layers.Layer):

    def __init__(self, embedding_space_dimension, numb_heads, ffn_dimension):
        super().__init__()
        self.self_attention = MultiAttentionHead(embedding_space_dimension, numb_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ffn_dimension, activation="relu"), layers.Dense(embedding_space_dimension), ])
        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(0.1)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout2 = layers.Dropout(0.1)

    def call(self, x):
        x = self.norm1(x + self.dropout1(self.self_attention(x)))
        x = self.norm2(x + self.dropout2(self.ffn(x)))
        return x


class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions


class GPT(layers.Layer):
    def __init__(self, maxlen, embedding_space_dimension, numb_heads, vocab_size, num_layers=1):
        super().__init__()
        self.transformer_decoder_blocks = keras.Sequential([TransformerDecoderBlock(
            embedding_space_dimension=embedding_space_dimension,
            numb_heads=numb_heads,
            ffn_dimension=embedding_space_dimension,
        ) for _ in range(num_layers)])
        self.input_embedding = TokenAndPositionEmbedding(maxlen, vocab_size, embedding_space_dimension)
        self.prediction_output = keras.layers.Dense(vocab_size)

    def call(self, x):
        x = self.input_embedding(x)
        x = self.transformer_decoder_blocks(x)
        o = self.prediction_output(x)
        return o

maxlen = 121
projection_dimension = 256
n_heads = 2
vocab_size = 30522
nb_layers = 2

gpt = GPT(maxlen, projection_dimension, n_heads, vocab_size, nb_layers)
inputs = layers.Input(shape=(maxlen,), dtype=tf.int32)
model = keras.Model(name="gpt", inputs=inputs, outputs=gpt(inputs))

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True)  # cf https://saturncloud.io/blog/how-does-tensorflow-sparsecategoricalcrossentropy-work/
model.compile("adam", loss=loss_fn)

print("************************************")
print("GPT model compiled successfully :)")
print(model.summary())


Using TensorFlow backend
************************************
GPT model compiled successfully :)
Model: "gpt"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 121)]             0         
                                                                 
 gpt (GPT)                   (None, 121, 30522)        17006138  
                                                                 
Total params: 17006138 (64.87 MB)
Trainable params: 17006138 (64.87 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [4]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  9396k      0  0:00:08  0:00:08 --:--:-- 16.6M


In [5]:
# same as in https://keras.io/examples/generative/text_generation_with_miniature_gpt/
import os
filenames = []
dirs = [
    "aclImdb/train/pos",
    "aclImdb/train/neg",
    "aclImdb/test/pos",
    "aclImdb/test/neg",
]
for dir in dirs:
    for f in os.listdir(dir):
        filenames.append(os.path.join(dir, f))


In [6]:
from transformers import BertTokenizerFast
dataset = load_dataset("text", data_files=filenames)['train']
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
encoded_dataset = dataset.map(lambda examples: tokenizer(examples['text'], truncation=True, padding='max_length', max_length = maxlen + 1))
final_dataset = np.reshape(np.array(encoded_dataset["input_ids"]), [len(dataset), maxlen + 1])
inputs = final_dataset[:,:-1]
outputs = final_dataset[:, 1:]

Resolving data files:   0%|          | 0/50000 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [42]:
model.fit(x=inputs, y=outputs, epochs=60)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
 149/1563 [=>............................] - ETA: 2:26 - loss: 2.9319

KeyboardInterrupt: ignored

In [43]:
text = "honestly"
input_padded_tokens = tokenizer(text, truncation=True, padding='max_length', max_length = maxlen)

In [44]:
input_token_ids = np.array([input_padded_tokens["input_ids"]])

In [45]:
def next(prompt, cache, index):
    logits = model(prompt)[:, index - 1, :]
    hidden_states = None
    return logits, hidden_states, cache

In [46]:

sampler = keras_nlp.samplers.GreedySampler()

In [47]:
output_tokens = sampler(
    next=next,
    prompt=input_token_ids,
    index=len(np.nonzero(input_padded_tokens["input_ids"])[0]) - 1,
)
txt = tokenizer.decode(output_tokens[0])
print(f"Generated Text: \n{txt}\n")

Generated Text: 
[CLS] honestly, i'm not sure what to expect from this movie, but i was expecting something to be disappointed. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i was wrong. i



In [48]:
sampler = keras_nlp.samplers.TopKSampler(k=10)
output_tokens = sampler(
    next=next,
    prompt=input_token_ids,
    index=len(np.nonzero(input_padded_tokens["input_ids"])[0]) - 1,
)
txt = tokenizer.decode(output_tokens[0])
print(f"Top-K search generated text: \n{txt}\n")

Top-K search generated text: 
[CLS] honestly, this is an excellent example of what a good film that is, i think, this is a good film. it is a film that is about some people who don't want to say it.... < br / > < br / > the film centers around a group of convicted " killing a young woman, played by amy adams, who was forced to quit soccer on freedom, and then find out what happened " to come out " that is not the case. it's a shame that it is, in fact, it's not just another movie in the same

