# GPT

## INITIALIZATION

### Imports

In [73]:
import datetime
import json
import string

import numpy as np
import regex as re
import tensorflow as tf
from IPython.display import display, HTML
from tensorflow.keras import (
    layers,
    models,
    losses,
    callbacks
)

### Functions

In [74]:
def pad_punctuation(s):
    """
    Adds spaces around punctuation symbols in a string and collapses repeated spaces.

    Every character listed in `string.punctuation` is surrounded by a single
    space on each side; afterwards any run of two or more spaces is reduced
    to one. The result is not stripped, so a string that ends (or starts) with
    punctuation keeps one trailing (or leading) space.

    Parameters:
    ----------
    s : str
        The input string to process.

    Returns:
    -------
    str
        The processed string with spaces around punctuation and collapsed spaces.

    Example:
    --------
    >>> pad_punctuation("Hello,world! This is a test.")
    'Hello , world ! This is a test . '

    Notes:
    ------
    - The punctuation set is escaped before being placed inside the character
      class. Without escaping, the backslash in `string.punctuation` is read as
      an escape for the following `]`, so a literal backslash is never matched.
    """
    # Build a character class that matches any single punctuation symbol.
    punctuation_class = f'([{re.escape(string.punctuation)}])'

    # Surround every matched punctuation symbol with spaces.
    padded = re.sub(punctuation_class, r' \1 ', s)

    # Collapse every run of consecutive spaces into a single space.
    return re.sub(' +', ' ', padded)

def prepare_inputs(text):
    """
    Turns a batch of raw text into (input, target) token tensors for next-token prediction.

    The text is tokenized with the module-level `vectorize_layer`, and the
    resulting token matrix is split into two views shifted by one position, so
    that the target at each position is the token following the input token.

    Args:
        text (tf.Tensor): A batch of raw text strings.

    Returns:
        tuple: A tuple (x, y) where:
            - x (tf.Tensor): Every token of each sequence except the last one.
            - y (tf.Tensor): Every token of each sequence except the first one.
          Both have one column fewer than the vectorizer output.
    """
    # The vectorizer is fed a trailing axis of size 1, then tokenizes each string.
    token_ids = vectorize_layer(tf.expand_dims(text, -1))

    # Inputs drop the final token; targets drop the first token.
    model_inputs = token_ids[:, :-1]
    model_targets = token_ids[:, 1:]
    return model_inputs, model_targets

def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """
    Builds a causal attention mask, replicated once per sequence in the batch.

    Entry `[b, i, j]` is "allowed" exactly when `i >= j - n_src + n_dest`. When
    `n_dest == n_src` this reduces to `i >= j`: destination position `i` may
    attend to source positions up to and including itself, never to later ones.

    Args:
        batch_size (int or scalar tensor): Number of sequences in the batch.
        n_dest (int): Number of destination (query) positions.
        n_src (int): Number of source (key) positions.
        dtype (tf.DType): Type the boolean mask is cast to (e.g. `tf.bool`, `tf.int32`).

    Returns:
        tf.Tensor: Mask of shape `(batch_size, n_dest, n_src)`; allowed positions
                   are the cast of True, disallowed positions the cast of False.
    """
    # Column vector of destination indices and row vector of source indices.
    dest_idx = tf.range(n_dest)[:, None]  # (n_dest, 1)
    src_idx = tf.range(n_src)             # (n_src,)

    # Broadcasting the comparison yields the (n_dest, n_src) causality pattern.
    # The `- n_src + n_dest` term aligns the last source position with the
    # last destination position when the two lengths differ.
    allowed = dest_idx >= src_idx - n_src + n_dest

    # Cast to the requested dtype and prepend a singleton batch axis.
    single_mask = tf.reshape(tf.cast(allowed, dtype), [1, n_dest, n_src])

    # Repeat counts per axis: batch_size along the batch axis, 1 along the others.
    repeats = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(single_mask, repeats)  # (batch_size, n_dest, n_src)

def print_probs(info, vocab, top_k=5):
    """
    Shows, for each generation step, the prompt highlighted by attention and the
    top-k candidate next words with their probabilities.

    Args:
        info (list of dicts): One dictionary per step, each containing:
            - 'prompt' (str): The prompt text; it is split on whitespace into words.
            - 'atts' (np.ndarray): Attention scores; they are averaged over axis 0
              to obtain one score per prompt word.
            - 'word_probs' (np.ndarray): Probability of each vocabulary entry.
        vocab (list of str): Vocabulary indexed consistently with 'word_probs'.
        top_k (int, optional): Number of most probable words to print. Defaults to 5.

    Output:
        Displays the prompt as HTML, with each word's background opacity set to its
        mean attention divided by the largest mean attention of that prompt.
        Prints the top-k words with their probabilities as percentages.
    """
    for entry in info:
        # Average the attention once per prompt instead of once per word.
        mean_atts = np.mean(entry['atts'], axis=0)
        # Largest mean score, used to scale opacities into [0, 1]. The fallback
        # only matters when there are no scores, in which case no span is built.
        peak = max(mean_atts) if len(mean_atts) else 1.0

        # One <span> per word; zip stops at the shorter of words / scores.
        spans = [
            '<span style="background-color:rgba(135,206,250,'
            + str(att_score / peak)
            + ');">'
            + word
            + '</span>'
            for word, att_score in zip(entry['prompt'].split(), mean_atts)
        ]
        display(HTML(' '.join(spans)))

        # A single argsort gives both the top-k indices and their probabilities.
        word_probs = entry['word_probs']
        top_indices = np.argsort(word_probs)[::-1][:top_k]

        # A distinct loop name avoids shadowing the outer loop variable.
        for idx in top_indices:
            print(f'{vocab[idx]}:    \t{np.round(100 * word_probs[idx], 2)}%')

        print('-----------\n')  # Separator between prompts.

## PREPARE DATA

### Download

In [3]:
#!kaggle datasets download -d zynicide/wine-reviews

### Load

In [13]:
# Read explicitly as UTF-8 (the standard JSON encoding) rather than relying on
# the platform's default text encoding.
with open('./datasets/wine-reviews/winemag-data-130k-v2.json', encoding='utf-8') as json_data:
    wine_data = json.load(json_data)

In [None]:
# Display a single raw record (index 10) of the loaded data.
wine_data[10]

### Filter

In [9]:
# Keep only the records in which every required field is present, and flatten
# each of them into one ' : '-separated string that starts with 'wine_review'.
filtered_data = [
    ' : '.join(
        ('wine_review', x['country'], x['province'], x['variety'], x['description'])
    )
    for x in wine_data
    if all(
        x[field] is not None
        for field in ('country', 'province', 'variety', 'description')
    )
]

In [10]:
# Display one flattened review string (index 10) to check the format.
filtered_data[10]

'wine_review : US : California : Cabernet Sauvignon : Soft, supple plum envelopes an oaky structure in this Cabernet, supported by 15% Merlot. Coffee and chocolate complete the picture, finishing strong at the end, resulting in a value-priced wine of attractive flavor and immediate accessibility.'

In [12]:
# Report how many reviews remain after filtering.
n_wines = len(filtered_data)
print(f'{n_wines} wine reviews loaded')

129907 recepies loaded


## TOKENIZATION

### Pad the punctuation

In [22]:
%%time
# ~3s
# Apply pad_punctuation to every filtered review.
text_data = list(map(pad_punctuation, filtered_data))

CPU times: user 3.56 s, sys: 10.2 ms, total: 3.57 s
Wall time: 3.78 s


In [23]:
# Keep one padded review (index 25) as an example and display it.
example_data = text_data[25]
example_data

'wine _ review : US : California : Pinot Noir : Oak and earth intermingle around robust aromas of wet forest floor in this vineyard - designated Pinot that hails from a high - elevation site . Small in production , it offers intense , full - bodied raspberry and blackberry steeped in smoky spice and smooth texture . '

### Convert to TF

In [26]:
# Convert the data to a TensorFlow dataset: group the reviews into batches of 32,
# then shuffle with a buffer of 1000 elements. Because batching comes before
# shuffling, the shuffled elements are whole batches: the order of the batches is
# randomized, not the reviews inside each batch.
text_ds= (
    tf.data.Dataset.from_tensor_slices(text_data).batch(32).shuffle(1000)
)

2024-11-26 22:09:54.060761: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-26 22:09:54.265071: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-26 22:09:54.265131: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-26 22:09:54.269764: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-26 22:09:54.269813: I external/local_xla/xla/stream_executor

### Vectorization

#### Create Vect. layer

Create a Keras TextVectorization layer:
- convert text to lowercase
- give most prevalent 10k words a corresponding integer token
- pad or truncate each sequence to 81 tokens (MAX_LEN + 1), so that shifting by one token later yields input/target pairs of length 80

In [29]:
VOCAB_SIZE = 10000  # Maximum vocabulary size kept by the vectorizer.
MAX_LEN = 80        # Length of the input/target sequences fed to the model.

# One extra token is kept per sequence (MAX_LEN + 1) so that dropping the last
# token (inputs) or the first token (targets) leaves exactly MAX_LEN tokens.
vectorize_layer = layers.TextVectorization(
    standardize='lower',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_LEN + 1,
)

#### Calc text statistics

- Apply TextVectorization to the training data
- Get the vocabulary of the 10k most prevalent words.  
  NOTE:  
  - all words outside the 10k most frequent are encoded as 1 (i.e. `[UNK]`)
  - if a review has fewer than 81 tokens, the remaining positions are filled with 0 (the padding token, which here also acts as a stop token marking the end of the text)

In [31]:
%%time
# ~20s
# Fit the vectorizer on the batched text dataset, then read the learned
# vocabulary back as a list.
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

CPU times: user 16.4 s, sys: 3.64 s, total: 20.1 s
Wall time: 19.6 s


2024-11-26 22:34:27.669896: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [32]:
# Print the first ten vocabulary entries next to their integer token ids.
for i, word in enumerate(vocab[:10]):
    print(f'{i} : {word}')

0 : 
1 : [UNK]
2 : :
3 : ,
4 : .
5 : and
6 : the
7 : wine
8 : a
9 : of


INTERIM CONCLUSION

We see a subset of tokens mapped to their respective indices. The layer reserves token 0 for padding and token 1 for unknown words. NOTE:

    The other words are assigned tokens in order of frequency



In [34]:
# Display one padded review (index 2) as text, for comparison with its token ids.
text_data[2]

'wine _ review : US : Oregon : Pinot Gris : Tart and snappy , the flavors of lime flesh and rind dominate . Some green pineapple pokes through , with crisp acidity underscoring the flavors . The wine was all stainless - steel fermented . '

In [35]:
# Run the same review (index 2) through the vectorizer and print its integer token ids.
example_tokenised = vectorize_layer(text_data[2])
print(example_tokenised.numpy())

[   7   11   10    2   21    2  151    2   44  411    2  139    5 1009
    3    6   17    9  150 1030    5  681  627    4  105   95  235 6405
   85    3   12   74   31 5782    6   17    4    6    7  440  128  879
   15  797  542    4    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0]


#### Create Training Dataset

`prepare_inputs` converts the dataset into a `MapDataset` in which each sentence is split into two sequences:
- x: contains all tokens of the sentence except the last one
- y: the same sentence shifted left by one, i.e. starting from the 2nd token

Each element is therefore a tuple `(x, y)`. During training the model learns the relationships between words, because for every position it knows that, given the tokens in `x` up to that position, the target is the token at the same position in `y`. For example, in the sentence `The cloud is white` the model learns that for `x=The` the target is `y=cloud`, and it adjusts its weights accordingly.

In [40]:
# Map every text batch to its (inputs, targets) pair of token tensors.
train_ds = text_ds.map(prepare_inputs)

In [41]:
# Pull a single (inputs, targets) batch from the training dataset and display
# the first row of its first component.
example_input_output = train_ds.take(1).get_single_element()
example_input_output[0][0]

<tf.Tensor: shape=(80,), dtype=int64, numpy=
array([   7,   11,   10,    2,   41,    2,  333,   41,    2,  164,    2,
         13,  187,  466,    9,  164,   76,   53,  796,  552,    9,   27,
         26, 8090,   33,   73, 2925,   50,   20,  464,  289,   20,   95,
        131,    4,    8,  109,    9,  214,  122,  908,  266,    6,    7,
          8,   88,    3, 4671,  615,    4,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0])>

In [42]:
# Display the second row (index 1) of the first component (index 0) of the example batch.
example_input_output[0][1]

<tf.Tensor: shape=(80,), dtype=int64, numpy=
array([   7,   11,   10,    2,   86,    2,  209,   86,    2,  272,    2,
       1009,   37,    5,   60,   27,  328,    6, 1955,  683,    9,  341,
          5,  741,    4,   16,  211,   57,    3,  343,    5,  181,    3,
         12,   68,    5,  124,   60,   17,    4,    6,   32,  788,   97,
          5,  103,    4,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0])>

## BUILD MODEL

### Causal attention mask

<img src="./images/causal-masking.png" width="600" height="400">


**The attention score matrix for a sequence of words**

- The green blocks at the top represent the queries — tokens for which the model is trying to predict the next token.
- The red blocks on the left represent the keys — tokens that the model searches for relevant information.

**Explanation of Causal Masking**
- The gray area indicates where the causal mask is applied.
- The purpose of the mask is to prevent "information leakage" from future words. Without this mask, our GPT model would be able to perfectly guess the next word in the sentence, because it would be using the key from the word itself as a feature. For example, when calculating attention for the word "the", the model can only use information from previous words, not the ones that come after.
- The mask sets the attention weight for future words to 0. This explains why part of the matrix is gray for the first words like "the," "pink," and "elephant."
- Each word's vector is multiplied by the corresponding keys, but the attention to future tokens is blocked by the mask.


In [46]:
# Show a 10x10 causal mask for a batch of one. The mask is transposed for
# display, which turns the lower-triangular pattern into an upper-triangular one.
np.transpose(causal_attention_mask(1, 10, 10, dtype=tf.int32)[0])

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=int32)

### Transformer Block Layer

<img src="./images/transformer-block.png" width="500" height="400">

A Transformer block is a single component within a Transformer that applies some skip connections, feed-forward (dense) layers, and normalization around the multihead attention layer.

1. query is passed around the multihead attention layer to be added
to the output—this is a skip connection and is common in modern deep learning
architectures. It means we can build very deep neural networks that do not suffer as
much from the vanishing gradient problem, because the skip connection provides a
gradient-free highway that allows the network to transfer information forward
uninterrupted.
2. layer normalization is used in the Transformer block to provide stability to the
training process. We have already seen the batch normalization layer in action throughout
this book, where the output from each channel is normalized to have a mean of 0 and
standard deviation of 1.
3. Layer normalization. Each position within a sentence is normalized independently, but across the entire feature set for that position. This ensures that normalization happens independently for each word in a sentence, making it well-suited for sequential data like text. Use Case: Common in Transformers and other sequential models where the position-wise normalization provides stability and prevents dependencies on batch size.
4. a set of feed-forward (i.e., densely connected) layers is included in the Transformer block, to allow the component to extract higher-level features as we go deeper into the network.

In [50]:
class TransformerBlock(layers.Layer):
    """
    A single Transformer block: causally masked multi-head self-attention followed
    by a two-layer feed-forward network, each wrapped in dropout, a residual
    connection and layer normalization.

    Attributes:
        num_heads (int): Number of attention heads in the Multi-Head Attention layer.
        key_dim (int): Dimensionality of the query and key vectors in attention.
        embed_dim (int): Dimensionality of the input and output embeddings.
        ff_dim (int): Dimensionality of the hidden layer in the feed-forward network (FFN).
        dropout_rate (float): Dropout rate applied after the attention and FFN layers.
    """

    def __init__(self, num_heads, key_dim, embed_dim, ff_dim, dropout_rate=0.1, **kwargs):
        """
        Initializes the Transformer block with its sub-layers.

        Args:
            num_heads (int): Number of attention heads.
            key_dim (int): Dimension of the query/key vectors.
            embed_dim (int): Dimension of the output embeddings.
            ff_dim (int): Dimension of the feed-forward hidden layer.
            dropout_rate (float): Dropout rate applied after attention and after the FFN.
            **kwargs: Standard Keras layer arguments (e.g. `name`, `trainable`, `dtype`).
                They are forwarded to the base class so that the dictionary
                produced by `get_config` can be used to re-create the layer.
        """
        super().__init__(**kwargs)

        # Keep the constructor arguments so get_config can report them.
        self.num_heads = num_heads
        self.key_dim = key_dim
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate

        # Multi-head attention whose output is projected to embed_dim
        self.attn = layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim, output_shape=embed_dim)

        # Dropout and normalization used around the attention residual connection
        self.dropout_1 = layers.Dropout(rate=dropout_rate)
        self.ln_1 = layers.LayerNormalization(epsilon=1e-6)

        # Feed-forward network: expand to ff_dim with ReLU, then project back to embed_dim
        self.ffn_1 = layers.Dense(units=ff_dim, activation='relu')
        self.ffn_2 = layers.Dense(units=embed_dim)

        # Dropout and normalization used around the FFN residual connection
        self.dropout_2 = layers.Dropout(rate=dropout_rate)
        self.ln_2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs):
        """
        Forward pass of the Transformer block.

        Args:
            inputs (tf.Tensor): Input tensor of shape (batch_size, seq_len, embed_dim).

        Returns:
            tuple: A tuple containing:
                - Output tensor of shape (batch_size, seq_len, embed_dim).
                - Attention scores tensor returned by the attention layer
                  (for visualization or analysis).
        """
        # Read batch size and sequence length dynamically so the mask fits any input
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]

        # Boolean mask that stops each position from attending to later positions
        causal_mask = causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)

        # Self-attention: the inputs serve as both query and value
        attention_output, attention_scores = self.attn(
            inputs,
            inputs,
            attention_mask=causal_mask,
            return_attention_scores=True,
        )

        # Dropout, then residual connection around attention, then normalization
        attention_output = self.dropout_1(attention_output)
        out1 = self.ln_1(inputs + attention_output)

        # Feed-forward network followed by dropout
        ffn_1 = self.ffn_1(out1)
        ffn_2 = self.ffn_2(ffn_1)
        ffn_output = self.dropout_2(ffn_2)

        # Residual connection around the FFN, then normalization
        return (self.ln_2(out1 + ffn_output), attention_scores)

    def get_config(self):
        """
        Returns the configuration of the Transformer block for serialization.

        Returns:
            dict: The base layer configuration extended with this layer's
            constructor arguments.
        """
        config = super().get_config()
        config.update(
            {
                'num_heads': self.num_heads,
                'key_dim': self.key_dim,
                'embed_dim': self.embed_dim,
                'ff_dim': self.ff_dim,
                'dropout_rate': self.dropout_rate
            }
        )
        return config

### Token and Position Embedding

The token embedding is to convert each token into a learned vector. We create the positional embedding, using a
standard Embedding layer to convert each integer position into a learned vector.

In [51]:
class TokenAndPositionEmbedding(layers.Layer):
    """
    Custom Keras layer that sums a learned token embedding and a learned
    position embedding.

    Token ids and position indices are each mapped to vectors of size
    `embed_dim`; the two are added so the result carries both the identity of
    each token and its position in the sequence.

    Attributes:
        max_len (int): Number of distinct positions the position embedding supports.
        vocab_size (int): Size of the vocabulary for token embeddings.
        embed_dim (int): Dimension of the embedding vectors.
    """

    def __init__(self, max_len, vocab_size, embed_dim, **kwargs):
        """
        Initializes the TokenAndPositionEmbedding layer.

        Args:
            max_len (int): Maximum length of the input sequences.
            vocab_size (int): Size of the vocabulary for the token embeddings.
            embed_dim (int): Dimension of the output embedding vectors.
            **kwargs: Standard Keras layer arguments (e.g. `name`, `trainable`, `dtype`).
                They are forwarded to the base class so that the dictionary
                produced by `get_config` can be used to re-create the layer.
        """
        super().__init__(**kwargs)

        # Keep the constructor arguments so get_config can report them.
        self.max_len = max_len
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        # Token embedding: one learned vector of size embed_dim per vocabulary index.
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)

        # Position embedding: one learned vector of size embed_dim per position
        # index in [0, max_len).
        self.pos_emb = layers.Embedding(input_dim=max_len, output_dim=embed_dim)

    def call(self, x):
        """
        Combines token and positional embeddings for input sequences.

        Args:
            x (Tensor): Input tensor of token indices of shape (batch_size, sequence_length).

        Returns:
            Tensor: Output tensor of shape (batch_size, sequence_length, embed_dim) with the
            sum of token and positional embeddings.
        """
        # Actual length of this input's last axis, read dynamically.
        maxlen = tf.shape(x)[-1]

        # Position indices [0, 1, ..., maxlen-1] and their embeddings.
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)

        # Token embeddings for every token index in the input.
        x = self.token_emb(x)

        # The position embeddings broadcast across the batch axis in this sum.
        return x + positions

    def get_config(self):
        """
        Returns the configuration of the layer for serialization.

        Returns:
            dict: The base layer configuration extended with this layer's
            constructor arguments.
        """
        config = super().get_config()
        config.update(
            {
                'max_len': self.max_len,
                'vocab_size': self.vocab_size,
                'embed_dim': self.embed_dim
            }
        )

        return config


### Build Model

In [56]:
# Model hyperparameters.
MAX_LEN = 80            # Number of positions covered by the position embedding.
VOCAB_SIZE = 10000      # Number of distinct token ids.
EMBEDDING_DIM = 256     # Size of each token / position embedding vector.
N_HEADS = 2             # Number of attention heads.
KEY_DIM = 256           # Size of the query and key vectors in each head.
FEED_FORWARD_DIM = 256  # Hidden size of the feed-forward network in the block.

# Integer token ids of shape (batch_size, sequence_length); the sequence
# length is left unspecified (None).
inputs = layers.Input(shape=(None,), dtype=tf.int32)

# Step 1: token + position embeddings
#   -> (batch_size, sequence_length, EMBEDDING_DIM)
x = TokenAndPositionEmbedding(
    max_len=MAX_LEN,
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBEDDING_DIM,
)(inputs)

# Step 2: one Transformer block, which returns
#   - processed embeddings (batch_size, sequence_length, EMBEDDING_DIM)
#   - attention scores     (batch_size, N_HEADS, sequence_length, sequence_length)
x, attention_scores = TransformerBlock(
    num_heads=N_HEADS,
    key_dim=KEY_DIM,
    embed_dim=EMBEDDING_DIM,
    ff_dim=FEED_FORWARD_DIM,
)(x)

# Step 3: softmax over the vocabulary at every position
#   -> (batch_size, sequence_length, VOCAB_SIZE)
outputs = layers.Dense(units=VOCAB_SIZE, activation='softmax')(x)

In [57]:
# Assemble the Keras model. It has two outputs: the next-token probabilities
# and the attention scores of the Transformer block (returned for inspection).
gpt = models.Model(inputs=inputs, outputs=[outputs, attention_scores])

### Compile Model

In [58]:
# Compile with the Adam optimizer. Only the first output (token probabilities)
# gets a loss: sparse categorical cross-entropy, because the targets are integer
# token ids rather than one-hot vectors. The second output (attention scores)
# is given no loss.
gpt.compile(
    optimizer='adam',
    loss=[losses.SparseCategoricalCrossentropy(), None],
)

In [59]:
# Print the layer-by-layer summary of the model.
gpt.summary()

Token and Position Embedding:
- VOCAB_SIZE * EMBEDDING_DIM for token embeddings.
- MAX_LEN * EMBEDDING_DIM for positional embeddings.
- 2,580,480 = (10,000 * 256) + (80 * 256).

Transformer Block:
- Multi-head attention and feed-forward network parameters.

Dense Layer:
- (EMBEDDING_DIM * VOCAB_SIZE) + VOCAB_SIZE = (256 * 10,000) + 10,000 = 2,570,000 (weights plus one bias per vocabulary entry).

Dimensional Flow:
- Input: (batch_size, sequence_length)
- After Embedding: (batch_size, sequence_length, 256)
- After Transformer: (batch_size, sequence_length, 256)
- After Dense Layer: (batch_size, sequence_length, VOCAB_SIZE)

In [61]:
# Set to True to replace the freshly built model with one previously saved to disk.
LOAD_MODEL= False

if LOAD_MODEL:
    # NOTE(review): the model uses the custom TransformerBlock and
    # TokenAndPositionEmbedding layers; loading may require passing them via
    # `custom_objects` — confirm against the Keras version in use.
    gpt= models.load_model('./models/gpt')

## TRAIN MODEL

### Text Generator

In [75]:
class TextGenerator(callbacks.Callback):
    """
    A custom Keras callback for generating text during training.

    This callback generates text using the model's predictions after each training epoch.
    It can also be called directly through `generate` once a model has been attached
    (Keras attaches the model when the callback is passed to `fit`).

    Args:
        index_to_word (list): A list of words representing the vocabulary, where each index
                              corresponds to a specific word.
        top_k (int, optional): Kept for API compatibility and stored as `self.top_k`.
                               Sampling currently draws from the full vocabulary
                               distribution; no top-k filtering is applied. Defaults to 10.

    Methods:
        sample_from(probs, temperature):
            Samples a word index from a probability distribution with temperature scaling.

        generate(start_prompt, max_tokens, temperature):
            Generates a sequence of text starting from a given prompt.

        on_epoch_end(epoch, logs=None):
            Called at the end of each epoch to generate text using the model.
    """

    def __init__(self, index_to_word, top_k=10):
        """
        Initializes the TextGenerator callback.

        Args:
            index_to_word (list): Vocabulary mapping from index to word.
            top_k (int): Stored on the instance; not used by the sampling code. Defaults to 10.
        """
        # Let the Keras Callback base class set up its own attributes.
        super().__init__()

        self.index_to_word = index_to_word  # Vocabulary list mapping indices to words.
        self.top_k = top_k  # Stored for callers; sampling does not apply it.
        # Create a reverse mapping from word to index for fast lookup.
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }

    def sample_from(self, probs, temperature):
        """
        Samples an index from a probability distribution after scaling it with temperature.

        The scaled distribution is proportional to `probs ** (1 / temperature)`. It is
        computed in log space with float64 precision so that small temperatures do not
        underflow every entry to zero (which would produce NaN after normalization).

        Args:
            probs (np.ndarray): An array of probabilities for each vocabulary word.
            temperature (float): The temperature value for controlling randomness in sampling.
                                 A higher temperature increases diversity of generated text,
                                 while lower values make it more deterministic.
                                 Must be strictly positive.

        Returns:
            tuple: A sampled index and the modified probability distribution (float64).

        Raises:
            ValueError: If `temperature` is not strictly positive.
        """
        if temperature <= 0:
            raise ValueError(f'temperature must be > 0, got {temperature}')

        # log(0) is -inf, which correctly becomes probability 0 after exp();
        # silence the divide-by-zero warning numpy emits for it.
        with np.errstate(divide='ignore'):
            log_probs = np.log(np.asarray(probs, dtype=np.float64)) / temperature

        # Shift by the maximum so the largest entry is exp(0) = 1 and cannot underflow.
        scaled = np.exp(log_probs - np.max(log_probs))
        scaled = scaled / np.sum(scaled)  # Normalize the probabilities.

        # Sample an index from the adjusted probability distribution.
        return np.random.choice(len(scaled), p=scaled), scaled

    def generate(self, start_prompt, max_tokens, temperature):
        """
        Generates text starting from a prompt and continues until reaching max_tokens
        or until token index 0 is sampled.

        The prompt is split on whitespace; words that are not in the vocabulary are
        mapped to index 1 (presumably the unknown-token slot — confirm against the
        vectorizer). The final text is printed to stdout.

        Args:
            start_prompt (str): The initial text prompt to start generating from.
            max_tokens (int): The maximum total number of tokens (prompt included).
            temperature (float): Temperature value for controlling randomness during sampling.

        Returns:
            list: One dictionary per generated token with the keys 'prompt' (the text
                  before that token was appended), 'word_probs' (the temperature-scaled
                  distribution the token was sampled from) and 'atts' (the attention
                  scores of the last position, per head).

        Raises:
            ValueError: If `start_prompt` contains no tokens.
        """
        # Convert the start prompt into a list of token indices.
        start_tokens = [self.word_to_index.get(word, 1) for word in start_prompt.split()]
        if not start_tokens:
            # An empty sequence gives the model nothing to predict from.
            raise ValueError('start_prompt must contain at least one token')

        sample_token = None  # Placeholder for the next sampled token.
        info = []  # List to store information about each generated token.

        # Continue generating tokens until reaching max_tokens or sampling token index 0.
        while len(start_tokens) < max_tokens and sample_token != 0:
            # Convert the list of tokens into a NumPy array with shape (1, sequence_length).
            x = np.array([start_tokens])

            # Use the model to predict the next token and its attention scores.
            y, att = self.model.predict(x, verbose=0)

            # Sample the next token from the probabilities predicted at the last position.
            sample_token, probs = self.sample_from(y[0][-1], temperature)

            # Append information about the current generation step.
            info.append(
                {
                    'prompt': start_prompt,        # Text so far, before the new token.
                    'word_probs': probs,           # Probability distribution for the next word.
                    'atts': att[0, :, -1, :]       # Attention of the last position, all heads.
                }
            )

            # Add the sampled token to the list of tokens.
            start_tokens.append(sample_token)

            # Update the start prompt by appending the new word.
            start_prompt = start_prompt + ' ' + self.index_to_word[sample_token]

        # Print the final generated text after the loop completes.
        print(f'\ngenerated text:\n{start_prompt}\n')

        return info

    def on_epoch_end(self, epoch, logs=None):
        """
        Called automatically by Keras at the end of each training epoch.
        Generates and prints a sample text starting from a predefined prompt.

        Args:
            epoch (int): The current epoch number (unused).
            logs (dict, optional): A dictionary containing training metrics and information (unused).
        """
        self.generate('wine review', max_tokens=80, temperature=1.0)


### Callbacks

In [66]:
# Checkpoint callback: writes only the model weights (not the full model) to a
# single file at the end of every epoch, without logging a message.
model_checkpoint_callback = callbacks.ModelCheckpoint(
    save_freq='epoch',
    save_weights_only=True,
    verbose=0,
    filepath='./checkpoints/gpt-checkpoint.weights.h5',
)

In [68]:
# Each run logs into its own timestamped sub-directory so runs can be compared
# side by side in TensorBoard.
log_dir = './logs/fit/gpt/' + datetime.datetime.now().strftime('%Y%m%d-%H%M%S')

# TensorBoard callback: metrics are written once per epoch, together with
# weight histograms and embedding data; the graph is exported and the second
# batch is profiled.
tensorboard_callback = callbacks.TensorBoard(
    log_dir=log_dir,
    update_freq='epoch',
    histogram_freq=1,
    embeddings_freq=1,
    write_graph=True,
    write_images=False,
    profile_batch=2,
)

2024-11-27 23:10:08.837500: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:104] Profiler session initializing.
2024-11-27 23:10:08.837598: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:119] Profiler session started.
2024-11-27 23:10:08.840660: I external/local_xla/xla/backends/profiler/gpu/cupti_tracer.cc:1239] Profiler found 1 GPUs
2024-11-27 23:10:08.855489: E external/local_xla/xla/backends/profiler/gpu/cupti_error_manager.cc:137] cuptiGetTimestamp: error 999: 
2024-11-27 23:10:08.855632: E external/local_xla/xla/backends/profiler/gpu/cupti_error_manager.cc:186] cuptiSubscribe: ignored due to a previous error.
2024-11-27 23:10:08.855638: E external/local_xla/xla/backends/profiler/gpu/cupti_error_manager.cc:223] cuptiGetResultString: ignored due to a previous error.
2024-11-27 23:10:08.855695: E external/local_xla/xla/backends/profiler/gpu/cupti_tracer.cc:1281] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with

### Tokenizer

In [71]:
# Create the text-generation callback from the vocabulary list
# (index -> word); it prints a sample text at the end of every epoch.
text_generator= TextGenerator(vocab)

### Fit

In [72]:
%%time
# Train for 5 epochs. The callbacks save the weights, write TensorBoard logs
# and print a generated sample text at the end of each epoch.
gpt.fit(
    train_ds,
    epochs=5,
    callbacks=[
        model_checkpoint_callback,
        tensorboard_callback,
        text_generator
    ]
)

Epoch 1/5


2024-11-27 23:13:51.088816: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-11-27 23:13:51.660976: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


[1m   4/4060[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:13[0m 63ms/step - loss: 8.7935

2024-11-27 23:14:01.208333: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:104] Profiler session initializing.
2024-11-27 23:14:01.208381: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:119] Profiler session started.
2024-11-27 23:14:01.208401: E external/local_xla/xla/backends/profiler/gpu/cupti_error_manager.cc:135] cuptiGetTimestamp: ignored due to a previous error.
2024-11-27 23:14:01.208407: E external/local_xla/xla/backends/profiler/gpu/cupti_error_manager.cc:186] cuptiSubscribe: ignored due to a previous error.
2024-11-27 23:14:01.208409: E external/local_xla/xla/backends/profiler/gpu/cupti_error_manager.cc:223] cuptiGetResultString: ignored due to a previous error.
2024-11-27 23:14:01.208413: E external/local_xla/xla/backends/profiler/gpu/cupti_tracer.cc:1281] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error 
2024-11-27 23:14:01.262898: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:70] 

[1m 378/4060[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1:55[0m 31ms/step - loss: 3.8544

KeyboardInterrupt: 

In [None]:
# Save the full model so it can be restored later with models.load_model().
# NOTE(review): the training log format and the '.weights.h5' checkpoint name
# suggest Keras 3, where Model.save presumably requires a '.keras' or '.h5'
# extension — confirm this extension-less path is accepted.
gpt.save('./models/gpt')

## GENERATE TEXT

In [None]:
# generate() splits the prompt on whitespace, so 'review:' would be looked up as
# one word and fall back to the unknown-token index. pad_punctuation separates
# the ':' into its own token first.
# NOTE(review): assumes the training text was normalized with pad_punctuation — confirm.
info = text_generator.generate(pad_punctuation('wine review: italy'), max_tokens=80, temperature=.5)

In [None]:
# generate() splits the prompt on whitespace, so 'review:' would be looked up as
# one word and fall back to the unknown-token index. pad_punctuation separates
# the ':' into its own token first.
# NOTE(review): assumes the training text was normalized with pad_punctuation — confirm.
info = text_generator.generate(pad_punctuation('wine review: germany'), max_tokens=80, temperature=.5)

## REFERENCES

1. [Generative Deep Learning, 2nd Edition](https://www.oreilly.com/library/view/generative-deep-learning/9781098134174/): David Foster's book, which was the inspiration for this notebook.
2. [David Foster](https://github.com/davidADSP): GitHub page
3. [David Foster (Keynote) - Generative Deep Learning -Key To Unlocking Artificial General Intelligence?](https://www.youtube.com/watch?v=rHLf78CmNmQ): David's YouTube talk covering some of the key concepts described in his book.