# GPT

## INITIALIZATION

### Imports

In [44]:
import regex as re
import string
import tensorflow as tf
from tensorflow.keras import (
    layers
)
import numpy as np

### Functions

In [43]:
def pad_punctuation(s):
    """
    Adds spaces around punctuation symbols in a string and normalizes whitespace.

    This function takes an input string, identifies all punctuation characters,
    and surrounds them with spaces. It then removes any instances of multiple
    consecutive spaces, ensuring the string is neatly formatted.

    Parameters:
    ----------
    s : str
        The input string to process.

    Returns:
    -------
    str
        The processed string with spaces around punctuation and normalized whitespace.

    Example:
    --------
    >>> pad_punctuation("Hello,world! This is a test.")
    'Hello , world ! This is a test .'

    Notes:
    ------
    - The function uses Python's `string.punctuation` to identify all standard punctuation symbols.
    - Regular expressions are used to handle the replacement efficiently.
    """
    
    # buld a puctuation symbol regular expression, and then replace coughted puctuation symbol with ' symbol '
    s= re.sub(f'([{string.punctuation}])', r' \1 ', s)
    # replace all occurance of more the 1 whitespace in the row
    return re.sub(' +', ' ', s)   

def prepare_inputs(text):
    """
    Prepares input and target tensors for a token prediction model.

    This transformation processes a batch of text data to help the model learn to predict 
    the next token in a sentence by analyzing previous tokens within the same sentence.

    Args:
        text (tf.Tensor): A batch of raw text input, typically as a 1D tensor or list of strings.

    Returns:
        tuple: A tuple (x, y) where:
            - x (tf.Tensor): The input tensor containing all tokens of each sentence 
              except the last token, with shape (batch_size, sequence_length-1).
            - y (tf.Tensor): The target tensor containing all tokens of each sentence 
              starting from the second token, with shape (batch_size, sequence_length-1).
    
    Steps:
        1. Expands the text tensor by adding a new dimension at the end.
        2. Tokenizes the sentences using the provided `vectorize_layer`.
        3. Splits the tokenized sentences into:
            - `x`: All tokens except the last one.
            - `y`: All tokens except the first one.
    """
    # Add 1 dimension to the text data for compatibility with the vectorizer
    text = tf.expand_dims(text, -1)
    
    # Tokenize sentences using the vectorize_layer (assumed to be predefined)
    tokenized_sentences = vectorize_layer(text)

    # Input tensor: all tokens except the last one
    x = tokenized_sentences[:, :-1]
    
    # Target tensor: all tokens starting from the second one
    y = tokenized_sentences[:, 1:]

    return x, y

def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """
    Generates a causal attention mask for Transformer-based models.

    A causal mask ensures that for each destination (decoder) token, the model can only
    attend to source (encoder) tokens that have already been processed or align in time.
    This is critical for tasks such as autoregressive generation where future tokens should
    not be visible.

    Args:
        batch_size (int): Number of sequences in the batch.
        n_dest (int): Number of destination tokens (e.g., in the decoder).
        n_src (int): Number of source tokens (e.g., in the encoder or the decoder itself).
        dtype (tf.DType): Data type for the resulting mask (e.g., `tf.float32`).

    Returns:
        tf.Tensor: A mask tensor of shape `(batch_size, n_dest, n_src)` with values of 1.0
                   for allowed positions and 0.0 for disallowed positions.
    """
    # Create a range of indices for destination tokens and reshape to a column vector
    i = tf.range(n_dest)[:, None]  # Shape: (n_dest, 1)
    
    # Create a range of indices for source tokens
    j = tf.range(n_src)  # Shape: (n_src,)
    
    # Compare indices to establish causality: i >= j - offset
    # This ensures the mask allows only past and current positions
    m = i >= j - n_src + n_dest  # Shape: (n_dest, n_src)
    
    # Cast the boolean mask into the specified dtype (e.g., float32 or float16)
    mask = tf.cast(m, dtype)  # Shape: (n_dest, n_src)
    
    # Reshape the mask to add a singleton batch dimension
    mask = tf.reshape(mask, [1, n_dest, n_src])  # Shape: (1, n_dest, n_src)
    
    # Calculate the multiplier for tiling the mask to match batch size
    mult = tf.concat([tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0)
    # `mult` is a vector specifying how many times to replicate along each axis
    
    # Tile the mask along the batch dimension
    return tf.tile(mask, mult)  # Final shape: (batch_size, n_dest, n_src)


## PREPARE DATA

### Download

In [3]:
#!kaggle datasets download -d zynicide/wine-reviews

### Load

In [13]:
with open('./datasets/wine-reviews/winemag-data-130k-v2.json') as json_data:
    wine_data= json.load(json_data)

In [None]:
wine_data[10]

### Filter

In [9]:
# Filter datasets
filtered_data= [
    'wine_review : ' + x['country'] + ' : ' + x['province'] + ' : ' + x['variety'] + ' : ' + x['description'] for x in wine_data 
    if x['country'] is not None
       and x['province'] is not None
       and x['variety'] is not None
       and x['description'] is not None
]

In [10]:
filtered_data[10]

'wine_review : US : California : Cabernet Sauvignon : Soft, supple plum envelopes an oaky structure in this Cabernet, supported by 15% Merlot. Coffee and chocolate complete the picture, finishing strong at the end, resulting in a value-priced wine of attractive flavor and immediate accessibility.'

In [12]:
n_wines= len(filtered_data)
print(f'{n_wines} recepies loaded')

129907 recepies loaded


## TOKENIZATION

### Pad the punctuation

In [22]:
%%time
# ~3s
text_data= [pad_punctuation(x) for x in filtered_data]

CPU times: user 3.56 s, sys: 10.2 ms, total: 3.57 s
Wall time: 3.78 s


In [23]:
example_data= text_data[25]
example_data

'wine _ review : US : California : Pinot Noir : Oak and earth intermingle around robust aromas of wet forest floor in this vineyard - designated Pinot that hails from a high - elevation site . Small in production , it offers intense , full - bodied raspberry and blackberry steeped in smoky spice and smooth texture . '

### Convert to TF

In [26]:
# Convert data to a TensorFlow dataset devided by batches with 32 recepies and shuffle buffer thus all recepies are devided randomly
text_ds= (
    tf.data.Dataset.from_tensor_slices(text_data).batch(32).shuffle(1000)
)

2024-11-26 22:09:54.060761: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-26 22:09:54.265071: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-26 22:09:54.265131: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-26 22:09:54.269764: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-11-26 22:09:54.269813: I external/local_xla/xla/stream_executor

### Vectorization

#### Create Vect. layer

Create a Keras TextVectorization layer:
- convert text to lowercase
- give most prevalent 10k words a corresponding integer token
- pad the sequnce to 81 tokens long

In [29]:
VOCAB_SIZE=10000
MAX_LEN=80

vectorize_layer= layers.TextVectorization(
    standardize='lower',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=80 + 1
)

#### Calc text statistics

- Apply TextVectorization to the training data
- Get the vocabulary of 10k most pevalent words.  
  NOTE:  
  - all words over 10k will be coded as 1 (i.e. UNK)
  - if number of words in sentence less then 201, thouse will be coded as 0 (i.e. stop token - text string come to an end)

In [31]:
%%time
# ~20s 
# Adapt layer to the training set
vectorize_layer.adapt(text_ds)
vocab=vectorize_layer.get_vocabulary()

CPU times: user 16.4 s, sys: 3.64 s, total: 20.1 s
Wall time: 19.6 s


2024-11-26 22:34:27.669896: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [32]:
for i,word in enumerate(vocab[:10]):
    print(f'{i} : {word}')

0 : 
1 : [UNK]
2 : :
3 : ,
4 : .
5 : and
6 : the
7 : wine
8 : a
9 : of


INTERIM CONCLUSION

We see a subset of tokens mapped to their respctive indices. The layer reserves the 0 token for padding, and 1 for unknown. NOTE:

    The other words are assigned tokens in order of frequency



In [34]:
text_data[2]

'wine _ review : US : Oregon : Pinot Gris : Tart and snappy , the flavors of lime flesh and rind dominate . Some green pineapple pokes through , with crisp acidity underscoring the flavors . The wine was all stainless - steel fermented . '

In [35]:
# display same as above but as converted to int word mappings
example_tokenised=vectorize_layer(text_data[2])
print(example_tokenised.numpy())

[   7   11   10    2   21    2  151    2   44  411    2  139    5 1009
    3    6   17    9  150 1030    5  681  627    4  105   95  235 6405
   85    3   12   74   31 5782    6   17    4    6    7  440  128  879
   15  797  542    4    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0]


#### Create Training Dataset

prepare_inputs convert the dataset to the MapDataset where each sentnce is splited on to the 2 sets of sequences:
- x: contains all words in sentece except last
- y: shifted by one left, thus it starts from the 2nd element

Thus we will have a tuple `[x,y]` thus when our model will train it will learn relation ships between words as it nos that word `x` the target will be `y`. For example in sentence `The cloud is white` model will learn that `x=The` and `y=cloud` so it will adjust it weight accordingly.

In [40]:
train_ds= text_ds.map(prepare_inputs)

In [41]:
example_input_output= train_ds.take(1).get_single_element()
example_input_output[0][0]

<tf.Tensor: shape=(80,), dtype=int64, numpy=
array([   7,   11,   10,    2,   41,    2,  333,   41,    2,  164,    2,
         13,  187,  466,    9,  164,   76,   53,  796,  552,    9,   27,
         26, 8090,   33,   73, 2925,   50,   20,  464,  289,   20,   95,
        131,    4,    8,  109,    9,  214,  122,  908,  266,    6,    7,
          8,   88,    3, 4671,  615,    4,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0])>

In [42]:
example_input_output[0][1]

<tf.Tensor: shape=(80,), dtype=int64, numpy=
array([   7,   11,   10,    2,   86,    2,  209,   86,    2,  272,    2,
       1009,   37,    5,   60,   27,  328,    6, 1955,  683,    9,  341,
          5,  741,    4,   16,  211,   57,    3,  343,    5,  181,    3,
         12,   68,    5,  124,   60,   17,    4,    6,   32,  788,   97,
          5,  103,    4,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0])>

#### Creat the causal attention mask

In [46]:
np.transpose(causal_attentin_mask(1,10,10, dtype=tf.int32)[0])

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=int32)