In [None]:
# import the pytorch library and tiktoken module
import torch
import tiktoken


In [2]:
# Initialize the gpt2 tokenizer
tokenizer = tiktoken.get_encoding("gpt2")

In [10]:
# read my short story and save it
with open( "../data/gautier.txt", "r" ) as f:
    raw_text = f.read()

Had given orders that day to deny my door to every one; having made a solemn resolution that morning


In [11]:
# encode the text to tokens
enc_text = tokenizer.encode_ordinary( raw_text )

In [12]:
from torch.utils.data import Dataset, DataLoader

# Create a custom Dataset class
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [14]:

# Create a dataloader for GPTDatasetV1
def create_dataloader_v1(txt, batch_size=4, max_length=256, 
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [15]:
# test the dataloader
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[25383,  1813,  6266,   326]]), tensor([[1813, 6266,  326, 1110]])]


In [None]:
# try another batch
second_batch = next(data_iter)
print(second_batch)

[tensor([[1813, 6266,  326, 1110]]), tensor([[6266,  326, 1110,  284]])]


In [17]:
# test different variables
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=2, stride=2, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[25383,  1813]]), tensor([[1813, 6266]])]


In [18]:
# test different strides and context sizes
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=8, stride=2, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[25383,  1813,  6266,   326,  1110,   284, 10129,   616]]), tensor([[ 1813,  6266,   326,  1110,   284, 10129,   616,  3420]])]


In [None]:
# test different batch sizes v4 but use these variables for now
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[25383,  1813,  6266,   326],
        [ 1110,   284, 10129,   616],
        [ 3420,   284,   790,   530],
        [   26,  1719,   925,   257],
        [26322,  6323,   326,  3329],
        [  326,   314,   561,   466],
        [ 2147,    11,   314,   750],
        [  407,  4601,   284,   307]])

Targets:
 tensor([[ 1813,  6266,   326,  1110],
        [  284, 10129,   616,  3420],
        [  284,   790,   530,    26],
        [ 1719,   925,   257, 26322],
        [ 6323,   326,  3329,   326],
        [  314,   561,   466,  2147],
        [   11,   314,   750,   407],
        [ 4601,   284,   307, 24069]])


In [24]:
# to apply the tokenizer's decoder to these IDs, the rows of the tensor `inputs` have to be converted into lists:
for row in inputs:
    print( tokenizer.decode( row.tolist() ) )

Had given orders that
 day to deny my
 door to every one
; having made a
 solemn resolution that morning
 that I would do
 nothing, I did
 not wish to be


In [None]:
# Create an embedding layer
vocab_size = 4
output_dim = 8

# A simple lookup table that stores embeddings of a fixed dictionary and size.
# This module is often used to store word embeddings and retrieve them using indices.
# The input to the module is a list of indices, and the output is the corresponding
# word embeddings.
inputs = torch.nn.Embedding( vocab_size, output_dim )

print( inputs.weight )


Parameter containing:
tensor([[ 1.1246,  0.3535, -0.0302, -0.2198,  0.5967, -1.3539, -0.1885, -0.3454],
        [ 1.2149, -0.4090, -0.6654,  0.4349,  0.5361,  1.0460, -0.1951,  0.7766],
        [ 0.5441,  1.2044, -0.0768, -0.9519,  0.5792, -0.3672,  0.0118,  0.7613],
        [-0.5808, -0.5994, -0.1550, -1.0729,  0.1593,  1.8672,  1.5700, -0.1120]],
       requires_grad=True)


Question 9: Since LLM's cannot take in raw text, video, images, etc.. we must convert said data into numbers, and with the help of PyTorch we can turn them into numerical vectors which help the LLM process with them with their neural network operations. The embedding layer transforms the tokens we create into these numerical vectors that the model can now learn from and take in patterns from the data.

In [None]:
# get the weights of the embedding layer
inputs = inputs.weight.data
inputs

tensor([[ 1.1246,  0.3535, -0.0302, -0.2198,  0.5967, -1.3539, -0.1885, -0.3454],
        [ 1.2149, -0.4090, -0.6654,  0.4349,  0.5361,  1.0460, -0.1951,  0.7766],
        [ 0.5441,  1.2044, -0.0768, -0.9519,  0.5792, -0.3672,  0.0118,  0.7613],
        [-0.5808, -0.5994, -0.1550, -1.0729,  0.1593,  1.8672,  1.5700, -0.1120]])

In [None]:
# 
query = inputs[2]
attention_scores_2 = torch.zeros(len(inputs)) 
for i in range( len( inputs ) ):
    attention_scores_2[i] = torch.dot( query, inputs[i] )
print( attention_scores_2 )

tensor([ 1.8270,  0.3208,  3.7087, -0.6648])


In [31]:
attention_weights_2 = torch.softmax(attention_scores_2, dim=0)
attention_weights_2

tensor([0.1271, 0.0282, 0.8342, 0.0105])

In [32]:
attention_weights_2.sum()

tensor(1.)

In [33]:
context_vector_2 = torch.zeros( query.shape )
for i in range( len( attention_weights_2 ) ):
    context_vector_2 += attention_weights_2[i] * inputs[i]
context_vector_2

tensor([ 0.6250,  1.0318, -0.0883, -0.8211,  0.5758, -0.4293, -0.0031,  0.6119])

In [34]:
# get all of the attention scores via matrix multiplication
attention_scores_2 = inputs @ inputs.T
attention_scores_2

tensor([[ 3.7829, -0.1816,  1.8270, -3.3147],
        [-0.1816,  4.2978,  0.3208,  0.8214],
        [ 1.8270,  0.3208,  3.7087, -0.6648],
        [-3.3147,  0.8214, -0.6648,  7.8611]])

In [35]:
attention_weights = torch.softmax(attention_scores_2, dim=-1)
attention_weights 

tensor([[8.6115e-01, 1.6343e-02, 1.2180e-01, 7.1227e-04],
        [1.0688e-02, 9.4251e-01, 1.7664e-02, 2.9139e-02],
        [1.2707e-01, 2.8179e-02, 8.3423e-01, 1.0517e-02],
        [1.3995e-05, 8.7540e-04, 1.9805e-04, 9.9891e-01]])

In [36]:
attention_weights[0].sum()

tensor(1.)

In [38]:
context_vectors = attention_weights @ inputs
context_vectors

tensor([[ 1.0542,  0.4440, -0.0463, -0.2989,  0.5933, -1.1922, -0.1629, -0.1921],
        [ 1.1498, -0.3779, -0.6333,  0.3595,  0.5265,  1.0193, -0.1399,  0.7384],
        [ 0.6250,  1.0318, -0.0883, -0.8211,  0.5758, -0.4293, -0.0031,  0.6119],
        [-0.5789, -0.5988, -0.1554, -1.0716,  0.1598,  1.8660,  1.5681, -0.1110]])