In [1]:
# Read the file
import os  
import urllib.request 

# Download the story text only when the local copy is missing, then load it.
# The existence check, the download target and the path opened below must all
# name the same file; otherwise the file is re-downloaded on every run, or the
# check guards a file that is never read.
URL = ("https://raw.githubusercontent.com/majidarasteh/Large-Language-Model_LLM/refs/heads/main/Resources/Story.txt")
file_path = "Story.txt"

if not os.path.exists(file_path):
    urllib.request.urlretrieve(URL, file_path)

# Read the whole story into memory as a single UTF-8 string.
with open(file_path, "r", encoding="utf-8") as f:
    story_text = f.read()

In [2]:
# Create a tokenizer object by requesting the encoding registered under the name "gpt2".
# Note: create_dataloader_v1 does not read this variable; it requests the "gpt2"
# encoding itself.
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

In [3]:

"""
   Create an embedding layer.
   1. Suppose we have a small vocabulary of only 6 tokens (instead of the 50,257 tokens in the GPT-2 BPE tokenizer vocabulary).
   2. We want to create embeddings of size 3 (in GPT-3, the embedding size is 12,288 dimensions).
   3. Using vocab_size and output_dim, we can instantiate an embedding layer in PyTorch.
   4. We set the random seed to 123 for reproducibility.
"""
import torch

# Toy configuration: one row per token ID (6 rows), each a 3-dimensional vector.
vocab_size = 6
output_dim = 3

# Seed the global RNG first so the randomly initialised weights are reproducible.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)
# Before any training the weight matrix holds only the random initial values.
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [4]:
# An embedding lookup is a row selection: token ID 3 returns row index 3 of embedding_layer.weight.
print(embedding_layer(torch.tensor([3])))  # Embedding vector for token ID 3

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [5]:
# Look up four token IDs at once. Each ID selects one row of the weight matrix,
# in the order given, so the output is a 4 × 3 matrix.
input_ids = torch.tensor([2, 3, 5, 1])
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


In [6]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once. A window of ``max_length`` tokens is then moved
    over the token IDs in steps of ``stride``. Each window is one input
    sequence, and the same window shifted one position to the right is its
    target sequence.

    Args:
        txt: Text to tokenize.
        tokenizer: Object with an ``encode(text)`` method that returns a
            sequence of integer token IDs.
        max_length: Number of tokens in every input (and target) sequence.
            Must be at least 1.
        stride: Number of tokens the window advances between consecutive
            samples. Must be at least 1.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.

    Note:
        A text that encodes to ``max_length`` tokens or fewer produces an
        empty dataset, because no window has a complete shifted target.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Reject window settings that would otherwise fail obscurely
        # (stride == 0 makes range() raise) or silently build useless samples
        # (max_length == 0 yields empty inputs paired with one-token targets).
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)

        # Stop at len(token_ids) - max_length so that the target slice, which
        # ends one token later than the input slice, is always complete.
        for start in range(0, len(token_ids) - max_length, stride):
            end = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:end]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:end + 1]))

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the (input, target) tensor pair stored at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [7]:
def create_dataloader_v1(txt,            # The input text data to process.
                         batch_size=4,   # Number of input-target pairs per batch.
                         max_length=256, # Length (in tokens) of each input sequence.
                         stride=128,     # How many tokens the sliding window moves forward between sequences.
                         shuffle=True,   # Whether to randomize the order of sequences before batching.
                         drop_last=True, # If True, discards an incomplete batch at the end.
                         num_workers=0,  # Number of worker subprocesses for data loading (0 = main process).
                         tokenizer=None  # Optional tokenizer; defaults to the "gpt2" tiktoken encoding.
                        ):
    """Build a DataLoader of sliding-window (input, target) token-ID batches.

    The text is wrapped in a ``GPTDatasetV1`` and handed to a PyTorch
    ``DataLoader`` configured with the given batching options.

    Args:
        txt: Text to tokenize and split into windows.
        batch_size: Number of input-target pairs per batch.
        max_length: Number of tokens in each input (and target) sequence.
        stride: Number of tokens the window advances between samples.
        shuffle: Whether the DataLoader shuffles the samples.
        drop_last: Whether an incomplete final batch is discarded.
        num_workers: Number of worker subprocesses used for loading; ``0``
            loads the data in the main process.
        tokenizer: Object with an ``encode(text)`` method. When ``None``
            (the default), the tiktoken ``'gpt2'`` encoding is used.

    Returns:
        A ``DataLoader`` that yields ``(inputs, targets)`` batches.
    """
    # Fall back to the GPT-2 encoding only when the caller supplied no tokenizer.
    if tokenizer is None:
        tokenizer = tiktoken.get_encoding('gpt2')

    dataset = GPTDatasetV1(txt,        # Your text
                           tokenizer,  # Your tokenizer object
                           max_length, # Number of tokens
                           stride      # Tokens to slide window
                          )
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [8]:
# Demo settings: 8 sequences per batch, each a window of 4 tokens.
batch_size = 8
max_length = 4

# stride=1 moves the window one token at a time; shuffle=False keeps the
# samples in text order so the printed batch is easy to follow.
dataloader = create_dataloader_v1(
    story_text,
    batch_size=batch_size,
    max_length=max_length,
    stride=1,
    shuffle=False,
)

# Pull only the first batch from the loader.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

# Each target row is its input row shifted one token to the right.
print(inputs, "\n", inputs.shape, "\n")
print(targets, "\n", targets.shape)

tensor([[ 3198,  8872,   290, 37516],
        [ 8872,   290, 37516,    12],
        [  290, 37516,    12, 26548],
        [37516,    12, 26548, 16059],
        [   12, 26548, 16059,    13],
        [26548, 16059,    13,  1320],
        [16059,    13,  1320,   373],
        [   13,  1320,   373,   477]]) 
 torch.Size([8, 4]) 

tensor([[ 8872,   290, 37516,    12],
        [  290, 37516,    12, 26548],
        [37516,    12, 26548, 16059],
        [   12, 26548, 16059,    13],
        [26548, 16059,    13,  1320],
        [16059,    13,  1320,   373],
        [   13,  1320,   373,   477],
        [ 1320,   373,   477,    13]]) 
 torch.Size([8, 4])


In [9]:
# Full-size setup: one embedding row per entry of the 50,257-token vocabulary,
# each row a 256-dimensional vector.
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

# Embed the 8 × 4 batch of token IDs: every token ID becomes a 256-dimensional vector.
token_embeddings = token_embedding_layer(inputs)

# Resulting shape: 8 × 4 × 256.
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [10]:
# Add a positional embedding layer.
# For a GPT model's absolute positional embedding approach we only need a second
# embedding layer with the same embedding dimension as token_embedding_layer,
# holding one vector per position in the context window.

context_length = max_length
pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=output_dim,
)

# Look up positions 0 .. context_length - 1, in order.
pos_embeddings = pos_embedding_layer(torch.arange(context_length))

# PyTorch will broadcast this 4 × 256 pos_embeddings tensor onto the 4 × 256
# token embedding tensor of each of the eight sequences in the batch.
print(pos_embeddings.shape)

torch.Size([4, 256])


In [11]:
# Combine token and positional information. The 4 × 256 pos_embeddings tensor is
# broadcast across the leading dimension of the 8 × 4 × 256 token_embeddings
# tensor, so every sequence in the batch receives the same positional vectors.
input_embedding = token_embeddings + pos_embeddings
print(input_embedding.shape)
# Show the combined embeddings of the first sequence in the batch.
print(input_embedding[0])

torch.Size([8, 4, 256])
tensor([[ 0.7998, -0.2336,  0.9178,  ...,  0.3650,  3.5821, -0.1087],
        [ 1.8502, -1.2202, -0.5523,  ...,  0.6996, -0.5810,  0.6370],
        [ 0.3747, -1.3935,  0.1445,  ...,  0.1859, -2.2134,  0.5034],
        [-0.7105, -1.0425,  0.5492,  ...,  0.7626,  0.3722,  1.0562]],
       grad_fn=<SelectBackward0>)
