### Creating Input-Target Pairs

In [18]:
import tiktoken
import numpy as np

# Load tiktoken's 'gpt2' encoding; the cells below use it to convert text
# to token IDs (encode) and token IDs back to text (decode).
tokenizer = tiktoken.get_encoding('gpt2')

In [3]:
# Read the whole text file into memory as a single string.
with open('the-verdict.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

# Tokenize the full text and report how many token IDs it produced.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [4]:
# Skip the first 50 token IDs; the examples below work on the remainder.
enc_sample = enc_text[50:]

In [None]:
# Number of tokens in each input window.
context_size = 4
# x holds the first `context_size` tokens and y is the same window shifted one
# position to the right, e.g. x = [1, 2, 3, 4] -> y = [2, 3, 4, 5].
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f'x: {x}')
print(f'y:      {y}')

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [24]:
# Show the tokens involved: `context_size` inputs plus the final target.
print('Context: ', enc_sample[:context_size+1], '\n')

# For each prefix length i, print the visible tokens padded with '.' up to
# `context_size`, followed by the token that comes right after the prefix.
for i in range(1, context_size + 1):
    padding = ['.'] * (context_size - i)
    context_array = enc_sample[:i] + padding
    print('Input Tensor: ', context_array)
    print('Target Tensor: ', enc_sample[i])
    print('--'*20)

Context:  [290, 4920, 2241, 287, 257] 

Input Tensor:  [290, '.', '.', '.']
Target Tensor:  4920
----------------------------------------
Input Tensor:  [290, 4920, '.', '.']
Target Tensor:  2241
----------------------------------------
Input Tensor:  [290, 4920, 2241, '.']
Target Tensor:  287
----------------------------------------
Input Tensor:  [290, 4920, 2241, 287]
Target Tensor:  257
----------------------------------------


In [None]:
# Each growing prefix of the window is paired with the token ID that follows it.
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(context, '--->', desired)

[290] ---> 4920
[290, 4920] ---> 2241
[290, 4920, 2241] ---> 287
[290, 4920, 2241, 287] ---> 257


In [15]:
# Same prefix/next-token pairs as above, decoded back to text for readability.
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    # decode() takes a list of IDs, so the single target ID is wrapped in a list.
    print(tokenizer.decode(context), '--->', tokenizer.decode([desired]))

 and --->  established
 and established --->  himself
 and established himself --->  in
 and established himself in --->  a


### Implementing a Data Loader

In [30]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once. A window of ``max_length`` tokens is then moved
    over the token IDs in steps of ``stride``; for every window the target is
    the same window shifted one token to the right.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of token IDs.
        max_length: Number of tokens per sample (the context size); must be > 0.
        stride: Distance between the starts of consecutive windows; must be > 0.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.

    Note:
        The dataset is empty when the text has ``max_length`` tokens or fewer,
        because no window then has a full shifted target.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        # Fail early with a clear message: a negative stride would otherwise
        # silently yield an empty dataset, and a non-positive max_length would
        # yield empty or meaningless windows.
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")

        # max_length = context_size
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the text into overlapping sequences of
        # max_length. The range stops at len(token_ids) - max_length so that the
        # target slice, which ends one token later, never runs past the end.
        for i in range(0, len(token_ids) - max_length, stride):
            self.input_ids.append(torch.tensor(token_ids[i:i+max_length]))
            self.target_ids.append(torch.tensor(token_ids[i+1: i+max_length+1]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]

In [31]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                        stride=128, shuffle=True, drop_last=True,
                        num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    Args:
        txt: Raw text; it is tokenized with tiktoken's 'gpt2' encoding.
        batch_size: Number of windows per batch.
        max_length: Number of tokens in each window.
        stride: Step between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    # Dataset of token-ID windows produced with the 'gpt2' encoding.
    dataset = GPTDatasetV1(
        txt,
        tiktoken.get_encoding('gpt2'),
        max_length,
        stride,
    )

    # Batching options are passed through to DataLoader unchanged.
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(dataset, **loader_options)

In [32]:
# Load the raw text that is passed to create_dataloader_v1 in the cells below.
with open('the-verdict.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

In [33]:
print('PyTorch version: ', torch.__version__)

# batch_size=1 with max_length=4 and stride=1: each batch is a single 4-token
# window and consecutive windows start one token apart. shuffle=False keeps
# the windows in text order.
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

# Keep the iterator in a variable so a later cell can pull the next batch from it.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

PyTorch version:  2.5.1
[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


- The `first_batch` variable contains two tensors: the first tensor stores the input token IDs, and the second tensor stores the target token IDs.
- Since the `max_length` is set to 4, each of the two tensors contains 4 token IDs.
- It is common to train LLMs with an input size of at least 256 tokens.
- Overlap between input sequences (a `stride` smaller than `max_length`) means the same tokens appear in several samples, which may lead to overfitting.

In [34]:
# Advance the same iterator; with stride=1 this window starts one token after
# the first one.
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [40]:
# Same window settings as the previous loader, but four windows are stacked
# into each batch.
dataloader2 = create_dataloader_v1(
    raw_text, batch_size=4, max_length=4, stride=1, shuffle=False
)

# NOTE: this rebinds data_iter and first_batch from the batch_size=1 example.
data_iter = iter(dataloader2)
first_batch = next(data_iter)
# Index 0 holds the input IDs and index 1 the target IDs.
print('Inputs:\n', first_batch[0])
print('Targets:\n', first_batch[1])

Inputs:
 tensor([[  40,  367, 2885, 1464],
        [ 367, 2885, 1464, 1807],
        [2885, 1464, 1807, 3619],
        [1464, 1807, 3619,  402]])
Targets:
 tensor([[ 367, 2885, 1464, 1807],
        [2885, 1464, 1807, 3619],
        [1464, 1807, 3619,  402],
        [1807, 3619,  402,  271]])


In [41]:
# Pull the next batch of four windows from dataloader2's iterator.
second_batch = next(data_iter)
print('Inputs:\n', second_batch[0])
print('Targets:\n', second_batch[1])

Inputs:
 tensor([[ 1807,  3619,   402,   271],
        [ 3619,   402,   271, 10899],
        [  402,   271, 10899,  2138],
        [  271, 10899,  2138,   257]])
Targets:
 tensor([[ 3619,   402,   271, 10899],
        [  402,   271, 10899,  2138],
        [  271, 10899,  2138,   257],
        [10899,  2138,   257,  7026]])
