In [1]:
# Read the file
import os  
import urllib.request 

if not os.path.exists("Resources/Story.txt"):
    URL = ("https://raw.githubusercontent.com/majidarasteh/Large-Language-Model_LLM/refs/heads/main/Resources/Story.txt")
    file_path = "Story.txt"
    urllib.request.urlretrieve(URL, file_path)

with open("Story.txt", "r", encoding="utf-8") as f:
    story_text = f.read()

In [2]:
import tiktoken
print(tiktoken.__version__)

0.9.0


In [3]:
tokenizer = tiktoken.get_encoding("gpt2")

In [4]:
# Encoding a text with BPE
text = "Hello world!, it is just a test..."
tokenizer.encode(text)

[15496, 995, 28265, 340, 318, 655, 257, 1332, 986]

In [5]:
# Decoding the encoded text
tokenizer.decode(tokenizer.encode(text))

'Hello world!, it is just a test...'

In [6]:
# Special tokens as <|endoftext|> in gpt2
text = "Hello world!, <|endoftext|>, how are you? lkjrjoi09ri#@$#@$" 
tokenizer.encode(text)

ValueError: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


In [None]:
# Handling special tokens as <|endoftext|> 
tokenizer.encode(text, allowed_special={"<|endoftext|>"})

In [7]:
tokenizer.decode(tokenizer.encode(text, allowed_special={"<|endoftext|>"}))

'Hello world!, <|endoftext|>, how are you? lkjrjoi09ri#@$#@$'

In [8]:
# Tokenizing the text of the uploaded file.
encoded_text = tokenizer.encode(story_text)
encoded_text[:10]

[3198, 8872, 290, 37516, 12, 26548, 16059, 13, 1320, 373]

In [9]:
len(encoded_text)

2733

In [10]:
# example for the concept of sliding window
#1 The context size determines how many tokens are included in the input.
context_size = 11
for i in range(1, context_size+1):
    condext = encoded_text[:i]
    desire = encoded_text[i]

    print(condext, "---->", desire)

[3198] ----> 8872
[3198, 8872] ----> 290
[3198, 8872, 290] ----> 37516
[3198, 8872, 290, 37516] ----> 12
[3198, 8872, 290, 37516, 12] ----> 26548
[3198, 8872, 290, 37516, 12, 26548] ----> 16059
[3198, 8872, 290, 37516, 12, 26548, 16059] ----> 13
[3198, 8872, 290, 37516, 12, 26548, 16059, 13] ----> 1320
[3198, 8872, 290, 37516, 12, 26548, 16059, 13, 1320] ----> 373
[3198, 8872, 290, 37516, 12, 26548, 16059, 13, 1320, 373] ----> 477
[3198, 8872, 290, 37516, 12, 26548, 16059, 13, 1320, 373, 477] ----> 13


In [11]:
# decoding
context_size = 11
for i in range(1, context_size+1):
    condext = encoded_text[:i]
    desire = encoded_text[i]

    print(tokenizer.decode(condext), "---->", tokenizer.decode([desire]))

One ---->  dollar
One dollar ---->  and
One dollar and ---->  eighty
One dollar and eighty ----> -
One dollar and eighty- ----> seven
One dollar and eighty-seven ---->  cents
One dollar and eighty-seven cents ----> .
One dollar and eighty-seven cents. ---->  That
One dollar and eighty-seven cents. That ---->  was
One dollar and eighty-seven cents. That was ---->  all
One dollar and eighty-seven cents. That was all ----> .


In [12]:
import torch
print(torch.__version__)

2.6.0


In [13]:
import torch
from torch.utils.data import Dataset, DataLoader

"""
   PyTorch’s Dataset Class is a standard way to load data.
   GPTDatasetV1: Takes raw text and splits it into pairs of input_chunk + target_chunk.
   Returns one pair at a time when asked.

   PyTorch Dataset classes make training neater and faster. 
   Key Features of a Dataset Class:
     1. __len__ Method: Answers: "How many items are in this dataset?"
     2. __getitem__ Method: Answers: "Give me item #X."

   1 Tokenizes the entire text
   2 Uses a sliding window to chunk the book into overlapping sequences of max_length
   3 Returns the total number of rows in the dataset
   4 Returns a single row from the dataset
"""
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)    

        for i in range(0, len(token_ids) - max_length, stride):     
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):    
        return len(self.input_ids)

    def __getitem__(self, idx):         
        return self.input_ids[idx], self.target_ids[idx]

In [14]:
"""  Data loader = Efficient text-to-batches converter.
     It splits the text into chunks (input & target pairs).
     Converts them into PyTorch tensors. PyTorch tensors make training fast and GPU-friendly.

     1. Initializes the tokenizer
     2. Creates dataset
     3. drop_last=True drops the last batch if it is shorter than the specified batch_size to prevent loss spikes during training.
     4. The number of CPU processes to use for preprocessing

"""
def create_dataloader_v1(txt,            # The input text data to process.
                         batch_size=4,   # Number of input-target pairs per batch (4 sequences processed simultaneously).
                         max_length=256, # Maximum length (in tokens) for each input sequence.
                         stride=128,     # How many tokens the sliding window moves forward between sequences.
                         shuffle=True,   # Whether to randomize the order of sequences before batching.
                         drop_last=True, # If True, discards incomplete batches at the end.
                         num_workers=0   #Number of CPU cores for parallel data loading.
                        ):
    tokenizer = tiktoken.get_encoding('gpt2')                        
    dataset = GPTDatasetV1(txt,        # Your text
                           tokenizer,  # Your tokenizer object
                           max_length, # Number of tokens
                           stride      # Tokens to slide window
                          ) 
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,     
        num_workers=num_workers     
    )

    return dataloader

In [15]:
# Converts dataloader into a Python iterator to fetch the next entry via Python’s built-in next() function

with open('Resources/Story.txt', 'r', encoding='utf-8') as f:
    story_text = f.read()

dataloader = create_dataloader_v1(
    story_text, batch_size=1, max_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)     
first_batch = next(data_iter)
print(first_batch)

[tensor([[628, 628, 198, 198]]), tensor([[628, 198, 198,  27]])]


In [16]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[628, 198, 198,  27]]), tensor([[198, 198,  27,   0]])]


In [17]:
dataloader = create_dataloader_v1(
    story_text, batch_size=8, max_length=4, stride=1,
    shuffle=False
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print('Inputs:\n', inputs)
print('\nTargets:\\n', targets)

Inputs:
 tensor([[  628,   628,   198,   198],
        [  628,   198,   198,    27],
        [  198,   198,    27,     0],
        [  198,    27,     0, 18227],
        [   27,     0, 18227,  4177],
        [    0, 18227,  4177,    56],
        [18227,  4177,    56, 11401],
        [ 4177,    56, 11401, 27711]])

Targets:\n tensor([[  628,   198,   198,    27],
        [  198,   198,    27,     0],
        [  198,    27,     0, 18227],
        [   27,     0, 18227,  4177],
        [    0, 18227,  4177,    56],
        [18227,  4177,    56, 11401],
        [ 4177,    56, 11401, 27711],
        [   56, 11401, 27711,    29]])
