In [16]:
!pip install tiktoken



In [19]:
import importlib
import tiktoken
# Load tiktoken's "o200k_base" encoding; the cells below use it to turn text into token IDs.
# Note: the data-loader cells further down tokenize with the "gpt2" encoding instead,
# so the token IDs printed there differ from the ones produced with this tokenizer.
tokenizer = tiktoken.get_encoding("o200k_base")

In [20]:
# Read the whole training text into memory as one string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()

In [23]:
# Tokenize the full text and report how many token IDs it produced.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

4836


Executing the code above prints 4836, the total number of tokens in the training set
after applying the BPE tokenizer. (This is the length of the tokenized text, not the tokenizer's vocabulary size.)

<div class="alert alert-block alert-success">
Next, we remove the first 50 tokens from the dataset for demonstration purposes as it
results in a slightly more interesting text passage in the next steps:</div>

In [25]:
# Skip the first 50 token IDs; the remaining tokens are used for the examples below.
enc_sample = enc_text[50:]

In [27]:
#One of the easiest and most intuitive ways to create the input-target pairs for the next-word prediction task is to create two variables, x and y, where x contains the input tokens
#and y contains the targets, which are the inputs shifted by 1.

In [28]:
# Length of the input: with a context_size of 4 the model is trained to look at
# a sequence of 4 tokens in order to predict the token that follows.
context_size = 4

# x is the first context_size tokens; y is the same window shifted one position
# to the right, so y[k] is the token that comes right after x[k] in the text.
# Example: for tokens [1, 2, 3, 4, 5], x = [1, 2, 3, 4] and y = [2, 3, 4, 5].
x = enc_sample[:context_size]
y = enc_sample[1:context_size + 1]

print(f"x: {x}")
print(f"y:      {y}")

x: [11166, 306, 261, 38350]
y:      [306, 261, 38350, 402]


In [29]:
#The context size is how many tokens the model attends to when predicting the next token.

<div class="alert alert-block alert-success">
Processing the inputs along with the targets, which are the inputs shifted by one position,
we can then create the next-word prediction tasks as
follows:</div>

In [31]:
# Every prefix of the first context_size tokens is one context; the token
# immediately after that prefix is the value the model should predict.
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(context, "--->", desired)

[11166] ---> 306
[11166, 306] ---> 261
[11166, 306, 261] ---> 38350
[11166, 306, 261, 38350] ---> 402


In [32]:
#For an LLM, one input-output pair contains context_size prediction tasks.
#Here, one input->output pair contains 4 prediction tasks.

In [35]:
# Same context/target pairs as before, but decoded back to text for readability.
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    # decode() takes a list of IDs, so the single target ID is wrapped in a list.
    print(tokenizer.decode(context), "--->", tokenizer.decode([desired]))

 himself --->  in
 himself in --->  a
 himself in a --->  villa
 himself in a villa --->  on



There's only one more task before we can turn the tokens into embeddings: implementing an efficient data loader that
iterates over the input dataset and returns the inputs and targets as PyTorch tensors, which
can be thought of as multidimensional arrays.
    

In [36]:
#In particular, we are interested in returning two tensors: an input tensor containing the
#text that the LLM sees and a target tensor that includes the targets for the LLM to predict.

**Implementing A Data Loader**

<div class="alert alert-block alert-info">
    
Step 1: Tokenize the entire text
    
Step 2: Use a sliding window to chunk the book into overlapping sequences of max_length

Step 3: Return the total number of rows in the dataset

Step 4: Return a single row from the dataset
</div>

In [52]:
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once. A window of ``max_length`` tokens is then moved
    over the token IDs in steps of ``stride``. Each window is stored as an input
    tensor together with a target tensor that holds the same window shifted one
    token to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object with an ``encode(text, allowed_special=...)`` method;
            it is expected to return a sequence of integer token IDs.
        max_length: Number of tokens in every input/target row; must be >= 1.
        stride: Offset between the starts of consecutive windows; must be >= 1.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.

    Note:
        The dataset is empty when the text yields ``max_length`` tokens or
        fewer, because no window then has a complete shifted target.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Fail early with a clear message instead of silently building an
        # empty or malformed dataset from a non-positive window or step.
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the text into sequences of max_length.
        # The upper bound leaves room for the target, which reaches one token
        # further than the input window.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) rows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [46]:
#what we have done

<div class="alert alert-block alert-warning">

The GPTDatasetV1 class in listing 2.5 is based on the PyTorch Dataset class.

It defines how individual rows are fetched from the dataset.

Each row consists of a number of
token IDs (based on a max_length) assigned to an input_chunk tensor.

The target_chunk
tensor contains the corresponding targets.

I recommend reading on to see what the data
returned from this dataset looks like when we combine the dataset with a PyTorch
DataLoader -- this will bring additional intuition and clarity.
    
</div>

In [47]:
#The following code will use the GPTDatasetV1 to load the inputs in batches via a PyTorch
#DataLoader

<div class="alert alert-block alert-info">
    
Step 1: Initialize the tokenizer

Step 2: Create dataset

Step 3: drop_last=True drops the last batch if it is shorter than the specified batch_size to prevent loss spikes
during training

Step 4: The number of CPU processes to use for preprocessing
    
</div>

In [53]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0, encoding_name="gpt2"):
    """Build a PyTorch DataLoader over sliding-window token chunks of ``txt``.

    Args:
        txt: Text to tokenize and split into input/target windows.
        batch_size: Number of rows per batch.
        max_length: Number of tokens in every input/target row.
        stride: Offset between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader``; whether to shuffle the rows.
        drop_last: Forwarded to ``DataLoader``; drop the final batch when it is
            shorter than ``batch_size``.
        num_workers: Forwarded to ``DataLoader``; number of worker processes.
        encoding_name: Name of the tiktoken encoding used to tokenize ``txt``.
            Defaults to ``"gpt2"``, the encoding this function always used.

    Returns:
        A ``DataLoader`` whose batches are ``(inputs, targets)`` tensor pairs.
    """
    # Initializing the tokenizer
    tokenizer = tiktoken.get_encoding(encoding_name)

    # Creating dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [49]:
#Let's test the dataloader with a batch size of 1 for an LLM with a context size of 4.

#This will develop an intuition of how the GPTDatasetV1 class and the create_dataloader_v1 function work together:

In [50]:
# Load the text again so the data-loader cells can run without the earlier read cell.
with open("the-verdict.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()

In [54]:
# One row per batch, 4 tokens per row, window advanced by 1 token, no shuffling.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)

# Step through the loader by hand; each batch holds an inputs and a targets tensor.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [55]:
# Advance the same iterator: with stride=1 this window starts one token after the first one.
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


If you have previous experience with deep learning, you may know
that small batch sizes require less memory during training but lead to more noisy model
updates.

Just like in regular deep learning, the batch size is a trade-off and hyperparameter
to experiment with when training LLMs.

In [56]:
# stride equals max_length here, so consecutive rows are back-to-back windows
# that do not share any input tokens.
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=4, stride=4, shuffle=False
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
