# Large Language Model

1. In this notebook I have implemented a large language model from scratch
2. The code below follows following steps from a transformder architecture:
    - Tokenize the training text input
      
  

In [2]:
import torch

## Preprocessing : Read and Process the Text, Convert to Tokens, Embed into the Vectors

## Step 1: Create Tokens from the text read from a book

In [7]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

print("Total characters read from book" , len(raw_text))
# print the first 100 characters 
print(raw_text[:99])

Total characters read from book 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


### Tokenize all the text to be used for LLM model 

In [4]:
# Use Regular Expressions 
import re

text = "Hello, This is a Test to split words from a large text.-- Just testing"
#result = re.split(r'(\s)', text)

# split by all these characters 
result = re.split(r'([,.:;?_!"()\']|--|\s)', text) 

# strip words of whitespaces and also remove whitespaces 
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'This', 'is', 'a', 'Test', 'to', 'split', 'words', 'from', 'a', 'large', 'text', '.', '--', 'Just', 'testing']


In [85]:
# Apply the tokenizer code on entire raw text from the book
preprocessed = re.split(r'([,.:;?_!"\']|--|\s)', raw_text)

preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:99])
print("Total Tokens:", len(preprocessed))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--']
Total Tokens: 4685


### Convert Tokens to Token IDs 

- Build Vocabulary - list of all the unique tokens sorted alphabetically, give them unique indexes
- Token id of any token is its inside the vocabulary built so far

In [86]:
# Convert preprocessed to a Set so all elements are unique and then sort it
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

1131


In [87]:
# Allocate an integer to the words 
# This is encoding words to number -- we woild need decoder to convert number to corresponding word
vocab = {token:integer for integer,token in enumerate(all_words)}


In [88]:
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break;
    

('!', 0)
('"', 1)
("'", 2)
('(I', 3)
('(Though', 4)
(')', 5)
(',', 6)
('--', 7)
('.', 8)
(':', 9)
(';', 10)
('?', 11)
('A', 12)
('Ah', 13)
('Among', 14)
('And', 15)
('Are', 16)
('Arrt', 17)
('As', 18)
('At', 19)
('Be', 20)
('Begin', 21)
('Burlington', 22)
('But', 23)
('By', 24)
('Carlo', 25)
('Chicago', 26)
('Claude', 27)
('Come', 28)
('Croft', 29)
('Croft)', 30)
('Destroyed', 31)
('Devonshire', 32)
('Don', 33)
('Dubarry', 34)
('Emperors', 35)
('Florence', 36)
('For', 37)
('Gallery', 38)
('Gideon', 39)
('Gisburn', 40)
('Gisburns', 41)
('Grafton', 42)
('Greek', 43)
('Grindle', 44)
('Grindles', 45)
('HAD', 46)
('Had', 47)
('Hang', 48)
('Has', 49)
('He', 50)


In [107]:
# Define an Tokenizer class to provide logic for Encoding and Decoding Text using provided Vocabulary
# Constructor accepts a vocabulary - which is the dictionary storing the words we want to train on an numerical ids
class SimpleTokenizerV1:
    def __init__(self, vocab):
        #class variable str_to_int would store vocabulary words with their token ids
        self.str_to_int = vocab
        #class variables int_to_str will store the mapping of token ids to strings - opposite of vocabulary
        self.int_to_str = {i:s for s,i in vocab.items()}

    # Use given vocabulary to store mapping of words to token ids (given in the vocabulary)
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|__|\s)', text)
        preprocessed = [ item.strip() for item in preprocessed if item.strip()
        ]
        # use the vocabulary - str to int dictionary to get list of all token ids
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])

        # remove spaces before every puctuations
        text = re.sub(r'([,.:;?_!"()\']|__|\s)', r'\1', text)
        return text.strip()
        

## Instantiate Tokenizer

In [108]:
tokenizer = SimpleTokenizerV1(vocab)

#test the encoder
text = """"It is the last of all the Greek," 
            said he. """
ids = tokenizer.encode(text)
print(ids)

[1, 58, 585, 989, 603, 723, 146, 989, 43, 6, 1, 852, 534, 8]


In [110]:
#test the decoder
tokenizer.decode(ids)

'" It is the last of all the Greek , " said he .'

## Special Context Tokens to Handle Unknown Words in Dictionary and End of Text Token

### End of Text tokens are embedded between texts from two different sources or unrelated texts.

In [115]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}
print(len(vocab.items()))

1133


In [120]:
list(vocab.items())[-5:]


[('younger', 1128),
 ('your', 1129),
 ('yourself', 1130),
 ('<|endoftext|>', 1131),
 ('<|unk|>', 1132)]

In [121]:
for i,item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1128)
('your', 1129)
('yourself', 1130)
('<|endoftext|>', 1131)
('<|unk|>', 1132)


In [134]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        #class variable str_to_int would store vocabulary words with their token ids
        self.str_to_int = vocab
        #class variables int_to_str will store the mapping of token ids to strings - opposite of vocabulary
        self.int_to_str = {i:s for s,i in vocab.items()}

    # Use given vocabulary to store mapping of words to token ids (given in the vocabulary)
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|__|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        #print(self.str_to_int)
        preprocessed = [
            item if item in self.str_to_int
            else "<|unk|>" for item in preprocessed 
        ]
        
        # use the vocabulary - str to int dictionary to get list of all token ids
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])

        # remove spaces before every puctuations
        text = re.sub(r'([,.:;?_!"()\']|__|\s)', r'\1', text)
        return text.strip()

In [135]:
# testing SimpleTokenizerV2
tokenizer1 = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In sunlit terraces of palace"

final_text = " <|endoftext|> ".join((text1, text2))
print(final_text)

Hello, do you like tea? <|endoftext|> In sunlit terraces of palace


In [136]:
ids = tokenizer1.encode(final_text)
print(ids)

[1132, 6, 356, 1127, 629, 976, 11, 1131, 57, 957, 985, 723, 1132]


In [137]:
tokenizer1.decode(ids)



'<|unk|> , do you like tea ? <|endoftext|> In sunlit terraces of <|unk|>'

### Some additional tokens used during training of LLMs
- [BOS] : Beginning of Sequence
- [EOS] : End of Sequence
- [PAD] : Padding for smaller batches when they are trained in parallel

## Byte-Pair Encoding : Word can be broken into sub word tokens 

In [1]:
# Use Python open-source tiktoken library that implements BPE (Byte Pair Encoding) Algorithm for Tokenizing
# Also used by ChatGPT
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl.metadata (40 kB)
Downloading tiktoken-0.9.0-cp312-cp312-macosx_11_0_arm64.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Downloading regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl (284 kB)
Installing collected packages: regex, tiktoken
Successfully installed regex-2024.11.6 tiktoken-0.9.0


In [15]:
import importlib
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.9.0


In [16]:
#instantiate tokenizer using GPT encoder

tokenizer = tiktoken.get_encoding("gpt2")

In [17]:
# Sample code to use tiktoken tokenizer
original_text = (
    "Howdy whachha doing? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace."
)
# special tokens are assigned tokens towards the end of the dictionary tokens
# Their token number usually indicates how big the vocabulary is going to be 
int_tokens = tokenizer.encode(original_text, allowed_special={'<|endoftext|>'})

# vocabulary size is 50255, after which '<|endoftext|>' is assigned the token number 50256
# someunknownPlace does not give error as the BPE scheme can successfully break it down in to some byte pair
print(int_tokens)

[2437, 9892, 348, 620, 3099, 1804, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [18]:
#Test the Reverse of Tokenizing
decoded_text = tokenizer.decode(int_tokens)
print(decoded_text)

Howdy whachha doing? <|endoftext|> In the sunlit terracesof someunknownPlace.


## Input - Target Pair Creation

In [20]:
# Use sliding window approach to create input-target pairs 
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [21]:
# Remove 50 words from the enc_text 
enc_sample = enc_text[50:]
print(len(enc_sample))

5095


In [25]:
# Train the LLM, repeatedly by using three different partitions to the enc_text:
# input part of length = input context size 
# target part = one word at index of one position to the right of the input context
# hidden part = portion of input text beyond input and target part 
context_size = 4

# input array is an arrat from the text of length = context size
x = enc_sample[:context_size]

#shift indices by 1 to get the target array , also of length = context size so that 
y = enc_sample[1: context_size+1]

print(f"x: {x}")
print(f"y:      {y}")



x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [26]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "-------->" , desired)

[290] --------> 4920
[290, 4920] --------> 2241
[290, 4920, 2241] --------> 287
[290, 4920, 2241, 287] --------> 257


In [29]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    #desired is a single integer so has to be converted to a list
    print(tokenizer.decode(context), "-------->" , tokenizer.decode([desired]))

 and -------->  established
 and established -------->  himself
 and established himself -------->  in
 and established himself in -------->  a


## Implement a data loader to iterate over input dataset and return batches of data 
(For next word prediction task)
DataLoader class helps load data in batches and utilize parallel processing efficiently

- Entire input text will be loaded and stored in form of input tensor: Each row in the tensor would represent one new input context of length 4 from the entire input.
  x = tensor([["In", "the", "heart" , "of"],
             ["the", "city" , "stood", "the"],
             ["old", "library", ".", "a"],
  ...
  ])
- Target tensor: There will be rows of target part of the input text which is an array of predicted word for each combination of inputs. It is one position offset from the entire input
  y = tensor([["the", "heart", "of", "the"],
  ["city", "stood", "the", "old"],
  ["library", ",","a","relic"]
  ])

In [22]:
# we need data in form of tensors for future use of PyTorch library 
# Input Tensors and Target Tensors

from torch.utils.data import Dataset, DataLoader

# This class will be used to create dataset in the form of input output pairs for training 
# context_size = how big each input tensor row should be 
# how many words should we skip for every iteration
# This class implenents the Dataset class of PyTorch
class GPTDatasetV1(Dataset):
    def __init__(self, input_text, tokenizer, context_size, stride):
        self.input_ids = []
        self.target_ids = []
        
        # Step1: Tokenize the entire input text as we only work with the tokens 
        token_ids = tokenizer.encode(input_text, allowed_special = {"<|endoftext|>"})


        # Step2: Read chunks of book text and add them to input and target tensors
        for i in range(0, len(token_ids) - context_size , stride):
            input_chunk = token_ids[i: i+context_size]
            target_chunk = token_ids[i+1:i+context_size+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    # Step3: Return the size of the tensor dataset
    def __len__(self):
        return len(self.input_ids)
    

    # Step4: Return the input row at index 'idx' and corresponding target row at idx. It returns two items x and y
    # It is a Map style data loader 
    def __getitem__(self, idx):
        return self.input_ids[idx],self.target_ids[idx]
        
    
        

In [23]:
# Feed above dataset to the Pytorch dataloader to get the capabilities of loading data in batches

def create_dataloader_v1(txt, batch_size = 4, max_length=256, stride = 128, 
                         shuffle=True, drop_last = True, num_workers=0):
    #Step1: Initialize tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")


    #Step2: Create customer GPT Dataset defined above 
    #GPTs use a context length of 256 and more 
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)


    #Step3: Create dataloader, drop_last = True will let dataloader drop that last batch if it is shorter than the given batch_size
    # DAtaloader uses the getitem method defined in the dataset class to return the pairs of input x and target output y values 
    
    data_loader = DataLoader(
                dataset,
                batch_size = batch_size, 
                shuffle = shuffle,
                drop_last = drop_last,
                num_workers = num_workers
    )
    return data_loader

    #Step4: specify number of CPU processes to use

In [24]:
#TEST

with open("the-verdict.txt", "r" ,encoding="utf-8") as f:
    raw_text = f.read()

In [67]:

#print("Pytorch Version", torch.__version__)
dataloader = create_dataloader_v1(raw_text, batch_size = 1, max_length=4, stride=4, shuffle=False)


# USE PYTHON ITERATOR TO FETCH NEXT AND NEXT BATCHES 
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [68]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[1807, 3619,  402,  271]]), tensor([[ 3619,   402,   271, 10899]])]


In [72]:
#Batch Size = 8

dataloader = create_dataloader_v1(raw_text, batch_size = 8 , max_length=4, stride=4, shuffle=False)


# USE PYTHON ITERATOR TO FETCH NEXT AND NEXT BATCHES 
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


## Vector Embeddings / Token Embeddings

- Tokens are converted into Vectors

####  Assume Word Tokenizer 
- Input text = "quick fox is in the house"
- Words and their tokens in alphabetical order:
  - fox : 0
  - house: 1
  - in: 2
  - is: 3
  - quick: 4
  - the: 5

In [92]:
input_ids = torch.tensor([ 2,3,4,5])
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
# Using torch library to create an Embedding for given vocab size and output vector dimensions
# Embeddings is a dictionary(lookup table) of weights of given vocab size and output dim 
# Following statement will create an Embedding layer weight matrix of size vocab_size x output_dim (in this case 6 x 3)
# The matrix contains weights for NN initialized randomly
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [80]:
# Print entire tensor for embedding layer 
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [91]:
# Print the weights at id 3 
# Pass argument as tensor 3 
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [93]:
# Prints weights for input ids inside input id vector 
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], grad_fn=<EmbeddingBackward0>)


## Absolute Positional Encoding

In [33]:
# Assume a vocab size of 50257 and embedded into vector of dimensional 256

vocab_size = 50257
output_dim = 256

#First create an embedding layer for the given size of vocab and vector size / output dimension
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)


In [34]:
#Context length
max_length = 4 

dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride = max_length, shuffle = False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [35]:
print("Token IDs:\n", inputs)
print("Input shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Input shape:
 torch.Size([8, 4])


In [36]:
#Use embedding layer to convert each token id in the input tensor to vector of dimension 256
# Embedding layer will store and facilitiate fetching vector representation of each input token 
# If the input token tensor was 8x4: after vector embedding input tensor would be 8x4x256 
# this is because each input token would be converted to vector of size 256

token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [38]:
# Now add positional embedding information to the embedded vectors
# At one time only an input sequence of length 4 tokens (context length) would be given as input to the LLM
# Input size for LLM at any point is 4 in order to predict next token or word
# Hence only 4 positions are to required to be encoded in any input sequence at a time 
# Positional Embedding layer size : 4 x vector_dimension = 4 x 256
context_length = max_length 
# create positional embeddings layer
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [39]:
# Embeddings layers are lookup tables 
# To get positional embedding vectors we only need to pass the position value 0, 1, 2 or 3
# Hence we have pass the array [0, 1, 2, 3]
pos_embeggings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeggings.shape)

torch.Size([4, 256])


## Final Input Embeddings
### Input Embeddings + Positional Embeddings
- [8 x 4 x 256] + [4 x 256] = [8 x 4 x 256]