In [2]:
#CREATING TOKENS FOR LLM

In [3]:
#Importing our dataset
with open("/content/drive/MyDrive/TOKENIZATION/the-verdict.txt","r", encoding="UTF-8") as f:
  raw_text = f.read()
print("Total numbers of characters:",len(raw_text))
print(raw_text[:99])

Total numbers of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [4]:
#The goal is to tokenize these 20479 characters.

In [5]:
#I use the "re" library for spliting into token/individual words.
#Example:
import re
text = "Hello, this is Glen. Be free."
result = re.split(r'(\s)',text) #Here the '(\s)' splits where there is a white space.
print(result)

['Hello,', ' ', 'this', ' ', 'is', ' ', 'Glen.', ' ', 'Be', ' ', 'free.']


In [6]:
#But I want commas and fullstops to be split individually therefore:
result = re.split(r'([,.]|\s)',text)
print(result)

['Hello', ',', '', ' ', 'this', ' ', 'is', ' ', 'Glen', '.', '', ' ', 'Be', ' ', 'free', '.', '']


In [7]:
#But the white space characters are very much available and they're not required therefore:
result = [item for item in result if item.strip()] #.strip() removes whitespace.
print(result)

['Hello', ',', 'this', 'is', 'Glen', '.', 'Be', 'free', '.']


In [8]:
#Now watch me apply this tokenization scheme to the raw_text.
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',raw_text)
preprocessed = [item for item in preprocessed if item.strip()]
print(preprocessed[:30]) #Just checking the first 30 tokens.

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [9]:
print(len(preprocessed))

4690


In [10]:
#Create token IDs
all_words = sorted(set(preprocessed)) #Sorts in alphabetical order
vocabulary_size = len(all_words)
print(vocabulary_size)

1130


In [11]:
#Now the IDs
vocab =  {token:integer for integer, token in enumerate(all_words)} #Assigning each unique token with a unique number ID.
for i, item in enumerate(vocab.items()):
  print(item)
  if i >= 50:
    break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [12]:
#Instantiate a class for tokenization
class SimpleTokenizerV1:
  def __init__(self, vocab):
    self.str_to_int = vocab
    self.int_to_str = {i:s for s, i in vocab.items()} #'s' is our token 'i' is the unique id for vocab.

  def encode(self, text):
    preprocessed = re.split(r'[,.:;?_!"()\']|--|\s',text)
    preprocessed = [
        item.strip() for item in preprocessed if item.strip()
    ]
    ids = [self.str_to_int[s] for s in preprocessed]
    return ids

  def decode(self, ids):
    text = " ".join([self.int_to_str[i] for i in ids])
    #Replacing the spaces before the punctuations
    text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
    return text

In [13]:
#Let's test the class above.
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know,"
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[56, 850, 988, 602, 533, 746, 1126, 596, 67, 38, 851, 1108, 754, 793]


In [14]:
#Now let's see if it can convert the token ids into text
tokenizer.decode(ids)

'It s the last he painted you know Mrs Gisburn said with pardonable pride'

In [15]:
# #Let's try to use words not in the vocabulary.
# text = "Glen, do you like tea"
# tokenizer.encode(text)
# #Of course there's a key error i.e Glen. This highlights the need to consider large and diverse training sets to extend vocabulary.

In [16]:
#Adding Special Context Tokens to handle unknown words.
#I can modify the tokenizer to add a new unknow token |unk| that handles unknown words and |endoftext| token for between unrelated text.
all_tokens = sorted(set(preprocessed))
all_tokens.extend(["<|endoftext|>","<|unk|>"])
vocab = {token:integer for integer, token in enumerate(all_tokens)}

In [17]:
len(vocab)

1132

In [18]:
#Let's see the last 5 entries in the updated vocab.
for i, item in enumerate(list(vocab.items())[-5:]):
  print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [19]:
 #Instantiate a class for tokenization
class SimpleTokenizerV2:
  def __init__(self, vocab):
    self.str_to_int = vocab
    self.int_to_str = {i:s for s, i in vocab.items()} #'s' is our token 'i' is the unique id for vocab.

  def encode(self, text):
    preprocessed = re.split(r'[,.:;?_!"()\']|--|\s', text)
    preprocessed = [item.strip() for item in preprocessed if item.strip()]
    preprocessed = [
        item if item in self.str_to_int
        else "<|unk|>" for item in preprocessed #Dealing with unkown values.
    ]
    ids = [self.str_to_int[s] for s in preprocessed]
    return ids

  def decode(self, ids):
    text = " ".join([self.int_to_str[s] for s in ids])
    #Replacing the spaces before the punctuations
    text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
    return text

In [20]:
#Now let's see what V2 can do.
tokenizer = SimpleTokenizerV2(vocab)

In [21]:
#Our unknowns are "Glen" and "palace".
text1 = "Glen, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1,text2))
print(text)

Glen, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [22]:
tokenizer.encode(text)

[1131, 355, 1126, 628, 975, 1130, 55, 988, 956, 984, 722, 988, 1131]

In [23]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|> do you like tea <|endoftext|> In the sunlit terraces of the <|unk|>'

In [24]:
#There are other 3 special tokens i.e:
# 1: beginning of sequence(BOS) that signifies to the LLM where a piece of content begins.
# 2: end of sequence(EOS) which is useful in concatenating multiple unrelated texts.
# 3: padding (PAD) which ensures all texts have tehe same length.
#GPT models don't rely on these three tokens except for <|endoftext|>. They also don't use <|unk|> for unknowwn words instead they break the words into subwords units through Byte Pair Encoding.

In [25]:
#BPE IS MUCH MORE EFFECTIVE IN TOKENIZATION THUS IT SHALL BE IMPLEMENTED.
! pip3 install tiktoken #tiktoken is BPE tokenizer made for OpenAi models. You can check it on github.



In [26]:
import importlib
import tiktoken
print("Tiktoken version:", importlib.metadata.version("tiktoken"))

Tiktoken version: 0.12.0


In [27]:
#Now instantiating the tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

In [28]:
#Now let's see if it will produce some ids.
text = (
    "Glen, do you like tea? <|endoftext|> In the sunlit terraces of"
    "of someunknownPlace."
)
integers = tokenizer.encode(text,allowed_special={"<|endoftext|>"})
print(integers)

[9861, 268, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 1659, 617, 34680, 27271, 13]


In [29]:
#Converting the ids back to text.
our_text = tokenizer.decode(integers)
print(our_text)

Glen, do you like tea? <|endoftext|> In the sunlit terraces ofof someunknownPlace.


In [30]:
#Let's just use random words and try to encode and decode it.
text = "Dgdneh hdkba"
integers = tokenizer.encode(text)
print(integers)
string = tokenizer.decode(integers)
print(string)

[35, 21287, 710, 71, 289, 34388, 7012]
Dgdneh hdkba


In [31]:
#CREATING INPUT-TARGET PAIRS

In [32]:
#First encode the dataset using BPE
with open("/content/drive/MyDrive/TOKENIZATION/the-verdict.txt","r",encoding="UTF-8") as f:
  raw_text = f.read()

encoded_text = tokenizer.encode(raw_text)
print(len(encoded_text))

5145


In [33]:
#So the vocabulary size is 5145.

In [34]:
#Let's see the remove the first 50 tokens
encoded_sample = encoded_text[50:]

In [35]:
#The context size determines how many tokens are included in the output
context_size = 7 #So the model will look at a sequence of 7 words to to predict the next word in the sequence.
#For example: input x has the first 4 tokens [1,2,3,4] and the target y is the next 4 tokens [2,3,4,5]

x = encoded_sample[:context_size] #Take the first 7.
y = encoded_sample[1:context_size+1] #Shifting the input by 1 position.
print(f"x:{x}")
print(f"y:     {y}")

x:[290, 4920, 2241, 287, 257, 4489, 64]
y:     [4920, 2241, 287, 257, 4489, 64, 319]


In [36]:
#In a for loop it'll look like:
for i in range(1, context_size+1): #This loop will go from 1 to 7
  context = encoded_sample[:i]
  desired = encoded_sample[i]
  print(context,"--------->", desired)

[290] ---------> 4920
[290, 4920] ---------> 2241
[290, 4920, 2241] ---------> 287
[290, 4920, 2241, 287] ---------> 257
[290, 4920, 2241, 287, 257] ---------> 4489
[290, 4920, 2241, 287, 257, 4489] ---------> 64
[290, 4920, 2241, 287, 257, 4489, 64] ---------> 319


In [37]:
#In short, everything left of "--------->" is the input an LLM would receive and the on the right is what it would try to predict.

In [38]:
#Now let's turn it into text to get an overall understanding.
for i in range(1, context_size+1): #This loop will go from 1 to 7
  context = encoded_sample[:i]
  desired = encoded_sample[i]
  print(tokenizer.decode(context),"--------->", tokenizer.decode([desired]))

 and --------->  established
 and established --------->  himself
 and established himself --------->  in
 and established himself in --------->  a
 and established himself in a --------->  vill
 and established himself in a vill ---------> a
 and established himself in a villa --------->  on


In [39]:
#Now the inputs are required to be in tensors (input tensor for the LLM and target tensor for the LLM to predict) hence the introduction of a DATALOADER.

In [40]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
  def __init__(self, txt, tokenizer, max_length, stride): #max_length is the context size.  stride so as to know how much to slide.
    self.input_ids = []
    self.target_ids = []

    #Tokenize the entire text.
    token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

    #Using a sliding window to chunk the book (The verdict) into overlapping sequence of max_length
    for i in range(0, len(token_ids) - max_length, stride): #I don't want to spill over the dataset hence '- max_length'.
      input_chunk = token_ids[i:i + max_length]
      target_chunk = token_ids[i:i + max_length + 1]
      self.input_ids.append(torch.tensor(input_chunk))
      self.target_ids.append(torch.tensor(target_chunk))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.target_ids[idx]

In [41]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0): #drop_last=True so as to prevent loss spikes i.e if the last batch is not the same as the specified batch_size.

  #Initialize the tokenizer.
  tokenizer = tiktoken.get_encoding("gpt2")

  #Create Dataset
  dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

  #Create Dataloader
  dataloader = DataLoader(
      dataset,
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers
  )

  return dataloader

In [42]:
#Testing the dataloader with a batch size of 1 with context size of 4.
with open("/content/drive/MyDrive/TOKENIZATION/the-verdict.txt", "r", encoding="UTF-8") as f:
  raw_text = f.read()

In [43]:
#Next converting the dataloader into a python iterator to fetch the next entry via python's built-in next functionI()
import torch
print("Pytorch version:", torch.__version__)
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

Pytorch version: 2.9.0+cu126
[tensor([[  40,  367, 2885, 1464]]), tensor([[  40,  367, 2885, 1464, 1807]])]


In [44]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[ 367, 2885, 1464, 1807, 3619]])]


In [45]:
#Looking at first_batcch and second_batch we can see the stride is one.
#Smaller batches save computational power but are more noisy during training.

In [46]:
#Let's see the result when the batch_size is greater than 1.
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n",inputs)
print("Targets:\n",targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Targets:
 tensor([[   40,   367,  2885,  1464,  1807],
        [ 1807,  3619,   402,   271, 10899],
        [10899,  2138,   257,  7026, 15632],
        [15632,   438,  2016,   257,   922],
        [  922,  5891,  1576,   438,   568],
        [  568,   340,   373,   645,  1049],
        [ 1049,  5975,   284,   502,   284],
        [  284,  3285,   326,    11,   287]])


In [1]:
#The increase in batch size is to prevent overfitting and to ensure we don't skip a single word and avoid any overlapping.

In [47]:
##Vector embeddings
#The need of capturing the semantic meaning of a word basically.
! pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [48]:
#Performing a demo to see the overall vector embedding.
import gensim.downloader as api
model = api.load("word2vec-google-news-300") #300 dimensional vectors, trained on a google news dataset



In [49]:
#Let's see the representation of 'cat' as a 300-dimensional vector.
vector_dict = model
print(vector_dict['cat'])

[ 0.0123291   0.20410156 -0.28515625  0.21679688  0.11816406  0.08300781
  0.04980469 -0.00952148  0.22070312 -0.12597656  0.08056641 -0.5859375
 -0.00445557 -0.296875   -0.01312256 -0.08349609  0.05053711  0.15136719
 -0.44921875 -0.0135498   0.21484375 -0.14746094  0.22460938 -0.125
 -0.09716797  0.24902344 -0.2890625   0.36523438  0.41210938 -0.0859375
 -0.07861328 -0.19726562 -0.09082031 -0.14160156 -0.10253906  0.13085938
 -0.00346375  0.07226562  0.04418945  0.34570312  0.07470703 -0.11230469
  0.06738281  0.11230469  0.01977539 -0.12353516  0.20996094 -0.07226562
 -0.02783203  0.05541992 -0.33398438  0.08544922  0.34375     0.13964844
  0.04931641 -0.13476562  0.16308594 -0.37304688  0.39648438  0.10693359
  0.22167969  0.21289062 -0.08984375  0.20703125  0.08935547 -0.08251953
  0.05957031  0.10205078 -0.19238281 -0.09082031  0.4921875   0.03955078
 -0.07080078 -0.0019989  -0.23046875  0.25585938  0.08984375 -0.10644531
  0.00105286 -0.05883789  0.05102539 -0.0291748   0.193359

In [61]:
#Just confirming the shape.
print(vector_dict['cat'].shape)

(300,)


In [53]:
#Now there's a semantic meaning between say 'King', 'Woman' & 'man', let's see how such a scenario is captured.
print(vector_dict.most_similar(positive=["king","woman"], negative=["man"],topn=10)) #In short king+woman=queen.

[('queen', 0.7118193507194519), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321839332581), ('kings', 0.5236844420433044), ('Queen_Consort', 0.5235945582389832), ('queens', 0.518113374710083), ('sultan', 0.5098593235015869), ('monarchy', 0.5087411403656006)]


In [58]:
#Checcking similarity between random pairs of words.
print(vector_dict.similarity("boy","girl"))
print(vector_dict.similarity("man","woman"))
print(vector_dict.similarity("uncle","aunt"))
print(vector_dict.similarity("cow","pluto"))

0.8543272
0.76640123
0.7643474
0.154539


In [59]:
#Looking for most similar words.
print(vector_dict.most_similar(positive=["Beach"],topn=10))

[('beach', 0.6789422631263733), ('John_Pacenti_Palm', 0.6141721606254578), ('Shores', 0.5926690101623535), ('Beach_Oceanfront', 0.5917191505432129), ('Oceanfront', 0.5849085450172424), ('beachfront', 0.5822123885154724), ('Haulover_Park', 0.5812641978263855), ('Beaches', 0.5807040333747864), ('beaches', 0.571523904800415), ('&_Crematory_Boynton', 0.5706855654716492)]
