# Tokenizers

In [5]:
DRIVE_LINK = "/content/drive/MyDrive/LLMFromScratch"

## Word based tokenizer

In [6]:
import re

In [7]:
with open(f"{DRIVE_LINK}/verdict.txt", "r") as f:
  data = f.read()

In [8]:
data

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [9]:
processed_data = re.split(r'([.,"!?]|--|\s)', data)
# Now we wanted to remove the space
processed_data = [d for d in processed_data if d.strip()]
processed_data[:10]

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius']

In [10]:
# Assign token ids by sorting the list first and then assigning enum values
vocab_set = sorted(set(processed_data))
vocab_dict_encoded = {vocab_string: number for number, vocab_string in enumerate(vocab_set)}
vocab_dict_decoded = {v: k for k, v in vocab_dict_encoded.items()}

In [11]:
# Now lets create a class with method encode and decode

In [12]:
class WordTokenizer:

  def __init__(self, vocab):
    self.vocab = vocab
    self.vocab_encoded = {vocab_string: number for number, vocab_string in enumerate(self.vocab)}
    self.vocab_decoded = {v: k for k, v in self.vocab_encoded.items()}

  def encode(self, text):
    processed_data = re.split(r'([.,"!?]|--|\s)', text)
    # Now we wanted to remove the space
    processed_data = [d for d in processed_data if d.strip()]
    # Convert to tokens and return the list
    token_ids = [self.vocab_encoded[processed] for processed in processed_data]
    return token_ids

  def decode(self, tokens):
    token_to_str = [self.vocab_decoded[token] for token in tokens]
    decoded_string = " ".join(token_to_str)
    return decoded_string

In [13]:
data[:33]

'I HAD always thought Jack Gisburn'

In [14]:
tokenizer = WordTokenizer(vocab_set)
text = "I HAD always thought Jack Gisburn"

token_list = tokenizer.encode(text)
token_list

[66, 56, 187, 1059, 74, 47]

In [15]:
tokenizer.decode(token_list) # So the basic tokenizer is working fine

'I HAD always thought Jack Gisburn'

In [16]:
# it cannot handle unknowns
text = "Hello I am happy"
token_list = tokenizer.encode(text)
token_list

KeyError: 'Hello'

In [17]:
# New vocab set where unknowns and end of texts are handled
vocab_set.extend(['<UNK>','<|endoftext|>'])

In [18]:
class WordTokenizerV2:

  def __init__(self, vocab):
    self.vocab = vocab
    self.vocab_encoded = {vocab_string: number for number, vocab_string in enumerate(self.vocab)}
    self.vocab_decoded = {v: k for k, v in self.vocab_encoded.items()}

  def encode(self, text):
    processed_data = re.split(r'([.,"!?]|--|\s)', text)
    # Now we wanted to remove the space
    processed_data = [d for d in processed_data if d.strip()]
    processed_data.append('<|endoftext|>')
    # Convert to tokens and return the list
    token_ids = [self.vocab_encoded[processed] if processed in self.vocab
                 else self.vocab_encoded['<UNK>']
                 for processed in processed_data
                 ]
    return token_ids

  def decode(self, tokens):
    token_to_str = [self.vocab_decoded[token] for token in tokens]
    decoded_string = " ".join(token_to_str)
    return decoded_string

In [19]:
tokenizer = WordTokenizerV2(vocab_set)
text = "I HAD always thought Jack Gisburn"

token_list = tokenizer.encode(text)
token_list

[66, 56, 187, 1059, 74, 47, 1194]

In [20]:
tokenizer.decode(token_list) # So the basic tokenizer is working fine

'I HAD always thought Jack Gisburn <|endoftext|>'

In [21]:
# it cannot handle unknowns
text = "Hello I am happy"
token_list = tokenizer.encode(text)
token_list

[1193, 66, 1193, 1193, 1194]

In [22]:
tokenizer.decode(token_list) # So the basic tokenizer is working fine

'<UNK> I <UNK> <UNK> <|endoftext|>'

## BPE (Sub word tokenization)
Even though word or character can be useful in somecases, it cannot differentiate unknown words. For instance "boy" and "boys" will be treated as separate vocab. Thats why we need to use BPE which is used by GPT itself

In [23]:
!pip install tiktoken



In [24]:
import tiktoken

In [25]:
tokenizer = tiktoken.get_encoding("gpt2") # we dont have to use our own vocab set. We will GPT2's

text = "I HAD always thought Jack Gisburn"

token_list = tokenizer.encode(text)
token_list

[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899]

In [26]:
tokenizer.decode(token_list) # So the basic tokenizer is working fine

'I HAD always thought Jack Gisburn'

In [27]:
text = "Hello I am happy"
token_list = tokenizer.encode(text)
token_list

[15496, 314, 716, 3772]

In [28]:
tokenizer.decode(token_list) # So the basic tokenizer is working fine

'Hello I am happy'

In [29]:
text = "Aidhsadisafgsafsa fsfs <|endoftext|>"
token_list = tokenizer.encode(text,allowed_special={"<|endoftext|>"})
token_list

[44245, 11994, 324, 271, 1878, 14542, 1878, 11400, 43458, 9501, 220, 50256]

In [30]:
tokenizer.decode(token_list) # So the basic tokenizer is working fine even with gibberish texts, it can do BPE

'Aidhsadisafgsafsa fsfs <|endoftext|>'

# Preparing a data loader pair
Input and target pair is created with the given context window
The LLM predict context number times

In [31]:
with open(f"{DRIVE_LINK}/verdict.txt", "r") as f:
  data = f.read()

In [43]:
tokenized_id = tokenizer.encode(data,allowed_special={"<|endoftext|>"})
max_len = 4
stride = 4
input_pair_list = []
target_pair_list = []
for i in range(0, (len(tokenized_id) - max_len), stride):
  input_pair = tokenized_id[i:i+max_len]
  target_ids = tokenized_id[i+1:(i+max_len+1)]
  input_pair_list.append(input_pair)
  target_pair_list.append(target_ids)



In [46]:
input_pair_list[:4], target_pair_list[:4]

([[40, 367, 2885, 1464],
  [1807, 3619, 402, 271],
  [10899, 2138, 257, 7026],
  [15632, 438, 2016, 257]],
 [[367, 2885, 1464, 1807],
  [3619, 402, 271, 10899],
  [2138, 257, 7026, 15632],
  [438, 2016, 257, 922]])

In [51]:
import torch
from torch.utils.data import Dataset, DataLoader

# Dataset is used to create a dataset with input and target pairs
# whereas data loaders prepares batches of data
# pair should be [1,2,3,4,5] where input is [1,2,3,4],[5,...] and target id is[2,3,4,5] if the context length is 4

class GPTData(Dataset):

  def __init__(self, text, max_len, tokenizer, strides):
    self.input_pairs = []
    self.target_ids = []

    tokenized_id = tokenizer.encode(text,allowed_special={"<|endoftext|>"})

    for i in range(0, (len(tokenized_id) - max_len), stride):
      input_pair = tokenized_id[i:i+max_len]
      target_id = tokenized_id[i+1:(i+max_len+1)]
      self.input_pairs.append(torch.tensor(input_pair))
      self.target_ids.append(torch.tensor(target_id))

  # from pytorch documentation
  # A custom Dataset class must implement three functions: __init__, __len__, and __getitem__
  def __len__(self):
    return len(self.input_pairs)

  def __getitem__(self, idx):
    return self.input_pairs[idx], self.target_ids[idx]


In [63]:
def create_dataloaders(text, tokenizer, max_len=4, strides=4, shuffle = True):
  dataset = GPTData(text, max_len, tokenizer, strides)

  dataloader = DataLoader(
      dataset,
      batch_size = 4,
      shuffle = shuffle,
      num_workers=0,
      drop_last = True # to remove the last batch if it doesnt fit
  )
  return dataloader

In [64]:
sample_dataloader = create_dataloaders(data, tokenizer)

In [65]:
data_iter = iter(sample_dataloader)

inputs, targets = next(data_iter)

# Print them
print("Inputs:\n", inputs)
print("Targets:\n", targets)

Inputs:
 tensor([[  11,  355,  345,  910],
        [ 340,   26,  290, 3619],
        [ 510,  351,  617, 9105],
        [ 438, 5562,  373,  644]])
Targets:
 tensor([[ 355,  345,  910,   13],
        [  26,  290, 3619,  338],
        [ 351,  617, 9105, 7521],
        [5562,  373,  644,  262]])


In [66]:
# In GPT bigset of context length and strides are used 256 and 128 strides and we are also going to shuffle
dataloader_gpt = create_dataloaders(data, tokenizer, 256, 128, True)

In [67]:
data_iter = iter(dataloader_gpt)

inputs, targets = next(data_iter)

# Print them
print("Inputs:\n", inputs)
print("Targets:\n", targets)

Inputs:
 tensor([[   30,  1400,   438,  ...,  4808, 10134,    62],
        [  550,  6405,   607,  ..., 26546,  1068,   465],
        [  262,  4269,    11,  ...,  4369,    11,   523],
        [  922,  5891,  1576,  ...,  1986,   262,  1109]])
Targets:
 tensor([[ 1400,   438,  1640,  ..., 10134,    62,   339],
        [ 6405,   607,    13,  ...,  1068,   465,  1242],
        [ 4269,    11, 22211,  ...,    11,   523,   326],
        [ 5891,  1576,   438,  ...,   262,  1109,   351]])
