<a href="https://colab.research.google.com/github/onevay/llm_from_scratch/blob/main/tokenize_llm_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text tokenization

In [None]:
with open("/content/Build-a-Large-Language-Model-_From-Scratch_-.txt", "r") as f:
    row_text = f.read()

print(f"Total number of characters: {len(row_text)}")
print(f"Start data: {row_text[:100]}")

Total number of characters: 420297
Start data: Build a Large Language Model (From Scratch)
 1.   welcome
 2.   1_Understanding_Large_Language_Mod


In [None]:
import re

example = "This is example,  rate."
result = re.split(r"(\s)+", example)
print(result)

['This', ' ', 'is', ' ', 'example,', ' ', 'rate.']


In [None]:
result = re.split(r"([,.!?]|\s)+", example)
print(result)

['This', ' ', 'is', ' ', 'example', ' ', 'rate', '.', '']


In [None]:
text = "Hello, world. This --a test."

result = re.split(r"([,.:;{}!?'\"|/]|\s|--)+", text)
result = [token for token in result if token.strip()]
print(result)

['Hello', 'world', 'This', '--', 'a', 'test', '.']


In [None]:
text_tokens = re.split(r"([,.:;{}!?'\"|/]|\s|--)+", row_text)
text_tokens = [token for token in text_tokens if token.strip()]
print(f"Text was split on {len(text_tokens)} tokens")

Text was split on 69444 tokens


In [None]:
vocab = {token: id for id, token in enumerate(sorted(set(text_tokens)))}
for idx, i in enumerate(vocab.items()):
    if idx > 30:
        break
    print(i)

('!', 0)
('"', 1)
('#', 2)
('###', 3)
('#A', 4)
('#B', 5)
('#C', 6)
('#D', 7)
('#E', 8)
('#F', 9)
('#G', 10)
('#H', 11)
('#I', 12)
('#J', 13)
('#K', 14)
('#L', 15)
('#M', 16)
('$30', 17)
('$30)', 18)
('$4', 19)
('$690', 20)
('%', 21)
('%timeit', 22)
("'", 23)
('(', 24)
('((global_step', 25)
('()\\', 26)
('(-inf)', 27)
('(-∞)', 28)
('(0', 29)
('(0)', 30)


In [None]:
print(len(vocab))

7048


# Tokenizer class

In [None]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [None]:
tokenizer = SimpleTokenizerV1(vocab)

sample_text = """Training deep neural networks with many layers can sometimes prove
challenging due to issues like vanishing or exploding gradients."""
ids = tokenizer.encode(sample_text)
print(ids)

[2229, 3304, 4930, 4928, 6935, 4669, 4497, 2858, 6100, 5526, 2903, 3523, 6511, 4386, 4554, 6810, 5115, 3771, 4049, 283]


## V2

In [None]:
all_tokens = re.split(r"([,.:_;{}()!?'\"|/%]|\s|--)+", row_text)
all_tokens = [token for token in all_tokens if token is not None and token.strip()]
all_tokens = sorted(set(all_tokens))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token: id for id, token in enumerate(all_tokens)}
print(f"Vocab lenght: {len(vocab)}")

Vocab lenght: 5879


In [None]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r"([,.:_;{}()!?'\"|/%]|\s|--)+", text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int
            else "<|unk|>" for item in preprocessed
        ]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [None]:
tokenizerv2 = SimpleTokenizerV2(vocab)

In [None]:
tokenizerv2.encode("Hello, simple example")

[1434, 5099, 3155]

In [None]:
tokenizerv2.encode("Bebebe")

[5878]

In [None]:
tokenizerv2.decode([5878])

'<|unk|>'

# Tiktoken BPE tokenizer

In [None]:
!pip install tiktoken transformers



In [None]:
import importlib
import tiktoken

print(f"Tiktoken version: {importlib.metadata.version("tiktoken")}")

Tiktoken version: 0.12.0


In [None]:
tiktokenizer = tiktoken.get_encoding("gpt2")

In [None]:
from transformers import GPT2TokenizerFast
tokenizer = GPT2TokenizerFast.from_pretrained("Xenova/gpt-4")

In [None]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)
# tokens1 = tiktokenizer.encode(text)
tokens2 = tokenizer.encode(text)
# print(f"tokens1: {tokens1}")
print(f"tokens2: {tokens2}")

tokens2: [9906, 11, 656, 499, 1093, 15600, 30, 220, 100257, 763, 279, 7160, 32735, 7317, 2492, 1073, 1063, 16476, 17826, 13]


# Input-target

In [None]:
text_tokens = tokenizer.encode(row_text)
inc_tokens = text_tokens[1000:]

Token indices sequence length is longer than the specified maximum sequence length for this model (99736 > 8192). Running this sequence through the model will result in indexing errors


In [None]:
window_size = 4
for i in range(window_size):
    input = inc_tokens[:i+1]
    target = inc_tokens[i+1]
    print(f"Input--> {input}")
    print(f"Target-->{'       '*(i+1)}{target}")

Input--> [2038]
Target-->       323
Input--> [2038, 323]
Target-->              42129
Input--> [2038, 323, 42129]
Target-->                     1169
Input--> [2038, 323, 42129, 1169]
Target-->                            552


In [None]:
window_size = 4
for i in range(window_size):
    input = inc_tokens[:i+1]
    target = inc_tokens[i+1]
    print(f"Input--> {tokenizer.decode(input)}")
    print(f"Target-->{'       '*(i+1)}{tokenizer.decode(target)}")

Input-->  information
Target-->        and
Input-->  information and
Target-->               subt
Input-->  information and subt
Target-->                     let
Input-->  information and subtlet
Target-->                            ies


# Dataset

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

class GPTDatasetV1(Dataset):

    def __init__(self, text, max_length, tokenizer, stride=1):
        self.input_ids = []
        self.target_ids = []

        self.tokens = tokenizer.encode(text)
        for i in range(0, len( self.tokens) - max_length, stride):
            input_row = torch.tensor( self.tokens[i:i+max_length])
            target_row = torch.tensor(self.tokens[i+1:i+max_length+1])
            self.input_ids.append(input_row)
            self.target_ids.append(target_row)

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, ids):
        return self.input_ids[ids], self.target_ids[ids]

In [None]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):

    tokenizer = GPT2TokenizerFast.from_pretrained("Xenova/gpt-4")

    dataset = GPTDatasetV1(txt, max_length, tokenizer, stride)

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [None]:
with open("/content/Build-a-Large-Language-Model-_From-Scratch_-.txt", "r", encoding="utf-8") as f:
    row_text = f.read()

In [None]:
dataloader = create_dataloader_v1(row_text, batch_size=4, max_length=4, stride=156, shuffle=False, drop_last=True, num_workers=0)
#stride help for fix overfitting
iterator = iter(dataloader)
first_step = next(iterator)
print(f"First step: {first_step}")

Token indices sequence length is longer than the specified maximum sequence length for this model (99736 > 8192). Running this sequence through the model will result in indexing errors


First step: [tensor([[  200,   200, 11313,   264],
        [  198, 13359,   499,   369],
        [11537,   449,   856,   990],
        [   11, 23391,   912,   198]]), tensor([[  200, 11313,   264, 20902],
        [13359,   499,   369, 23395],
        [  449,   856,   990,    11],
        [23391,   912,   198, 42641]])]


# Embeddings

In [None]:
!pip install gensim



In [None]:
import gensim.downloader as download_api
embedding_model = download_api.load('word2vec-google-news-300')

In [None]:
embedding_model.get_vector('Word').shape

(300,)

In [None]:
input_ids = torch.tensor([5, 3, 1, 2])

vocab_size = 6
output_size = 4
embedding = torch.nn.Embedding(vocab_size, output_size)

In [None]:
print(f"Embedding weight: {embedding.weight}")

Embedding weight: Parameter containing:
tensor([[ 0.6184,  1.0475, -0.1818,  2.4886],
        [ 0.1928, -0.4946, -0.4927, -0.4495],
        [ 0.1808, -0.4505,  0.7139, -0.3409],
        [ 1.1522,  0.2217,  1.3078, -0.5854],
        [-1.4921, -0.5727, -1.4264, -1.0420],
        [-1.6150,  1.0893,  1.7864, -0.5186]], requires_grad=True)


In [None]:
print(f"Needed ids: {embedding.weight[input_ids]}")

Needed ids: tensor([[-1.6150,  1.0893,  1.7864, -0.5186],
        [ 1.1522,  0.2217,  1.3078, -0.5854],
        [ 0.1928, -0.4946, -0.4927, -0.4495],
        [ 0.1808, -0.4505,  0.7139, -0.3409]], grad_fn=<IndexBackward0>)


In [None]:
token_embeddings = embedding(input_ids)
print(token_embeddings.shape)

torch.Size([4, 4])


In [None]:
vocab_size = 81075
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [None]:
max_length = 4
dataloader = create_dataloader_v1(
    row_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

Token indices sequence length is longer than the specified maximum sequence length for this model (99736 > 8192). Running this sequence through the model will result in indexing errors


In [None]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[  200,   200, 11313,   264],
        [20902, 11688,  5008,   320],
        [ 3915, 81074,   340,   220],
        [   16,    13,   256, 10788],
        [  198,   220,    17,    13],
        [  256,   220,    16,  6803],
        [  910, 10276,  2406,  2812],
        [ 2406,  2681, 22357,    82]])

Inputs shape:
 torch.Size([8, 4])


In [None]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [None]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [None]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [None]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
