In [7]:
import regex as re

In [3]:
from importlib.metadata import version

# Report the installed version of each package this notebook relies on.
for label, package in (
    ("torch version: ", 'torch'),
    ("tiktokenizer version: ", 'tiktoken'),
):
    print(label, version(package))

torch version:  2.3.0+cpu
tiktokenizer version:  0.7.0


In [112]:
# Read the whole corpus into memory as a single string.
with open('the-verdict.txt', mode='r', encoding='utf-8') as verdict_file:
    text = verdict_file.read()

# Sanity check: overall size plus the first 99 characters.
print("total_number_of_characters: ", len(text))
print(text[:99])


total_number_of_characters:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [82]:
# Tokenize the text so it can later be embedded for an LLM.
# Start with a simple regex-based tokenization of the loaded text.
#
# The capture group makes re.split keep the separators (punctuation, "--" and
# single whitespace characters) as tokens of their own. Only the empty strings
# produced by the split are dropped, so whitespace separators remain in the list.
preprocessed = [
    piece
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text)
    if piece
]


In [83]:

# Number of entries in the token list (separators that survived filtering are counted too).
print("Number of tokens:", len(preprocessed))

Number of tokens: 8405


### Converting token to token IDs

- Next, we convert the text tokens into token IDs that we can process via embedding layers later
- For this we first need to build a vocabulary

In [84]:


# Deduplicate first, then sort. The original order of operations
# (set(sorted(...))) discarded the sort, leaving the vocabulary in arbitrary
# set-iteration order, so the token IDs derived from it could differ between
# kernel sessions. A sorted list gives every token a stable position.
all_words = sorted(set(preprocessed))

print("Number of unique tokens:", len(all_words))



Number of unique tokens: 1132


In [85]:
# Map each unique token to an integer ID given by its position in all_words.
vocab = dict(zip(all_words, range(len(all_words))))

### Create a tokenizer class to encode and decode text using the vocabulary

In [86]:
class SimpleTokenizerv1():
    """Vocabulary-backed tokenizer that converts text to token IDs and back.

    Tokens that are missing from the vocabulary are not handled: looking them
    up raises ``KeyError``.
    """

    def __init__(self,vocab) -> None:
        """Keep the token -> ID mapping and build the inverse ID -> token mapping."""
        self.str_to_int = vocab
        self.int_to_str = dict((token_id, token) for token, token_id in vocab.items())

    def encode(self,text):
        """Split ``text`` on punctuation, "--" and whitespace and return the token IDs.

        Whitespace-only and empty pieces are discarded before the lookup.
        """
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        tokens = []
        for piece in pieces:
            stripped = piece.strip()
            if stripped:
                tokens.append(stripped)
        return [self.str_to_int[token] for token in tokens]

    def decode(self,ids):
        """Join the tokens for ``ids`` with single spaces and tidy the punctuation."""
        words = [self.int_to_str[i] for i in ids]
        joined = " ".join(words)
        # Remove the whitespace that the join inserted before these punctuation marks.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)


        


In [87]:
# Instantiate the simple tokenizer with the token -> ID vocabulary.
tokenizerv1  = SimpleTokenizerv1(vocab)

### Byte Pair Encoding

GPT-2 used BytePair encoding (BPE) as its tokenizer
It allows the model to break down words that aren't in its predefined vocabulary into smaller subword units or even individual characters, enabling it to handle out-of-vocabulary words.
For instance, if GPT-2's vocabulary doesn't have the word "unfamiliarword," it might tokenize it as ["unfam", "iliar", "word"] or some other subword breakdown, depending on its trained BPE merges
The original BPE tokenizer can be found here: https://github.com/openai/gpt-2/blob/master/src/encoder.py
In this lecture, we are using the BPE tokenizer from OpenAI's open-source tiktoken library, which implements its core algorithms in Rust to improve computational performance
(Based on an analysis here, I found that tiktoken is approx. 3x faster than the original tokenizer and 6x faster than an equivalent tokenizer in Hugging Face)

In [None]:
"""Byte pair encoding utilities"""

import os
import json
import regex as re
from functools import lru_cache

@lru_cache()
def bytes_to_unicode():
    """
    Build the lookup table from every byte value (0-255) to a one-character unicode string.

    Bytes inside the three ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the
    character with the same code point. Every remaining byte is mapped, in
    increasing byte order, to the next unused code point starting at 256, so
    none of those bytes maps to itself.

    The result is memoized by lru_cache, so repeated calls return the same dict.
    """
    printable = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    byte_values = list(printable)
    code_points = list(printable)
    shift = 0
    for byte in range(256):
        if byte in printable:
            continue
        # Not one of the kept characters: assign a stand-in code point above 255.
        byte_values.append(byte)
        code_points.append(256 + shift)
        shift += 1
    return {byte: chr(code_point) for byte, code_point in zip(byte_values, code_points)}

def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word.

    Word is represented as a tuple of symbols (symbols being variable-length
    strings); any sliceable sequence such as a plain string works as well.

    Returns:
        A set of ``(left, right)`` tuples, one for each pair of neighbouring
        symbols. An empty or single-symbol word yields an empty set.
    """
    # Pairing the word with itself shifted by one walks every adjacent pair
    # and, unlike indexing word[0], does not fail on an empty word.
    return set(zip(word, word[1:]))

class Encoder:
    """Byte-level BPE tokenizer built from a token -> ID table and an ordered merge list."""

    def __init__(self, encoder, bpe_merges, errors='replace'):
        """Set up the lookup tables used by encode() and decode().

        Args:
            encoder: mapping from BPE token string to integer ID.
            bpe_merges: sequence of symbol pairs; a pair's position in the
                sequence becomes its rank, and bpe() merges the lowest rank first.
            errors: error-handling mode passed to ``bytes.decode`` in decode().
        """
        self.encoder = encoder
        self.decoder = {v:k for k,v in self.encoder.items()}
        self.errors = errors # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        # Memoizes bpe() results per input token.
        self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

    def bpe(self, token):
        """Apply the ranked merges to ``token`` and return its symbols joined by spaces.

        A token with fewer than two symbols is returned unchanged (and not cached).
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Pick the adjacent pair with the lowest rank; unknown pairs rank as infinity.
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                # No remaining pair is a known merge.
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    # tuple.index raises ValueError when `first` does not occur
                    # again: copy the remainder unchanged. Only this exception
                    # is caught so unrelated errors are not silently swallowed.
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Return the list of token IDs for ``text``.

        The text is split with ``self.pat``; each piece is converted to its
        UTF-8 bytes, mapped through ``byte_encoder``, merged by bpe(), and the
        resulting space-separated symbols are looked up in ``self.encoder``.
        """
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        """Return the text for a sequence of token IDs.

        The token strings are concatenated, each character is mapped back to
        its byte via ``byte_decoder``, and the bytes are decoded as UTF-8 using
        the ``errors`` mode given at construction.
        """
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
        return text

def get_encoder(model_name, models_dir):
    """Load the vocabulary and merge list for a model and build an Encoder.

    Reads ``encoder.json`` and ``vocab.bpe`` from ``<models_dir>/<model_name>/``.

    Args:
        model_name: name of the model sub-directory.
        models_dir: directory that contains the model sub-directories.

    Returns:
        An ``Encoder`` constructed from the loaded token table and merges.
    """
    model_dir = os.path.join(models_dir, model_name)
    # Read both files as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(os.path.join(model_dir, 'encoder.json'), 'r', encoding="utf-8") as f:
        encoder = json.load(f)
    with open(os.path.join(model_dir, 'vocab.bpe'), 'r', encoding="utf-8") as f:
        bpe_data = f.read()
    # The first line and the last element of the split are skipped
    # (presumably a header line and the empty string after the final newline).
    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
    return Encoder(
        encoder=encoder,
        bpe_merges=bpe_merges,
    )

In [92]:
def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word, printing each symbol visited.

    Word is represented as a tuple of symbols (symbols being variable-length
    strings); a plain string works as well.

    Every symbol after the first is printed as it is paired with its
    predecessor, which makes the traversal visible when exploring the function.
    An empty or single-symbol word yields an empty set and prints nothing.

    NOTE: this definition rebinds the name ``get_pairs``, so code that looks
    the name up afterwards (e.g. ``Encoder.bpe``) will use this printing variant.
    """
    pairs = set()
    # Pairing the word with itself shifted by one avoids indexing word[0],
    # which raised IndexError for an empty word.
    for prev_char, char in zip(word, word[1:]):
        print(char)
        pairs.add((prev_char, char))
    return pairs

In [93]:
# Try the function on a sample string; the returned set of pairs is shown as the cell result.
get_pairs('hi anil')

i
 
a
n
i
l


{(' ', 'a'), ('a', 'n'), ('h', 'i'), ('i', ' '), ('i', 'l'), ('n', 'i')}

In [96]:
import importlib
import importlib.metadata
import tiktoken

# Report the installed version of the tiktoken package.
print('tiktokn version',importlib.metadata.version('tiktoken'))

tiktokn version 0.7.0


In [98]:
# Load the tiktoken encoding registered under the name 'gpt2'.
tokenizer = tiktoken.get_encoding('gpt2')

In [101]:
# Use a dedicated name for this sample: assigning it to `text` would overwrite
# the corpus loaded from the-verdict.txt, and later cells that pass `text` to
# the data loader would then silently receive this short string when the
# notebook is run from top to bottom.
special_token_text = (
    "<|endoftext|>  <|endoftext|>  <|endoftext|>  Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

# allowed_special lets the literal "<|endoftext|>" marker be encoded as a special token.
integers = tokenizer.encode(special_token_text, allowed_special={"<|endoftext|>"})

print(integers)

[50256, 220, 220, 50256, 220, 220, 50256, 220, 18435, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [102]:
# Convert the token IDs back into text to inspect the round trip.
strings = tokenizer.decode(integers)

print(strings)

<|endoftext|>  <|endoftext|>  <|endoftext|>  Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [103]:
# Encode an arbitrary string to inspect the token IDs it is broken into.
tokenizer.encode("Akwirw ier", allowed_special={"<|endoftext|>"})

[33901, 86, 343, 86, 220, 959]

### Data Sampling with a sliding window
- Above, we took care of the tokenization (converting text into word tokens represented as token ID numbers)
- Now, let's talk about how we create the data loading for LLMs
- We train LLMs to generate one word at a time, so we want to prepare the training data accordingly, where the next word in a sequence represents the target to predict

In [113]:
from supplementary import create_dataloader_v1

# Show how much text is handed to the data loader.
print("Text length:", len(text))

dataloader = create_dataloader_v1(text, batch_size=8, max_length=4, stride=4, shuffle=False)

# Fetch the first batch; report it if the loader yields nothing at all.
data_iter = iter(dataloader)
first_batch = next(data_iter, None)
if first_batch is None:
    print("DataLoader is empty or exhausted.")
else:
    inputs, targets = first_batch
    print("Inputs:\n", inputs)
    print("\nTargets:\n", targets)

Text length: 20479
Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
