In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported local
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
# NOTE(review): absolute, machine-specific path. os.chdir changes the working
# directory for the whole kernel, so every later relative path resolves from here.
os.chdir('/media/mistertandon/DATA/git_repos/ai/01-llm-rsbt/settings')

# Read the entire text file into memory as one UTF-8 decoded string.
with open('the-verdict.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

In [3]:
import re

In [4]:
import sys
# Put the ch_02 directory first on the module search path so the local module
# imported below can be found.
# NOTE(review): absolute, machine-specific path.
sys.path.insert(0, '/media/mistertandon/DATA/git_repos/ai/01-llm-rsbt/ch_02')

from ch02_03_verdict_vocab import Vocabulary


In [5]:
# Build the token -> id vocabulary from raw_text with the Vocabulary helper.
# NOTE(review): the helper's internals live in ch02_03_verdict_vocab and are not
# visible here; the step descriptions below are inferred from the method names.
vocab_builder = Vocabulary(raw_text)
# Split pattern: one of , . : ; ? _ ! " ( ) ' or "--" or a whitespace character.
# Any tokenizer that encodes text against this vocabulary should split the same way.
vocab_builder.tokenize(r'([,.:;?_!"()\']|--|\s)')
vocab_builder.get_unique_tokens()
vocab_builder.sort_unique_tokens()
# Special tokens: a document-boundary marker and a placeholder for unknown words.
vocab_builder.add_special_tokens(["<|endoftext|>", "<|unk|>"])
vocab_builder.build_vocabulary()

# The resulting mapping is consumed below via .items() and len().
vocabulary_v1 = vocab_builder.get_vocabulary()

In [6]:
# Show the last five (token, id) entries of the vocabulary, then its total size.
tail_entries = list(vocabulary_v1.items())[-5:]
for idx, item in enumerate(tail_entries):
    print(f"{idx}: {item}")

print(len(vocabulary_v1))

0: ('younger', 1127)
1: ('your', 1128)
2: ('yourself', 1129)
3: ('<|endoftext|>', 1130)
4: ('<|unk|>', 1131)
1132


In [7]:
class SimpleTokenizerV2:
    """Word/punctuation-level tokenizer backed by a fixed vocabulary.

    Text is split on punctuation, "--" and whitespace; every resulting token
    is looked up in the vocabulary, and tokens that are missing from it are
    replaced by the '<|unk|>' token before the lookup.

    Args:
        vocab: mapping from token string to integer id. It must contain
            '<|unk|>' for out-of-vocabulary text to be encodable.
    """

    # Split pattern for encode(). It matches the pattern used to build the
    # vocabulary in this notebook, including ':' and ';'. If the two differ,
    # text such as "word;" is never split and falls back to '<|unk|>' even
    # though both "word" and ";" are in the vocabulary.
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
    # Whitespace followed by one punctuation character; decode() drops the
    # whitespace so punctuation attaches to the preceding token again.
    _SPACE_BEFORE_PUNCT_PATTERN = r'\s+([,.:;?_!"()\'])'

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Reverse mapping (id -> token) used by decode().
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token ids.

        Args:
            text: the string to tokenize.

        Returns:
            A list of integer ids, one per non-empty token. Tokens that are
            not in the vocabulary are encoded as the id of '<|unk|>'.

        Raises:
            KeyError: if an unknown token is met and the vocabulary has no
                '<|unk|>' entry.
        """
        pieces = re.split(self._SPLIT_PATTERN, text)
        # re.split keeps the captured delimiters; drop whitespace-only and
        # empty pieces so only words and punctuation remain.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        tokens = [token if token in self.str_to_int else '<|unk|>' for token in tokens]

        return [self.str_to_int[token] for token in tokens]

    def decode(self, token_ids):
        """Convert a sequence of token ids back into a string.

        Args:
            token_ids: iterable of integer ids present in the vocabulary.

        Returns:
            The tokens joined by single spaces, with the space in front of
            punctuation characters removed.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = ' '.join(self.int_to_str[token_id] for token_id in token_ids)
        return re.sub(self._SPACE_BEFORE_PUNCT_PATTERN, r'\1', text)


In [8]:
# Tokenizer bound to the vocabulary built above (which includes the special tokens).
tokenizer = SimpleTokenizerV2(vocabulary_v1)

In [9]:
# Round-trip a sentence through encode/decode.
# Per the recorded output, "Hello" is encoded as 1131, the id of '<|unk|>',
# so it comes back as '<|unk|>' after decoding.
dummy_text_v1 = "Hello, do you like tea?"

dummy_text_token_ids_v1 = tokenizer.encode(dummy_text_v1)
print("Token IDs:", dummy_text_token_ids_v1)
print(tokenizer.decode(dummy_text_token_ids_v1))

Token IDs: [1131, 5, 355, 1126, 628, 975, 10]
<|unk|>, do you like tea?


In [11]:
# Join two independent sample texts with the '<|endoftext|>' marker between them.
# The spaces around the marker let the whitespace split isolate it as one token.
raw_text_v1 = "Hello, do you like tea?"
raw_text_v2 = "In the sunlit terraces of the palace."
raw_text_v1v2 = " <|endoftext|> ".join((raw_text_v1, raw_text_v2))
print(raw_text_v1v2)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [12]:
# Encode the joined text. In the recorded output the marker appears as id 1130
# ('<|endoftext|>') and out-of-vocabulary words as id 1131 ('<|unk|>').
raw_text_v1v2_token_ids = tokenizer.encode(raw_text_v1v2)
print("Token IDs:", raw_text_v1v2_token_ids)

Token IDs: [1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]


In [13]:
# Decode the ids back to text. Words that were encoded as '<|unk|>' cannot be
# recovered, so the result differs from the original input at those positions.
decoded_text_v1v2 = tokenizer.decode(raw_text_v1v2_token_ids)
print("Decoded text:", decoded_text_v1v2)

Decoded text: <|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.
