In [33]:
from pathlib import Path
import re

In [35]:
# Load the training corpus. The encoding is pinned so decoding does not depend
# on the platform's locale default.
# NOTE(review): assumes text.txt is UTF-8 (plain ASCII qualifies) - confirm.
text = Path('text.txt').read_text(encoding='utf-8')

In [62]:
class Tokenizer:
    """Word-level tokenizer backed by a fixed token-to-id vocabulary.

    Text is split on whitespace, on ``--`` and on the punctuation characters
    ``, . : ; ? _ ! " ( ) '``. Each delimiter other than whitespace becomes a
    token of its own; whitespace-only and empty pieces are discarded.

    Attributes:
        stoi: Mapping from token string to integer id.
        itos: Inverse mapping from integer id to token string.
    """

    # The capturing group makes ``re.split`` keep the delimiters in its result,
    # so punctuation survives as separate tokens.
    _SPLIT_RE = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace directly in front of one of these marks is removed on decode.
    _SPACE_BEFORE_PUNCT_RE = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, stoi: dict[str, int]):
        """Store the vocabulary and build the inverse id-to-token mapping.

        Args:
            stoi: Mapping from token string to integer id.
        """
        self.stoi = stoi
        self.itos = {i: s for s, i in stoi.items()}

    @staticmethod
    def _text_to_tokens(text: str) -> list[str]:
        """Split ``text`` into word and punctuation tokens, dropping blanks."""
        pieces = Tokenizer._SPLIT_RE.split(text)
        # ``re.split`` yields empty strings and whitespace delimiters between
        # matches; neither is a token.
        return [piece for piece in pieces if piece.strip()]

    @classmethod
    def train(cls, text: str, *, verbose: bool = True) -> 'Tokenizer':
        """Build a tokenizer whose vocabulary is the set of tokens in ``text``.

        Ids are assigned by enumerating the sorted unique tokens.

        Args:
            text: Corpus to derive the vocabulary from.
            verbose: When true (the default), print corpus and vocabulary
                statistics while training.

        Returns:
            A new tokenizer holding the derived vocabulary.
        """
        tokens = cls._text_to_tokens(text)
        vocab = sorted(set(tokens))
        if verbose:
            print(f"{len(text)=}")
            print(f"{tokens[:5]=}")
            print(f"{len(tokens)=}")
            print(f"{len(vocab)=}")
        stoi = {s: i for i, s in enumerate(vocab)}
        return cls(stoi)

    def encode(self, text: str) -> list[int]:
        """Convert ``text`` into a list of token ids.

        Args:
            text: Text to tokenize and look up in the vocabulary.

        Returns:
            One id per token, in order of appearance.

        Raises:
            KeyError: If ``text`` contains tokens that are not in the
                vocabulary; the message lists the offending tokens.
        """
        tokens = self._text_to_tokens(text)
        # ``dict.fromkeys`` de-duplicates while keeping first-seen order.
        unknown = list(dict.fromkeys(t for t in tokens if t not in self.stoi))
        if unknown:
            raise KeyError(f"tokens not in vocabulary: {unknown}")
        return [self.stoi[t] for t in tokens]

    def decode(self, token_ids: list[int]) -> str:
        """Convert token ids back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        ``, . : ; ? ! " ( ) '`` is removed so punctuation attaches to the
        preceding token. Original spacing is not otherwise recoverable.

        Args:
            token_ids: Ids previously produced by this vocabulary.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = ' '.join([self.itos[i] for i in token_ids])
        return self._SPACE_BEFORE_PUNCT_RE.sub(r'\1', text)

In [63]:
# Demo: train a tokenizer on the loaded corpus and round-trip a short sentence.
# The `=` specifier in each f-string prints the expression text together with
# its repr, so the variable names below appear verbatim in the cell output.
print(f"{text[:50]=}")
# Training builds the vocabulary from `text` and prints corpus statistics.
tokenizer = Tokenizer.train(text)
new_text = "lazy dog reduce carbon emissions."
print(f"{new_text=}")
# encode() looks each token up in the trained vocabulary.
token_ids = tokenizer.encode(new_text)
print(f"{token_ids=}")
# decode() maps the ids back to tokens and re-attaches punctuation.
decoded = tokenizer.decode(token_ids)
print(f"{decoded=}")

text[:50]='In the vast expanse of space, stars form from clou'
len(text)=947
tokens[:5]=['In', 'the', 'vast', 'expanse', 'of']
len(tokens)=162
len(vocab)=120
new_text='lazy dog reduce carbon emissions.'
token_ids=[65, 39, 90, 23, 42, 1]
decoded='lazy dog reduce carbon emissions.'
