In [81]:
import re

In [82]:
# Small sample corpus of unrelated English sentences; it is passed to
# Tokenizer.train() below to build the word-level vocabulary.
text = """
In the vast expanse of space, stars form from clouds of gas and dust.
Each galaxy holds billions of stars, many of which have planets orbiting them.
Machine learning has revolutionized industries by enabling models to learn from data.
Python is a widely-used programming language known for its simplicity and versatility.
The weather today is sunny with a chance of scattered thunderstorms in the evening.
The quick brown fox jumps over the lazy dog while the cat observes from a distance.
In 2020, remote work became the norm, leading to an increased reliance on digital tools.
Scientists discovered new species in the depths of the ocean, challenging our understanding of marine biology.
Modern architecture blends functionality with sleek, minimalist design principles.
As climate change progresses, efforts to reduce carbon emissions are intensifying.
History shows that civilizations often collapse when resources are mismanaged or depleted.
"""

In [83]:
# Special tokens that are always placed at the end of a trained vocabulary.
EOT_TOKEN = "<|endoftext|>"  # end-of-text marker
UNK_TOKEN = "<|unk|>"        # stand-in for tokens missing from the vocabulary


class Tokenizer:
    """Word-level tokenizer backed by a fixed string-to-id vocabulary.

    Text is split on whitespace and on a small set of punctuation marks;
    every resulting piece is looked up in ``stoi``. Pieces that are not in
    the vocabulary are encoded as ``UNK_TOKEN``.
    """

    def __init__(self, stoi: dict[str, int]):
        """Store the vocabulary and build the reverse (id -> string) mapping.

        Args:
            stoi: Mapping from token string to integer id.
        """
        self.stoi = stoi
        self.itos = {i: s for s, i in stoi.items()}

    @staticmethod
    def _text_to_tokens(text: str) -> list[str]:
        """Split ``text`` into word and punctuation tokens.

        The capturing group makes ``re.split`` keep the punctuation
        delimiters as tokens; whitespace-only and empty pieces are dropped.
        """
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        stripped = (piece.strip() for piece in pieces)
        return [piece for piece in stripped if piece]

    @classmethod
    def train(cls, text: str, verbose: bool = True) -> 'Tokenizer':
        """Build a tokenizer whose vocabulary is the set of tokens in ``text``.

        Regular tokens are sorted and numbered from 0; ``EOT_TOKEN`` and
        ``UNK_TOKEN`` always receive the last two ids.

        Args:
            text: Training corpus.
            verbose: When true (the default), print size statistics.

        Returns:
            A ``Tokenizer`` with contiguous ids ``0 .. len(vocab) - 1``.
        """
        tokens = cls._text_to_tokens(text)
        special_tokens = [EOT_TOKEN, UNK_TOKEN]
        # Drop special tokens that already occur in the corpus: otherwise they
        # would appear twice in `vocab`, and the dict below would silently keep
        # only the last index, leaving holes in the id range.
        vocab = sorted(set(tokens).difference(special_tokens))
        vocab.extend(special_tokens)
        if verbose:
            print(f"{len(text)=}")
            print(f"{tokens[:5]=}")
            print(f"{len(tokens)=}")
            print(f"{special_tokens=}")
            print(f"{len(vocab)=}")
        stoi = {s: i for i, s in enumerate(vocab)}
        return cls(stoi)

    def encode(self, text: str) -> list[int]:
        """Convert ``text`` to a list of token ids.

        Tokens missing from the vocabulary are mapped to the id of
        ``UNK_TOKEN``.

        Raises:
            KeyError: If an unknown token is met and ``UNK_TOKEN`` itself is
                not in the vocabulary.
        """
        tokens = self._text_to_tokens(text)
        return [self.stoi[t if t in self.stoi else UNK_TOKEN] for t in tokens]

    def decode(self, token_ids: list[int]) -> str:
        """Convert token ids back to a string.

        Tokens are joined with single spaces, then whitespace in front of
        punctuation is removed. The punctuation set includes ``:`` and ``;``
        so that it matches the marks ``_text_to_tokens`` splits on. Note the
        space before quotes and parentheses is removed as well, so the
        round trip is not exact for opening quotes/brackets.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        text = ' '.join(self.itos[i] for i in token_ids)
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [85]:
# Demo: train the tokenizer on the sample corpus, then encode and decode a
# sentence that mixes in-vocabulary words, out-of-vocabulary words and the
# end-of-text marker.
print(f"{text[:50]=}")
tokenizer = Tokenizer.train(text)
# "massive", "amount", "childlike", "childish" and "?" do not occur in the
# corpus, so they are expected to come back as <|unk|>.
new_text = "lazy dog reduce massive amount of carbon emissions. <|endoftext|> childlike or childish?"
print(f"{new_text=}")
token_ids = tokenizer.encode(new_text)
print(f"{token_ids=}")
# Decoding is lossy: unknown tokens stay <|unk|> rather than the original words.
decoded = tokenizer.decode(token_ids)
print(f"{decoded=}")

text[:50]='\nIn the vast expanse of space, stars form from clo'
len(text)=948
tokens[:5]=['In', 'the', 'vast', 'expanse', 'of']
len(tokens)=162
special_tokens=['<|endoftext|>', '<|unk|>']
len(vocab)=122
new_text='lazy dog reduce massive amount of carbon emissions. <|endoftext|> childlike or childish?'
token_ids=[65, 39, 90, 121, 121, 78, 23, 42, 1, 120, 121, 81, 121, 121]
decoded='lazy dog reduce <|unk|> <|unk|> of carbon emissions. <|endoftext|> <|unk|> or <|unk|> <|unk|>'
