In [1]:
import re

In [2]:
# Read the whole corpus into memory as one string.
with open("../assets/corpus_01.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

print(f"# of characters including specials: {len(raw_text)}")

# Split on punctuation, double dashes and whitespace. The pattern is wrapped
# in a capture group, so re.split keeps the delimiters in the result too.
preprocessed_text = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
print("# of tokens: ", len(preprocessed_text))

# Discard pieces that are empty or consist only of whitespace.
preprocessed_text = [piece for piece in preprocessed_text if piece.strip() != ""]

print("# of tokens after removing empty strings: ", len(preprocessed_text))

# Unique tokens in sorted order, so the id assignment below is deterministic.
all_words = sorted(set(preprocessed_text))
vocab_size = len(all_words)
print("Vocabulary size: ", vocab_size)

# Map every token to its position in the sorted list.
vocab = dict(zip(all_words, range(vocab_size)))

# Append the two special tokens right after the last regular id.
vocab.update(
    {
        special: vocab_size + offset
        for offset, special in enumerate(("<endoftext>", "<unk>"))
    }
)
vocab_size = len(vocab)
print("Updated vocabulary size: ", vocab_size)

# of characters including specials: 20479
# of tokens:  9235
# of tokens after removing empty strings:  4690
Vocabulary size:  1130
Updated vocabulary size:  1132


In [3]:
import pickle

# Persist the vocabulary mapping to disk so it can be reloaded later.
with open("../assets/vocab.pkl", "wb") as f:
    # Stream the pickle straight into the file handle instead of first
    # materializing it as an intermediate bytes object.
    pickle.dump(vocab, f)

In [4]:
from nanollm.token import RegexTokenizer

# Reload the vocabulary from the pickle file on disk.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
with open("../assets/vocab.pkl", "rb") as f:
    voc = pickle.load(f)

tokenizer = RegexTokenizer(voc)

# Round-trip a short sample through tokenize/detokenize.
encoded = tokenizer.tokenize("this the a is")
print(f"encoded = {encoded}")
decoded = tokenizer.detokenize(encoded)
print(f"decoded = {decoded}")

encoded = [999, 988, 115, 584]
decoded = this the a is


In [7]:
from nanollm.token import BPETokenizer

# Build the tokenizer with its default constructor arguments.
bpe = BPETokenizer()

# Round-trip a sample sentence through tokenize/detokenize, printing each
# stage as soon as it is available.
sample_text = "Hello, world - this is a test!"
t = bpe.tokenize(sample_text)
print(f"encoded = {t}")

d = bpe.detokenize(t)
print(f"decoded = {d}")

encoded = [15496, 11, 995, 532, 428, 318, 257, 1332, 0]
decoded = Hello, world - this is a test!
