In [43]:
import re

In [44]:
# Read the entire text file into one string; the context manager closes the
# file handle as soon as the read finishes.
with open("data/wizard-of-oz.txt", encoding="utf-8") as f:
    data = f.read()

In [45]:
# Split the corpus on punctuation, double dashes and whitespace. The capture
# group makes re.split keep the delimiters, so punctuation becomes a token.
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', data)
# Discard empty strings and whitespace-only pieces produced by the split.
preprocessed = [item for item in preprocessed if item.strip()]

# Unique tokens in sorted order, followed by the two special tokens so that
# they receive the highest ids.
all_words = sorted(set(preprocessed))
all_words.extend(["<|endoftext|>", "<|unk|>"])
vocab_size = len(all_words)
vocab = {token: integer for integer, token in enumerate(all_words)}

# Show a bounded preview instead of dumping the whole mapping into the cell
# output: the size plus the last few entries (which include the special tokens).
print("vocab_size:", vocab_size)
print(list(vocab.items())[-5:])



In [46]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on whitespace, the double dash ``--`` and a fixed set of
    punctuation characters. Every resulting token must be a key of the
    vocabulary; unknown tokens are not handled.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (id -> token) lookup.

        Args:
            vocab: Mapping from token string to integer id.
        """
        self.str_to_int = vocab
        self.int_to_str = {integer: string for string, integer in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: String to tokenize.

        Returns:
            List of integer ids, one per token, in order of appearance.

        Raises:
            KeyError: If a token is not present in the vocabulary.
        """
        # The capture group keeps the delimiters so punctuation is tokenized too.
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Drop empty strings and whitespace-only pieces.
        tokens = [piece for piece in pieces if piece.strip()]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Convert a sequence of token ids back into a string.

        Args:
            ids: Iterable of integer ids.

        Returns:
            The tokens joined by single spaces, with the space in front of
            punctuation removed.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Remove the space that the join placed before punctuation. ':' and
        # ';' are included because encode() splits on them as well; without
        # them a round trip would yield "word : word".
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)

In [47]:
# Build a tokenizer from the vocabulary created above.
tokenizer = SimpleTokenizerV1(vocab)

# Sample passage for the round trip. SimpleTokenizerV1.encode raises KeyError
# for any token that is missing from the vocabulary.
text = """Princess Ozma, whom I love as much as my readers do, is again introduced
in this story, and so are several of our old friends of Oz. You will
also become acquainted with Jim the Cab-Horse, the Nine Tiny Piglets,
and Eureka, the Kitten."""

# Round trip: text -> token ids -> text. The ids are printed and the decoded
# string is shown as the cell's result.
ids = tokenizer.encode(text)
print(ids)
tokenizer.decode(ids)

[633, 589, 11, 5078, 409, 3095, 1088, 3264, 1088, 3276, 3833, 1967, 11, 2877, 962, 2862, 2799, 4689, 4497, 11, 1014, 4320, 1066, 4164, 3390, 3446, 3405, 2430, 3390, 586, 14, 873, 5093, 989, 1212, 921, 5115, 448, 4672, 186, 11, 4672, 543, 785, 622, 11, 1014, 290, 11, 4672, 466, 14]


'Princess Ozma, whom I love as much as my readers do, is again introduced in this story, and so are several of our old friends of Oz. You will also become acquainted with Jim the Cab-Horse, the Nine Tiny Piglets, and Eureka, the Kitten.'

In [48]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Text is split on whitespace, the double dash ``--`` and a fixed set of
    punctuation characters. Tokens that are not keys of the vocabulary are
    replaced by the ``<|unk|>`` token before the id lookup.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (id -> token) lookup.

        Args:
            vocab: Mapping from token string to integer id. It needs a
                ``"<|unk|>"`` entry for unknown tokens to be encodable.
        """
        self.str_to_int = vocab
        self.int_to_str = {integer: string for string, integer in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: String to tokenize.

        Returns:
            List of integer ids, one per token; tokens missing from the
            vocabulary yield the id of ``"<|unk|>"``.

        Raises:
            KeyError: If an unknown token is met and the vocabulary has no
                ``"<|unk|>"`` entry.
        """
        # The capture group keeps the delimiters so punctuation is tokenized too.
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Drop empty strings and whitespace-only pieces.
        tokens = [piece for piece in pieces if piece.strip()]
        # The fallback key is looked up only when it is actually needed, so a
        # vocabulary without "<|unk|>" still works for fully known text.
        return [
            self.str_to_int[token if token in self.str_to_int else "<|unk|>"]
            for token in tokens
        ]

    def decode(self, ids):
        """Convert a sequence of token ids back into a string.

        Args:
            ids: Iterable of integer ids.

        Returns:
            The tokens joined by single spaces, with the space in front of
            punctuation removed.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Remove the space that the join placed before punctuation. ':' and
        # ';' are included because encode() splits on them as well; without
        # them a round trip would yield "word : word".
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)

In [51]:
# Rebind `tokenizer` to the variant that substitutes <|unk|> for tokens that
# are missing from the vocabulary.
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlight terraces of the palace."

# Join the two sentences with the <|endoftext|> special token as separator.
text = " <|endoftext|> ".join((text1, text2))
print(text)

# Round trip: any word that is not in the vocabulary comes back as <|unk|>.
tokenizer.decode(tokenizer.encode(text))

Hello, do you like tea? <|endoftext|> In the sunlight terraces of the palace.


'Hello, do you like <|unk|>? <|endoftext|> In the sunlight <|unk|> of the palace.'