In [333]:
import re
import pandas as pd
from collections import Counter

In [334]:
# Read the dataset from 'mtsamples.csv' and keep the 'transcription' column:
# missing entries are dropped and the remaining values are cast to str.
df = pd.read_csv('mtsamples.csv')
sentences = df['transcription'].dropna().astype(str).values

In [335]:
class CharacterLevelTokenizer:
    """Tokenizer in which every (lowercased) character is one token.

    IDs 0-3 are reserved for the special tokens '[PAD]', '[UNK]', '[CLS]'
    and '[SEP]'; the characters seen by ``build_vocab`` get IDs from 4 on,
    in sorted order.
    """

    def __init__(self):
        # Both lookup tables stay empty until build_vocab() is called.
        self.token_to_idx = {}
        self.idx_to_token = {}

    def clean_sentence(self, sentence):
        """Return ``sentence`` converted to ``str`` and lowercased."""
        text = str(sentence)
        return text.lower()

    def build_vocab(self, sentences):
        """Build the vocabulary from an iterable of sentences.

        Replaces ``token_to_idx`` and ``idx_to_token`` with fresh mappings
        holding the four special tokens plus every distinct character found
        in the cleaned sentences.
        """
        alphabet = set()
        for raw in sentences:
            alphabet |= set(self.clean_sentence(raw))

        vocab = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3}  # Special tokens
        first_free_id = len(vocab)
        for offset, symbol in enumerate(sorted(alphabet)):
            vocab[symbol] = first_free_id + offset

        self.token_to_idx = vocab
        self.idx_to_token = {idx: symbol for symbol, idx in vocab.items()}

    def tokenize(self, sentence):
        """Split a sentence into a list of its cleaned characters (tokens)."""
        return [symbol for symbol in self.clean_sentence(sentence)]

    def convert_tokens_to_ids(self, tokens):
        """Map a list of tokens (characters) to their IDs.

        Tokens missing from the vocabulary are mapped to the '[UNK]' ID.
        """
        lookup = self.token_to_idx
        ids = []
        for token in tokens:
            # The '[UNK]' lookup is done per token on purpose: an empty
            # token list must not touch the vocabulary at all.
            ids.append(lookup.get(token, lookup['[UNK]']))
        return ids

    def get_vocab_size(self):
        """Return the number of entries in the vocabulary (specials included)."""
        return len(self.token_to_idx)

# Build the character-level vocabulary from the corpus and print its size.
tokenizer = CharacterLevelTokenizer()
tokenizer.build_vocab(sentences)
print(f"\nTamanho do vocabulário: {tokenizer.get_vocab_size()}")


Tamanho do vocabulário: 84


In [336]:
# Tokenize a sample sentence with the tokenizer currently bound to `tokenizer`.
tokens = tokenizer.tokenize("She has a terrible disease")
print(tokens)

['s', 'h', 'e', ' ', 'h', 'a', 's', ' ', 'a', ' ', 't', 'e', 'r', 'r', 'i', 'b', 'l', 'e', ' ', 'd', 'i', 's', 'e', 'a', 's', 'e']


In [337]:
# Show the vocabulary IDs of the sample tokens.
print(tokenizer.convert_tokens_to_ids(tokens))

[58, 47, 44, 4, 47, 40, 58, 4, 40, 4, 59, 44, 57, 57, 48, 41, 51, 44, 4, 43, 48, 58, 44, 40, 58, 44]


In [338]:
class MostFrequentWordsTokenizer:
    """Word-level tokenizer whose vocabulary is the corpus' most frequent words.

    IDs 0-3 are reserved for the special tokens '[PAD]', '[UNK]', '[CLS]'
    and '[SEP]'. The remaining ``vocab_size - 4`` slots are given to the most
    frequent cleaned words, from ID 4 on, in decreasing order of frequency.
    """

    def __init__(self, vocab_size):
        """Store the target vocabulary size (special tokens included)."""
        self.vocab_size = vocab_size
        # Both lookup tables stay empty until build_vocab() is called.
        self.token_to_idx = {}
        self.idx_to_token = {}

    def clean_sentence(self, sentence):
        """Lowercase ``sentence`` and drop everything except a-z, 0-9 and whitespace.

        The input is coerced with ``str()`` first, so non-string values
        (numbers, ``None``, NaN) are cleaned instead of raising
        ``AttributeError``.
        """
        sentence = str(sentence).lower()
        return re.sub(r'[^a-z0-9\s]', '', sentence)

    def build_vocab(self, sentences):
        """Build the vocabulary from an iterable of sentences.

        Replaces ``token_to_idx`` and ``idx_to_token`` with fresh mappings
        holding the four special tokens plus the most frequent words.
        """
        # Count sentence by sentence rather than materializing every word of
        # the corpus in one list; counts and first-seen order are unchanged.
        word_counts = Counter()
        for sentence in sentences:
            word_counts.update(self.clean_sentence(sentence).split())

        # Four slots are reserved for the special tokens; never ask for a
        # negative number of words when vocab_size is smaller than that.
        n_words = max(self.vocab_size - 4, 0)
        most_common_words = word_counts.most_common(n_words)

        self.token_to_idx = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3}  # Special tokens
        for i, (word, _) in enumerate(most_common_words, 4):
            self.token_to_idx[word] = i

        self.idx_to_token = {i: w for w, i in self.token_to_idx.items()}

    def tokenize(self, sentence):
        """Return the whitespace-separated words of the cleaned sentence."""
        return self.clean_sentence(sentence).split()

    def convert_tokens_to_ids(self, tokens):
        """Map a list of tokens (words) to their IDs.

        Words missing from the vocabulary are mapped to the '[UNK]' ID.
        """
        return [self.token_to_idx.get(token, self.token_to_idx['[UNK]']) for token in tokens]

    def get_vocab_size(self):
        """Return the number of entries in the vocabulary (specials included)."""
        return len(self.token_to_idx)
    
# Rebind `tokenizer` to a word-level tokenizer and build its vocabulary.
# vocab_size=256 counts the 4 special tokens, leaving 252 slots for words.
tokenizer = MostFrequentWordsTokenizer(vocab_size=256)
tokenizer.build_vocab(sentences)
print(f"\nTamanho do vocabulário: {tokenizer.get_vocab_size()}")


Tamanho do vocabulário: 256


In [339]:
# Tokenize the sample sentence with the tokenizer currently bound to `tokenizer`.
tokens = tokenizer.tokenize("She has a terrible disease")
tokens

['she', 'has', 'a', 'terrible', 'disease']

In [340]:
# Show the vocabulary IDs of the sample tokens.
tokenizer.convert_tokens_to_ids(tokens)

[15, 24, 9, 1, 112]

In [359]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

class BytePairEncodingTokenizer:
    """Thin wrapper around a `tokenizers` BPE model.

    It exposes the same methods as the other tokenizers in this notebook:
    ``build_vocab``, ``tokenize``, ``convert_tokens_to_ids`` and
    ``get_vocab_size``.
    """

    # Token the BPE model emits for unknown input; also the fallback used by
    # convert_tokens_to_ids for tokens that are not in the vocabulary.
    UNK_TOKEN = "[UNK]"

    def __init__(self, vocab_size):
        """Create an untrained BPE tokenizer with a whitespace pre-tokenizer.

        ``vocab_size`` is passed to the trainer as the target vocabulary size.
        """
        self.vocab_size = vocab_size
        self.tokenizer = Tokenizer(BPE(unk_token=self.UNK_TOKEN))
        self.tokenizer.pre_tokenizer = Whitespace()
        self.special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"]  # Special tokens

    def build_vocab(self, sentences):
        """Train the BPE model on an iterable of sentences."""
        trainer = BpeTrainer(vocab_size=self.vocab_size, special_tokens=self.special_tokens)
        self.tokenizer.train_from_iterator(sentences, trainer=trainer)

    def tokenize(self, text):
        """Encode ``text`` and return the resulting list of token strings."""
        return self.tokenizer.encode(text).tokens

    def convert_tokens_to_ids(self, tokens):
        """Map a list of tokens to their IDs.

        ``token_to_id`` returns ``None`` for a token that is not in the
        vocabulary. Such tokens are mapped to the '[UNK]' ID instead, matching
        the other tokenizers, so the result never contains ``None`` once the
        model is trained.
        """
        unk_id = self.tokenizer.token_to_id(self.UNK_TOKEN)
        ids = []
        for token in tokens:
            token_id = self.tokenizer.token_to_id(token)
            ids.append(unk_id if token_id is None else token_id)
        return ids

    def get_vocab_size(self):
        """Return the vocabulary size reported by the underlying tokenizer."""
        return self.tokenizer.get_vocab_size()
    
# Rebind `tokenizer` to a BPE tokenizer, train it on the corpus with a
# target vocabulary size of 2048, and print the resulting size.
tokenizer = BytePairEncodingTokenizer(vocab_size=2048)
tokenizer.build_vocab(sentences)
print(f"\nTamanho do vocabulário: {tokenizer.get_vocab_size()}")





Tamanho do vocabulário: 2048


In [360]:
# Tokenize the sample sentence with the tokenizer currently bound to `tokenizer`.
tokens = tokenizer.tokenize("She has a terrible disease")
tokens

['She', 'has', 'a', 'ter', 'r', 'ib', 'le', 'disease']