In [50]:
import re
import pandas as pd
from collections import Counter

In [51]:
# Requested tokenizer vocabulary size; this budget includes the reserved special tokens.
VOCAB_SIZE = 256

In [52]:
# Load the raw table, then keep only the non-missing transcription texts as strings.
df = pd.read_csv('mtsamples.csv')
sentences = (
    df['transcription']
    .dropna()
    .astype(str)
    .values
)

In [53]:
class MostFrequentWordsTokenizer:
    """Whitespace word tokenizer backed by a frequency-ranked vocabulary.

    The vocabulary holds the reserved tokens listed in ``SPECIAL_TOKENS``
    (ids 0-3) followed by the most frequent words of the corpus passed to
    ``build_vocab``. When ``vocab_size >= 4`` the vocabulary never exceeds
    ``vocab_size`` entries; for smaller values it still contains the four
    reserved tokens.
    """

    # Reserved tokens; the position of each token in this tuple is its id.
    SPECIAL_TOKENS = ('[PAD]', '[UNK]', '[CLS]', '[SEP]')

    def __init__(self, vocab_size):
        """Create a tokenizer with an empty vocabulary.

        Args:
            vocab_size: Requested vocabulary size, reserved tokens included.
        """
        self.vocab_size = vocab_size
        self.token_to_idx = {}
        self.idx_to_token = {}

    def clean_sentence(self, sentence):
        """Lowercase ``sentence`` and keep only ``a-z``, ``0-9`` and whitespace.

        Apostrophes are deleted so contractions and possessives stay a single
        word ("patient's" -> "patients"). Every other disallowed character is
        replaced by a space instead of being deleted, so words separated only
        by punctuation ("fever,cough") are not fused into one token.

        Args:
            sentence: Text to normalise (must be a ``str``).

        Returns:
            The normalised text; it may contain runs of several spaces.
        """
        sentence = sentence.lower()
        sentence = re.sub(r"['\u2019]", '', sentence)
        return re.sub(r'[^a-z0-9\s]', ' ', sentence)

    def build_vocab(self, sentences):
        """Rebuild the vocabulary from an iterable of sentences.

        Any previously built vocabulary is discarded. Words are ranked by
        frequency; ``Counter.most_common`` orders equally frequent words by
        first appearance.

        Args:
            sentences: Iterable of texts; each item is coerced with ``str``.
        """
        # Count sentence by sentence so the corpus is never materialised as
        # one flat list of words.
        word_counts = Counter()
        for sentence in sentences:
            word_counts.update(self.tokenize(sentence))

        n_special = len(self.SPECIAL_TOKENS)
        # Clamp at zero so a vocab_size below the number of reserved tokens
        # simply selects no corpus words.
        most_common_words = word_counts.most_common(max(self.vocab_size - n_special, 0))

        # Special tokens
        self.token_to_idx = {token: i for i, token in enumerate(self.SPECIAL_TOKENS)}
        for i, (word, _) in enumerate(most_common_words, n_special):
            self.token_to_idx[word] = i

        self.idx_to_token = {i: w for w, i in self.token_to_idx.items()}

    def tokenize(self, sentence):
        """Return the list of cleaned, whitespace-separated words of ``sentence``.

        Args:
            sentence: Text to tokenize; coerced with ``str`` before cleaning.
        """
        cleaned_sentence = self.clean_sentence(str(sentence))
        return cleaned_sentence.split()

    def convert_tokens_to_ids(self, tokens):
        """Map tokens to vocabulary ids, using the ``[UNK]`` id for unknown tokens.

        Raises:
            KeyError: If ``tokens`` is non-empty and ``build_vocab`` has not
                been called yet (``[UNK]`` is not registered).
        """
        return [self.token_to_idx.get(token, self.token_to_idx['[UNK]']) for token in tokens]

    def get_vocab_size(self):
        """Return the number of entries in the built vocabulary (0 before ``build_vocab``)."""
        return len(self.token_to_idx)
    
# Fit the tokenizer on the loaded transcriptions and report the resulting vocabulary size.
tokenizer = MostFrequentWordsTokenizer(VOCAB_SIZE)
tokenizer.build_vocab(sentences)
print(f"\nTamanho do vocabulário: {tokenizer.get_vocab_size()}")


Tamanho do vocabulário: 256


In [54]:
import torch
import torch.nn as nn

# Width of each token embedding vector.
EMBEDDING_DIM = 768

# Size the lookup table from the vocabulary that was actually built rather than
# from the requested VOCAB_SIZE: the tokenizer emits ids in
# [0, tokenizer.get_vocab_size()), and the built vocabulary can be smaller than
# requested (few distinct words in the corpus) or larger (a requested size below
# the number of reserved tokens).
embedding_layer = nn.Embedding(
    num_embeddings=tokenizer.get_vocab_size(),
    embedding_dim=EMBEDDING_DIM,
)

In [55]:
# Tokenize a sample sentence: the text is lowercased, punctuation is dropped and the result is split on whitespace.
tokens = tokenizer.tokenize("She is sick")
tokens

['she', 'is', 'sick']

In [56]:
# Map each token to its vocabulary id; tokens that are not in the vocabulary get the [UNK] id.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[15, 12, 1]

In [57]:
# Look up the embedding of every id; the result has one row per token and EMBEDDING_DIM columns.
embedding_layer(torch.tensor(ids)).shape

torch.Size([3, 768])