In [597]:
import re
import pandas as pd
from collections import Counter

## Transformação de texto em números

![Transformação de texto em números](tokenization.png "Transformação de texto em números")

## Tokenização

In [598]:
# Read the CSV and keep the non-null 'transcription' entries, cast to str.
df = pd.read_csv('mtsamples.csv')
sentences = df['transcription'].dropna().astype(str).values
# Show the first entry.
sentences[0]

'SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without adenopathy.,

### Nível de caractere

In [599]:
class CharacterLevelTokenizer:
    """Tokenizer in which every lower-cased character is one token.

    IDs 0-3 are reserved for the special tokens [PAD], [UNK], [CLS] and [SEP];
    the characters seen by `build_vocab` receive the following IDs in sorted
    order.
    """

    def __init__(self):
        self.token_to_idx = {}
        self.idx_to_token = {}

    def clean_sentence(self, sentence):
        """Return `sentence` converted to a lower-cased string."""
        text = str(sentence)
        return text.lower()

    def build_vocab(self, sentences):
        """Build the vocabulary from an iterable of sentences."""
        alphabet = set()
        for raw in sentences:
            alphabet |= set(self.clean_sentence(raw))

        # Special tokens come first so their IDs are stable across corpora.
        vocab = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3}
        next_id = 4
        for symbol in sorted(alphabet):
            vocab[symbol] = next_id
            next_id += 1

        self.token_to_idx = vocab
        self.idx_to_token = {index: symbol for symbol, index in vocab.items()}

    def tokenize(self, sentence):
        """Turn a sentence into a list of single-character tokens."""
        return [symbol for symbol in self.clean_sentence(sentence)]

    def convert_tokens_to_ids(self, tokens):
        """Map each token to its ID, using the [UNK] ID for unseen tokens."""
        ids = []
        for token in tokens:
            ids.append(self.token_to_idx.get(token, self.token_to_idx['[UNK]']))
        return ids

    def get_vocab_size(self):
        """Return the number of vocabulary entries, special tokens included."""
        return len(self.token_to_idx)

# Build the character-level vocabulary from `sentences` and print its size.
tokenizer = CharacterLevelTokenizer()
tokenizer.build_vocab(sentences)
print(f"\nTamanho do vocabulário: {tokenizer.get_vocab_size()}")


Tamanho do vocabulário: 84


In [600]:
# Tokenize a sample sentence with the current `tokenizer` and print the tokens.
tokens = tokenizer.tokenize("She has a terrible disease")
print(tokens)

['s', 'h', 'e', ' ', 'h', 'a', 's', ' ', 'a', ' ', 't', 'e', 'r', 'r', 'i', 'b', 'l', 'e', ' ', 'd', 'i', 's', 'e', 'a', 's', 'e']


In [601]:
class MostFrequentWordsTokenizer:
    """Word-level tokenizer whose vocabulary holds only the most frequent words.

    Text is lower-cased, every character other than a-z, 0-9 and whitespace is
    removed, and the result is split on whitespace. IDs 0-3 are reserved for
    the special tokens [PAD], [UNK], [CLS] and [SEP]; words outside the
    vocabulary map to the [UNK] ID.
    """

    def __init__(self, vocab_size):
        """Store the vocabulary budget.

        Args:
            vocab_size: Maximum number of vocabulary entries, counting the
                four special tokens.
        """
        self.vocab_size = vocab_size
        self.token_to_idx = {}
        self.idx_to_token = {}

    def clean_sentence(self, sentence):
        """Lower-case `sentence` and strip everything but a-z, 0-9 and whitespace.

        The input is coerced with `str()` so that non-string values (e.g. NaN
        or numbers coming from a DataFrame column) do not raise.
        """
        sentence = str(sentence).lower()
        return re.sub(r'[^a-z0-9\s]', '', sentence)

    def build_vocab(self, sentences):
        """Build the vocabulary from an iterable of sentences.

        Keeps the `vocab_size - 4` most frequent words and assigns them IDs
        starting at 4, in decreasing order of frequency.
        """
        # Count incrementally instead of materialising one list with every
        # word of the corpus; first-seen order (used for ties) is unchanged.
        word_counts = Counter()
        for sentence in sentences:
            word_counts.update(self.tokenize(sentence))

        most_common_words = word_counts.most_common(self.vocab_size - 4)

        # Special tokens come first so their IDs are stable across corpora.
        self.token_to_idx = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3}
        for i, (word, _) in enumerate(most_common_words, 4):
            self.token_to_idx[word] = i

        self.idx_to_token = {i: w for w, i in self.token_to_idx.items()}

    def tokenize(self, sentence):
        """Return the list of cleaned, whitespace-separated words of `sentence`."""
        return self.clean_sentence(sentence).split()

    def convert_tokens_to_ids(self, tokens):
        """Map each token to its ID, using the [UNK] ID for unseen tokens."""
        return [self.token_to_idx.get(token, self.token_to_idx['[UNK]']) for token in tokens]

    def get_vocab_size(self):
        """Return the number of vocabulary entries, special tokens included."""
        return len(self.token_to_idx)
    
# Rebind `tokenizer` to a word-level tokenizer with at most 256 vocabulary
# entries (the budget includes the four special tokens) built from `sentences`.
tokenizer = MostFrequentWordsTokenizer(vocab_size=256)
tokenizer.build_vocab(sentences)

In [602]:
# Show the tokens produced by the current `tokenizer` for a sample sentence.
tokenizer.tokenize("She has a terrible disease")

['she', 'has', 'a', 'terrible', 'disease']

### Nível de sub-palavras

In [603]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

class BytePairEncodingTokenizer:
    """Sub-word tokenizer that wraps a `tokenizers` BPE model.

    Input is pre-tokenized with `Whitespace()` and the BPE model is trained
    with the special tokens [PAD], [UNK], [CLS] and [SEP].
    """

    # Token used both as the BPE model's unknown token and as the fallback in
    # `convert_tokens_to_ids`.
    UNK_TOKEN = "[UNK]"

    def __init__(self, vocab_size):
        """Create an untrained BPE tokenizer.

        Args:
            vocab_size: Vocabulary size passed to the BPE trainer.
        """
        self.vocab_size = vocab_size
        self.tokenizer = Tokenizer(BPE(unk_token=self.UNK_TOKEN))
        self.tokenizer.pre_tokenizer = Whitespace()
        self.special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"] # Special tokens

    def build_vocab(self, sentences):
        """Train the BPE model on an iterable of sentences."""
        trainer = BpeTrainer(vocab_size=self.vocab_size, special_tokens=self.special_tokens)
        self.tokenizer.train_from_iterator(sentences, trainer=trainer)

    def tokenize(self, text):
        """Encode `text` and return its list of sub-word tokens."""
        return self.tokenizer.encode(text).tokens

    def convert_tokens_to_ids(self, tokens):
        """Map each token to its ID, using the [UNK] ID for unknown tokens.

        `token_to_id` returns None for tokens missing from the vocabulary;
        falling back to the [UNK] ID keeps the result a list of integers that
        can be turned into a tensor, matching the other tokenizers in this
        notebook. If [UNK] itself is absent (untrained model), unknown tokens
        still map to None.
        """
        unk_id = self.tokenizer.token_to_id(self.UNK_TOKEN)
        ids = []
        for token in tokens:
            token_id = self.tokenizer.token_to_id(token)
            ids.append(unk_id if token_id is None else token_id)
        return ids

    def get_vocab_size(self):
        """Return the vocabulary size reported by the underlying tokenizer."""
        return self.tokenizer.get_vocab_size()
    
# Rebind `tokenizer` to a BPE tokenizer configured with vocab_size=2048 and
# train it on `sentences`.
tokenizer = BytePairEncodingTokenizer(vocab_size=2048)
tokenizer.build_vocab(sentences)







In [604]:
# Show the tokens produced by the current `tokenizer` for a sample sentence.
tokenizer.tokenize("She has a terrible disease")

['She', 'has', 'a', 'ter', 'r', 'ib', 'le', 'disease']

## Conversão para IDs

In [605]:
# Tokenize a sample sentence, then print the integer ID of each token.
tokens = tokenizer.tokenize("She has a terrible disease")
print(tokenizer.convert_tokens_to_ids(tokens))

[229, 218, 65, 157, 82, 274, 129, 590]


## Embeddings

In [606]:
import torch
import torch.nn as nn

VOCAB_SIZE = 256      # vocabulary budget shared by the tokenizer and the embedding table
EMBEDDING_DIM = 768   # length of each embedding vector

# Drive the tokenizer from VOCAB_SIZE instead of repeating the literal, so the
# tokenizer and the embedding table cannot drift apart when the constant changes.
tokenizer = MostFrequentWordsTokenizer(vocab_size=VOCAB_SIZE)
tokenizer.build_vocab(sentences)

# Every ID the tokenizer can emit must be a valid row index of the embedding
# table; otherwise the lookup raises an out-of-range error.
assert tokenizer.get_vocab_size() <= VOCAB_SIZE, "tokenizer vocabulary exceeds embedding table"

embedding_layer = nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM)

In [608]:
# Tokenize a sample sentence, map the tokens to IDs and look up the embedding
# row for each ID.
tokens = tokenizer.tokenize("She is sick")
ids = tokenizer.convert_tokens_to_ids(tokens)
embeddings = embedding_layer(torch.tensor(ids))

# Print the shape of the resulting tensor.
print(f'Dimensões dos embeddings: {embeddings.shape}\n')

Dimensões dos embeddings: torch.Size([3, 768])



In [609]:
# Display the `embeddings` tensor.
embeddings

tensor([[-0.5989,  0.6897, -1.2648,  ...,  0.4409,  1.7971,  1.6017],
        [ 1.7124,  1.3147, -1.6451,  ...,  0.4380,  0.8684,  2.6722],
        [-2.4460, -0.1759, -0.8460,  ...,  0.1046, -1.8075,  2.0034]],
       grad_fn=<EmbeddingBackward0>)