# Notebook Processo Seletivo Aluno Especial 2S2025 FEEC-UNICAMP
- versão 13 de julho de 2025 - dataset IMDB from hugging face

In [None]:
pip install -U datasets



In [None]:
from datasets import load_dataset

imdb_dic = load_dataset("stanfordnlp/imdb", token=False)

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
imdb_dic

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import random_split

## I - Vocabulário e Tokenização

In [None]:
import re
teste = "I like Pizza ."

In [None]:
# prompt: compute word_counts, the number of distinct words in the training dataset

def pre_process(text):
    # This can be improved
    # return text.split()
    # alterado para remover pontuações e nao considerar diferenças entre letras maiúsculas e minúsculas
    return re.sub(r'[^\w\s]', '', text).lower().split()

all_words = []
for text in imdb_dic['train']['text']:
    all_words.extend(pre_process(text))

word_counts = Counter(all_words)

print(f"Number of distinct words: {len(word_counts)}")

Number of distinct words: 121045


In [None]:
pre_process(teste)

['i', 'like', 'pizza']

In [None]:
# limit the vocabulary size to 20000 most frequent tokens
vocab_size_max = 20000

# create a vocabulary of the 20000 most frequent tokens
most_frequent_words = sorted(word_counts, key=word_counts.get, reverse=True)[:vocab_size_max]
vocab = {word: i for i, word in enumerate(most_frequent_words, 1)} # words indexed from 1 to 20000
vocab_size = len(vocab)
vocab_size

20000

In [None]:
def tokenizer(sentence, vocab):
    # incluindo pre_process para tokenizar
    tokens = pre_process(sentence)
    return [vocab.get(word, 0) for word in tokens]  # 0 para OOV

tokenizer("I like Pizza .", vocab)

[9, 38, 7923]

## II - Dataset

In [None]:
# Dataset Class with Bag of Words
# class alterada para se tornar mais eficiente:
# loop de one-hot encoding removido do __getitem__ e movido para __init__
# evita que o loop seja executado a cada chamada de __getitem__ (ou seja, a cada batch)
# o que estava tornando o loop de treinamento lento e impedindo melhor otimização de tempo ao usar a GPU
# tambem é importante criar as saidas (X e labels) como tensores do pytorch para melhorar a eficiencia, pois o pytorch é otimizado para trabalhar com tensores
class IMDBDataset(Dataset):
    def __init__(self, split, vocab):
        # le labels
        self.labels = torch.tensor(imdb_dic[split]['label'])
        # le texts
        texts = imdb_dic[split]['text']
        # cria um tensor de zeros com o tamanho do vocabulário
        self.X = torch.zeros((len(texts), len(vocab)+1), dtype=torch.float32)
        # atribui 1 para cada palavra no texto se ela estiver no vocabulário
        for i, line in enumerate(texts):
            for word in tokenizer(line, vocab):
                self.X[i, word] = 1

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.X[idx], self.labels[idx]

In [None]:
# Load Data with Bag of Words
full_train_data = IMDBDataset('train', vocab)
test_data  = IMDBDataset('test',  vocab)

# dividindo test em test (80%) e val (20%)
train_size = int(0.8 * len(full_train_data))
val_size = len(full_train_data) - train_size
train_data, val_data = random_split(full_train_data, [train_size, val_size], generator=torch.Generator().manual_seed(42))

## III - Data Loader

In [None]:
batch_size = 128
# define dataloaders
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_data, batch_size=batch_size,  shuffle=False)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

## IV - Modelo

In [None]:
class OneHotMLP(nn.Module):
    def __init__(self, vocab_size):
        super(OneHotMLP, self).__init__()

        self.fc1 = nn.Linear(vocab_size+1, 200)
        self.fc2 = nn.Linear(200, 1)

        self.relu = nn.ReLU()

    def forward(self, x):
        o = self.fc1(x.float())
        o = self.relu(o)
        return self.fc2(o)

# Model instantiation
model = OneHotMLP(vocab_size)

## V - Laço de Treinamento - Otimização da função de Perda pelo Gradiente descendente

In [None]:
# Verifica se há uma GPU disponível e define o dispositivo para GPU se possível, caso contrário, usa a CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print('GPU:', torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print('using CPU')

GPU: Tesla T4


In [None]:
import time

model = model.to(device)
# Define loss and optimizer
criterion = nn.BCEWithLogitsLoss()
# learning rate alterado para 0.1 (maior LR garante maior convergência e aprendizado)
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Training loop
# Alterado para corrigir cálculo da loss e loop de validação
num_epochs = 5
for epoch in range(num_epochs):
    start_time = time.time()  # Start time of the epoch

    # Treinamento
    model.train()
    total_loss = 0.0
    total_samples = 0
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels.float())
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # adicionando cálculo do tamanho do batch para corrigir o cálculo da Loss
        batch_size = labels.size(0)
        total_loss += loss.item() * batch_size
        total_samples += batch_size
    # corrigindo o cálculo da Loss, considerando média por época
    train_loss = total_loss / total_samples
    epoch_duration = time.time() - start_time  # Duration of epoch

    # Validação
    model.eval()
    val_loss = 0.0
    val_samples = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), labels.float())
            batch_size = labels.size(0)
            val_loss += loss.item() * batch_size
            val_samples += batch_size
    val_loss = val_loss / val_samples

    print(f'Epoch [{epoch+1}/{num_epochs}], \
            Train Loss: {train_loss:.4f}, \
            Val Loss: {val_loss:.4f}, \
            Elapsed Time: {epoch_duration:.2f} sec')

Epoch [1/5],             Train Loss: 0.5488,             Val Loss: 0.4388,             Elapsed Time: 2.15 sec
Epoch [2/5],             Train Loss: 0.3490,             Val Loss: 0.3925,             Elapsed Time: 1.00 sec
Epoch [3/5],             Train Loss: 0.3116,             Val Loss: 0.3105,             Elapsed Time: 1.03 sec
Epoch [4/5],             Train Loss: 0.2693,             Val Loss: 0.3978,             Elapsed Time: 1.05 sec
Epoch [5/5],             Train Loss: 0.2443,             Val Loss: 0.3025,             Elapsed Time: 1.05 sec


## VI - Avaliação

In [None]:
## evaluation
model.eval()

with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        predicted = torch.round(torch.sigmoid(outputs.squeeze()))
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f'Test Accuracy: {100 * correct / total}%')

Test Accuracy: 86.772%
