# Processo Seletivo Engenheiro de LLM
versão 19 de janeiro de 2025

### Nome: Nilton Seixas

### E-mail: nfsseixas@gmail.com

## Instalação e importação de pacotes

In [5]:
!pip install datasets -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [6]:
import torch
import random
from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset
from collections import Counter
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
import numpy as np

In [7]:
import nltk
# Download the "punkt" tokenizer data; word_tokenize (used to build the vocabulary below)
# presumably depends on it — confirm against the installed NLTK version.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
from nltk.tokenize import word_tokenize

In [9]:
from sklearn.decomposition import TruncatedSVD

In [10]:
# Also download "punkt_tab". NOTE(review): presumably the resource that newer NLTK releases
# look up for word_tokenize — confirm whether both downloads are still needed.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## I - Vocabulário e Tokenização

### Exemplo do dataset

In [11]:
# Build the vocabulary from the IMDB training split.
vocab_size = 20000
train_dataset = load_dataset("stanfordnlp/imdb", split="train")

# Count lower-cased tokens over every training review.
counter = Counter()
for sample in train_dataset:
    tokens = word_tokenize(sample["text"].lower())
    counter.update(tokens)

# Keep the `vocab_size` most frequent tokens, most frequent first.
most_frequent_words = [word for word, _ in counter.most_common(vocab_size)]
# Ids start at 1; id 0 is left free for tokens that are not in the vocabulary.
vocab = {word: rank for rank, word in enumerate(most_frequent_words, start=1)}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [8]:
# Number of examples in the training split.
len(train_dataset)

25000

In [12]:
def encode_sentence(sentence, vocab):
    """Convert a sentence into a list of vocabulary ids.

    The sentence is lower-cased and split with ``word_tokenize``, so punctuation
    marks are kept as tokens of their own.

    Args:
        sentence: Text to encode.
        vocab: Mapping from token to integer id.

    Returns:
        One id per token, in order; tokens missing from ``vocab`` map to 0.
    """
    tokens = word_tokenize(sentence.lower())
    token_ids = []
    for token in tokens:
        token_ids.append(vocab.get(token, 0))
    return token_ids

# Quick check: one id per token of the lower-cased sentence (an id of 0 would mark an out-of-vocabulary token).
encode_sentence("I like Pizza.", vocab)

[15, 49, 7985, 3]

## II - Dataset

In [13]:
# Dataset class: binary bag-of-words ("multi-hot") vectors reduced with TruncatedSVD.
class IMDBDataset(Dataset):
    """IMDB split exposed as dense, low-dimensional feature vectors plus labels.

    Each review is encoded with ``encode_sentence`` and turned into a binary
    vector of length ``len(vocab) + 1`` that has a 1 at every id occurring in
    the review (column 0 collects out-of-vocabulary tokens). The resulting
    matrix is then projected with a ``TruncatedSVD``.

    Args:
        split: Name of the dataset split passed to ``load_dataset``.
        vocab: Mapping from token to integer id.
        svd: Already fitted ``TruncatedSVD`` to reuse. When ``None`` a new one
            is fitted on this split and stored in ``self.svd``.
        n_components: Number of SVD components for a newly fitted SVD.
            Ignored when ``svd`` is given.
        random_state: Seed forwarded to a newly created ``TruncatedSVD``;
            ``None`` keeps the unseeded behaviour.

    Attributes:
        X_data: float16 tensor holding one row of SVD features per review.
        Y_data: float32 tensor of the samples' ``"label"`` values.
    """

    def __init__(self, split, vocab, svd=None, n_components=10, random_state=None):
        self.data = load_dataset("stanfordnlp/imdb", split=split)
        self.vocab = vocab
        self.svd = svd

        # Preallocate the whole matrix once instead of collecting one dense
        # row per review in a Python list that sklearn would have to copy.
        num_features = len(self.vocab) + 1
        features = np.zeros((len(self.data), num_features), dtype=np.float32)
        labels = []
        for row, sample in enumerate(self.data):
            token_ids = encode_sentence(sample["text"], self.vocab)
            if token_ids:
                # Fancy-index assignment marks every occurring id in one step.
                features[row, token_ids] = 1
            labels.append(sample["label"])

        # Fit the projection only when none was supplied (i.e. on the training
        # split); otherwise reuse the given one so all splits share one space.
        if self.svd is None:
            self.svd = TruncatedSVD(n_components=n_components, random_state=random_state)
            reduced = self.svd.fit_transform(features)
        else:
            reduced = self.svd.transform(features)

        # Features are stored in half precision; labels stay float32.
        self.X_data = torch.tensor(reduced, dtype=torch.float16)
        self.Y_data = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        """Return the number of examples."""
        return len(self.X_data)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.X_data[idx], self.Y_data[idx]


In [14]:
# Load the training split and hold out 10% of it for validation.
# NOTE: the SVD inside IMDBDataset is fitted on the whole training split, i.e. before
# the validation examples are separated out.
full_train_data = IMDBDataset('train', vocab)
train_size = int(0.9 * len(full_train_data))
val_size = len(full_train_data) - train_size
# Seed the split so the same examples land in the validation set on every run.
split_generator = torch.Generator().manual_seed(42)
train_data, val_data = random_split(full_train_data, [train_size, val_size], generator=split_generator)

In [15]:
# Reuse the SVD fitted on the training split: the test features are only transformed,
# never used for fitting, so both splits live in the same feature space.
test_data = IMDBDataset('test', vocab, svd=full_train_data.svd)


## III - Data Loader

In [16]:
# Dataloaders: all three share batch size and pinned memory; only training data is shuffled.
batch_size = 128
loader_options = {"batch_size": batch_size, "pin_memory": True}
train_loader = DataLoader(train_data, shuffle=True, **loader_options)
val_loader = DataLoader(val_data, shuffle=False, **loader_options)
test_loader = DataLoader(test_data, shuffle=False, **loader_options)


## IV - Modelo

In [18]:
class OneHotMLP(nn.Module):
    """Two-layer perceptron that produces one raw logit per example.

    Args:
        vocab_size: Number of input features per example. Despite the name,
            this is simply the input width of the first linear layer.
        hidden_size: Width of the hidden layer (defaults to 200).
    """

    def __init__(self, vocab_size, hidden_size=200):
        super().__init__()

        self.fc1 = nn.Linear(vocab_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Return logits of shape ``(..., 1)`` for inputs of shape ``(..., vocab_size)``."""
        # Cast to float32 first so inputs stored in another dtype (e.g. float16)
        # match the dtype of the layer weights.
        hidden = self.relu(self.fc1(x.float()))
        return self.fc2(hidden)

# Model instantiation
#model = OneHotMLP(vocab_size)

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Size the input layer from the dataset's feature width instead of repeating
# the SVD component count by hand.
input_dim = full_train_data.X_data.shape[1]
model = OneHotMLP(input_dim).to(device)
print(device)

cuda


## V - Laço de Treinamento - Otimização da função de Perda pelo Gradiente descendente

In [20]:
# Enable cuDNN benchmark mode.
# NOTE(review): the model defined in this notebook only uses Linear layers, so this flag
# likely has no effect here — confirm before keeping it.
torch.backends.cudnn.benchmark = True

In [21]:
scaler = torch.cuda.amp.GradScaler()


  scaler = torch.cuda.amp.GradScaler()


In [23]:
import time

# Define loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)

# Training loop
num_epochs = 5
# Mixed precision is only used on CUDA; on CPU autocast is switched off.
use_amp = device.type == "cuda"

for epoch in range(num_epochs):
    start_time = time.time()
    model.train()
    running_loss = 0.0

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device, non_blocking=True), targets.to(device, non_blocking=True)
        optimizer.zero_grad()
        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            # squeeze(-1) removes only the trailing logit axis. A bare squeeze()
            # would also collapse a batch of size 1 to a scalar, and the loss
            # would then reject the shape mismatch with `targets`.
            logits = model(inputs).squeeze(-1)
            loss = criterion(logits, targets)
        # Scale the loss before backward, then let the scaler drive the optimizer step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        running_loss += loss.item()

    # Validation (no gradient tracking, full precision)
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device, non_blocking=True), targets.to(device, non_blocking=True)
            logits = model(inputs).squeeze(-1)
            loss = criterion(logits, targets)
            val_loss += loss.item()

    # Reported losses are means over batches; the time covers training plus validation.
    end_time = time.time()
    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {running_loss/len(train_loader):.4f}, "
          f"Validation Loss: {val_loss/len(val_loader):.4f}, Time: {end_time - start_time:.2f} sec")

# Evaluation on Test Set (if required)

  with torch.cuda.amp.autocast():


Epoch [1/5], Training Loss: 0.4753, Validation Loss: 0.4795, Time: 0.69 sec
Epoch [2/5], Training Loss: 0.4742, Validation Loss: 0.4797, Time: 1.59 sec
Epoch [3/5], Training Loss: 0.4739, Validation Loss: 0.4808, Time: 0.59 sec
Epoch [4/5], Training Loss: 0.4726, Validation Loss: 0.4805, Time: 0.44 sec
Epoch [5/5], Training Loss: 0.4725, Validation Loss: 0.4778, Time: 0.45 sec


## VI - Avaliação

In [24]:
# Accuracy on the test split: threshold the sigmoid of each logit by rounding.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch_inputs, batch_targets in test_loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        probabilities = torch.sigmoid(model(batch_inputs).squeeze())
        hits = torch.round(probabilities) == batch_targets
        total += batch_targets.size(0)
        correct += hits.sum().item()
print(f'Test Accuracy: {100 * correct / total:.2f}%')

Test Accuracy: 76.86%
