# Processo Seletivo Engenheiro de LLM
versão 19 de janeiro de 2025

### Nome: Nilton Seixas

### E-mail:nfsseixas@gmail.com

## Instalação e importação de pacotes

In [1]:
!pip install datasets -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m471.0/480.6 kB[0m [31m21.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency 

In [45]:
import torch
import random
from torch.utils.data import Dataset, DataLoader

from collections import Counter
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset

In [46]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/nilton/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [47]:
from nltk.tokenize import word_tokenize

In [48]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/nilton/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## I - Vocabulário e Tokenização

### Exemplo do dataset

In [49]:
train_dataset = load_dataset("stanfordnlp/imdb", split="train")

In [50]:
len(train_dataset)

25000

In [51]:
# limit the vocabulary size to 20000 most frequent tokens
vocab_size = 20000
## Corrigindo o tokenizador
counter = Counter()
for sample in train_dataset:
    counter.update(word_tokenize(sample["text"].lower())) ## usando o nltk para tokenizar

# create a vocabulary of the 20000 most frequent tokens
most_frequent_words = sorted(counter, key=counter.get, reverse=True)[:vocab_size]
vocab = {word: i for i, word in enumerate(most_frequent_words, 1)}
vocab_size = len(vocab)

In [52]:
def encode_sentence(sentence, vocab):
    words = word_tokenize(sentence.lower())  # Agora considera pontuação e palavras minúsculas
    return [vocab.get(word, 0) for word in words]

encode_sentence("I like Pizza.", vocab)

[15, 49, 7985, 3]

## II - Dataset

In [87]:
from torch.nn.functional import one_hot
# Dataset Class with One-hot Encoding
class IMDBDataset(Dataset):
    def __init__(self, split, vocab):
        self.data = load_dataset("stanfordnlp/imdb", split=split)
        self.vocab = vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        target = torch.tensor(sample["label"], dtype=torch.float32)

        indices = []
        for word in encode_sentence(sample["text"], self.vocab):
            indices.append(word)
            
        values = torch.ones(len(indices))  
        indices = torch.tensor(indices).unsqueeze(0)  
        X = torch.sparse.FloatTensor(indices, values, torch.Size([len(self.vocab) + 1]))

        return X, target

# Load Data with One-hot Encoding
batch_size = 128
train_data = IMDBDataset('train', vocab)
test_data = IMDBDataset('test', vocab)

len(train_data), len(test_data)

(25000, 25000)

In [88]:
def sparse_collate(batch):
    inputs, targets = zip(*batch)  # Separate inputs and targets

    # Convert sparse tensors to dense
    dense_inputs = [i.to_dense() for i in inputs]

    # Stack dense tensors into a batch
    batch_inputs = torch.stack(dense_inputs)
    batch_targets = torch.tensor(targets, dtype=torch.float32)

    return batch_inputs, batch_targets

## III - Data Loader

In [89]:
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=16, pin_memory=True, persistent_workers=True, collate_fn=sparse_collate)
test_loader  = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=16, pin_memory=True, persistent_workers=True, collate_fn=sparse_collate)




## IV - Modelo

In [90]:
class OneHotMLP(nn.Module):
    def __init__(self, vocab_size):
        super(OneHotMLP, self).__init__()

        self.fc1 = nn.Linear(vocab_size+1, 200)
        self.fc2 = nn.Linear(200, 1)

        self.relu = nn.ReLU()

    def forward(self, x):
        o = self.fc1(x.float())
        o = self.relu(o)
        return self.fc2(o)

# Model instantiation
#model = OneHotMLP(vocab_size)

In [97]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = OneHotMLP(vocab_size).to(device)
print(device)

cuda


## V - Laço de Treinamento - Otimização da função de Perda pelo Gradiente descendente

In [92]:
# Verifica se há uma GPU disponível e define o dispositivo para GPU se possível,
# caso contrário, usa a CPU

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print('GPU:', torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print('using CPU')


GPU: NVIDIA L40S


In [93]:
torch.backends.cudnn.benchmark = True

In [94]:
from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()


  scaler = GradScaler()


In [95]:
gradient_accumulation_steps = 4

In [98]:
import time

# Define loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)

# Training loop
num_epochs = 5
scaler = GradScaler()

for epoch in range(num_epochs):
    start_time = time.time()
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for step, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device, non_blocking=True), targets.to(device, non_blocking=True)

        with autocast(dtype=torch.float16):
            logits = model(inputs)
            loss = criterion(logits.squeeze(), targets) / gradient_accumulation_steps

        scaler.scale(loss).backward()

        if (step + 1) % gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        total_loss += loss.item() * gradient_accumulation_steps

    epoch_time = time.time() - start_time
    avg_loss = total_loss / len(train_loader)

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Time: {epoch_time:.2f}s")


  scaler = GradScaler()
  with autocast(dtype=torch.float16):


Epoch [1/5], Loss: 0.6312, Time: 5.43s
Epoch [2/5], Loss: 0.5398, Time: 5.43s
Epoch [3/5], Loss: 0.4930, Time: 5.39s
Epoch [4/5], Loss: 0.4872, Time: 5.40s
Epoch [5/5], Loss: 0.4486, Time: 5.40s


In [None]:
watch -n 1 nvidia-smi

## VI - Avaliação

In [99]:
## evaluation
model.eval()

with torch.no_grad():
    correct = 0
    total = 0
    for inputs, targets in test_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        logits = model(inputs)
        predicted = torch.round(torch.sigmoid(logits.squeeze()))
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

    print(f'Test Accuracy: {100 * correct / total}%')

  X = torch.sparse.FloatTensor(indices, values, torch.Size([len(self.vocab) + 1]))
  X = torch.sparse.FloatTensor(indices, values, torch.Size([len(self.vocab) + 1]))
  X = torch.sparse.FloatTensor(indices, values, torch.Size([len(self.vocab) + 1]))
  X = torch.sparse.FloatTensor(indices, values, torch.Size([len(self.vocab) + 1]))
  X = torch.sparse.FloatTensor(indices, values, torch.Size([len(self.vocab) + 1]))
  X = torch.sparse.FloatTensor(indices, values, torch.Size([len(self.vocab) + 1]))
  X = torch.sparse.FloatTensor(indices, values, torch.Size([len(self.vocab) + 1]))
  X = torch.sparse.FloatTensor(indices, values, torch.Size([len(self.vocab) + 1]))
  X = torch.sparse.FloatTensor(indices, values, torch.Size([len(self.vocab) + 1]))
  X = torch.sparse.FloatTensor(indices, values, torch.Size([len(self.vocab) + 1]))
  X = torch.sparse.FloatTensor(indices, values, torch.Size([len(self.vocab) + 1]))
  X = torch.sparse.FloatTensor(indices, values, torch.Size([len(self.vocab) + 1]))
  X 

Test Accuracy: 82.892%
