# Processo Seletivo Engenheiro de LLM
versão 19 de janeiro de 2025

### Nome: Nilton Seixas

### E-mail:nfsseixas@gmail.com

## Instalação e importação de pacotes

In [2]:
!pip install datasets -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m266.2/480.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dep

In [1]:
import torch
import random
from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset
from collections import Counter
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/nilton/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from nltk.tokenize import word_tokenize

In [4]:
from sklearn.decomposition import TruncatedSVD

In [5]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/nilton/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## I - Vocabulário e Tokenização

### Exemplo do dataset

In [6]:
train_dataset = load_dataset("stanfordnlp/imdb", split="train")

In [7]:
len(train_dataset)

25000

In [8]:
# limit the vocabulary size to 20000 most frequent tokens
vocab_size = 20000
## Corrigindo o tokenizador
counter = Counter()
for sample in train_dataset:
    counter.update(word_tokenize(sample["text"].lower())) ## usando o nltk para tokenizar

# create a vocabulary of the 20000 most frequent tokens
most_frequent_words = sorted(counter, key=counter.get, reverse=True)[:vocab_size]
vocab = {word: i for i, word in enumerate(most_frequent_words, 1)}


In [9]:
def encode_sentence(sentence, vocab):
    words = word_tokenize(sentence.lower())  # Agora considera pontuação e palavras minúsculas
    return [vocab.get(word, 0) for word in words]

encode_sentence("I like Pizza.", vocab)

[15, 49, 7985, 3]

In [10]:
# Precompute One-Hot Encodings
X_data, Y_data = [], []
for sample in train_dataset:
    encoded = encode_sentence(sample["text"], vocab)
    one_hot = np.zeros(len(vocab) + 1, dtype=np.float32)
    for word in encoded:
        one_hot[word] = 1
    X_data.append(one_hot)
    Y_data.append(sample["label"])

In [11]:
# Apply SVD to Reduce Dimensionality to 10 Features
svd = TruncatedSVD(n_components=10)
X_reduced = torch.tensor(svd.fit_transform(X_data), dtype=torch.float16)
Y_tensor = torch.tensor(Y_data, dtype=torch.float32)

In [12]:
# Split into Training and Validation Sets (90% Train, 10% Validation)
train_size = int(0.9 * len(X_reduced))
val_size = len(X_reduced) - train_size

train_X, val_X = X_reduced[:train_size], X_reduced[train_size:]
train_Y, val_Y = Y_tensor[:train_size], Y_tensor[train_size:]

train_data = TensorDataset(train_X, train_Y)
val_data = TensorDataset(val_X, val_Y)


In [13]:
# DataLoaders with Parallel Loading
batch_size = 128
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

## II - Dataset

In [19]:
"""from torch.nn.functional import one_hot
# Dataset Class with One-hot Encoding
class IMDBDataset(Dataset):
    def __init__(self, split, vocab):
        self.data = load_dataset("stanfordnlp/imdb", split=split)
        self.vocab = vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        target = torch.tensor(sample["label"], dtype=torch.float32)

        indices = []
        for word in encode_sentence(sample["text"], self.vocab):
            indices.append(word)

        values = torch.ones(len(indices))
        indices = torch.tensor(indices).unsqueeze(0)
        X = torch.sparse.FloatTensor(indices, values, torch.Size([len(self.vocab) + 1]))

        return X, target

# Load Data with One-hot Encoding
#batch_size = 128
#train_data = IMDBDataset('train', vocab)
#test_data = IMDBDataset('test', vocab)

#len(train_data), len(test_data)
"""

'from torch.nn.functional import one_hot\n# Dataset Class with One-hot Encoding\nclass IMDBDataset(Dataset):\n    def __init__(self, split, vocab):\n        self.data = load_dataset("stanfordnlp/imdb", split=split)\n        self.vocab = vocab\n\n    def __len__(self):\n        return len(self.data)\n\n    def __getitem__(self, idx):\n        sample = self.data[idx]\n        target = torch.tensor(sample["label"], dtype=torch.float32)\n\n        indices = []\n        for word in encode_sentence(sample["text"], self.vocab):\n            indices.append(word)\n            \n        values = torch.ones(len(indices))  \n        indices = torch.tensor(indices).unsqueeze(0)  \n        X = torch.sparse.FloatTensor(indices, values, torch.Size([len(self.vocab) + 1]))\n\n        return X, target\n\n# Load Data with One-hot Encoding\n#batch_size = 128\n#train_data = IMDBDataset(\'train\', vocab)\n#test_data = IMDBDataset(\'test\', vocab)\n\n#len(train_data), len(test_data)\n'

In [None]:
def sparse_collate(batch):
    inputs, targets = zip(*batch)  # Separate inputs and targets

    # Convert sparse tensors to dense
    dense_inputs = [i.to_dense() for i in inputs]

    # Stack dense tensors into a batch
    batch_inputs = torch.stack(dense_inputs)
    batch_targets = torch.tensor(targets, dtype=torch.float32)

    return batch_inputs, batch_targets

## III - Data Loader

In [None]:
#train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=16, pin_memory=True, persistent_workers=True, collate_fn=sparse_collate)
#test_loader  = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=16, pin_memory=True, persistent_workers=True, collate_fn=sparse_collate)




## IV - Modelo

In [14]:
class OneHotMLP(nn.Module):
    def __init__(self, vocab_size):
        super(OneHotMLP, self).__init__()

        self.fc1 = nn.Linear(vocab_size, 200)
        self.fc2 = nn.Linear(200, 1)

        self.relu = nn.ReLU()

    def forward(self, x):
        o = self.fc1(x.float())
        o = self.relu(o)
        return self.fc2(o)

# Model instantiation
#model = OneHotMLP(vocab_size)

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = OneHotMLP(10).to(device)
print(device)

cuda


## V - Laço de Treinamento - Otimização da função de Perda pelo Gradiente descendente

In [21]:
# Verifica se há uma GPU disponível e define o dispositivo para GPU se possível,
# caso contrário, usa a CPU

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print('GPU:', torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print('using CPU')


GPU: NVIDIA L40S


In [16]:
torch.backends.cudnn.benchmark = True

In [17]:
from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()


  scaler = GradScaler()


In [18]:
gradient_accumulation_steps = 4

In [22]:
import time

# Define loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)

# Training loop
num_epochs = 5
scaler = GradScaler()

for epoch in range(num_epochs):
    start_time = time.time()
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for step, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device, non_blocking=True), targets.to(device, non_blocking=True)

        with torch.cuda.amp.autocast():
            logits = model(inputs)
            loss = criterion(logits.squeeze(), targets)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        total_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device, non_blocking=True), targets.to(device, non_blocking=True)
            logits = model(inputs)
            loss = criterion(logits.squeeze(), targets)
            val_loss += loss.item()

    epoch_time = time.time() - start_time
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {total_loss/len(train_loader):.4f}, "
          f"Val Loss: {val_loss/len(val_loader):.4f}, Time: {epoch_time:.2f}s")

# Evaluation on Test Set (if required)

  scaler = GradScaler()
  with torch.cuda.amp.autocast():


Epoch [1/5], Train Loss: 0.4994, Val Loss: 0.5246, Time: 1.52s
Epoch [2/5], Train Loss: 0.4637, Val Loss: 0.5426, Time: 1.19s
Epoch [3/5], Train Loss: 0.4632, Val Loss: 0.5326, Time: 1.26s
Epoch [4/5], Train Loss: 0.4624, Val Loss: 0.5223, Time: 1.14s
Epoch [5/5], Train Loss: 0.4615, Val Loss: 0.5946, Time: 1.15s


In [None]:
watch -n 1 nvidia-smi

In [23]:
# Load Test Dataset
test_dataset = load_dataset("stanfordnlp/imdb", split="test")

# Convert Test Data into One-Hot Encoding
X_test, Y_test = [], []
for sample in test_dataset:
    encoded = encode_sentence(sample["text"], vocab)
    one_hot = np.zeros(len(vocab) + 1, dtype=np.float32)
    for word in encoded:
        one_hot[word] = 1
    X_test.append(one_hot)
    Y_test.append(sample["label"])

# Apply the SAME SVD Transformation from Training Data
X_test_reduced = torch.tensor(svd.transform(X_test), dtype=torch.float16)  # Use .transform(), NOT .fit_transform()
Y_test_tensor = torch.tensor(Y_test, dtype=torch.float32)

# Create Test Dataset
test_data = TensorDataset(X_test_reduced, Y_test_tensor)

# Create Test DataLoader
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

## VI - Avaliação

In [24]:
## evaluation
model.eval()

with torch.no_grad():
    correct = 0
    total = 0
    for inputs, targets in test_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        logits = model(inputs)
        predicted = torch.round(torch.sigmoid(logits.squeeze()))
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

    print(f'Test Accuracy: {100 * correct / total}%')

Test Accuracy: 76.62%
