## Carregando embeddings e chunks:

In [1]:
import pickle

def _carregar_pickle(caminho):
    """Open ``caminho`` in binary mode and return the unpickled object."""
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that were produced by a trusted step of this project.
    with open(caminho, 'rb') as arquivo:
        return pickle.load(arquivo)


# Loaded in the same order as before: both embedding sets, then both chunk lists.
embeddings_mat = _carregar_pickle('variaveis_embeddings/embeddings_mat.pkl')
embeddings_port = _carregar_pickle('variaveis_embeddings/embeddings_port.pkl')
chunks_mat = _carregar_pickle('variaveis_embeddings/chunks_mat.pkl')
chunks_port = _carregar_pickle('variaveis_embeddings/chunks_port.pkl')

## Reduzindo Dimensionalidade (PCA):

In [2]:
import numpy as np

from sklearn.decomposition import PCA

# Number of principal components kept; the classifier's first layer expects
# this many input features.
PCA_COMPONENTS = 64
# Fixed seed: depending on the data shape and scikit-learn version, PCA may
# pick the randomized SVD solver, whose output changes between runs unless
# random_state is set.
PCA_SEED = 42

pca = PCA(n_components=PCA_COMPONENTS, random_state=PCA_SEED)

# Row order matters: all "mat" rows first, then all "port" rows. Later cells
# build labels and map row indices back to chunks based on this order.
embeddings_mat_port = np.vstack([embeddings_mat, embeddings_port])
pca_mat_port = pca.fit_transform(embeddings_mat_port)

print(pca_mat_port)

[[-0.08555757 -0.19331195  0.06497455 ... -0.0657347  -0.02776902
   0.03534328]
 [ 0.07595198 -0.17994625  0.12204586 ... -0.06843743  0.03015185
   0.05021355]
 [ 0.10219311 -0.04988923  0.22471306 ...  0.01453297  0.03714858
   0.02634114]
 ...
 [ 0.31093809  0.11143037 -0.04247366 ... -0.05224939  0.02923046
   0.0054573 ]
 [ 0.33120463 -0.02168424  0.00551711 ... -0.0234345   0.021756
  -0.02266387]
 [ 0.34626674  0.11101754 -0.20030809 ...  0.02186001 -0.00778661
   0.02728312]]


## Modelo de Classificação Binária (PyTorch):

In [3]:
import torch

import torch.nn as nn

class BinaryClassifier(nn.Module):
    """Small feed-forward network that maps a feature vector to a probability.

    The network is a stack of ``Linear -> ReLU`` pairs followed by a final
    ``Linear(..., 1)`` and a ``Sigmoid``, so the output lies in ``[0, 1]`` and
    can be fed directly to ``nn.BCELoss``.

    Args:
        input_dim: Number of input features per sample. Defaults to 64.
        hidden_dims: Widths of the hidden layers, in order. Defaults to
            ``(32, 16)``.

    With the default arguments the layer layout (and therefore the
    ``state_dict`` keys ``model.0`` ... ``model.4``) is ``64 -> 32 -> 16 -> 1``.
    """

    def __init__(self, input_dim: int = 64, hidden_dims=(32, 16)):
        super().__init__()
        camadas = []
        largura_anterior = input_dim
        # Layers are created in forward order so random initialisation draws
        # from the RNG in the same sequence as a hand-written Sequential.
        for largura in hidden_dims:
            camadas.append(nn.Linear(largura_anterior, largura))
            camadas.append(nn.ReLU())
            largura_anterior = largura
        camadas.append(nn.Linear(largura_anterior, 1))
        camadas.append(nn.Sigmoid())
        self.model = nn.Sequential(*camadas)

    def forward(self, x):
        """Return the predicted probability for each row of ``x`` as an ``(N, 1)`` tensor."""
        return self.model(x)

# Seed PyTorch's global RNG before the layers are created so the randomly
# initialised weights are the same on every fresh "Restart & Run All".
torch.manual_seed(42)

binary_classifier = BinaryClassifier()
print(binary_classifier)

BinaryClassifier(
  (model): Sequential(
    (0): Linear(in_features=64, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=1, bias=True)
    (5): Sigmoid()
  )
)


In [4]:
from torch.utils.data import TensorDataset, DataLoader

X = torch.tensor(pca_mat_port, dtype=torch.float32)

# Labels follow the row order of the stacked PCA input: "mat" rows (label 1)
# come first, "port" rows (label 0) after. Counts are taken from the loaded
# embeddings instead of being hard-coded, so they stay correct if the data changes.
n_mat = len(embeddings_mat)
n_port = len(embeddings_port)
y = np.concatenate([np.ones(n_mat), np.zeros(n_port)])
assert len(y) == X.shape[0], "number of labels must match the number of PCA rows"
Y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

dataset = TensorDataset(X, Y)
# Dedicated, seeded generator so the mini-batch shuffle order is reproducible.
SHUFFLE_SEED = 42
gerador_shuffle = torch.Generator().manual_seed(SHUFFLE_SEED)
loader = DataLoader(dataset, batch_size=32, shuffle=True, generator=gerador_shuffle)

# `modelo` is an alias of `binary_classifier`: re-running this cell keeps
# training the same instance rather than starting from fresh weights.
modelo = binary_classifier
loss_function = nn.BCELoss()
optimizer = torch.optim.Adam(modelo.parameters(), lr=1e-3)

epocas = 500
early_stopping = 1e-5  # minimum drop in mean epoch loss that counts as progress
PACIENCIA_MAX = 5      # consecutive stalled epochs tolerated before stopping
ultima_perda = None
paciencia = 0

for epoca in range(epocas):
    modelo.train()
    perda = 0.0
    for batch_X, batch_y in loader:
        optimizer.zero_grad()
        outputs = modelo(batch_X)
        loss = loss_function(outputs, batch_y)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the last (possibly smaller) batch is not over-counted.
        perda += loss.item() * batch_X.size(0)

    perda_media_epoca = perda / len(dataset)
    # Log every epoch, including stalled ones, so the stopping decision is visible.
    print(f"Época: {epoca+1}/{epocas}, Loss: {perda_media_epoca:.4f}")

    melhorou = ultima_perda is None or ultima_perda - perda_media_epoca >= early_stopping
    if melhorou:
        # Patience counts *consecutive* stalled epochs, so progress resets it.
        paciencia = 0
    else:
        paciencia += 1
        if paciencia >= PACIENCIA_MAX:
            print("Early Stopping!")
            break
    ultima_perda = perda_media_epoca

Época: 1/500, Loss: 0.6898
Época: 2/500, Loss: 0.6727
Época: 3/500, Loss: 0.6396
Época: 4/500, Loss: 0.5779
Época: 5/500, Loss: 0.4792
Época: 6/500, Loss: 0.3564
Época: 7/500, Loss: 0.2428
Época: 8/500, Loss: 0.1581
Época: 9/500, Loss: 0.1024
Época: 10/500, Loss: 0.0690
Época: 11/500, Loss: 0.0478
Época: 12/500, Loss: 0.0353
Época: 13/500, Loss: 0.0270
Época: 14/500, Loss: 0.0212
Época: 15/500, Loss: 0.0173
Época: 16/500, Loss: 0.0143
Época: 17/500, Loss: 0.0121
Época: 18/500, Loss: 0.0103
Época: 19/500, Loss: 0.0089
Época: 20/500, Loss: 0.0078
Época: 21/500, Loss: 0.0069
Época: 22/500, Loss: 0.0061
Época: 23/500, Loss: 0.0054
Época: 24/500, Loss: 0.0049
Época: 25/500, Loss: 0.0044
Época: 26/500, Loss: 0.0040
Época: 27/500, Loss: 0.0037
Época: 28/500, Loss: 0.0034
Época: 29/500, Loss: 0.0031
Época: 30/500, Loss: 0.0029
Época: 31/500, Loss: 0.0027
Época: 32/500, Loss: 0.0025
Época: 33/500, Loss: 0.0023
Época: 34/500, Loss: 0.0022
Época: 35/500, Loss: 0.0020
Época: 36/500, Loss: 0.0019
É

In [7]:
# NOTE(review): this is the same PCA matrix the model was trained on, so the
# scores below describe uncertainty on training rows, not held-out data.
X_test = torch.tensor(pca_mat_port, dtype=torch.float32)
y_test = y

modelo.eval()
with torch.no_grad():
    outputs_test = modelo(X_test)
    # Drop only the trailing size-1 axis; a bare squeeze() would also collapse
    # the row axis when there is a single sample.
    probs = outputs_test.squeeze(1).numpy()
    # eps keeps log() finite when a probability is exactly 0 or 1.
    eps = 1e-8
    # Binary entropy per row: largest when the predicted probability is near 0.5.
    entropy = - (probs * np.log(probs + eps) + (1 - probs) * np.log(1 - probs + eps))


# argmax finds the highest-entropy row directly, without sorting the whole array.
indice_maior_duvida = np.argmax(entropy)
print(indice_maior_duvida)

3


In [8]:
# Translate the row index of the stacked matrix back to its source list:
# indices below len(chunks_mat) belong to "mat", the rest are offset into "port".
idx = int(indice_maior_duvida)
n_mat = len(chunks_mat)

if idx >= n_mat:
    titulo, colecao, posicao = "Chunk com maior dúvida (port):", chunks_port, idx - n_mat
else:
    titulo, colecao, posicao = "Chunk com maior dúvida (mat):", chunks_mat, idx

print(titulo)
print(colecao[posicao].page_content)

Chunk com maior dúvida (mat):
Samantha Onofre Lóssio 
Tibério Bezerra Soares
Revisão Textual
Aurea Suely Zavam
Nukácia Meyre Araújo de Almeida
Revisão Web
Antônio Carlos Marques Júnior
Débora Liberato Arruda Hissa
Saulo Garcia
Logística
Francisco Roberto Dias de Aguiar
Virgínia Ferreira Moreira
Secretários
Breno Giovanni Silva Araújo
Francisca Venâncio da Silva
Auxiliar
Ana Paula Gomes Correia
Bernardo Matias de Carvalho
Charlene Oliveira da Silveira
Isabella de Castro Britto
Vivianny de Lima Santiago
Wagner Souto Fernandes
