In [2]:
# Sentimental Analysis Algorithm - Algoritmo de Análise de Sentimento
%pip install torch --quiet
%pip install pandas scikit-learn nltk --quiet


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [5]:
# === IMPORTAÇÕES ===
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import RSLPStemmer
import re
import webbrowser
import pickle

# Downloads necessários para o NLTK
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('rslp')

[nltk_data] Downloading package punkt to C:\Users\Luca
[nltk_data]     Flores\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Luca
[nltk_data]     Flores\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to C:\Users\Luca
[nltk_data]     Flores\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to C:\Users\Luca
[nltk_data]     Flores\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

In [80]:
# === 1. PRÉ-PROCESSAMENTO DE TEXTO ===
def preprocess_text(text):
    # Converte para minúsculas, remove pontuação, tokeniza, remove stopwords e aplica stemmer
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text, language='portuguese')
    stop_words = set(stopwords.words('portuguese'))
    stemmer = RSLPStemmer()
    tokens = [stemmer.stem(w) for w in tokens if w not in stop_words]
    return tokens


# === 2. CARGA E TRATAMENTO DOS DADOS ===
def load_data(path):
    df = pd.read_csv(path, encoding='utf-8')
    df.dropna(inplace=True)
    df['tokens'] = df['texto'].apply(preprocess_text)  # Aplica o pré-processamento
    return df

# Cria vocabulário baseado na frequência mínima dos tokens
def build_vocab(token_lists, min_freq=1):
    freq = {}
    for tokens in token_lists:
        for token in tokens:
            freq[token] = freq.get(token, 0) + 1
    vocab = {word: i + 1 for i, (word, count) in enumerate(freq.items()) if count >= min_freq}
    vocab['<UNK>'] = 0  # Token desconhecido
    return vocab

# Codifica uma lista de tokens para uma sequência de índices do vocabulário
def encode_tokens(tokens, vocab, max_len=50):
    indices = [vocab.get(t, 0) for t in tokens]
    return indices[:max_len] + [0] * max(0, max_len - len(indices))


# === 3. CLASSE DE DATASET PERSONALIZADA ===
class TextDataset(Dataset):
    def __init__(self, df, vocab, label_encoder):
        self.inputs = [encode_tokens(t, vocab) for t in df['tokens']]
        self.labels = label_encoder.transform(df['sentimento'])

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return torch.tensor(self.inputs[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)


# === 4. DEFINIÇÃO DO MODELO ===
# Rede neural feedforward simples (MLP — Perceptron Multicamadas)
class SentimentClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, hidden_dim,dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, 
                            batch_first=True) # LSTM opcional, mas não usado no original
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)  # lstm_out: (batch, seq_len, hidden_dim)
        pooled = lstm_out[:, -1, :]       # pega o último hidden state
        # pooled = embedded.mean(dim=1)  # Média dos embeddings (pooling)
        dropped = self.dropout(pooled)
        return self.fc(dropped)


# === 5. TREINAMENTO DO MODELO ===
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs, device, label_encoder):
    model.to(device)

    # for epoch in range(epochs):
    #     model.train()
    #     total_loss = 0
    #     for x_batch, y_batch in train_loader:
    #         x_batch, y_batch = x_batch.to(device), y_batch.to(device)
    #         optimizer.zero_grad()
    #         outputs = model(x_batch)
    #         loss = criterion(outputs, y_batch)
    #         loss.backward()
    #         optimizer.step()
    #         total_loss += loss.item()

    #     # Validação
    #     model.eval()
    #     all_preds, all_labels = [], []
    #     with torch.no_grad():
    #         for x_val, y_val in val_loader:
    #             x_val = x_val.to(device)
    #             outputs = model(x_val)
    #             preds = torch.argmax(outputs, dim=1).cpu().numpy()
    #             all_preds.extend(preds)
    #             all_labels.extend(y_val.numpy())

    #     epoch_acc = accuracy_score(all_labels, all_preds)
    #     print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}, Accuracy: {epoch_acc:.2f}")
    #     print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_, zero_division=0))

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct_train = 0                # <-- Adicionado: conta acertos no treino
        total_train = 0                  # <-- Adicionado: conta exemplos no treino
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)           # <-- Adicionado: calcula predições no treino
            correct_train += (preds == y_batch).sum().item()# <-- Adicionado: soma acertos no treino
            total_train += y_batch.size(0)                  # <-- Adicionado: soma total de exemplos no treino

        train_acc = correct_train / total_train             # <-- Adicionado: calcula acurácia de treino

        # Validação
        model.eval()
        all_preds, all_labels = [], []
        val_loss = 0                                       # <-- Adicionado: soma loss de validação
        with torch.no_grad():
            for x_val, y_val in val_loader:
                x_val, y_val = x_val.to(device), y_val.to(device)
                outputs = model(x_val)
                loss = criterion(outputs, y_val)            # <-- Adicionado: calcula loss de validação
                val_loss += loss.item()                     # <-- Adicionado: soma loss de validação
                preds = torch.argmax(outputs, dim=1).cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(y_val.cpu().numpy())

        val_acc = accuracy_score(all_labels, all_preds)     # <-- Adicionado: calcula acurácia de validação
        print(f"Epoch {epoch + 1}/{epochs}, "
            f"Train Loss: {total_loss / len(train_loader):.4f}, Train Acc: {train_acc:.2f}, "
            f"Val Loss: {val_loss / len(val_loader):.4f}, Val Acc: {val_acc:.2f}") # <-- Adicionado: imprime tudo
    

    # Relatório final
    final_report = classification_report(all_labels, all_preds, target_names=label_encoder.classes_, output_dict=True, zero_division=0)
    final_accuracy = accuracy_score(all_labels, all_preds)
    conf_matrix = confusion_matrix(all_labels, all_preds)

    html_report = classification_report_to_html(final_report, label_encoder.classes_, final_accuracy, conf_matrix)

    html_path = 'resultado_validacao.html'
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(html_report)

    print(f'Relatório final salvo em {html_path}')
    webbrowser.open(f'file://{os.path.abspath(html_path)}')

    # Salva modelo treinado
    torch.save(model.state_dict(), 'modelo_treinado.pth')
    print("Modelo salvo como modelo_treinado.pth")


# === CONVERSÃO DE RELATÓRIO PARA HTML ===
def classification_report_to_html(report_dict, class_names, accuracy, conf_matrix):
    html = f"""
    <html><head><title>Relatório de Validação</title></head>
    <body>
    <h1>Relatório de Classificação</h1>
    <h2>Acurácia Final: {accuracy:.2f}</h2>

    <h3>Métricas por Classe</h3>
    <table border="1" cellpadding="8">
    <tr><th>Classe</th><th>Precisão</th><th>Recall</th><th>F1-score</th><th>Suporte</th></tr>
    """
    for label in class_names:
        metrics = report_dict[label]
        html += f"<tr><td>{label}</td><td>{metrics['precision']:.2f}</td><td>{metrics['recall']:.2f}</td><td>{metrics['f1-score']:.2f}</td><td>{metrics['support']}</td></tr>"
    html += "</table>"

    # Matriz de confusão
    html += "<h3>Matriz de Confusão</h3><table border='1' cellpadding='8'><tr><th></th>"
    for label in class_names:
        html += f"<th>{label}</th>"
    html += "</tr>"
    for i, row in enumerate(conf_matrix):
        html += f"<tr><th>{class_names[i]}</th>"
        for val in row:
            html += f"<td>{val}</td>"
        html += "</tr>"
    html += "</table>"

    html += "</body></html>"
    return html

In [81]:
# === 6. EXECUÇÃO PRINCIPAL ===
def main():
    filepath = r"C:\Users\Luca Flores\Downloads\posts.csv"  # Caminho para seu dataset
    df = load_data(filepath)

    vocab = build_vocab(df['tokens'])
    label_encoder = LabelEncoder()
    label_encoder.fit(df['sentimento'])

    dataset = TextDataset(df, vocab, label_encoder)

    # Divisão simples: primeiros 16 para treino, 4 para validação
    train_indices = list(range(16))
    val_indices = list(range(16, 20))

    train_data = Subset(dataset, train_indices)
    val_data = Subset(dataset, val_indices)

    train_loader = DataLoader(train_data, batch_size=4, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = SentimentClassifier(
        vocab_size=len(vocab),
        embed_dim=25,
        num_classes=len(label_encoder.classes_),
        hidden_dim=16,  # Dimensão do hidden state da LSTM
        dropout=0.5
    )

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=0.01, alpha=0.9)

    train_model(model, train_loader, val_loader, criterion, optimizer, epochs=16, device=device,
                label_encoder=label_encoder)

    # Salva vocabulário
    with open('vocab.pkl', 'wb') as f:
        pickle.dump(vocab, f)

    # Salva codificador de labels
    with open('label_encoder.pkl', 'wb') as f:
        pickle.dump(label_encoder, f)

    print("Vocabulário salvo como vocab.pkl")
    print("LabelEncoder salvo como label_encoder.pkl")


if __name__ == "__main__":
    main()

Epoch 1/16, Train Loss: 1.1515, Train Acc: 0.31, Val Loss: 1.0980, Val Acc: 0.25
Epoch 2/16, Train Loss: 1.2096, Train Acc: 0.31, Val Loss: 1.1028, Val Acc: 0.25
Epoch 3/16, Train Loss: 1.0786, Train Acc: 0.44, Val Loss: 1.1121, Val Acc: 0.25
Epoch 4/16, Train Loss: 1.1834, Train Acc: 0.25, Val Loss: 1.0921, Val Acc: 0.25
Epoch 5/16, Train Loss: 1.2004, Train Acc: 0.25, Val Loss: 1.0910, Val Acc: 0.50
Epoch 6/16, Train Loss: 1.1065, Train Acc: 0.44, Val Loss: 1.0951, Val Acc: 0.25
Epoch 7/16, Train Loss: 1.1411, Train Acc: 0.31, Val Loss: 1.0925, Val Acc: 0.50
Epoch 8/16, Train Loss: 1.1021, Train Acc: 0.31, Val Loss: 1.0875, Val Acc: 0.50
Epoch 9/16, Train Loss: 1.1494, Train Acc: 0.19, Val Loss: 1.0743, Val Acc: 0.50
Epoch 10/16, Train Loss: 1.1335, Train Acc: 0.31, Val Loss: 1.0627, Val Acc: 0.50
Epoch 11/16, Train Loss: 1.1099, Train Acc: 0.44, Val Loss: 1.0666, Val Acc: 0.50
Epoch 12/16, Train Loss: 1.0915, Train Acc: 0.38, Val Loss: 1.0577, Val Acc: 0.50
Epoch 13/16, Train Loss: 