# Classificação de sentimentos de Tweets com BERT

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch import nn, optim, tensor, no_grad, max
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from transformers import BertForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score

### Carregar os dados

In [None]:
# The CSV is read with header=None, so the six column names are supplied
# explicitly; the file is decoded as latin-1.
df = pd.read_csv(
    '../data/training.1600000.processed.noemoticon.csv',
    header=None,
    names=['target', 'id', 'date', 'flag', 'user', 'text'],
    encoding='latin-1',
)
df.head()

In [None]:
# Print a concise summary of the loaded DataFrame.
df.info()

### Extrair uma amostra

In [None]:
# Continue with a 0.1% random sample of the rows. The sample is seeded so that
# reruns select the same rows; without a seed the seeded train/test split done
# later in this notebook would still operate on different data every run.
print(f"Shape of the dataframe before: {df.shape}")

df = df.sample(frac=0.001, random_state=42).reset_index(drop=True)

print(f"Shape of the dataframe after: {df.shape}")

### Remover colunas irrelevantes

In [None]:
# Discard, in place, the metadata columns that the rest of the notebook never
# reads; only 'target' and 'text' are used afterwards.
df.drop(columns=['id', 'date', 'flag', 'user'], inplace=True)
df.head()

### Divisão dos dados em treino e teste

In [None]:
# Features are the tweet texts; labels come from 'target' with class 4
# remapped to 1.
X = df['text']
y = df['target'].replace({4: 1})

# Hold out 20% of the rows for testing; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


### Tokenizar os textos

In [None]:
# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_batch(batch):
    """Tokenize a collection of texts into padded PyTorch tensors.

    Uses the module-level ``tokenizer``.

    Args:
        batch: Object exposing ``tolist()`` that yields the texts (this
            notebook passes a pandas Series of strings).

    Returns:
        The tokenizer's batch encoding; this notebook reads its
        ``input_ids`` and ``attention_mask`` entries.
    """
    # Calling the tokenizer directly is the documented entry point;
    # ``batch_encode_plus`` is deprecated in its favour.
    return tokenizer(
        batch.tolist(),
        add_special_tokens=True,
        max_length=512,  # upper bound applied by truncation=True
        padding=True,
        truncation=True,
        return_tensors="pt",
    )


# Tokenize the texts. X_train / X_test are rebound here from the text Series
# to the encodings returned by tokenize_batch.
X_train = tokenize_batch(X_train)
X_test = tokenize_batch(X_test)

### Converter listas para Tensores

In [None]:
# tokenize_batch requests return_tensors="pt", so the encodings already hold
# tensors. Reuse them directly: wrapping an existing tensor in tensor() makes
# a redundant copy and triggers PyTorch's copy-construct UserWarning.
X_train_seq = X_train['input_ids']
X_train_mask = X_train['attention_mask']
# The labels are still a pandas Series, so they do need converting.
y_train_tensor = tensor(y_train.tolist())

X_test_seq = X_test['input_ids']
X_test_mask = X_test['attention_mask']
y_test_tensor = tensor(y_test.tolist())

### Dataloader

In [None]:
# Number of examples per mini-batch
batch_size = 32

# Bundle token ids, attention masks and labels so they are indexed together
train_data = TensorDataset(X_train_seq, X_train_mask, y_train_tensor)
test_data = TensorDataset(X_test_seq, X_test_mask, y_test_tensor)

# Training examples are drawn in random order, test examples in dataset order
train_sampler = RandomSampler(train_data)
test_sampler = SequentialSampler(test_data)

# DataLoader for the training set
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# DataLoader for the test set
test_dataloader = DataLoader(test_data, batch_size=batch_size, sampler=test_sampler)

### Fine-tuning do modelo BERT

In [None]:
# Load the pretrained model configured for two output labels
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Replace the classification head with a small MLP: 768 -> 256 -> 2
model.classifier = nn.Sequential(
    nn.Linear(768, 256),
    nn.ReLU(),
    nn.Linear(256, 2),
)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

num_epochs = 5

# Train the model
for epoch in range(num_epochs):
    model.train()  # put the model in training mode at the start of each epoch
    # Each batch is (input ids, attention mask, labels), in the order the
    # tensors were given to the TensorDataset.
    for step, (b_input_ids, b_input_mask, b_labels) in enumerate(train_dataloader):
        # Reset gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass. The loss is computed explicitly with `criterion`
        # (previously defined but never used) rather than implicitly inside
        # the model via labels=, so the objective is visible here.
        outputs = model(b_input_ids, attention_mask=b_input_mask)
        loss = criterion(outputs.logits, b_labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        if step % 100 == 0:
            print(f"Epoch: {epoch} - Step: {step} - Loss: {loss.item()}")

### Avaliação do modelo

In [None]:
# Switch the model to evaluation mode before scoring the test set
model.eval()

predictions = []
true_labels = []

for b_input_ids, b_input_mask, b_labels in test_dataloader:
    # Inference only, so gradient tracking is disabled for the forward pass
    with no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)

    # Predicted class is the index of the largest logit in each row
    logits = outputs.logits
    predicted_labels = logits.argmax(dim=1)

    predictions.extend(predicted_labels.cpu().numpy())
    true_labels.extend(b_labels.cpu().numpy())

# Compare the collected predictions against the reference labels
accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)

print("Acurácia:", accuracy)
print("Precisão:", precision)
print("Recall:", recall)