In [None]:
!pip install transformers scikit-learn



In [None]:
import pandas as pd

# Load the labelled dataset: a semicolon-separated CSV decoded as latin1.
# on_bad_lines='warn' makes pandas emit a warning for malformed rows instead of raising.
# If the text comes out garbled, try another encoding such as 'cp1252' or 'utf-8'.
csv_options = {
    'sep': ';',
    'encoding': 'latin1',
    'on_bad_lines': 'warn',
}
df = pd.read_csv('/content/dados_rotulados.csv', **csv_options)

# Print the first rows as a quick sanity check of the parse.
preview = df.head()
print(preview)

         id                                               text  label
0  1,91E+18                vai. Omer teu traveco e a janja meu      1
1  1,91E+18  " amiga " basicamente foi um traveco ou seja o...      1
2  1,91E+18  "a mulher travesti" como pode uma pessoa trans...      0
3  1,91E+18  , o que o referido vereador esta mostrando com...      1
4  1,91E+18  A segunda parte quando elu fala que travesti e...      0


In [None]:
from transformers import BertTokenizer

# Load the tokenizer published with the pretrained Portuguese BERT checkpoint.
checkpoint_name = 'neuralmind/bert-base-portuguese-cased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)

# Helper that tokenizes a collection of texts with the notebook-level tokenizer
def tokenize_data(texts, max_length=128):
    """Tokenize ``texts`` with the notebook-level ``tokenizer``.

    Args:
        texts: Object exposing ``.tolist()`` (in this notebook, a pandas
            Series holding the raw sentences).
        max_length: Maximum sequence length forwarded to the tokenizer;
            truncation is enabled. Defaults to 128.

    Returns:
        Whatever ``tokenizer`` returns for the batch. The call requests
        PyTorch tensors (``return_tensors='pt'``) with ``padding=True``.
    """
    batch = texts.tolist()
    options = dict(
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    return tokenizer(batch, **options)

# Select the second column (the raw text) by position and tokenize it as one batch.
TEXT_COLUMN_INDEX = 1
texts = df.iloc[:, TEXT_COLUMN_INDEX]
encoded_data = tokenize_data(texts=texts)

In [None]:
import torch
from sklearn.model_selection import train_test_split

# Extract the labels from the third column of the DataFrame
labels = torch.tensor(df.iloc[:, 2].values)

# Split token ids, attention masks and labels together (80% train, 20% validation).
# One call partitions all three arrays with the same indices, instead of relying on
# two separate calls staying aligned only because they share random_state.
(
    train_inputs, val_inputs,
    train_masks, val_masks,
    train_labels, val_labels,
) = train_test_split(
    encoded_data['input_ids'],
    encoded_data['attention_mask'],
    labels,
    test_size=0.2,
    random_state=42,
)

# Report the split sizes and proportions
total_examples = len(encoded_data['input_ids'])
print(f"Total de exemplos: {total_examples}")
print(f"Número de exemplos no conjunto de treinamento: {len(train_inputs)}")
print(f"Número de exemplos no conjunto de teste: {len(val_inputs)}")
print(f"Proporção de treinamento: {len(train_inputs) / total_examples * 100:.2f}%")
print(f"Proporção de teste: {len(val_inputs) / total_examples * 100:.2f}%")

Total de exemplos: 144
Número de exemplos no conjunto de treinamento: 115
Número de exemplos no conjunto de teste: 29
Proporção de treinamento: 79.86%
Proporção de teste: 20.14%


In [None]:
from torch.utils.data import TensorDataset, DataLoader

# Group the tensors of each split; every dataset item is an
# (input_ids, attention_mask, label) triple.
train_tensors = (train_inputs, train_masks, train_labels)
val_tensors = (val_inputs, val_masks, val_labels)
train_dataset = TensorDataset(*train_tensors)
val_dataset = TensorDataset(*val_tensors)

# Build the DataLoaders; only the training batches are shuffled.
batch_size = 16  # adjust to the available GPU memory
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [None]:
from transformers import BertForSequenceClassification

# Seed PyTorch's random number generator before the model is built: the
# classification head is newly initialised (it is not part of the checkpoint),
# and dropout / DataLoader shuffling draw random numbers later on.
# NOTE: some CUDA kernels remain non-deterministic, so GPU runs may still differ slightly.
torch.manual_seed(42)

# Load the pretrained encoder with a 2-class classification head
model = BertForSequenceClassification.from_pretrained(
    'neuralmind/bert-base-portuguese-cased',
    num_labels=2  # 2 classes: 0 and 1
)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Assigning the result keeps the notebook from printing the full module repr
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
import torch.optim as optim

# Adam over all model parameters with a small fine-tuning learning rate
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Training loop
epochs = 20
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        # Batch tensors get their own names so the notebook-level `labels`
        # tensor (all labels, built before the train/validation split) is not
        # silently replaced by the last mini-batch.
        batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)

        optimizer.zero_grad()

        # Passing `labels` makes the model return the loss alongside the logits
        outputs = model(
            batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean training loss over the batches of this epoch
    print(f'Época {epoch + 1}, Perda média: {total_loss / len(train_dataloader)}')

Época 1, Perda média: 0.7071445807814598
Época 2, Perda média: 0.6353238075971603
Época 3, Perda média: 0.5246834754943848
Época 4, Perda média: 0.3669992461800575
Época 5, Perda média: 0.20910758897662163
Época 6, Perda média: 0.10565437050536275
Época 7, Perda média: 0.05805277777835727
Época 8, Perda média: 0.04010451631620526
Época 9, Perda média: 0.02429832168854773
Época 10, Perda média: 0.024494492099620402
Época 11, Perda média: 0.0168396431254223
Época 12, Perda média: 0.015282199252396822
Época 13, Perda média: 0.01178149797488004
Época 14, Perda média: 0.010603722417727113
Época 15, Perda média: 0.00871846079826355
Época 16, Perda média: 0.007753412064630538
Época 17, Perda média: 0.0071799312136135995
Época 18, Perda média: 0.006528369325678796
Época 19, Perda média: 0.0062988538993522525
Época 20, Perda média: 0.005549911875277758


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def collect_predictions(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and gather predicted and true classes.

    Args:
        model: Classifier whose output exposes ``.logits``.
        dataloader: Iterable yielding (input_ids, attention_mask, labels) batches.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        Tuple ``(predictions, true_labels)`` of two lists of Python ints,
        in the order the dataloader yields the examples.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        # Unpacking into function-local names avoids overwriting the
        # notebook-level `labels` tensor.
        for batch_input_ids, batch_attention_mask, batch_labels in dataloader:
            outputs = model(
                batch_input_ids.to(device),
                attention_mask=batch_attention_mask.to(device),
            )
            predicted.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
            # Labels are only read back, so they never need to leave the CPU
            expected.extend(batch_labels.tolist())
    return predicted, expected


predictions, true_labels = collect_predictions(model, val_dataloader, device)

# Compute the metrics (precision/recall/F1 are for the positive class, label 1)
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')

print(f'Acurácia: {accuracy:.4f}')
print(f'Precisão: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-score: {f1:.4f}')

Acurácia: 0.7241
Precisão: 0.7619
Recall: 0.8421
F1-score: 0.8000


In [None]:
# Check the class balance of the dataset: count how many rows carry each
# value in the third column (the label).
label_column = df.iloc[:, 2]
print(label_column.value_counts())

label
1    82
0    62
Name: count, dtype: int64


In [None]:
def predict(text):
    """Classify ``text`` with the notebook-level ``model``.

    Relies on the notebook-level ``model``, ``tokenizer`` and ``device``.

    Args:
        text: Input forwarded to the tokenizer (truncated to at most 128
            tokens). The result is read with ``.item()``, so this handles
            one example per call.

    Returns:
        'Transfobia' when the arg-max class is 1, otherwise 'Não transfobia'.
    """
    model.eval()  # switch the model to evaluation mode before inference
    encoding = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    ids = encoding['input_ids'].to(device)
    mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(ids, attention_mask=mask).logits

    predicted_class = logits.argmax(dim=1).item()
    if predicted_class == 1:
        return 'Transfobia'
    return 'Não transfobia'

# Example: classify a single sentence and print the predicted label
texto = "respeito as pessoas trans"
resultado = predict(texto)
print(resultado)

Transfobia


In [None]:
from collections import Counter

# Re-run inference on the validation set and collect predicted / true classes
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    # Batch tensors get their own names so the notebook-level `labels` tensor
    # is not overwritten by the last validation batch.
    for batch_input_ids, batch_attention_mask, batch_labels in val_dataloader:
        outputs = model(
            batch_input_ids.to(device),
            attention_mask=batch_attention_mask.to(device),
        )
        preds = torch.argmax(outputs.logits, dim=1)
        predictions.extend(preds.cpu().tolist())
        # Labels are only read back, so they never need to leave the CPU
        true_labels.extend(batch_labels.tolist())

# Compare how often each class is predicted with how often it actually occurs
print("Distribuição das previsões:", Counter(predictions))
print("Distribuição real:", Counter(true_labels))

Distribuição das previsões: Counter({1: 21, 0: 8})
Distribuição real: Counter({1: 19, 0: 10})
