In [None]:
!pip install transformers
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Expects 'data.csv' in the working directory, with the text to classify in a
# 'tweet' column and the class label in a 'label' column.

# 1. Prepare the dataset
import pandas as pd
# Drop incomplete rows up front: pandas reads a missing tweet as a float NaN
# (not a string the tokenizer can encode), and a NaN label would show up as an
# extra class when np.unique(y) is used to size the classifier head.
data = pd.read_csv('data.csv').dropna(subset=['tweet', 'label'])
x = data['tweet'].astype(str)
y = data['label']
# 80/20 hold-out split; the fixed random_state keeps it identical across reruns.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# 2. Create a custom dataset class
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Texts to encode. Either a pandas object (indexed by position
            through ``.iloc``) or any positionally indexable sequence such as
            a list, tuple or NumPy array.
        labels: Integer class labels aligned one-to-one with ``texts``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face ``BertTokenizer``)
            that accepts the keyword arguments used in ``__getitem__`` and
            returns a mapping holding ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_length: Each example is padded or truncated to this many tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    @staticmethod
    def _item_at(values, idx):
        """Return the element at position ``idx`` of a pandas object or plain sequence."""
        # A Series produced by train_test_split keeps its original (shuffled)
        # index labels, so pandas inputs must be read by position via .iloc.
        if hasattr(values, 'iloc'):
            return values.iloc[idx]
        return values[idx]

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors and
            a scalar ``torch.long`` ``'labels'`` tensor.
        """
        text = self._item_at(self.texts, idx)
        label = self._item_at(self.labels, idx)

        # Calling the tokenizer directly is the documented replacement for the
        # deprecated encode_plus and takes the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            # return_tensors='pt' adds a leading batch axis of size 1; flatten
            # it away so the DataLoader can stack examples into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 3. Create the data loaders
max_length = 128  # tokens per example after padding / truncation
batch_size = 16
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits share one tokenizer and one sequence length.
train_dataset = CustomDataset(
    texts=X_train,
    labels=y_train,
    tokenizer=tokenizer,
    max_length=max_length,
)
test_dataset = CustomDataset(
    texts=X_test,
    labels=y_test,
    tokenizer=tokenizer,
    max_length=max_length,
)

# Only the training loader shuffles; the test loader keeps dataset order.
train_data_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_data_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Seed torch before the model is built: the classification head is randomly
# initialised, and dropout and DataLoader shuffling draw from the same global
# generator, so without a seed every rerun trains a different model.
torch.manual_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One output unit per distinct label value found in y.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(np.unique(y)))
model = model.to(device)

epochs = 3
# torch.optim.AdamW is used instead of the AdamW class from transformers, which
# is deprecated in favour of the PyTorch implementation. eps and weight_decay
# are pinned to the values the transformers class defaulted to; the PyTorch
# optimizer always applies bias correction (there is no correct_bias switch).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# The scheduler is stepped once per batch, so the decay horizon is measured in batches.
total_steps = len(train_data_loader) * epochs

# Linear decay of the learning rate to zero over the whole run, with no warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_fn = torch.nn.CrossEntropyLoss().to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
from sklearn.metrics import accuracy_score,classification_report

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Not used here (the loss is taken from ``outputs.loss``); kept
            so existing call sites keep working.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler, stepped once per batch.

    Returns:
        Tuple ``(mean batch loss, accuracy)`` over the examples seen.

    Raises:
        ValueError: If ``data_loader`` yields no examples.
    """
    model = model.train()
    losses = []
    correct_predictions = 0
    total_predictions = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients before the backward pass rather than after the step,
        # so anything left on the parameters (e.g. from an interrupted earlier
        # run of the cell) cannot leak into this update.
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        losses.append(loss.item())

        preds = outputs.logits.argmax(dim=1)
        # Count with Python ints so the totals are well defined even when the
        # loop body never runs.
        correct_predictions += (preds == labels).sum().item()
        total_predictions += len(labels)

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    if total_predictions == 0:
        raise ValueError("train_epoch: data_loader yielded no examples")

    return np.mean(losses), correct_predictions / total_predictions

def evaluate(model, data_loader, loss_fn, device):
    """Score ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean batch loss, accuracy, y_true, y_pred)`` where ``y_true``
        and ``y_pred`` are flat Python lists of class indices in loader order.

    Raises:
        ValueError: If ``data_loader`` yields no examples.
    """
    model.eval()
    losses = []
    correct_predictions = 0
    total_predictions = 0

    y_true = []
    y_pred = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Labels are not passed to the model: the loss is computed once,
            # by loss_fn below, instead of a second time inside the model.
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            loss = loss_fn(outputs.logits, labels)
            losses.append(loss.item())

            preds = outputs.logits.argmax(dim=1)
            # Count with Python ints so the totals are well defined even when
            # the loop body never runs.
            correct_predictions += (preds == labels).sum().item()
            total_predictions += len(labels)
            y_true.extend(labels.cpu().tolist())
            y_pred.extend(preds.cpu().tolist())

    if total_predictions == 0:
        raise ValueError("evaluate: data_loader yielded no examples")

    return np.mean(losses), correct_predictions / total_predictions, y_true, y_pred

# Fine-tune for `epochs` passes, scoring the held-out split after each one.
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}', '-' * 10, sep='\n')

    train_loss, train_accuracy = train_epoch(
        model=model,
        data_loader=train_data_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        scheduler=scheduler,
    )
    print(f'Train loss: {train_loss}', f'Train accuracy: {train_accuracy}', sep='\n')

    test_loss, test_accuracy, y_true, y_pred = evaluate(
        model=model,
        data_loader=test_data_loader,
        loss_fn=loss_fn,
        device=device,
    )
    print(f'Test loss: {test_loss}', f'Test accuracy: {test_accuracy}', sep='\n')

# Everything below reports the state after the final epoch only: the
# accuracies, y_true and y_pred are whatever the last loop iteration left.
print("Train Accuracy: ", train_accuracy)
print("Test Accuracy: ", test_accuracy)
print("Classification Report: ")
print(classification_report(y_true, y_pred))




Epoch 1/3
----------
Train loss: 0.45049918832115615
Train accuracy: 0.800328407224959
Test loss: 0.37801761960145086
Test accuracy: 0.8496388706500327
Epoch 2/3
----------
Train loss: 0.2872511510177469
Train accuracy: 0.8908045977011495
Test loss: 0.41219829678690684
Test accuracy: 0.8476690741956664
Epoch 3/3
----------
Train loss: 0.19993559769024766
Train accuracy: 0.9320197044334976
Test loss: 0.5482640180465145
Test accuracy: 0.8273145108338804
Train Accuracy:  0.9320197044334976
Test Accuracy:  0.8273145108338804
Classification Report: 
              precision    recall  f1-score   support

           0       0.86      0.83      0.85       874
           1       0.78      0.82      0.80       649

    accuracy                           0.83      1523
   macro avg       0.82      0.83      0.82      1523
weighted avg       0.83      0.83      0.83      1523

