<a href="https://colab.research.google.com/github/mrsidman/CyberBullyingExt/blob/main/AggressiveDetectorNew.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

path = kagglehub.dataset_download("andrewmvd/cyberbullying-classification")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/andrewmvd/cyberbullying-classification?dataset_version_number=1...


100%|██████████| 2.82M/2.82M [00:01<00:00, 2.45MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/andrewmvd/cyberbullying-classification/versions/1





In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('/root/.cache/kagglehub/datasets/andrewmvd/cyberbullying-classification/versions/1/cyberbullying_tweets.csv')
print(df.head())

                                          tweet_text cyberbullying_type
0  In other words #katandandre, your food was cra...  not_cyberbullying
1  Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying
2  @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying
3  @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying
4  @RudhoeEnglish This is an ISIS account pretend...  not_cyberbullying


In [3]:
import re

label_keys = {
    'not_cyberbullying': 0,
    'gender': 1,
    'religion': 2,
    'other_cyberbullying': 3,
    'age': 4,
    'ethnicity': 5
}

processed_messages = []
processed_labels = []

for message, label in zip(df['tweet_text'], df['cyberbullying_type']):
    # Clean the message
    message = re.sub(r'http\S+', '', message)
    message = re.sub(r'@\w+', '', message)
    message = re.sub(r'#', '', message)
    message = re.sub(r'\s+', ' ', message).strip().lower()

    if message == "":
        continue
    if label not in label_keys:
        continue

    processed_messages.append(message)
    processed_labels.append(label_keys[label])


In [4]:
from sklearn.model_selection import train_test_split

train_messages, temp_messages, train_labels, temp_labels = train_test_split(processed_messages, processed_labels, test_size=0.3, random_state=42)
val_messages, test_messages, val_labels, test_labels = train_test_split(temp_messages, temp_labels, test_size=0.5, random_state=42)

In [5]:
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_messages, truncation=True, max_length=256, padding=True, return_tensors='pt')
val_encodings = tokenizer(val_messages, truncation=True, max_length=256, padding=True, return_tensors='pt')
test_encodings = tokenizer(test_messages, truncation=True, max_length=256, padding=True, return_tensors='pt')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [6]:
import torch
class Messages(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'label': self.labels[idx]
        }

    def __len__(self):
        return len(self.labels)
train_dataset = Messages(train_encodings, train_labels)
val_dataset = Messages(val_encodings, val_labels)
test_dataset = Messages(test_encodings, test_labels)

In [7]:
batch_size = 16
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False)

In [8]:
import torch.nn as nn
from transformers import DistilBertModel

class Classifier(nn.Module):
    def __init__(self, hidden_size=768, num_classes=6):
        super(Classifier, self).__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(0.5)
        self.norm = nn.LayerNorm(hidden_size)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]
        pooled_output = self.dropout(pooled_output)
        pooled_output = self.norm(pooled_output)
        return self.fc(pooled_output)


In [None]:
import torch
import torch.nn as nn
from transformers import DistilBertModel, get_linear_schedule_with_warmup
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

learning_rate = 5e-6
num_epochs = 30
weight_decay = 0.01
warmup_steps = 0.01 * len(train_loader) * num_epochs
total_steps = len(train_loader) * num_epochs

model = Classifier().to(device)
model.load_state_dict(torch.load('/content/drive/MyDrive/cyberbullyingMl/bert_model_2_3.pth'))

for param in model.bert.parameters():
    param.requires_grad = False

for name, param in model.bert.named_parameters():
    if "transformer.layer.2" in name or "transformer.layer.3" in name:
        param.requires_grad = True

criterion = nn.CrossEntropyLoss()

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=learning_rate)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

min_val_loss = float('inf')
count = 0

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    train_loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")

    for batch in train_loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels.long())

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()
        train_loop.set_postfix(loss=loss.item())

    train_loss /= len(train_loader)

    model.eval()
    val_loss = 0
    with torch.no_grad():
        val_loop = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]")
        for batch in val_loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels.long())

            val_loss += loss.item()
            val_loop.set_postfix(val_loss=loss.item())

        val_loss /= len(val_loader)

        if val_loss < min_val_loss:
            min_val_loss = val_loss
            count = 0
            torch.save(model.state_dict(), '/content/drive/MyDrive/cyberbullyingMl/bert_model_2_3.pth')
        else:
            count += 1
            if count >= 5:
                print("Early stopping")
                break

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")


Epoch 1/30 [Train]: 100%|██████████| 2077/2077 [07:31<00:00,  4.61it/s, loss=0.453]
Epoch 1/30 [Val]: 100%|██████████| 446/446 [00:48<00:00,  9.20it/s, val_loss=1.2]


Epoch 1/30, Train Loss: 0.3151, Val Loss: 0.4279


Epoch 2/30 [Train]: 100%|██████████| 2077/2077 [07:30<00:00,  4.61it/s, loss=0.379]
Epoch 2/30 [Val]: 100%|██████████| 446/446 [00:48<00:00,  9.19it/s, val_loss=1.72]


Epoch 2/30, Train Loss: 0.3064, Val Loss: 0.4308


Epoch 3/30 [Train]:  14%|█▎        | 284/2077 [01:01<06:27,  4.63it/s, loss=0.47]

In [16]:
import torch
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm

# Load the trained model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Classifier()
model.load_state_dict(torch.load('/content/drive/MyDrive/cyberbullyingMl/bert_model_2_3.pth'))
model = model.to(device)
model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    test_loop = tqdm(test_loader, desc="Evaluating on Test Data")
    for batch in test_loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask)
        preds = torch.argmax(outputs, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Compute and print results
accuracy = accuracy_score(all_labels, all_preds)
print(f"\n✅ Test Accuracy: {accuracy:.4f}")

# Generate classification report (includes precision, recall, f1-score per class)
report = classification_report(all_labels, all_preds, digits=4)
print("\n📋 Classification Report:\n")
print(report)


Evaluating on Test Data: 100%|██████████| 446/446 [00:49<00:00,  8.97it/s]


✅ Test Accuracy: 0.8518

📋 Classification Report:

              precision    recall  f1-score   support

           0     0.6613    0.6113    0.6354      1217
           1     0.9163    0.8548    0.8845      1178
           2     0.9633    0.9641    0.9637      1171
           3     0.6385    0.7307    0.6815      1177
           4     0.9755    0.9823    0.9789      1133
           5     0.9806    0.9743    0.9774      1245

    accuracy                         0.8518      7121
   macro avg     0.8559    0.8529    0.8536      7121
weighted avg     0.8552    0.8518    0.8527      7121




