<a href="https://colab.research.google.com/github/mrsidman/CyberBullyingExt/blob/main/AggressiveDetector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!git clone https://github.com/mrsidman/CyberBullyingExt.git
%cd CyberBullyingExt

import pandas as pd

df = pd.read_csv('encoded_messages.csv')
messages = df['0']
labels = df['Label']


Cloning into 'CyberBullyingExt'...
remote: Enumerating objects: 15, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 15 (delta 3), reused 7 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (15/15), 20.59 MiB | 12.24 MiB/s, done.
Resolving deltas: 100% (3/3), done.
/content/CyberBullyingExt


In [3]:
df = pd.read_csv('word_index.csv')
word_index = dict(zip(df.iloc[:, 0], df.iloc[:, 1]))
print(list(word_index.items())[:10])

[('<PAD>', 0), ('<UNK>', 1), ('bye', 2), ('dear', 3), ('bajaj', 4), ('I', 5), ('get', 6), ('some', 7), ('well', 8), ('work', 9)]


In [4]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2025-06-26 16:51:48--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-06-26 16:51:48--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-06-26 16:51:49--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [5]:
print(messages.head())

0    [2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
1    [18, 19, 20, 15, 21, 22, 23, 24, 25, 26, 27, 2...
2    [35, 36, 30, 37, 38, 39, 33, 40, 41, 42, 43, 4...
3                                             [61, 62]
4    [8, 30, 44, 21, 63, 64, 65, 66, 10, 67, 25, 68...
Name: 0, dtype: object


In [6]:
import torch
import numpy as np
from torch.nn.utils.rnn import pad_sequence
import ast

maxlen = int(np.percentile([len(seq) for seq in messages], 95))

padded_message = pad_sequence([torch.tensor(ast.literal_eval(seq), dtype=torch.long) for seq in messages], batch_first=True, padding_value=word_index['<PAD>'])

In [7]:
glove = {}
with open("glove.6B.100d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove[word] = vector

embedding_matrx = np.zeros((len(word_index) + 1, 100))

for word, index in word_index.items():
  if word in glove:
    embedding_matrx[index] = glove[word]

In [29]:
import torch.nn as nn
import torch

class Classifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim):
        super().__init__()
        num_embeddings, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(embedding_matrix),
            freeze=True,
            padding_idx=word_index['<PAD>']
        )
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
            num_layers=2,
            dropout=0.2
        )
        self.fc = nn.Linear(2 * hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        x = self.embedding(x)
        output, (hidden, cell) = self.rnn(x)
        h_cat = torch.cat((hidden[0], hidden[1]), dim=1)
        h_cat = self.dropout(h_cat)
        x = self.fc(h_cat)
        return x


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(padded_message, labels, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [33]:
from torch.utils.data import DataLoader, TensorDataset
import torch

batch_size = 32
hidden_size = 128
output_size = 1

y_train_tensor = torch.from_numpy(y_train.values).float()
y_val_tensor = torch.from_numpy(y_val.values).float()
y_test_tensor = torch.from_numpy(y_test.values).float()

train_dataset = TensorDataset(X_train, y_train_tensor)
val_dataset = TensorDataset(X_val, y_val_tensor)
test_dataset = TensorDataset(X_test, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
learning_rate = 1e-3
num_epochs = 100

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Classifier(embedding_matrx, hidden_size, output_size).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', 0.1, 5)

min_val_loss = float('inf')
count = 0
for epoch in range(num_epochs):
  model.train()
  train_loss = 0
  for inputs, labels in train_loader:
    inputs = inputs.to(device)
    labels = labels.unsqueeze(1)
    labels = labels.to(device)
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    train_loss += loss.item()
  train_loss /= len(train_loader)

  model.eval()
  with torch.no_grad():
    val_loss = 0
    for inputs, labels in val_loader:
      inputs = inputs.to(device)
      labels = labels.unsqueeze(1)
      labels = labels.to(device)
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      val_loss += loss.item()
    val_loss /= len(val_loader)

    if val_loss < min_val_loss:
      min_val_loss = val_loss
      count = 0
      import os
      if not os.path.exists('/content/CyberBullyingExt/models'):
          os.makedirs('/content/CyberBullyingExt/models')
      torch.save(model.state_dict(), '/content/CyberBullyingExt/models/best_model.pth')
    else:
      count += 1
    if count == 10:
      print("Early stopping triggered.")
      break
    scheduler.step(val_loss)
  print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

Epoch 1/100, Train Loss: 0.3884, Val Loss: 0.3577
Epoch 2/100, Train Loss: 0.3393, Val Loss: 0.3701
Epoch 3/100, Train Loss: 0.3243, Val Loss: 0.3271
Epoch 4/100, Train Loss: 0.3134, Val Loss: 0.3133
Epoch 5/100, Train Loss: 0.3057, Val Loss: 0.3141
Epoch 6/100, Train Loss: 0.2950, Val Loss: 0.3059
Epoch 7/100, Train Loss: 0.2884, Val Loss: 0.3039
Epoch 8/100, Train Loss: 0.2839, Val Loss: 0.3130
Epoch 9/100, Train Loss: 0.2773, Val Loss: 0.3040
Epoch 10/100, Train Loss: 0.2715, Val Loss: 0.3026
Epoch 11/100, Train Loss: 0.2660, Val Loss: 0.3126
Epoch 12/100, Train Loss: 0.2590, Val Loss: 0.2969
Epoch 13/100, Train Loss: 0.2548, Val Loss: 0.3139
Epoch 14/100, Train Loss: 0.2492, Val Loss: 0.3073
Epoch 15/100, Train Loss: 0.2453, Val Loss: 0.3027
Epoch 16/100, Train Loss: 0.2399, Val Loss: 0.3042
