In [4]:
# -----------------------------
# 1. Import Libraries
# -----------------------------
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score



In [5]:
# Load the raw tweet dataset from the working directory.
df = pd.read_csv("dataset_cyberbulling.csv")   # <-- update name as needed

# Keep only the text and its category. The explicit .copy() makes this an
# independent frame, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning.
df = df[['tweet_text', 'cyberbullying_type']].copy()

df.head()


Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [6]:
# Learn the category -> integer-id mapping first, then apply it to the frame.
label_encoder = LabelEncoder().fit(df['cyberbullying_type'])
df['label'] = label_encoder.transform(df['cyberbullying_type'])

# Number of distinct categories found in the data.
num_labels = label_encoder.classes_.size

print(label_encoder.classes_)


['age' 'ethnicity' 'gender' 'not_cyberbullying' 'other_cyberbullying'
 'religion']


In [10]:
# Pretrained tokenizer and encoder. The encoder is used purely as a feature
# extractor, so it is switched to eval mode right away (no training happens).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased").eval()

# Show the architecture as the cell output.
bert_model


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [12]:
def get_bert_embedding(text, max_length=64):
    """Return the BERT [CLS] embedding of ``text`` as a flat NumPy vector.

    Relies on the module-level ``tokenizer`` and ``bert_model``.

    Parameters
    ----------
    text : str
        A single tweet / sentence to encode.
    max_length : int, default 64
        Number of tokens the input is padded or truncated to.

    Returns
    -------
    numpy.ndarray
        1-D array holding the last-layer hidden state of the first ([CLS])
        token (768 values for ``bert-base-uncased``).
    """
    tokens = tokenizer(
        text,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    # Run the inputs on whatever device the encoder lives on, so the function
    # keeps working if the model is moved to a GPU.
    tokens = tokens.to(bert_model.device)

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        outputs = bert_model(**tokens)

    # Position 0 of the sequence is the [CLS] token.
    cls_embedding = outputs.last_hidden_state[:, 0, :]

    # .cpu() is a no-op on CPU tensors and required before .numpy() otherwise.
    return cls_embedding.cpu().numpy().flatten()


In [14]:
# Encode every tweet one at a time (slow: one forward pass per tweet).
embeddings = [
    get_bert_embedding(tweet)
    for tweet in tqdm(df['tweet_text'], desc="Encoding Tweets")
]

# Feature matrix (one row per tweet) and the matching integer targets.
X = np.asarray(embeddings)
y = df['label'].to_numpy()

print("Embedding shape:", X.shape)


Encoding Tweets: 100%|██████████| 47692/47692 [2:00:07<00:00,  6.62it/s]  


Embedding shape: (47692, 768)


In [16]:
# 80/20 hold-out split, stratified on the label so every class keeps the same
# share in both partitions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


In [18]:
# Linear SVM on the BERT features.
# probability=True enables predict_proba (used by the prediction demo). Those
# probability estimates come from an internal, randomly shuffled
# cross-validation, so random_state is pinned to keep them reproducible,
# matching the seed used for the split and the random forest.
svm_clf = SVC(kernel='linear', probability=True, random_state=42)
svm_clf.fit(X_train, y_train)

# Hard predictions on the held-out split.
svm_pred = svm_clf.predict(X_test)

print("\n==== SVM Performance ====")
print("Accuracy:", accuracy_score(y_test, svm_pred))
print(classification_report(y_test, svm_pred, target_names=label_encoder.classes_))



==== SVM Performance ====
Accuracy: 0.7887619247300556
                     precision    recall  f1-score   support

                age       0.90      0.93      0.91      1598
          ethnicity       0.89      0.90      0.90      1592
             gender       0.79      0.79      0.79      1595
  not_cyberbullying       0.60      0.53      0.56      1589
other_cyberbullying       0.61      0.67      0.64      1565
           religion       0.92      0.90      0.91      1600

           accuracy                           0.79      9539
          macro avg       0.79      0.79      0.79      9539
       weighted avg       0.79      0.79      0.79      9539



In [20]:
# Random forest baseline on the same BERT features: 300 trees, no depth
# limit, fixed seed. Built and fitted in one expression (fit returns the
# estimator itself).
rf_clf = RandomForestClassifier(
    random_state=42,
    max_depth=None,
    n_estimators=300,
).fit(X_train, y_train)

# Score the held-out split.
rf_pred = rf_clf.predict(X_test)

print("\n==== Random Forest Performance ====")
print("Accuracy:", accuracy_score(y_test, rf_pred))
print(
    classification_report(
        y_test,
        rf_pred,
        target_names=label_encoder.classes_,
    )
)



==== Random Forest Performance ====
Accuracy: 0.6833001362826292
                     precision    recall  f1-score   support

                age       0.73      0.85      0.79      1598
          ethnicity       0.82      0.78      0.80      1592
             gender       0.74      0.63      0.68      1595
  not_cyberbullying       0.49      0.44      0.46      1589
other_cyberbullying       0.49      0.52      0.51      1565
           religion       0.82      0.88      0.85      1600

           accuracy                           0.68      9539
          macro avg       0.68      0.68      0.68      9539
       weighted avg       0.68      0.68      0.68      9539



In [22]:
import pickle
import joblib
import torch

# Create a folder to store models (optional)
import os
# exist_ok=True makes this safe to re-run and avoids the check-then-create
# race of testing os.path.exists() first. It still raises if the path exists
# but is not a directory, which surfaces the problem before any model is saved.
os.makedirs("saved_models", exist_ok=True)


In [24]:
# -------------------------
# 1-3. Save the scikit-learn objects (SVM, Random Forest, Label Encoder)
# -------------------------
for _artifact, _target in (
    (svm_clf, "saved_models/svm_model.pkl"),
    (rf_clf, "saved_models/rf_model.pkl"),
    (label_encoder, "saved_models/label_encoder.pkl"),
):
    joblib.dump(_artifact, _target)

# -------------------------
# 4. Save BERT tokenizer
# -------------------------
tokenizer.save_pretrained("saved_models/bert_tokenizer/")

# -------------------------
# 5. Save BERT model
# -------------------------
bert_model.save_pretrained("saved_models/bert_model/")

In [26]:
# Load Saved Models
# -------------------------

# BERT pieces are restored from their saved directories.
tokenizer = BertTokenizer.from_pretrained("saved_models/bert_tokenizer/")
bert_model = BertModel.from_pretrained("saved_models/bert_model/")

# The scikit-learn objects come back through joblib. joblib.load unpickles the
# file, so only point it at files you created yourself.
svm_clf, rf_clf, label_encoder = (
    joblib.load(artifact_path)
    for artifact_path in (
        "saved_models/svm_model.pkl",
        "saved_models/rf_model.pkl",
        "saved_models/label_encoder.pkl",
    )
)

print("All models successfully loaded!")

All models successfully loaded!


In [28]:
# -----------------------------
# Prediction Function for Single Tweet
# -----------------------------
def predict_single_tweet(text, model, label_encoder):
    """Score one tweet against every class and rank the classes.

    The tweet is embedded with ``get_bert_embedding`` and passed to ``model``.

    Parameters
    ----------
    text : str
        Tweet to classify.
    model : fitted classifier
        Must provide ``predict``; ``predict_proba`` is used when available.
        Assumed to have been trained on the integer codes produced by
        ``label_encoder`` (as done in this notebook).
    label_encoder : fitted LabelEncoder-like object
        Its ``classes_`` attribute supplies the class names; position ``i``
        is the name for integer code ``i``.

    Returns
    -------
    list[dict]
        One ``{"label": name, "probability": p}`` entry per class in
        ``label_encoder.classes_``, sorted by descending probability.
        If the model has no ``predict_proba``, the predicted class gets 1.0
        and every other class 0.0.
    """
    # Get BERT embedding for the input text as a single-row feature matrix
    emb = get_bert_embedding(text).reshape(1, -1)

    class_names = list(label_encoder.classes_)
    scores = {name: 0.0 for name in class_names}

    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(emb)[0]
        # predict_proba columns follow model.classes_, which may be a subset
        # of the encoder's classes; map each column through its class id
        # instead of assuming positional alignment.
        model_classes = getattr(model, "classes_", None)
        if model_classes is None:
            model_classes = range(len(probs))
        for class_id, prob in zip(model_classes, probs):
            scores[class_names[int(class_id)]] = float(prob)
    else:
        # No probabilities available: keep the hard prediction rather than
        # returning an all-zero table.
        pred = model.predict(emb)[0]
        scores[class_names[int(pred)]] = 1.0

    results = [{"label": name, "probability": scores[name]} for name in class_names]

    # Highest probability first (stable sort, so ties keep encoder order)
    results.sort(key=lambda item: item["probability"], reverse=True)

    return results


In [30]:
# -----------------------------
# Interactive Prediction Loop
# -----------------------------
print("\n--- Cyberbullying Detection (BERT + ML Models) ---")

# (display name, fitted classifier) pairs shown for every entered tweet.
demo_models = [("SVM", svm_clf), ("Random Forest", rf_clf)]

while True:
    try:
        text = input("\nEnter tweet (or 'quit'): ")
    except (EOFError, KeyboardInterrupt):
        # stdin closed or the prompt was interrupted: leave the loop cleanly
        # instead of ending the cell with a traceback.
        print("Exiting...")
        break

    command = text.strip()

    if command.lower() in ["quit", "q", "exit"]:
        print("Exiting...")
        break

    # Nothing typed: there is nothing meaningful to classify, so ask again.
    if not command:
        continue

    # Same report for every model: classes ranked by predicted probability.
    for model_name, clf in demo_models:
        print(f"\nBERT + {model_name} prediction:")
        for p in predict_single_tweet(text, clf, label_encoder):
            print(f"  {p['label']}: {p['probability']*100:.2f}%")

    print("\n------------------------------------")



--- Cyberbullying Detection (BERT + ML Models) ---



Enter tweet (or 'quit'):  u r ugly



BERT + SVM prediction:
  not_cyberbullying: 63.84%
  other_cyberbullying: 31.30%
  gender: 3.60%
  ethnicity: 0.79%
  age: 0.29%
  religion: 0.18%

BERT + Random Forest prediction:
  other_cyberbullying: 36.23%
  not_cyberbullying: 33.43%
  gender: 11.33%
  age: 8.33%
  ethnicity: 7.33%
  religion: 3.33%

------------------------------------



Enter tweet (or 'quit'):  ur voice is sweet



BERT + SVM prediction:
  not_cyberbullying: 40.07%
  other_cyberbullying: 33.92%
  gender: 16.21%
  ethnicity: 6.88%
  age: 1.99%
  religion: 0.93%

BERT + Random Forest prediction:
  not_cyberbullying: 31.81%
  other_cyberbullying: 29.42%
  ethnicity: 13.67%
  gender: 11.44%
  age: 9.00%
  religion: 4.67%

------------------------------------



Enter tweet (or 'quit'):  u r too bad



BERT + SVM prediction:
  not_cyberbullying: 60.89%
  other_cyberbullying: 29.69%
  ethnicity: 5.03%
  age: 2.18%
  gender: 1.72%
  religion: 0.49%

BERT + Random Forest prediction:
  other_cyberbullying: 30.24%
  not_cyberbullying: 27.14%
  ethnicity: 14.00%
  gender: 12.94%
  age: 12.33%
  religion: 3.33%

------------------------------------



Enter tweet (or 'quit'):  go to hell



BERT + SVM prediction:
  not_cyberbullying: 58.68%
  other_cyberbullying: 40.71%
  age: 0.30%
  gender: 0.23%
  ethnicity: 0.04%
  religion: 0.03%

BERT + Random Forest prediction:
  other_cyberbullying: 30.56%
  not_cyberbullying: 24.44%
  ethnicity: 15.33%
  age: 11.67%
  gender: 10.67%
  religion: 7.33%

------------------------------------



Enter tweet (or 'quit'):  such a good weather



BERT + SVM prediction:
  not_cyberbullying: 47.34%
  other_cyberbullying: 42.10%
  gender: 6.84%
  ethnicity: 1.66%
  age: 1.40%
  religion: 0.66%

BERT + Random Forest prediction:
  other_cyberbullying: 31.45%
  not_cyberbullying: 29.65%
  age: 14.33%
  gender: 10.23%
  ethnicity: 10.00%
  religion: 4.33%

------------------------------------



Enter tweet (or 'quit'):  do it fast



BERT + SVM prediction:
  not_cyberbullying: 52.36%
  other_cyberbullying: 47.06%
  age: 0.47%
  gender: 0.09%
  religion: 0.01%
  ethnicity: 0.01%

BERT + Random Forest prediction:
  not_cyberbullying: 32.78%
  other_cyberbullying: 27.61%
  age: 14.12%
  gender: 10.49%
  ethnicity: 9.33%
  religion: 5.67%

------------------------------------



Enter tweet (or 'quit'):  quit


Exiting...
