# Advanced Cyberbullying Detection

In [8]:
# Imports
import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf

# Warnings
import warnings

# Hide FutureWarning messages emitted by the imported libraries
warnings.filterwarnings("ignore", category=FutureWarning)

# Seed: give Python, NumPy and TensorFlow the same fixed seed
SEED = 42

for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

In [9]:
# GPU Availability: list the GPU devices TensorFlow can see (empty list = none)
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU available:", gpu_devices)

GPU available: []


## Loading Dataset

**Load Dataset**:

In [14]:
# Directory holding the raw CSV datasets (relative to the notebook)
DATA_DIR = Path("../dataset")

# Collect every CSV in the directory, sorted for a stable order
csv_paths = sorted(DATA_DIR.glob("*.csv"))
n_csvs = len(csv_paths)

print("Found CSVs:", n_csvs)
for csv_path in csv_paths:
    print(csv_path.name)

Found CSVs: 9
aggression_parsed_dataset.csv
attack_parsed_dataset.csv
cyberbullying_tweets.csv
kaggle_parsed_dataset.csv
toxicity_parsed_dataset.csv
twitter_parsed_dataset.csv
twitter_racism_parsed_dataset.csv
twitter_sexism_parsed_dataset.csv
youtube_parsed_dataset.csv


In [16]:
# Read every CSV into a DataFrame, keyed by the path it was loaded from.
# The entries of `csv_paths` were produced by `DATA_DIR.glob(...)`, so they
# already start with DATA_DIR. Joining them onto DATA_DIR a second time would
# build "../dataset/../dataset/<file>", which only resolves by accident.
dfs = {}

for f in csv_paths:
    dfs[f] = pd.read_csv(f)
    print(f"{f}: shape={dfs[f].shape}")

..\dataset\aggression_parsed_dataset.csv: shape=(115864, 5)
..\dataset\attack_parsed_dataset.csv: shape=(115864, 5)
..\dataset\cyberbullying_tweets.csv: shape=(47692, 2)
..\dataset\kaggle_parsed_dataset.csv: shape=(8799, 4)
..\dataset\toxicity_parsed_dataset.csv: shape=(159686, 5)
..\dataset\twitter_parsed_dataset.csv: shape=(16851, 5)
..\dataset\twitter_racism_parsed_dataset.csv: shape=(13471, 5)
..\dataset\twitter_sexism_parsed_dataset.csv: shape=(14881, 5)
..\dataset\youtube_parsed_dataset.csv: shape=(3464, 10)


In [20]:
# Show the column names of each raw dataset
for fname in dfs:
    print(f"{fname}, Columns: {dfs[fname].columns}")

..\dataset\aggression_parsed_dataset.csv, Columns: Index(['index', 'Text', 'ed_label_0', 'ed_label_1', 'oh_label'], dtype='str')
..\dataset\attack_parsed_dataset.csv, Columns: Index(['index', 'Text', 'ed_label_0', 'ed_label_1', 'oh_label'], dtype='str')
..\dataset\cyberbullying_tweets.csv, Columns: Index(['tweet_text', 'cyberbullying_type'], dtype='str')
..\dataset\kaggle_parsed_dataset.csv, Columns: Index(['index', 'oh_label', 'Date', 'Text'], dtype='str')
..\dataset\toxicity_parsed_dataset.csv, Columns: Index(['index', 'Text', 'ed_label_0', 'ed_label_1', 'oh_label'], dtype='str')
..\dataset\twitter_parsed_dataset.csv, Columns: Index(['index', 'id', 'Text', 'Annotation', 'oh_label'], dtype='str')
..\dataset\twitter_racism_parsed_dataset.csv, Columns: Index(['index', 'id', 'Text', 'Annotation', 'oh_label'], dtype='str')
..\dataset\twitter_sexism_parsed_dataset.csv, Columns: Index(['index', 'id', 'Text', 'Annotation', 'oh_label'], dtype='str')
..\dataset\youtube_parsed_dataset.csv, Colu

**Clean Dataset**:

In [21]:
# Reduce every dataset to the same two columns: "text" and "is_cyberbullying"
cleaned_dfs = {}

for fname, raw in dfs.items():
    available = raw.columns

    # Pick whichever text / label column this dataset uses
    text_col = "tweet_text" if "tweet_text" in available else "Text"
    label_col = "cyberbullying_type" if "cyberbullying_type" in available else "oh_label"

    # Keep only those two columns, under their common names
    new_names = {text_col: "text", label_col: "is_cyberbullying"}
    cleaned_dfs[fname] = raw[[text_col, label_col]].rename(columns=new_names)

In [22]:
# Show the column names of each cleaned dataset
for fname in cleaned_dfs:
    print(f"{fname}, Columns: {cleaned_dfs[fname].columns}")

..\dataset\aggression_parsed_dataset.csv, Columns: Index(['text', 'is_cyberbullying'], dtype='str')
..\dataset\attack_parsed_dataset.csv, Columns: Index(['text', 'is_cyberbullying'], dtype='str')
..\dataset\cyberbullying_tweets.csv, Columns: Index(['text', 'is_cyberbullying'], dtype='str')
..\dataset\kaggle_parsed_dataset.csv, Columns: Index(['text', 'is_cyberbullying'], dtype='str')
..\dataset\toxicity_parsed_dataset.csv, Columns: Index(['text', 'is_cyberbullying'], dtype='str')
..\dataset\twitter_parsed_dataset.csv, Columns: Index(['text', 'is_cyberbullying'], dtype='str')
..\dataset\twitter_racism_parsed_dataset.csv, Columns: Index(['text', 'is_cyberbullying'], dtype='str')
..\dataset\twitter_sexism_parsed_dataset.csv, Columns: Index(['text', 'is_cyberbullying'], dtype='str')
..\dataset\youtube_parsed_dataset.csv, Columns: Index(['text', 'is_cyberbullying'], dtype='str')


**Combine DataFrames**:

In [27]:
# Stack all cleaned datasets into one frame with a fresh integer index
frames = list(cleaned_dfs.values())
df = pd.concat(frames, ignore_index=True)

print("DF Shape:", df.shape)

DF Shape: (496572, 2)


In [28]:
# Preview the first five rows of the combined frame
df.head(n=5)

Unnamed: 0,text,is_cyberbullying
0,`- This is not ``creative``. Those are the di...,0
1,` :: the term ``standard model`` is itself le...,0
2,"True or false, the situation as of March 200...",0
3,"Next, maybe you could work on being less cond...",0
4,This page will need disambiguation.,0


## Data Preparation

In [30]:
# Distinct raw label values across the combined datasets
df["is_cyberbullying"].unique()

array([0, 1, 'not_cyberbullying', 'gender', 'religion',
       'other_cyberbullying', 'age', 'ethnicity', nan], dtype=object)

In [34]:
# Raw label values, grouped by the binary class they map to
cyberbullying = [1, 'gender', 'religion', 'other_cyberbullying', 'age', 'ethnicity']
not_cyberbullying = [0, 'not_cyberbullying']


def convert_binary(label):
    """Map a raw dataset label onto a binary cyberbullying flag.

    Returns 1 when `label` is listed in `cyberbullying`, 0 when it is listed
    in `not_cyberbullying`, and None for anything else (e.g. NaN).
    """
    # Positive class is checked first, then the negative class
    for flag, known_labels in ((1, cyberbullying), (0, not_cyberbullying)):
        if label in known_labels:
            return flag

    # Label matched neither list
    return None

# Replace each raw label with its binary flag (labels in neither list become missing)
binary_labels = df['is_cyberbullying'].apply(convert_binary)
df['is_cyberbullying'] = binary_labels

In [35]:
# Distinct label values after the binary conversion
df["is_cyberbullying"].unique()

array([ 0.,  1., nan])

**Drop NaN Values**:

In [36]:
# Drop rows with a missing label, renumber the index, and store labels as integers
df = (
    df
    .dropna(subset=["is_cyberbullying"])
    .reset_index(drop=True)
    .astype({"is_cyberbullying": int})
)

In [37]:
# Distinct label values once missing labels are removed
df.loc[:, "is_cyberbullying"].unique()

array([0, 1])

In [38]:
# Count the missing entries that remain in each column
missing_text = df["text"].isna().sum()
missing_labels = df["is_cyberbullying"].isna().sum()

print("Missing text:", missing_text)
print("Missing labels:", missing_labels)

Missing text: 0
Missing labels: 0


In [41]:
# Share of rows in each class (proportions rather than raw counts)
label_column = df["is_cyberbullying"]
label_column.value_counts(normalize=True)

is_cyberbullying
0    0.803857
1    0.196143
Name: proportion, dtype: float64

## Exploratory Data Analysis