In [33]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer
import torch 

In [34]:
df = pd.read_csv('cyberbullying.csv')

df.info()
df
df['cyberbullying_type']
#Sentiment mapping 
sentiment_mapping = {'not_cyberbullying':0, 
                     'ethnicity':1,
                     'religion':2,
                     'gender':3,
                     'other_cyberbullying':4,
                     'age':5}

<class 'pandas.DataFrame'>
RangeIndex: 47692 entries, 0 to 47691
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   tweet_text          47692 non-null  str  
 1   cyberbullying_type  47692 non-null  str  
dtypes: str(2)
memory usage: 745.3 KB


#### Our tweet_text is input text and the cyberbullying type is label 

### Checking if there is any missing value!

In [35]:
df.isnull()
# it returns False = no missing value 

df.dtype

AttributeError: 'DataFrame' object has no attribute 'dtype'

In [None]:
df.drop_duplicates(inplace=True)

### Normalize the text 

In [None]:
import re

def clean_text(text):
    text = text.lower()                         # lowercase
    text = re.sub(r"http\S+", "", text)         # remove URLs
    text = re.sub(r"@\w+", "", text)            # remove mentions
    text = re.sub(r"#\w+", "", text)            # remove hashtags
    text = re.sub(r"[^a-z\s]", "", text)        # remove punctuation/numbers
    text = re.sub(r"\s+", " ", text).strip()    # remove extra spaces
    return text

df['clean_text'] = df['tweet_text'].apply(clean_text)
df.head()

### Checking whether there is any empty value after the cleaning

In [None]:
df = df[df['clean_text'].str.len()>0]
df

Check the class balance 

In [None]:
df['cyberbullying_type'].value_counts()
# The result turns out to be good, the data is balanced 

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,2),
                            max_features=10000,
                            stop_words='english'
                            )
X = vectorizer.fit_transform(df['clean_text'])
y = df['cyberbullying_type']


### Train Test Split

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.5, random_state=42)

### Word Embedding 

In [None]:

from transformers import DistilBertTokenizer, AutoTokenizer

import torch 

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(X_train,
                     truncation=True,
                     padding=True,
                     max_length=128)
val_encodings = tokenizer(X_val,
                     truncation=True,
                     padding=True,
                     max_length=128)



In [None]:

class Cyberbullying(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CyberDataset(train_encodings, X_train)
val_dataset = CyberDataset(val_encodings, X_val)



In [None]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels
)


### Check if we have GPU or not 

In [None]:

print(torch.cuda.is_available())


### Performance Metric 

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted'
    )

    acc = accuracy_score(labels, predictions)

    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }
