In [16]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from sklearn.metrics import classification_report

In [3]:
data = pd.read_csv("cyberbullying_tweets.csv")
texts = data["tweet_text"].tolist()
labels = data["cyberbullying_type"].tolist()

In [4]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
vocab_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(texts)
max_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_length)

In [5]:
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
num_classes = len(label_encoder.classes_)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

In [7]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])



In [8]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [9]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
[1m1193/1193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 111ms/step - accuracy: 0.5992 - loss: 0.9513 - val_accuracy: 0.8091 - val_loss: 0.4702
Epoch 2/10
[1m1193/1193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 108ms/step - accuracy: 0.8434 - loss: 0.3929 - val_accuracy: 0.8213 - val_loss: 0.4558
Epoch 3/10
[1m1193/1193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 109ms/step - accuracy: 0.9030 - loss: 0.2806 - val_accuracy: 0.8207 - val_loss: 0.5138
Epoch 4/10
[1m1193/1193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 108ms/step - accuracy: 0.9322 - loss: 0.1970 - val_accuracy: 0.8225 - val_loss: 0.5714
Epoch 5/10
[1m1193/1193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 109ms/step - accuracy: 0.9447 - loss: 0.1561 - val_accuracy: 0.8105 - val_loss: 0.7191
Epoch 6/10
[1m1193/1193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 108ms/step - accuracy: 0.9516 - loss: 0.1229 - val_accuracy: 0.8116 - val_loss:

In [10]:
label = data["cyberbullying_type"].unique().tolist()
label

['not_cyberbullying',
 'gender',
 'religion',
 'other_cyberbullying',
 'age',
 'ethnicity']

In [13]:
X_test

array([[   0,    0,    0, ...,  329,   62,   44],
       [   0,    0,    0, ..., 4183,   79,  784],
       [   0,    0,    0, ...,  283,   23,  160],
       ...,
       [   0,    0,    0, ..., 1588,   13,  577],
       [   0,    0,    0, ...,    4,  230,  235],
       [   0,    0,    0, ...,   85,  114,  422]], dtype=int32)

In [17]:
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# Classification Report
report = classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_)
print("Classification Report:\n", report)

[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 40ms/step
Classification Report:
                      precision    recall  f1-score   support

                age       0.96      0.97      0.96      1603
          ethnicity       0.97      0.97      0.97      1603
             gender       0.86      0.84      0.85      1531
  not_cyberbullying       0.55      0.51      0.53      1624
other_cyberbullying       0.58      0.65      0.61      1612
           religion       0.95      0.92      0.93      1566

           accuracy                           0.81      9539
          macro avg       0.81      0.81      0.81      9539
       weighted avg       0.81      0.81      0.81      9539



In [18]:
import torch
torch.save(model, 'model_lstm.pth')

In [19]:
torch.save(model, '/content/drive/My Drive/model_lstm.pth')

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
mp = torch.load('/content/drive/My Drive/model_cnn.pth', map_location=torch.device('cpu'))

  mp = torch.load('/content/drive/My Drive/model_cnn.pth', map_location=torch.device('cpu'))


In [32]:
def predict_text(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length)
    prediction = model.predict(padded_sequence)
    predicted_class = np.argmax(prediction)
    return label_encoder.classes_[predicted_class]

# Example usage
sample_text = "you are a phsyco"
predicted_label = predict_text(sample_text)
print(f"Predicted Label: {predicted_label}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
Predicted Label: other_cyberbullying
