In [21]:
import pandas as pd
import numpy as np
import keras
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.metrics import classification_report

In [23]:
data = pd.read_csv("cyberbullying_tweets.csv")
print(data.head())

                                          tweet_text cyberbullying_type
0  In other words #katandandre, your food was cra...  not_cyberbullying
1  Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying
2  @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying
3  @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying
4  @RudhoeEnglish This is an ISIS account pretend...  not_cyberbullying


In [24]:
texts = data["tweet_text"].tolist()
labels = data["cyberbullying_type"].tolist()

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

In [25]:
sequences = tokenizer.texts_to_sequences(texts)
max_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_length)

In [26]:
padded_sequences

array([[    0,     0,     0, ...,    19, 25052,    79],
       [    0,     0,     0, ..., 13645, 25055,   340],
       [    0,     0,     0, ...,   919, 11442,  7969],
       ...,
       [    0,     0,     0, ...,   377,   287,    23],
       [    0,     0,     0, ...,   110,    28,   137],
       [    0,     0,     0, ...,    44,    93,   346]], dtype=int32)

In [27]:
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

In [28]:
labels

array([3, 3, 3, ..., 1, 1, 1])

array([3, 3, 3, ..., 1, 1, 1])

In [30]:
one_hot_labels = keras.utils.to_categorical(labels)

In [31]:
xtrain, xtest, ytrain, ytest = train_test_split(padded_sequences,
                                                one_hot_labels,
                                                test_size=0.2)

In [32]:
model = Sequential()

model.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                    output_dim=128, input_length=max_length))

model.add(Conv1D(filters=128, kernel_size=5, activation="relu", padding="same"))
model.add(MaxPooling1D(pool_size=2))

model.add(Conv1D(filters=128, kernel_size=5, activation="relu", padding="same"))
model.add(GlobalMaxPooling1D())

model.add(Dense(units=128, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(units=len(one_hot_labels[0]), activation="softmax"))

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

history=model.fit(xtrain, ytrain, epochs=10, batch_size=32, validation_data=(xtest, ytest))




Epoch 1/10
[1m1193/1193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 16ms/step - accuracy: 0.6483 - loss: 0.8198 - val_accuracy: 0.8356 - val_loss: 0.4038
Epoch 2/10
[1m1193/1193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 14ms/step - accuracy: 0.8880 - loss: 0.2980 - val_accuracy: 0.8391 - val_loss: 0.4215
Epoch 3/10
[1m1193/1193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 14ms/step - accuracy: 0.9341 - loss: 0.1781 - val_accuracy: 0.8209 - val_loss: 0.5505
Epoch 4/10
[1m1193/1193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 13ms/step - accuracy: 0.9492 - loss: 0.1236 - val_accuracy: 0.8247 - val_loss: 0.6921
Epoch 5/10
[1m1193/1193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 14ms/step - accuracy: 0.9582 - loss: 0.0896 - val_accuracy: 0.8202 - val_loss: 0.7529
Epoch 6/10
[1m1193/1193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 13ms/step - accuracy: 0.9618 - loss: 0.0794 - val_accuracy: 0.8056 - val_loss: 1.1661
Epoc

In [33]:
label = data["cyberbullying_type"].unique().tolist()

In [34]:
label

['not_cyberbullying',
 'gender',
 'religion',
 'other_cyberbullying',
 'age',
 'ethnicity']

In [35]:
y_pred = model.predict(xtest)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(ytest, axis=1)

report = classification_report(y_true, y_pred_classes, target_names=label)
print("Classification Report:\n", report)

[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Classification Report:
                      precision    recall  f1-score   support

  not_cyberbullying       0.95      0.97      0.96      1592
             gender       0.98      0.96      0.97      1591
           religion       0.90      0.84      0.87      1631
other_cyberbullying       0.48      0.59      0.53      1591
                age       0.52      0.43      0.47      1564
          ethnicity       0.94      0.94      0.94      1570

           accuracy                           0.79      9539
          macro avg       0.79      0.79      0.79      9539
       weighted avg       0.80      0.79      0.79      9539



In [36]:
import torch
torch.save(model, 'model_cnn.pth')

In [37]:
torch.save(model, '/content/drive/My Drive/model_cnn.pth')

In [38]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
mp = torch.load('/content/drive/My Drive/model_cnn.pth', map_location=torch.device('cpu'))

  mp = torch.load('/content/drive/My Drive/model_cnn.pth', map_location=torch.device('cpu'))


In [39]:
data = pd.read_csv("cyberbullying_tweets.csv")
print(data.head())

                                          tweet_text cyberbullying_type
0  In other words #katandandre, your food was cra...  not_cyberbullying
1  Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying
2  @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying
3  @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying
4  @RudhoeEnglish This is an ISIS account pretend...  not_cyberbullying


In [40]:
texts = data["tweet_text"].tolist()
labels = data["cyberbullying_type"].tolist()

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

In [41]:
sequences = tokenizer.texts_to_sequences(texts)
max_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_length)

In [42]:
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

In [60]:
input_text="bastard"

# Preprocess the input text
input_sequence = tokenizer.texts_to_sequences([input_text])
padded_input_sequence = pad_sequences(input_sequence, maxlen=max_length)
prediction = mp.predict(padded_input_sequence)
predicted_label = label_encoder.inverse_transform([np.argmax(prediction[0])])
print(predicted_label[0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
gender
