In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import joblib

# Load the dataset
print("Loading dataset...")
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# Combine all toxic labels into one
df_train['label'] = df_train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].max(axis=1)
df_test['label'] = df_test[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].max(axis=1)

Y_train = df_train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values
Y_test = df_test[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

# Tokenize the text data and convert it to sequences
max_words = 20000  # Vocabulary size
max_len = 100      # Maximum length of each comment

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df_train['comment_text'])
X_train = tokenizer.texts_to_sequences(df_train['comment_text'])
X_test = tokenizer.texts_to_sequences(df_test['comment_text'])

# Pad sequences to ensure uniform input size
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
with open('models/tokenizer.pkl', 'wb') as f:
    joblib.dump(tokenizer, f)

Loading dataset...


In [4]:
# Build the RNN model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(32))
model.add(Dropout(0.5))
model.add(Dense(6, activation='sigmoid'))  # Six output nodes for multi-label classification

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
print("Training model...")
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train, Y_train, epochs=5, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Save the model and tokenizer
model.save('models/toxic_rnn_classifier.h5')
print("Model training complete.")

# Evaluate the model
print("Evaluating model on the test dataset...")
Y_pred = (model.predict(X_test) > 0.5).astype(int)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred, average='micro')
precision = precision_score(Y_test, Y_pred, average='micro')
recall = recall_score(Y_test, Y_pred, average='micro')
print(f"Accuracy(test): {accuracy}")
print(f"F1 Score(test): {f1}")
print(f"Precision(test): {precision}")
print(f"Recall(test): {recall}")

Training model...
Epoch 1/5




[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 53ms/step - accuracy: 0.7268 - loss: 0.1277 - val_accuracy: 0.9941 - val_loss: 0.0519
Epoch 2/5
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 55ms/step - accuracy: 0.9216 - loss: 0.0512 - val_accuracy: 0.9941 - val_loss: 0.0497
Epoch 3/5
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m222s[0m 56ms/step - accuracy: 0.8902 - loss: 0.0447 - val_accuracy: 0.9941 - val_loss: 0.0536
Epoch 4/5
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 45ms/step - accuracy: 0.8540 - loss: 0.0406 - val_accuracy: 0.9941 - val_loss: 0.0568
Epoch 5/5
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 42ms/step - accuracy: 0.8343 - loss: 0.0370 - val_accuracy: 0.9941 - val_loss: 0.0580




Model training complete.
Evaluating model on the test dataset...
[1m4787/4787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 13ms/step
Accuracy(test): 0.8058029301924734
F1 Score(test): 0.2492022645393721
Precision(test): 0.15317452785422797
Recall(test): 0.6679542005793903


In [5]:
# Making predictions using the model
from tensorflow.keras.models import load_model
import joblib

model = load_model('models/toxic_rnn_classifier.h5')
with open('models/tokenizer.pkl', 'rb') as f:
    tokenizer = joblib.load(f)

def predict(comment):
    comment_seq = tokenizer.texts_to_sequences([comment])
    comment_pad = pad_sequences(comment_seq, maxlen=max_len)
    prediction = (model.predict(comment_pad) > 0.5).astype(int)
    return prediction[0]

comments = [
    "I really love this!",
    "This is the worst thing ever.",
    "You are an idiot and should be banned!",
    "The government needs to address this issue.",
    "You fucking idiot, I am gonna make you suffer.",
    "that's fucking awesome."
]

print("Classifying example comments...")
for comment in comments:
    labels = predict(comment)
    print(f"Comment: {comment}\nLabels: {labels}\n")




Classifying example comments...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step
Comment: I really love this!
Labels: [0 0 0 0 0 0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Comment: This is the worst thing ever.
Labels: [1 0 0 0 0 0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Comment: You are an idiot and should be banned!
Labels: [1 0 1 0 1 0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Comment: The government needs to address this issue.
Labels: [0 0 0 0 0 0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Comment: You fucking idiot, I am gonna make you suffer.
Labels: [1 0 1 0 1 0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Comment: that's fucking awesome.
Labels: [1 0 1 0 1 0]

