In [1]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GlobalMaxPool1D, Dense, Dropout


In [2]:
data = pd.read_csv('labeled_data.csv')

def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.strip()
    return text

data['cleaned_tweet'] = data['comment_text'].apply(clean_text)

X = data['cleaned_tweet'].values
y = data['class'].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

X_train_padded = pad_sequences(X_train_seq, maxlen=100, padding='post')
X_val_padded = pad_sequences(X_val_seq, maxlen=100, padding='post')


In [4]:
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    LSTM(64, return_sequences=True),
    GlobalMaxPool1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])



In [5]:
history = model.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_data=(X_val_padded, y_val))

Epoch 1/10
[1m4609/4609[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 41ms/step - accuracy: 0.9291 - loss: 0.1955 - val_accuracy: 0.9535 - val_loss: 0.1325
Epoch 2/10
[1m4609/4609[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 42ms/step - accuracy: 0.9581 - loss: 0.1159 - val_accuracy: 0.9551 - val_loss: 0.1246
Epoch 3/10
[1m4609/4609[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 44ms/step - accuracy: 0.9615 - loss: 0.1021 - val_accuracy: 0.9547 - val_loss: 0.1271
Epoch 4/10
[1m4609/4609[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m300s[0m 65ms/step - accuracy: 0.9683 - loss: 0.0824 - val_accuracy: 0.9519 - val_loss: 0.1387
Epoch 5/10
[1m4609/4609[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 44ms/step - accuracy: 0.9735 - loss: 0.0682 - val_accuracy: 0.9451 - val_loss: 0.1646
Epoch 6/10
[1m4609/4609[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 36ms/step - accuracy: 0.9797 - loss: 0.0532 - val_accuracy: 0.9492 - val_loss: 0.195

In [6]:
y_val_pred = model.predict(X_val_padded)
y_val_pred_classes = y_val_pred.argmax(axis=1)

accuracy = accuracy_score(y_val, y_val_pred_classes)
print(f"Validation Accuracy: {accuracy:.2f}")
print(classification_report(y_val, y_val_pred_classes, target_names=['hate_speech', 'neither']))


[1m1153/1153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 16ms/step
Validation Accuracy: 0.94
              precision    recall  f1-score   support

 hate_speech       0.88      0.83      0.85      7311
     neither       0.96      0.97      0.96     29560

    accuracy                           0.94     36871
   macro avg       0.92      0.90      0.91     36871
weighted avg       0.94      0.94      0.94     36871



In [7]:
model.save('hate_speech_model.h5')



In [5]:
loaded_model = tf.keras.models.load_model('hate_speech_model.h5')



In [6]:
import pickle

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
print("Tokenizer saved successfully.")


Tokenizer saved successfully.


In [8]:
examples = ["you are fat.", "This is a nice comment."]
examples_cleaned = [clean_text(text) for text in examples]
examples_seq = tokenizer.texts_to_sequences(examples_cleaned)
examples_padded = pad_sequences(examples_seq, maxlen=100, padding='post')

predictions = loaded_model.predict(examples_padded)
predicted_classes = predictions.argmax(axis=1)

for text, label in zip(examples, predicted_classes):
    print(f"Comment: {text}")
    print(f"Predicted Label: {'hate_speech' if label == 0 else 'neither'}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Comment: you are fat.
Predicted Label: hate_speech
Comment: This is a nice comment.
Predicted Label: neither
