### Importing the necessary libraries

In [19]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [20]:
hate_data = pd.read_csv('HateSpeechDatasetBalanced.csv')
hate_data.head()

Unnamed: 0,Content,Label
0,denial of normal the con be asked to comment o...,1
1,just by being able to tweet this insufferable ...,1
2,that is retarded you too cute to be single tha...,1
3,thought of a real badass mongol style declarat...,1
4,afro american basho,1


### Preprocessing data

In [21]:
def clean_content(text):
    text = re.sub(r'http\S+', '', text)  # to remove URLs
    text = re.sub(r'[^A-Za-z0-9\s\']', '', text)  # ro remove special characters
    text = text.lower()  # converting to lowercase
    return text

hate_data['Content'] = hate_data['Content'].apply(clean_content)

In [22]:
hate_data.head()

Unnamed: 0,Content,Label
0,denial of normal the con be asked to comment o...,1
1,just by being able to tweet this insufferable ...,1
2,that is retarded you too cute to be single tha...,1
3,thought of a real badass mongol style declarat...,1
4,afro american basho,1


### Convertin label to numpy array

In [23]:
labels = hate_data['Label'].values
labels


array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

### Spliting data into training and testing sets

In [24]:
X_train, X_test, y_train, y_test = train_test_split(hate_data['Content'], labels, test_size=0.2, random_state=42)


### we do now text Tokenization and Sequence Padding so that we convert text data to numverical data so that nn model can undrestand

In [25]:
max_words = 10000  # Maximum number of words to consider
max_len = 100  # Maximum length of a sequence

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

### now we define the model

In [26]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 128)          1280000   
                                                                 
 lstm_2 (LSTM)               (None, 100, 64)           49408     
                                                                 
 dropout_2 (Dropout)         (None, 100, 64)           0         
                                                                 
 lstm_3 (LSTM)               (None, 64)                33024     
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,362,497
Trainable params: 1,362,497
No

### training and evaluating model

In [27]:
history = model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [33]:
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Test Loss: {loss:.4f}")

Test Accuracy: 86.54%
Test Loss: 0.3181


### saving the model

In [35]:
# Save the trained model in TensorFlow SavedModel format
model.save('hate_speech_model')




INFO:tensorflow:Assets written to: hate_speech_model\assets


INFO:tensorflow:Assets written to: hate_speech_model\assets


### loading the model just in case

In [None]:
# model = tf.keras.models.load_model('hate_speech_model')

### Save the tokenizer using pickle


In [37]:
import pickle

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

### loading the tokinazeer just in case

In [None]:
# with open('tokenizer.pickle', 'rb') as handle:
#     tokenizer = pickle.load(handle)

### testing with function

In [46]:
def predict_hate_speech(text, treshold):
    """
    Function to predict if the given text is hate speech or not.
    """
    # Clean the text
    cleaned_text = clean_content(text)
    
    # Convert text to sequence using the tokenizer
    seq = tokenizer.texts_to_sequences([cleaned_text])
    
    # Pad the sequence to ensure consistent input size
    padded_seq = pad_sequences(seq, maxlen=100)  # Adjust maxlen based on your training
    
    # Get the model's prediction
    prediction = model.predict(padded_seq)
    print(prediction, "xxxx")
    # Return the result
    if prediction[0] > treshold:
        return "Hate Speech Detected"
    else:
        return "No Hate Speech Detected"


In [77]:
# Test with some example phrases
test_phrases = [
    "The sun is shining brightly today, and it's a perfect day for a walk in the park ugly"
]

# Get predictions for each phrase
for phrase in test_phrases:
    result = predict_hate_speech(phrase, treshold=0.5)
    print(f"Text: '{phrase}' => Prediction: {result}")


[[0.7582519]] xxxx
Text: 'The sun is shining brightly today, and it's a perfect day for a walk in the park ugly' => Prediction: Hate Speech Detected


### deploying api for this model using flusk

In [79]:
from flask import Flask, request, jsonify

app = Flask(__name__)
@app.route('/', methods=['GET'])
def home():
    return "Welcome to the Hate Speech Detection API!"

@app.route('/predict', methods=['POST'])
def predict():
    # Get the text from the request
    data = request.get_json()
    text = data['text']

    # Preprocess the input text
    cleaned_text = clean_content(text)  # Assuming you have a clean_text function similar to the one used before
    seq = tokenizer.texts_to_sequences([cleaned_text])
    padded_seq = pad_sequences(seq, maxlen=100)

    # Get the prediction from the model
    prediction = model.predict(padded_seq)
    
    # Return the prediction as a JSON response
    result = 'Hate Speech Detected' if prediction[0] > 0.5 else 'No Hate Speech Detected'
    return jsonify({'prediction': result})

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with watchdog (windowsapi)


SystemExit: 1