In [2]:
import pandas as pd

# Read the CSV file into a DataFrame
column_names = ["text", "is_hate"]

df = pd.read_csv('1.preprocessed_data.csv', on_bad_lines='skip', sep=",", encoding='iso-8859-1', header=0, names=column_names)
df['is_hate'] = df['is_hate'].astype(bool)
df['text'] = df['text'].astype('str')
df.head()

Unnamed: 0,text,is_hate
0,ponnayo danne kellek aduwa gaman laga inna kol...,True
1,ape harak samjeta eka honda adrshyak,False
2,tpita pisuda yako man htuwe atta kiyala aiyo,False
3,kimbak eduwoth ape untath amma thaththawath pe...,True
4,lisan nathawa yanna puluwan yako api dannawa o...,False


## 1. Data Preprocessing:

In [20]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Tokenize the text data
max_words = 1000  # You can change this based on your dataset
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])

# Pad sequences for consistent input size
max_length = 30  # You can adjust this value based on your dataset
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

# Splitting data into training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['is_hate'], test_size=0.2)

## 2. Building the LSTM Model:

In [21]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

embedding_dim = 64  # You can adjust this value
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_length),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 30, 64)            64000     
                                                                 
 lstm_8 (LSTM)               (None, 30, 64)            33024     
                                                                 
 lstm_9 (LSTM)               (None, 32)                12416     
                                                                 
 dense_8 (Dense)             (None, 24)                792       
                                                                 
 dense_9 (Dense)             (None, 1)                 25        
                                                                 
Total params: 110257 (430.69 KB)
Trainable params: 110257 (430.69 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## 3. Training the Model:

In [22]:
num_epochs = 10  # You can adjust this value
history = model.fit(X_train, y_train, epochs=num_epochs, validation_data=(X_test, y_test), verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## 4. Model Evaluation:

In [23]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy * 100:.2f}%")


 1/14 [=>............................] - ETA: 0s - loss: 0.5276 - accuracy: 0.8125

Test accuracy: 76.42%


In [24]:
from sklearn.metrics import classification_report

y_pred = (model.predict(X_test) > 0.5).astype("int32")
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.76      0.85      0.81       254
        True       0.76      0.64      0.70       187

    accuracy                           0.76       441
   macro avg       0.76      0.75      0.75       441
weighted avg       0.76      0.76      0.76       441



## 5. Making Predictions:

In [26]:
def predict_hate(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
    prediction = model.predict(padded_sequence)
    return "Hate" if prediction[0][0] > 0.5 else "Not Hate"

text_input = "kalakanni deshapaluwo"
print(predict_hate(text_input))


Hate


In [27]:
from sklearn.metrics import confusion_matrix, roc_curve, auc
import pickle
import numpy as np
# values for confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]  # normalize the confusion matrix

# values for ROC curve
# Convert model output to probabilities and plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, model.predict(X_test))
roc_auc = auc(fpr, tpr)

# save the values to a file
with open('2.7 RNN with LSTM.pkl', 'wb') as f:
    pickle.dump({
        'fpr': fpr, 'tpr': tpr, 'roc_auc': roc_auc, 'cm_percentage': cm_percentage, 'report': report
    }, f)

