In [1]:
import pandas as pd

# Read the CSV file into a DataFrame
column_names = ["text", "is_hate"]

df = pd.read_csv('1.preprocessed_data.csv', on_bad_lines='skip', sep=",", encoding='iso-8859-1', header=0, names=column_names)
df['is_hate'] = df['is_hate'].astype(bool)
df['text'] = df['text'].astype('str')
df.head()

Unnamed: 0,text,is_hate
0,ponnayo danne kellek aduwa gaman laga inna kol...,True
1,ape harak samjeta eka honda adrshyak,False
2,tpita pisuda yako man htuwe atta kiyala aiyo,False
3,kimbak eduwoth ape untath amma thaththawath pe...,True
4,lisan nathawa yanna puluwan yako api dannawa o...,False


In [2]:
# Load and preprocess your data
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load data (sample code, you might already have this df loaded)
# df = pd.read_csv('your_dataset.csv')

# Splitting the data into train and test
train_texts, test_texts, train_labels, test_labels = train_test_split(df['text'], df['is_hate'], test_size=0.2, random_state=42)

# Tokenizing and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

max_length = max([len(seq) for seq in train_sequences])
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')

vocab_size = len(tokenizer.word_index) + 1


In [3]:
# Download and create FastText embedding matrix
import gensim.downloader
import numpy as np

# Download FastText model
fasttext_model = gensim.downloader.load('fasttext-wiki-news-subwords-300')

# Create embedding matrix
embedding_dim = 300  # as we are using FastText with 300 dimensions
embedding_matrix = np.zeros((vocab_size, embedding_dim))
words_not_found = []
for word, i in tokenizer.word_index.items():
    try:
        embedding_vector = fasttext_model[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            words_not_found.append(word)
    except KeyError:
        # word not in FastText model, leaving as zero vector
        words_not_found.append(word)
        pass

print('Missing word count:',len(words_not_found))

Missing word count: 5509


TypeError: object of type 'Tokenizer' has no len()

In [4]:
# Build the CNN model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 82, 300)           2065200   
                                                                 
 conv1d (Conv1D)             (None, 78, 128)           192128    
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 10)                1290      
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 2258629 (8.62 MB)
Trainable params: 193429 (755.58 KB)
Non-trainable params: 2065200 (7.88 MB)
_____________

In [5]:
#Train the CNN model
model.fit(train_padded, train_labels, epochs=5, validation_data=(test_padded, test_labels))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x29c381790>

In [6]:
# Make predictions
predictions = (model.predict(test_padded) > 0.5).astype('int32').flatten()
print(predictions)

[0 1 0 0 0 0 1 0 1 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 1
 0 0 1 0 1 0 0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1
 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 1 0 0 1 1 1 0 0 1 1 0 0 1
 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 1 0 1 0 0 0 1 0 1 1 0 0
 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 1
 1 0 0 0 1 1 1 0 1 0 1 0 1 1 0 1 1 0 1 1 1 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 0
 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0
 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1
 1 1 1 0 0 0 0 1 1 1 1 0 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 0 0 0 1 0 0 0 0]


In [8]:
# Calculate and print the metrics
from sklearn.metrics import classification_report

report = classification_report(test_labels, predictions)
print(report)


              precision    recall  f1-score   support

       False       0.68      0.82      0.75       260
        True       0.64      0.46      0.53       181

    accuracy                           0.67       441
   macro avg       0.66      0.64      0.64       441
weighted avg       0.67      0.67      0.66       441



In [15]:
from sklearn.metrics import confusion_matrix, roc_curve, auc
import pickle
# values for confusion matrix
cm = confusion_matrix(test_labels, predictions)
cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]  # normalize the confusion matrix

# values for ROC curve
# Convert model output to probabilities and plot ROC curve
y_pred_prob = model.predict(test_padded)
print(y_pred_prob)
fpr, tpr, thresholds = roc_curve(test_labels, y_pred_prob)
roc_auc = auc(fpr, tpr)



# save the values to a file
with open('2.3 CNN with fasttext existing model.pkl', 'wb') as f:
    pickle.dump({
        'fpr': fpr, 'tpr': tpr, 'roc_auc': roc_auc, 'cm_percentage': cm_percentage, 'report': report
    }, f)

[[0.34647718]
 [0.52539986]
 [0.39229622]
 [0.10973633]
 [0.19986606]
 [0.3698384 ]
 [0.5548922 ]
 [0.41199905]
 [0.847774  ]
 [0.23248836]
 [0.6201677 ]
 [0.9179381 ]
 [0.08489243]
 [0.3483674 ]
 [0.92974085]
 [0.96216625]
 [0.3716352 ]
 [0.26907384]
 [0.34647718]
 [0.3241305 ]
 [0.14807296]
 [0.06422706]
 [0.3605152 ]
 [0.28082797]
 [0.621121  ]
 [0.34647718]
 [0.3961661 ]
 [0.34647718]
 [0.2208572 ]
 [0.76908267]
 [0.34647718]
 [0.7306307 ]
 [0.26260352]
 [0.3583401 ]
 [0.94860786]
 [0.34647718]
 [0.8750165 ]
 [0.37917215]
 [0.34647718]
 [0.67104685]
 [0.23639774]
 [0.6850735 ]
 [0.38228804]
 [0.07126072]
 [0.5083216 ]
 [0.75651824]
 [0.705012  ]
 [0.06680728]
 [0.25946864]
 [0.38202903]
 [0.3200393 ]
 [0.8097526 ]
 [0.85879093]
 [0.7204374 ]
 [0.85863876]
 [0.29777345]
 [0.3143661 ]
 [0.34647718]
 [0.07665051]
 [0.26036027]
 [0.2440533 ]
 [0.08683968]
 [0.2222391 ]
 [0.10737071]
 [0.32681686]
 [0.02765866]
 [0.20241868]
 [0.48780254]
 [0.26751933]
 [0.07477791]
 [0.917093  ]
 [0.03