In [1]:
import pandas as pd

# Read the CSV file into a DataFrame
column_names = ["text", "is_hate"]

df = pd.read_csv('1.preprocessed_data.csv', on_bad_lines='skip', sep=",", encoding='iso-8859-1', header=0, names=column_names)
df['is_hate'] = df['is_hate'].astype(bool)
df['text'] = df['text'].astype('str')
df.head()

Unnamed: 0,text,is_hate
0,ponnayo danne kellek aduwa gaman laga inna kol...,True
1,ape harak samjeta eka honda adrshyak,False
2,tpita pisuda yako man htuwe atta kiyala aiyo,False
3,kimbak eduwoth ape untath amma thaththawath pe...,True
4,lisan nathawa yanna puluwan yako api dannawa o...,False


In [2]:
# Load and preprocess your data
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load data (sample code, you might already have this df loaded)
# df = pd.read_csv('your_dataset.csv')

# Splitting the data into train and test
train_texts, test_texts, train_labels, test_labels = train_test_split(df['text'], df['is_hate'], test_size=0.2, random_state=42)

# Tokenizing and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

max_length = max([len(seq) for seq in train_sequences])
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')

vocab_size = len(tokenizer.word_index) + 1


In [3]:
# Train a FastText Model with Gensim
import gensim
from gensim.models import FastText

# Tokenize sentences
sentences = df['text'].apply(gensim.utils.simple_preprocess)

# Train FastText model
ft_model = FastText(sentences, vector_size=100, window=5, min_count=1, workers=4)


In [4]:
# Create an Embedding Matrix
import numpy as np

embedding_dim = 100  # Matches the vector_size parameter used when training the FastText model
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    try:
        embedding_vector = ft_model.wv[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    except KeyError:
        print(word)
        # word not in FastText model, leaving as zero vector
        pass


In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 82, 100)           688400    
                                                                 
 conv1d (Conv1D)             (None, 78, 128)           64128     
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 10)                1290      
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 753829 (2.88 MB)
Trainable params: 65429 (255.58 KB)
Non-trainable params: 688400 (2.63 MB)
________________

In [6]:
#Train the CNN model
model.fit(train_padded, train_labels, epochs=5, validation_data=(test_padded, test_labels))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x28f920310>

In [18]:
# Make predictions
print(model.predict(test_padded))
predictions = (model.predict(test_padded) > 0.5).astype('int32').flatten()
print(predictions)

[[0.4269493 ]
 [0.454769  ]
 [0.4888233 ]
 [0.3408548 ]
 [0.29977724]
 [0.35851875]
 [0.4888233 ]
 [0.32511926]
 [0.4348218 ]
 [0.43150494]
 [0.39510137]
 [0.41197506]
 [0.41088197]
 [0.38103807]
 [0.38491568]
 [0.46349105]
 [0.4078083 ]
 [0.4888233 ]
 [0.23064591]
 [0.4569959 ]
 [0.3239627 ]
 [0.3801457 ]
 [0.42797872]
 [0.34066918]
 [0.4888233 ]
 [0.37638223]
 [0.4609834 ]
 [0.3252383 ]
 [0.4888233 ]
 [0.435411  ]
 [0.35809487]
 [0.34275714]
 [0.45861515]
 [0.46229878]
 [0.4888233 ]
 [0.3448648 ]
 [0.4888233 ]
 [0.4888233 ]
 [0.47834533]
 [0.4888233 ]
 [0.30226204]
 [0.47676623]
 [0.40603128]
 [0.376303  ]
 [0.44517708]
 [0.4888233 ]
 [0.30885115]
 [0.32423347]
 [0.42652956]
 [0.4409944 ]
 [0.41821426]
 [0.4888233 ]
 [0.3394037 ]
 [0.44287235]
 [0.4888233 ]
 [0.3955396 ]
 [0.39323568]
 [0.44096822]
 [0.384202  ]
 [0.46852863]
 [0.39788705]
 [0.26962206]
 [0.33837238]
 [0.37899104]
 [0.36099255]
 [0.27832526]
 [0.36771366]
 [0.4123094 ]
 [0.35872996]
 [0.32456473]
 [0.4081808 ]
 [0.40

In [16]:
# Calculate and print the metrics
from sklearn.metrics import classification_report
print(predictions)

report = classification_report(test_labels, predictions)
print(report)


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
              precision    recall  f1-score   support

       False       0.59      1.00      0.74       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
from sklearn.metrics import confusion_matrix, roc_curve, auc
import pickle
# values for confusion matrix
cm = confusion_matrix(test_labels, predictions)
cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]  # normalize the confusion matrix
print(cm_percentage)

# values for ROC curve
# Convert model output to probabilities and plot ROC curve
y_pred_prob = model.predict(test_padded)
fpr, tpr, thresholds = roc_curve(test_labels, y_pred_prob)
roc_auc = auc(fpr, tpr)



# save the values to a file
with open('2.4 CNN with fasttext model training.pkl', 'wb') as f:
    pickle.dump({
        'fpr': fpr, 'tpr': tpr, 'roc_auc': roc_auc, 'cm_percentage': cm_percentage, 'report': report
    }, f)

[[1. 0.]
 [1. 0.]]
