In [1]:
import pandas as pd

# Read the CSV file into a DataFrame. header=0 consumes the file's own header
# row and names= relabels the two columns as text / is_hate.
column_names = ["text", "is_hate"]

df = pd.read_csv('1.preprocessed_data.csv', on_bad_lines='skip', sep=",", encoding='iso-8859-1', header=0, names=column_names)

# Drop incomplete rows BEFORE casting: astype(bool) would silently turn a
# missing label (NaN) into True, and astype('str') would turn missing text
# into the literal string "nan", polluting both classes.
df = df.dropna(subset=column_names).reset_index(drop=True)
df['is_hate'] = df['is_hate'].astype(bool)
df['text'] = df['text'].astype('str')

# Display the DataFrame
print(df)

                                                   text  is_hate
0     ponnayo danne kellek aduwa gaman laga inna kol...     True
1                  ape harak samjeta eka honda adrshyak    False
2          tpita pisuda yako man htuwe atta kiyala aiyo    False
3     kimbak eduwoth ape untath amma thaththawath pe...     True
4     lisan nathawa yanna puluwan yako api dannawa o...    False
...                                                 ...      ...
2200                                      sajith nariya     True
2201   mechchra deyak wela goiyoi bayyoi thama innwaneh     True
2202  rajapassa kalakannie reala chandayak pavathval...     True
2203  kaputa hitiyatah wada hodata wajabanawa ahinsa...     True
2204                  rata kapu kaputo un thama palanye     True

[2205 rows x 2 columns]


In [2]:
##### Data Splitting
from sklearn.model_selection import train_test_split

# Features are the raw text column; targets are the boolean hate labels.
X, y = df['text'], df['is_hate']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across kernel restarts.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# Sanity-check that features and labels line up in each partition
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

X_train shape: (1764,), y_train shape: (1764,)
X_test shape: (441,), y_test shape: (441,)


In [3]:
def write_fasttext_file(path, texts, labels):
    """Write paired samples to ``path`` in fastText supervised format.

    Each output line is ``__label__<label> <text>``.

    Args:
        path: Destination file; overwritten if it already exists.
        texts: Iterable of strings, one per sample.
        labels: Iterable of labels aligned positionally with ``texts``;
            each is rendered with ``str()``.
    """
    with open(path, "w", encoding="utf-8") as f:
        for sample_text, sample_label in zip(texts, labels):
            # The format is one example per line, so an embedded line break
            # would spill the rest of the text onto a new, unlabeled line.
            single_line = sample_text.replace("\r", " ").replace("\n", " ")
            f.write("__label__" + str(sample_label) + " " + single_line + "\n")


# Write the train data to file
write_fasttext_file("train_data.txt", X_train, y_train)

# Write the test data to file
write_fasttext_file("test_data.txt", X_test, y_test)


In [4]:
import fasttext
# Train a supervised fastText classifier on the labelled training file.
# Background: https://towardsdatascience.com/fasttext-bag-of-tricks-for-efficient-text-classification-513ba9e302e7
train_params = {
    "epoch": 130,      # passes over the training file
    "wordNgrams": 4,   # use word n-grams up to length 4 as features
    "lr": 1.0,         # initial learning rate
}
model = fasttext.train_supervised(input="train_data.txt", **train_params)

# Persist the trained model so it can be reloaded without retraining
model.save_model('text_classification_model.bin')

Read 0M words
Number of words:  6884
Number of labels: 2
Progress: 100.0% words/sec/thread: 1381374 lr:  0.000000 avg.loss:  0.006667 ETA:   0h 0m 0s


In [5]:
# Evaluate on the held-out file. The result is a 3-tuple; per the fastText
# docs it is (number of samples, precision@1, recall@1). With exactly one
# label per sample, precision@1 is the plain accuracy.
result = model.test("test_data.txt")
n_samples, precision_at_1, recall_at_1 = result
print("Test Accuracy:", precision_at_1)
print(result)

Test Accuracy: 0.7777777777777778
(441, 0.7777777777777778, 0.7777777777777778)


In [23]:
# Classify one unseen sentence with the trained model
text = "jeewite lassanai kalakanni"
prediction = model.predict(text)
labels, prob = prediction

# Top-ranked label and the score the model assigned to it
print("Label:", labels[0])
print("Probability:", prob[0])

# Same top label, read straight off a fresh prediction call
print(model.predict(text)[0][0])

Label: __label__True
Probability: 1.0000100135803223
__label__True


In [36]:
import numpy as np
import pickle

from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
# Convert each top-1 fastText label into a boolean so it is comparable
# with the boolean ground truth in y_test.
HATE_LABEL = '__label__True'
y_pred = [model.predict(sample)[0][0] == HATE_LABEL for sample in X_test]

# Text summary of per-class precision / recall / F1
report = classification_report(y_test, y_pred)

# Confusion matrix, then row-normalised so each true-class row sums to 1
cm = confusion_matrix(y_test, y_pred)
row_totals = cm.sum(axis=1, keepdims=True)
cm_percentage = cm.astype('float') / row_totals

# Persist the metrics. The ROC fields are stored as None because no ROC
# curve is computed in this cell.
metrics_payload = {
    'fpr': None,
    'tpr': None,
    'roc_auc': None,
    'cm_percentage': cm_percentage,
    'report': report,
}
with open('2.1 Fasttext.pkl', 'wb') as f:
    pickle.dump(metrics_payload, f)

