In [3]:
import pandas as pd

# Read the CSV file into a DataFrame
column_names = ["text", "is_hate"]

df = pd.read_csv('1.preprocessed_data.csv', on_bad_lines='skip', sep=",", encoding='iso-8859-1', header=0, names=column_names)
df['is_hate'] = df['is_hate'].astype(bool)
df['text'] = df['text'].astype('str')
df.head()

Unnamed: 0,text,is_hate
0,ponnayo danne kellek aduwa gaman laga inna kol...,True
1,ape harak samjeta eka honda adrshyak,False
2,tpita pisuda yako man htuwe atta kiyala aiyo,False
3,kimbak eduwoth ape untath amma thaththawath pe...,True
4,lisan nathawa yanna puluwan yako api dannawa o...,False


In [4]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Assuming df is your DataFrame
# df = pd.read_csv('your_dataset.csv')  # Uncomment this if reading from a CSV

# 1. Data preprocessing
# Convert boolean to integer for the 'is_hate' column
df['is_hate'] = df['is_hate'].astype(int)

# 2. Splitting the dataset into training and testing sets
X = df[['text']]  # Note the double brackets which will give us a DataFrame
y = df['is_hate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_dataset = Pool(data=X_train, label=y_train, text_features=['text'])
test_dataset = Pool(data=X_test, label=y_test, text_features=['text'])

# 3. Building and training a CatBoost classifier
model = CatBoostClassifier(iterations=1000, # or more iterations based on your dataset size and complexity
                           learning_rate=0.05,
                           depth=7,
                           loss_function='Logloss',
                           eval_metric='AUC',
                           verbose=100,
                           text_features=[0]) # Use GPU for faster training, remove if you don't have GPU

model.fit(train_dataset, eval_set=test_dataset, plot=False)

# 4. Evaluating the model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# To predict for a new text string
# new_text = ["Your new text here"]
# prediction = model.predict(new_text)
# print("Hate speech" if prediction[0] == 1 else "Not hate speech")


0:	test: 0.7841160	best: 0.7841160 (0)	total: 82.9ms	remaining: 1m 22s
100:	test: 0.8664365	best: 0.8683702 (25)	total: 3.03s	remaining: 27s
200:	test: 0.8673608	best: 0.8683702 (25)	total: 5.83s	remaining: 23.2s
300:	test: 0.8674139	best: 0.8683702 (25)	total: 8.75s	remaining: 20.3s
400:	test: 0.8657034	best: 0.8683702 (25)	total: 11.8s	remaining: 17.6s
500:	test: 0.8604335	best: 0.8683702 (25)	total: 14.6s	remaining: 14.6s
600:	test: 0.8589885	best: 0.8683702 (25)	total: 17.4s	remaining: 11.6s
700:	test: 0.8566617	best: 0.8683702 (25)	total: 20.2s	remaining: 8.6s
800:	test: 0.8549618	best: 0.8683702 (25)	total: 23.3s	remaining: 5.78s
900:	test: 0.8535805	best: 0.8683702 (25)	total: 26.1s	remaining: 2.87s
999:	test: 0.8518168	best: 0.8683702 (25)	total: 28.9s	remaining: 0us

bestTest = 0.8683701657
bestIteration = 25

Shrink model to first 26 iterations.
Accuracy: 0.8117913832199547
              precision    recall  f1-score   support

           0       0.78      0.95      0.86     

In [5]:
# Let's assume your model is trained and is named `model`.

# For predicting a single text string:
new_text = pd.DataFrame({"text": ["kalakanni deshapaluwo"]})
prediction = model.predict(new_text)

print("Hate speech" if prediction[0] == 1 else "Not hate speech")

# For predicting multiple text strings:
new_texts = pd.DataFrame({"text": ["kalakanni deshapaluwo", "pissu kelinna epa", "okun marila yanna ona"]})
predictions = model.predict(new_texts)

for idx, pred in enumerate(predictions):
    print(f"Text {idx + 1} is {'Hate speech' if pred == 1 else 'Not hate speech'}")


Hate speech
Text 1 is Hate speech
Text 2 is Not hate speech
Text 3 is Hate speech


In [11]:
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report
import pickle
import numpy as np

report = classification_report(y_test, y_pred, target_names=['True', 'False'])
print(report)
# values for confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]  # normalize the confusion matrix

# values for ROC curve
# Convert model output to probabilities and plot ROC curve
y_prob = model.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# save the values to a file
with open('2.10 CatBoost.pkl', 'wb') as f:
    pickle.dump({
        'fpr': fpr, 'tpr': tpr, 'roc_auc': roc_auc, 'cm_percentage': cm_percentage, 'report': report
    }, f)

              precision    recall  f1-score   support

        True       0.78      0.95      0.86       260
       False       0.90      0.61      0.73       181

    accuracy                           0.81       441
   macro avg       0.84      0.78      0.79       441
weighted avg       0.83      0.81      0.80       441

