In [1]:
import pandas as pd

# Read the CSV file into a DataFrame
column_names = ["text", "is_hate"]

df = pd.read_csv('1.preprocessed_data.csv', on_bad_lines='skip', sep=",", encoding='iso-8859-1', header=0, names=column_names)
df['is_hate'] = df['is_hate'].astype(bool)
df['text'] = df['text'].astype('str')
df.head()

Unnamed: 0,text,is_hate
0,ponnayo danne kellek aduwa gaman laga inna kol...,True
1,ape harak samjeta eka honda adrshyak,False
2,tpita pisuda yako man htuwe atta kiyala aiyo,False
3,kimbak eduwoth ape untath amma thaththawath pe...,True
4,lisan nathawa yanna puluwan yako api dannawa o...,False


In [2]:
# Required Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

# Load Data
# Assuming your DataFrame is named df
# df = pd.read_csv('your_data.csv')

# Preprocess Data
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(df['text'])
y = df['is_hate']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)

# Prediction & Evaluation
y_pred = xgb.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

# Model Optimization using GridSearch
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200],
    'max_depth': [3, 4, 5]
}

grid_search = GridSearchCV(xgb, param_grid, scoring='accuracy', cv=5, verbose=1)
grid_search.fit(X_train, y_train)

# Print best parameters
print(grid_search.best_params_)

# Use the best model
best_xgb = grid_search.best_estimator_

# Make predictions with the optimized model
y_pred_optimized = best_xgb.predict(X_test)
print(f"Optimized Accuracy: {accuracy_score(y_test, y_pred_optimized)}")
print(classification_report(y_test, y_pred_optimized))


Accuracy: 0.746031746031746
              precision    recall  f1-score   support

       False       0.73      0.90      0.81       260
        True       0.78      0.53      0.63       181

    accuracy                           0.75       441
   macro avg       0.76      0.71      0.72       441
weighted avg       0.75      0.75      0.73       441

Fitting 5 folds for each of 18 candidates, totalling 90 fits
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}
Optimized Accuracy: 0.7482993197278912
              precision    recall  f1-score   support

       False       0.72      0.94      0.81       260
        True       0.84      0.48      0.61       181

    accuracy                           0.75       441
   macro avg       0.78      0.71      0.71       441
weighted avg       0.77      0.75      0.73       441



In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, auc
import pickle
import numpy as np

report = classification_report(y_test, y_pred_optimized)
# values for confusion matrix
cm = confusion_matrix(y_test, y_pred_optimized)
cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]  # normalize the confusion matrix

# values for ROC curve
# Convert model output to probabilities and plot ROC curve
y_probs = best_xgb.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

# save the values to a file
with open('2.8 XGBoost.pkl', 'wb') as f:
    pickle.dump({
        'fpr': fpr, 'tpr': tpr, 'roc_auc': roc_auc, 'cm_percentage': cm_percentage, 'report': report
    }, f)