In [1]:
import pandas as pd

# Read the CSV file into a DataFrame
column_names = ["text", "is_hate"]

df = pd.read_csv('1.preprocessed_data.csv', on_bad_lines='skip', sep=",", encoding='iso-8859-1', header=0, names=column_names)
df['is_hate'] = df['is_hate'].astype(bool)
df['text'] = df['text'].astype('str')
df.head()

Unnamed: 0,text,is_hate
0,ponnayo danne kellek aduwa gaman laga inna kol...,True
1,ape harak samjeta eka honda adrshyak,False
2,tpita pisuda yako man htuwe atta kiyala aiyo,False
3,kimbak eduwoth ape untath amma thaththawath pe...,True
4,lisan nathawa yanna puluwan yako api dannawa o...,False


In [4]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

# 1. Data Preprocessing
# Using TfidfVectorizer to convert text data into numerical format
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # you can adjust the number of features
X = tfidf_vectorizer.fit_transform(df['text'])
y = df['is_hate']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Model Training
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

num_round = 100
# bst = lgb.train(params, train_data, num_round, valid_sets=[val_data], early_stopping_rounds=10)
# bst = lgb.train(params, train_data, num_round, valid_sets=[val_data], early_stopping_rounds=10, verbose_eval=10)
evals_result = {}  # to store results of the training process

bst = lgb.train(
    params,
    train_data,
    num_round,
    valid_sets=[val_data]
)

# To implement early stopping, we check the results
best_round = bst.best_iteration
if best_round:
    print(f"Early stopping at round {best_round}")
else:
    print("No early stopping, used all iterations")


# 3. Evaluation
y_pred = bst.predict(X_val, num_iteration=bst.best_iteration)
y_pred_binary = np.round(y_pred)

print(accuracy_score(y_val, y_pred_binary))
report = classification_report(y_val, y_pred_binary)
print(report)

# 4. Prediction
def predict_hate(text):
    transformed_text = tfidf_vectorizer.transform([text])
    prediction = bst.predict(transformed_text, num_iteration=bst.best_iteration)
    return "Hate" if np.round(prediction)[0] else "Not Hate"

# Test the prediction function
print(predict_hate("kalakanni deshapaluwo"))


[LightGBM] [Info] Number of positive: 721, number of negative: 1043
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015296 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1352
[LightGBM] [Info] Number of data points in the train set: 1764, number of used features: 97
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.408730 -> initscore=-0.369217
[LightGBM] [Info] Start training from score -0.369217
No early stopping, used all iterations
0.7074829931972789
              precision    recall  f1-score   support

       False       0.70      0.88      0.78       260
        True       0.73      0.46      0.56       181

    accuracy                           0.71       441
   macro avg       0.71      0.67      0.67       441
weighted avg       0.71      0.71      0.69       441

Hate


In [None]:
# Test the prediction function
print(predict_hate("kalakanni deshapaluwo"))

Hate


In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, auc
import pickle
import numpy as np

# values for confusion matrix
cm = confusion_matrix(y_val, y_pred_binary)
cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]  # normalize the confusion matrix

# values for ROC curve
# Convert model output to probabilities and plot ROC curve
y_probs = bst.predict(X_val, num_iteration=bst.best_iteration)
fpr, tpr, thresholds = roc_curve(y_val, y_probs)
roc_auc = auc(fpr, tpr)

# save the values to a file
with open('2.9 LightGBM.pkl', 'wb') as f:
    pickle.dump({
        'fpr': fpr, 'tpr': tpr, 'roc_auc': roc_auc, 'cm_percentage': cm_percentage, 'report': report
    }, f)