In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [2]:
# Load the prepared modelling dataset from the CSV next to this notebook.
df = pd.read_csv(filepath_or_buffer='model_data.csv')

In [3]:
# Keep both selections as single-column DataFrames (list selector), not Series:
# `target` holds the label column, `data` the `lemmatized_tweets` model input.
target = df.loc[:, ['cyberbullying_type']]
data = df.loc[:, ['lemmatized_tweets']]

In [4]:
# Integer code assigned to each cyberbullying category (0 = not cyberbullying).
label_codes = {'age':1, 
               'gender':2, 
               'ethnicity':3, 
               'religion':4, 
               'other_cyberbullying':5, 
               'not_cyberbullying':0
              }

# Encode straight from the raw frame (not from the current `target`) so this
# cell gives the same result no matter how many times it is re-run.
encoded_labels = df['cyberbullying_type'].map(label_codes)

# `map` turns any label missing from `label_codes` into NaN; fail loudly here
# instead of letting an unencoded label reach the classifier.
unmapped_rows = encoded_labels.isna()
if unmapped_rows.any():
    unknown_labels = df.loc[unmapped_rows, 'cyberbullying_type'].unique().tolist()
    raise ValueError(f"Unmapped cyberbullying_type values: {unknown_labels}")

# Keep `target` a one-column DataFrame named 'cyberbullying_type', as downstream cells expect.
target = encoded_labels.astype('int64').to_frame()

In [5]:
# train/test split: hold out 30% of the rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(np.array(data['lemmatized_tweets']), 
                                                    np.array(target['cyberbullying_type']), 
                                                    test_size=0.30, random_state=42)

# Feature extraction: the model cells below consume `train` / `test`, which no
# cell in this notebook defined, so a fresh Restart & Run All raised NameError.
# NOTE(review): the original vectorization cells are missing from the notebook;
# TF-IDF is assumed here (CountVectorizer is imported as well) -- confirm which
# one produced the recorded metrics.
vectorizer = TfidfVectorizer()
# Fit on the training texts only so no test-set vocabulary/IDF statistics leak into training.
train = vectorizer.fit_transform(X_train)
test = vectorizer.transform(X_test)

In [8]:
# Gradient-boosted tree classifier; `XGBClassifier` is imported in the top import cell.
# Hyperparameter values are unchanged; descriptions follow the XGBoost parameter docs.
model = XGBClassifier(
    use_label_encoder=False,  # NOTE(review): deprecated in newer xgboost releases -- confirm it is still needed
    max_depth=4,              # maximum depth of each tree
    min_child_weight=2,       # minimum sum of instance weight required in a child node
    gamma=0.3,                # minimum loss reduction required to make a further split
    subsample=0.7,            # fraction of training rows sampled per tree
    colsample_bytree=0.6,     # fraction of feature columns sampled per tree
    reg_alpha=0.1,            # L1 regularization on leaf weights
)


In [9]:
# Train on the training feature matrix; `fit` returns the estimator, so the
# bare call also renders the fitted model's repr as the cell output.
model.fit(X=train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.6,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.3, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=2,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0.1, ...)

In [10]:
# Predicted class codes for the held-out feature matrix.
y_pred = model.predict(X=test)

In [11]:
# classification report for multiclass classification
report = classification_report(y_test, y_pred, output_dict=True)
# One row per class code plus the summary rows (accuracy / macro avg / weighted avg).
class_report = pd.DataFrame(report).transpose()

# Derive readable names from `label_codes` instead of a hard-coded positional list,
# so the labels stay correct even if a class is absent from `y_test`.
# The report dict keys are the stringified class codes; summary rows get ''.
code_to_label = {str(code): label for label, code in label_codes.items()}
class_report['cyberbullying_type'] = [code_to_label.get(str(row), '')
                                      for row in class_report.index]
class_report = class_report[['cyberbullying_type', 'precision', 'recall',
                             'f1-score', 'support']]
class_report

Unnamed: 0,cuberbullying_type,precision,recall,f1-score,support
0,not_cyberbulling,0.569993,0.629264,0.598164,2433.0
1,age,0.985776,0.973191,0.979443,2350.0
2,gender,0.924869,0.825902,0.872589,2355.0
3,ethnicity,0.987957,0.985093,0.986523,2415.0
4,religion,0.966995,0.947501,0.957149,2381.0
5,other_cyberbullying,0.580795,0.602193,0.591301,2280.0
accuracy,,0.828127,0.828127,0.828127,0.828127
macro avg,,0.836064,0.827191,0.830861,14214.0
weighted avg,,0.836779,0.828127,0.831684,14214.0


In [13]:
# Serialize and save the model
# NOTE: only the classifier is persisted here; the feature extractor used to
# build `train` must be saved separately for inference on new text.
filename = 'model/XGBoost/model.sav'
# Create the target directory on first run instead of failing with FileNotFoundError.
Path(filename).parent.mkdir(parents=True, exist_ok=True)
# The context manager guarantees the handle is flushed and closed even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)