In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import TweetTokenizer
import pickle
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
# read the model dataframe
df = pd.read_csv('model_data.csv')

#### XGBoost with Word2Vec

In [3]:
# tokenize tweets using gensim inbuilt library
from gensim.models import Word2Vec
tokenizer = TweetTokenizer(preserve_case=False, 
                           strip_handles=True,
                           reduce_len=True)

In [4]:
class MeanEmbeddingVectorizer():
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = len(next(iter(word2vec.values())))

    def transform(self, X):
        return np.array([np.mean([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0) for words in X])
    
    def fit(self, X, y):
        return self


model = Word2Vec(df["lemmatized_tweets"], min_count=1)
w2v = dict(zip(model.wv.index_to_key, model.wv.vectors)) 
modelw = MeanEmbeddingVectorizer(w2v)

In [5]:
data = df[['lemmatized_tweets']]
target = df[['cyberbullying_type']]

In [6]:
label_codes = {'age':1, 
               'gender':2, 
               'ethnicity':3, 
               'religion':4, 
               'other_cyberbullying':5, 
               'not_cyberbullying':0
              }
target = target.replace({'cyberbullying_type':label_codes})

In [7]:
df["tokenized"] = [tokenizer.tokenize(i) for i in df['lemmatized_tweets']]
model = Word2Vec(df["tokenized"], min_count=1)
w2v = dict(zip(model.wv.index_to_key, model.wv.vectors)) 
modelw = MeanEmbeddingVectorizer(w2v)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df["tokenized"],
                                                    np.array(target['cyberbullying_type']),
                                                    test_size=0.3,
                                                    shuffle=True, 
                                                    stratify=df["cyberbullying_type"], 
                                                    random_state=32)

In [9]:
# Convert train and test sets to vectors
X_train_vectors_w2v = modelw.transform(X_train)
X_test_vectors_w2v = modelw.transform(X_test)

In [10]:
from xgboost import XGBClassifier

model = XGBClassifier(
    use_label_encoder=False,
    max_depth=4,
    min_child_weight=2,
    gamma=0.3,
    subsample=0.7,
    colsample_bytree=0.6,
    reg_alpha=0.1,
)

model.fit(X_train_vectors_w2v, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.6,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.3, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=2,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0.1, ...)

In [12]:
predictions = model.predict(X_test_vectors_w2v)
print(f"Accuracy: {accuracy_score(y_test, predictions)}")
score2=accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions, output_dict=True)
class_report = pd.DataFrame(report).transpose()
class_report['cuberbullying_type'] = ['not_cyberbulling', 'age', 'gender',
                                     'ethnicity', 'religion', 'other_cyberbullying', '', '', '']
class_report = class_report[['cuberbullying_type', 'precision', 'recall',
                            'f1-score', 'support']]
class_report

Accuracy: 0.7569297875334178


Unnamed: 0,cuberbullying_type,precision,recall,f1-score,support
0,not_cyberbulling,0.487805,0.475786,0.48172,2354.0
1,age,0.948887,0.959967,0.954395,2398.0
2,gender,0.856598,0.753878,0.801963,2385.0
3,ethnicity,0.895191,0.91206,0.903547,2388.0
4,religion,0.893018,0.91163,0.902228,2399.0
5,other_cyberbullying,0.467543,0.512664,0.489065,2290.0
accuracy,,0.75693,0.75693,0.75693,0.75693
macro avg,,0.758174,0.754331,0.755486,14214.0
weighted avg,,0.761042,0.75693,0.758222,14214.0


#### XGBoost with TFIDF

In [13]:
vec = TfidfVectorizer(use_idf=True, min_df=0.00002, max_df=0.6)
train = vec.fit_transform(X_train.astype('U'))
test = vec.transform(X_test.astype('U'))

In [14]:
model.fit(train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.6,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.3, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=2,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0.1, ...)

In [16]:
y_pred = model.predict(test)


In [17]:
# classification report for multiclass classification
report = classification_report(y_test, y_pred, output_dict=True)
class_report = pd.DataFrame(report).transpose()
class_report['cuberbullying_type'] = ['not_cyberbulling', 'age', 'gender',
                                     'ethnicity', 'religion', 'other_cyberbullying', '', '', '']
class_report = class_report[['cuberbullying_type', 'precision', 'recall',
                            'f1-score', 'support']]
class_report

Unnamed: 0,cuberbullying_type,precision,recall,f1-score,support
0,not_cyberbulling,0.555913,0.661003,0.60392,2354.0
1,age,0.98687,0.971643,0.979197,2398.0
2,gender,0.916436,0.832285,0.872336,2385.0
3,ethnicity,0.99034,0.987437,0.988887,2388.0
4,religion,0.965032,0.94331,0.954047,2399.0
5,other_cyberbullying,0.599907,0.566376,0.582659,2290.0
accuracy,,0.829394,0.829394,0.829394,0.829394
macro avg,,0.83575,0.827009,0.830174,14214.0
weighted avg,,0.838234,0.829394,0.832614,14214.0


In [18]:
# Serialize and save the model
filename = 'model/XGBoost/model.sav'
pickle.dump(model, open(filename, 'wb'))