In [32]:
import os
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [33]:
# Load the prepared modelling dataset from the working directory.
# Later cells read its 'lemmatized_tweets' and 'cyberbullying_type' columns.
df = pd.read_csv(filepath_or_buffer='model_data.csv')

#### CatBoost

In [34]:
# Single-column frames: the tweet text (model input) and the class label.
data = df.loc[:, ['lemmatized_tweets']]
target = df.loc[:, ['cyberbullying_type']]

In [35]:
# Integer code assigned to each cyberbullying class name.
label_codes = {
    'not_cyberbullying': 0,
    'age': 1,
    'gender': 2,
    'ethnicity': 3,
    'religion': 4,
    'other_cyberbullying': 5,
}

# Encode with an explicit per-value lookup rather than DataFrame.replace:
# replace() on an object column relies on implicit downcasting to reach an
# integer dtype, which pandas 2.2 deprecates. Values that are not class
# names pass through unchanged, so re-running this cell on already-encoded
# labels is a no-op.
encoded_labels = target['cyberbullying_type'].map(
    lambda label: label_codes.get(label, label)
)

# Fail fast on anything that is not a known class (unknown names, NaN, ...)
# instead of letting mixed str/int labels reach the model and the metrics.
valid_codes = set(label_codes.values())
unmapped = [value for value in encoded_labels.unique() if value not in valid_codes]
if unmapped:
    raise ValueError(f"Unexpected cyberbullying_type value(s): {unmapped!r}")

# Keep `target` a one-column DataFrame, now with a guaranteed integer dtype.
target = target.assign(cyberbullying_type=encoded_labels.astype('int64'))

In [36]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
tweet_texts = np.array(data['lemmatized_tweets'])
class_codes = np.array(target['cyberbullying_type'])
X_train, X_test, y_train, y_test = train_test_split(
    tweet_texts,
    class_codes,
    test_size=0.30,
    random_state=42,
)

In [37]:
# TF-IDF bag-of-words features. Float min_df / max_df are document
# *proportions*: terms appearing in more than 60% of the training tweets are
# dropped.
# NOTE(review): min_df=0.00002 only filters anything once the training set
# exceeds 50,000 documents — confirm this threshold is intentional.
vec = TfidfVectorizer(use_idf=True, min_df=0.00002, max_df=0.6)

# Missing tweets (NaN/None) become empty documents. A plain astype('U') would
# stringify them to the literal word 'nan', which the vectorizer would then
# treat as a genuine term.
train_docs = pd.Series(X_train, dtype=object).fillna('').astype(str)
test_docs = pd.Series(X_test, dtype=object).fillna('').astype(str)

# Fit the vocabulary/IDF weights on the training split only, then reuse them
# for the held-out split.
train = vec.fit_transform(train_docs)
test = vec.transform(test_docs)

In [38]:
# CatBoost gradient-boosted classifier with a multi-class objective.
# CatBoostClassifier is imported in the notebook's top import cell so that
# all dependencies are declared in one place.
# NOTE(review): 20 iterations at learning_rate=0.01 is a very small training
# budget — the recorded training log shows the loss still decreasing at the
# final iteration. Consider raising iterations and/or the learning rate.
model = CatBoostClassifier(iterations=20,
                           learning_rate=0.01,
                           depth=10,
                           loss_function='MultiClass')

In [39]:
# Train on the TF-IDF training matrix; the fitted model is this cell's
# displayed value.
model.fit(X=train, y=y_train)

0:	learn: 1.7652293	total: 6.27s	remaining: 1m 59s
1:	learn: 1.7399789	total: 12.6s	remaining: 1m 53s
2:	learn: 1.7166159	total: 18.9s	remaining: 1m 46s
3:	learn: 1.6934738	total: 25.2s	remaining: 1m 40s
4:	learn: 1.6712449	total: 31.9s	remaining: 1m 35s
5:	learn: 1.6509936	total: 38.2s	remaining: 1m 29s
6:	learn: 1.6300688	total: 44.6s	remaining: 1m 22s
7:	learn: 1.6109616	total: 50.9s	remaining: 1m 16s
8:	learn: 1.5917443	total: 57.2s	remaining: 1m 9s
9:	learn: 1.5730925	total: 1m 3s	remaining: 1m 3s
10:	learn: 1.5544461	total: 1m 9s	remaining: 57.2s
11:	learn: 1.5372324	total: 1m 16s	remaining: 50.8s
12:	learn: 1.5206757	total: 1m 22s	remaining: 44.5s
13:	learn: 1.5056836	total: 1m 28s	remaining: 38.1s
14:	learn: 1.4903315	total: 1m 35s	remaining: 31.8s
15:	learn: 1.4753357	total: 1m 41s	remaining: 25.5s
16:	learn: 1.4609310	total: 1m 48s	remaining: 19.1s
17:	learn: 1.4473233	total: 1m 54s	remaining: 12.7s
18:	learn: 1.4329909	total: 2m 1s	remaining: 6.38s
19:	learn: 1.4192310	total

<catboost.core.CatBoostClassifier at 0x7f99370f0d90>

In [40]:
# Predict a class code for every held-out tweet.
y_pred = model.predict(data=test)

In [41]:
# Classification report for the multiclass problem, as a DataFrame with one
# row per class code plus the 'accuracy', 'macro avg' and 'weighted avg'
# summary rows.
report = classification_report(y_test, y_pred, output_dict=True)
class_report = pd.DataFrame(report).transpose()

# Derive the readable class names from label_codes instead of a hand-typed
# list: the row labels are the stringified integer codes, so the lookup stays
# correct (and the right length) even if a class is absent from the report.
# Summary rows have no class name and get ''.
code_to_name = {str(code): name for name, code in label_codes.items()}
class_report['cyberbullying_type'] = [
    code_to_name.get(str(row_label), '') for row_label in class_report.index
]

# Put the class name first, followed by the metrics.
class_report = class_report[['cyberbullying_type', 'precision', 'recall',
                             'f1-score', 'support']]
class_report

Unnamed: 0,cuberbullying_type,precision,recall,f1-score,support
0,not_cyberbulling,0.555851,0.085902,0.148807,2433.0
1,age,0.936421,0.946383,0.941376,2350.0
2,gender,0.969769,0.694692,0.8095,2355.0
3,ethnicity,0.98125,0.845135,0.90812,2415.0
4,religion,0.971817,0.839983,0.901104,2381.0
5,other_cyberbullying,0.386485,0.955702,0.550392,2280.0
accuracy,,0.723864,0.723864,0.723864,0.723864
macro avg,,0.800265,0.727966,0.709883,14214.0
weighted avg,,0.802137,0.723864,0.708751,14214.0


In [42]:
# Serialize and save the model
# NOTE(review): only `model` is persisted in this cell; the fitted TF-IDF
# vectorizer `vec` is needed to transform new text at inference time —
# confirm it is saved elsewhere.
filename = 'model/CatBoost/model.sav'

# Create the output directory on a fresh checkout so the save does not fail.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# The context manager guarantees the file is flushed and closed even if
# pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)