In [1]:
'''AdaBoost Classifier w/o Polarity Feature'''
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Load the modelling dataset (raw tweets, cleaned/lemmatized text, labels)
# from the CSV file in the working directory.
model_data_path = 'model_data.csv'
df = pd.read_csv(model_data_path)

In [3]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,tweet_text,cyberbullying_type,clean_tweets,lemmatized_tweets,polarity_nltk,polarity_textblob
0,"In other words #katandandre, your food was cra...",not_cyberbullying,words katandandre food crapilicious,word katandandre food crapilicious,0.0,0.0
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,aussietv white theblock imacelebrityau today s...,aussietv white theblock imacelebrityau today s...,0.0,0.0
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,classy whore red velvet cupcakes,classy whore red velvet cupcake,-0.34,0.05
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,meh p thanks heads concerned another angry dud...,meh p thanks head concerned another angry dude...,-0.1779,-0.15
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,isis account pretending kurdish account like i...,isi account pretending kurdish account like is...,0.4404,0.0


In [4]:
# Model input is the lemmatized tweet text; the prediction target is the
# cyberbullying category. Both are kept as single-column DataFrames.
data = df.loc[:, ['lemmatized_tweets']]
target = df.loc[:, ['cyberbullying_type']]

In [5]:
# Integer code for each cyberbullying category; 0 is reserved for tweets
# labelled as not cyberbullying.
label_codes = dict(
    age=1,
    gender=2,
    ethnicity=3,
    religion=4,
    other_cyberbullying=5,
    not_cyberbullying=0,
)

In [6]:
# Encode the string class labels as the integer codes in `label_codes`.
#
# `Series.map` is used rather than `DataFrame.replace`: replacing strings with
# ints leaves an object-dtype column unless pandas silently downcasts it, and
# that downcasting is deprecated. Mapping and casting explicitly guarantees an
# integer label column for the classifier.
#
# The labels are read from `df` (the frame `target` was sliced from) so that
# re-running this cell gives the same result instead of re-encoding codes.
encoded_labels = df['cyberbullying_type'].map(label_codes)

# Fail loudly if a label has no code, instead of passing NaN on to training.
unknown_labels = df.loc[encoded_labels.isna(), 'cyberbullying_type'].unique()
if len(unknown_labels) > 0:
    raise ValueError(
        f'cyberbullying_type values missing from label_codes: {list(unknown_labels)}'
    )

target = target.assign(cyberbullying_type=encoded_labels.astype('int64'))

# No TF-IDF vectorizer is fitted here: features are built after the
# train/test split so the vocabulary is learned from training tweets only.

In [7]:
# Preview the text column that is used as the model input.
data.head()

Unnamed: 0,lemmatized_tweets
0,word katandandre food crapilicious
1,aussietv white theblock imacelebrityau today s...
2,classy whore red velvet cupcake
3,meh p thanks head concerned another angry dude...
4,isi account pretending kurdish account like is...


In [8]:
# Split tweets and labels into train and test sets: 30% is held out for
# evaluation and the fixed random_state makes the split repeatable.
tweet_texts = np.array(data['lemmatized_tweets'])
tweet_labels = np.array(target['cyberbullying_type'])
X_train, X_test, y_train, y_test = train_test_split(
    tweet_texts,
    tweet_labels,
    test_size=0.30,
    random_state=42,
)

In [9]:
# Turn the tweet text into TF-IDF features. The vectorizer is fitted on the
# training tweets only and then reused, unchanged, to transform the test tweets.
# min_df / max_df are document-frequency proportions used to drop terms.
vec = TfidfVectorizer(
    use_idf=True,
    min_df=0.00002,
    max_df=0.6,
)

# Cast to unicode first so every entry is handed to the vectorizer as text.
train_docs = X_train.astype('U')
test_docs = X_test.astype('U')

train = vec.fit_transform(train_docs)
test = vec.transform(test_docs)

In [10]:
# AdaBoost model for classification (library defaults; the fixed random_state
# makes training repeatable)
model = AdaBoostClassifier(random_state=99)

In [11]:
# Fit the classifier on the TF-IDF training matrix and the encoded labels.
model.fit(train, y_train)

AdaBoostClassifier(random_state=99)

In [12]:
# Predict class codes for the held-out test matrix.
y_pred = model.predict(test)

In [13]:
# Fraction of held-out tweets whose predicted class matches the true class.
print('Overall model accuracy: ', accuracy_score(y_test, y_pred))

Overall model accruacy:  0.7740256085549458


In [14]:
# Classification report for the multiclass problem, as a DataFrame with one
# row per class code plus the summary rows produced by scikit-learn.
report = classification_report(y_test, y_pred, output_dict=True)
class_report = pd.DataFrame(report).transpose()

# Name each row from `label_codes` rather than a hard-coded list, so the names
# cannot get out of order or out of length with the classes actually present.
# The report keys class rows by the string form of the class code; rows with
# no matching code (the summary rows) get an empty name.
code_to_label = {str(code): label for label, code in label_codes.items()}
class_report['cyberbullying_type'] = [
    code_to_label.get(str(row_key), '') for row_key in class_report.index
]

# Put the readable class name first, followed by the metric columns.
class_report = class_report[['cyberbullying_type', 'precision', 'recall',
                             'f1-score', 'support']]

In [15]:
# Display the per-class metrics table.
class_report

Unnamed: 0,cuberbullying_type,precision,recall,f1-score,support
0,not_cyberbulling,0.447755,0.852446,0.58712,2433.0
1,age,0.974003,0.956596,0.965221,2350.0
2,gender,0.934644,0.746921,0.830304,2355.0
3,ethnicity,0.975256,0.946584,0.960706,2415.0
4,religion,0.957746,0.942461,0.950042,2381.0
5,other_cyberbullying,0.55461,0.171491,0.261977,2280.0
accuracy,,0.774026,0.774026,0.774026,0.774026
macro avg,,0.807336,0.769416,0.759228,14214.0
weighted avg,,0.807621,0.774026,0.762035,14214.0


In [16]:
# Serialize and save the trained model.
filename = 'model/AdaBoost/model.sav'
model_path = Path(filename)

# Create the output directory if needed so a missing folder does not fail the
# save after training has already completed.
model_path.parent.mkdir(parents=True, exist_ok=True)

# The context manager closes the file handle even if pickling raises.
with model_path.open('wb') as model_file:
    pickle.dump(model, model_file)