In [None]:
'''Random Forest Classifier w/o Polarity Feature'''
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [None]:
# Load the modelling dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='model_data.csv')

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Separate the text feature from the label. A list selector is used on purpose
# so that both results stay one-column DataFrames rather than Series.
data = df.loc[:, ['lemmatized_tweets']]
target = df.loc[:, ['cyberbullying_type']]

In [None]:
# Integer code assigned to each cyberbullying category label.
label_codes = dict(
    age=1,
    gender=2,
    ethnicity=3,
    religion=4,
    other_cyberbullying=5,
    not_cyberbullying=0,
)

In [None]:
# Encode the string class labels as the integer codes from `label_codes`.
# `infer_objects()` makes the integer dtype explicit: pandas has deprecated the
# implicit downcasting that `replace` used to perform, and without it the
# column can be left as object dtype, which scikit-learn may not accept as
# classification targets.
target = target.replace({'cyberbullying_type': label_codes}).infer_objects()

# Text vectorization is deliberately deferred until after the train/test split,
# so the vocabulary and IDF weights are learned from the training tweets only.

In [None]:
# Preview the first five rows of the text-feature frame.
data.head(n=5)

In [None]:
# Hold out 30% of the tweets for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['lemmatized_tweets'].to_numpy(),
    target['cyberbullying_type'].to_numpy(),
    test_size=0.30,
    random_state=42,
)

In [None]:
# Turn the raw tweets into TF-IDF features.
# The vectorizer is fitted on the training tweets only and then applied,
# unchanged, to the test tweets. Both arrays are cast to unicode strings first
# so every entry is text.
vec = TfidfVectorizer(
    min_df=0.00002,  # float => minimum document frequency as a proportion
    max_df=0.6,      # float => maximum document frequency as a proportion
    use_idf=True,
)
train = vec.fit_transform(X_train.astype(str))
test = vec.transform(X_test.astype(str))

In [None]:
# Random forest classifier using the entropy split criterion.
# The seed matches the one used for the train/test split so that repeated runs
# build the same forest and report the same metrics.
model = RandomForestClassifier(criterion='entropy', random_state=42)

In [None]:
# Train the forest on the TF-IDF features of the training tweets.
model.fit(X=train, y=y_train)

In [None]:
# Predict a label code for every held-out tweet.
y_pred = model.predict(X=test)

In [None]:
# Fraction of held-out tweets whose predicted label matches the true label.
print('Overall model accuracy: ', accuracy_score(y_test, y_pred))

In [None]:
# Per-class precision / recall / F1 / support table for the multiclass problem.
report = classification_report(y_test, y_pred, output_dict=True)
class_report = pd.DataFrame(report).transpose()

# Map each report row back to its category name by inverting `label_codes`.
# Row keys are the stringified label codes (e.g. '0'); the summary rows
# ('accuracy', 'macro avg', 'weighted avg') have no category and get ''.
# Deriving the names from the index, rather than from a fixed-length list,
# keeps the table valid when a class is absent from the evaluated labels.
code_to_name = {str(code): name for name, code in label_codes.items()}
class_report['cyberbullying_type'] = [
    code_to_name.get(str(row_key), '') for row_key in class_report.index
]
class_report = class_report[['cyberbullying_type', 'precision', 'recall',
                             'f1-score', 'support']]

In [None]:
# Display the per-class metrics table built in the previous cell.
class_report

In [None]:
# Serialize and save the trained model.
filename = 'model/RandomForest/model.sav'

# Create the target folder on first run so the dump cannot fail on a missing path.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# The context manager flushes and closes the file handle even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# NOTE(review): only the classifier is persisted here. The fitted TF-IDF
# vectorizer `vec` is needed to transform new text into the feature space this
# model was trained on; consider saving it alongside the model.