# XGBoost for Hate Speech Detection

### Install the library

In [None]:
# !pip install xgboost

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from evaluate_classification import EvaluateBinaryClassification

C:\Users\User\anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\Users\User\anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
  stacklevel=1)


# Loading Data

In [3]:
# Folder that holds the dataset CSV files. The value ends with a path separator
# so that a file name can be appended with plain string concatenation.
# NOTE(review): machine-specific absolute Windows path -- adjust before running elsewhere.
BASE = 'D:\\ResearchDataGtx1060\\SentimentData\\Hate\\'
# Candidate training / test file names; `track` is the index used to pick one entry from each list.
fins_train = ['random_hate_train.csv']
fins_test = ['eastasian_hate_test.csv']
track = 0

In [4]:
# We apply only this preprocessing because our data is already preprocessed
def cleanNonAscii(text):
    '''
    Remove non-ASCII characters from a piece of text.

    Missing values (None, or the float NaN that pandas.read_csv produces
    for an empty cell) are mapped to an empty string instead of raising
    TypeError, so the function can safely be applied to a whole column.

    Arguments:
        text: str (None / NaN is accepted and treated as missing)
    returns:
        text: str made only of the characters whose code point is below 128
    '''
    # NaN is the only float value that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    return ''.join(ch for ch in text if ord(ch) < 128)

In [5]:
# Read the training split selected by `track` and preview its first five rows.
df_train = pd.read_csv(f'{BASE}{fins_train[track]}')
df_train.head(5)

Unnamed: 0,label,text
0,1,<user> if you are one of the <number> mil <has...
1,0,best <hashtag> law of attraction </hashtag> <h...
2,1,<hashtag> michelle obama </hashtag> is the mos...
3,0,smiling because life is good rite now ! <repea...
4,0,ã ¢ â  â ¤ ã ¯ â ¸ â  ã ¢ â  â ¤ ã ¯ â ¸ â ...


In [6]:
# Class-balance check: number of non-null values in each remaining column, per label.
df_train.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,2242
1,2242


In [7]:
# Strip non-ASCII characters from every training text. The column is overwritten
# in place; re-running the cell is harmless because already-clean text is unchanged.
df_train['text'] = df_train['text'].apply(cleanNonAscii)
df_train.head(5)

Unnamed: 0,label,text
0,1,<user> if you are one of the <number> mil <has...
1,0,best <hashtag> law of attraction </hashtag> <h...
2,1,<hashtag> michelle obama </hashtag> is the mos...
3,0,smiling because life is good rite now ! <repea...
4,0,<hashtag> ...


In [8]:
# Raw training texts and their labels as NumPy arrays.
X_train = df_train['text'].values
y_train = df_train['label'].values

In [9]:
# Read the test split selected by `track` and preview its first five rows.
df_test = pd.read_csv(f'{BASE}{fins_test[track]}')
df_test.head(5)

Unnamed: 0,label,text
0,1,<user> <user> the chinese are probably sprayin...
1,0,rt <user> : unpatriotic losers are tweeting ou...
2,1,<user> thus <hashtag> 2019 n co v </hashtag> i...
3,0,north korea closes borders to avoid coronaviru...
4,1,<user> this is a declaration of war . it prove...


In [10]:
# Class-balance check: number of non-null values in each remaining column, per label.
df_test.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,3898
1,3898


In [11]:
# Apply the same non-ASCII stripping to the test texts as was applied to the training texts.
# The column is overwritten in place; re-running the cell leaves already-clean text unchanged.
df_test['text'] = df_test['text'].apply(cleanNonAscii)
df_test.head(5)

Unnamed: 0,label,text
0,1,<user> <user> the chinese are probably sprayin...
1,0,rt <user> : unpatriotic losers are tweeting ou...
2,1,<user> thus <hashtag> 2019 n co v </hashtag> i...
3,0,north korea closes borders to avoid coronaviru...
4,1,<user> this is a declaration of war . it prove...


In [12]:
# Raw test texts and their labels as NumPy arrays.
X_test = df_test['text'].values
y_test = df_test['label'].values

### Transforming the data into a format suitable for the model

In [13]:
# Learn the vocabulary on the training texts only and build the dense
# bag-of-words count matrix in a single chained expression.
vectorizer = CountVectorizer()
Xtrain = vectorizer.fit_transform(X_train).toarray()

In [17]:
# (number of training documents, vocabulary size) of the count matrix.
Xtrain.shape

(4484, 9447)

In [19]:
# Preview the dense count matrix (NumPy abbreviates the display of large arrays).
Xtrain

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
# Encode the test texts with the vocabulary already fitted on the training set
# (transform only, no re-fitting) and densify the result.
Xtest = vectorizer.transform(X_test).toarray()

### Set the hyperparameters to search

In [None]:
# Hyper-parameter grid intended for GridSearchCV: each key is an XGBClassifier
# argument and each value is the list of candidates to try.
# NOTE(review): in the visible notebook this dict is only referenced by a
# commented-out GridSearchCV call, so it currently has no effect on training.
param_grid = {
                 'n_estimators': [180],
                 'max_depth': [140]
             }# best value for english

# param_grid = {
#                  'n_estimators': [120],
#                  'max_depth': [130]
#              }# best value for german

# param_grid = {
#                  'n_estimators': [120],
#                  'max_depth': [130]
#              }# best value for hindi

## Define and fit the model

In [20]:
# The grid search below is disabled; a single XGBClassifier with fixed
# hyper-parameters is trained instead.
# define model without training data
# clf_XGBClassifier = XGBClassifier()
# search for best hyper parameter values
# grid_xgbc = GridSearchCV(clf_XGBClassifier, param_grid, cv=10)

#print("Best parameters set found on development set:\n")
#print(grid_xgbc.best_params_)

# NOTE(review): despite its name, grid_xgbc is a plain XGBClassifier here, not a
# GridSearchCV object. The values used (n_estimators=125, max_depth=180) do not
# match any of the param_grid variants listed in this notebook -- confirm they are intended.
grid_xgbc = XGBClassifier(n_estimators=125, max_depth=180, n_jobs=8)
grid_xgbc.fit(Xtrain, y_train)

print(grid_xgbc)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=180,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=125, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


### Evaluating the model with test dataset

In [21]:
# Ground truth comes straight from the test labels; predictions come from the fitted classifier.
actual = y_test
predicted = grid_xgbc.predict(Xtest)

In [22]:
# Build the evaluator from the ground-truth labels and the model predictions, then print its report.
# EvaluateBinaryClassification is imported from the local evaluate_classification module.
ebc = EvaluateBinaryClassification(gnd_truths = actual, predictions = predicted)
print(ebc.get_full_report())

EvaluateBinaryClassification Object Created

Total Samples	7796
Positive Samples	3898
Negative Samples	3898
True Positive	2366
True Negative	1873
False Positive	2025
False Negative	1532
Accuracy	0.5437403796818882
Precision	0.5388294238214529
Recall	0.6069779374037968
F1 Measure	0.5708770659910725
Cohen Kappa Score	0.0874807593637763
Area Under Curve	0.5437403796818882

              precision    recall  f1-score   support

           0       0.55      0.48      0.51      3898
           1       0.54      0.61      0.57      3898

    accuracy                           0.54      7796
   macro avg       0.54      0.54      0.54      7796
weighted avg       0.54      0.54      0.54      7796



In [23]:
# Persist the evaluation report through the project helper.
# NOTE(review): `path` is a machine-specific absolute location that ends in '_' and looks like a
# file-name prefix rather than a directory -- confirm how save_full_report combines it with model_name.
ebc.save_full_report(model_name='XGB', path='C:\\Users\\User\\JupyterPythonPredator\\COVID19\\domain_adaptation_rerun_randomhate_eastasianhate_')