In [None]:
# !pip install xgboost

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV

# Initialise the random seed

In [None]:
# Integer seed intended for the stochastic parts of this notebook.
SEED = 123

# Loading Data

In [None]:
# Folder that holds the pre-processed HASOC 2020 fold files (trailing separator included).
BASE = 'D:\\ResearchDataGtx1060\\HASOC2020Datasets\\'

# Ten folds, numbered 1..10; each fold has a training file and an evaluation file.
fins_train = [f'train{fold}_prepro_hasoc_2020_en_train.csv' for fold in range(1, 11)]
fins_test = [f'eval{fold}_prepro_hasoc_2020_en_train.csv' for fold in range(1, 11)]

In [None]:
# Peek at the first training fold and show the distinct task-1 label values it contains.
df = pd.read_csv(f'{BASE}{fins_train[0]}')
df['task1'].unique()

In [None]:
def load_task_one(fin):
    """Load the texts and binary task-1 labels from a CSV file.

    Parameters
    ----------
    fin : str or path-like
        CSV file containing a ``text`` column and a ``task1`` column whose
        values are ``'NOT'`` or ``'HOF'``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(examples, labels)``: the texts and, row for row, their labels
        encoded as 0 for ``'NOT'`` and 1 for ``'HOF'``. Both arrays always
        have the same length.

    Raises
    ------
    ValueError
        If any row carries a ``task1`` value other than ``'NOT'`` / ``'HOF'``
        (including a missing value). Such rows previously contributed a text
        but no label, silently shifting every later label out of alignment.
    """
    label_ids = {'NOT': 0, 'HOF': 1}
    df = pd.read_csv(fin)

    # Vectorised lookup; values absent from the mapping come back as NaN.
    labels = df['task1'].map(label_ids)
    unknown = labels.isna()
    if unknown.any():
        bad_values = sorted(df.loc[unknown, 'task1'].astype(str).unique())
        raise ValueError(
            'Unexpected task1 label(s) %s in %d row(s) of %s; expected one of %s'
            % (bad_values, int(unknown.sum()), fin, sorted(label_ids))
        )

    # Go through plain lists so the resulting array dtypes match what
    # np.array() infers from Python str / int values.
    examples = np.array(df['text'].tolist())
    return examples, np.array(labels.astype(int).tolist())

In [None]:
# Only the first of the ten folds is used: its training file and its evaluation file.
X_train, y_train = load_task_one(f'{BASE}{fins_train[0]}')
X_test, y_test = load_task_one(f'{BASE}{fins_test[0]}')

In [None]:
# Learn the bag-of-words vocabulary on the training texts and keep the
# resulting count matrix as a dense array.
vectorizer = CountVectorizer()
Xtrain = vectorizer.fit_transform(X_train).toarray()

In [None]:
# Encode the evaluation texts with the vocabulary fitted on the training
# texts (transform only, no refit), again as a dense array.
Xtest = vectorizer.transform(X_test).toarray()

# Set hyperparameters to search

In [None]:
# Hyperparameter grid, currently holding the best values found for English.
param_grid = dict(
    n_estimators=[130],
    max_depth=[140],
)

# Best values recorded for the other languages (swap them in when needed):
#   German: n_estimators = [120], max_depth = [130]
#   Hindi:  n_estimators = [120], max_depth = [130]

# Fit the model

In [None]:
# Fit the model on the training data.
clf_XGBClassifier = XGBClassifier()
# Grid search is currently disabled; a single fixed configuration is trained instead.
#grid_xgbc = GridSearchCV(clf_XGBClassifier, param_grid, cv=1)
# NOTE: despite its name, grid_xgbc is a plain XGBClassifier with hard-coded
# hyperparameters. random_state is tied to the notebook-level SEED so that the
# run is reproducible.
grid_xgbc = XGBClassifier(n_estimators=125, max_depth=140, n_jobs=8, random_state=SEED)
grid_xgbc.fit(Xtrain, y_train)

print("Best parameters set found on development set:")
print()
# Only available when grid_xgbc is a fitted GridSearchCV object.
#print(grid_xgbc.best_params_)
print()

print(grid_xgbc)

# Evaluate the model on the test dataset

In [None]:
# Gold labels of the evaluation fold, and the class labels the fitted model assigns to it.
actual = y_test
predicted = grid_xgbc.predict(Xtest)

In [None]:
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import classification_report


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


predicted = np.array(predicted)

# Confusion-matrix counts for 0/1 labels: a product is non-zero only where
# both factors are non-zero, and (x - 1) is non-zero exactly where x == 0.
tp = np.count_nonzero(predicted * actual)
tn = np.count_nonzero((predicted - 1) * (actual - 1))
fp = np.count_nonzero(predicted * (actual - 1))
fn = np.count_nonzero((predicted - 1) * actual)

print('True Positive\t' + str(tp))
print('True Negative\t' + str(tn))
print('False Positive\t' + str(fp))
print('False Negative\t' + str(fn))

# Guarded divisions: a model that predicts no positives (tp + fp == 0) or a
# fold without positives (tp + fn == 0) yields 0.0 instead of raising
# ZeroDivisionError.
accuracy = _safe_ratio(tp + tn, tp + fp + fn + tn)
precision = _safe_ratio(tp, tp + fp)
recall = _safe_ratio(tp, tp + fn)
fmeasure = _safe_ratio(2 * precision * recall, precision + recall)
# The imported function name is deliberately rebound to the score here:
# the report cell further down reads `cohen_kappa_score` as a value.
cohen_kappa_score = cohen_kappa_score(predicted, actual)
# NOTE: both AUC figures are computed from the hard 0/1 predictions, not from
# predicted probabilities.
false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predicted)
auc_val = auc(false_positive_rate, true_positive_rate)
roc_auc_val = roc_auc_score(actual, predicted)

print('Accuracy\t' + str(accuracy))
print('Precision\t' + str(precision))
print('Recall\t' + str(recall))
print('f-measure\t' + str(fmeasure))
print('cohen_kappa_score\t' + str(cohen_kappa_score))
print('auc\t' + str(auc_val))
print('roc_auc\t' + str(roc_auc_val))

In [None]:
# Label written into the "Model Name" line of the results report.
model_name = "XGBoost"

In [None]:
import datetime

# Timestamp that heads this run's entry in the results log.
now = datetime.datetime.now()

# One list entry per report line; the entries are joined with newlines below.
report_lines = [
    '==========' + str(now) + '==============',
    # Language / Dataset / Task are left blank, as in the original template.
    'Language:\t',
    'Dataset:\t',
    'Task:\t',
    'Model Name:\t' + model_name,
    '-------------------------------------------------',
    # Class balance of the evaluation labels (the positive class is encoded as 1).
    'Total Samples:\t' + str(len(actual)),
    'Positive Samples:\t' + str(sum(actual)),
    'Negative Samples:\t' + str(len(actual) - sum(actual)),
    # Confusion-matrix counts.
    'True Positive:\t' + str(tp),
    'True Negative:\t' + str(tn),
    'False Positive:\t' + str(fp),
    'False Negative:\t' + str(fn),
    # Summary metrics computed in the evaluation cell.
    'Accuracy:\t' + str(accuracy),
    'Precision:\t' + str(precision),
    'Recall:\t' + str(recall),
    'F-measure:\t' + str(fmeasure),
    'Cohen_Kappa_Score:\t' + str(cohen_kappa_score),
    'AUC:\t' + str(auc_val),
    'ROC_AUC:\t' + str(roc_auc_val),
    # Blank separator line, then scikit-learn's per-class report.
    '',
    classification_report(actual, predicted),
]
out_string = '\n'.join(report_lines) + '\n'

print(out_string)

# Append, so that results from successive runs accumulate in the same file.
with open('BaselineResults_eng_task1.txt', 'a+') as log_file:
    log_file.write(out_string)