# GPU Extreme gradient boosting trained on timestamp text data-set

1. Same emotion dataset from [NLP-dataset](https://github.com/huseinzol05/NLP-Dataset)
2. Same 80% training / 20% testing split; exact scores may vary between runs because of randomness
3. Same regex substitution '[^\"\'A-Za-z0-9 ]+'

## Example

Each word is encoded as its position in the sorted dictionary:

text: 'module into which all the refactored classes', matrix: [167, 143, 12, 3, 4, 90]

In [1]:
import json
import pickle
import re
import time

import numpy as np
import sklearn.datasets
import xgboost as xgb
from sklearn import metrics
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20; newer
# releases expose train_test_split from sklearn.model_selection instead.
from sklearn.cross_validation import train_test_split



In [2]:
def clearstring(string):
    """Strip unwanted characters from ``string`` and normalise its spacing.

    Every character that is not an ASCII letter, a digit, a single or double
    quote, or a space is deleted. The remaining text is split on spaces,
    empty pieces are discarded, and the words are re-joined with exactly one
    space between them.

    Args:
        string: raw text to clean.

    Returns:
        The cleaned text; an empty string if nothing survives the cleaning.
    """
    cleaned = re.sub('[^\"\'A-Za-z0-9 ]+', '', string)
    # Consecutive spaces produce empty pieces when splitting on ' '; skip them.
    words = (piece.strip() for piece in cleaned.split(' ') if piece)
    return ' '.join(words)

def separate_dataset(trainset):
    """Break every loaded document into cleaned lines, one label per line.

    sklearn.datasets reads a whole file as a single element, so each document
    is split on newlines to recover the individual samples.

    Args:
        trainset: object with parallel ``data`` (document strings) and
            ``target`` (one label per document) sequences.

    Returns:
        A ``(datastring, datatarget)`` tuple of two equally long lists: the
        cleaned lines of all documents in order, and for each line the label
        of the document it came from.
    """
    datastring = []
    datatarget = []
    for doc_index, document in enumerate(trainset.data):
        # Empty lines are dropped before cleaning; a line that only becomes
        # empty after cleaning is still kept as a sample.
        cleaned_lines = [clearstring(line) for line in document.split('\n') if line]
        datastring.extend(cleaned_lines)
        # Repeat the document's label once for every line it contributed.
        datatarget.extend(trainset.target[doc_index] for _ in cleaned_lines)
    return datastring, datatarget

In [3]:
# Read every file under ./data as UTF-8 text, then replace the per-file
# documents and labels with per-line samples and labels.
trainset_data = sklearn.datasets.load_files(
    container_path='data',
    encoding='UTF-8',
)
trainset_data.data, trainset_data.target = separate_dataset(trainset_data)

In [4]:
# Load the pre-built vocabulary (presumably word -> integer index; it is used
# below to fill the numeric feature matrix).
# NOTE: pickle.load can execute arbitrary code; only load a file you trust.
with open('dictionary_emotion.p', 'rb') as dict_file:
    dict_emotion = pickle.load(dict_file)

In [5]:
# Each sample becomes a fixed-width row of `maxlen` slots, initialised to 0.
maxlen = 50
data_X = np.zeros(shape=(len(trainset_data.data), maxlen))

In [6]:
# Encode every sample as a row of dictionary indices.
# Only the first `maxlen` tokens are kept, and they are right-aligned: the last
# kept token lands in the last column and unused leading columns stay 0.
# NOTE(review): 0 therefore doubles as padding and "unknown word" -- confirm
# the dictionary does not assign index 0 to a real word.
for i in range(data_X.shape[0]):
    tokens = trainset_data.data[i].split()[:maxlen]
    for no, text in enumerate(tokens[::-1]):
        try:
            data_X[i, -1 - no] = dict_emotion[text]
        except KeyError:
            # Out-of-vocabulary token: leave its slot at 0. Any other error
            # (e.g. a non-numeric dictionary value) is a real bug and is no
            # longer silently swallowed.
            continue

In [7]:
# Hold out 20% of the samples for validation. The split is seeded so the same
# rows are held out after a kernel restart; with an unseeded split, a model
# reloaded from disk could be scored on rows it was trained on.
train_X, test_X, train_Y, test_Y = train_test_split(
    data_X, trainset_data.target, test_size=0.2, random_state=42
)

In [8]:
# Booster hyper-parameters, shared by xgb.train and xgb.Booster and later
# dumped to JSON alongside the saved model.
# The number of boosting rounds is intentionally NOT part of this dict: it is
# an argument of xgb.train (set in the training cell), not a booster
# parameter, so a 'num_boost_round' key here had no effect on training.
params_xgd = {
    'min_child_weight': 10.0,
    'objective': 'multi:softprob',  # predict() yields one probability per class
    'eval_metric': 'mlogloss',
    'num_class': len(trainset_data.target_names),
    'max_depth': 7,
    'max_delta_step': 1.8,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'eta': 0.03,
    'gamma': 0.65,
    'gpu_id': 0,
    'tree_method': 'gpu_hist'
    }

In [9]:
# Wrap both splits for XGBoost; the 'valid' entry is the one early stopping
# watches (see the training log below).
d_train = xgb.DMatrix(train_X, label=train_Y)
d_valid = xgb.DMatrix(test_X, label=test_Y)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Train for at most 100000 rounds, stopping once the validation loss has not
# improved for 200 rounds; progress is logged every 100 rounds.
started = time.time()
clf = xgb.train(
    params_xgd,
    d_train,
    num_boost_round=100000,
    evals=watchlist,
    early_stopping_rounds=200,
    maximize=False,
    verbose_eval=100,
)
print(round(time.time() - started, 3), 'Seconds to train xgb')

[0]	train-mlogloss:1.78348	valid-mlogloss:1.78357
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 200 rounds.
[100]	train-mlogloss:1.53407	valid-mlogloss:1.545
[200]	train-mlogloss:1.49369	valid-mlogloss:1.51609
[300]	train-mlogloss:1.46646	valid-mlogloss:1.50022
[400]	train-mlogloss:1.44417	valid-mlogloss:1.48872
[500]	train-mlogloss:1.42395	valid-mlogloss:1.47904
[600]	train-mlogloss:1.40675	valid-mlogloss:1.47175
[700]	train-mlogloss:1.39027	valid-mlogloss:1.46504
[800]	train-mlogloss:1.37454	valid-mlogloss:1.45882
[900]	train-mlogloss:1.36001	valid-mlogloss:1.45368
[1000]	train-mlogloss:1.34602	valid-mlogloss:1.44885
[1100]	train-mlogloss:1.33241	valid-mlogloss:1.4442
[1200]	train-mlogloss:1.31944	valid-mlogloss:1.44019
[1300]	train-mlogloss:1.30672	valid-mlogloss:1.4362
[1400]	train-mlogloss:1.29459	valid-mlogloss:1.43276
[1500]	train-mlogloss:1.2826	valid-mlogloss:1.42936
[1600]	train-ml

In [10]:
# Validation accuracy of the freshly trained booster, using only the trees up
# to the best early-stopping iteration.
valid_proba = clf.predict(xgb.DMatrix(test_X), ntree_limit=clf.best_ntree_limit)
valid_pred = np.argmax(valid_proba, axis=1)
np.mean(test_Y == valid_pred)

0.46755116240013433

In [11]:
# Persist the trained booster to disk so it can be reloaded without retraining.
clf.save_model('xgb-timestamp50.model')

In [12]:
# Rebuild a booster from the saved file to check that the model round-trips,
# and store the hyper-parameters as JSON next to it.
bst = xgb.Booster(params=params_xgd)
bst.load_model('xgb-timestamp50.model')
with open('xgb-timestamp-param', 'w') as param_file:
    json.dump(params_xgd, param_file)

In [13]:
# Validation accuracy of the reloaded booster.
# NOTE(review): unlike the evaluation of `clf` above, no ntree_limit is passed
# here, so presumably every boosted tree is used -- that would explain the
# slightly different score; confirm.
reloaded_proba = bst.predict(xgb.DMatrix(test_X))
reloaded_pred = np.argmax(reloaded_proba, axis=1)
np.mean(test_Y == reloaded_pred)

0.46752717065329524

In [14]:
# Per-class precision / recall / F1 of the reloaded booster on the held-out
# split. `metrics` is imported in the notebook's top import cell.
report_pred = np.argmax(bst.predict(xgb.DMatrix(test_X)), axis=1)
# classification_report returns a multi-line string, so print() it rather than
# relying on the cell's repr.
print(metrics.classification_report(test_Y, report_pred, target_names=trainset_data.target_names))

             precision    recall  f1-score   support

      anger       0.48      0.21      0.30     11320
       fear       0.45      0.18      0.25      9658
        joy       0.47      0.72      0.57     28342
       love       0.27      0.08      0.12      6901
    sadness       0.48      0.57      0.52     24103
   surprise       0.22      0.07      0.11      3038

avg / total       0.45      0.47      0.43     83362

