# GPU Light gradient boosting trained on TF-IDF reduced 50 dimensions

1. Same emotion dataset from [NLP-dataset](https://github.com/huseinzol05/NLP-Dataset)
2. Same splitting 80% training, 20% testing, may vary depends on randomness
3. Same regex substitution '[^\"\'A-Za-z0-9 ]+'

## Example

Based on Term-frequency Inverse document frequency

After that we apply SVD to reduce the dimensions, n_components = 50

In [8]:
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np
import re
import time
from sklearn.cross_validation import train_test_split
import sklearn.datasets
from sklearn import pipeline
from sklearn.model_selection import StratifiedKFold

In [2]:
def clearstring(string):
    string = re.sub('[^\"\'A-Za-z0-9 ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string]
    string = ' '.join(string)
    return string

# because of sklean.datasets read a document as a single element
# so we want to split based on new line
def separate_dataset(trainset):
    datastring = []
    datatarget = []
    for i in range(len(trainset.data)):
        data_ = trainset.data[i].split('\n')
        # python3, if python2, just remove list()
        data_ = list(filter(None, data_))
        for n in range(len(data_)):
            data_[n] = clearstring(data_[n])
        datastring += data_
        for n in range(len(data_)):
            datatarget.append(trainset.target[i])
    return datastring, datatarget

In [3]:
trainset_data = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
trainset_data.data, trainset_data.target = separate_dataset(trainset_data)

In [4]:
train_X, test_X, train_Y, test_Y = train_test_split(trainset_data.data, trainset_data.target, test_size = 0.2)

In [5]:
decompose = pipeline.Pipeline([('count', TfidfVectorizer()),
                               ('svd', TruncatedSVD(n_components=50))]).fit(trainset_data.data)

In [6]:
params_lgb = {
    'max_depth': 27, 
    'learning_rate': 0.03,
    'verbose': 50, 
    'early_stopping_round': 200,
    'metric': 'multi_logloss',
    'objective': 'multiclass',
    'num_classes': len(trainset_data.target_names),
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0
    }

In [10]:
train_X = decompose.transform(train_X)
test_X = decompose.transform(test_X)

In [11]:
d_train = lgb.Dataset(train_X, train_Y)
d_valid = lgb.Dataset(test_X, test_Y)
watchlist = [d_train, d_valid]
t=time.time()
clf = lgb.train(params_lgb, d_train, 100000, watchlist, early_stopping_rounds=200, verbose_eval=100)
print(round(time.time()-t, 3), 'Seconds to train lgb')

Training until validation scores don't improve for 200 rounds.
[100]	training's multi_logloss: 1.51641	valid_1's multi_logloss: 1.52991
[200]	training's multi_logloss: 1.48039	valid_1's multi_logloss: 1.50404
[300]	training's multi_logloss: 1.46016	valid_1's multi_logloss: 1.49379
[400]	training's multi_logloss: 1.44402	valid_1's multi_logloss: 1.48755
[500]	training's multi_logloss: 1.43032	valid_1's multi_logloss: 1.4837
[600]	training's multi_logloss: 1.41806	valid_1's multi_logloss: 1.4811
[700]	training's multi_logloss: 1.40679	valid_1's multi_logloss: 1.4791
[800]	training's multi_logloss: 1.39626	valid_1's multi_logloss: 1.47765
[900]	training's multi_logloss: 1.38603	valid_1's multi_logloss: 1.4765
[1000]	training's multi_logloss: 1.37627	valid_1's multi_logloss: 1.47559
[1100]	training's multi_logloss: 1.36678	valid_1's multi_logloss: 1.47482
[1200]	training's multi_logloss: 1.35761	valid_1's multi_logloss: 1.4741
[1300]	training's multi_logloss: 1.34862	valid_1's multi_loglos

In [12]:
from sklearn import metrics
print(metrics.classification_report(test_Y, np.argmax(clf.predict(test_X), axis = 1), target_names = trainset_data.target_names))

             precision    recall  f1-score   support

      anger       0.38      0.05      0.09     11460
       fear       0.32      0.06      0.10      9545
        joy       0.44      0.73      0.55     28052
       love       0.17      0.01      0.02      7015
    sadness       0.39      0.54      0.45     24291
   surprise       0.09      0.01      0.01      2999

avg / total       0.37      0.42      0.34     83362



In [13]:
clf.save_model('lgb-tfidf-svd50.model')