In [1]:
from sklearn import metrics
import numpy as np
import sklearn.datasets
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import train_test_split
import lightgbm as lgb



In [2]:
lgb.__version__

'2.0.10'

In [3]:
# clear string
def clearstring(string):
    string = re.sub('[^A-Za-z ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string]
    string = ' '.join(string)
    return string.lower()

# because of sklean.datasets read a document as a single element
# so we want to split based on new line
def separate_dataset(trainset):
    datastring = []
    datatarget = []
    for i in range(len(trainset.data)):
        data_ = trainset.data[i].split('\n')
        # python3, if python2, just remove list()
        data_ = list(filter(None, data_))
        for n in range(len(data_)):
            data_[n] = clearstring(data_[n])
        datastring += data_
        for n in range(len(data_)):
            datatarget.append(trainset.target[i])
    return datastring, datatarget

In [4]:
# you can change any encoding type
trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']
416809
416809


In [5]:
tfidf = TfidfVectorizer(min_df=10).fit(trainset.data)

In [6]:
out = tfidf.transform(trainset.data)

In [7]:
trainset.target = np.array(trainset.target)
train_X, test_X, train_Y, test_Y = train_test_split(out, trainset.target, test_size = 0.2)

In [11]:
params_lgd = {
    'boosting_type': 'dart',
    'objective': 'multiclass',
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'learning_rate': 0.1,
    'silent': False,
    'n_estimators': 10000,
    'reg_lambda': 0.0005,
    'device':'gpu'
    }
clf = lgb.LGBMClassifier(**params_lgd)
clf.fit(train_X,train_Y, eval_set=[(train_X,train_Y), (test_X,test_Y)], 
        eval_metric='logloss', early_stopping_rounds=20, verbose=True)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


[1]	valid_0's multi_logloss: 1.71011	valid_1's multi_logloss: 1.71044
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's multi_logloss: 1.62928	valid_1's multi_logloss: 1.62947
[3]	valid_0's multi_logloss: 1.56092	valid_1's multi_logloss: 1.56093
[4]	valid_0's multi_logloss: 1.4939	valid_1's multi_logloss: 1.4937
[5]	valid_0's multi_logloss: 1.43481	valid_1's multi_logloss: 1.43446
[6]	valid_0's multi_logloss: 1.38164	valid_1's multi_logloss: 1.38169
[7]	valid_0's multi_logloss: 1.33566	valid_1's multi_logloss: 1.33626
[8]	valid_0's multi_logloss: 1.35761	valid_1's multi_logloss: 1.35815
[9]	valid_0's multi_logloss: 1.30537	valid_1's multi_logloss: 1.30536
[10]	valid_0's multi_logloss: 1.2587	valid_1's multi_logloss: 1.25924
[11]	valid_0's multi_logloss: 1.21731	valid_1's multi_logloss: 1.21784
[12]	valid_0's multi_logloss: 1.23303	valid_1's multi_logloss: 1.23349
[13]	valid_0's multi_logloss: 1.1928	valid_1's multi_logloss: 1.19306
[14]	valid_0's multi_logloss

[114]	valid_0's multi_logloss: 0.644173	valid_1's multi_logloss: 0.645697
[115]	valid_0's multi_logloss: 0.633144	valid_1's multi_logloss: 0.634696
[116]	valid_0's multi_logloss: 0.622581	valid_1's multi_logloss: 0.624198
[117]	valid_0's multi_logloss: 0.627606	valid_1's multi_logloss: 0.629208
[118]	valid_0's multi_logloss: 0.618894	valid_1's multi_logloss: 0.620438
[119]	valid_0's multi_logloss: 0.62397	valid_1's multi_logloss: 0.625505
[120]	valid_0's multi_logloss: 0.630002	valid_1's multi_logloss: 0.631532
[121]	valid_0's multi_logloss: 0.635848	valid_1's multi_logloss: 0.637364
[122]	valid_0's multi_logloss: 0.642156	valid_1's multi_logloss: 0.643655
[123]	valid_0's multi_logloss: 0.63234	valid_1's multi_logloss: 0.633817
[124]	valid_0's multi_logloss: 0.623056	valid_1's multi_logloss: 0.624474
[125]	valid_0's multi_logloss: 0.628607	valid_1's multi_logloss: 0.63002
[126]	valid_0's multi_logloss: 0.63363	valid_1's multi_logloss: 0.635061
[127]	valid_0's multi_logloss: 0.624202	va

LGBMClassifier(boosting_type='dart', colsample_bytree=0.4, device='gpu',
        learning_rate=0.1, max_bin=255, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=10000,
        n_jobs=-1, num_leaves=31, objective='multiclass',
        random_state=None, reg_alpha=0.0, reg_lambda=0.0005, silent=False,
        subsample=0.8, subsample_for_bin=200000, subsample_freq=1)

In [12]:
from sklearn import metrics
predicted = clf.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))

accuracy validation set:  0.902245627504
             precision    recall  f1-score   support

      anger       0.91      0.91      0.91     11550
       fear       0.84      0.89      0.86      9455
        joy       0.92      0.92      0.92     28299
       love       0.77      0.88      0.82      6910
    sadness       0.96      0.92      0.94     24111
   surprise       0.82      0.70      0.75      3037

avg / total       0.90      0.90      0.90     83362

