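Here I will show how to use a gradient-boosted tree model (XGBoost) on bag-of-words features for multi-class text classification (six emotion classes).

The classifier is `xgboost.XGBClassifier`; `sklearn.linear_model.SGDClassifier` is imported below as well, but it is not used in this notebook.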

In [1]:
from sklearn import metrics
import numpy as np
import sklearn.datasets
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import train_test_split
import xgboost as xgb



In [2]:
# Display the version string of the installed xgboost package.
xgb.__version__

'0.4'

Define some helper functions for preprocessing the text.

In [3]:
# normalise one piece of text: ASCII letters only, single-spaced, lower-case
def clearstring(string):
    """Return ``string`` as lower-case words separated by single spaces.

    Every character that is not ``A-Z``, ``a-z`` or a space is deleted
    (not replaced by a space, so ``"don't"`` becomes ``"dont"``). Runs of
    spaces collapse to one and leading/trailing spaces disappear. The
    result is an empty string when no letters remain.
    """
    letters_only = re.sub('[^A-Za-z ]+', '', string)
    # Splitting on a single space yields '' for every extra space; skip those.
    words = [chunk.strip() for chunk in letters_only.split(' ') if chunk]
    return ' '.join(words).lower()

# sklearn.datasets.load_files reads each file as one single document,
# so every document is split on newlines into individual samples
def separate_dataset(trainset):
    """Explode each document of ``trainset`` into one cleaned sample per line.

    ``trainset`` must expose ``data`` (a sequence of strings) and ``target``
    (indexable in parallel with ``data``). Each document is split on
    ``'\\n'``, empty lines are dropped, and every remaining line is passed
    through ``clearstring``. The label of the source document is repeated
    once for every line it produced.

    Returns a ``(datastring, datatarget)`` pair of equally long lists.
    """
    datastring, datatarget = [], []
    for idx, document in enumerate(trainset.data):
        cleaned_lines = [clearstring(line) for line in document.split('\n') if line]
        datastring.extend(cleaned_lines)
        # one copy of the document's label per line taken from it
        datatarget.extend([trainset.target[idx]] * len(cleaned_lines))
    return datastring, datatarget

In [4]:
# Read the corpus stored under 'data'; pick whichever encoding matches your files.
trainset = sklearn.datasets.load_files(container_path='data', encoding='UTF-8')
# Swap the per-file documents and labels for per-line samples and labels.
trainset.data, trainset.target = separate_dataset(trainset)
# Sanity check: class names, then sample and label counts (the two counts must agree).
for summary in (trainset.target_names, len(trainset.data), len(trainset.target)):
    print(summary)

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']
416809
416809


In [5]:
# Learn the bag-of-words vocabulary from every sample; with min_df=10 a term
# must occur in at least 10 samples to be kept.
# NOTE(review): the vocabulary is fitted on the full dataset, before the
# train/test split performed in a later cell.
bow = CountVectorizer(min_df=10).fit(trainset.data)

In [10]:
# Encode every sample as a row of token counts over the fitted vocabulary.
out = bow.transform(trainset.data)

In [11]:
# Convert the label list to a NumPy array before splitting.
trainset.target = np.array(trainset.target)
# Hold out 20% of the samples for evaluation. The fixed random_state makes the
# split, and therefore every score reported below, reproducible across runs.
train_X, test_X, train_Y, test_Y = train_test_split(
    out, trainset.target, test_size=0.2, random_state=42)

In [12]:
# Hyperparameters handed to XGBClassifier, grouped by role.
params_xgd = {
    # learning task
    'objective': 'multi:softprob',
    # boosting schedule: a large cap on rounds, early stopping picks the real count
    'n_estimators': 10000,
    'learning_rate': 0.1,
    # tree shape and regularisation
    'max_depth': 7,
    'min_child_weight': 10.0,
    'gamma': 0.65,
    'max_delta_step': 1.8,
    # row / column sub-sampling per tree
    'subsample': 0.8,
    'colsample_bytree': 0.4,
    # runtime behaviour
    'nthread': -1,  # presumably "use all available cores" - confirm in the xgboost docs
    'silent': False,
    }
clf = xgb.XGBClassifier(**params_xgd)
# mlogloss is tracked on both splits; training halts once the metric on the
# last eval_set entry (the held-out split) has not improved for 20 rounds.
clf.fit(
    train_X, train_Y,
    eval_set=[(train_X, train_Y), (test_X, test_Y)],
    eval_metric='mlogloss',
    early_stopping_rounds=20,
    verbose=True,
)

Will train until validation_1 error hasn't decreased in 20 rounds.
[0]	validation_0-mlogloss:1.748559	validation_1-mlogloss:1.748552
[1]	validation_0-mlogloss:1.708754	validation_1-mlogloss:1.708001
[2]	validation_0-mlogloss:1.669859	validation_1-mlogloss:1.669619
[3]	validation_0-mlogloss:1.634101	validation_1-mlogloss:1.634130
[4]	validation_0-mlogloss:1.603129	validation_1-mlogloss:1.602402
[5]	validation_0-mlogloss:1.570976	validation_1-mlogloss:1.570432
[6]	validation_0-mlogloss:1.541222	validation_1-mlogloss:1.541072
[7]	validation_0-mlogloss:1.513381	validation_1-mlogloss:1.513232
[8]	validation_0-mlogloss:1.487926	validation_1-mlogloss:1.487376
[9]	validation_0-mlogloss:1.461228	validation_1-mlogloss:1.460954
[10]	validation_0-mlogloss:1.438190	validation_1-mlogloss:1.437731
[11]	validation_0-mlogloss:1.415609	validation_1-mlogloss:1.415365
[12]	validation_0-mlogloss:1.392168	validation_1-mlogloss:1.391810
[13]	validation_0-mlogloss:1.372123	validation_1-mlogloss:1.371460
[14]	

[122]	validation_0-mlogloss:0.649581	validation_1-mlogloss:0.652214
[123]	validation_0-mlogloss:0.646926	validation_1-mlogloss:0.649606
[124]	validation_0-mlogloss:0.644286	validation_1-mlogloss:0.647010
[125]	validation_0-mlogloss:0.641706	validation_1-mlogloss:0.644418
[126]	validation_0-mlogloss:0.639058	validation_1-mlogloss:0.641840
[127]	validation_0-mlogloss:0.636535	validation_1-mlogloss:0.639315
[128]	validation_0-mlogloss:0.634055	validation_1-mlogloss:0.636809
[129]	validation_0-mlogloss:0.631465	validation_1-mlogloss:0.634351
[130]	validation_0-mlogloss:0.628967	validation_1-mlogloss:0.631872
[131]	validation_0-mlogloss:0.626510	validation_1-mlogloss:0.629464
[132]	validation_0-mlogloss:0.623993	validation_1-mlogloss:0.626986
[133]	validation_0-mlogloss:0.621553	validation_1-mlogloss:0.624572
[134]	validation_0-mlogloss:0.619137	validation_1-mlogloss:0.622276
[135]	validation_0-mlogloss:0.616687	validation_1-mlogloss:0.619864
[136]	validation_0-mlogloss:0.614301	validation_

[243]	validation_0-mlogloss:0.443033	validation_1-mlogloss:0.449633
[244]	validation_0-mlogloss:0.441999	validation_1-mlogloss:0.448593
[245]	validation_0-mlogloss:0.440910	validation_1-mlogloss:0.447516
[246]	validation_0-mlogloss:0.439865	validation_1-mlogloss:0.446490
[247]	validation_0-mlogloss:0.438780	validation_1-mlogloss:0.445441
[248]	validation_0-mlogloss:0.437709	validation_1-mlogloss:0.444392
[249]	validation_0-mlogloss:0.436685	validation_1-mlogloss:0.443364
[250]	validation_0-mlogloss:0.435646	validation_1-mlogloss:0.442337
[251]	validation_0-mlogloss:0.434600	validation_1-mlogloss:0.441304
[252]	validation_0-mlogloss:0.433576	validation_1-mlogloss:0.440327
[253]	validation_0-mlogloss:0.432568	validation_1-mlogloss:0.439341
[254]	validation_0-mlogloss:0.431555	validation_1-mlogloss:0.438377
[255]	validation_0-mlogloss:0.430533	validation_1-mlogloss:0.437412
[256]	validation_0-mlogloss:0.429518	validation_1-mlogloss:0.436444
[257]	validation_0-mlogloss:0.428466	validation_

[364]	validation_0-mlogloss:0.345212	validation_1-mlogloss:0.355271
[365]	validation_0-mlogloss:0.344620	validation_1-mlogloss:0.354724
[366]	validation_0-mlogloss:0.344036	validation_1-mlogloss:0.354196
[367]	validation_0-mlogloss:0.343454	validation_1-mlogloss:0.353650
[368]	validation_0-mlogloss:0.342831	validation_1-mlogloss:0.353086
[369]	validation_0-mlogloss:0.342287	validation_1-mlogloss:0.352542
[370]	validation_0-mlogloss:0.341731	validation_1-mlogloss:0.352005
[371]	validation_0-mlogloss:0.341173	validation_1-mlogloss:0.351485
[372]	validation_0-mlogloss:0.340639	validation_1-mlogloss:0.350969
[373]	validation_0-mlogloss:0.340069	validation_1-mlogloss:0.350424
[374]	validation_0-mlogloss:0.339519	validation_1-mlogloss:0.349913
[375]	validation_0-mlogloss:0.338964	validation_1-mlogloss:0.349394
[376]	validation_0-mlogloss:0.338399	validation_1-mlogloss:0.348855
[377]	validation_0-mlogloss:0.337871	validation_1-mlogloss:0.348321
[378]	validation_0-mlogloss:0.337330	validation_

[485]	validation_0-mlogloss:0.289510	validation_1-mlogloss:0.302822
[486]	validation_0-mlogloss:0.289159	validation_1-mlogloss:0.302486
[487]	validation_0-mlogloss:0.288781	validation_1-mlogloss:0.302126
[488]	validation_0-mlogloss:0.288439	validation_1-mlogloss:0.301810
[489]	validation_0-mlogloss:0.288095	validation_1-mlogloss:0.301497
[490]	validation_0-mlogloss:0.287754	validation_1-mlogloss:0.301185
[491]	validation_0-mlogloss:0.287420	validation_1-mlogloss:0.300868
[492]	validation_0-mlogloss:0.287062	validation_1-mlogloss:0.300543
[493]	validation_0-mlogloss:0.286730	validation_1-mlogloss:0.300203
[494]	validation_0-mlogloss:0.286386	validation_1-mlogloss:0.299885
[495]	validation_0-mlogloss:0.286060	validation_1-mlogloss:0.299563
[496]	validation_0-mlogloss:0.285711	validation_1-mlogloss:0.299247
[497]	validation_0-mlogloss:0.285373	validation_1-mlogloss:0.298955
[498]	validation_0-mlogloss:0.285038	validation_1-mlogloss:0.298637
[499]	validation_0-mlogloss:0.284700	validation_

[606]	validation_0-mlogloss:0.254309	validation_1-mlogloss:0.270414
[607]	validation_0-mlogloss:0.254089	validation_1-mlogloss:0.270219
[608]	validation_0-mlogloss:0.253869	validation_1-mlogloss:0.270001
[609]	validation_0-mlogloss:0.253629	validation_1-mlogloss:0.269790
[610]	validation_0-mlogloss:0.253372	validation_1-mlogloss:0.269553
[611]	validation_0-mlogloss:0.253141	validation_1-mlogloss:0.269358
[612]	validation_0-mlogloss:0.252920	validation_1-mlogloss:0.269161
[613]	validation_0-mlogloss:0.252612	validation_1-mlogloss:0.268868
[614]	validation_0-mlogloss:0.252392	validation_1-mlogloss:0.268683
[615]	validation_0-mlogloss:0.252179	validation_1-mlogloss:0.268497
[616]	validation_0-mlogloss:0.251965	validation_1-mlogloss:0.268306
[617]	validation_0-mlogloss:0.251753	validation_1-mlogloss:0.268106
[618]	validation_0-mlogloss:0.251532	validation_1-mlogloss:0.267884
[619]	validation_0-mlogloss:0.251301	validation_1-mlogloss:0.267676
[620]	validation_0-mlogloss:0.251087	validation_

[727]	validation_0-mlogloss:0.230022	validation_1-mlogloss:0.248998
[728]	validation_0-mlogloss:0.229857	validation_1-mlogloss:0.248866
[729]	validation_0-mlogloss:0.229689	validation_1-mlogloss:0.248738
[730]	validation_0-mlogloss:0.229542	validation_1-mlogloss:0.248614
[731]	validation_0-mlogloss:0.229374	validation_1-mlogloss:0.248477
[732]	validation_0-mlogloss:0.229197	validation_1-mlogloss:0.248329
[733]	validation_0-mlogloss:0.229004	validation_1-mlogloss:0.248152
[734]	validation_0-mlogloss:0.228827	validation_1-mlogloss:0.247989
[735]	validation_0-mlogloss:0.228669	validation_1-mlogloss:0.247864
[736]	validation_0-mlogloss:0.228509	validation_1-mlogloss:0.247727
[737]	validation_0-mlogloss:0.228350	validation_1-mlogloss:0.247601
[738]	validation_0-mlogloss:0.228177	validation_1-mlogloss:0.247449
[739]	validation_0-mlogloss:0.227987	validation_1-mlogloss:0.247291
[740]	validation_0-mlogloss:0.227819	validation_1-mlogloss:0.247139
[741]	validation_0-mlogloss:0.227673	validation_

[848]	validation_0-mlogloss:0.212841	validation_1-mlogloss:0.234601
[849]	validation_0-mlogloss:0.212699	validation_1-mlogloss:0.234469
[850]	validation_0-mlogloss:0.212587	validation_1-mlogloss:0.234374
[851]	validation_0-mlogloss:0.212462	validation_1-mlogloss:0.234267
[852]	validation_0-mlogloss:0.212354	validation_1-mlogloss:0.234179
[853]	validation_0-mlogloss:0.212178	validation_1-mlogloss:0.234062
[854]	validation_0-mlogloss:0.212038	validation_1-mlogloss:0.233930
[855]	validation_0-mlogloss:0.211924	validation_1-mlogloss:0.233850
[856]	validation_0-mlogloss:0.211815	validation_1-mlogloss:0.233749
[857]	validation_0-mlogloss:0.211704	validation_1-mlogloss:0.233659
[858]	validation_0-mlogloss:0.211569	validation_1-mlogloss:0.233566
[859]	validation_0-mlogloss:0.211456	validation_1-mlogloss:0.233467
[860]	validation_0-mlogloss:0.211337	validation_1-mlogloss:0.233387
[861]	validation_0-mlogloss:0.211212	validation_1-mlogloss:0.233298
[862]	validation_0-mlogloss:0.211091	validation_

[969]	validation_0-mlogloss:0.200022	validation_1-mlogloss:0.224537
[970]	validation_0-mlogloss:0.199938	validation_1-mlogloss:0.224461
[971]	validation_0-mlogloss:0.199852	validation_1-mlogloss:0.224401
[972]	validation_0-mlogloss:0.199764	validation_1-mlogloss:0.224347
[973]	validation_0-mlogloss:0.199677	validation_1-mlogloss:0.224279
[974]	validation_0-mlogloss:0.199591	validation_1-mlogloss:0.224210
[975]	validation_0-mlogloss:0.199510	validation_1-mlogloss:0.224154
[976]	validation_0-mlogloss:0.199407	validation_1-mlogloss:0.224070
[977]	validation_0-mlogloss:0.199324	validation_1-mlogloss:0.224001
[978]	validation_0-mlogloss:0.199241	validation_1-mlogloss:0.223943
[979]	validation_0-mlogloss:0.199154	validation_1-mlogloss:0.223888
[980]	validation_0-mlogloss:0.199072	validation_1-mlogloss:0.223825
[981]	validation_0-mlogloss:0.198979	validation_1-mlogloss:0.223754
[982]	validation_0-mlogloss:0.198885	validation_1-mlogloss:0.223677
[983]	validation_0-mlogloss:0.198805	validation_

[1089]	validation_0-mlogloss:0.190667	validation_1-mlogloss:0.217957
[1090]	validation_0-mlogloss:0.190602	validation_1-mlogloss:0.217905
[1091]	validation_0-mlogloss:0.190543	validation_1-mlogloss:0.217860
[1092]	validation_0-mlogloss:0.190478	validation_1-mlogloss:0.217815
[1093]	validation_0-mlogloss:0.190417	validation_1-mlogloss:0.217774
[1094]	validation_0-mlogloss:0.190356	validation_1-mlogloss:0.217730
[1095]	validation_0-mlogloss:0.190273	validation_1-mlogloss:0.217689
[1096]	validation_0-mlogloss:0.190209	validation_1-mlogloss:0.217652
[1097]	validation_0-mlogloss:0.190140	validation_1-mlogloss:0.217613
[1098]	validation_0-mlogloss:0.190072	validation_1-mlogloss:0.217561
[1099]	validation_0-mlogloss:0.189996	validation_1-mlogloss:0.217522
[1100]	validation_0-mlogloss:0.189934	validation_1-mlogloss:0.217482
[1101]	validation_0-mlogloss:0.189864	validation_1-mlogloss:0.217440
[1102]	validation_0-mlogloss:0.189803	validation_1-mlogloss:0.217412
[1103]	validation_0-mlogloss:0.189

[1208]	validation_0-mlogloss:0.183447	validation_1-mlogloss:0.213494
[1209]	validation_0-mlogloss:0.183393	validation_1-mlogloss:0.213469
[1210]	validation_0-mlogloss:0.183342	validation_1-mlogloss:0.213441
[1211]	validation_0-mlogloss:0.183290	validation_1-mlogloss:0.213405
[1212]	validation_0-mlogloss:0.183241	validation_1-mlogloss:0.213391
[1213]	validation_0-mlogloss:0.183186	validation_1-mlogloss:0.213359
[1214]	validation_0-mlogloss:0.183139	validation_1-mlogloss:0.213325
[1215]	validation_0-mlogloss:0.183088	validation_1-mlogloss:0.213293
[1216]	validation_0-mlogloss:0.183037	validation_1-mlogloss:0.213269
[1217]	validation_0-mlogloss:0.182979	validation_1-mlogloss:0.213254
[1218]	validation_0-mlogloss:0.182932	validation_1-mlogloss:0.213221
[1219]	validation_0-mlogloss:0.182883	validation_1-mlogloss:0.213184
[1220]	validation_0-mlogloss:0.182828	validation_1-mlogloss:0.213151
[1221]	validation_0-mlogloss:0.182778	validation_1-mlogloss:0.213118
[1222]	validation_0-mlogloss:0.182

[1327]	validation_0-mlogloss:0.177695	validation_1-mlogloss:0.210526
[1328]	validation_0-mlogloss:0.177656	validation_1-mlogloss:0.210512
[1329]	validation_0-mlogloss:0.177614	validation_1-mlogloss:0.210489
[1330]	validation_0-mlogloss:0.177562	validation_1-mlogloss:0.210466
[1331]	validation_0-mlogloss:0.177524	validation_1-mlogloss:0.210443
[1332]	validation_0-mlogloss:0.177485	validation_1-mlogloss:0.210422
[1333]	validation_0-mlogloss:0.177446	validation_1-mlogloss:0.210403
[1334]	validation_0-mlogloss:0.177396	validation_1-mlogloss:0.210383
[1335]	validation_0-mlogloss:0.177348	validation_1-mlogloss:0.210359
[1336]	validation_0-mlogloss:0.177313	validation_1-mlogloss:0.210339
[1337]	validation_0-mlogloss:0.177270	validation_1-mlogloss:0.210305
[1338]	validation_0-mlogloss:0.177229	validation_1-mlogloss:0.210274
[1339]	validation_0-mlogloss:0.177186	validation_1-mlogloss:0.210258
[1340]	validation_0-mlogloss:0.177148	validation_1-mlogloss:0.210237
[1341]	validation_0-mlogloss:0.177

[1446]	validation_0-mlogloss:0.173010	validation_1-mlogloss:0.208354
[1447]	validation_0-mlogloss:0.172978	validation_1-mlogloss:0.208342
[1448]	validation_0-mlogloss:0.172940	validation_1-mlogloss:0.208326
[1449]	validation_0-mlogloss:0.172900	validation_1-mlogloss:0.208316
[1450]	validation_0-mlogloss:0.172870	validation_1-mlogloss:0.208300
[1451]	validation_0-mlogloss:0.172839	validation_1-mlogloss:0.208289
[1452]	validation_0-mlogloss:0.172796	validation_1-mlogloss:0.208278
[1453]	validation_0-mlogloss:0.172765	validation_1-mlogloss:0.208259
[1454]	validation_0-mlogloss:0.172722	validation_1-mlogloss:0.208251
[1455]	validation_0-mlogloss:0.172688	validation_1-mlogloss:0.208240
[1456]	validation_0-mlogloss:0.172649	validation_1-mlogloss:0.208228
[1457]	validation_0-mlogloss:0.172602	validation_1-mlogloss:0.208193
[1458]	validation_0-mlogloss:0.172568	validation_1-mlogloss:0.208171
[1459]	validation_0-mlogloss:0.172533	validation_1-mlogloss:0.208160
[1460]	validation_0-mlogloss:0.172

[1565]	validation_0-mlogloss:0.169085	validation_1-mlogloss:0.207030
[1566]	validation_0-mlogloss:0.169056	validation_1-mlogloss:0.207027
[1567]	validation_0-mlogloss:0.169020	validation_1-mlogloss:0.207014
[1568]	validation_0-mlogloss:0.168992	validation_1-mlogloss:0.207006
[1569]	validation_0-mlogloss:0.168967	validation_1-mlogloss:0.206999
[1570]	validation_0-mlogloss:0.168937	validation_1-mlogloss:0.206990
[1571]	validation_0-mlogloss:0.168911	validation_1-mlogloss:0.206981
[1572]	validation_0-mlogloss:0.168888	validation_1-mlogloss:0.206974
[1573]	validation_0-mlogloss:0.168862	validation_1-mlogloss:0.206958
[1574]	validation_0-mlogloss:0.168832	validation_1-mlogloss:0.206947
[1575]	validation_0-mlogloss:0.168799	validation_1-mlogloss:0.206943
[1576]	validation_0-mlogloss:0.168772	validation_1-mlogloss:0.206931
[1577]	validation_0-mlogloss:0.168743	validation_1-mlogloss:0.206929
[1578]	validation_0-mlogloss:0.168715	validation_1-mlogloss:0.206924
[1579]	validation_0-mlogloss:0.168

[1684]	validation_0-mlogloss:0.165744	validation_1-mlogloss:0.206226
[1685]	validation_0-mlogloss:0.165721	validation_1-mlogloss:0.206225
[1686]	validation_0-mlogloss:0.165688	validation_1-mlogloss:0.206226
[1687]	validation_0-mlogloss:0.165668	validation_1-mlogloss:0.206220
[1688]	validation_0-mlogloss:0.165642	validation_1-mlogloss:0.206219
[1689]	validation_0-mlogloss:0.165612	validation_1-mlogloss:0.206217
[1690]	validation_0-mlogloss:0.165583	validation_1-mlogloss:0.206208
[1691]	validation_0-mlogloss:0.165553	validation_1-mlogloss:0.206197
[1692]	validation_0-mlogloss:0.165533	validation_1-mlogloss:0.206191
[1693]	validation_0-mlogloss:0.165507	validation_1-mlogloss:0.206185
[1694]	validation_0-mlogloss:0.165481	validation_1-mlogloss:0.206188
[1695]	validation_0-mlogloss:0.165457	validation_1-mlogloss:0.206176
[1696]	validation_0-mlogloss:0.165434	validation_1-mlogloss:0.206177
[1697]	validation_0-mlogloss:0.165404	validation_1-mlogloss:0.206163
[1698]	validation_0-mlogloss:0.165

[1803]	validation_0-mlogloss:0.162854	validation_1-mlogloss:0.205920
[1804]	validation_0-mlogloss:0.162835	validation_1-mlogloss:0.205927
[1805]	validation_0-mlogloss:0.162815	validation_1-mlogloss:0.205925
[1806]	validation_0-mlogloss:0.162790	validation_1-mlogloss:0.205927
[1807]	validation_0-mlogloss:0.162769	validation_1-mlogloss:0.205913
[1808]	validation_0-mlogloss:0.162748	validation_1-mlogloss:0.205909
[1809]	validation_0-mlogloss:0.162723	validation_1-mlogloss:0.205907
[1810]	validation_0-mlogloss:0.162705	validation_1-mlogloss:0.205905
[1811]	validation_0-mlogloss:0.162679	validation_1-mlogloss:0.205913
[1812]	validation_0-mlogloss:0.162653	validation_1-mlogloss:0.205906
[1813]	validation_0-mlogloss:0.162634	validation_1-mlogloss:0.205910
[1814]	validation_0-mlogloss:0.162614	validation_1-mlogloss:0.205901
[1815]	validation_0-mlogloss:0.162582	validation_1-mlogloss:0.205902
[1816]	validation_0-mlogloss:0.162562	validation_1-mlogloss:0.205894
Stopping. Best iteration:
[1796]	v

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.4,
       gamma=0.65, learning_rate=0.1, max_delta_step=1.8, max_depth=7,
       min_child_weight=10.0, missing=None, n_estimators=10000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.8)

In [13]:
# `metrics` is already imported in the first cell, so it is not re-imported here.
# Predict a class for every held-out sample and report the fraction that
# matches the true label.
predicted = clf.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# Per-class precision, recall, F1 and support, labelled with the class names.
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))

accuracy validation set:  0.895132074566
             precision    recall  f1-score   support

      anger       0.88      0.92      0.90     11421
       fear       0.83      0.84      0.84      9505
        joy       0.93      0.91      0.92     28132
       love       0.76      0.79      0.78      6801
    sadness       0.95      0.94      0.94     24481
   surprise       0.70      0.72      0.71      3022

avg / total       0.90      0.90      0.90     83362

