Here I will to show how to use linear model stochastic gradient descent on multi-class classification/discrimination

import class sklearn.linear_model.SGDClassifier

In [1]:
from sklearn import metrics
import numpy as np
import sklearn.datasets
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import train_test_split
import xgboost as xgb



In [2]:
xgb.__version__

'0.4'

Define some functions to help us on preprocessing

In [3]:
# clear string
def clearstring(string):
    string = re.sub('[^A-Za-z ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string]
    string = ' '.join(string)
    return string.lower()

# because of sklean.datasets read a document as a single element
# so we want to split based on new line
def separate_dataset(trainset):
    datastring = []
    datatarget = []
    for i in range(len(trainset.data)):
        data_ = trainset.data[i].split('\n')
        # python3, if python2, just remove list()
        data_ = list(filter(None, data_))
        for n in range(len(data_)):
            data_[n] = clearstring(data_[n])
        datastring += data_
        for n in range(len(data_)):
            datatarget.append(trainset.target[i])
    return datastring, datatarget

In [4]:
# you can change any encoding type
trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']
416809
416809


In [5]:
bow = TfidfVectorizer(min_df=10).fit(trainset.data)

In [6]:
out = bow.transform(trainset.data)

In [7]:
trainset.target = np.array(trainset.target)
train_X, test_X, train_Y, test_Y = train_test_split(out, trainset.target, test_size = 0.2)

In [8]:
params_xgd = {
    'min_child_weight': 10.0,
    'max_depth': 7,
    'objective': 'multi:softprob',
    'max_delta_step': 1.8,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'learning_rate': 0.1,
    'gamma': 0.65,
    'nthread': -1,
    'silent': False,
    'n_estimators': 10000
    }
clf = xgb.XGBClassifier(**params_xgd)
clf.fit(train_X,train_Y, eval_set=[(train_X,train_Y), (test_X,test_Y)], 
        eval_metric='mlogloss', early_stopping_rounds=20, verbose=True)

Will train until validation_1 error hasn't decreased in 20 rounds.
[0]	validation_0-mlogloss:1.748563	validation_1-mlogloss:1.748771
[1]	validation_0-mlogloss:1.708156	validation_1-mlogloss:1.708515
[2]	validation_0-mlogloss:1.670217	validation_1-mlogloss:1.670610
[3]	validation_0-mlogloss:1.634315	validation_1-mlogloss:1.634714
[4]	validation_0-mlogloss:1.603540	validation_1-mlogloss:1.603368
[5]	validation_0-mlogloss:1.571127	validation_1-mlogloss:1.571520
[6]	validation_0-mlogloss:1.541245	validation_1-mlogloss:1.542299
[7]	validation_0-mlogloss:1.513506	validation_1-mlogloss:1.514591
[8]	validation_0-mlogloss:1.487730	validation_1-mlogloss:1.489031
[9]	validation_0-mlogloss:1.461161	validation_1-mlogloss:1.462647
[10]	validation_0-mlogloss:1.438024	validation_1-mlogloss:1.439336
[11]	validation_0-mlogloss:1.415658	validation_1-mlogloss:1.416670
[12]	validation_0-mlogloss:1.392244	validation_1-mlogloss:1.393468
[13]	validation_0-mlogloss:1.371884	validation_1-mlogloss:1.373062
[14]	

[122]	validation_0-mlogloss:0.645132	validation_1-mlogloss:0.656173
[123]	validation_0-mlogloss:0.642527	validation_1-mlogloss:0.653661
[124]	validation_0-mlogloss:0.639873	validation_1-mlogloss:0.651042
[125]	validation_0-mlogloss:0.637184	validation_1-mlogloss:0.648354
[126]	validation_0-mlogloss:0.634528	validation_1-mlogloss:0.645705
[127]	validation_0-mlogloss:0.631929	validation_1-mlogloss:0.643201
[128]	validation_0-mlogloss:0.629423	validation_1-mlogloss:0.640803
[129]	validation_0-mlogloss:0.626832	validation_1-mlogloss:0.638293
[130]	validation_0-mlogloss:0.624345	validation_1-mlogloss:0.635882
[131]	validation_0-mlogloss:0.621879	validation_1-mlogloss:0.633484
[132]	validation_0-mlogloss:0.619398	validation_1-mlogloss:0.631075
[133]	validation_0-mlogloss:0.616862	validation_1-mlogloss:0.628561
[134]	validation_0-mlogloss:0.614368	validation_1-mlogloss:0.626198
[135]	validation_0-mlogloss:0.611941	validation_1-mlogloss:0.623843
[136]	validation_0-mlogloss:0.609550	validation_

[243]	validation_0-mlogloss:0.436463	validation_1-mlogloss:0.454514
[244]	validation_0-mlogloss:0.435403	validation_1-mlogloss:0.453491
[245]	validation_0-mlogloss:0.434285	validation_1-mlogloss:0.452449
[246]	validation_0-mlogloss:0.433183	validation_1-mlogloss:0.451370
[247]	validation_0-mlogloss:0.432103	validation_1-mlogloss:0.450366
[248]	validation_0-mlogloss:0.431040	validation_1-mlogloss:0.449303
[249]	validation_0-mlogloss:0.429993	validation_1-mlogloss:0.448316
[250]	validation_0-mlogloss:0.428950	validation_1-mlogloss:0.447310
[251]	validation_0-mlogloss:0.427910	validation_1-mlogloss:0.446356
[252]	validation_0-mlogloss:0.426901	validation_1-mlogloss:0.445392
[253]	validation_0-mlogloss:0.425828	validation_1-mlogloss:0.444408
[254]	validation_0-mlogloss:0.424814	validation_1-mlogloss:0.443432
[255]	validation_0-mlogloss:0.423791	validation_1-mlogloss:0.442432
[256]	validation_0-mlogloss:0.422776	validation_1-mlogloss:0.441431
[257]	validation_0-mlogloss:0.421779	validation_

[364]	validation_0-mlogloss:0.337987	validation_1-mlogloss:0.362297
[365]	validation_0-mlogloss:0.337378	validation_1-mlogloss:0.361734
[366]	validation_0-mlogloss:0.336809	validation_1-mlogloss:0.361219
[367]	validation_0-mlogloss:0.336229	validation_1-mlogloss:0.360692
[368]	validation_0-mlogloss:0.335671	validation_1-mlogloss:0.360166
[369]	validation_0-mlogloss:0.334893	validation_1-mlogloss:0.359437
[370]	validation_0-mlogloss:0.334319	validation_1-mlogloss:0.358932
[371]	validation_0-mlogloss:0.333727	validation_1-mlogloss:0.358369
[372]	validation_0-mlogloss:0.333170	validation_1-mlogloss:0.357862
[373]	validation_0-mlogloss:0.332604	validation_1-mlogloss:0.357357
[374]	validation_0-mlogloss:0.332036	validation_1-mlogloss:0.356848
[375]	validation_0-mlogloss:0.331441	validation_1-mlogloss:0.356321
[376]	validation_0-mlogloss:0.330893	validation_1-mlogloss:0.355809
[377]	validation_0-mlogloss:0.330335	validation_1-mlogloss:0.355329
[378]	validation_0-mlogloss:0.329764	validation_

[485]	validation_0-mlogloss:0.281086	validation_1-mlogloss:0.310591
[486]	validation_0-mlogloss:0.280738	validation_1-mlogloss:0.310255
[487]	validation_0-mlogloss:0.280365	validation_1-mlogloss:0.309927
[488]	validation_0-mlogloss:0.280020	validation_1-mlogloss:0.309636
[489]	validation_0-mlogloss:0.279675	validation_1-mlogloss:0.309317
[490]	validation_0-mlogloss:0.279258	validation_1-mlogloss:0.308972
[491]	validation_0-mlogloss:0.278900	validation_1-mlogloss:0.308642
[492]	validation_0-mlogloss:0.278546	validation_1-mlogloss:0.308318
[493]	validation_0-mlogloss:0.278201	validation_1-mlogloss:0.308025
[494]	validation_0-mlogloss:0.277837	validation_1-mlogloss:0.307719
[495]	validation_0-mlogloss:0.277491	validation_1-mlogloss:0.307428
[496]	validation_0-mlogloss:0.277166	validation_1-mlogloss:0.307128
[497]	validation_0-mlogloss:0.276819	validation_1-mlogloss:0.306823
[498]	validation_0-mlogloss:0.276483	validation_1-mlogloss:0.306516
[499]	validation_0-mlogloss:0.276157	validation_

[606]	validation_0-mlogloss:0.244753	validation_1-mlogloss:0.278774
[607]	validation_0-mlogloss:0.244525	validation_1-mlogloss:0.278570
[608]	validation_0-mlogloss:0.244287	validation_1-mlogloss:0.278372
[609]	validation_0-mlogloss:0.244051	validation_1-mlogloss:0.278158
[610]	validation_0-mlogloss:0.243737	validation_1-mlogloss:0.277897
[611]	validation_0-mlogloss:0.243507	validation_1-mlogloss:0.277693
[612]	validation_0-mlogloss:0.243261	validation_1-mlogloss:0.277493
[613]	validation_0-mlogloss:0.243020	validation_1-mlogloss:0.277286
[614]	validation_0-mlogloss:0.242765	validation_1-mlogloss:0.277095
[615]	validation_0-mlogloss:0.242465	validation_1-mlogloss:0.276849
[616]	validation_0-mlogloss:0.242238	validation_1-mlogloss:0.276665
[617]	validation_0-mlogloss:0.241949	validation_1-mlogloss:0.276413
[618]	validation_0-mlogloss:0.241724	validation_1-mlogloss:0.276240
[619]	validation_0-mlogloss:0.241497	validation_1-mlogloss:0.276063
[620]	validation_0-mlogloss:0.241270	validation_

[727]	validation_0-mlogloss:0.219920	validation_1-mlogloss:0.258422
[728]	validation_0-mlogloss:0.219753	validation_1-mlogloss:0.258293
[729]	validation_0-mlogloss:0.219587	validation_1-mlogloss:0.258162
[730]	validation_0-mlogloss:0.219426	validation_1-mlogloss:0.258033
[731]	validation_0-mlogloss:0.219274	validation_1-mlogloss:0.257904
[732]	validation_0-mlogloss:0.219107	validation_1-mlogloss:0.257776
[733]	validation_0-mlogloss:0.218946	validation_1-mlogloss:0.257651
[734]	validation_0-mlogloss:0.218789	validation_1-mlogloss:0.257537
[735]	validation_0-mlogloss:0.218632	validation_1-mlogloss:0.257423
[736]	validation_0-mlogloss:0.218482	validation_1-mlogloss:0.257307
[737]	validation_0-mlogloss:0.218298	validation_1-mlogloss:0.257149
[738]	validation_0-mlogloss:0.218141	validation_1-mlogloss:0.257012
[739]	validation_0-mlogloss:0.217979	validation_1-mlogloss:0.256888
[740]	validation_0-mlogloss:0.217815	validation_1-mlogloss:0.256759
[741]	validation_0-mlogloss:0.217652	validation_

[848]	validation_0-mlogloss:0.202248	validation_1-mlogloss:0.245092
[849]	validation_0-mlogloss:0.202131	validation_1-mlogloss:0.245014
[850]	validation_0-mlogloss:0.201997	validation_1-mlogloss:0.244910
[851]	validation_0-mlogloss:0.201851	validation_1-mlogloss:0.244801
[852]	validation_0-mlogloss:0.201730	validation_1-mlogloss:0.244717
[853]	validation_0-mlogloss:0.201582	validation_1-mlogloss:0.244624
[854]	validation_0-mlogloss:0.201464	validation_1-mlogloss:0.244531
[855]	validation_0-mlogloss:0.201348	validation_1-mlogloss:0.244426
[856]	validation_0-mlogloss:0.201229	validation_1-mlogloss:0.244339
[857]	validation_0-mlogloss:0.201116	validation_1-mlogloss:0.244251
[858]	validation_0-mlogloss:0.201011	validation_1-mlogloss:0.244160
[859]	validation_0-mlogloss:0.200871	validation_1-mlogloss:0.244083
[860]	validation_0-mlogloss:0.200759	validation_1-mlogloss:0.244007
[861]	validation_0-mlogloss:0.200634	validation_1-mlogloss:0.243928
[862]	validation_0-mlogloss:0.200502	validation_

[969]	validation_0-mlogloss:0.188959	validation_1-mlogloss:0.235768
[970]	validation_0-mlogloss:0.188867	validation_1-mlogloss:0.235702
[971]	validation_0-mlogloss:0.188776	validation_1-mlogloss:0.235643
[972]	validation_0-mlogloss:0.188685	validation_1-mlogloss:0.235572
[973]	validation_0-mlogloss:0.188571	validation_1-mlogloss:0.235485
[974]	validation_0-mlogloss:0.188460	validation_1-mlogloss:0.235411
[975]	validation_0-mlogloss:0.188351	validation_1-mlogloss:0.235330
[976]	validation_0-mlogloss:0.188206	validation_1-mlogloss:0.235207
[977]	validation_0-mlogloss:0.188071	validation_1-mlogloss:0.235079
[978]	validation_0-mlogloss:0.187961	validation_1-mlogloss:0.234992
[979]	validation_0-mlogloss:0.187850	validation_1-mlogloss:0.234902
[980]	validation_0-mlogloss:0.187767	validation_1-mlogloss:0.234843
[981]	validation_0-mlogloss:0.187642	validation_1-mlogloss:0.234730
[982]	validation_0-mlogloss:0.187539	validation_1-mlogloss:0.234656
[983]	validation_0-mlogloss:0.187432	validation_

[1089]	validation_0-mlogloss:0.178502	validation_1-mlogloss:0.229093
[1090]	validation_0-mlogloss:0.178418	validation_1-mlogloss:0.229072
[1091]	validation_0-mlogloss:0.178352	validation_1-mlogloss:0.229028
[1092]	validation_0-mlogloss:0.178261	validation_1-mlogloss:0.228995
[1093]	validation_0-mlogloss:0.178194	validation_1-mlogloss:0.228959
[1094]	validation_0-mlogloss:0.178102	validation_1-mlogloss:0.228934
[1095]	validation_0-mlogloss:0.178039	validation_1-mlogloss:0.228884
[1096]	validation_0-mlogloss:0.177975	validation_1-mlogloss:0.228843
[1097]	validation_0-mlogloss:0.177893	validation_1-mlogloss:0.228799
[1098]	validation_0-mlogloss:0.177825	validation_1-mlogloss:0.228748
[1099]	validation_0-mlogloss:0.177746	validation_1-mlogloss:0.228712
[1100]	validation_0-mlogloss:0.177676	validation_1-mlogloss:0.228679
[1101]	validation_0-mlogloss:0.177598	validation_1-mlogloss:0.228628
[1102]	validation_0-mlogloss:0.177528	validation_1-mlogloss:0.228597
[1103]	validation_0-mlogloss:0.177

[1208]	validation_0-mlogloss:0.170660	validation_1-mlogloss:0.225341
[1209]	validation_0-mlogloss:0.170600	validation_1-mlogloss:0.225317
[1210]	validation_0-mlogloss:0.170538	validation_1-mlogloss:0.225277
[1211]	validation_0-mlogloss:0.170481	validation_1-mlogloss:0.225240
[1212]	validation_0-mlogloss:0.170418	validation_1-mlogloss:0.225207
[1213]	validation_0-mlogloss:0.170355	validation_1-mlogloss:0.225170
[1214]	validation_0-mlogloss:0.170299	validation_1-mlogloss:0.225150
[1215]	validation_0-mlogloss:0.170244	validation_1-mlogloss:0.225121
[1216]	validation_0-mlogloss:0.170192	validation_1-mlogloss:0.225103
[1217]	validation_0-mlogloss:0.170140	validation_1-mlogloss:0.225069
[1218]	validation_0-mlogloss:0.170085	validation_1-mlogloss:0.225038
[1219]	validation_0-mlogloss:0.170036	validation_1-mlogloss:0.225005
[1220]	validation_0-mlogloss:0.169976	validation_1-mlogloss:0.224984
[1221]	validation_0-mlogloss:0.169910	validation_1-mlogloss:0.224964
[1222]	validation_0-mlogloss:0.169

[1327]	validation_0-mlogloss:0.164337	validation_1-mlogloss:0.222906
[1328]	validation_0-mlogloss:0.164291	validation_1-mlogloss:0.222891
[1329]	validation_0-mlogloss:0.164245	validation_1-mlogloss:0.222875
[1330]	validation_0-mlogloss:0.164194	validation_1-mlogloss:0.222866
[1331]	validation_0-mlogloss:0.164148	validation_1-mlogloss:0.222855
[1332]	validation_0-mlogloss:0.164106	validation_1-mlogloss:0.222831
[1333]	validation_0-mlogloss:0.164051	validation_1-mlogloss:0.222814
[1334]	validation_0-mlogloss:0.164011	validation_1-mlogloss:0.222798
[1335]	validation_0-mlogloss:0.163970	validation_1-mlogloss:0.222783
[1336]	validation_0-mlogloss:0.163927	validation_1-mlogloss:0.222766
[1337]	validation_0-mlogloss:0.163878	validation_1-mlogloss:0.222750
[1338]	validation_0-mlogloss:0.163836	validation_1-mlogloss:0.222743
[1339]	validation_0-mlogloss:0.163786	validation_1-mlogloss:0.222739
[1340]	validation_0-mlogloss:0.163749	validation_1-mlogloss:0.222716
[1341]	validation_0-mlogloss:0.163

[1446]	validation_0-mlogloss:0.159058	validation_1-mlogloss:0.221434
[1447]	validation_0-mlogloss:0.159011	validation_1-mlogloss:0.221437
[1448]	validation_0-mlogloss:0.158971	validation_1-mlogloss:0.221425
[1449]	validation_0-mlogloss:0.158931	validation_1-mlogloss:0.221414
[1450]	validation_0-mlogloss:0.158893	validation_1-mlogloss:0.221398
[1451]	validation_0-mlogloss:0.158853	validation_1-mlogloss:0.221402
[1452]	validation_0-mlogloss:0.158809	validation_1-mlogloss:0.221389
[1453]	validation_0-mlogloss:0.158772	validation_1-mlogloss:0.221378
[1454]	validation_0-mlogloss:0.158731	validation_1-mlogloss:0.221386
[1455]	validation_0-mlogloss:0.158681	validation_1-mlogloss:0.221381
[1456]	validation_0-mlogloss:0.158635	validation_1-mlogloss:0.221367
[1457]	validation_0-mlogloss:0.158602	validation_1-mlogloss:0.221357
[1458]	validation_0-mlogloss:0.158558	validation_1-mlogloss:0.221352
[1459]	validation_0-mlogloss:0.158525	validation_1-mlogloss:0.221343
[1460]	validation_0-mlogloss:0.158

[1565]	validation_0-mlogloss:0.154657	validation_1-mlogloss:0.220751
[1566]	validation_0-mlogloss:0.154626	validation_1-mlogloss:0.220748
[1567]	validation_0-mlogloss:0.154579	validation_1-mlogloss:0.220740
[1568]	validation_0-mlogloss:0.154541	validation_1-mlogloss:0.220739
[1569]	validation_0-mlogloss:0.154512	validation_1-mlogloss:0.220735
[1570]	validation_0-mlogloss:0.154478	validation_1-mlogloss:0.220730
[1571]	validation_0-mlogloss:0.154443	validation_1-mlogloss:0.220732
[1572]	validation_0-mlogloss:0.154404	validation_1-mlogloss:0.220730
[1573]	validation_0-mlogloss:0.154373	validation_1-mlogloss:0.220722
[1574]	validation_0-mlogloss:0.154337	validation_1-mlogloss:0.220719
[1575]	validation_0-mlogloss:0.154307	validation_1-mlogloss:0.220721
[1576]	validation_0-mlogloss:0.154277	validation_1-mlogloss:0.220728
[1577]	validation_0-mlogloss:0.154250	validation_1-mlogloss:0.220724
[1578]	validation_0-mlogloss:0.154212	validation_1-mlogloss:0.220734
[1579]	validation_0-mlogloss:0.154

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.4,
       gamma=0.65, learning_rate=0.1, max_delta_step=1.8, max_depth=7,
       min_child_weight=10.0, missing=None, n_estimators=10000, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=0.8)

In [9]:
from sklearn import metrics
predicted = clf.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))

accuracy validation set:  0.885415417097
             precision    recall  f1-score   support

      anger       0.88      0.90      0.89     11414
       fear       0.81      0.83      0.82      9584
        joy       0.92      0.91      0.91     28269
       love       0.74      0.77      0.75      6878
    sadness       0.95      0.93      0.94     24196
   surprise       0.66      0.67      0.67      3021

avg / total       0.89      0.89      0.89     83362

