Here I will show how to use a linear model trained with stochastic gradient descent (SGD) for multi-class classification/discrimination.

We import the class `sklearn.linear_model.SGDClassifier`.

In [1]:
from sklearn import metrics
import numpy as np
import sklearn.datasets
import re
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import train_test_split



Define some helper functions for preprocessing.

In [2]:
# clear string
def clearstring(string):
    """Strip unwanted characters from ``string`` and normalise its spacing.

    Every character other than ASCII letters, digits, single/double quotes
    and the space character is deleted (not replaced by a space, so
    ``"a,b"`` becomes ``"ab"``). The remaining text is split on spaces,
    empty pieces are discarded, and the words are re-joined with exactly
    one space between them.
    """
    kept = re.sub('[^\"\'A-Za-z0-9 ]+', '', string)
    # Splitting on ' ' yields '' for leading/trailing/repeated spaces; the
    # truthiness test drops those pieces.
    words = [piece.strip() for piece in kept.split(' ') if piece]
    return ' '.join(words)

# sklearn.datasets.load_files reads each file as a single document (one string),
# so we split every document on newlines to get one sample per line
def separate_dataset(trainset):
    """Explode each document of ``trainset`` into one cleaned sample per line.

    ``trainset`` must expose ``data`` (a sequence of strings) and ``target``
    (a sequence indexable in parallel with ``data``). Each document is split
    on newlines, lines that are empty *before* cleaning are dropped, and
    every remaining line is passed through ``clearstring``. A line that only
    becomes empty after cleaning is still kept (as ``''``).

    Returns a ``(sentences, labels)`` pair of equally long lists, where
    ``labels[k]`` is the target of the document ``sentences[k]`` came from.
    """
    sentences = []
    labels = []
    for doc_index, document in enumerate(trainset.data):
        cleaned_lines = [
            clearstring(line) for line in document.split('\n') if line
        ]
        sentences.extend(cleaned_lines)
        # One copy of the document's label per sentence taken from it.
        labels.extend(trainset.target[doc_index] for _ in cleaned_lines)
    return sentences, labels

In [3]:
# Load the corpus from the 'data' directory; change `encoding` if your files
# are not UTF-8.
trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
# Replace the per-file documents with one cleaned sample (and label) per line.
trainset.data, trainset.target = separate_dataset(trainset)
# Sanity check: class names, then the sample and label counts (should match).
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']
416809
416809


In [4]:
# bag-of-words: fit the vocabulary and build the term-count matrix
# NOTE(review): all three vectorizers are fit on the full dataset here, before
# the train/test splits done in later cells, so vocabulary/IDF statistics of
# the held-out rows are seen at fit time -- confirm this is acceptable.
bow = CountVectorizer().fit_transform(trainset.data)

# tf-idf: re-weights the BOW counts, so it must be computed from `bow`
tfidf = TfidfTransformer().fit_transform(bow)

# hashing with the default n_features; non_negative = True is requested so the
# resulting feature values are not negative
# NOTE(review): `non_negative` was deprecated in scikit-learn 0.19 (replaced by
# `alternate_sign=False`) and later removed -- confirm the target version.
hashing = HashingVectorizer(non_negative = True).fit_transform(trainset.data)



#### `loss` can be one of {'modified_huber', 'hinge', 'log', 'squared_hinge', 'perceptron'}

The default is `hinge`, which gives you a classic linear SVM.

`perceptron` is the linear loss used by the perceptron algorithm.

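`log` gives logistic regression; `modified_huber` is a smooth loss that is tolerant to outliers. Both of them support probability estimates.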

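#### `penalty` can be {'l1', 'l2'}; it is the regularization term added to the loss to prevent overfitting

l1 = sum of the absolute values of the weights (lasso-style; tends to give sparse weights)

l2 = sum of the squared weights (ridge-style; the standard regularizer for linear SVM models)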

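#### `alpha` is the constant that multiplies the regularization term (higher = stronger regularization); it is not the learning rate itself, although the default `learning_rate='optimal'` schedule is computed from it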

#### `n_iter` is the number of epochs (passes over the training data)

In [5]:
# Random 80/20 split of the BOW matrix (no random_state is passed, so the
# split and the scores below change from run to run).
train_X, test_X, train_Y, test_Y = train_test_split(
    bow, trainset.target, test_size=0.2)

# Build the estimator, then fit it as a separate step.
# NOTE(review): `n_iter` was deprecated in scikit-learn 0.19 in favour of
# `max_iter` and later removed -- confirm the target version.
mod_huber = SGDClassifier(loss='modified_huber', penalty='l2',
                          alpha=1e-3, n_iter=10)
mod_huber.fit(train_X, train_Y)
predicted = mod_huber.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# per-class precision / recall / f1 on the held-out 20%
print(metrics.classification_report(test_Y, predicted,
                                    target_names=trainset.target_names))



accuracy validation set:  0.898586886111
             precision    recall  f1-score   support

      anger       0.91      0.88      0.90     11422
       fear       0.84      0.87      0.86      9495
        joy       0.90      0.94      0.92     28138
       love       0.84      0.74      0.79      6970
    sadness       0.93      0.94      0.94     24380
   surprise       0.85      0.65      0.73      2957

avg / total       0.90      0.90      0.90     83362



In [6]:
# Same experiment as with BOW, but on the TF-IDF matrix (random 80/20 split,
# no random_state).
train_X, test_X, train_Y, test_Y = train_test_split(
    tfidf, trainset.target, test_size=0.2)

# Build the estimator, then fit it as a separate step.
# NOTE(review): `n_iter` was deprecated in scikit-learn 0.19 in favour of
# `max_iter` and later removed -- confirm the target version.
mod_huber = SGDClassifier(loss='modified_huber', penalty='l2',
                          alpha=1e-3, n_iter=10)
mod_huber.fit(train_X, train_Y)
predicted = mod_huber.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# per-class precision / recall / f1 on the held-out 20%
print(metrics.classification_report(test_Y, predicted,
                                    target_names=trainset.target_names))



accuracy validation set:  0.850915285142
             precision    recall  f1-score   support

      anger       0.93      0.75      0.83     11542
       fear       0.88      0.73      0.79      9610
        joy       0.79      0.97      0.87     28110
       love       0.92      0.55      0.69      6883
    sadness       0.88      0.94      0.91     24230
   surprise       0.91      0.46      0.61      2987

avg / total       0.86      0.85      0.84     83362



In [7]:
# Same experiment again, this time on the hashed features (random 80/20
# split, no random_state).
train_X, test_X, train_Y, test_Y = train_test_split(
    hashing, trainset.target, test_size=0.2)

# Build the estimator, then fit it as a separate step.
# NOTE(review): `n_iter` was deprecated in scikit-learn 0.19 in favour of
# `max_iter` and later removed -- confirm the target version.
mod_huber = SGDClassifier(loss='modified_huber', penalty='l2',
                          alpha=1e-3, n_iter=10)
mod_huber.fit(train_X, train_Y)
predicted = mod_huber.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# per-class precision / recall / f1 on the held-out 20%
print(metrics.classification_report(test_Y, predicted,
                                    target_names=trainset.target_names))



accuracy validation set:  0.791163839639
             precision    recall  f1-score   support

      anger       0.92      0.64      0.76     11592
       fear       0.87      0.59      0.70      9557
        joy       0.71      0.97      0.82     28068
       love       0.94      0.40      0.56      6933
    sadness       0.83      0.90      0.87     24273
   surprise       0.91      0.34      0.49      2939

avg / total       0.82      0.79      0.78     83362



In these runs, BOW got the highest accuracy of the three vectorizers (BOW, TF-IDF and hashing).

Now let us try the other linear-model loss functions as classifiers; I will use BOW as the vectorizer.

In [8]:
# Fresh random 80/20 split of the BOW matrix (no random_state).
train_X, test_X, train_Y, test_Y = train_test_split(
    bow, trainset.target, test_size=0.2)

# No `loss` is given, so SGDClassifier falls back to its default loss.
# NOTE(review): `n_iter` was deprecated in scikit-learn 0.19 in favour of
# `max_iter` and later removed -- confirm the target version.
svm = SGDClassifier(penalty='l2', alpha=1e-3, n_iter=10)
svm.fit(train_X, train_Y)
predicted = svm.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# per-class precision / recall / f1 on the held-out 20%
print(metrics.classification_report(test_Y, predicted,
                                    target_names=trainset.target_names))



accuracy validation set:  0.896079748566
             precision    recall  f1-score   support

      anger       0.91      0.88      0.89     11440
       fear       0.86      0.85      0.86      9525
        joy       0.89      0.95      0.92     28411
       love       0.89      0.69      0.77      6839
    sadness       0.92      0.95      0.93     24074
   surprise       0.89      0.63      0.74      3073

avg / total       0.90      0.90      0.89     83362



In [9]:
# Fresh random 80/20 split of the BOW matrix (no random_state).
train_X, test_X, train_Y, test_Y = train_test_split(
    bow, trainset.target, test_size=0.2)

# Squared-hinge loss variant; build the estimator, then fit it.
# NOTE(review): `n_iter` was deprecated in scikit-learn 0.19 in favour of
# `max_iter` and later removed -- confirm the target version.
sq_hinge = SGDClassifier(loss='squared_hinge', penalty='l2',
                         alpha=1e-3, n_iter=10)
sq_hinge.fit(train_X, train_Y)
predicted = sq_hinge.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# per-class precision / recall / f1 on the held-out 20%
print(metrics.classification_report(test_Y, predicted,
                                    target_names=trainset.target_names))



accuracy validation set:  0.787709028094
             precision    recall  f1-score   support

      anger       0.90      0.75      0.82     11364
       fear       0.83      0.49      0.62      9536
        joy       0.68      0.95      0.79     28475
       love       0.74      0.29      0.41      6945
    sadness       0.91      0.93      0.92     24034
   surprise       0.83      0.30      0.45      3008

avg / total       0.80      0.79      0.77     83362



In [10]:
# Fresh random 80/20 split of the BOW matrix (no random_state).
train_X, test_X, train_Y, test_Y = train_test_split(
    bow, trainset.target, test_size=0.2)

# Perceptron loss variant; build the estimator, then fit it.
# NOTE(review): `n_iter` was deprecated in scikit-learn 0.19 in favour of
# `max_iter` and later removed -- confirm the target version.
perceptron = SGDClassifier(loss='perceptron', penalty='l2',
                           alpha=1e-3, n_iter=10)
perceptron.fit(train_X, train_Y)
predicted = perceptron.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# per-class precision / recall / f1 on the held-out 20%
print(metrics.classification_report(test_Y, predicted,
                                    target_names=trainset.target_names))



accuracy validation set:  0.889506009933
             precision    recall  f1-score   support

      anger       0.90      0.89      0.90     11551
       fear       0.85      0.83      0.84      9358
        joy       0.90      0.92      0.91     28254
       love       0.79      0.75      0.77      7003
    sadness       0.93      0.93      0.93     24162
   surprise       0.71      0.77      0.74      3034

avg / total       0.89      0.89      0.89     83362



But how do we get the probability of each class for our output?

This is only available if your loss is one of {'log', 'modified_huber'}; `predict_proba` is not defined for the other losses.

In [11]:
# Fresh random 80/20 split of the BOW matrix (no random_state).
train_X, test_X, train_Y, test_Y = train_test_split(
    bow, trainset.target, test_size=0.2)

# modified_huber is used here because the cell calls predict_proba below.
# NOTE(review): `n_iter` was deprecated in scikit-learn 0.19 in favour of
# `max_iter` and later removed -- confirm the target version.
mod_huber = SGDClassifier(loss='modified_huber', penalty='l2',
                          alpha=1e-3, n_iter=10)
mod_huber.fit(train_X, train_Y)
predicted = mod_huber.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# per-class precision / recall / f1 on the held-out 20%
print(metrics.classification_report(test_Y, predicted,
                                    target_names=trainset.target_names))

# Class probabilities for the first 2 sentences of the full dataset; these
# rows come from `bow`, not from test_X, so they may have been trained on.
print(trainset.data[:2])
print(trainset.target[:2])
print(mod_huber.predict_proba(bow[:2, :]))



accuracy validation set:  0.896751517478
             precision    recall  f1-score   support

      anger       0.90      0.89      0.89     11368
       fear       0.85      0.85      0.85      9440
        joy       0.90      0.95      0.92     28329
       love       0.86      0.73      0.79      6977
    sadness       0.93      0.94      0.94     24278
   surprise       0.78      0.69      0.73      2970

avg / total       0.90      0.90      0.90     83362

['i m already feeling somewhat strange given that i get very good and while i can not open my eyes', 'i myself smiling through loving simple dialog child logic explain situation feelings it s funny']
[5, 5]
[[ 0.          0.4859605   0.10990839  0.          0.          0.40413111]
 [ 0.          0.          0.          0.45115266  0.          0.54884734]]
