Here I will to show how to use bayes on multi-class classification/discrimination

import class sklearn.naive_bayes.MultinomialNB for Multinomial logistic regression (logistic regression of multi-class)

But if you want to classify binary/boolean class, it is better to use BernoulliNB 

I will use also compare accuracy for using BOW, TF-IDF, and HASHING for vectorizing technique

In [1]:
# to get f1 score
from sklearn import metrics
import numpy as np
import sklearn.datasets
import re
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split



Define some function to help us for preprocessing

In [2]:
# clear string
def clearstring(string):
    string = re.sub('[^\'\"A-Za-z0-9 ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string]
    string = ' '.join(string)
    return string

# because of sklean.datasets read a document as a single element
# so we want to split based on new line
def separate_dataset(trainset):
    datastring = []
    datatarget = []
    for i in range(len(trainset.data)):
        data_ = trainset.data[i].split('\n')
        # python3, if python2, just remove list()
        data_ = list(filter(None, data_))
        for n in range(len(data_)):
            data_[n] = clearstring(data_[n])
        datastring += data_
        for n in range(len(data_)):
            datatarget.append(trainset.target[i])
    return datastring, datatarget

In [3]:
# you can change any encoding type
trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']
416809
416809


In [4]:
# bag-of-word
bow = CountVectorizer().fit_transform(trainset.data)

#tf-idf, must get from BOW first
tfidf = TfidfTransformer().fit_transform(bow)

#hashing, default n_features, probability cannot divide by negative
hashing = HashingVectorizer(non_negative = True).fit_transform(trainset.data)



Feed Naive Bayes using BOW

but split it first into train-set (80% of our data-set), and validation-set (20% of our data-set)

In [5]:
train_X, test_X, train_Y, test_Y = train_test_split(bow, trainset.target, test_size = 0.2)

bayes_multinomial = MultinomialNB().fit(train_X, train_Y)
predicted = bayes_multinomial.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))

accuracy validation set:  0.859072479067
             precision    recall  f1-score   support

      anger       0.90      0.84      0.87     11464
       fear       0.84      0.81      0.82      9455
        joy       0.85      0.93      0.89     28246
       love       0.82      0.61      0.70      6920
    sadness       0.87      0.94      0.91     24263
   surprise       0.84      0.34      0.49      3014

avg / total       0.86      0.86      0.85     83362



Feed Naive Bayes using TF-IDF

but split it first into train-set (80% of our data-set), and validation-set (20% of our data-set)

In [6]:
train_X, test_X, train_Y, test_Y = train_test_split(tfidf, trainset.target, test_size = 0.2)

bayes_multinomial = MultinomialNB().fit(train_X, train_Y)
predicted = bayes_multinomial.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))

accuracy validation set:  0.734855209808
             precision    recall  f1-score   support

      anger       0.93      0.54      0.69     11336
       fear       0.91      0.37      0.53      9603
        joy       0.68      0.98      0.80     28062
       love       0.96      0.16      0.27      7085
    sadness       0.74      0.94      0.83     24278
   surprise       0.94      0.04      0.08      2998

avg / total       0.79      0.73      0.69     83362



Feed Naive Bayes using hashing

but split it first into train-set (80% of our data-set), and validation-set (20% of our data-set)

In [7]:
train_X, test_X, train_Y, test_Y = train_test_split(hashing, trainset.target, test_size = 0.2)

bayes_multinomial = MultinomialNB().fit(train_X, train_Y)
predicted = bayes_multinomial.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))

accuracy validation set:  0.578524987404
             precision    recall  f1-score   support

      anger       0.93      0.07      0.12     11449
       fear       0.96      0.03      0.05      9533
        joy       0.49      1.00      0.66     28047
       love       1.00      0.00      0.01      6967
    sadness       0.76      0.79      0.78     24408
   surprise       0.00      0.00      0.00      2958

avg / total       0.71      0.58      0.47     83362



  'precision', 'predicted', average, warn_for)
