# Text Classification 
- This is a comprehensive tutorial covering intermediate (and advanced) topics in text classification

In [1]:
%matplotlib inline
import nltk
import matplotlib.pyplot as plt
import csv
import pandas as pd
import sklearn
import numpy as np
from collections import Counter

# feature engineering (words to vectors)
from sklearn.feature_extraction.text import TfidfVectorizer

# classification algorithms (or classifiers)
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

# build a pipeline
from sklearn.pipeline import Pipeline

# for grid search ... build many models with different parameters (e.g., with/without bi-grams)
from sklearn.grid_search import GridSearchCV

# model evaluation, validation
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split 
from sklearn import metrics
from sklearn.metrics import roc_curve, auc



# Step 1: Loading the dataset

The data is in a single csv file. Below we read it row by row with Python's built-in `csv` module (Pandas, a Python package, could be used to load it as well).

In [2]:
# Read the corpus once, keeping the full rows plus two parallel lists.
sms = []         # every csv row exactly as read
sms_data = []    # column 0 of each row: the message text
sms_labels = []  # column 1 of each row: the label, kept as a string

# `with` closes the file even if a malformed row raises part-way through;
# the explicit close() it replaces only ran on the success path.
# 'rb' is the mode the Python 2 csv module expects.
with open('data/spam.csv', 'rb') as openfile:
    for row in csv.reader(openfile):
        sms.append(row)
        sms_data.append(row[0])
        sms_labels.append(row[1])

This corpus (in a csv file) contains spam emails and regular emails. Two **labels** or **y values** are 0 (regular email) and 1 (spam email). 

We'll **train a predictive (or machine learning) model to learn to discriminate between spam emails and regular emails automatically**. Then, **the predictive model will be used to classify a set of unknown (or unlabeled) emails as either spam or regular**. 

In [3]:
# Sanity check: print the number of entries in each container, in this order:
# full rows, texts only, labels only.
for container in (sms, sms_data, sms_labels):
    print(len(container))

5574
5574
5574


In [4]:
# first row of the csv, as a [text, label] list
sms[0]

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 '0']

In [5]:
# Class balance of the corpus: tally each label once, then report both classes.
label_counts = Counter(sms_labels)
print(label_counts['0'])
print(label_counts['1'])  # '1' refers to spam

4827
747


747 spam emails & 4827 regular emails in the dataset

# Step 2: Preprocessing & Step 3: Feature Engineering (Words to Vectors)

In [13]:
# Fit the TF-IDF vectorizer on every message and encode the whole corpus.
# The result is a sparse matrix with one row per message.
tfidf_vectorizer = TfidfVectorizer(decode_error='ignore')
tfidf = tfidf_vectorizer.fit_transform(sms_data)

print(tfidf.shape)  # (number of messages, vocabulary size)
print(tfidf)        # sparse listing: (row, column) followed by the weight

(5574, 8713)
  (0, 3571)	0.147879386867
  (0, 8084)	0.230018923239
  (0, 4374)	0.326467645747
  (0, 5958)	0.255351135361
  (0, 2338)	0.252829676565
  (0, 1316)	0.244190457021
  (0, 5571)	0.156034537655
  (0, 4114)	0.107000554123
  (0, 1767)	0.275803221504
  (0, 3655)	0.180346776875
  (0, 8548)	0.220834221079
  (0, 4501)	0.275803221504
  (0, 1765)	0.311649251596
  (0, 2061)	0.275803221504
  (0, 7694)	0.155520993044
  (0, 3615)	0.1530562397
  (0, 1082)	0.326467645747
  (0, 8324)	0.182416008163
  (1, 5538)	0.271904356737
  (1, 4537)	0.40832852092
  (1, 4342)	0.523676940648
  (1, 8450)	0.43163099771
  (1, 5567)	0.546619596648
  (2, 4114)	0.0790778808417
  (2, 3373)	0.113114957061
  :	:
  (5572, 4245)	0.122078808454
  (5572, 8371)	0.187304281983
  (5572, 1097)	0.112250676563
  (5572, 4642)	0.159548788302
  (5572, 7089)	0.184334185597
  (5572, 3323)	0.121464328313
  (5572, 7674)	0.102211246877
  (5572, 1451)	0.142906084668
  (5572, 5367)	0.210111258217
  (5572, 2606)	0.184334185597
  (5572, 

In [7]:
# first ten entries of the vocabulary learned by the vectorizer
tfidf_vectorizer.get_feature_names()[0:10]

[u'00',
 u'000',
 u'000pes',
 u'008704050406',
 u'0089',
 u'0121',
 u'01223585236',
 u'01223585334',
 u'0125698789',
 u'02']

In [8]:
# last ten entries of the vocabulary learned by the vectorizer
tfidf_vectorizer.get_feature_names()[-10:]

[u'zhong',
 u'zindgi',
 u'zoe',
 u'zogtorius',
 u'zoom',
 u'zouk',
 u'zyada',
 u'\xe8n',
 u'\xfa1',
 u'\u3028ud']

## Document Term Matrix (DTM)

In [9]:
# Dense (numpy) copy of the TF-IDF matrix, used only for display below.
# `tfidf` already holds fit_transform(sms_data) from the feature-engineering
# cell, so it is reused here instead of fitting the vectorizer on the same
# data a second time.
tfidf_matrix = tfidf.toarray()
tfidf_matrix

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [10]:
# Document Term Matrix: one row per message, one column per vocabulary term.
# Only the first rows are displayed, so slice before building the DataFrame
# rather than wrapping the whole dense matrix and then calling .head().
preview_rows = 5  # same number of rows that DataFrame.head() shows by default
pd.DataFrame(tfidf_matrix[:preview_rows],
             index=sms_data[:preview_rows],
             columns=tfidf_vectorizer.get_feature_names())

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ok lar... Joking wif u oni...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U dun say so early hor... U c already then say...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Nah I don't think he goes to usf, he lives around here though",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Step 4 Model Building / Model Validation / Model Evaluation

## Naive Bayes

Now that we have our features, we can train a classifier to try to predict the category of a post. Let’s start with a naïve Bayes classifier, which provides a nice baseline for this task. scikit-learn includes several variants of this classifier; the one most suitable for word counts is the multinomial variant:

In [11]:
# This model is trained and scored on the SAME data (no split validation), so it is NOT validated.
# We build it on purpose to illustrate the concept called "overfitting":
# the model fits the existing dataset extremely well but may not suit new data,
# i.e. it does not generalize well.

nb = MultinomialNB()
nb.fit(tfidf, sms_labels)

predicted = nb.predict(tfidf)
training_accuracy = np.mean(predicted == sms_labels)  # share of labels reproduced
training_accuracy

0.97613921779691426

In [12]:
# Each entry pairs a heading with the metric printed under it.
report_sections = [
    ('\n overall accuracy:', metrics.accuracy_score(sms_labels, predicted)),
    ('\n confusion_matrix:', metrics.confusion_matrix(sms_labels, predicted)),
    ('\n Here is the classification report:', metrics.classification_report(sms_labels, predicted)),
]

for heading, result in report_sections:
    print(heading)
    print(result)


 overall accuracy:
0.976139217797

 confusion_matrix:
[[4827    0]
 [ 133  614]]

 Here is the classification report:
             precision    recall  f1-score   support

          0       0.97      1.00      0.99      4827
          1       1.00      0.82      0.90       747

avg / total       0.98      0.98      0.98      5574



- 4827 regular mails correctly predicted as regular & zero false positive 
- 614 spam correctly as spam & 133 false negative ==> This indicates that you would expect some spam mails in your inbox.
- For the different measures of model quality (e.g., precision and recall), see https://en.wikipedia.org/wiki/Precision_and_recall

In [None]:
# Plot the confusion matrix with the scikit-plot package (https://github.com/reiinakano/scikit-plot).
# It is not part of scikit-learn; install it first if needed:
# !pip install scikit-plot
from scikitplot import plotters as skplt

skplt.plot_confusion_matrix(y_true=sms_labels, y_pred=predicted)
plt.show()

In [None]:
# From the confusion matrix above:
# true positive rate = 614 / 747 = 82%
# false positive rate = 0 / 4827 = 0%

# ROC curve from the predicted class probabilities on the same (training) data
probas = nb.predict_proba(tfidf)
skplt.plot_roc_curve(y_true=sms_labels, y_probas=probas)
plt.show()

The above ROC curve shows our Naive Bayesian classification model is almost perfect. 
<br><br>
The diagonal line represents pure guessing.

## Split validation

In Step 4, we built a model using different machine learning (or classification) algorithms. However, it is a **bad idea to evaluate the performance of the model on the same dataset we train the model on**. Thus, we'll use a validation method called **split validation**.

First, we split the dataset to two parts: **training dataset (70% of the original dataset)** and **testing dataset (30% of the original dataset)**. We build a model using training dataset and apply the model to testing dataset and measure the accuracy of the model. You could have a 80-20 split or a 50-50 split.

We will build a predictive model using **x_train** and **y_train**, which are called as **training dataset**.

Then, we will apply the model to **x_test** and **y_test** (**testing dataset**) and this will tell us the performance or quality of the model

In [None]:
# Display a remote image by URL (presumably a diagram of the train/test split — not verifiable from here).
# NOTE(review): HTML is imported but not used in this cell.
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= "http://www.developer.com/imagesvr_ce/6793/ML4.png")

In [None]:
# 70/30 split of texts and labels; random_state pins the shuffle so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    sms_data,
    sms_labels,
    test_size=0.3,
    random_state=0,
)
# sizes in the order: x_train, y_train, x_test, y_test
tuple(len(part) for part in (x_train, y_train, x_test, y_test))

In [None]:
# Fit the vectorizer on the training texts only, then apply that same fitted
# vectorizer to the test texts (transform, not fit_transform).
X_train = tfidf_vectorizer.fit_transform(x_train)
X_test = tfidf_vectorizer.transform(x_test)

# Peek at the first two encoded documents of each part.
print(X_train[:2])
print("+++++++++++++++++++++++++++++++++++++++++")
print(X_test[:2])

In [None]:
# Train on the training part only, then score on the held-out test part.
nb = MultinomialNB()
nb.fit(X_train, y_train)

predicted = nb.predict(X_test)
test_accuracy = np.mean(predicted == y_test)
test_accuracy

In [None]:
# Same three metrics as before, now computed on the held-out test labels.
report_sections = [
    ('\n overall accuracy:', metrics.accuracy_score(y_test, predicted)),
    ('\n confusion_matrix:', metrics.confusion_matrix(y_test, predicted)),
    ('\n Here is the classification report:', metrics.classification_report(y_test, predicted)),
]

for heading, result in report_sections:
    print(heading)
    print(result)

- true positive rate is only 69%
- false negative rate is very high, meaning a large number of spams will be found in your inbox.

In [None]:
# Confusion matrix for the held-out test set, drawn with scikit-plot
# (https://github.com/reiinakano/scikit-plot). Install it first if needed:
# !pip install scikit-plot
from scikitplot import plotters as skplt

# y_test is converted to a numpy array before plotting
skplt.plot_confusion_matrix(y_true=np.array(y_test), y_pred=predicted)
plt.show()

### Summary of split validation:
* The accuracy of the Naive Bayes-based model on this dataset was about 97%. This figure was measured on the same dataset the model was built on. **Again, this is bad practice, since it makes the model appear better than it is.** This is called **"model overfitting"**.

* The **true accuracy** of Naive Bayes-based model on this dataset turns out to be about 95%, according to **split validation**. 

# Step 5: Model Deployment

In [None]:
# Unlabeled example messages to classify with the trained model.
docs_new = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question",
    "Even my brother is not like to speak with me. They treat me like aids patent.",
    "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9",
    "hello, thank you",
    "To claim txt DIS to 87121",
]

# Encode with the already-fitted vectorizer (transform only), then predict.
unlabeled_tfidf = tfidf_vectorizer.transform(docs_new)
predicted = nb.predict(unlabeled_tfidf)

for position, doc in enumerate(docs_new):
    print('%r => %s' % (doc, predicted[position]))

In [None]:
# Class probabilities for each new message: one row of probabilities per message.

predicted_prob = nb.predict_proba(unlabeled_tfidf)

for doc, class_probs in zip(docs_new, predicted_prob):
    print('%r => %s' % (doc, class_probs))

# Appendixes
- We need to understand the following topics as well.

## Appendix. Building a pipeline

As you have seen, building a classification (or predictive) model requires several steps. A **Pipeline** chains these steps together and thus streamlines the predictive modeling process. With a pipeline we write less code.

Also, Pipeline can build several predictive models through a loop to find the best one.

http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline
http://scikit-learn.org/stable/modules/pipeline.html

In [None]:
# combining feature engineering and model building:
# step 'tfidf' turns raw text into TF-IDF vectors, step 'clf' is the Naive Bayes classifier
nb_pipeline = Pipeline([('tfidf', TfidfVectorizer(decode_error ='ignore')), ('clf', MultinomialNB())])

In [None]:
# Train both steps on the raw training texts; the pipeline vectorizes internally.
# fit() hands back the pipeline, so the name keeps pointing at the fitted object.
fitted_pipeline = nb_pipeline.fit(x_train, y_train)
nb_pipeline = fitted_pipeline
nb_pipeline

In [None]:
# Predict straight from raw test texts and report the share of correct labels.
predicted = nb_pipeline.predict(x_test)
accuracy = np.mean(predicted == y_test)
accuracy

In [None]:
# a complete pipeline: define, train, predict and score in one cell

nb_steps = [
    ('tfidf', TfidfVectorizer(decode_error='ignore')),
    ('clf', MultinomialNB()),
]
nb_pipeline = Pipeline(nb_steps).fit(x_train, y_train)

predicted = nb_pipeline.predict(x_test)
accuracy = np.mean(predicted == y_test)
accuracy

## Appendix. Other Classifiers: How to find a better model with different classifiers or classification algorithms?

<img src="http://scikit-learn.org/stable/_static/ml_map.png">

### kNeighbors Classifier (kNN)
- http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
- http://scikit-learn.org/stable/modules/neighbors.html

In [None]:
# Same pipeline shape as before, with a k-nearest-neighbours classifier (default settings).
knn_steps = [
    ('tfidf', TfidfVectorizer(decode_error='ignore')),
    ('clf', KNeighborsClassifier()),
]
knn_pipeline = Pipeline(knn_steps).fit(x_train, y_train)

predicted = knn_pipeline.predict(x_test)
accuracy = np.mean(predicted == y_test)
accuracy

### Support Vector Machine (SVM)
- http://scikit-learn.org/stable/modules/svm.html

In [None]:
# Linear-kernel SVM; probability=True is set so predict_proba can be called later.
svm_steps = [
    ('tfidf', TfidfVectorizer(decode_error='ignore')),
    ('clf', SVC(kernel='linear', probability=True)),
]
svm_pipeline = Pipeline(svm_steps).fit(x_train, y_train)

predicted = svm_pipeline.predict(x_test)
accuracy = np.mean(predicted == y_test)
accuracy

This shows that SVM works best for this dataset. You can use this svm model for model deployment

In [None]:
# Measure the wall-clock time of building, training and scoring the SVM pipeline.

import timeit
start_time = timeit.default_timer()

svm_steps = [
    ('tfidf', TfidfVectorizer(decode_error='ignore')),
    ('clf', SVC(kernel='linear', probability=True)),
]
svm_pipeline = Pipeline(svm_steps).fit(x_train, y_train)
predicted = svm_pipeline.predict(x_test)
print(np.mean(predicted == y_test))

# seconds elapsed since start_time
elapsed = timeit.default_timer() - start_time
print("%s %s" % ("this processing has taken", elapsed))

In my computer the above processing took about 2.63 seconds. How about yours?

In [None]:
# Unlabeled example messages to classify with the SVM pipeline.
docs_new = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question",
    "Even my brother is not like to speak with me. They treat me like aids patent.",
    "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9",
    "hello, thank you",
    "To claim txt DIS to 87121",
]

# The raw texts are passed directly: the pipeline's 'tfidf' step does the
# vectorizing, so no separate tfidf_vectorizer.transform(docs_new) call is made.
predicted = svm_pipeline.predict(docs_new)

for position, doc in enumerate(docs_new):
    print('%r => %s' % (doc, predicted[position]))

In [None]:
# Class probabilities from the SVM pipeline: one row of probabilities per message.

predicted_prob = svm_pipeline.predict_proba(docs_new)

for doc, class_probs in zip(docs_new, predicted_prob):
    print('%s => %s' % (doc, class_probs))

## Appendix. Changing Parameters: How to improve this model? How to find a better model? How to deploy the predictive model in practice?

### Improving the Naive Bayes-based predictive model by changing parameter values?

See the parameters of the TfidfVectorizer used in front of the Naive Bayes classifier:

- TfidfVectorizer(input=’content’, encoding=’utf-8’, decode_error=’strict’, strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer=’word’, stop_words=None, token_pattern=’(?u)\b\w\w+\b’, ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class ‘numpy.int64’>, norm=’l2’, use_idf=True, smooth_idf=True, sublinear_tf=False)

http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

- max_df : float in range [0.0, 1.0] or int, default=1.0 
When building the vocabulary ignore terms that have a **document frequency** strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. 
    - For example, max_df = 0.7 ==> This removes words which appear in more than 70% of the corpus (**removing frequent words**).
<br><br>    
- min_df : float in range [0.0, 1.0] or int, default=1
When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. 
    - For example, min_df = 5 ==> This removes words which appear in less than five documents (**removing rare words**).

### Using different parameter values (e.g., removing stopwords, using stemming words, using ngrams, removing too frequent words, removing too rare words), the model accuracy can be improved.

In [None]:
# Naive Bayes with English stop words removed by the vectorizer

nb_steps = [
    ('tfidf', TfidfVectorizer(decode_error='ignore', stop_words='english')),
    ('clf', MultinomialNB()),
]
nb_pipeline = Pipeline(nb_steps).fit(x_train, y_train)

predicted = nb_pipeline.predict(x_test)
accuracy = np.mean(predicted == y_test)
accuracy

In [None]:
# Naive Bayes with stop words removed and rare terms dropped
# (min_df=2: a term must occur in at least two documents to be kept)

nb_steps = [
    ('tfidf', TfidfVectorizer(decode_error='ignore', stop_words='english', min_df=2)),
    ('clf', MultinomialNB()),
]
nb_pipeline = Pipeline(nb_steps).fit(x_train, y_train)

predicted = nb_pipeline.predict(x_test)
accuracy = np.mean(predicted == y_test)
accuracy

In [None]:
# kNN with English stop words removed by the vectorizer

knn_steps = [
    ('tfidf', TfidfVectorizer(decode_error='ignore', stop_words='english')),
    ('clf', KNeighborsClassifier()),
]
knn_pipeline = Pipeline(knn_steps).fit(x_train, y_train)

predicted = knn_pipeline.predict(x_test)
accuracy = np.mean(predicted == y_test)
accuracy

In [None]:
# Linear SVM with English stop words removed by the vectorizer

svm_steps = [
    ('tfidf', TfidfVectorizer(decode_error='ignore', stop_words='english')),
    ('clf', SVC(kernel='linear', probability=True)),
]
svm_pipeline = Pipeline(svm_steps).fit(x_train, y_train)

predicted = svm_pipeline.predict(x_test)
accuracy = np.mean(predicted == y_test)
accuracy

In [None]:
# kNN with stop words removed and bigrams added (ngram_range=(1, 2): unigrams + bigrams)

knn_steps = [
    ('tfidf', TfidfVectorizer(decode_error='ignore', stop_words='english', ngram_range=(1, 2))),
    ('clf', KNeighborsClassifier()),
]
knn_pipeline = Pipeline(knn_steps).fit(x_train, y_train)

predicted = knn_pipeline.predict(x_test)
accuracy = np.mean(predicted == y_test)
accuracy

In [None]:
# Linear SVM with stop words removed and bigrams added (ngram_range=(1, 2))
svm_steps = [
    ('tfidf', TfidfVectorizer(decode_error='ignore', stop_words='english', ngram_range=(1, 2))),
    ('clf', SVC(kernel='linear', probability=True)),
]
svm_pipeline = Pipeline(svm_steps).fit(x_train, y_train)

predicted = svm_pipeline.predict(x_test)
accuracy = np.mean(predicted == y_test)
accuracy

It looks like the best model uses **TFIDF (and bigram) with SVM after removing stopwords**

### Pipeline with multiple classifiers (or algorithms)
- Then, can we test the accuracy of all classifiers at once?

In [None]:
# the three classifiers compared so far, to be tried in turn inside the same pipeline
classifiers = [MultinomialNB(), KNeighborsClassifier(), SVC(kernel='linear', probability=True)]

In [None]:
# Train and score every classifier behind an identically configured TF-IDF step.
for clf in classifiers:
    steps = [
        ('tfidf', TfidfVectorizer(decode_error='ignore', stop_words='english', ngram_range=(1, 2))),
        ('clf', clf),
    ]
    pipeline = Pipeline(steps)
    classifiers_pipeline = pipeline.fit(x_train, y_train)
    predicted = classifiers_pipeline.predict(x_test)
    accuracy = np.mean(predicted == y_test)
    print("%s --> %s" % (clf, accuracy))
    print(" ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ")

## Appendix. k-fold Cross validation (CV) 

### 10-fold CV
* The basic idea is that, rather than testing the model quality **only once**, cross validation (or 10-fold CV) tests the model **10 times** with 10 different testing datasets.

#### How?
* The training (or original) dataset is randomly partitioned into 10 equal sized subsamples (see the figure below). 
* At each time, one subsample is set aside as the **testing** or **validation dataset** and the other 9 subsamples are used as the training dataset. 
* A model is built using the training dataset and tested with the testing dataset. This is done 10 times. 
* This leads to 10 evaluation scores (here: accuracy, since we pass `scoring='accuracy'`). The final score is the average of these scores.

<img src="https://chrisjmccormick.files.wordpress.com/2013/07/10_fold_cv.png">


In [None]:
# 10-fold cross validation of the Naive Bayes pipeline on the whole dataset.
nb_steps = [
    ('tfidf', TfidfVectorizer(decode_error='ignore', stop_words='english')),
    ('clf', MultinomialNB()),
]
nb_pipeline = Pipeline(nb_steps).fit(sms_data, sms_labels)
# NOTE(review): cross_val_score is documented to fit a clone of the estimator
# on each fold, so the full-data fit above presumably does not influence the
# scores below — confirm whether it is needed.

scores = cross_val_score(nb_pipeline, sms_data, sms_labels, scoring='accuracy', cv=10)
print(scores)         # one accuracy per fold
print(scores.mean())  # average over the 10 folds

In [None]:
# 10-fold cross validation of the kNN pipeline on the whole dataset.
knn_steps = [
    ('tfidf', TfidfVectorizer(decode_error='ignore', stop_words='english')),
    ('clf', KNeighborsClassifier()),
]
knn_pipeline = Pipeline(knn_steps).fit(sms_data, sms_labels)
# NOTE(review): cross_val_score is documented to fit a clone of the estimator
# on each fold, so the full-data fit above presumably does not influence the
# scores below — confirm whether it is needed.

scores = cross_val_score(knn_pipeline, sms_data, sms_labels, scoring='accuracy', cv=10)
print(scores)         # one accuracy per fold
print(scores.mean())  # average over the 10 folds

In [None]:
# Timed 10-fold cross validation of the SVM pipeline (stop words removed, unigrams + bigrams).
import timeit
start_time = timeit.default_timer()

svm_steps = [
    ('tfidf', TfidfVectorizer(decode_error='ignore', stop_words='english', ngram_range=(1, 2))),
    ('clf', SVC(kernel='linear', probability=True)),
]
svm_pipeline = Pipeline(svm_steps).fit(sms_data, sms_labels)
# NOTE(review): cross_val_score is documented to fit a clone of the estimator
# on each fold, so the full-data fit above presumably only adds to the measured
# time without influencing the scores — confirm whether it is needed.

scores = cross_val_score(svm_pipeline, sms_data, sms_labels, scoring='accuracy', cv=10)
print(scores)         # one accuracy per fold
print(scores.mean())  # average over the 10 folds

# seconds elapsed since start_time
elapsed = timeit.default_timer() - start_time
print("%s %s" % ("this processing has taken", elapsed))

This process has taken 72 seconds in my machine. 

## Appendix. Parameter tuning using Grid Search (Requiring high computational power!!!) 
## Warning: This process is likely to slow down your computer

" We’ve already encountered some parameters such as stopwords and ngram in the TfidfTransformer. Classifiers tend to have many parameters as well; e.g., MultinomialNB includes a smoothing parameter alpha and SGDClassifier has a penalty parameter alpha and configurable loss and penalty terms in the objective function (see the module documentation, or use the Python help function, to get a description of these).

Instead of tweaking the parameters of the various components of the chain, it is possible to run an exhaustive search of the best parameters on a grid of possible values. We try out all classifiers on either words or bigrams, with or without idf, and with a penalty parameter of either 0.01 or 0.001 for the linear SVM:"

http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [None]:
# base (unfitted) pipeline handed to the grid search below; step names 'tfidf' and 'clf'
# are the prefixes used in the parameter grid keys
nb_pipeline = Pipeline([('tfidf', TfidfVectorizer(decode_error ='ignore', stop_words='english')), ('clf', MultinomialNB())])

In [None]:
# Candidate values for the grid search. Keys follow the "<step name>__<parameter>"
# pattern, where the step names are those of the pipeline ('tfidf', 'clf').
parameters = dict(
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],  # unigrams; +bigrams; +trigrams
    tfidf__min_df=[1, 2, 5],                      # rare-term cut-offs
    tfidf__max_df=[0.9, 0.8, 0.7],                # frequent-term cut-offs
    clf__alpha=(0.01, 0.001, 1),                  # Naive Bayes smoothing values
)

**TfidfVectorizer** 

- ngram_range : tuple (min_n, max_n)
    - The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used.

- min_df : float in range [0.0, 1.0] or int, default=1
    - When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts.

http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

**MultinomialNB()** MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)

- alpha : float, optional (default=1.0)
    - Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).

http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

In [None]:
# Try every combination in `parameters` on the training data.
gs_clf = GridSearchCV(estimator=nb_pipeline, param_grid=parameters)
gs_clf.fit(x_train, y_train)

In [None]:
# score of the best parameter combination found by the grid search
gs_clf.best_score_

In [None]:
# List the winning value of every tuned parameter, in alphabetical order.
best_params = gs_clf.best_params_
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_params[param_name]))

Now, we can use the best model recommended by gridsearch.

In [None]:
# split validation of the Naive Bayes pipeline with hand-entered tuned settings

tuned_vectorizer = TfidfVectorizer(
    decode_error='ignore',
    stop_words='english',
    min_df=1,
    ngram_range=(1, 2),
    max_df=0.9,
)
nb_pipeline = Pipeline([
    ('tfidf', tuned_vectorizer),
    ('clf', MultinomialNB(alpha=0.01)),
]).fit(x_train, y_train)

predicted = nb_pipeline.predict(x_test)
accuracy = np.mean(predicted == y_test)
accuracy

In [None]:
# 10 fold cross validation of the Naive Bayes pipeline with hand-entered tuned settings

tuned_vectorizer = TfidfVectorizer(
    decode_error='ignore',
    stop_words='english',
    min_df=1,
    ngram_range=(1, 2),
    max_df=0.9,
)
nb_pipeline = Pipeline([
    ('tfidf', tuned_vectorizer),
    ('clf', MultinomialNB(alpha=0.01)),
]).fit(sms_data, sms_labels)
# NOTE(review): cross_val_score is documented to fit a clone of the estimator
# on each fold, so the full-data fit above presumably does not influence the
# scores below — confirm whether it is needed.

scores = cross_val_score(nb_pipeline, sms_data, sms_labels, scoring='accuracy', cv=10)
print(scores)         # one accuracy per fold
print(scores.mean())  # average over the 10 folds

## Appendix get most informative features for scikit-learn classifier (Naive Bayes) for different class?

In [None]:
# since we know the parameters for the best NB model, let's initialize TfidfVectorizer again
# (same vectorizer settings as in the tuned pipeline: English stop words, unigrams + bigrams, max_df=0.9, min_df=1)

tfidf_vectorizer = TfidfVectorizer(decode_error='ignore', stop_words='english', min_df=1, ngram_range=(1,2), max_df=0.9)

In [None]:
# split validation again, this time 80/20 (test_size=0.2) with the same random_state

x_train, x_test, y_train, y_test = train_test_split(
    sms_data,
    sms_labels,
    test_size=0.2,
    random_state=0,
)
# sizes in the order: x_train, y_train, x_test, y_test
tuple(len(part) for part in (x_train, y_train, x_test, y_test))

In [None]:
# transform training data and testing data to tfidf format:
# the vectorizer is fitted on the training texts only and then applied to the test texts

X_train = tfidf_vectorizer.fit_transform(x_train)
X_test = tfidf_vectorizer.transform(x_test)

In [None]:
# now we have more columns (features) because bi-grams are included
for matrix in (X_train, X_test):
    print(matrix.shape)

In [None]:
# First and last five entries of the vocabulary learned from the training texts.
feature_names = tfidf_vectorizer.get_feature_names()
print(feature_names[0:5])
print(feature_names[-5:])

In [None]:
# Train Naive Bayes on the re-vectorized training part and score on the test part.
# NOTE(review): this uses MultinomialNB's default alpha, whereas the tuned
# pipeline cells pass alpha=0.01 — confirm which is intended here.
nb = MultinomialNB()
nb.fit(X_train, y_train)

predicted = nb.predict(X_test)
test_accuracy = np.mean(predicted == y_test)
test_accuracy

In [None]:
# ROC curve on the held-out test set, from the predicted class probabilities.
# `skplt` comes from the scikit-plot import made in an earlier cell.
probas = nb.predict_proba(X_test)
skplt.plot_roc_curve(y_true=y_test, y_probas=probas)
plt.show()

In [None]:
#https://stackoverflow.com/questions/26976362/how-to-get-most-informative-features-for-scikit-learn-classifier-for-different-c

def most_informative_feature_for_binanry_classification(vectorizer, classifier, n=50):
    """Print the n lowest- and n highest-weighted features of a binary classifier.

    Features are ranked by ``classifier.coef_[0]``. The ``n`` smallest
    coefficients are printed first (ascending), each tagged with
    ``classifier.classes_[0]``. After a separator line, the ``n`` largest are
    printed (largest first), each tagged with ``classifier.classes_[1]``.
    Every line has the form ``<class label> <coefficient> <feature name>``.

    Args:
        vectorizer: fitted vectorizer exposing ``get_feature_names()``.
        classifier: fitted classifier exposing ``classes_`` and ``coef_``.
        n: number of features to list per class (default 50). Values below 0
            are treated as 0; values above the vocabulary size list everything.

    Returns:
        None. The ranking is written to standard output.
    """
    class_labels = classifier.classes_

    # Sort the (coefficient, feature) pairs once and slice both ends,
    # instead of sorting the same pairs twice.
    ranked = sorted(zip(classifier.coef_[0], vectorizer.get_feature_names()))

    # ranked[-0:] would be the WHOLE list, so n == 0 must not go through a
    # negative slice; computing the start index explicitly avoids that.
    top = min(max(n, 0), len(ranked))
    lowest = ranked[:top]
    highest = ranked[len(ranked) - top:]

    # Single-argument print() gives the same text on Python 2 and Python 3
    # (print(a, b, c) would print a tuple on Python 2).
    for coef, feat in lowest:
        print('%s %s %s' % (class_labels[0], coef, feat))

    print("=========================================")

    for coef, feat in reversed(highest):
        print('%s %s %s' % (class_labels[1], coef, feat))


# list the 10 lowest- and 10 highest-weighted features of the fitted Naive Bayes model
most_informative_feature_for_binanry_classification(tfidf_vectorizer, nb, n=10)

It appears **spam** mails tend to contain such terms as **free, claim, prize, reply, ...**

# References

- http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
- http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html (Naive Bayes)
- http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html (KNN)
- http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html (SVM)
- https://stackoverflow.com/questions/26976362/how-to-get-most-informative-features-for-scikit-learn-classifier-for-different-c