##Goal:  Train a Naive Bayes model to classify future SMS messages as either spam or ham.

Steps:

1.  Convert the words ham and spam to a binary indicator variable(0/1)

2.  Convert the txt to a sparse matrix of TFIDF vectors

3.  Fit a Naive Bayes Classifier

4.  Measure your success using roc_auc_score



Melanie Klein - CSC570R Spring 2017

In [34]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import GridSearchCV

In [35]:
df= pd.read_csv("data/SMSSpamCollection",sep='\t', names=['spam', 'txt'])

In [36]:
df.head()

Unnamed: 0,spam,txt
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [37]:
#Convert the words ham and spam to a binary indicator variable (ham = 0, spam = 1)
y = pd.get_dummies(data = df.spam)['spam']

In [38]:
#Convert the text to a sparse matrix of TF-IDF vectors
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=stopset)
X = vectorizer.fit_transform(df.txt)

In [39]:
print (df.shape)
print (y.shape)
print (X.shape)

(5572, 2)
(5572,)
(5572, 8587)


In [40]:
#Train a naive bayes classifier on the training data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
nbClassifier = naive_bayes.MultinomialNB()
nbClassifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [41]:
#Evaluate performance on the test data using roc_auc_score
roc_auc_score(y_test, nbClassifier.predict_proba(X_test)[:,1])

0.98589322144123448

In [47]:
#Use grid search to look for a better LaPlace/additive estimator than 1
#The LaPlace estimator is a number added to the calculation of each probability to eliminate any zero-value probabilities
alpha = [1,0,0.1,0.5,0.7]

newNb = naive_bayes.MultinomialNB()
estimator = GridSearchCV(newNb, dict(alpha=alpha), cv=5, n_jobs=-1, scoring='roc_auc')

In [48]:
estimator.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'alpha': [1, 0, 0.1, 0.5, 0.7]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [49]:
best_nb = estimator.best_estimator_
best_nb

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [50]:
roc_auc_score(y_test, best_nb.predict_proba(X_test)[:,1])

0.99234305262313927

In [51]:
#Test on a few new examples
import numpy as np
ham_txt_array = np.array(["hey when u comin over 4 dinner we r waitin on u"])
ham_txt_vector = vectorizer.transform(ham_txt_array)
print (nbClassifier.predict(ham_txt_vector))

spam_txt_array = np.array(["limited offer chance to win txt YES to 444-5718 call now to win !!!"])
spam_txt_vector = vectorizer.transform(spam_txt_array)
print (nbClassifier.predict(spam_txt_vector))

[ 0.]
[ 1.]
