In [1]:
import warnings
warnings.filterwarnings('ignore')
import nltk
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load the tab-separated SMS corpus. The file carries no header row, so the
# two column names are supplied here.
messages = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    names=['labels', 'message'],
)

# Encode the categorical target as integers: ham -> 0, spam -> 1.
# Any label outside this mapping raises KeyError.
labels = {'ham': 0, 'spam': 1}
messages['labels'] = messages['labels'].apply(lambda item: labels[item])

# text data pre-processing
import string
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Tokenizer that keeps only runs starting with one or more ASCII letters and
# followed by at least one word character (letter, digit or underscore), so
# every token is at least two characters long and never starts with a digit.
token1 = RegexpTokenizer(r'[a-zA-Z]+\w{1,}')

# The NLTK stop-word list is read from its corpus once and kept here, so it is
# not reloaded for every token of every message.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def text_process(mess):
    """Clean one SMS message before it is handed to the vectorizer.

    Steps, in order:
      1. join ``mess`` into a single string and delete every comma;
      2. split on whitespace and drop English stop words (the comparison is
         done on the lower-cased token, the kept token keeps its case);
      3. drop tokens that consist solely of punctuation characters;
      4. re-tokenize the remainder with the module-level ``token1`` tokenizer
         and join the resulting tokens with single spaces.

    Parameters
    ----------
    mess : str or iterable of str
        The raw message. An iterable of strings is concatenated without a
        separator, exactly as ``''.join`` does.

    Returns
    -------
    str
        The cleaned message; an empty string when nothing survives.

    Raises
    ------
    TypeError
        If ``mess`` cannot be joined as strings (for example a float NaN).
    LookupError
        Raised by NLTK if the stop-word corpus is not installed.
    """
    text = ''.join(mess).replace(',', '')
    words = text.split()
    if not words:
        # Nothing to filter or tokenize; the result of the full path would be ''.
        return ''

    stop_words = _english_stop_words()
    punctuation = set(string.punctuation)
    kept = [
        word for word in words
        if word.lower() not in stop_words
        # Character-wise check: a plain `word not in string.punctuation` is a
        # substring test and lets tokens such as "!!" through.
        and not all(ch in punctuation for ch in word)
    ]
    return ' '.join(token1.tokenize(' '.join(kept)))



# Replace every raw message with its cleaned form.
messages['message'] = messages['message'].map(text_process)

from sklearn.feature_extraction.text import CountVectorizer , TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB


# Pipeline stages: token counts -> TF-IDF weighting -> multinomial Naive Bayes.
vectorizer, tfidf, nbmodel = CountVectorizer(), TfidfTransformer(), MultinomialNB()

# Hyper-parameter grid for the pipeline. Keys use the "<step name>__<parameter>"
# form, so they must match the step names given to the Pipeline below.
nb_params = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'classify__alpha': [0.01, 0.1, 1, 10, 100],
}

# Chain the three stages into a single estimator.
nb_pipeline = Pipeline(steps=[
    ('vect', vectorizer),
    ('tf', tfidf),
    ('classify', nbmodel),
])

from sklearn.metrics import classification_report, roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix


# Model validation criteria: precision, recall and accuracy are all recorded;
# the recall score is the one used to rank model performance.
scorers = dict(
    precision_score=make_scorer(precision_score),
    recall_score=make_scorer(recall_score),
    accuracy_score=make_scorer(accuracy_score),
)

import warnings
warnings.filterwarnings('ignore')

# Instantiate a 5-fold grid search over the pipeline. All scorers are computed
# for every parameter combination; the final model is refit on the best recall.
grid = GridSearchCV(
    estimator=nb_pipeline,
    param_grid=nb_params,
    scoring=scorers,
    refit='recall_score',
    cv=5,
    return_train_score=False,
)

# Run the search on the cleaned messages and their numeric labels.
grid.fit(messages['message'], messages['labels'])

# Report the parameter combination that won on recall.
print('best param is: ', grid.best_params_)

best param is:  {'classify__alpha': 0.01, 'vect__ngram_range': (1, 2)}


In [2]:
# Tabulate the mean cross-validated precision, recall and accuracy next to the
# parameter combination that produced them.
pd.DataFrame(grid.cv_results_).loc[
    :,
    [
        'mean_test_precision_score',
        'mean_test_recall_score',
        'mean_test_accuracy_score',
        'params',
    ],
]

Unnamed: 0,mean_test_precision_score,mean_test_recall_score,mean_test_accuracy_score,params
0,0.972573,0.896898,0.982771,"{'classify__alpha': 0.01, 'vect__ngram_range':..."
1,0.966899,0.921004,0.985104,"{'classify__alpha': 0.01, 'vect__ngram_range':..."
2,0.979434,0.891539,0.98295,"{'classify__alpha': 0.1, 'vect__ngram_range': ..."
3,0.984295,0.89824,0.984386,"{'classify__alpha': 0.1, 'vect__ngram_range': ..."
4,0.998246,0.749603,0.96626,"{'classify__alpha': 1, 'vect__ngram_range': (1..."
5,1.0,0.686747,0.958004,"{'classify__alpha': 1, 'vect__ngram_range': (1..."
6,1.0,0.018754,0.868449,"{'classify__alpha': 10, 'vect__ngram_range': (..."
7,0.0,0.0,0.865937,"{'classify__alpha': 10, 'vect__ngram_range': (..."
8,0.0,0.0,0.865937,"{'classify__alpha': 100, 'vect__ngram_range': ..."
9,0.0,0.0,0.865937,"{'classify__alpha': 100, 'vect__ngram_range': ..."


In [3]:
# Display the raw cross-validation results dictionary of the fitted grid search
# (fit/score timings and per-parameter arrays, among other entries).
grid.cv_results_

{'mean_fit_time': array([0.11860294, 0.2816813 , 0.11509743, 0.32600341, 0.12545919,
        0.28933582, 0.13083119, 0.33189764, 0.11768727, 0.30449648]),
 'std_fit_time': array([0.00456864, 0.00523117, 0.00513386, 0.03743566, 0.00722947,
        0.00738127, 0.01879092, 0.05950554, 0.00462623, 0.02876556]),
 'mean_score_time': array([0.08009405, 0.13556809, 0.07904463, 0.20110893, 0.08282952,
        0.14595118, 0.09577198, 0.14066978, 0.07826161, 0.14839211]),
 'std_score_time': array([0.00504522, 0.00431416, 0.00363729, 0.07554815, 0.00593123,
        0.00966799, 0.02175819, 0.00324467, 0.00283855, 0.01115585]),
 'param_classify__alpha': masked_array(data=[0.01, 0.01, 0.1, 0.1, 1, 1, 10, 10, 100, 100],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_vect__ngram_range': masked_array(data=[(1, 1), (1, 2), (1, 1), (1, 2), (1, 1), (1, 2), (1, 1),
                    