In [1]:
import warnings
warnings.filterwarnings('ignore')
import nltk
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
messages = pd.read_csv('smsspamcollection/SMSSpamCollection', sep ='\t', names=['labels', 'message'])

In [3]:
messages.head()

Unnamed: 0,labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
messages.head()

Unnamed: 0,labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
labels  = {'ham':0, 'spam':1}
messages['labels'] = [labels[item] for item in messages['labels']]

In [6]:
messages.head()

Unnamed: 0,labels,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
import string
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
token1 = RegexpTokenizer(r'[a-zA-Z]+' r'\w{1,}')
def text_process(mess):
    # Remove punc
    # Remove stop words
    
    nopunc = re.sub(",", '', ''.join(mess))
    nopunc = [char for char in nopunc.split() if char.lower() not in stopwords.words('english')]
    nopunc = [char for char in nopunc if char not in string.punctuation] 
    return ' '.join(token1.tokenize(' '.join(nopunc)))

In [8]:
messages['message'] = messages['message'].apply(text_process)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer , TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [10]:
vectorizer = CountVectorizer()
tfidf = TfidfTransformer()
nbmodel = MultinomialNB()

In [11]:
nb_params = {'vect__ngram_range':[(1,1),(1,2)], 'classify__alpha':[0.01,0.1,1,10,100]}

In [12]:
nb_pipeline = Pipeline([('vect', vectorizer), ('tf', tfidf),('classify', nbmodel)])

In [13]:
from sklearn.metrics import classification_report, roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix
scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}

In [19]:
import warnings
warnings.filterwarnings('ignore')
grid = GridSearchCV(nb_pipeline, nb_params, cv=5, scoring=scorers, return_train_score=False, refit='recall_score')
grid.fit(messages['message'], messages['labels'])

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip..._tf=False, use_idf=True)), ('classify', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vect__ngram_range': [(1, 1), (1, 2)], 'classify__alpha': [0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit='recall_score',
       return_train_score=False,
       scoring={'precision_score': make_scorer(precision_score), 'recall_score': make_scorer(recall_score), 'accuracy_score': make_scorer(accuracy_score)},
       verbose=0)

In [20]:
grid.best_score_

0.9210042703128939

In [21]:
grid.best_params_

{'classify__alpha': 0.01, 'vect__ngram_range': (1, 2)}

In [26]:
#pd.DataFrame(grid.cv_results_)[['rank_test_precision_score','mean_test_precision_score','rank_test_recall_score', 'mean_test_recall_score','rank_test_accuracy_score','mean_test_accuracy_score', 'params']]

In [24]:
pd.DataFrame(grid.cv_results_)[['mean_test_precision_score','mean_test_recall_score','mean_test_accuracy_score', 'params']]

Unnamed: 0,mean_test_precision_score,mean_test_recall_score,mean_test_accuracy_score,params
0,0.972573,0.896898,0.982771,"{'classify__alpha': 0.01, 'vect__ngram_range':..."
1,0.966899,0.921004,0.985104,"{'classify__alpha': 0.01, 'vect__ngram_range':..."
2,0.979434,0.891539,0.98295,"{'classify__alpha': 0.1, 'vect__ngram_range': ..."
3,0.984295,0.89824,0.984386,"{'classify__alpha': 0.1, 'vect__ngram_range': ..."
4,0.998246,0.749603,0.96626,"{'classify__alpha': 1, 'vect__ngram_range': (1..."
5,1.0,0.686747,0.958004,"{'classify__alpha': 1, 'vect__ngram_range': (1..."
6,1.0,0.018754,0.868449,"{'classify__alpha': 10, 'vect__ngram_range': (..."
7,0.0,0.0,0.865937,"{'classify__alpha': 10, 'vect__ngram_range': (..."
8,0.0,0.0,0.865937,"{'classify__alpha': 100, 'vect__ngram_range': ..."
9,0.0,0.0,0.865937,"{'classify__alpha': 100, 'vect__ngram_range': ..."


In [25]:
grid.cv_results_['params']

[{'classify__alpha': 0.01, 'vect__ngram_range': (1, 1)},
 {'classify__alpha': 0.01, 'vect__ngram_range': (1, 2)},
 {'classify__alpha': 0.1, 'vect__ngram_range': (1, 1)},
 {'classify__alpha': 0.1, 'vect__ngram_range': (1, 2)},
 {'classify__alpha': 1, 'vect__ngram_range': (1, 1)},
 {'classify__alpha': 1, 'vect__ngram_range': (1, 2)},
 {'classify__alpha': 10, 'vect__ngram_range': (1, 1)},
 {'classify__alpha': 10, 'vect__ngram_range': (1, 2)},
 {'classify__alpha': 100, 'vect__ngram_range': (1, 1)},
 {'classify__alpha': 100, 'vect__ngram_range': (1, 2)}]

In [27]:
grid.cv_results_

{'mean_fit_time': array([0.16774645, 0.31149945, 0.11636128, 0.2992053 , 0.13080578,
        0.35657535, 0.13272901, 0.31232347, 0.13143659, 0.30129819]),
 'std_fit_time': array([0.06288862, 0.01365031, 0.00428341, 0.00466977, 0.01932867,
        0.04282254, 0.0288405 , 0.02141032, 0.01832953, 0.00773289]),
 'mean_score_time': array([0.11158905, 0.16950288, 0.07887845, 0.14178877, 0.08575883,
        0.17143607, 0.08565078, 0.14625278, 0.08445215, 0.14316196]),
 'std_score_time': array([0.03912365, 0.03905329, 0.00135481, 0.00362036, 0.01073895,
        0.02672678, 0.0077588 , 0.00502931, 0.00237141, 0.00284932]),
 'param_classify__alpha': masked_array(data=[0.01, 0.01, 0.1, 0.1, 1, 1, 10, 10, 100, 100],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_vect__ngram_range': masked_array(data=[(1, 1), (1, 2), (1, 1), (1, 2), (1, 1), (1, 2), (1, 1),
                    