## SMS Spam Classifier: Multinomial Naive Bayes

The notebook is divided into the following sections:
1. Importing and preprocessing data
2. Building the model: Multinomial Naive Bayes
    - Model building 
    - Model evaluation

### 1. Importing and Preprocessing Data

In [18]:
import pandas as pd

# reading the training data
docs = pd.read_csv("bbc-text.csv")
docs.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [19]:
# number of SMSes / documents
len(docs)

2225

In [20]:
# counting categories in the data set
categories = docs.category.value_counts()
categories

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [21]:
# Import label encoder 
from sklearn import preprocessing 
  
# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 

docs['target']= label_encoder.fit_transform(docs['category']) 


In [22]:
docs.head()

Unnamed: 0,category,text,target
0,tech,tv future in the hands of viewers with home th...,4
1,business,worldcom boss left books alone former worldc...,0
2,sport,tigers wary of farrell gamble leicester say ...,3
3,sport,yeading face newcastle in fa cup premiership s...,3
4,entertainment,ocean s twelve raids box office ocean s twelve...,1


In [23]:
# we can now drop the column 'category'
docs = docs.drop('category', axis=1)
docs.head()

Unnamed: 0,text,target
0,tv future in the hands of viewers with home th...,4
1,worldcom boss left books alone former worldc...,0
2,tigers wary of farrell gamble leicester say ...,3
3,yeading face newcastle in fa cup premiership s...,3
4,ocean s twelve raids box office ocean s twelve...,1


In [24]:
# convert to X and y
X = docs.text
y = docs.target
print(X.shape)
print(y.shape)

(2225,)
(2225,)


In [25]:
# splitting into test and train
from sklearn.model_selection  import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [26]:
X_train.head()

1201    parmalat sues 45 banks over crash parmalat has...
977     jones happy with henson heroics wales fly-half...
985     cup holders man utd visit everton holders manc...
1105    gazprom  in $36m back-tax claim  the nuclear u...
330     yangtze electric s profits double yangtze elec...
Name: text, dtype: object

In [27]:
y_train.head()

1201    0
977     3
985     3
1105    0
330     0
Name: target, dtype: int64

In [28]:
# vectorizing the sentences; removing stop words
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(stop_words='english')

In [29]:
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [30]:
# printing the vocabulary
vect.vocabulary_

{'parmalat': 17172,
 'sues': 22694,
 '45': 789,
 'banks': 3010,
 'crash': 6329,
 'sued': 22692,
 'tries': 24009,
 'reclaim': 19159,
 'money': 15612,
 'paid': 17038,
 'scandal': 20589,
 'hit': 11556,
 'italian': 12869,
 'dairy': 6632,
 'company': 5602,
 'went': 25404,
 'bust': 4316,
 'year': 25881,
 'firm': 9605,
 'collapsed': 5454,
 'debts': 6796,
 '14bn': 186,
 'euros': 8825,
 '19bn': 389,
 '10bn': 86,
 'new': 16139,
 'boss': 3830,
 'enrico': 8601,
 'bondi': 3750,
 'taken': 23075,
 'legal': 13881,
 'action': 1492,
 'number': 16395,
 'lenders': 13921,
 'claims': 5193,
 'aware': 2804,
 'problems': 18354,
 'continued': 6005,
 'work': 25702,
 'earn': 8128,
 'commissions': 5566,
 'identified': 11983,
 'gone': 10651,
 'time': 23600,
 'law': 13786,
 'administrators': 1581,
 'seek': 20862,
 'financial': 9539,
 'institutions': 12540,
 'prior': 18315,
 'insolvency': 12500,
 'suspicion': 22897,
 'knew': 13485,
 'trouble': 24053,
 'said': 20413,
 'preparing': 18204,
 'suits': 22731,
 'according':

In [31]:
# vocab size
len(vect.vocabulary_.keys())

26018

In [32]:
# transforming the train and test datasets
X_train_transformed = vect.transform(X_train)
X_test_transformed = vect.transform(X_test)

In [33]:
# note that the type is transformed (sparse) matrix
print(type(X_train_transformed))
print(X_train_transformed)

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 58)	1
  (0, 86)	1
  (0, 186)	1
  (0, 389)	1
  (0, 405)	1
  (0, 668)	1
  (0, 789)	2
  (0, 833)	1
  (0, 1118)	1
  (0, 1421)	1
  (0, 1424)	1
  (0, 1492)	2
  (0, 1581)	1
  (0, 1722)	1
  (0, 1950)	1
  (0, 2037)	1
  (0, 2273)	1
  (0, 2703)	1
  (0, 2804)	1
  (0, 2933)	1
  (0, 2999)	2
  (0, 3010)	5
  (0, 3457)	1
  (0, 3728)	1
  (0, 3750)	1
  :	:
  (1667, 18448)	1
  (1667, 18543)	1
  (1667, 19188)	1
  (1667, 19854)	1
  (1667, 20413)	2
  (1667, 20713)	1
  (1667, 20714)	1
  (1667, 20897)	2
  (1667, 20902)	1
  (1667, 21586)	1
  (1667, 21778)	4
  (1667, 21851)	1
  (1667, 22174)	1
  (1667, 22499)	1
  (1667, 22827)	1
  (1667, 23080)	1
  (1667, 23468)	1
  (1667, 23566)	2
  (1667, 23600)	1
  (1667, 23953)	1
  (1667, 25297)	1
  (1667, 25358)	1
  (1667, 25361)	1
  (1667, 25580)	1
  (1667, 25881)	5


### 2. Building and Evaluating the Model

In [34]:
# training the NB model and making predictions
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()

# fit
mnb.fit(X_train_transformed,y_train)

# predict class
y_pred_class = mnb.predict(X_test_transformed)

# predict probabilities
y_pred_proba = mnb.predict_proba(X_test_transformed)


In [35]:
# note that alpha=1 is used by default for smoothing
mnb

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Model Evaluation

In [47]:
# printing the overall accuracy
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.9820466786355476

In [42]:
# evaluation metrics
from sklearn.metrics import classification_report,confusion_matrix

In [43]:
print(classification_report(y_test,y_pred_class))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       137
           1       1.00      1.00      1.00        89
           2       0.98      0.97      0.98       102
           3       0.99      0.99      0.99       129
           4       0.99      0.98      0.98       100

   micro avg       0.99      0.99      0.99       557
   macro avg       0.99      0.99      0.99       557
weighted avg       0.99      0.99      0.99       557



In [54]:
# Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows:
# We will be using the 'text_clf' going forward.
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])

text_clf = text_clf.fit(X_train, y_train)

In [55]:
#predict the categories
y_pred_class = text_clf.predict(X_test)


In [56]:
# printing the overall accuracy
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)
print(classification_report(y_test,y_pred_class))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       137
           1       0.99      0.99      0.99        89
           2       0.94      0.97      0.96       102
           3       1.00      0.99      1.00       129
           4       0.99      0.97      0.98       100

   micro avg       0.98      0.98      0.98       557
   macro avg       0.98      0.98      0.98       557
weighted avg       0.98      0.98      0.98       557



Without stop words the accuracy of the model has increased

In [60]:
# Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows:
# We will be using the 'text_clf' going forward.

text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),('clf', MultinomialNB())])

text_clf = text_clf.fit(X_train, y_train)

#predict the categories
y_pred_class = text_clf.predict(X_test)

metrics.accuracy_score(y_test, y_pred_class)
print(classification_report(y_test,y_pred_class))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       137
           1       1.00      1.00      1.00        89
           2       0.98      0.97      0.98       102
           3       0.99      0.99      0.99       129
           4       0.99      0.98      0.98       100

   micro avg       0.99      0.99      0.99       557
   macro avg       0.99      0.99      0.99       557
weighted avg       0.99      0.99      0.99       557



In [61]:
from sklearn.feature_extraction.text import TfidfTransformer
# Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows:
# We will be using the 'text_clf' going forward.

text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()),('clf', MultinomialNB())])

text_clf = text_clf.fit(X_train, y_train)

#predict the categories
y_pred_class = text_clf.predict(X_test)

metrics.accuracy_score(y_test, y_pred_class)
print(classification_report(y_test,y_pred_class))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99       137
           1       1.00      0.98      0.99        89
           2       0.96      0.97      0.97       102
           3       0.98      0.99      0.99       129
           4       0.99      0.95      0.97       100

   micro avg       0.98      0.98      0.98       557
   macro avg       0.98      0.98      0.98       557
weighted avg       0.98      0.98      0.98       557



In [None]:
TfidfTransformer reduced the accuracy of the model

In [71]:
from sklearn.feature_extraction.text import TfidfTransformer
# Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows:
# We will be using the 'text_clf' going forward.

text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()),('clf', MultinomialNB())])

text_clf = text_clf.fit(X_train, y_train)

#predict the categories
y_pred_class = text_clf.predict(X_test)

print("Accuracy_Score", metrics.accuracy_score(y_test, y_pred_class))
print(classification_report(y_test,y_pred_class))

Accuracy_Score 0.9802513464991023
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       137
           1       1.00      0.98      0.99        89
           2       0.96      0.97      0.97       102
           3       0.98      0.99      0.99       129
           4       0.99      0.95      0.97       100

   micro avg       0.98      0.98      0.98       557
   macro avg       0.98      0.98      0.98       557
weighted avg       0.98      0.98      0.98       557



In [72]:
# Training Support Vector Machines - SVM and calculating its performance

from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer()), ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=5, random_state=42))])

text_clf_svm = text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)

#predict the categories
y_pred_class = text_clf.predict(X_test)

print("Accuracy_Score", metrics.accuracy_score(y_test, y_pred_class))
print(classification_report(y_test,y_pred_class))



Accuracy_Score 0.9802513464991023
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       137
           1       1.00      0.98      0.99        89
           2       0.96      0.97      0.97       102
           3       0.98      0.99      0.99       129
           4       0.99      0.95      0.97       100

   micro avg       0.98      0.98      0.98       557
   macro avg       0.98      0.98      0.98       557
weighted avg       0.98      0.98      0.98       557



In [64]:
# We will now perform Grid Search
# Here, we are creating a list of parameters for which we would like to do performance tuning. 

from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}

In [65]:
# Next, we create an instance of the grid search by passing the classifier, parameters 

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)



In [66]:
# To see the best mean score and the params, run the following code

gs_clf.best_score_
gs_clf.best_params_



{'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}

In [73]:
# lets try to use stemmer and then check the accuracy of the model

import nltk
nltk.download()

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
    
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect),
                             ('mnb', MultinomialNB(fit_prior=False))])

text_mnb_stemmed = text_mnb_stemmed.fit(X_train, y_train)

predicted_mnb_stemmed = text_mnb_stemmed.predict(X_test)



showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
Accuracy_Score 0.9910233393177738
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       137
           1       1.00      0.98      0.99        89
           2       0.96      0.97      0.97       102
           3       0.98      0.99      0.99       129
           4       0.99      0.95      0.97       100

   micro avg       0.98      0.98      0.98       557
   macro avg       0.98      0.98      0.98       557
weighted avg       0.98      0.98      0.98       557



In [74]:
print("Accuracy_Score", metrics.accuracy_score(y_test, predicted_mnb_stemmed))
print(classification_report(y_test,predicted_mnb_stemmed))

Accuracy_Score 0.9910233393177738
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       137
           1       1.00      1.00      1.00        89
           2       0.98      0.98      0.98       102
           3       1.00      0.99      1.00       129
           4       0.99      0.99      0.99       100

   micro avg       0.99      0.99      0.99       557
   macro avg       0.99      0.99      0.99       557
weighted avg       0.99      0.99      0.99       557

