<h2> Spam Classifier Using Naive Bayes <h2>

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Loading Data

df = pd.read_table('SMSSpamCollection', 
                   sep='\t', header=None, names=['label', 'message'])

df['label'] = df['label'].map({'spam':1, 'ham':0})

df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   int64 
 1   message  5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


In [4]:
# Train Test Split

x_train, x_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size=0.25)

In [5]:
# Multinomial models - Building Model

cv = CountVectorizer()

nb = MultinomialNB()

pipeline = Pipeline([('cv', cv),('nb', nb)])

params = {
    'nb__alpha':[0.8,0.85,0.9,0.95,1]
}

model = GridSearchCV(pipeline, params, cv=10)

In [6]:
# Training model

model.fit(x_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             param_grid={'nb__alpha': [0.8, 0.85, 0.9, 0.95, 1]})

In [7]:
# Best Params

model.best_params_

{'nb__alpha': 0.95}

In [8]:
# Evaluating Model

y_predict = model.predict(x_test)

print('Accuracy score: ', format(accuracy_score(y_test, y_predict)))
print('Precision score: ', format(precision_score(y_test, y_predict)))
print('Recall score: ', format(recall_score(y_test, y_predict)))
print('F1 score: ', format(f1_score(y_test, y_predict)))

Accuracy score:  0.9834888729361091
Precision score:  0.9772727272727273
Recall score:  0.900523560209424
F1 score:  0.9373297002724795
