In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import joblib

In [3]:
# Read the raw SMS spam dataset. The file is decoded as latin-1
# (presumably it is not valid UTF-8 -- confirm against the data source).
spam_data = pd.read_csv(
    '../data/spam.csv',
    encoding="latin-1",
)
# Peek at three random rows; no random_state is given, so the rows shown
# change from run to run.
spam_data.sample(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
109,ham,Dont worry. I guess he's busy.,,,
3462,ham,i am seeking a lady in the street and a freak ...,,,
5512,ham,"Just making dinner, you ?",,,


In [4]:
# Show the full, untruncated text (column v2) of the row at position 2.
spam_data['v2'].iloc[2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [5]:
# List the distinct class labels that occur in the v1 column
spam_data['v1'].unique()

array(['ham', 'spam'], dtype=object)

In [6]:
# Add a numeric target column derived from v1: ham -> 0, spam -> 1.
# Any v1 value missing from the mapping would become NaN.
spam_data['label'] = spam_data.v1.map({
    'ham': 0,
    'spam': 1,
})
# Spot-check three random rows to confirm the new column is present.
spam_data.sample(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,label
477,ham,Tension ah?what machi?any problem?,,,,0
5145,ham,Aiyar u so poor thing... I give u my support k...,,,,0
4885,ham,Or just do that 6times,,,,0


In [7]:
# Features are the raw message text (v2); the target is the 0/1 label column.
X, y = spam_data['v2'], spam_data['label']

In [8]:
# Split the raw messages *before* vectorizing, so the vocabulary is learned
# from the training portion only. Fitting the vectorizer on the full corpus
# leaks test-set tokens into the features and inflates the held-out scores.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=2021
)

# Bag-of-words token counts. `X` is deliberately left untouched (still the raw
# text), so this cell can be re-run on its own without breaking.
cv = CountVectorizer()
X_train = cv.fit_transform(X_train_text)  # learn the vocabulary on train only
X_test = cv.transform(X_test_text)        # reuse it; unseen tokens are ignored

# Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out messages. The report
# already includes accuracy, so no separate clf.score(...) call is needed.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1594
           1       0.95      0.93      0.94       245

    accuracy                           0.98      1839
   macro avg       0.97      0.96      0.96      1839
weighted avg       0.98      0.98      0.98      1839



In [9]:
# Persist the fitted vectorizer together with the model: the classifier only
# understands count vectors built with this exact vocabulary, so it cannot
# score raw text at inference time without the matching CountVectorizer.
joblib.dump(cv, '../models/spam_vectorizer.pkl')

# Persist the model (kept last so the cell output is still the model path)
joblib.dump(clf, '../models/spam_detector_model.pkl')

['../models/spam_detector_model.pkl']