In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [2]:
# Load the spam/ham message dataset (path is relative to the working directory)
df = pd.read_csv("../Dataset/spam.csv")

# Show the first rows, then the number of messages per category
print(df.head(n=5))
print(df["Category"].value_counts())

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
Category
ham     4825
spam     747
Name: count, dtype: int64


In [3]:
# Encode the label as an integer target: 1 where Category is 'spam', 0 otherwise.
# A vectorized comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the column dtype the same as the original apply() result.
df['spam'] = (df['Category'] == 'spam').astype('int64')
print(df.head())

  Category                                            Message  spam
0      ham  Go until jurong point, crazy.. Available only ...     0
1      ham                      Ok lar... Joking wif u oni...     0
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...     1
3      ham  U dun say so early hor... U c already then say...     0
4      ham  Nah I don't think he goes to usf, he lives aro...     0


In [4]:
# Features are the message text; the target is the 0/1 `spam` column
X, y = df['Message'], df['spam']

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Bag-of-words vectorization (feature engineering):
# fit the vocabulary on the training messages and build their count matrix.
vectorize = CountVectorizer()
X_train_vector = vectorize.fit_transform(X_train.values)
# Slice while still sparse so only the two displayed rows are densified,
# rather than converting the whole training matrix to a dense array.
print(X_train_vector[:2].toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [21]:
# Column indices with a non-zero count in the first training message.
# Take the first row while still sparse so only that row is densified.
np.where(X_train_vector[:1].toarray()[0] != 0)


(array([ 258,  354,  694, 1271, 2106, 2568, 2805, 3364, 5687, 5980, 5999,
        6460, 6773, 6888, 7396, 7437, 7471, 7474, 7555]),)

In [23]:
# Multinomial Naive Bayes classifier
model = MultinomialNB()
# Fit on the bag-of-words count matrix, with the 0/1 `spam` labels as the target
model.fit(X_train_vector,y_train)

In [24]:
# Encode the held-out messages with the already-fitted vectorizer
# (transform only, so the vocabulary is not refit), then predict their labels.
x_test_vector = vectorize.transform(X_test)
y_pred = model.predict(x_test_vector)

In [25]:
# Score the predictions against the held-out 0/1 labels
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))
print(
    "Classification Report:\n",
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=['ham', 'spam'],  # display names for labels 0 and 1
    ),
)

Accuracy: 0.9919282511210762
Confusion Matrix:
 [[966   0]
 [  9 140]]
Classification Report:
               precision    recall  f1-score   support

         ham       0.99      1.00      1.00       966
        spam       1.00      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



Train the model using an sklearn Pipeline to reduce the number of lines of code

In [26]:
from sklearn.pipeline import Pipeline

In [27]:
# Two hand-written example messages used to spot-check the fitted model
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]
# Encode the examples with the already-fitted vectorizer (transform only),
# then predict; the cell's last expression displays the predicted labels.
emails_count = vectorize.transform(emails)
model.predict(emails_count)

array([0, 1])

In [28]:
# Chain the bag-of-words vectorizer and the Naive Bayes classifier into a
# single estimator, so raw text can be passed directly to fit/predict.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])

In [29]:
# Fit the pipeline on the training split only (raw message text and 0/1 labels);
# the pipeline's vectorizer step converts the text to counts before the classifier.
clf.fit(X_train,y_train)

In [30]:
# Raw test messages are passed directly; the pipeline vectorizes them before predicting.
# NOTE: this rebinds `y_pred`, replacing the predictions made by the standalone model.
y_pred = clf.predict(X_test)

# No target_names are given here, so the report rows are labelled 0 and 1.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      1.00      1.00       966
           1       1.00      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

