In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import string
import matplotlib.pyplot as plt

In [2]:
keggle = '/kaggle/input/spam-message-dataset/Spam/spam (1).tsv'
try:
    df = pd.read_csv('/kaggle/input/spam-message-dataset/Spam/spam (1).tsv', sep='\t', names=['Class', 'Message'])
except FileNotFoundError:
    df = pd.read_csv('spam.tsv', sep='\t', names=['Class', 'Message'])
df.head()

Unnamed: 0,Class,Message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5567 entries, 0 to 5566
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Class    5567 non-null   object
 1   Message  5567 non-null   object
dtypes: object(2)
memory usage: 87.1+ KB


In [4]:
df['Length'] = df['Message'].apply(len)

In [5]:
df.head()

Unnamed: 0,Class,Message,Length
0,ham,I've been searching for the right words to tha...,196
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,ham,"Nah I don't think he goes to usf, he lives aro...",61
3,ham,Even my brother is not like to speak with me. ...,77
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!,36


In [6]:
df.describe()

Unnamed: 0,Length
count,5567.0
mean,80.450153
std,59.891023
min,2.0
25%,36.0
50%,62.0
75%,122.0
max,910.0


In [7]:
df['Class'].value_counts()

Class
ham     4821
spam     746
Name: count, dtype: int64

### Text Preprocessing

In [8]:
# converting class column to int type where ham = 1 and spam = 0
df.loc[df['Class'] == 'ham', 'Class'] = 1

df.loc[df['Class'] == 'spam', 'Class'] = 0

df['Class'] = df['Class'].astype(int)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5567 entries, 0 to 5566
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Class    5567 non-null   int64 
 1   Message  5567 non-null   object
 2   Length   5567 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 130.6+ KB


In [9]:
df.head()

Unnamed: 0,Class,Message,Length
0,1,I've been searching for the right words to tha...,196
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,1,"Nah I don't think he goes to usf, he lives aro...",61
3,1,Even my brother is not like to speak with me. ...,77
4,1,I HAVE A DATE ON SUNDAY WITH WILL!!!,36


### Removing Punctuation

In [10]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
# creating the function for removing puntuation
def remove_punctuation(text):
    text = ''.join([char for char in text if char not in string.punctuation])
    return text

# test the function
s = 'data // science!|'
remove_punctuation(s)

'data  science'

In [12]:
# loop message column data and creating list
text = []
for i in df['Message']:
    t = remove_punctuation(i)
    text.append(t)

# add cleaned data to df
df['clean_text'] = text
df.head()

Unnamed: 0,Class,Message,Length,clean_text
0,1,I've been searching for the right words to tha...,196,Ive been searching for the right words to than...
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155,Free entry in 2 a wkly comp to win FA Cup fina...
2,1,"Nah I don't think he goes to usf, he lives aro...",61,Nah I dont think he goes to usf he lives aroun...
3,1,Even my brother is not like to speak with me. ...,77,Even my brother is not like to speak with me T...
4,1,I HAVE A DATE ON SUNDAY WITH WILL!!!,36,I HAVE A DATE ON SUNDAY WITH WILL


In [13]:
# Spliting the Data
x = df['clean_text']
y = df['Class']

# train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train.shape

(4453,)

### Using Bag of words techniqe to remove stop words

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
cv = CountVectorizer(stop_words="english")

In [15]:
x_train_cv = cv.fit_transform(x_train)
x_train.shape

(4453,)

In [16]:
import warnings
warnings.filterwarnings('ignore')
x = cv.get_feature_names_out()

print(x_train_cv.shape)

(4453, 8216)


### Traning the model

In [17]:
# Using Multinomial Model
nb = MultinomialNB()
nb.fit(x_train_cv, y_train)

In [18]:
# BOW on test data
x_test_cv = cv.transform(x_test)

In [19]:
y_pred1 = nb.predict(x_test_cv)
y_pred1

array([1, 1, 1, ..., 1, 1, 1])

In [20]:
print(classification_report(y_test, y_pred1))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       145
           1       0.99      0.99      0.99       969

    accuracy                           0.99      1114
   macro avg       0.98      0.98      0.98      1114
weighted avg       0.99      0.99      0.99      1114



In [21]:
print(accuracy_score(y_test, y_pred1))

0.9892280071813285


In [22]:
pd.crosstab(y_test, y_pred1)

col_0,0,1
Class,Unnamed: 1_level_1,Unnamed: 2_level_1
0,139,6
1,6,963


### Conclusion
#### A Multinomial Naive Bayes classifier was trained using Bag-of-Words features with English stopword removal to detect spam messages. The model achieved an accuracy of 98.9% on the test set, with strong precision and recall for both spam and non-spam classes. The confusion matrix indicates very few misclassifications, demonstrating that the model is highly effective at identifying spam messages while maintaining a low false positive rate. This result confirms that traditional NLP techniques combined with Naive Bayes provide a strong and efficient baseline for text classification tasks such as spam detection.

In [23]:
# Using Bernoli Model
bnb = BernoulliNB()
bnb.fit(x_train_cv, y_train)

In [24]:
y_pred2 = bnb.predict(x_test_cv)
y_pred2

array([1, 1, 1, ..., 1, 1, 1])

In [25]:
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.99      0.81      0.89       145
           1       0.97      1.00      0.99       969

    accuracy                           0.97      1114
   macro avg       0.98      0.90      0.94      1114
weighted avg       0.97      0.97      0.97      1114



In [26]:
print(accuracy_score(y_test, y_pred2))

0.973967684021544


In [27]:
pd.crosstab(y_test, y_pred2)

col_0,0,1
Class,Unnamed: 1_level_1,Unnamed: 2_level_1
0,117,28
1,1,968


### Conclusion 
#### A Bernoulli Naive Bayes classifier was trained using Bag-of-Words features with English stopword removal to detect spam messages based on the presence or absence of words. The model achieved an accuracy of 97.4% on the test set, with very high recall for spam messages, indicating strong capability in identifying most spam instances. However, the confusion matrix shows a higher number of false positives for non-spam messages compared to the Multinomial Naive Bayes model. This suggests that while Bernoulli Naive Bayes is effective for aggressive spam detection, it is less balanced in preserving legitimate messages. Overall, it provides a reasonable baseline but performs slightly weaker than Multinomial Naive Bayes for this text classification task.

## Final Conclusion
### In this project, two Naive Bayes variants—Multinomial Naive Bayes and Bernoulli Naive Bayes—were evaluated for SMS spam classification using Bag-of-Words features with English stopword removal. Both models demonstrated strong performance, confirming the effectiveness of probabilistic classifiers for text classification tasks.
### Multinomial Naive Bayes achieved higher overall accuracy (98.9%) with a better balance between precision and recall for both spam and non-spam classes, resulting in fewer false positives and false negatives. Bernoulli Naive Bayes, while achieving very high recall for spam detection, produced a higher number of false positives, incorrectly classifying more legitimate messages as spam.
### Overall, Multinomial Naive Bayes proved to be the more reliable and balanced model for this dataset, making it the preferred choice for deployment in real-world spam detection systems. This comparison highlights that while multiple Naive Bayes variants can perform well, selecting the appropriate model based on data representation (word frequency vs. word presence) is crucial for achieving optimal performance.