In [2]:
#Import Required Libraries

import pandas as pd
import numpy as np
import string
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle

In [3]:
#Load the Dataset

df = pd.read_csv("spam.csv", encoding='latin-1')[['v1', 'v2']]
df.columns = ['label', 'text']

In [4]:
#Preprocess the Text

nltk.download('stopwords')
from nltk.corpus import stopwords

def preprocess(text):
    text = text.lower()  # Make all letters lowercase
    text = ''.join([ch for ch in text if ch not in string.punctuation])  # Remove punctuation
    words = text.split()  # Split sentence into list of words
    words = [w for w in words if w not in stopwords.words('english')]  # Remove common words like "the", "is"
    return ' '.join(words)

df['clean_text'] = df['text'].apply(preprocess)

[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
#Convert Labels to Numbers

df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

In [6]:
#Convert Text to Numbers (TF-IDF Vectorization)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_text'])
y = df['label_num']

In [7]:
#Split Data for Training and Testing

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
#Train the Model

model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB()

In [9]:
#Test the Model

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9659192825112107
Confusion Matrix:
 [[965   0]
 [ 38 112]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.85       150

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1115

