In [1]:
# Build a spam filter using Python and the Naive Bayes algorithm.
# Split the sample data into training and test sets (use a suitable data set).
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [10]:
# Step 1: Load and preprocess the data
# The CSV is read with latin1 encoding; 'v1' holds the label and 'v2' the message text.
data = pd.read_csv("spam.csv",encoding="latin1")

# Convert labels to binary (0 for ham, 1 for spam).
# Series.map turns every value that is missing from the dict into NaN, so check
# right away that nothing was left unmapped instead of letting bad labels
# surface later during training.
data['v1'] = data['v1'].map({'ham': 0, 'spam': 1})
assert data['v1'].notna().all(), "unexpected label in column 'v1' (expected only 'ham' or 'spam')"

X = data['v2']  # features: raw message text
y = data['v1']  # target: 0 = ham, 1 = spam

data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,1,This is the 2nd time we have tried 2 contact u...,,,
5568,0,Will Ì_ b going to esplanade fr home?,,,
5569,0,"Pity, * was in mood for that. So...any other s...",,,
5570,0,The guy did some bitching but I acted like i'd...,,,
5571,0,Rofl. Its true to its name,,,


In [11]:
# Inspect the last five rows of the loaded frame.
data.tail()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,1,This is the 2nd time we have tried 2 contact u...,,,
5568,0,Will Ì_ b going to esplanade fr home?,,,
5569,0,"Pity, * was in mood for that. So...any other s...",,,
5570,0,The guy did some bitching but I acted like i'd...,,,
5571,0,Rofl. Its true to its name,,,


In [12]:
# List the column names: besides 'v1' (label) and 'v2' (text), the CSV carries
# three 'Unnamed: N' columns that the rest of this notebook does not use.
data.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [5]:
# Step 2: Hold out 20% of the messages for testing.
# The fixed random_state keeps the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Step 3: Turn the raw text into bag-of-words count vectors.
# The vocabulary is learned from the training messages only; both splits are
# then encoded with that same fitted vectorizer.
vectorizer = CountVectorizer().fit(X_train)
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [7]:
# Step 4: Train a multinomial Naive Bayes classifier on the training word counts.
nb_classifier = MultinomialNB()
nb_classifier.fit(X=X_train_vec, y=y_train)

In [8]:
# Step 5: Evaluate the classifier on the held-out test messages.
y_pred = nb_classifier.predict(X_test_vec)

# Overall fraction of test messages classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall and F1, printed on the lines below the heading.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Accuracy: 0.9838565022421525
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.89      0.94       150

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [13]:
# Step 6: Make predictions on new data
new_emails = [
    "Congratulations! You've won a prize. Click here to claim it now!",
    "Hi there, just checking in to see how you're doing.",
]

# Encode the new messages with the already-fitted vectorizer (transform only,
# no refitting), then classify them.
new_emails_vec = vectorizer.transform(new_emails)
predictions = nb_classifier.predict(new_emails_vec)

# Report each message with its predicted class: label 1 is printed as SPAM,
# any other label as HAM.
for email, prediction in zip(new_emails, predictions):
    message = (
        f"Prediction for '{email}': SPAM"
        if prediction == 1
        else f"Prediction for '{email}': HAM"
    )
    print(message)


Prediction for 'Congratulations! You've won a prize. Click here to claim it now!': SPAM
Prediction for 'Hi there, just checking in to see how you're doing.': HAM
