In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import re
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
def remove_non_chars(text):
    lowercase_text = text.lower()
    return re.sub(r'[^\w\s]', '', lowercase_text)


df = pd.read_csv('spam.csv', encoding='latin-1')
df = df[['v1', 'v2']]  
df.columns = ['label', 'message']
df['message'] = df['message'].apply(remove_non_chars)


vectorizer = TfidfVectorizer(stop_words='english' , ngram_range=(1,2))
X = vectorizer.fit_transform(df['message'])

y = df['label'].map({'ham': 0, 'spam': 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
model = MultinomialNB(alpha=0.1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9282511210762332
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       965
           1       0.97      0.48      0.64       150

    accuracy                           0.93      1115
   macro avg       0.95      0.74      0.80      1115
weighted avg       0.93      0.93      0.92      1115

[[963   2]
 [ 78  72]]


In [4]:
sample_messages = [
    "Congratulations! You've won a free ticket to Bahamas. Text WIN to 12345 to claim.",
    "Hey, are we still meeting for coffee tomorrow?",
    "Urgent! Your account has been compromised. Click here to secure your information.",
    "I’m running late, will be there in 10 minutes."
]

sample_messages_transformed = vectorizer.transform(sample_messages)

predictions = model.predict(sample_messages_transformed)

predicted_labels = ['spam' if label == 1 else 'ham' for label in predictions]

for message, label in zip(sample_messages, predicted_labels):
    print(f"Message: '{message}'\nPredicted label: {label}\n")


Message: 'Congratulations! You've won a free ticket to Bahamas. Text WIN to 12345 to claim.'
Predicted label: spam

Message: 'Hey, are we still meeting for coffee tomorrow?'
Predicted label: ham

Message: 'Urgent! Your account has been compromised. Click here to secure your information.'
Predicted label: ham

Message: 'I’m running late, will be there in 10 minutes.'
Predicted label: ham

