# Spam Classification using "spam-or-not-spam-dataset"

This notebook downloads the spam-or-not-spam dataset, trains a spam detection model on it, evaluates the model, and lets you classify messages that you enter manually.

Author: Tanish Chawla  


In [None]:
# 1. Install Required Packages
!pip install kagglehub --quiet
!pip install pandas scikit-learn matplotlib seaborn --quiet

In [None]:
# 2. Import Libraries
import kagglehub
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [None]:
# 3. Download & Inspect Dataset
# kagglehub returns the local directory that holds the downloaded files;
# list its contents so the reader can see what is available to load.
path = kagglehub.dataset_download("ozlerhakan/spam-or-not-spam-dataset")
dataset_files = os.listdir(path)

print("Dataset directory:", path)
print("Files available:", dataset_files)

In [None]:
# 4. Load CSV Data
# os.listdir() returns entries in arbitrary order, so sort the candidates to
# make the choice reproducible if the directory ever holds several CSV files.
csv_files = sorted(f for f in os.listdir(path) if f.lower().endswith('.csv'))

# Fail with an explicit message instead of an IndexError on an empty list.
if not csv_files:
    raise FileNotFoundError(f"No .csv file found in dataset directory: {path}")

csv_file = csv_files[0]
df = pd.read_csv(os.path.join(path, csv_file))

# Last expression of the cell: rich preview of the first rows.
df.head()

In [None]:
# 5. Explore Dataset
# Quick sanity check: frame dimensions, column names, and how many rows
# carry each label value.
print("Shape:", df.shape)
print("Columns:", df.columns)

label_counts = df['label'].value_counts()
print(label_counts)

In [None]:
# 6. Visualize Label Distribution
# Bar chart of the number of rows per label value, annotated through the
# Axes object returned by pandas.
ax = df['label'].value_counts().plot(kind='bar')
ax.set_title('Label Distribution (Ham vs Spam)')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
plt.show()

In [None]:
# 7. Prepare Features and Encode Labels (Handle Missing Data)
# Target column, kept as loaded.
y = df['label']

# Missing email bodies become empty strings so every row is text.
X = df['email'].fillna('')

# Encode the label values as integer class ids.
# Intended meaning: 0 = Not Spam, 1 = Spam -- assumes the raw labels order
# that way; TODO confirm against le.classes_.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [None]:
# 8. Split Dataset for Training and Testing
# Hold out 20% of the rows for testing. Stratifying on the encoded labels
# keeps the class proportions the same in both parts, and the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    stratify=y_encoded,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 9. Vectorize Text Data (TF-IDF)
# English stop words are removed and terms appearing in more than 95% of the
# training documents are dropped.
vectorizer = TfidfVectorizer(max_df=0.95, stop_words='english')

# Learn the vocabulary and IDF weights from the training split only; the test
# split is transformed with those already-fitted statistics.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# 10. Train Multinomial Naive Bayes Classifier
# fit() returns the estimator itself, so construction and training can be
# chained; the bare name on the last line keeps the estimator as cell output.
clf = MultinomialNB().fit(X_train_vec, y_train)
clf

In [None]:
# 11. Evaluate the Model
# Score the classifier on the held-out test split.
y_pred = clf.predict(X_test_vec)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision/recall/F1; the classes are shown under their digit names.
report = classification_report(y_test, y_pred, target_names=['0', '1'])
print("\nClassification Report:\n", report)
print("NOTE: In this classification, '0' means Not Spam, '1' means Spam.")

In [None]:
# 12. Plot Confusion Matrix
# Counts of test samples per (actual, predicted) class pair, drawn as an
# annotated heatmap with the digit class names on both axes.
class_ticks = ['0', '1']
cm = confusion_matrix(y_test, y_pred)

ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_ticks,
    yticklabels=class_ticks,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix\n(Note: 0 = Not Spam, 1 = Spam)')
plt.show()

In [None]:
# 13. Predict on User Input

def predict_spam(message):
    """Classify one message with the notebook's fitted vectorizer and model.

    Relies on the module-level ``vectorizer`` and ``clf`` objects, so the
    vectorizing and training cells must have been run first.

    Parameters
    ----------
    message : str or None
        Raw email/message text. A missing value (``None`` or a float NaN) is
        treated as an empty string, mirroring the ``fillna('')`` applied to
        the training emails; any other non-string value is converted with
        ``str()``.

    Returns
    -------
    str
        ``"Not Spam (0)"`` when the model predicts class 0, ``"Spam (1)"``
        when it predicts class 1.

    Raises
    ------
    KeyError
        If the model predicts a class id other than 0 or 1.
    """
    # NaN is the only value that is not equal to itself.
    is_missing = message is None or (isinstance(message, float) and message != message)
    text = '' if is_missing else str(message)

    message_vec = vectorizer.transform([text])
    # Cast the NumPy integer to a plain int before using it as a dict key.
    pred = int(clf.predict(message_vec)[0])
    label_map = {0: "Not Spam (0)", 1: "Spam (1)"}
    return label_map[pred]

# input() needs an interactive frontend. Headless execution raises
# StdinNotImplementedError (a NotImplementedError subclass) and a closed stdin
# raises EOFError; treat both as "nothing entered" so that Restart & Run All
# can continue past this cell.
try:
    user_message = input("Enter your email/message text to classify: ")
except (EOFError, NotImplementedError):
    user_message = ""

if user_message.strip():
    prediction = predict_spam(user_message)
    print(f"Prediction: {prediction}")
else:
    # Classifying blank text would only reflect the class prior, so skip it
    # while still leaving `prediction` defined.
    prediction = None
    print("No message entered; skipping manual classification.")

### Optional: Sample Test Cases
Run the cell below to classify several predefined example messages.

In [None]:
# Hand-written example messages used to spot-check the trained classifier.
test_messages = [
    "Congratulations! You have won a free iPhone. Click here to claim now!!!",
    "Hi, just checking in about the meeting tomorrow at 10am.",
    "Get cheap loans at 0% interest! Limited offer for today only.",
    "Dear customer, your package will arrive tomorrow.",
    "URGENT! Your account has been compromised, reset your password immediately",
    "Can you send me the report by end of day? Thanks!",
    "WIN big prizes by entering our sweepstakes. Enter now!",
    "Your invoice for last month is attached.",
    "Don't miss out!!! Exclusive discount on all products, limited time offer.",
    "Let's catch up for lunch next week.",
]

# Echo each message, then the label predict_spam assigns to it.
for msg in test_messages:
    print(f"Message: {msg}")
    verdict = predict_spam(msg)
    print(f"Prediction: {verdict}\n")