# Spam Email Detection using NLP and Machine Learning

This notebook provides a complete end-to-end analysis for spam email detection, including:
- Exploratory Data Analysis
- Text Preprocessing
- Feature Extraction (TF-IDF)
- Model Training and Evaluation
- Visualizations
- Model Comparison


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides scikit-learn convergence warnings and pandas
# deprecation notices; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Set style for better visualizations
# NOTE(review): the 'seaborn-v0_8-*' style names appear to require
# Matplotlib 3.6 or newer -- confirm against the pinned environment.
plt.style.use('seaborn-v0_8-darkgrid')
# Applied after plt.style.use so the seaborn palette sets the colour cycle.
sns.set_palette("husl")

# Machine Learning libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)

# NLP libraries
import nltk
import re
import string
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tag import pos_tag

# Download NLTK data
# Each entry pairs the lookup path probed with nltk.data.find and the package
# name handed to nltk.download when that lookup raises LookupError.
nltk_resources = [
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
]

for resource_path, package_name in nltk_resources:
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Resource is not installed locally yet: fetch it without progress output.
        nltk.download(package_name, quiet=True)

print("All libraries imported successfully!")


## 1. Load Dataset


In [None]:
# Load the dataset
# Reads the CSV at data_path into `df`. When the file is missing, a small
# synthetic frame with the same two columns is built so the rest of the
# notebook can still be executed end to end.
data_path = '../data/emails.csv'
required_columns = ['label', 'text']

try:
    # Only the read itself is guarded; nothing else here can raise FileNotFoundError.
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"Error: Dataset not found at {data_path}")
    print("Please ensure your dataset is in the data/ directory with columns 'label' and 'text'")
    print("\nCreating sample data for demonstration...")
    # Create sample data for demonstration
    # NOTE: the five rows are repeated 100 times, so train and test splits
    # built from this fallback contain identical texts.
    sample_data = {
        'label': ['spam', 'ham', 'spam', 'ham', 'spam'] * 100,
        'text': [
            'Congratulations! You won a $1000 lottery. Claim now!',
            'Hey, can we meet tomorrow for lunch?',
            'URGENT: Click here to claim your prize!',
            'Thanks for the meeting today. See you next week.',
            'Free money! No deposit required. Click now!'
        ] * 100
    }
    df = pd.DataFrame(sample_data)
    print("Sample dataset created for demonstration.")
else:
    # Fail early with a clear message instead of a KeyError in a later cell.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(
            f"Dataset at {data_path} is missing required column(s): {missing_columns}"
        )
    print("Dataset loaded successfully!")
    print(f"Shape: {df.shape}")
    print("\nFirst few rows:")
    # A bare `df.head()` inside a try/else body is not the cell's last
    # expression and would display nothing, so print it explicitly.
    print(df.head())


## 2. Exploratory Data Analysis (EDA)


In [None]:
# Check dataset info
# Prints the row count, column names, per-column null counts and dtypes of `df`.
rule = "=" * 50
print("Dataset Information:", rule, sep="\n")
print(f"Total samples: {len(df)}")
print(f"Columns: {df.columns.tolist()}")

for heading, summary in (
    ("\nMissing values:", df.isnull().sum()),
    ("\nData types:", df.dtypes),
):
    print(heading)
    print(summary)


In [None]:
# Standardize labels
# Text labels ('spam' / 'ham', any case, surrounding whitespace) are encoded
# as 1 / 0. The mapping is skipped when the column is already numeric, so
# re-running this cell (or loading a pre-encoded CSV) does not call the
# string accessor on numbers.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].str.lower().str.strip().map({'spam': 1, 'ham': 0})

# Remove rows with missing values (including labels the mapping did not
# recognise, which become NaN), then store the label as a plain integer:
# NaNs would otherwise leave the column as float 1.0 / 0.0.
df = df.dropna(subset=['label', 'text']).astype({'label': int})

total_rows = len(df)
if total_rows == 0:
    # Without this guard the percentage lines below fail with ZeroDivisionError.
    raise ValueError(
        "No rows left after cleaning: expected 'spam'/'ham' labels and non-missing text."
    )

# Check label distribution
print("Label Distribution:")
print("=" * 50)
label_counts = df['label'].value_counts()
spam_count = label_counts.get(1, 0)
ham_count = label_counts.get(0, 0)
print(f"Spam (1): {spam_count}")
print(f"Ham (0): {ham_count}")
print("\nPercentage:")
print(f"Spam: {spam_count / total_rows * 100:.2f}%")
print(f"Ham: {ham_count / total_rows * 100:.2f}%")


In [None]:
# Visualize label distribution
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# value_counts() orders rows by frequency, not by label, so the series is
# re-indexed to a fixed ham(0), spam(1) order and given readable names.
# Without this the 'Ham'/'Spam' captions are swapped whenever spam is the
# majority class.
class_names = {0: 'Ham', 1: 'Spam'}
ordered_counts = (
    label_counts
    .reindex(list(class_names), fill_value=0)
    .rename(index=class_names)
)
class_colors = ['#3498db', '#e74c3c']  # ham = blue, spam = red

# Bar chart (tick labels come from the renamed index)
ordered_counts.plot(kind='bar', ax=axes[0], color=class_colors, rot=0)
axes[0].set_title('Label Distribution (Bar Chart)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Label', fontsize=12)
axes[0].set_ylabel('Count', fontsize=12)
axes[0].grid(axis='y', alpha=0.3)

# Pie chart (wedge labels come from the renamed index)
ordered_counts.plot(kind='pie', ax=axes[1], autopct='%1.1f%%',
                    colors=class_colors, startangle=90)
axes[1].set_title('Label Distribution (Pie Chart)', fontsize=14, fontweight='bold')
axes[1].set_ylabel('')
axes[1].legend(ordered_counts.index.tolist(), loc='upper right')

plt.tight_layout()
plt.show()


In [None]:
# Analyze text length statistics
# Adds two per-email size columns to `df`: the raw character count and the
# number of whitespace-separated tokens.
email_text = df['text']
df['text_length'] = email_text.str.len()
df['word_count'] = email_text.str.split().str.len()

print("Text Statistics:")
print("=" * 50)
# Summary statistics of each size column, broken down by label
for heading, column in (
    ("\nCharacter Length:", 'text_length'),
    ("\nWord Count:", 'word_count'),
):
    print(heading)
    print(df.groupby('label')[column].describe())


In [None]:
# Visualize text length distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Row masks for the two classes
spam_rows = df['label'] == 1
ham_rows = df['label'] == 0

# Character length per class
spam_lengths = df[spam_rows]['text_length']
ham_lengths = df[ham_rows]['text_length']

# Word count per class
spam_words = df[spam_rows]['word_count']
ham_words = df[ham_rows]['word_count']

# One histogram panel per size measure; ham is always passed first so it
# lines up with the label and colour lists.
panels = (
    (axes[0], ham_lengths, spam_lengths,
     'Text Length Distribution (Characters)', 'Character Count'),
    (axes[1], ham_words, spam_words,
     'Word Count Distribution', 'Word Count'),
)

for panel, ham_values, spam_values, panel_title, x_label in panels:
    panel.hist([ham_values, spam_values], bins=50, alpha=0.7,
               label=['Ham', 'Spam'], color=['#3498db', '#e74c3c'])
    panel.set_title(panel_title, fontsize=14, fontweight='bold')
    panel.set_xlabel(x_label, fontsize=12)
    panel.set_ylabel('Frequency', fontsize=12)
    panel.legend()
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Sample emails from each class
# First three texts of each label, in frame order
spam_samples = df[df['label'] == 1]['text'].head(3)
ham_samples = df[df['label'] == 0]['text'].head(3)

sections = (
    ("Sample Spam Emails:", spam_samples),
    ("\n\nSample Ham Emails:", ham_samples),
)

for heading, samples in sections:
    print(heading)
    print("=" * 50)
    for i, email in enumerate(samples, 1):
        print(f"\n{i}. {email}")


## 3. Text Preprocessing


In [None]:
# Import preprocessing module
import sys

# Make the project's src/ directory importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
src_dir = '../src'
if src_dir not in sys.path:
    sys.path.append(src_dir)

from preprocess import TextPreprocessor

# Initialize preprocessor
# NOTE(review): use_stemming=False presumably selects lemmatization over
# stemming -- confirm in src/preprocess.py.
preprocessor = TextPreprocessor(use_stemming=False)

print("Preprocessor initialized!")


In [None]:
# Example of preprocessing
# Runs one hand-written message through the preprocessor and shows it
# before and after.
sample_text = "Congratulations! You won a $1000 lottery. Visit http://example.com/claim or email us at winner@example.com <html>Click here</html>"
print("Original Text:", sample_text, sep="\n")

separator = "\n" + "=" * 50
print(separator)
print("Preprocessed Text:")
cleaned = preprocessor.preprocess_text(sample_text)
print(cleaned)


In [None]:
# Preprocess all emails
# The raw texts are handed to the preprocessor as a plain list and the
# result is stored next to the original in a new 'cleaned_text' column.
print("Preprocessing all emails...")
raw_emails = df['text'].tolist()
df['cleaned_text'] = preprocessor.preprocess_batch(raw_emails)
print("Preprocessing complete!")

print("\nSample cleaned text:")
before_after = df[['text', 'cleaned_text']]
print(before_after.head())


## 4. Feature Extraction (TF-IDF)


In [None]:
# Initialize TF-IDF Vectorizer
max_features = 5000       # cap on the vocabulary size
ngram_range = (1, 2)      # Unigrams and bigrams

# Collect the settings in one place, then build the (still unfitted) vectorizer
tfidf_settings = {
    'max_features': max_features,
    'ngram_range': ngram_range,
    'stop_words': 'english',
}
vectorizer = TfidfVectorizer(**tfidf_settings)

print("TF-IDF Vectorizer initialized with:")
print(f"  - Max features: {max_features}")
print(f"  - N-gram range: {ngram_range}")


In [None]:
# Split data into training and testing sets
# 80/20 split with a fixed seed, stratified on the label so both parts keep
# the same spam/ham ratio.
X, y = df['cleaned_text'], df['label']

split_parts = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

for split_name, split_labels in (("Training", y_train), ("Test", y_test)):
    print(f"\n{split_name} set label distribution:")
    print(split_labels.value_counts())


In [None]:
# Extract TF-IDF features
print("Extracting TF-IDF features...")

# The vectorizer is fitted on the training texts only; the test texts are
# merely transformed with that fitted vocabulary.
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

print("Feature extraction complete!")
for split_name, matrix in (("Training", X_train_features), ("Test", X_test_features)):
    print(f"{split_name} features shape: {matrix.shape}")

# Presence of a .toarray() method is used as the sparsity indicator
has_toarray = hasattr(X_train_features, 'toarray')
print(f"\nFeature matrix is sparse: {has_toarray}")


## 5. Model Training


In [None]:
# Initialize models
# Three unfitted classifiers keyed by display name; insertion order is the
# order used for training, evaluation and plotting later on.
models = dict(
    MultinomialNB=MultinomialNB(),
    SVM=LinearSVC(random_state=42, max_iter=1000),
    LogisticRegression=LogisticRegression(random_state=42, max_iter=1000),
)

print("Models initialized:")
for name in models:
    model = models[name]
    print(f"  - {name}: {type(model).__name__}")


In [None]:
# Train all models
# Every classifier in `models` is fitted in place on the TF-IDF training matrix.
divider = "=" * 50
print("Training models...")
print(divider)

for name in models:
    model = models[name]
    print(f"\nTraining {name}...")
    model.fit(X_train_features, y_train)
    print(f"{name} training complete!")

print("\n" + divider)
print("All models trained successfully!")


## 6. Model Evaluation


In [None]:
# Evaluate all models
# For each fitted model: predict on the test matrix, compute accuracy and
# support-weighted precision / recall / F1, and keep everything in `results`
# (keyed by model name) for the comparison table and plots below.
results = {}

# The label order is pinned so the confusion matrix is always 2x2 in
# (ham, spam) order and the report names always match, even if a class is
# absent from y_test or from a model's predictions.
report_labels = [0, 1]
report_names = ['Ham', 'Spam']
banner = '=' * 60

for name, model in models.items():
    print(f"\n{banner}")
    print(f"Evaluating {name}")
    print(f"{banner}")

    # Make predictions
    y_pred = model.predict(X_test_features)

    # Calculate metrics (weighted by class support)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Store results
    results[name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'confusion_matrix': confusion_matrix(y_test, y_pred, labels=report_labels),
        'predictions': y_pred
    }

    # Print metrics
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-Score:  {f1:.4f}")

    # Classification report
    print("\nClassification Report:")
    print(classification_report(
        y_test, y_pred, labels=report_labels, target_names=report_names
    ))


In [None]:
# Create results comparison DataFrame
# One row per model; the column order is fixed by `column_for_key`.
column_for_key = {
    'accuracy': 'Accuracy',
    'precision': 'Precision',
    'recall': 'Recall',
    'f1_score': 'F1-Score',
}

score_rows = [
    {'Model': model_name,
     **{column: scores[key] for key, column in column_for_key.items()}}
    for model_name, scores in results.items()
]
results_df = pd.DataFrame(score_rows, columns=['Model', *column_for_key.values()])

print("Model Comparison:")
print("=" * 60)
print(results_df.to_string(index=False))


## 7. Visualizations


In [None]:
# Plot confusion matrices
# One heatmap per evaluated model, laid out side by side.
n_models = len(models)
fig, axes = plt.subplots(1, n_models, figsize=(5*n_models, 4))

# With a single subplot matplotlib hands back a bare Axes; wrap it so the
# indexing below works for any number of models.
axes = [axes] if n_models == 1 else axes

class_ticks = ['Ham', 'Spam']
heatmap_options = dict(
    annot=True, fmt='d', cmap='Blues',
    xticklabels=class_ticks,
    yticklabels=class_ticks,
)

for idx, name in enumerate(results):
    panel = axes[idx]
    sns.heatmap(results[name]['confusion_matrix'], ax=panel, **heatmap_options)
    panel.set_title(f'{name} Confusion Matrix', fontsize=12, fontweight='bold')
    panel.set_ylabel('Actual', fontsize=11)
    panel.set_xlabel('Predicted', fontsize=11)

plt.tight_layout()
plt.show()


In [None]:
# Model comparison bar chart
# 2x2 grid with one panel per metric; each panel has one bar per model.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
metric_keys = ['accuracy', 'precision', 'recall', 'f1_score']
models_list = list(results.keys())
bar_colors = ['#3498db', '#e74c3c', '#2ecc71']

# axes.flat walks the grid row by row, matching the order of `metrics`
for ax, metric, key in zip(axes.flat, metrics, metric_keys):
    values = [results[model_name][key] for model_name in models_list]

    bars = ax.bar(models_list, values, color=bar_colors)
    ax.set_title(f'{metric} Comparison', fontsize=12, fontweight='bold')
    ax.set_ylabel(metric, fontsize=11)
    ax.set_ylim([0, 1])
    ax.grid(axis='y', alpha=0.3)

    # Add value labels on bars: centred horizontally, just above the bar top
    for bar, val in zip(bars, values):
        label_x = bar.get_x() + bar.get_width()/2
        label_y = bar.get_height() + 0.01
        ax.text(label_x, label_y, f'{val:.4f}',
                ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()


In [None]:
# Single accuracy comparison chart
# Drawn through an explicit Axes object rather than the pyplot state machine.
acc_fig, acc_ax = plt.subplots(figsize=(10, 6))
models_list = list(results.keys())
accuracies = [results[model_name]['accuracy'] for model_name in models_list]

bars = acc_ax.bar(models_list, accuracies, color=['#3498db', '#e74c3c', '#2ecc71'])
acc_ax.set_xlabel('Model', fontsize=12)
acc_ax.set_ylabel('Accuracy', fontsize=12)
acc_ax.set_title('Model Comparison - Accuracy Scores', fontsize=14, fontweight='bold')
acc_ax.set_ylim([0, 1])
acc_ax.grid(axis='y', alpha=0.3)

# Add value labels on bars: centred horizontally, just above the bar top
for bar, acc in zip(bars, accuracies):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 0.01
    acc_ax.text(label_x, label_y, f'{acc:.4f}',
                ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()


## 8. Save Best Model


In [None]:
# Find best model
# The model with the highest test accuracy wins; on a tie the one that was
# evaluated first is kept.
best_model_name, best_scores = max(
    results.items(), key=lambda entry: entry[1]['accuracy']
)
best_model = models[best_model_name]

print(f"Best Model: {best_model_name}")
print(f"Accuracy: {best_scores['accuracy']:.4f}")


In [None]:
# Save model and vectorizer
import joblib
import os

# Create models directory if it doesn't exist
os.makedirs('../models', exist_ok=True)

# Target files for the three artefacts needed to score new emails
model_path = '../models/spam_model.pkl'
vectorizer_path = '../models/tfidf_vectorizer.pkl'
preprocessor_path = '../models/preprocessor.pkl'

# Persisted in this order: best model, vectorizer, preprocessor
artifacts = (
    ("Model", best_model, model_path),
    ("Vectorizer", vectorizer, vectorizer_path),
    ("Preprocessor", preprocessor, preprocessor_path),
)

for artifact_label, artifact, target_path in artifacts:
    joblib.dump(artifact, target_path)
    print(f"{artifact_label} saved to {target_path}")

print("\nAll components saved successfully!")


## 9. Test Predictions


In [None]:
# Test predictions on sample emails
# Each message goes through the same three steps used for training:
# preprocess -> TF-IDF transform -> predict with the selected best model.
test_emails = [
    "Congratulations! You won a lottery. Claim your prize now!",
    "Hey, can we meet tomorrow for lunch?",
    "URGENT: Free money! No deposit required. Click here!",
    "Thanks for the meeting today. See you next week.",
    "You've been selected! Claim your $1000 prize immediately!"
]

print("Testing Predictions:")
print("=" * 60)

# Not every estimator exposes predict_proba (it is looked up rather than
# assumed), so check once up front.
supports_proba = hasattr(best_model, 'predict_proba')

for email in test_emails:
    # Preprocess
    cleaned = preprocessor.preprocess_text(email)

    # Transform
    features = vectorizer.transform([cleaned])

    # Predict
    prediction = best_model.predict(features)[0]
    label = 'Spam' if prediction == 1 else 'Ham'

    # Get probability if available. None (not a string sentinel) marks
    # "unavailable", which avoids comparing a numpy float against text.
    spam_prob = None
    if supports_proba:
        prob = best_model.predict_proba(features)[0]
        # predict_proba columns follow best_model.classes_, so locate the
        # spam (1) column by value instead of assuming its position.
        known_classes = list(best_model.classes_)
        spam_prob = prob[known_classes.index(1)] if 1 in known_classes else 0.0

    print(f"\nEmail: {email}")
    print(f"Prediction: {label}")
    if spam_prob is not None:
        print(f"Spam Probability: {spam_prob:.4f}")
    print("-" * 60)


## Summary

This notebook demonstrated a complete spam email detection pipeline:

1. **Data Loading**: Loaded and explored the email dataset
2. **Preprocessing**: Cleaned text by removing HTML, URLs, emails, punctuation, and applying lemmatization
3. **Feature Extraction**: Converted text to TF-IDF features with n-grams
4. **Model Training**: Trained three models (Naive Bayes, SVM, Logistic Regression)
5. **Evaluation**: Compared models using multiple metrics
6. **Visualization**: Created confusion matrices and comparison charts
7. **Model Saving**: Saved the best model, the fitted TF-IDF vectorizer, and the preprocessor to `../models/` for later reuse

The system is now ready for deployment!
