<a href="https://colab.research.google.com/github/muntasir-islam/Beginner-Friendly-WordPress-Design-Learning-Path/blob/main/Spam_Email_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Upload and Import data

In [3]:
import pandas as pd
dataset = pd.read_csv('emails.csv')
print(dataset.head())


                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1


In [12]:
# ==============================
# Email Spam Detector — venky73 Spam Mails Dataset
# ==============================

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Download NLTK stopwords
nltk.download('stopwords')

# ------------------------------
# 1. Load Dataset
# ------------------------------
df = pd.read_csv("emails.csv")  # make sure filename matches

print("Columns in dataset:", df.columns)
print(df.head())

# ------------------------------
# 2. Rename columns to standard format
# ------------------------------
# This dataset has: 'text' (email content) and 'spam' (0/1)
df = df[['text', 'spam']].copy()
df.columns = ['text', 'label']   # rename for consistency
print("Renamed columns:", df.columns)

# ------------------------------
# 3. Preprocess Text
# ------------------------------
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = str(text)
    text = re.sub('[^a-zA-Z]', ' ', text)   # keep only letters
    words = text.lower().split()
    words = [ps.stem(w) for w in words if w not in stop_words]
    return ' '.join(words)

df['cleaned_text'] = df['text'].apply(clean_text)

# ------------------------------
# 4. Features & Labels
# ------------------------------
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text']).toarray()

y = df['label'].values  # already 0 (ham) and 1 (spam)

# ------------------------------
# 5. Train-Test Split
# ------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ------------------------------
# 6. Logistic Regression
# ------------------------------
print("\n===== Logistic Regression =====")
ml_model = LogisticRegression(max_iter=200)
ml_model.fit(X_train, y_train)
y_pred_ml = ml_model.predict(X_test)

print("ML Accuracy:", accuracy_score(y_test, y_pred_ml))
print("ML Report:\n", classification_report(y_test, y_pred_ml))

# ------------------------------
# 7. ANN Model
# ------------------------------
print("\n===== ANN Model =====")
input_dim = X_train.shape[1]

ann_model = Sequential()
ann_model.add(Dense(128, input_dim=input_dim, activation='relu'))
ann_model.add(Dropout(0.3))
ann_model.add(Dense(64, activation='relu'))
ann_model.add(Dropout(0.3))
ann_model.add(Dense(1, activation='sigmoid'))

ann_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
ann_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1, verbose=1)

loss, acc = ann_model.evaluate(X_test, y_test, verbose=0)
print("ANN Test Accuracy:", acc)

# ------------------------------
# 8. Prediction Function
# ------------------------------
def predict_email(text):
    ct = clean_text(text)
    vec = vectorizer.transform([ct]).toarray()
    ml_pred = ml_model.predict(vec)[0]
    ann_pred = ann_model.predict(vec)[0][0]

    print("\nEmail:", text)
    print("ML Prediction:", "Spam" if ml_pred == 1 else "Ham")
    print("ANN Prediction:", "Spam" if ann_pred > 0.5 else "Ham")

# Test with custom emails
predict_email("Congratulations! You've won a free $1000 gift card. Click below!")
predict_email("Hey John, are we still meeting for coffee tomorrow?")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Columns in dataset: Index(['text', 'spam'], dtype='object')
                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1
Renamed columns: Index(['text', 'label'], dtype='object')

===== Logistic Regression =====
ML Accuracy: 0.9781849912739965
ML Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       856
           1       0.99      0.93      0.96       290

    accuracy                           0.98      1146
   macro avg       0.98      0.96      0.97      1146
weighted avg       0.98      0.98      0.98      1146


===== ANN Model =====


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.7945 - loss: 0.4405 - val_accuracy: 0.9891 - val_loss: 0.0345
Epoch 2/5
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9930 - loss: 0.0237 - val_accuracy: 0.9956 - val_loss: 0.0144
Epoch 3/5
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 1.0000 - loss: 0.0028 - val_accuracy: 0.9978 - val_loss: 0.0101
Epoch 4/5
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 1.0000 - loss: 8.8794e-04 - val_accuracy: 0.9956 - val_loss: 0.0098
Epoch 5/5
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 1.0000 - loss: 4.4342e-04 - val_accuracy: 0.9956 - val_loss: 0.0107
ANN Test Accuracy: 0.9912739992141724
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step

Email: Congratulations! You've won a free $1000 gift card. Click below!
ML 

In [19]:
# ==============================
# EASY Email Spam Detector
# ==============================

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# 1. Load dataset
df = pd.read_csv("emails.csv")   # dataset has "text" and "spam" (0/1)
df = df[['text', 'spam']]
df.columns = ['text', 'label']

# 2. Text Cleaning
def clean_text(text):
    text = re.sub('[^a-zA-Z]', ' ', str(text))  # remove numbers/punctuation
    words = text.lower().split()
    words = [w for w in words if w not in stop_words]  # remove stopwords
    return " ".join(words)

df['cleaned_text'] = df['text'].apply(clean_text)

# 3. Features (X) and Labels (y)
vectorizer = TfidfVectorizer(max_features=2000)
X = vectorizer.fit_transform(df['cleaned_text']).toarray()
y = df['label'].values

# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ==============================
# A. Machine Learning Model (Logistic Regression)
# ==============================
ml_model = LogisticRegression(max_iter=200)
ml_model.fit(X_train, y_train)

print("\nML Accuracy:", accuracy_score(y_test, ml_model.predict(X_test)))

# ==============================
# B. ANN Model (Neural Network)
# ==============================
ann_model = Sequential([
    Dense(64, activation='relu', input_dim=X_train.shape[1]),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

ann_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
ann_model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1)

loss, acc = ann_model.evaluate(X_test, y_test, verbose=0)
print("ANN Accuracy:", acc)

# ==============================
# C. Prediction Function
# ==============================
def predict_email(text):
    cleaned = clean_text(text)
    vec = vectorizer.transform([cleaned]).toarray()
    ml_pred = ml_model.predict(vec)[0]
    ann_pred = ann_model.predict(vec)[0][0]

    print("\nEmail:", text)
    print("ML →", "Spam" if ml_pred == 1 else "Ham")
    print("ANN →", "Spam" if ann_pred > 0.5 else "Ham")

# Try it
predict_email("Congratulations! You won a FREE iPhone. Click now!")
predict_email("Hey, I have an urgent meeting with you tommorow at sharp 8 pm")
predict_email("Everyone's talking about this except you")
predict_email("Event Reminder – Line Following Robot Workshop & Competition")




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



ML Accuracy: 0.9755671902268761
Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8142 - loss: 0.4400
Epoch 2/5
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9952 - loss: 0.0376
Epoch 3/5
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9996 - loss: 0.0076
Epoch 4/5
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9996 - loss: 0.0035
Epoch 5/5
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9997 - loss: 0.0021
ANN Accuracy: 0.988656222820282
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step

Email: Congratulations! You won a FREE iPhone. Click now!
ML → Spam
ANN → Spam
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step

Email: Hey, I have an urgent meeting with you tommorow at sharp 8 pm
ML → Ham
ANN → Ham
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/ste