In [1]:
# phishing_detection.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Step 1: Load the dataset
data = pd.read_csv("mail_data.csv")

# Step 2: Clean and prepare
data = data.dropna()
data['Category'] = data['Category'].map({'ham': 0, 'spam': 1, 'phishing': 1})  # map spam/phishing as 1
X = data['Message']
y = data['Category']

# Step 3: Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Convert text to numbers using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 5: Train model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Step 6: Evaluate model
y_pred = model.predict(X_test_tfidf)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Step 7: Save model and vectorizer
joblib.dump(model, "phishing_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
print("\nModel and vectorizer saved successfully!")


✅ Accuracy: 0.9730941704035875

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.80      0.89       149

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115


Model and vectorizer saved successfully!


In [3]:
python phishing_detection.ipynb

SyntaxError: invalid syntax (2418844097.py, line 1)

In [4]:
import joblib

model = joblib.load("phishing_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

def predict_email(text):
    text_tfidf = vectorizer.transform([text])
    prediction = model.predict(text_tfidf)[0]
    if prediction == 1:
        print("🚨 This email looks like PHISHING / SPAM")
    else:
        print("✅ This email seems LEGITIMATE")

# Example test
email = "Your bank account has been blocked. Click here to verify your identity."
predict_email(email)


✅ This email seems LEGITIMATE


In [5]:
# ===============================================
# 📧 PHISHING EMAIL DETECTION USING MACHINE LEARNING
# ===============================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib
import re
import string

# STEP 1: Load Dataset
data = pd.read_csv("mail_data.csv")
print("✅ Data Loaded — Shape:", data.shape)

# Clean up column names
data.columns = [c.strip() for c in data.columns]
if "Category" not in data.columns:
    for c in data.columns:
        if "label" in c.lower() or "spam" in c.lower():
            data.rename(columns={c: "Category"}, inplace=True)
if "Message" not in data.columns:
    for c in data.columns:
        if "message" in c.lower() or "text" in c.lower():
            data.rename(columns={c: "Message"}, inplace=True)

# STEP 2: Drop null rows
data = data.dropna(subset=["Message", "Category"])

# STEP 3: Clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

data["clean_text"] = data["Message"].apply(clean_text)

# STEP 4: Label encode
data["Category"] = data["Category"].str.lower().map(
    {"spam": 1, "phishing": 1, "ham": 0, "legit": 0, "normal": 0}
).fillna(0).astype(int)

# STEP 5: Split data
X_train, X_test, y_train, y_test = train_test_split(
    data["clean_text"], data["Category"], test_size=0.2, random_state=42, stratify=data["Category"]
)

# STEP 6: TF-IDF
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# STEP 7: Train model
model = LogisticRegression(max_iter=2000)
model.fit(X_train_tfidf, y_train)

# STEP 8: Evaluate
y_pred = model.predict(X_test_tfidf)
print("\n✅ Accuracy:", round(accuracy_score(y_test, y_pred)*100, 2), "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# STEP 9: Save model
joblib.dump(model, "phishing_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
print("\n💾 Model & Vectorizer saved successfully!")

# STEP 10: Test with new email
def predict_email(text):
    clean = clean_text(text)
    vectorized = vectorizer.transform([clean])
    pred = model.predict(vectorized)[0]
    return "🚨 PHISHING/SPAM" if pred == 1 else "✅ LEGITIMATE"

sample = "Your bank account has been locked. Click the link below to verify your credentials."
print("\nSample Test:", predict_email(sample))


✅ Data Loaded — Shape: (5572, 2)

✅ Accuracy: 96.23 %

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.72      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115


💾 Model & Vectorizer saved successfully!

Sample Test: ✅ LEGITIMATE
