In [4]:
# app.py — Phishing Email Detection API

import re
import string

import joblib
from fastapi import FastAPI, Form

# Deserialize the trained classifier and its TF-IDF vectorizer (in that order).
# Both paths are relative, so they resolve against the current working directory.
# NOTE(review): joblib.load unpickles arbitrary objects; only load artifact
# files that you produced yourself.
model, vectorizer = (
    joblib.load(artifact_file)
    for artifact_file in ("phishing_email_model.pkl", "tfidf_vectorizer.pkl")
)

# Application object that the route decorators below register handlers on.
app = FastAPI(title="Phishing Email Detection System")

@app.get("/")
def home():
    """Root route: return a static welcome message as a JSON object."""
    return dict(message="Welcome to the Phishing Email Detection API")

def _clean_text(text):
    """Normalize raw email text with the same steps the training cells apply.

    Lowercases, removes URLs, ``@mentions`` and ``#`` signs, deletes ASCII
    punctuation and digit runs, then collapses whitespace and trims.
    """
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\@\w+|\#', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    return re.sub(r'\s+', ' ', text).strip()


@app.post("/predict/")
def predict_email(content: str = Form(...)):
    """Classify one email as phishing/spam or legitimate.

    Args:
        content: Raw email text (subject + body), sent as a required form field.

    Returns:
        ``{"prediction": label}`` where ``label`` is ``"Phishing / Spam Email"``
        when the model predicts class 1 and ``"Legitimate Email"`` otherwise.
    """
    # The training cells fit the vectorizer on cleaned text, so the request
    # text must go through the same cleaning; otherwise the features seen at
    # serving time differ from the ones the model was trained on.
    processed = vectorizer.transform([_clean_text(content)])
    prediction = model.predict(processed)[0]
    label = "Phishing / Spam Email" if prediction == 1 else "Legitimate Email"
    return {"prediction": label}


In [5]:
# 📘 Phishing / Spam Mail Detection using Machine Learning

import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# -----------------------
# 1️⃣ Load the dataset
# -----------------------
data = pd.read_csv('mail_data.csv')

# Quick look at what was loaded: dimensions, column names, first rows.
print("Dataset shape:", data.shape)
print("Columns:", list(data.columns))
print(data.head())

# Discard every column whose name starts with "Unnamed" (e.g. a saved index).
data = data.drop(columns=data.columns[data.columns.str.contains('^Unnamed')])

# -----------------------
# 2️⃣ Clean and preprocess
# -----------------------

# Rename columns if needed
# Accept the lowercase / alternative column names, but never overwrite a
# column that already carries the standard name.
if 'Message' not in data.columns and 'message' in data.columns:
    data = data.rename(columns={'message': 'Message'})
if 'Category' not in data.columns and 'label' in data.columns:
    data = data.rename(columns={'label': 'Category'})

# Remove missing values
data = data.dropna(subset=['Message', 'Category']).copy()

# Convert labels to binary (1 = spam/phishing, 0 = legitimate).
# Labels are matched case-insensitively with surrounding whitespace ignored,
# so "Spam" or " ham " are no longer silently treated as unknown.
label_map = {'spam': 1, 'phishing': 1, 'ham': 0, 'legit': 0, 'normal': 0}
normalized_labels = data['Category'].astype(str).str.strip().str.lower()
encoded_labels = normalized_labels.map(label_map)

# Unrecognized labels still fall back to 0, as before, but are now reported
# so mislabeled rows do not slip into training unnoticed.
unknown_rows = encoded_labels.isna()
if unknown_rows.any():
    print(f"⚠️ {int(unknown_rows.sum())} row(s) with unrecognized labels "
          f"{normalized_labels[unknown_rows].unique().tolist()} were encoded as 0")
data['Category'] = encoded_labels.fillna(0).astype(int)

# Function to clean text
def clean_text(text):
    """Normalize a raw message into lowercase, letters-only text.

    Steps, in order: lowercase; drop URLs (tokens starting with ``http`` or
    ``www``); drop ``@mentions`` and ``#`` signs; delete ASCII punctuation;
    delete digit runs; collapse whitespace runs to one space and trim.

    Args:
        text: Raw message. ``None`` is treated as an empty message; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    if text is None:
        return ''
    text = str(text).lower()
    # `http\S+` already matches https links, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'\@\w+|\#', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    # Removing tokens leaves double spaces and newlines behind; squeeze them so
    # the output matches the later clean_text definition in this notebook.
    return re.sub(r'\s+', ' ', text).strip()

# Run every message through clean_text and keep the result in its own column.
data['clean_text'] = data['Message'].map(clean_text)

# -----------------------
# 3️⃣ Split dataset
# -----------------------
# Hold out 20% for testing. Stratifying on the label keeps the class ratio the
# same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['clean_text'],
    data['Category'],
    stratify=data['Category'],
    random_state=42,
    test_size=0.2,
)

# -----------------------
# 4️⃣ TF-IDF + Model
# -----------------------
# Unigram + bigram TF-IDF features, capped at 5000 terms, English stop words
# removed. The vocabulary is fitted on the training split only and then
# reused unchanged for the test split.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# The saga solver shuffles the data while optimizing, so its seed is pinned
# to make the fitted coefficients (and the metrics below) reproducible.
model = LogisticRegression(
    max_iter=2000,
    class_weight='balanced',
    solver='saga',
    random_state=42,
)
model.fit(X_train_tfidf, y_train)

# -----------------------
# 5️⃣ Evaluation
# -----------------------
# Predict on the held-out split, then report per-class metrics, overall
# accuracy and the raw confusion counts.
y_pred = model.predict(X_test_tfidf)

print(
    "\n📊 Classification Report:",
    classification_report(y_test, y_pred, digits=4),
    sep="\n",
)

print("\n✅ Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

# -----------------------
# 6️⃣ Save model & vectorizer
# -----------------------
import joblib

# Persist the classifier first, then the vectorizer; the API cell loads both
# files by these exact names.
for fitted_object, target_file in (
    (model, 'phishing_email_model.pkl'),
    (vectorizer, 'tfidf_vectorizer.pkl'),
):
    joblib.dump(fitted_object, target_file)

print("\n✅ Model and Vectorizer saved successfully!")


Dataset shape: (5572, 2)
Columns: ['Category', 'Message']
  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...

📊 Classification Report:
              precision    recall  f1-score   support

           0     0.9865    0.9824    0.9844       966
           1     0.8889    0.9128    0.9007       149

    accuracy                         0.9731      1115
   macro avg     0.9377    0.9476    0.9426      1115
weighted avg     0.9734    0.9731    0.9732      1115


✅ Accuracy: 0.9730941704035875

Confusion Matrix:
 [[949  17]
 [ 13 136]]

✅ Model and Vectorizer saved successfully!


In [2]:
pip install python-multipart

Collecting python-multipart
  Downloading python_multipart-0.0.20-py3-none-any.whl (24 kB)
Installing collected packages: python-multipart
Successfully installed python-multipart-0.0.20
Note: you may need to restart the kernel to use updated packages.


In [7]:
# ==============================================
# 📧 Phishing / Spam Email Detection System (ML)
# ==============================================

# Import libraries
import pandas as pd
import numpy as np
import re
import string
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# --------------------------
# 1️⃣ Load Dataset
# --------------------------
file_path = 'mail_data.csv'   # Adjust path if needed
data = pd.read_csv(file_path)

# Confirm the load and show dimensions, column names and the first rows.
print("✅ Dataset loaded successfully!")
print("Shape:", data.shape)
print("Columns:", list(data.columns))
print(data.head())

# Discard every column whose name starts with "Unnamed" (e.g. a saved index).
data = data.drop(columns=data.columns[data.columns.str.contains('^Unnamed')])

# Rename common columns to standard ones
# Build the rename map up front. Each standard name is claimed by the FIRST
# matching column only: renaming several columns to the same name would
# create duplicate columns, and data['Message'] would then return a
# DataFrame instead of a Series.
rename_map = {}
for col in data.columns:
    lowered_name = str(col).lower()
    looks_like_text = 'message' in lowered_name or 'text' in lowered_name
    looks_like_label = (
        'label' in lowered_name
        or 'category' in lowered_name
        or 'spam' in lowered_name
    )
    if looks_like_text and 'Message' not in rename_map.values():
        rename_map[col] = 'Message'
    elif looks_like_label and 'Category' not in rename_map.values():
        rename_map[col] = 'Category'
data = data.rename(columns=rename_map)

# Drop rows with a missing message or label. Reassigning (instead of
# inplace=True) keeps the step free of in-place mutation.
data = data.dropna(subset=['Message', 'Category']).copy()

# --------------------------
# 2️⃣ Preprocessing & Cleaning
# --------------------------

def clean_text(text):
    """Return a lowercase, letters-only, single-spaced version of ``text``.

    Order of operations: lowercase, strip URLs, strip ``@mentions`` and ``#``
    signs, delete ASCII punctuation, delete digit runs, then collapse every
    whitespace run to one space and trim both ends.
    """
    cleaned = text.lower()
    # URL removal first, then mentions / hash signs.
    for pattern in (r'http\S+|www\S+|https\S+', r'\@\w+|\#'):
        cleaned = re.sub(pattern, '', cleaned)
    # Mapping a code point to None deletes it during translate().
    cleaned = cleaned.translate(dict.fromkeys(map(ord, string.punctuation)))
    cleaned = re.sub(r'\d+', '', cleaned)
    return re.sub(r'\s+', ' ', cleaned).strip()

data['clean_text'] = data['Message'].apply(clean_text)

# --------------------------
# 3️⃣ Label Encoding
# --------------------------
# Map labels to binary: spam/phishing → 1, ham/legit/normal → 0.
# Series.map turns every label missing from the dictionary into NaN, which is
# what makes the fillna(0) fallback below effective. Series.replace left
# unexpected labels as strings, so astype(int) raised instead.
label_map = {
    'spam': 1,
    'phishing': 1,
    'ham': 0,
    'legit': 0,
    'normal': 0
}
lowered_labels = data['Category'].str.lower()
encoded_labels = lowered_labels.map(label_map)

# Unknown labels are still encoded as 0, but the fallback is reported so that
# mislabeled rows do not enter training unnoticed.
unknown_rows = encoded_labels.isna()
if unknown_rows.any():
    print(f"\n⚠️ {int(unknown_rows.sum())} row(s) with unrecognized labels "
          f"{lowered_labels[unknown_rows].dropna().unique().tolist()} were encoded as 0")
data['Category'] = encoded_labels.fillna(0).astype(int)

print("\n✅ Unique label values:", data['Category'].unique())

# --------------------------
# 4️⃣ Train-Test Split
# --------------------------
# Features are the cleaned messages; the target is the binary label.
X, y = data['clean_text'], data['Category']

# 80/20 split, stratified on the label, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

print("\n✅ Training samples:", X_train.shape[0])
print("✅ Testing samples:", X_test.shape[0])

# --------------------------
# 5️⃣ TF-IDF Vectorization
# --------------------------
# Unigram + bigram TF-IDF features, capped at 5000 terms, English stop words
# removed. The vocabulary is fitted on the training split only and then
# reused unchanged for the test split.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 2)
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# --------------------------
# 6️⃣ Train Model
# --------------------------
# The saga solver shuffles the data while optimizing, so its seed is pinned
# to make the fitted coefficients (and the reported metrics) reproducible.
model = LogisticRegression(
    solver='saga',
    max_iter=2000,
    class_weight='balanced',
    random_state=42
)
model.fit(X_train_tfidf, y_train)

# --------------------------
# 7️⃣ Evaluate Model
# --------------------------
# Predict on the held-out split, then report per-class metrics, accuracy as a
# percentage rounded to two decimals, and the raw confusion counts.
y_pred = model.predict(X_test_tfidf)

print(
    "\n📊 Classification Report:",
    classification_report(y_test, y_pred, digits=4),
    sep="\n",
)
print("\n✅ Accuracy:", round(accuracy_score(y_true=y_test, y_pred=y_pred) * 100, 2), "%")
print("\nConfusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

# --------------------------
# 8️⃣ Save Model & Vectorizer
# --------------------------
# Persist the classifier first, then the vectorizer, under the file names the
# API cell loads.
for fitted_object, target_file in (
    (model, 'phishing_email_model.pkl'),
    (vectorizer, 'tfidf_vectorizer.pkl'),
):
    joblib.dump(fitted_object, target_file)

print(
    "\n💾 Model and Vectorizer saved successfully!",
    "➡️ Files created: phishing_email_model.pkl, tfidf_vectorizer.pkl",
    sep="\n",
)

# --------------------------
# 9️⃣ Predict on New Example
# --------------------------
def predict_email(text):
    """Classify a raw email string with the fitted vectorizer and model.

    The text is cleaned with ``clean_text`` and vectorized before prediction.
    Returns the phishing/spam message when the predicted class is 1 and the
    legitimate message otherwise.
    """
    features = vectorizer.transform([clean_text(text)])
    if model.predict(features)[0] == 1:
        return "🚨 Phishing / Spam Email"
    return "✅ Legitimate Email"

# Example test
# Smoke test: run one hand-written message through predict_email.
# The text is spelled out line by line so the trailing space and the newline
# of the original message stay visible.
sample_email = (
    "Dear user, your account has been locked. \n"
    "Click the link below to verify your identity and regain access."
)
print("\n🔍 Sample Prediction:", predict_email(sample_email))


✅ Dataset loaded successfully!
Shape: (5572, 2)
Columns: ['Category', 'Message']
  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...

✅ Unique label values: [0 1]

✅ Training samples: 4457
✅ Testing samples: 1115

📊 Classification Report:
              precision    recall  f1-score   support

           0     0.9865    0.9824    0.9844       966
           1     0.8889    0.9128    0.9007       149

    accuracy                         0.9731      1115
   macro avg     0.9377    0.9476    0.9426      1115
weighted avg     0.9734    0.9731    0.9732      1115


✅ Accuracy: 97.31 %

Confusion Matrix:
 [[949  17]
 [ 13 136]]

💾 Model and Vectorizer saved successfully!
➡️ Fil

In [10]:
python -m venv phishing_env

SyntaxError: invalid syntax (2052059117.py, line 1)