In [1]:
from google.colab import drive
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, roc_auc_score
import itertools
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    precision_score, recall_score, f1_score, accuracy_score,
    confusion_matrix, roc_auc_score
)

In [2]:
# Load the email feature table from CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer="emails_features.csv")

In [7]:
# Preview the first five rows of the feature table
df.head(n=5)

Unnamed: 0,Date,Label,HasPhishyKeywords,NumRecipients,GlobalURLRank,LocalURLFreq,RecipientLikelihood
0,2000-01-04 00:11:00,0,0,1,10000000,0,0.0
1,2000-01-04 04:11:00,0,0,1,10000000,0,0.0
2,2000-01-04 05:25:00,0,0,1,2802,0,0.0
3,2000-01-05 02:45:00,0,0,1,21039,0,0.0
4,2000-01-05 02:54:00,0,0,1,10000000,0,0.0


In [10]:
# Show five random rows; the fixed seed keeps the displayed sample
# identical across kernel restarts (same seed value the models use).
df.sample(5, random_state=42)

Unnamed: 0,Date,Label,HasPhishyKeywords,NumRecipients,GlobalURLRank,LocalURLFreq,RecipientLikelihood
7515,2001-08-21 12:15:31,0,0,2,10000000,0,0.0
554,2000-06-07 10:46:00,0,0,1,10000000,5,0.0
11572,2001-12-03 15:30:32,0,0,1,479040,29,1.0
8855,2001-10-17 14:57:33,0,0,3,10000000,15,0.0
3476,2001-02-04 09:22:00,0,0,1,10000000,4,0.0


In [11]:
# Number of non-null entries in every column
df.notna().sum()

Unnamed: 0,0
Date,13184
Label,13184
HasPhishyKeywords,13184
NumRecipients,13184
GlobalURLRank,13184
LocalURLFreq,13184
RecipientLikelihood,13184


In [12]:
# Parse the timestamp column so the range filters below compare datetimes
df["Date"] = pd.to_datetime(df["Date"])

# Chronological split on half-open date ranges:
#   train: [2000-02-01, 2002-01-01)    test: [2002-01-01, 2002-07-13)
train_df = df.loc[df["Date"].ge("2000-02-01") & df["Date"].lt("2002-01-01")]
test_df = df.loc[df["Date"].ge("2002-01-01") & df["Date"].lt("2002-07-13")]

# Check class ratios
def legit_to_phishing_ratio(name, subset):
    """Print how many legitimate emails there are per phishing email.

    Parameters
    ----------
    name : str
        Label for the subset (e.g. "Train"); included in the printed line so
        the output of several calls can be told apart.
    subset : pandas.DataFrame
        Frame with a "Label" column where 0 marks legit and 1 marks phishing.

    Returns
    -------
    None
        The ratio is only printed.
    """
    num_legit = (subset["Label"] == 0).sum()
    num_phishing = (subset["Label"] == 1).sum()

    # Without any phishing rows the ratio is undefined; report that instead of
    # dividing by zero and printing "inf"/"nan".
    if num_phishing == 0:
        print(f"{name} legit-to-phishing ratio: undefined (no phishing samples)")
        return

    # The value is legit / phishing, so it is reported as "<ratio>:1"
    ratio = num_legit / num_phishing
    print(f"{name} legit-to-phishing ratio: {ratio:.0f}:1")

# Report the class balance of each split, training first
for split_name, split_df in (("Train", train_df), ("Test", test_df)):
    legit_to_phishing_ratio(split_name, split_df)

Legit to Phishing ratio: 1:137
Legit to Phishing ratio: 1:27


In [13]:
# For training set
print("Training Set:")
print(train_df["Label"].value_counts())
print()

# For testing set
print("Testing Set:")
print(test_df["Label"].value_counts())

Training Set:
Label
0    11817
1       86
Name: count, dtype: int64

Testing Set:
Label
0    1150
1      43
Name: count, dtype: int64


In [25]:
# Feature columns used as model inputs
features = ["HasPhishyKeywords", "NumRecipients", "GlobalURLRank", "LocalURLFreq", "RecipientLikelihood"]

# Probability cutoff at or above which an email is flagged as phishing (label 1).
# NOTE(review): 0.83 appears hand-picked; if it was tuned on test_df the metrics
# below are optimistic -- confirm it was selected on training/validation data.
threshold = 0.83

# Training data
X_train = train_df[features]
y_train = train_df["Label"]

# Test data
X_test = test_df[features]
y_test = test_df["Label"]

# Final training; class_weight="balanced" is used because phishing rows are rare
final_model = RandomForestClassifier(
    n_estimators=30,
    max_depth=9,
    min_samples_leaf=4,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)
final_model.fit(X_train, y_train)

# Predict on test set: take the positive-class probability, then apply the cutoff
y_proba = final_model.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= threshold).astype(int)

# Metrics (zero_division=0 everywhere so a degenerate prediction set yields 0
# consistently instead of a warning on some metrics only)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

# FP per million legitimate emails.
# labels=[0, 1] pins the matrix to 2x2 so the four-way unpack cannot fail when
# only one class shows up in y_test and y_pred.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
fp_per_million = (fp / (y_test == 0).sum()) * 1_000_000

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")
print(f"False Positives per Million: {fp_per_million:.2f}")
print(f"TP: {tp}, FP: {fp}")


Precision: 1.0000
Recall: 0.8140
F1 Score: 0.8974
Accuracy: 0.9933
AUC: 0.9781
False Positives per Million: 0.00
TP: 35, FP: 0


In [26]:
# Pair every feature name with the importance reported by the fitted forest
feature_names = features
importances = final_model.feature_importances_

# Tabulate the pairs, most important feature first
importance_df = pd.DataFrame(
    dict(Feature=feature_names, Importance=importances)
).sort_values("Importance", ascending=False)

print(importance_df)

               Feature  Importance
1        NumRecipients    0.395486
3         LocalURLFreq    0.266558
4  RecipientLikelihood    0.252717
2        GlobalURLRank    0.049962
0    HasPhishyKeywords    0.035277


In [28]:
# Feature columns used as model inputs
features = ["HasPhishyKeywords", "NumRecipients", "GlobalURLRank", "LocalURLFreq", "RecipientLikelihood"]

# Probability cutoff at or above which an email is flagged as phishing (label 1)
threshold = 0.5

# Training data
X_train = train_df[features]
y_train = train_df["Label"]

# Test data
X_test = test_df[features]
y_test = test_df["Label"]

# Final training; class_weight="balanced" is used because phishing rows are rare
final_model = RandomForestClassifier(
    n_estimators=30,
    max_depth=9,
    min_samples_leaf=4,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)
final_model.fit(X_train, y_train)

# Predict on test set: take the positive-class probability, then apply the cutoff
y_proba = final_model.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= threshold).astype(int)

# Metrics (zero_division=0 everywhere so a degenerate prediction set yields 0
# consistently instead of a warning on some metrics only)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

# FP per million legitimate emails.
# labels=[0, 1] pins the matrix to 2x2 so the four-way unpack cannot fail when
# only one class shows up in y_test and y_pred.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
fp_per_million = (fp / (y_test == 0).sum()) * 1_000_000

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")
print(f"False Positives per Million: {fp_per_million:.2f}")
print(f"TP: {tp}, FP: {fp}")


Precision: 0.5902
Recall: 0.8372
F1 Score: 0.6923
Accuracy: 0.9732
AUC: 0.9781
False Positives per Million: 21739.13
TP: 36, FP: 25


In [27]:
# Reduced feature set: only these three columns are fed to the model
features = ["NumRecipients", "LocalURLFreq", "RecipientLikelihood"]

# Probability cutoff at or above which an email is flagged as phishing (label 1).
# NOTE(review): 0.83 appears hand-picked; if it was tuned on test_df the metrics
# below are optimistic -- confirm it was selected on training/validation data.
threshold = 0.83

# Training data
X_train = train_df[features]
y_train = train_df["Label"]

# Test data
X_test = test_df[features]
y_test = test_df["Label"]

# Final training; class_weight="balanced" is used because phishing rows are rare
final_model = RandomForestClassifier(
    n_estimators=30,
    max_depth=9,
    min_samples_leaf=4,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)
final_model.fit(X_train, y_train)

# Predict on test set: take the positive-class probability, then apply the cutoff
y_proba = final_model.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= threshold).astype(int)

# Metrics (zero_division=0 everywhere so a degenerate prediction set yields 0
# consistently instead of a warning on some metrics only)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

# FP per million legitimate emails.
# labels=[0, 1] pins the matrix to 2x2 so the four-way unpack cannot fail when
# only one class shows up in y_test and y_pred.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
fp_per_million = (fp / (y_test == 0).sum()) * 1_000_000

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")
print(f"False Positives per Million: {fp_per_million:.2f}")
print(f"TP: {tp}, FP: {fp}")


Precision: 1.0000
Recall: 0.8140
F1 Score: 0.8974
Accuracy: 0.9933
AUC: 0.9113
False Positives per Million: 0.00
TP: 35, FP: 0
