In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [None]:
# Load dataset
file_path = "./email_dataset.csv" 
data = pd.read_csv(file_path)

In [3]:
# Select relevant columns (Authentication headers + phishing label)
columns = ['spf', 'dkim', 'dmarc', 'phishing']
df = data[columns].dropna()


In [4]:
# Convert categorical authentication results to numerical values
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(df[['spf', 'dkim', 'dmarc']])
y = df['phishing'].values


In [None]:
#Split the dataset to train and test data
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

In [6]:
# Logistic Regression
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)
print("\n--- Logistic Regression ---")
print(classification_report(y_test, y_pred_log))


--- Logistic Regression ---
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        30
           1       1.00      0.81      0.89        63

    accuracy                           0.87        93
   macro avg       0.86      0.90      0.86        93
weighted avg       0.91      0.87      0.87        93



In [10]:
# Random Forest
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)
print("\n--- Random Forest ---")
print(classification_report(y_test, y_pred_rf))


--- Random Forest ---
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        30
           1       1.00      0.81      0.89        63

    accuracy                           0.87        93
   macro avg       0.86      0.90      0.86        93
weighted avg       0.91      0.87      0.87        93



In [11]:
# SVM
svm_model = SVC(probability=True)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print("\n--- SVM ---")
print(classification_report(y_test, y_pred_svm))


--- SVM ---
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        30
           1       1.00      0.81      0.89        63

    accuracy                           0.87        93
   macro avg       0.86      0.90      0.86        93
weighted avg       0.91      0.87      0.87        93



In [12]:
#Feed forward neural network implementation

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import  accuracy_score

# Scale features for deep learning model
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train model
history = model.fit(X_train_scaled, y_train, epochs=50, batch_size=16, validation_data=(X_test_scaled, y_test), verbose=1)

# Evaluate model
y_pred_nn = (model.predict(X_test_scaled) > 0.5).astype("int32")
print("Deep Learning Model Accuracy:", accuracy_score(y_test, y_pred_nn))

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7350 - loss: 0.5811 - val_accuracy: 0.8387 - val_loss: 0.5071
Epoch 2/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8199 - loss: 0.4143 - val_accuracy: 0.8387 - val_loss: 0.4635
Epoch 3/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8401 - loss: 0.3926 - val_accuracy: 0.8387 - val_loss: 0.4369
Epoch 4/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8213 - loss: 0.3855 - val_accuracy: 0.8387 - val_loss: 0.4216
Epoch 5/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8767 - loss: 0.3129 - val_accuracy: 0.8387 - val_loss: 0.4065
Epoch 6/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8614 - loss: 0.3092 - val_accuracy: 0.8710 - val_loss: 0.3867
Epoch 7/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━