In [56]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

In [57]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [58]:
# Load dataset
file_path = "./email_dataset.csv"  # Adjust this path
data = pd.read_csv(file_path)

In [60]:
# Select relevant columns (Authentication headers + phishing label)
columns = ['spf', 'dkim', 'dmarc', 'phishing']
df = data[columns].dropna()


In [61]:
# Convert categorical authentication results to numerical values
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(df[['spf', 'dkim', 'dmarc']])
y = df['phishing'].values


In [62]:
# 1️⃣ Chi-Square Test
print("\n--- Chi-Square Test ---")
chi2_results = []
for col in ['spf', 'dkim', 'dmarc']:
    contingency_table = pd.crosstab(df[col], df['phishing'])
    chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
    chi2_results.append((col, chi2, p))
    print(f"{col}: Chi2 = {chi2:.3f}, p-value = {p}")


--- Chi-Square Test ---
spf: Chi2 = 77.569, p-value = 2.704644311221641e-15
dkim: Chi2 = 147.436, p-value = 4.690405362425339e-30
dmarc: Chi2 = 162.223, p-value = 2.002143232183971e-32


In [49]:
from tabulate import tabulate
def performChiSquareTest(data, feature, target='phishing'):
    contingency_table = pd.crosstab(data[feature], data[target])
    # print(tabulate(contingency_table,headers='keys', tablefmt='psql'))
    chi, p_value, degree_of_freedom, expected_freq = stats.chi2_contingency(contingency_table)
    
    return {
        'feature': feature,
        'chi_statistic': chi,
        'p_value': p_value,
        'degree_of_freedom': degree_of_freedom,
        'expected_frequencies': expected_freq,
        'contingency_table': contingency_table
    }

In [50]:
auth_features = ['spf', 'dkim', 'dmarc']

chi_square_results = {feature: performChiSquareTest(df, feature) for feature in auth_features}

chi_square_results_summary = {
    feature: {
        'chi_statistic': result['chi_statistic'],
        'p_value': result['p_value'],
        'degree_of_freedom': result['degree_of_freedom']
    }
    for feature, result in chi_square_results.items()
}
print(chi_square_results_summary)

{'spf': {'chi_statistic': np.float64(77.5691541821256), 'p_value': np.float64(2.704644311221641e-15), 'degree_of_freedom': 5}, 'dkim': {'chi_statistic': np.float64(147.43599862306604), 'p_value': np.float64(4.690405362425339e-30), 'degree_of_freedom': 5}, 'dmarc': {'chi_statistic': np.float64(162.22333216580768), 'p_value': np.float64(2.002143232183971e-32), 'degree_of_freedom': 6}}


In [63]:
# 2️⃣ Baseline Models (Traditional ML)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=42)

In [64]:
# Logistic Regression
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)
print("\n--- Logistic Regression ---")
print(classification_report(y_test, y_pred_log))


--- Logistic Regression ---
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        30
           1       1.00      0.81      0.89        63

    accuracy                           0.87        93
   macro avg       0.86      0.90      0.86        93
weighted avg       0.91      0.87      0.87        93



In [66]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
# feature_importances = rf_model.feature_importances_
print("\n--- Random Forest ---")
print(classification_report(y_test, y_pred_rf))
# Create a bar plot for feature importance
# The feature names are the categories from the OneHotEncoder
# Flatten the encoder categories to match the features
# feature_names = [f"{col}_{category}" for col, categories in zip(['spf', 'dkim', 'dmarc'], encoder.categories_) for category in categories]

# # Create a bar plot for feature importance
# plt.figure(figsize=(10, 6))
# plt.barh(feature_names, feature_importances)
# plt.xlabel('Feature Importance')
# plt.title('Feature Importance for Random Forest Model')
# plt.show()


--- Random Forest ---
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        30
           1       1.00      0.81      0.89        63

    accuracy                           0.87        93
   macro avg       0.86      0.90      0.86        93
weighted avg       0.91      0.87      0.87        93



In [67]:
# SVM
svm_model = SVC(probability=True)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print("\n--- SVM ---")
print(classification_report(y_test, y_pred_svm))


--- SVM ---
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        30
           1       1.00      0.81      0.89        63

    accuracy                           0.87        93
   macro avg       0.86      0.90      0.86        93
weighted avg       0.91      0.87      0.87        93



In [68]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

# Scale features for deep learning model
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Improved Deep Learning Model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train model
history = model.fit(X_train_scaled, y_train, epochs=50, batch_size=16, validation_data=(X_test_scaled, y_test), verbose=1)

# Evaluate model
y_pred_nn = (model.predict(X_test_scaled) > 0.5).astype("int32")
print("Deep Learning Model Accuracy:", accuracy_score(y_test, y_pred_nn))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.5771 - loss: 0.7530 - val_accuracy: 0.8387 - val_loss: 0.5437
Epoch 2/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7240 - loss: 0.5700 - val_accuracy: 0.8387 - val_loss: 0.4972
Epoch 3/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8587 - loss: 0.4194 - val_accuracy: 0.8602 - val_loss: 0.4542
Epoch 4/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8551 - loss: 0.3954 - val_accuracy: 0.8602 - val_loss: 0.4310
Epoch 5/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8560 - loss: 0.3609 - val_accuracy: 0.8602 - val_loss: 0.4065
Epoch 6/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8624 - loss: 0.3359 - val_accuracy: 0.8602 - val_loss: 0.3846
Epoch 7/50
[1m14/14[0m [32m━━━━━━━━━

In [40]:
# One-hot encode categorical headers
encoder = OneHotEncoder(sparse_output=False)
encoded_headers = encoder.fit_transform(df[['spf', 'dkim', 'dmarc']])
X = encoded_headers
y = df['phishing'].values
# Reshape data for LSTM input
X = X.reshape((X.shape[0], X.shape[1], 1))  # Adding a time-step dimension

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [42]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from scipy.stats import chi2_contingency
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Embedding, Flatten
from tensorflow.keras.optimizers import Adam
# Reshape data for LSTM input
X = X.reshape((X.shape[0], X.shape[1], 1))  # Adding a time-step dimension

# Split dataset
# LSTM Model
lstm_model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    BatchNormalization(),
    Dropout(0.3),
    LSTM(32, return_sequences=False),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train model
history = lstm_model.fit(X_train, y_train, epochs=50, batch_size=16, validation_data=(X_test, y_test), verbose=1)

# Evaluate model
y_pred_lstm = (lstm_model.predict(X_test) > 0.5).astype("int32")
print("LSTM Model Accuracy:", accuracy_score(y_test, y_pred_lstm))

Epoch 1/50


  super().__init__(**kwargs)


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.6171 - loss: 0.6650 - val_accuracy: 0.6774 - val_loss: 0.6648
Epoch 2/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6432 - loss: 0.5905 - val_accuracy: 0.6774 - val_loss: 0.6735
Epoch 3/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.7104 - loss: 0.5244 - val_accuracy: 0.6774 - val_loss: 0.6594
Epoch 4/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7778 - loss: 0.5282 - val_accuracy: 0.6774 - val_loss: 0.6286
Epoch 5/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7513 - loss: 0.4604 - val_accuracy: 0.6989 - val_loss: 0.6432
Epoch 6/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.8088 - loss: 0.4869 - val_accuracy: 0.6774 - val_loss: 0.5684
Epoch 7/50
[1m14/14[0m [32m━━━━━━━━━━━━━━━