In [None]:
# Core data handling.
import pandas as pd
import numpy as np
from pathlib import Path

# scikit-learn: data splitting / hyper-parameter search, the classifier,
# evaluation metrics, and preprocessing helpers.
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score, roc_curve, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Plotting.
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Blanket filter: silences every warning category for the rest of the session,
# including scikit-learn convergence and undefined-metric warnings.
warnings.filterwarnings('ignore')
import os
import sys
import shap
# Last expression of the cell, so the notebook displays the kernel's working
# directory; the relative paths used further down are resolved against it.
os.getcwd()

In [None]:
# Load the raw and the processed view of the large sample.
# Paths are relative to the kernel's working directory.
df_raw_large = pd.read_parquet('../../../../dataset/sampled-dataset/raw/sample-large.parquet')
# Single plain assignment: no second alias of the processed frame is created.
df_pro_large = pd.read_parquet('../../../../dataset/sampled-dataset/processed/sample-large.parquet')

# DataFrame.join aligns on the index (left join by default), so both files are
# expected to share the same row index.
# NOTE(review): join raises if the two frames have overlapping column names —
# confirm the raw and processed schemas are disjoint.
df_large = df_raw_large.join(df_pro_large)

In [None]:
# Make the project's `src` directory importable. The membership check keeps
# sys.path from growing by one duplicate entry every time this cell is re-run.
src_dir = os.path.abspath('../../src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

from extract_extra_features import (
    compute_name_email_similarity,
    from_name_email_sim, from_email_length,
    to_is_hidden, subject_word_count_series,
    subject_contains_malicious_words, subject_contains_benign_words
)

# Derived header/subject features, written back onto df_large as new columns.
# Each assignment overwrites its own column, so re-running the cell is safe.
df_large['name_email_similarity'] = compute_name_email_similarity(df_large['From_name'], df_large['From_email'])
df_large['from_name_email_sim'] = from_name_email_sim(df_large, from_name_col='From_name', from_email_col='From_email')
df_large['from_email_length'] = from_email_length(df_large['From_email'])
df_large['to_is_hidden'] = to_is_hidden(df_large, to_col='To')
df_large['subject_word_count'] = subject_word_count_series(df_large['Subject'])
df_large['subject_contains_malicious_words'] = subject_contains_malicious_words(df_large['Subject'])
df_large['subject_contains_benign_words'] = subject_contains_benign_words(df_large['Subject'])

# Preview the first rows only instead of rendering the whole frame.
df_large.head()


In [None]:
# Candidate model inputs, grouped by the kind of column they are.
# The order inside each list fixes the column order of X below.
numerical_features = [
    'routing_length',
    'word_count',
    'readable_proportion',
    'whitespace_ratio',
    'alphabet_proportion',
    'grammar_error_rate',
    'english_french_proportion',
    'url_count',
    'name_email_similarity',
    'from_name_email_sim',
    'subject_word_count',
    'from_email_length',
]

categorical_features = [
    'dkim_result',
    'spf_result',
    'dmarc_result',
    'html_parsing_error',
]

binary_features = [
    'is_multipart',
    'attachments_present',
    'dmarc_authentication_present',
    'dkim_sender_domains_match',
    'to_from_addresses_match',
    'sender_email_spf_match',
    'non_ascii_present',
    'hidden_text_present',
    'ip_addr_urls',
    'http_urls_present',
    'url_at_symbol',
    'url_port_number',
    'any_long_urls',
    'url_multiple_subdomains',
    'to_is_hidden',
    'subject_contains_malicious_words',
    'subject_contains_benign_words',
]

target_col = 'target_1'

# Full feature matrix: numerical, then categorical, then binary columns.
model_features = [*numerical_features, *categorical_features, *binary_features]
X = df_large.loc[:, model_features].copy()

# Encode the label as 0 (benign) / 1 (malicious); any other label value
# becomes NaN because Series.map leaves unmapped keys missing.
y = df_large[target_col].map({'benign': 0, 'malicious': 1})

In [None]:
# Reduced feature set used by the models in the following cells.
selected_categorical = ['spf_result', 'dkim_result']
selected_basic = [
    'subject_contains_malicious_words',
    'subject_contains_benign_words',
    'from_name_email_sim',
    'whitespace_ratio',
    'is_multipart',
    'routing_length',
    'to_is_hidden',
    'url_at_symbol',
]

# Basic columns first, categorical columns last, glued side by side.
basic_part = df_large[selected_basic]
categorical_part = df_large[selected_categorical]
X_selected = pd.concat([basic_part, categorical_part], axis=1)

# Binary target: benign -> 0, malicious -> 1.
y_selected = df_large[target_col].map({'benign': 0, 'malicious': 1})


In [None]:
# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class ratio the same in both parts; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_selected,
    y_selected,
    test_size=0.2,
    random_state=42,
    stratify=y_selected,
)

In [None]:
# One-hot encode the categorical columns in each split separately, with an
# extra indicator column for missing values.
# NOTE(review): this rebinds X_train / X_val, so the cell is meant to run once
# per split — after encoding, the original categorical columns no longer exist.
dummy_options = dict(columns=selected_categorical, dummy_na=True)
X_train = pd.get_dummies(X_train, **dummy_options)
X_val = pd.get_dummies(X_val, **dummy_options)

# Force the validation frame onto the training columns: categories seen only
# in validation are dropped, categories missing from validation are added as 0.
X_train, X_val = X_train.align(X_val, join='left', axis=1, fill_value=0)

In [None]:
# Standardise the continuous columns. The scaler learns its statistics from the
# training split only and the same transform is then applied to validation.
num_feats = ['from_name_email_sim', 'whitespace_ratio', 'routing_length']
scaler = StandardScaler()
scaler.fit(X_train[num_feats])
X_train[num_feats] = scaler.transform(X_train[num_feats])
X_val[num_feats] = scaler.transform(X_val[num_feats])

In [None]:
# Baseline: logistic regression with default regularisation settings.
lr = LogisticRegression(random_state=42, max_iter=5000).fit(X_train, y_train)

# Hard predictions and the predicted probability of class 1 on validation.
y_pred = lr.predict(X_val)
val_proba = lr.predict_proba(X_val)
y_prob = val_proba[:, 1]

In [None]:
# F1 of the positive class for the baseline predictions.
f1 = f1_score(y_val, y_pred, average='binary')
print("F1-score:", f1)

# Unpack the 2x2 confusion matrix in row-major order.
cm = confusion_matrix(y_val, y_pred)
TN, FP, FN, TP = cm.ravel()

# FPR = FP / (FP + TN); falls back to 0 when there are no actual negatives.
actual_negatives = FP + TN
if actual_negatives > 0:
    FPR = FP / actual_negatives
else:
    FPR = 0
print("False Positive Rate (FPR):", FPR)

In [None]:
# Search space for the grid search: every combination of regularisation
# strength, penalty type, solver, and class weighting.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=['l2', 'l1'],
    solver=['liblinear', 'saga'],
    class_weight=[None, 'balanced'],
)


In [None]:
# Exhaustive search over param_grid, scored by F1 with 5-fold cross-validation
# on the training split, using all available cores.
lr_grid = LogisticRegression(max_iter=5000, random_state=42)
grid_search = GridSearchCV(
    estimator=lr_grid,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning hyper-parameters and keep the refitted model.
print("Best Params:", grid_search.best_params_)
best_lr = grid_search.best_estimator_

# From here on y_pred / y_prob refer to the tuned model, not the baseline.
y_pred = best_lr.predict(X_val)
y_prob = best_lr.predict_proba(X_val)[:, 1]

# The confusion matrix is built once and reused for printing and for the FPR.
cm = confusion_matrix(y_val, y_pred)

print(classification_report(y_val, y_pred))
print('Confusion Matrix:\n', cm)
print('ROC-AUC:', roc_auc_score(y_val, y_prob))
print('F1-score:', f1_score(y_val, y_pred))

TN, FP, FN, TP = cm.ravel()
actual_negatives = FP + TN
fpr = FP / actual_negatives if actual_negatives > 0 else 0
print('False Positive Rate (FPR):', fpr)

In [None]:
# Sweep the decision threshold from 0.01 to 0.99 and record F1 and FPR at each
# step. Uses whatever `y_prob` is currently bound to (the tuned model's
# probabilities when the cells are run in order).
# linspace yields exactly 99 evenly spaced points; arange with a float step can
# gain or lose an element through rounding.
thresholds = np.linspace(0.01, 0.99, 99)
f1s, fprs = [], []

for t in thresholds:
    y_pred_thr = (y_prob >= t).astype(int)
    f1s.append(f1_score(y_val, y_pred_thr))
    # Pin the label set so the matrix is always 2x2 and the 4-way unpack below
    # cannot fail when only one class shows up at an extreme threshold.
    cm = confusion_matrix(y_val, y_pred_thr, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fprs.append(fpr)

# Both curves on one axis so the F1 / FPR trade-off is visible at a glance.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(thresholds, f1s, label='F1-score')
ax.plot(thresholds, fprs, label='FPR')
ax.set_xlabel('Threshold')
ax.set_ylabel('Score')
ax.set_title('F1-score & FPR vs. Threshold')
ax.legend()
plt.show()


In [None]:
# Array views of the sweep results so they can be indexed together.
thresholds_arr = np.array(thresholds)
f1s_arr = np.array(f1s)
fprs_arr = np.array(fprs)

# Position of the sweep point whose FPR lies nearest to 5%.
fpr_gap = np.abs(fprs_arr - 0.05)
idx = fpr_gap.argmin()

print(f"Threshold at closest FPR=0.05: {thresholds_arr[idx]:.2f}")
print(f"Actual FPR: {fprs_arr[idx]:.4f}")
print(f"F1-score: {f1s_arr[idx]:.4f}")

In [None]:
# Array views of the sweep results so they can be indexed together.
fprs_arr = np.array(fprs)
f1s_arr = np.array(f1s)
thresholds_arr = np.array(thresholds)

# Position of the sweep point whose F1-score lies nearest to 0.80.
idx = np.argmin(np.abs(f1s_arr - 0.8))

# The headline names the criterion actually used for the lookup (F1),
# not the FPR target.
print(f"Threshold at closest F1=0.80: {thresholds_arr[idx]:.2f}")
print(f"Actual FPR: {fprs_arr[idx]:.4f}")
print(f"F1-score: {f1s_arr[idx]:.4f}")

In [None]:
# Rank the features by the magnitude of their logistic-regression weight.
# NOTE(review): this reads `lr` (the baseline fit), not `best_lr` from the grid
# search — confirm that is the model whose coefficients should be inspected.
coef_df = (
    pd.DataFrame({"feature": X_train.columns, "coefficient": lr.coef_[0]})
    .assign(abs_coef=lambda frame: frame["coefficient"].abs())
    .sort_values(by="abs_coef", ascending=False)
)

print(coef_df[["feature", "coefficient"]])


In [None]:
from sklearn.feature_selection import RFECV

# Recursive feature elimination with 5-fold CV, scored by F1. One feature is
# removed per step and elimination stops at `min_features` features.
min_features = 2
estimator = LogisticRegression(max_iter=5000, random_state=42)
rfecv = RFECV(
    estimator,
    step=1,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    min_features_to_select=min_features
)
rfecv.fit(X_train, y_train)

print("RFECV optimal feature count:", rfecv.n_features_)
selected_features_cv = X_train.columns[rfecv.support_].tolist()
print("RFECV Selected features:", selected_features_cv)

# cv_results_ holds one score per evaluated subset size, and the smallest
# evaluated size is `min_features`, not 1. With step=1 the sizes are
# consecutive, so the x-axis runs from min_features upward.
mean_scores = rfecv.cv_results_['mean_test_score']
feature_counts = range(min_features, min_features + len(mean_scores))

plt.figure(figsize=(8, 4))
plt.plot(feature_counts, mean_scores)
plt.xlabel("Number of features selected")
plt.ylabel("CV mean F1-score")
plt.title("RFECV: Feature Count vs. CV F1-score")
plt.show()

# Refit a fresh model on the retained columns and score it on validation.
X_train_cv = X_train[selected_features_cv]
X_val_cv = X_val[selected_features_cv]
lr_cv = LogisticRegression(max_iter=5000, random_state=42)
lr_cv.fit(X_train_cv, y_train)
y_pred_cv = lr_cv.predict(X_val_cv)
y_prob_cv = lr_cv.predict_proba(X_val_cv)[:, 1]

# The confusion matrix is built once and reused for printing and for the FPR.
cm = confusion_matrix(y_val, y_pred_cv)

print(classification_report(y_val, y_pred_cv))
print('Confusion Matrix:\n', cm)
print('ROC-AUC:', roc_auc_score(y_val, y_prob_cv))
print('F1-score:', f1_score(y_val, y_pred_cv))

TN, FP, FN, TP = cm.ravel()
fpr = FP / (FP + TN) if (FP + TN) > 0 else 0
print('False Positive Rate (FPR):', fpr)
