In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score, roc_curve, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import os
import sys
import shap
os.getcwd()

In [None]:
# Load the raw and the processed view of the large sample and combine them
# column-wise. DataFrame.join aligns the two frames on their index.
# NOTE(review): assumes both parquet files share the same index and have no
# overlapping column names (join would raise on an overlap) - confirm.
SAMPLE_DIR = Path('../../../../dataset/sampled-dataset')

df_raw_large = pd.read_parquet(SAMPLE_DIR / 'raw' / 'sample-large.parquet')
# The original line was a chained assignment that also bound a stray
# `o_small` alias to this frame; only df_pro_large is used afterwards.
df_pro_large = pd.read_parquet(SAMPLE_DIR / 'processed' / 'sample-large.parquet')
df_large = df_raw_large.join(df_pro_large)
df_large

In [None]:
# Register ../src (resolved against the current working directory) on the
# module search path, then echo the working directory as the cell output.
sys.path += [os.path.abspath('../src')]
os.getcwd()

In [None]:
# Make ../../src importable so the project-local feature helpers can be loaded.
sys.path.append(os.path.abspath('../../src'))

from extract_extra_features import (
    compute_name_email_similarity,
    from_email_in_malicious_list,
    from_email_length,
    from_name_email_sim,
    subject_contains_benign_words,
    subject_contains_malicious_words,
    subject_has_reply_fwd,
    subject_is_empty,
    subject_length_series,
    subject_word_count_series,
    to_is_hidden,
)

# New column name -> callable that derives it from the frame. The dict order
# is the order in which the columns are appended to df_large, and each
# builder receives df_large as it is at that moment.
extra_feature_builders = {
    'name_email_similarity': lambda frame: compute_name_email_similarity(frame['From_name'], frame['From_email']),
    'from_name_email_sim': lambda frame: from_name_email_sim(frame, from_name_col='From_name', from_email_col='From_email'),
    'from_email_length': lambda frame: from_email_length(frame['From_email']),
    'to_is_hidden': lambda frame: to_is_hidden(frame, to_col='To'),
    'subject_word_count': lambda frame: subject_word_count_series(frame['Subject']),
    'subject_contains_malicious_words': lambda frame: subject_contains_malicious_words(frame['Subject']),
    'subject_contains_benign_words': lambda frame: subject_contains_benign_words(frame['Subject']),
    'subject_length': lambda frame: subject_length_series(frame['Subject']),
    'subject_has_reply_fwd': lambda frame: subject_has_reply_fwd(frame['Subject']),
    'subject_is_empty': lambda frame: subject_is_empty(frame['Subject']),
}

for column_name, build_feature in extra_feature_builders.items():
    df_large[column_name] = build_feature(df_large)

# Currently disabled feature, kept for reference:
# df_large['from_email_in_malicious_list'] = from_email_in_malicious_list(df_large['From_email'])
df_large

In [None]:
# Columns fed to the model, grouped by how later cells treat them:
# numerical columns are standardised, categorical columns are one-hot
# encoded, binary columns are passed through unchanged.
numerical_features = [
    # expected to already be present in df_large
    'routing_length',
    'word_count',
    'readable_proportion',
    'whitespace_ratio',
    'alphabet_proportion',
    'grammar_error_rate',
    'english_french_proportion',
    'url_count',
    # added by the feature-extraction cell
    'name_email_similarity',
    'from_name_email_sim',
    'subject_word_count',
    'from_email_length',
    'subject_length',
]

categorical_features = [
    'dkim_result',
    'spf_result',
    'dmarc_result',
    'html_parsing_error',
]

binary_features = [
    # expected to already be present in df_large
    'is_multipart',
    'attachments_present',
    'dmarc_authentication_present',
    'dkim_sender_domains_match',
    'to_from_addresses_match',
    'sender_email_spf_match',
    'non_ascii_present',
    'hidden_text_present',
    'ip_addr_urls',
    'http_urls_present',
    'url_at_symbol',
    'url_port_number',
    'any_long_urls',
    'url_multiple_subdomains',
    # added by the feature-extraction cell
    'to_is_hidden',
    'subject_contains_malicious_words',
    'subject_contains_benign_words',
    'subject_has_reply_fwd',
    'subject_is_empty',
]

# Label column; its values are mapped to 0/1 when y is built.
target_col = 'target_1'

# Full model input, in the order numerical -> categorical -> binary.
model_features = [*numerical_features, *categorical_features, *binary_features]
# Feature matrix: an independent copy, so later cells can modify X without
# touching df_large.
X = df_large[model_features].copy()

# Encode the label as 0 (benign) / 1 (malicious). Series.map returns NaN for
# every value that is not a key of the mapping, so an unexpected or missing
# label would otherwise slip through silently and only fail later, in the
# split or the model fit, with an unrelated-looking error.
label_encoding = {'benign': 0, 'malicious': 1}
y = df_large[target_col].map(label_encoding)

unmapped_labels = df_large.loc[y.isna(), target_col]
if not unmapped_labels.empty:
    raise ValueError(
        f"{len(unmapped_labels)} rows of {target_col!r} have labels outside "
        f"{sorted(label_encoding)}: {unmapped_labels.unique().tolist()[:10]}"
    )


In [None]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# ratio the same in both parts; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)


In [None]:
def _one_hot(frame):
    """One-hot encode the categorical columns, adding a NaN indicator per column."""
    return pd.get_dummies(frame, columns=categorical_features, dummy_na=True)


# Encode each split on its own, then force the validation frame onto the
# training columns: join='left' keeps exactly the columns of X_train, and
# columns missing from the validation frame are filled with 0.
# Note: this cell overwrites X_train / X_val, so it cannot be re-run without
# re-running the split cell first (the categorical columns are gone).
X_train, X_val = _one_hot(X_train), _one_hot(X_val)
X_train, X_val = X_train.align(X_val, join='left', axis=1, fill_value=0)

In [None]:
# Standardise the numerical columns. The scaler statistics come from the
# training split only and are then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train[numerical_features])
for split_frame in (X_train, X_val):
    split_frame[numerical_features] = scaler.transform(split_frame[numerical_features])

In [None]:
# Baseline: logistic regression with default regularisation settings.
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Hard predictions plus the predicted probability of class 1
# (second column of predict_proba).
y_pred = lr.predict(X_val)
y_prob = lr.predict_proba(X_val)[:, 1]

In [None]:
# F1 of the baseline predictions on the validation split.
f1 = f1_score(y_val, y_pred, average='binary')
print("F1-score:", f1)

# Flatten the confusion matrix into its four cells.
cm = confusion_matrix(y_val, y_pred)
TN, FP, FN, TP = cm.ravel()

# False positive rate = FP / (FP + TN); reported as 0 when the denominator is 0.
actual_negatives = FP + TN
if actual_negatives > 0:
    FPR = FP / actual_negatives
else:
    FPR = 0
print("False Positive Rate (FPR):", FPR)

In [None]:
# Hyper-parameter grid for the logistic regression search: every combination
# of regularisation strength, penalty type, solver and class weighting.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=['l2', 'l1'],
    solver=['liblinear', 'saga'],
    class_weight=[None, 'balanced'],
)


In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation, ranking
# candidates by F1 and using all available cores.
lr_grid = LogisticRegression(max_iter=5000, random_state=42)
grid_search = GridSearchCV(
    estimator=lr_grid,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

In [None]:
# Evaluate the best estimator found by the grid search on the validation
# split. This overwrites y_pred / y_prob from the baseline model.
best_lr = grid_search.best_estimator_
print("Best Params:", grid_search.best_params_)

y_pred = best_lr.predict(X_val)
y_prob = best_lr.predict_proba(X_val)[:, 1]

print(classification_report(y_val, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('ROC-AUC:', roc_auc_score(y_val, y_prob))
print('F1-score:', f1_score(y_val, y_pred))

In [None]:
# False positive rate of the current y_pred: FP / (FP + TN), or 0 when the
# denominator is 0.
cm = confusion_matrix(y_val, y_pred)
TN, FP, FN, TP = cm.ravel()

actual_negatives = FP + TN
if actual_negatives > 0:
    fpr = FP / actual_negatives
else:
    fpr = 0
print('False Positive Rate (FPR):', fpr)

In [None]:
# Sweep the decision threshold from 0.01 to 0.99 and record, for each value,
# the F1-score and the false positive rate obtained by thresholding y_prob.
thresholds = np.arange(0.01, 1, 0.01)
f1s, fprs = [], []

for cutoff in thresholds:
    flagged = (y_prob >= cutoff).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_val, flagged).ravel()
    f1s.append(f1_score(y_val, flagged))
    # FPR = FP / (FP + TN), recorded as 0 when the denominator is 0.
    fprs.append(fp / (fp + tn) if (fp + tn) > 0 else 0)

# Both curves on one axis so the trade-off is visible at a glance.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(thresholds, f1s, label='F1-score')
ax.plot(thresholds, fprs, label='FPR')
ax.set_xlabel('Threshold')
ax.set_ylabel('Score')
ax.legend()
ax.set_title('F1-score & FPR vs. Threshold')
plt.show()

In [None]:
# Pick the swept threshold whose false positive rate is closest to 0.05 and
# report the FPR and F1 actually obtained there.
fprs_arr, f1s_arr, thresholds_arr = (
    np.array(series) for series in (fprs, f1s, thresholds)
)

idx = np.abs(fprs_arr - 0.05).argmin()

print(f"Threshold at closest FPR=0.05: {thresholds_arr[idx]:.2f}")
print(f"Actual FPR: {fprs_arr[idx]:.4f}")
print(f"F1-score: {f1s_arr[idx]:.4f}")

In [None]:
# Pick the swept threshold whose F1-score is closest to 0.8 and report the
# FPR and F1 actually obtained there.
fprs_arr = np.array(fprs)
f1s_arr = np.array(f1s)
thresholds_arr = np.array(thresholds)

idx = np.argmin(np.abs(f1s_arr - 0.8))

# The selection criterion here is F1 (not FPR), so the label says so; the
# previous label was copied from the FPR=0.05 cell and was misleading.
print(f"Threshold at closest F1=0.8: {thresholds_arr[idx]:.2f}")
print(f"Actual FPR: {fprs_arr[idx]:.4f}")
print(f"F1-score: {f1s_arr[idx]:.4f}")

In [None]:
def _as_float_frame(frame):
    """Drop all-NaN columns, cast every column to float64 and use string column labels."""
    cleaned = frame.dropna(axis=1, how='all').astype(np.float64)
    cleaned.columns = cleaned.columns.map(str)
    return cleaned


# Normalise both splits the same way before they are handed to the
# explainability cells that follow.
X_val = _as_float_frame(X_val)
X_train = _as_float_frame(X_train)

In [None]:
# Explain the tuned model with SHAP (shap is imported in the first cell).
# X_train is passed to the explainer as its background data; the values are
# computed for the validation split.
explainer = shap.Explainer(best_lr, X_train)
shap_values = explainer(X_val)

# First the bar summary, then the default summary plot.
for plot_options in ({'plot_type': 'bar'}, {}):
    shap.summary_plot(shap_values, X_val, **plot_options)

In [None]:
# Rank the features by the absolute size of their logistic-regression
# coefficient (largest first) and print feature / signed coefficient.
# NOTE(review): this reads the baseline model `lr`, not the tuned `best_lr`
# used by the SHAP cell - confirm which model is meant to be inspected.
coef_df = (
    pd.DataFrame({"feature": X_train.columns, "coefficient": lr.coef_[0]})
    .assign(abs_coef=lambda frame: frame["coefficient"].abs())
    .sort_values(by="abs_coef", ascending=False)
)

print(coef_df[["feature", "coefficient"]])


# RFECV

In [None]:
from sklearn.feature_selection import RFECV

# Recursive feature elimination with cross-validation: remove one feature per
# step and score each feature count by 5-fold cross-validated F1.
estimator = LogisticRegression(max_iter=5000, random_state=42)
rfecv = RFECV(estimator, step=1, cv=5, scoring='f1', n_jobs=-1).fit(X_train, y_train)

selected_features = X_train.columns[rfecv.support_].tolist()
print("Optimal number of features selected by RFECV:", rfecv.n_features_)
print("Selected features:", selected_features)

# Mean cross-validated score for each number of retained features.
mean_cv_scores = rfecv.cv_results_['mean_test_score']
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores)
ax.set_xlabel("Number of Features Selected")
ax.set_ylabel("Cross-Validated F1 Score")
ax.set_title("RFECV: Number of Features vs. F1 Score")
plt.show()

# Refit the plain estimator on the selected columns only and evaluate it on
# the validation split. This overwrites y_pred / y_prob from earlier cells.
X_train_rfecv = X_train[selected_features]
X_val_rfecv = X_val[selected_features]

estimator.fit(X_train_rfecv, y_train)
y_pred = estimator.predict(X_val_rfecv)
y_prob = estimator.predict_proba(X_val_rfecv)[:, 1]

cm = confusion_matrix(y_val, y_pred)

print("RFECV Model Evaluation:")
print(classification_report(y_val, y_pred))
print("Confusion Matrix:\n", cm)
print("ROC-AUC:", roc_auc_score(y_val, y_prob))
print("F1-score:", f1_score(y_val, y_pred))

# False positive rate = FP / (FP + TN), or 0 when the denominator is 0.
TN, FP, FN, TP = cm.ravel()
actual_negatives = FP + TN
if actual_negatives > 0:
    fpr = FP / actual_negatives
else:
    fpr = 0
print("False Positive Rate (FPR):", fpr)


In [None]:
# Repeat the threshold sweep on the current y_prob (the RFECV-refit model
# when the notebook is run top to bottom): F1 and FPR for each threshold
# from 0.01 to 0.99.
thresholds = np.arange(0.01, 1.0, 0.01)
f1_by_threshold, fpr_by_threshold = [], []

for cutoff in thresholds:
    flagged = (y_prob >= cutoff).astype(int)
    f1 = f1_score(y_val, flagged)
    f1_by_threshold.append(f1)

    tn, fp, fn, tp = confusion_matrix(y_val, flagged).ravel()
    # FPR = FP / (FP + TN), recorded as 0 when the denominator is 0.
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fpr_by_threshold.append(fpr)

# Later cells index these as arrays.
f1s = np.array(f1_by_threshold)
fprs = np.array(fpr_by_threshold)

# Threshold whose F1-score is closest to 0.8.
idx_f1 = np.abs(f1s - 0.8).argmin()
print(f"Threshold closest to F1=0.8: {thresholds[idx_f1]:.2f}")
print(f"F1-score at this threshold: {f1s[idx_f1]:.4f}")
print(f"FPR at this threshold: {fprs[idx_f1]:.4f}")


In [None]:
# Threshold from the sweep whose false positive rate is closest to 0.05,
# together with the FPR and F1 obtained there.
idx_fpr = np.abs(fprs - 0.05).argmin()
print(f"Threshold closest to FPR=0.05: {thresholds[idx_fpr]:.2f}")
print(f"FPR at this threshold: {fprs[idx_fpr]:.4f}")
print(f"F1-score at this threshold: {f1s[idx_fpr]:.4f}")