## Summary of work done

* Used `XGBClassifier` to predict `benign` or `malicious` on `sample-large` (3000 emails)
* 5 different iterations of `XGBClassifier`: `nontext`, `subject` (preprocessed with spaCy), `body` (preprocessed with spaCy) + ensemble classifiers `VotingClassifier` and `StackingClassifier` that take in the 3 `nontext`, `subject` and `body` models

Findings:
* `subject` model has the worst performance, probably due to the lack of information that can be extracted from the subject line that is only a few words long
* `nontext` and `body` models have similar performance &mdash; confident in predicting `malicious` but a coin toss (or worse) for `benign` (poor F1-score and horrible FPR for `benign` class)
* `StackingClassifier` is better than `VotingClassifier`, but the performance is not too different from individual `nontext` and `body` models
* The majority of features in `nontext` do not have a large impact on SHAP values
* Attempted hyperparameter tuning for `XGBClassifier` for `nontext` model; returned best hyperparameters values as the default values
* Attempted feature engineering with `From_email_domain` and `Reply-To` domains but no improvement in performance
* Attempted dropping `self_phishing` emails but no improvement in performance

In [None]:
import pandas as pd
import sys
import os

# Put the directory two levels above the current working directory on the
# import path; later cells import from the `src` package (presumably located
# there — confirm against the repository layout).
sys.path.append(os.path.abspath("../.."))

## Read in data

In [None]:
# Locations of the raw and processed versions of the `sample-large` dataset.
raw_path = '/data/workspace/dataset/sampled-dataset/raw/sample-large.parquet'
processed_path = '/data/workspace/dataset/sampled-dataset/processed/sample-large.parquet'

original_df = pd.read_parquet(raw_path)
input_df = pd.read_parquet(processed_path)

# `DataFrame.join` aligns the two frames on their index (left join by default),
# so each raw row gains the columns of the processed frame.
combined_df = original_df.join(input_df)
combined_df.head()

## Prepare train data

In [None]:
from src.extract_text_keywords import preprocess_text

# Column groups; the same lists are reused when the column transformers are built.
text_features = ['Subject', 'text_plain']
numerical_features = [
    'routing_length', 'word_count', 'readable_proportion',
    'whitespace_ratio', 'alphabet_proportion', 'grammar_error_rate',
    'english_french_proportion', 'url_count',
]
categorical_features = [
    'dkim_result', 'spf_result', 'dmarc_result',
    'html_parsing_error',
]
# Currently disabled binary features: 'all_urls_accessible', 'urls_redirected'.
binary_features = [
    'http_urls_present', 'any_long_urls',
    'is_multipart', 'attachments_present',
    'url_at_symbol',
    'dmarc_authentication_present', 'dkim_sender_domains_match',
    'to_from_addresses_match', 'sender_email_spf_match',
    'non_ascii_present', 'hidden_text_present',
    'ip_addr_urls',
    'url_port_number', 'url_multiple_subdomains',
]

input_df_columns = text_features + numerical_features + categorical_features + binary_features + ['target_1']
# Take an explicit copy: the column assignments below would otherwise be made on
# a selection of `combined_df` and trigger pandas' SettingWithCopyWarning.
input_df = combined_df[input_df_columns].copy()

# Missing subjects become empty strings before text preprocessing.
# NOTE(review): only `Subject` is passed through `preprocess_text` here;
# `text_plain` is used as loaded — confirm that is intended.
input_df['Subject'] = preprocess_text(input_df['Subject'].fillna(""))
# Encode the label as 0/1; any value other than 'benign'/'malicious' maps to NaN.
input_df['target_1'] = input_df['target_1'].map({'benign': 0, 'malicious': 1})
input_df.head()

## Generate train-test split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the encoded label, then keep 70 % of the rows
# for training; the fixed seed makes the split reproducible.
features = input_df.drop(columns=['target_1'])
labels = input_df['target_1']

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, train_size=0.7, random_state=42,
)

## Create preprocessors and pipelines

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Shorthand for the column groups each transformer keeps or discards.
non_text_columns = numerical_features + categorical_features + binary_features
subject_column, body_column = text_features

# Non-text view: scale numeric columns, one-hot encode categorical and binary
# columns, and discard both text columns.
preprocessor_nontext = make_column_transformer(
    (StandardScaler(), numerical_features),
    (
        OneHotEncoder(drop='if_binary', handle_unknown='ignore'),
        categorical_features + binary_features,
    ),
    ("drop", text_features),
)

# Text views: TF-IDF over a single text column. The column is given as a plain
# string (not a list) so the vectorizer receives a 1-D sequence of documents.
preprocessor_subject = make_column_transformer(
    ("drop", non_text_columns + [body_column]),
    (TfidfVectorizer(), subject_column),
)

preprocessor_body = make_column_transformer(
    ("drop", non_text_columns + [subject_column]),
    (TfidfVectorizer(), body_column),
)

In [None]:
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier


def _xgb_pipeline(preprocessor):
    """Chain *preprocessor* with a newly constructed binary XGBoost classifier."""
    classifier = XGBClassifier(
        n_jobs=-1, eval_metric="error", objective="binary:logistic"
    )
    return make_pipeline(preprocessor, classifier)


# One pipeline per feature view; all three share the same classifier settings.
pipe_nontext = _xgb_pipeline(preprocessor_nontext)
pipe_subject = _xgb_pipeline(preprocessor_subject)
pipe_body = _xgb_pipeline(preprocessor_body)

## Create ensemble classifiers

In [None]:
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

# (name, pipeline) pairs shared by both ensembles.
estimators = [
    ('nontext', pipe_nontext),
    ('subject', pipe_subject),
    ('body', pipe_body),
]

# Soft voting combines the predicted class probabilities of the three pipelines.
vc = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)

# Stacking feeds the pipelines' outputs into a logistic-regression meta-model.
meta_model = LogisticRegression()
sc = StackingClassifier(
    estimators=estimators, final_estimator=meta_model, n_jobs=-1,
)

## Cross-validation results

In [None]:
def summarise_cv_results(results):
    """Collapse per-fold cross-validation arrays into their means.

    Args:
        results: Mapping with the keys ``fit_time``, ``score_time``,
            ``train_score`` and ``test_score``; each value must expose
            ``.mean()``.

    Returns:
        dict with the keys ``mean_fit_time``, ``mean_score_time``,
        ``mean_train_f1`` and ``mean_test_f1``, in that order.
    """
    # Output label -> key of the per-fold array it summarises.
    sources = {
        "mean_fit_time": 'fit_time',
        "mean_score_time": 'score_time',
        "mean_train_f1": 'train_score',
        "mean_test_f1": 'test_score',
    }
    return {label: results[key].mean() for label, key in sources.items()}

In [None]:
from sklearn.model_selection import cross_validate

models = [pipe_nontext, pipe_subject, pipe_body, vc, sc]
names = ['pipe_nontext', 'pipe_subject', 'pipe_body', 'voting', 'stacking']

# Shared settings: 5-fold cross-validation scored on F1, also recording the
# score on the training folds.
cv_options = dict(scoring='f1', cv=5, n_jobs=-1, return_train_score=True)

cv_results = {}
for name, model in zip(names, models):
    fold_scores = cross_validate(model, X_train, y_train, **cv_options)
    cv_results[name] = summarise_cv_results(fold_scores)

# One column per model, one row per summary statistic.
pd.DataFrame(cv_results)

## Train results

In [None]:
# Fit the three standalone pipelines and both ensembles on the training split.
for model in models:
    model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Report how each fitted model performs on the data it was trained on.
for name, model in zip(names, models):
    train_pred = model.predict(X_train)
    print(f'Model: {name}\n')
    print(classification_report(y_train, train_pred))

    cm = confusion_matrix(y_train, train_pred)
    # Rows are true classes, columns predicted classes (0 = benign, 1 = malicious).
    (TN, FP), (FN, TP) = cm
    actual_negatives = FP + TN
    if actual_negatives > 0:
        FPR = FP / actual_negatives
    else:
        FPR = 0

    print(f'         FPR    {FPR:.05f}\n')
    print(cm)
    print('-----------------------------------')

## Test results

In [None]:
# Report how each fitted model performs on the held-out test split.
for name, model in zip(names, models):
    test_pred = model.predict(X_test)
    print(f'Model: {name}\n')
    print(classification_report(y_test, test_pred))

    cm = confusion_matrix(y_test, test_pred)
    # Rows are true classes, columns predicted classes (0 = benign, 1 = malicious).
    (TN, FP), (FN, TP) = cm
    actual_negatives = FP + TN
    if actual_negatives > 0:
        FPR = FP / actual_negatives
    else:
        FPR = 0

    print(f'         FPR    {FPR:.05f}\n')
    print(cm)
    print('-----------------------------------')

## Ensemble classifiers

In [None]:
# Display the voting ensemble (a notebook renders the cell's last expression).
vc

In [None]:
# Display the stacking ensemble (a notebook renders the cell's last expression).
sc

## Distribution of probabilities

In [None]:
# Class probabilities from the stacking ensemble on the held-out split; the
# columns follow the class order, i.e. 0 = benign and 1 = malicious.
probs = pd.DataFrame(sc.predict_proba(X_test))
# Assign as a plain list so values are placed positionally: `probs` has a fresh
# RangeIndex, whereas `y_test` keeps the shuffled index of the original frame.
probs['true'] = y_test.tolist()
# Predictions must come from the same model as the probabilities
# (this column was previously filled from `pipe_body`).
probs['pred'] = sc.predict(X_test)

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# One panel per true class, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Split the predicted probabilities by the true label (0 = benign, 1 = malicious).
benign = probs[probs['true'] == 0]
malicious = probs[probs['true'] == 1]

for ax, subset, class_name in zip(axes, (benign, malicious), ('Benign', 'Malicious')):
    # Attach each label to its own histogram. Passing a bare list of labels to
    # `legend()` pairs labels with artists purely by draw order, which does not
    # reliably match the two histograms when they are drawn as bars.
    sns.histplot(
        data=subset, x=0, ax=ax, bins=20, color='blue', alpha=0.7,
        label='Prob of Benign (0)',
    )
    sns.histplot(
        data=subset, x=1, ax=ax, bins=20, color='red', alpha=0.7,
        label='Prob of Malicious (1)',
    )
    ax.set_title(f'Probability Distribution for Actual {class_name} Emails')
    ax.set_xlabel('Probability')
    ax.set_ylabel('Count')
    ax.legend()

# Overall title; leave head-room for it after tightening the layout.
plt.suptitle('Distribution of Prediction Probabilities by True Label', fontsize=16)
plt.tight_layout()
plt.subplots_adjust(top=0.9)

plt.show()

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Plot the confusion matrix of the voting ensemble on the held-out test split.
# NOTE(review): the surrounding cells inspect `sc`; confirm `vc` is the model
# intended here.
ConfusionMatrixDisplay.from_estimator(
    vc, X_test, y_test
)

## Feature importances

In [None]:
import shap

# Pull the two fitted steps out of the non-text pipeline by their step names.
nontext_transformer = pipe_nontext['columntransformer']
nontext_model = pipe_nontext['xgbclassifier']

explainer = shap.TreeExplainer(nontext_model)

# The classifier was trained on transformed columns, so explain it in that
# space and label the columns with the transformer's output feature names.
observations = pd.DataFrame(
    nontext_transformer.transform(X_train),
    columns=nontext_transformer.get_feature_names_out(),
)

shap_values = explainer.shap_values(observations)
shap.summary_plot(shap_values, observations, plot_type="bar")

In [None]:
# Re-uses `explainer` and `observations` from the preceding cell (non-text
# model) and shows up to 50 features in the beeswarm plot.
explanation = explainer(observations)
shap.plots.beeswarm(explanation, max_display=50)

In [None]:
# SHAP analysis of the subject-line model, in the TF-IDF feature space.
explainer = shap.TreeExplainer(pipe_subject['xgbclassifier'])
# The TF-IDF output is sparse; densify it so it can be wrapped in a DataFrame
# labelled with the transformer's output feature names.
observations = pd.DataFrame(
    pipe_subject['columntransformer'].transform(X_train).toarray(),
    columns=pipe_subject['columntransformer'].get_feature_names_out()
)
# Compute the SHAP values once; the raw array is taken from the Explanation
# instead of running the explainer a second time.
explanation = explainer(observations)
shap_values = explanation.values
shap.plots.beeswarm(explanation)

In [None]:
# SHAP analysis of the body-text model, in the TF-IDF feature space.
explainer = shap.TreeExplainer(pipe_body['xgbclassifier'])
# The TF-IDF output is sparse; densify it so it can be wrapped in a DataFrame
# labelled with the transformer's output feature names.
observations = pd.DataFrame(
    pipe_body['columntransformer'].transform(X_train).toarray(),
    columns=pipe_body['columntransformer'].get_feature_names_out()
)
# Compute the SHAP values once; the raw array is taken from the Explanation
# instead of running the explainer a second time.
explanation = explainer(observations)
shap_values = explanation.values
shap.plots.beeswarm(explanation)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the XGBoost step of the non-text pipeline; the
# `xgbclassifier__` prefix routes each parameter to that pipeline step.
# The `eta` and `gamma` lists used to contain 0.5 twice, so identical candidates
# were fitted twice. 0.05 is presumably the value that was intended — confirm.
param_grid = {
    'xgbclassifier__eta': [0.01, 0.05, 0.1, 0.25, 0.5],
    'xgbclassifier__max_depth': [3, 4, 5, 6],
    'xgbclassifier__min_child_weight': [1, 2, 4, 8],
    'xgbclassifier__gamma': [0.01, 0.05, 0.1, 0.25, 0.5],
    'xgbclassifier__reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
}

# Exhaustive search over the grid, scored on F1 with 5-fold cross-validation.
grid_nontext = GridSearchCV(
    pipe_nontext, param_grid, scoring="f1", cv=5, n_jobs=-1
)

In [None]:
# Run the grid search on the training split.
grid_nontext.fit(X_train, y_train)

In [None]:
# Training-split metrics for the grid-searched non-text pipeline.
y_pred = grid_nontext.predict(X_train)
# Use an explicit label: the global `name` still holds the last value from the
# earlier model loop ('stacking') and would mislabel this report.
print('Model: grid_nontext\n')
print(classification_report(y_train, y_pred))

cm = confusion_matrix(y_train, y_pred)
# Rows are true classes, columns predicted classes (0 = benign, 1 = malicious).
TP, TN, FP, FN = cm[1, 1], cm[0, 0], cm[0, 1], cm[1, 0]
# False-positive rate: share of benign emails flagged as malicious.
FPR = FP / (FP + TN) if (FP + TN) > 0 else 0

print(f'         FPR    {FPR:.05f}\n')
print(cm)

In [None]:
# Test-split metrics for the grid-searched non-text pipeline.
y_pred = grid_nontext.predict(X_test)
# Use an explicit label: the global `name` still holds the last value from the
# earlier model loop ('stacking') and would mislabel this report.
print('Model: grid_nontext\n')
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
# Rows are true classes, columns predicted classes (0 = benign, 1 = malicious).
TP, TN, FP, FN = cm[1, 1], cm[0, 0], cm[0, 1], cm[1, 0]
# False-positive rate: share of benign emails flagged as malicious.
FPR = FP / (FP + TN) if (FP + TN) > 0 else 0

print(f'         FPR    {FPR:.05f}\n')
print(cm)