## Import libraries

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC

import pickle
import os, sys

sys.path.append(os.path.abspath("../.."))

## Import and prepare dataset

In [None]:
# Raw training split; rows whose `target_3` equals "self_phishing" are
# filtered out.
original_df = pd.read_parquet(
    '/data/workspace/danishki/git_repo/data/full-dataset/raw/train.parquet'
)
original_df = original_df.query('`target_3` != "self_phishing"')

# Preview the first rows.
original_df.head()

In [None]:
# Processed (feature-engineered) training split, copied so that the in-place
# relabelling below works on an independent frame.
features_df = pd.read_parquet(
    '/data/workspace/danishki/git_repo/data/full-dataset/processed/train.parquet'
).copy()

# Every row flagged with `empty_body` is relabelled as 'malicious' in
# `target_1`, overriding whatever label it carried.
features_df.loc[features_df['empty_body'].eq(True), 'target_1'] = 'malicious'

# Preview the first rows.
features_df.head()

In [None]:
# 70/30 split with a fixed seed. The three target columns are removed from the
# feature matrix and `target_1` is used as the label.
X_train, X_test, y_train, y_test = train_test_split(
    features_df.drop(columns=['target_1', 'target_2', 'target_3']),
    features_df['target_1'],
    random_state=42,
    train_size=0.7,
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Preview the first training labels (`target_1`).
y_train.head()

### Preprocessors

In [None]:
# --- Header-derived features -------------------------------------------------
# Free-text column fed to a CountVectorizer.
header_text_feats = 'subject'

# Boolean flags (one-hot encoded together with the categorical columns).
header_bool_feats = [
    'url_present_in_subject',
    'dmarc_authentication_present',
    'dkim_sender_domains_match',
    'to_from_addresses_match',
    'sender_email_spf_match',
    'different_reply_domains',
    'name_server_match',
]

# Categorical columns.
header_cat_feats = ['dkim_result', 'spf_result', 'dmarc_result']

# Numeric columns (standard-scaled).
header_num_feats = ['routing_length_before_ubc', 'internal_server_transfer_count']

# --- Body-derived features ---------------------------------------------------
# Free-text column fed to a CountVectorizer.
body_text_feats = 'text_clean'

# Boolean flags.
body_bool_feats = ['non_ascii_present', 'hidden_text_present', 'empty_body']

# Categorical columns.
body_cat_feats = ['html_parsing_error']

# Numeric columns (standard-scaled).
body_num_feats = [
    'word_count',
    'readable_proportion',
    'whitespace_ratio',
    'alphabet_proportion',
    'grammar_error_rate',
    'english_french_proportion',
    'text_content_count',
    'multimedia_content_count',
    'others_content_count',
    'hyperlink_proportion',
]

# Cast every categorical column of `features_df` to the pandas category dtype.
# NOTE(review): this only touches `features_df`; the X_train / X_test frames
# produced earlier by train_test_split are separate objects and keep their
# original dtypes - confirm whether the cast was meant to happen before the split.
for feat in [*header_cat_feats, *body_cat_feats]:
    features_df[feat] = features_df[feat].astype('category')

features_df.info()

In [None]:
# Header non-text features: one-hot encode the boolean and categorical columns
# (binary columns collapse to a single indicator, unseen categories are
# ignored at transform time) and standard-scale the numeric ones. Columns not
# listed are dropped.
preprocessor_header = make_column_transformer(
    (
        OneHotEncoder(drop='if_binary', handle_unknown='ignore'),
        [*header_bool_feats, *header_cat_feats],
    ),
    (StandardScaler(), header_num_feats),
    remainder='drop',
)

preprocessor_header

In [None]:
# Bag-of-words over the `header_text_feats` column only; every other column is
# dropped.
#
# `stop_words` accepts either the string 'english' (scikit-learn's only
# built-in list) or an explicit list of the words to remove. A list such as
# ['english', 'french'] is read as the stop words themselves and would strip
# only those two literal tokens, so the built-in English list is requested.
# TODO: supply an explicit French word list (merged with the English one) if
# French stop words must be removed as well.
preprocessor_subject = make_column_transformer(
    (
        CountVectorizer(strip_accents='unicode', lowercase=True, stop_words='english'),
        header_text_feats,
    ),
    remainder='drop',
)

preprocessor_subject

In [None]:
# Bag-of-words over the `body_text_feats` column only; every other column is
# dropped.
#
# `stop_words` accepts either the string 'english' (scikit-learn's only
# built-in list) or an explicit list of the words to remove. A list such as
# ['english', 'french'] is read as the stop words themselves and would strip
# only those two literal tokens, so the built-in English list is requested.
# TODO: supply an explicit French word list (merged with the English one) if
# French stop words must be removed as well.
preprocessor_body = make_column_transformer(
    (
        CountVectorizer(strip_accents='unicode', lowercase=True, stop_words='english'),
        body_text_feats,
    ),
    remainder='drop',
)

preprocessor_body

In [None]:
# Body non-text features: one-hot encode the boolean and categorical columns
# (binary columns collapse to a single indicator, unseen categories are
# ignored at transform time) and standard-scale the numeric ones. Columns not
# listed are dropped.
preprocessor_body_nontext = make_column_transformer(
    (
        OneHotEncoder(drop='if_binary', handle_unknown='ignore'),
        [*body_bool_feats, *body_cat_feats],
    ),
    (StandardScaler(), body_num_feats),
    remainder='drop',
)

preprocessor_body_nontext

In [None]:
# Text-only preprocessor: separate bag-of-words vocabularies for the
# `header_text_feats` and `body_text_feats` columns; everything else is dropped.
#
# `stop_words` accepts either the string 'english' (scikit-learn's only
# built-in list) or an explicit list of the words to remove. A list such as
# ['english', 'french'] is read as the stop words themselves and would strip
# only those two literal tokens, so the built-in English list is requested.
# TODO: supply an explicit French word list (merged with the English one) if
# French stop words must be removed as well.
preprocessor_text = make_column_transformer(
    (
        CountVectorizer(strip_accents='unicode', lowercase=True, stop_words='english'),
        header_text_feats,
    ),
    (
        CountVectorizer(strip_accents='unicode', lowercase=True, stop_words='english'),
        body_text_feats,
    ),
    remainder='drop',
)

preprocessor_text

In [None]:
# All non-text features (header + body): one-hot encode the boolean and
# categorical columns, standard-scale the numeric ones, drop the rest.
preprocessor_nontext = make_column_transformer(
    (
        OneHotEncoder(drop='if_binary', handle_unknown='ignore'),
        [*header_bool_feats, *header_cat_feats, *body_bool_feats, *body_cat_feats],
    ),
    (
        StandardScaler(),
        [*header_num_feats, *body_num_feats],
    ),
    remainder='drop',
)

preprocessor_nontext

In [None]:
# Full preprocessor: one-hot encoded boolean/categorical columns, scaled
# numeric columns, and one bag-of-words vocabulary each for the
# `header_text_feats` and `body_text_feats` columns. Unlisted columns are
# dropped.
#
# `stop_words` accepts either the string 'english' (scikit-learn's only
# built-in list) or an explicit list of the words to remove. A list such as
# ['english', 'french'] is read as the stop words themselves and would strip
# only those two literal tokens, so the built-in English list is requested.
# TODO: supply an explicit French word list (merged with the English one) if
# French stop words must be removed as well.
preprocessor_all = make_column_transformer(
    (
        OneHotEncoder(handle_unknown='ignore', drop='if_binary'),
        header_bool_feats + header_cat_feats + body_bool_feats + body_cat_feats
    ),
    (
        StandardScaler(),
        header_num_feats + body_num_feats
    ),
    (
        CountVectorizer(strip_accents='unicode', lowercase=True, stop_words='english'),
        header_text_feats,
    ),
    (
        CountVectorizer(strip_accents='unicode', lowercase=True, stop_words='english'),
        body_text_feats,
    ),
    remainder='drop',
)

preprocessor_all

## Model selection

### Base models

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from imblearn.over_sampling import ADASYN
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

# Baseline: DummyClassifier on the non-text features.
dummy = make_pipeline(
    preprocessor_nontext,
    DummyClassifier(random_state=42)
)

# Logistic regression on every feature group.
lr = make_pipeline(
    preprocessor_all,
    LogisticRegression()
)

# Naive Bayes on the text counts only. MultinomialNB is used because the
# CountVectorizer output is a sparse matrix of non-negative counts, which
# GaussianNB does not accept without densifying. The results table elsewhere
# in the notebook reports this pipeline under the 'GaussianNB' key.
nb = make_pipeline(
    preprocessor_text,
    MultinomialNB()
)

# Gradient-boosted trees on every feature group, using all available cores.
xgb = make_pipeline(
    preprocessor_all,
    XGBClassifier(n_jobs=-1)
)

In [None]:
from sklearn.model_selection import cross_validate

# Code adapted from DSCI571: Lecture 4 
def mean_std_cross_val_scores(model, X_train, y_train, scoring=None):
    """
    Cross-validate a model and summarise every reported metric as
    "mean (+/- std)".

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    scoring : optional
        Forwarded unchanged as the ``scoring`` argument of
        ``cross_validate``; ``None`` (the default) leaves the choice of
        metric to ``cross_validate``.

    Returns
    ----------
        pandas Series of strings formatted as "mean (+/- std)" with three
        decimals, indexed by the keys of the ``cross_validate`` result
    """

    # Build the per-fold table once; train scores are requested so that
    # train and validation performance can be compared.
    scores = pd.DataFrame(
        cross_validate(
            model, X_train, y_train, scoring=scoring, n_jobs=-1, return_train_score=True
        )
    )

    mean_scores = scores.mean()
    std_scores = scores.std()

    out_col = [
        f"{mean:0.3f} (+/- {std:0.3f})"
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [None]:
from sklearn.metrics import make_scorer, f1_score

# Encode the string labels as integers: benign -> 0, malicious -> 1.
y_train_binary = y_train.map({'benign': 0, 'malicious': 1})

# F1 score computed with label 0 (benign) as the positive class.
scoring = make_scorer(f1_score, pos_label=0)

# Pipelines to compare, keyed by the name shown in the results table.
models = {
    'DummyClassifier': dummy,
    'LogisticRegression': lr,
    'GaussianNB': nb,
    'XGBClassifier': xgb,
}

# The pipelines are only cross-validated here; none of them is fitted on the
# full training set in this cell.
cv_results = {
    name: mean_std_cross_val_scores(model, X_train, y_train_binary, scoring)
    for name, model in models.items()
}

### Cross-validation results

In [None]:
# Transposed so that each row is a model and each column a cross-validation metric.
pd.DataFrame(cv_results).T

### Feature importances for `XGBClassifier`

In [None]:
import shap

# Fit the baseline XGBoost pipeline on the full training split, then pull out
# its two steps by the names make_pipeline assigned to them.
models['XGBClassifier'].fit(X_train, y_train_binary)
ct = models['XGBClassifier'].named_steps['columntransformer']
model = models['XGBClassifier'].named_steps['xgbclassifier']

# Densified, transformed training matrix with readable column names so the
# SHAP plot can label the features.
observations = pd.DataFrame(
    data=ct.transform(X_train).toarray(),
    columns=ct.get_feature_names_out(),
)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(observations)
shap.summary_plot(shap_values, observations, plot_type="bar")

## Ensemble model

### Set up pipeline

In [None]:
# One XGBoost pipeline per feature group; each becomes a base learner of the
# stacked ensemble. All share the same objective and use every core.
xgb_kwargs = {"n_jobs": -1, "objective": "binary:logistic"}

# Header non-text features.
pipe_header = make_pipeline(
    preprocessor_header,
    XGBClassifier(enable_categorical=True, **xgb_kwargs),
)

# Subject-line bag of words.
pipe_subject = make_pipeline(preprocessor_subject, XGBClassifier(**xgb_kwargs))

# Body bag of words.
pipe_body = make_pipeline(preprocessor_body, XGBClassifier(**xgb_kwargs))

# Body non-text features.
pipe_body_nontext = make_pipeline(
    preprocessor_body_nontext,
    XGBClassifier(enable_categorical=True, **xgb_kwargs),
)

### Model selection for `final_estimator`

In [None]:
# (name, pipeline) pairs used as base learners by every stack below. The names
# become the parameter prefixes (e.g. `header__...`) of the stacked model.
estimators = list(zip(
    ("header", "subject", "body", "body_nontext"),
    (pipe_header, pipe_subject, pipe_body, pipe_body_nontext),
))

# Three stacks that differ only in their final estimator.
sc_lr = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression(), n_jobs=-1,
)

sc_svc = StackingClassifier(
    estimators=estimators,
    final_estimator=SVC(class_weight='balanced', probability=True),
    n_jobs=-1,
)

sc_xgb = StackingClassifier(
    estimators=estimators, final_estimator=XGBClassifier(), n_jobs=-1,
)

sc_lr

In [None]:
from sklearn.metrics import make_scorer, f1_score

# Encode the string labels as integers: benign -> 0, malicious -> 1.
y_train_binary = y_train.map({'benign': 0, 'malicious': 1})

# F1 score computed with label 0 (benign) as the positive class.
scoring = make_scorer(f1_score, pos_label=0)

# Stacked models to compare, keyed by the type of their final estimator.
sc_models = {
    'LogisticRegression': sc_lr,
    'SVC': sc_svc,
    'XGBClassifier': sc_xgb,
}

cv_results = {
    name: mean_std_cross_val_scores(model, X_train, y_train_binary, scoring)
    for name, model in sc_models.items()
}

### Cross-validation results

In [None]:
# One column per stacked model, one row per cross-validation metric.
pd.DataFrame(cv_results)

### Comparison of architectures

In [None]:
# Alternative architecture with only two base learners: one over all text
# columns and one over all non-text columns.
pipe_text = make_pipeline(
    preprocessor_text,
    XGBClassifier(objective="binary:logistic", n_jobs=-1),
)

pipe_nontext = make_pipeline(
    preprocessor_nontext,
    XGBClassifier(objective="binary:logistic", n_jobs=-1, enable_categorical=True),
)

# NOTE: this rebinds the notebook-level `estimators` name to the two-learner list.
estimators = list(zip(("text", "nontext"), (pipe_text, pipe_nontext)))

# Same SVC final estimator as the four-learner stack, for a like-for-like comparison.
sc_svc_2 = StackingClassifier(
    estimators=estimators,
    final_estimator=SVC(class_weight='balanced', probability=True),
    n_jobs=-1,
)

In [None]:
from sklearn.metrics import make_scorer, f1_score

# Encode the string labels as integers: benign -> 0, malicious -> 1.
y_train_binary = y_train.map({'benign': 0, 'malicious': 1})

# F1 score computed with label 0 (benign) as the positive class.
scoring = make_scorer(f1_score, pos_label=0)

# Single XGBoost pipeline versus the two stacked architectures.
sc_models = {
    'XGBClassifier': xgb,
    'Stacked (2 meta-estimators)': sc_svc_2,
    'Stacked (4 meta-estimators)': sc_svc,
}

cv_results = {
    name: mean_std_cross_val_scores(model, X_train, y_train_binary, scoring)
    for name, model in sc_models.items()
}

In [None]:
# One column per architecture, one row per cross-validation metric.
pd.DataFrame(cv_results)

## Hyperparameter tuning

### Preprocessor

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score

# F1 score with the string label 'benign' as the positive class (the search is
# run against the un-encoded `y_train`).
scoring = make_scorer(f1_score, pos_label='benign')

param_grid = {'header__xgbclassifier__reg_alpha': [0, 0.001, 0.01, 0.1, 1.0]}

# The search itself is not re-run here; the code that produced the cached
# result is kept below for reference.
#
# rscv = RandomizedSearchCV(
#     sc_svc, param_grid, n_jobs=-1, cv=5, return_train_score=True,
#     scoring=scoring, verbose=3
# )
#
# rscv.fit(X_train, y_train)

# NOTE(review): this cell sits under the "Preprocessor" heading, yet both the
# grid above and the file name below refer to XGBoost regularisation - confirm
# that this is the intended pickle.
# NOTE: unpickling executes code stored in the file; only load pickles that
# were produced by this project.
model_pickle = '/data/workspace/danishki/git_repo/notebooks/milestone5/rscv-xgb-reg.pkl'
with open(model_pickle, mode='rb') as pickle_file:
    rscv = pickle.load(pickle_file)

In [None]:
# Tabulate the search results, best-ranked candidate first.
rscv_results = pd.DataFrame(rscv.cv_results_)
rscv_results = rscv_results.sort_values(by='rank_test_score')

rscv_results.head()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Horizontal box plots comparing the distribution of mean train and mean
# validation scores across all candidates of the search.
plt.figure(figsize=(8, 4))

data = [rscv_results['mean_train_score'], rscv_results['mean_test_score']]
labels = ['Train', 'Validation']

# `tick_labels` is the current name of the keyword that labels each box; the
# older `labels` spelling is deprecated in the Matplotlib releases that accept
# `orientation`.
bp = plt.boxplot(data, tick_labels=labels, patch_artist=True, orientation='horizontal')

plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.title('Distribution of Train vs Test F1 Scores', fontsize=14)
plt.xlabel('F1 Score')
plt.xlim(0.7, 1.0)

# The trailing semicolon suppresses the notebook's echo of the return value.
plt.tight_layout();

In [None]:
# CountVectorizer settings for the subject line. These values are passed as
# min_df / max_df / max_features when the final subject preprocessor is built.
# NOTE(review): presumably chosen from the preprocessor search - confirm.
subject_countvec_min_df = 0.001
subject_countvec_max_df = 0.950
subject_countvec_max_features = 500

# CountVectorizer settings for the body text, used the same way by the final
# body preprocessor.
body_countvec_min_df = 0.001
body_countvec_max_df = 0.99
body_countvec_max_features = 10000

### Meta-models

#### `max_depth`, `eta` (learning rate)

In [None]:
# F1 score with the string label 'benign' as the positive class.
scoring = make_scorer(f1_score, pos_label='benign')

# The same max_depth / eta candidates are offered to each of the four base
# learners of the stack.
param_grid = {
    f'{estimator_name}__xgbclassifier__{param_name}': candidates
    for estimator_name in ('header', 'subject', 'body', 'body_nontext')
    for param_name, candidates in (
        ('max_depth', [0, 3, 6]),
        ('eta', [0.1, 0.3, 0.6]),
    )
}

# Search definition, kept to document how the cached result was configured.
rscv = RandomizedSearchCV(
    sc_svc,
    param_grid,
    scoring=scoring,
    n_iter=100,
    cv=5,
    return_train_score=True,
    n_jobs=-1,
    verbose=3,
)

# The unfitted search above is replaced straight away by the fitted search
# loaded from disk.
# NOTE: unpickling executes code stored in the file; only load pickles that
# were produced by this project.
model_pickle = '/data/workspace/danishki/git_repo/notebooks/milestone5/rscv-xgb.pkl'
with open(model_pickle, mode='rb') as pickle_file:
    rscv = pickle.load(pickle_file)

In [None]:
# Tabulate the search results, best-ranked candidate first.
rscv_results = pd.DataFrame(rscv.cv_results_)
rscv_results = rscv_results.sort_values(by='rank_test_score')

rscv_results.head()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Horizontal box plots comparing the distribution of mean train and mean
# validation scores across all candidates of the search.
plt.figure(figsize=(8, 4))

data = [rscv_results['mean_train_score'], rscv_results['mean_test_score']]
labels = ['Train', 'Validation']

# `tick_labels` is the current name of the keyword that labels each box; the
# older `labels` spelling is deprecated in the Matplotlib releases that accept
# `orientation`.
bp = plt.boxplot(data, tick_labels=labels, patch_artist=True, orientation='horizontal')

plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.title('Distribution of Train vs Test F1 Scores', fontsize=14)
plt.xlabel('F1 Score')
plt.xlim(0.7, 1.0)

# The trailing semicolon suppresses the notebook's echo of the return value.
plt.tight_layout();

#### `alpha`, `lambda` (regularisation)

In [None]:
# F1 score with the string label 'benign' as the positive class.
scoring = make_scorer(f1_score, pos_label='benign')

# The same reg_alpha / reg_lambda candidates are offered to each of the four
# base learners of the stack.
param_grid = {
    f'{estimator_name}__xgbclassifier__{param_name}': candidates
    for estimator_name in ('header', 'subject', 'body', 'body_nontext')
    for param_name, candidates in (
        ('reg_alpha', [0, 0.001, 0.01, 0.1, 1.0]),
        ('reg_lambda', [0.01, 0.1, 1.0, 10.0]),
    )
}

# Search definition, kept to document how the cached result was configured.
rscv = RandomizedSearchCV(
    sc_svc,
    param_grid,
    scoring=scoring,
    n_iter=100,
    cv=5,
    return_train_score=True,
    n_jobs=-1,
    verbose=3,
)

# The unfitted search above is replaced straight away by the fitted search
# loaded from disk.
# NOTE: unpickling executes code stored in the file; only load pickles that
# were produced by this project.
model_pickle = '/data/workspace/danishki/git_repo/notebooks/milestone5/rscv-xgb-reg.pkl'
with open(model_pickle, mode='rb') as pickle_file:
    rscv = pickle.load(pickle_file)

# Tabulate the search results, best-ranked candidate first.
rscv_results = pd.DataFrame(rscv.cv_results_)
rscv_results = rscv_results.sort_values(by='rank_test_score')

rscv_results.head()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Horizontal box plots comparing the distribution of mean train and mean
# validation scores across all candidates of the search.
plt.figure(figsize=(8, 4))

data = [rscv_results['mean_train_score'], rscv_results['mean_test_score']]
labels = ['Train', 'Validation']

# `tick_labels` is the current name of the keyword that labels each box; the
# older `labels` spelling is deprecated in the Matplotlib releases that accept
# `orientation`.
bp = plt.boxplot(data, tick_labels=labels, patch_artist=True, orientation='horizontal')

plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.title('Distribution of Train vs Test F1 Scores', fontsize=14)
plt.xlabel('F1 Score')
plt.xlim(0.7, 1.0)

# The trailing semicolon suppresses the notebook's echo of the return value.
plt.tight_layout();

## Train full model

In [None]:
# Final preprocessors, one per base learner of the stacked model.

# Header non-text features: one-hot encode boolean/categorical columns and
# standard-scale the numeric ones.
preprocessor_header = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore', drop='if_binary'), header_bool_feats + header_cat_feats),
    (StandardScaler(), header_num_feats),
    remainder='drop',
)

# `stop_words` accepts either the string 'english' (scikit-learn's only
# built-in list) or an explicit list of the words to remove. A list such as
# ['english', 'french'] is read as the stop words themselves and would strip
# only those two literal tokens, so the built-in English list is requested in
# both vectorizers below.
# TODO: supply an explicit French word list (merged with the English one) if
# French stop words must be removed as well.

# Subject bag of words, restricted by the subject_countvec_* settings.
preprocessor_subject = make_column_transformer(
    (
        CountVectorizer(strip_accents='unicode', lowercase=True, stop_words='english',
                        min_df=subject_countvec_min_df, max_df=subject_countvec_max_df,
                        max_features=subject_countvec_max_features),
        header_text_feats
    ),
    remainder='drop'
)

# Body bag of words, restricted by the body_countvec_* settings.
preprocessor_body = make_column_transformer(
    (
        CountVectorizer(strip_accents='unicode', lowercase=True, stop_words='english',
                        min_df=body_countvec_min_df, max_df=body_countvec_max_df,
                        max_features=body_countvec_max_features),
        body_text_feats
    ),
    remainder='drop'
)

# Body non-text features: one-hot encode boolean/categorical columns and
# standard-scale the numeric ones.
preprocessor_body_nontext = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore', drop='if_binary'), body_bool_feats + body_cat_feats),
    (StandardScaler(), body_num_feats),
    remainder='drop'
)

In [None]:
# Rebuild the four base-learner pipelines on top of the final preprocessors.
# All share the same objective and use every core.
xgb_kwargs = {"n_jobs": -1, "objective": "binary:logistic"}

# Header non-text features.
pipe_header = make_pipeline(
    preprocessor_header,
    XGBClassifier(enable_categorical=True, **xgb_kwargs),
)

# Subject-line bag of words.
pipe_subject = make_pipeline(preprocessor_subject, XGBClassifier(**xgb_kwargs))

# Body bag of words.
pipe_body = make_pipeline(preprocessor_body, XGBClassifier(**xgb_kwargs))

# Body non-text features.
pipe_body_nontext = make_pipeline(
    preprocessor_body_nontext,
    XGBClassifier(enable_categorical=True, **xgb_kwargs),
)

In [None]:
# (name, pipeline) pairs for the four base learners of the final stack.
estimators = list(zip(
    ("header", "subject", "body", "body_nontext"),
    (pipe_header, pipe_subject, pipe_body, pipe_body_nontext),
))

# Final model: base learners stacked under a class-balanced SVC that exposes
# probability estimates.
sc_svc = StackingClassifier(
    estimators=estimators,
    final_estimator=SVC(class_weight='balanced', probability=True),
    n_jobs=-1,
)

# Fit on the training split using the original string labels.
sc_svc.fit(X_train, y_train)

### Train metrics

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

# Predictions on the data the model was fitted on (training metrics).
y_pred = sc_svc.predict(X_train)

train_report = classification_report(y_train, y_pred)
print(train_report)

In [None]:
# Confusion matrix of the training predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_train, y_pred=y_pred)

In [None]:
from sklearn.metrics import confusion_matrix

# Flatten the confusion matrix row by row into its four cells.
# NOTE(review): the unpacking assumes exactly two classes; with the labels
# sorted as 'benign' < 'malicious', 'malicious' would be the positive class -
# confirm against the data.
cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
tn, fp, fn, tp = cm.flatten()

# Each rate falls back to 0 when its denominator is empty.
fpr = 0 if (fp + tn) == 0 else fp / (fp + tn)
fnr = 0 if (fn + tp) == 0 else fn / (fn + tp)

print(f"False Positive Rate: {fpr:.4f}")
print(f"False Negative Rate: {fnr:.4f}")

### Validation metrics

In [None]:
# Predictions on the held-out split (validation metrics). This rebinds
# `y_pred`, which previously held the training predictions.
y_pred = sc_svc.predict(X_test)

validation_report = classification_report(y_test, y_pred)
print(validation_report)

In [None]:
# Confusion matrix of the held-out predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred)

In [None]:
# Flatten the confusion matrix row by row into its four cells.
# NOTE(review): the unpacking assumes exactly two classes; with the labels
# sorted as 'benign' < 'malicious', 'malicious' would be the positive class -
# confirm against the data.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
tn, fp, fn, tp = cm.flatten()

# Each rate falls back to 0 when its denominator is empty.
fpr = 0 if (fp + tn) == 0 else fp / (fp + tn)
fnr = 0 if (fn + tp) == 0 else fn / (fn + tp)

print(f"False Positive Rate: {fpr:.4f}")
print(f"False Negative Rate: {fnr:.4f}")

### Validation metrics (if `malicious` threshold is set at 0.5)

In [None]:
# Class probabilities on the held-out split; columns follow `sc_svc.classes_`.
y_pred_proba = sc_svc.predict_proba(X_test)

# Look the 'malicious' column up by name rather than relying on its position.
malicious_col = list(sc_svc.classes_).index('malicious')
y_pred_custom = y_pred_proba[:, malicious_col] > 0.5

# Map the boolean mask back to the string labels used in `y_test`
# (truthiness is used instead of an identity check against True).
y_pred_custom = ['malicious' if flagged else 'benign' for flagged in y_pred_custom.tolist()]
print(classification_report(y_test, y_pred_custom))

In [None]:
# Confusion matrix of the thresholded held-out predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred_custom)

In [None]:
# Flatten the confusion matrix row by row into its four cells.
# NOTE(review): the unpacking assumes exactly two classes; with the labels
# sorted as 'benign' < 'malicious', 'malicious' would be the positive class -
# confirm against the data.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_custom)
tn, fp, fn, tp = cm.flatten()

# Each rate falls back to 0 when its denominator is empty.
fpr = 0 if (fp + tn) == 0 else fp / (fp + tn)
fnr = 0 if (fn + tp) == 0 else fn / (fn + tp)

print(f"False Positive Rate: {fpr:.4f}")
print(f"False Negative Rate: {fnr:.4f}")

### Feature importances

In [None]:
import shap

# Fitted XGBoost model and column transformer of the 'header' base learner.
model = sc_svc.named_estimators_['header'].named_steps['xgbclassifier']
ct = sc_svc.named_estimators_['header'].named_steps['columntransformer']

explainer = shap.TreeExplainer(model)

# A ColumnTransformer mixing one-hot and scaled columns returns a sparse matrix
# or a dense array depending on how dense the stacked output is, so only
# densify when the result is actually sparse.
transformed = ct.transform(X_train)
if hasattr(transformed, 'toarray'):
    transformed = transformed.toarray()

observations = pd.DataFrame(
    transformed,
    columns=ct.get_feature_names_out()
)
shap_values = explainer.shap_values(observations)
shap.summary_plot(shap_values, observations, plot_type="bar")

In [None]:
# Fitted XGBoost model and column transformer of the 'subject' base learner.
subject_pipeline = sc_svc.named_estimators_['subject']
ct = subject_pipeline.named_steps['columntransformer']
model = subject_pipeline.named_steps['xgbclassifier']

# Densified subject token counts with the vocabulary as column names.
observations = pd.DataFrame(
    data=ct.transform(X_train).toarray(),
    columns=ct.get_feature_names_out(),
)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(observations)

# Only the ten most important tokens are displayed.
shap.summary_plot(shap_values, observations, plot_type="bar", max_display=10)

In [None]:
# Fitted XGBoost model and column transformer of the 'body' base learner.
body_pipeline = sc_svc.named_estimators_['body']
ct = body_pipeline.named_steps['columntransformer']
model = body_pipeline.named_steps['xgbclassifier']

# Densified body token counts with the vocabulary as column names.
observations = pd.DataFrame(
    data=ct.transform(X_train).toarray(),
    columns=ct.get_feature_names_out(),
)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(observations)

# Only the ten most important tokens are displayed.
shap.summary_plot(shap_values, observations, plot_type="bar", max_display=10)

In [None]:
# Fitted XGBoost model and column transformer of the 'body_nontext' base learner.
model = sc_svc.named_estimators_['body_nontext'].named_steps['xgbclassifier']
ct = sc_svc.named_estimators_['body_nontext'].named_steps['columntransformer']

explainer = shap.TreeExplainer(model)

# A ColumnTransformer mixing one-hot and scaled columns returns a sparse matrix
# or a dense array depending on how dense the stacked output is, so densify
# only when the result is actually sparse.
transformed = ct.transform(X_train)
if hasattr(transformed, 'toarray'):
    transformed = transformed.toarray()

observations = pd.DataFrame(
    transformed,
    columns=ct.get_feature_names_out()
)
shap_values = explainer.shap_values(observations)
shap.summary_plot(shap_values, observations, plot_type="bar")