In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC

import pickle
import os, sys

# Put the directory two levels above the current working directory on the
# module search path.
# NOTE(review): presumably this is so project-local packages can be imported,
# but no such import appears in the visible cells — confirm it is still needed.
sys.path.append(os.path.abspath("../.."))

## Read dataset

In [None]:
# Load the processed training split into its own frame.
train_df = pd.read_parquet(
    '/data/workspace/danishki/git_repo/data/full-dataset/processed/train.parquet'
).copy()

# `target_1` is the label used here; all three target columns are removed
# from the feature frame so none of them can leak into the model inputs.
y_train = train_df['target_1']
X_train = train_df.drop(['target_1', 'target_2', 'target_3'], axis='columns')

In [None]:
# Load the processed held-out split into its own frame.
test_df = pd.read_parquet(
    '/data/workspace/danishki/git_repo/data/full-dataset/processed/test.parquet'
).copy()

# Same label/feature separation as for the training split: `target_1` is the
# label and all three target columns are removed from the features.
y_test = test_df['target_1']
X_test = test_df.drop(['target_1', 'target_2', 'target_3'], axis='columns')

## Set up preprocessor

In [None]:
# ---------------------------------------------------------------------------
# Header-derived features
# ---------------------------------------------------------------------------

# Free-text column. Deliberately a plain string rather than a list: it is used
# as the column selector for a CountVectorizer, and a scalar selector makes the
# column transformer hand over 1-D text instead of a 2-D frame.
header_text_feats = 'subject'

# Flag-style columns (one-hot encoded together with the categorical ones).
header_bool_feats = [
    'url_present_in_subject',
    'dmarc_authentication_present',
    'dkim_sender_domains_match',
    'to_from_addresses_match',
    'sender_email_spf_match',
    'different_reply_domains',
    'name_server_match',
]

# Categorical columns.
header_cat_feats = ['dkim_result', 'spf_result', 'dmarc_result']

# Numeric columns (standardised downstream).
header_num_feats = ['routing_length_before_ubc', 'internal_server_transfer_count']

# ---------------------------------------------------------------------------
# Body-derived features
# ---------------------------------------------------------------------------

# Free-text column; a plain string for the same reason as the subject column.
body_text_feats = 'text_clean'

# Flag-style columns.
body_bool_feats = ['non_ascii_present', 'hidden_text_present', 'empty_body']

# Categorical columns.
body_cat_feats = ['html_parsing_error']

# Numeric columns (standardised downstream).
body_num_feats = [
    'word_count',
    'readable_proportion',
    'whitespace_ratio',
    'alphabet_proportion',
    'grammar_error_rate',
    'english_french_proportion',
    'text_content_count',
    'multimedia_content_count',
    'others_content_count',
    'hyperlink_proportion',
]

In [None]:
# Structured header features: one-hot encode the flag and categorical columns
# (binary columns collapse to a single indicator, unseen categories encode as
# all zeros) and standardise the numeric columns. Everything else is dropped.
preprocessor_header = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore', drop='if_binary'), header_bool_feats + header_cat_feats),
    (StandardScaler(), header_num_feats),
    remainder='drop',
)

# Bag-of-words over the subject line.
# `stop_words` accepts either the string 'english' (the only built-in list) or
# an explicit list of words to remove. A list such as ['english', 'french'] is
# NOT interpreted as language names: it would only remove the literal tokens
# "english" and "french", so the built-in English list is requested instead.
# NOTE(review): scikit-learn ships no French stop-word list; if French
# filtering is required, pass an explicit list of English + French words here.
preprocessor_subject = make_column_transformer(
    (CountVectorizer(strip_accents='unicode', lowercase=True, stop_words='english'), header_text_feats),
    remainder='drop',
)

# Bag-of-words over the cleaned body text; same vectoriser settings (and the
# same stop-word caveat) as for the subject line.
preprocessor_body = make_column_transformer(
    (CountVectorizer(strip_accents='unicode', lowercase=True, stop_words='english'), body_text_feats),
    remainder='drop',
)

# Structured body features: same encoding/scaling scheme as the header ones.
preprocessor_body_nontext = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore', drop='if_binary'), body_bool_feats + body_cat_feats),
    (StandardScaler(), body_num_feats),
    remainder='drop',
)

## Set up pipelines

In [None]:
# One pipeline per feature view: the view's column transformer followed by an
# XGBoost classifier with a logistic objective, using all available cores.
# NOTE(review): `enable_categorical=True` likely has no effect on the two
# structured pipelines, since their preprocessors already one-hot encode the
# categorical columns before the booster sees them — confirm before relying on it.

# Structured header features.
pipe_header = make_pipeline(
    preprocessor_header,
    XGBClassifier(
        objective="binary:logistic",
        enable_categorical=True,
        n_jobs=-1,
    ),
)

# Subject-line bag-of-words.
pipe_subject = make_pipeline(
    preprocessor_subject,
    XGBClassifier(
        objective="binary:logistic",
        n_jobs=-1,
    ),
)

# Body-text bag-of-words.
pipe_body = make_pipeline(
    preprocessor_body,
    XGBClassifier(
        objective="binary:logistic",
        n_jobs=-1,
    ),
)

# Structured body features.
pipe_body_nontext = make_pipeline(
    preprocessor_body_nontext,
    XGBClassifier(
        objective="binary:logistic",
        enable_categorical=True,
        n_jobs=-1,
    ),
)

## Set up `StackingClassifier`

In [None]:
# Base learners of the stack: one (name, pipeline) pair per feature view.
estimators = [
    ("header", pipe_header),
    ("subject", pipe_subject),
    ("body", pipe_body),
    ("body_nontext", pipe_body_nontext),
]

# Stacked model: the base pipelines feed an SVC meta-learner.
sc_svc = StackingClassifier(
    estimators=estimators,
    final_estimator=SVC(
        # Needed so the stacked model exposes predict_proba.
        probability=True,
        # Re-weight classes inversely to their frequency.
        class_weight='balanced',
        # probability=True fits its probability calibration on internally
        # shuffled cross-validation folds; without a fixed seed the predicted
        # probabilities differ from run to run.
        random_state=42,
    ),
    n_jobs=-1,
)

In [None]:
# Fit the whole stack (the four base pipelines plus the SVC meta-learner) on
# the training split.
sc_svc.fit(X_train, y_train)

## Classification report (`train`)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

# In-sample predictions: scored on the same rows the model was fitted on, so
# these numbers are an optimistic reference point rather than an estimate of
# generalisation.
y_pred = sc_svc.predict(X_train)
print(classification_report(y_true=y_train, y_pred=y_pred))

In [None]:
# Confusion matrix plot for the training-split predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_train, y_pred=y_pred)

In [None]:
from sklearn.metrics import confusion_matrix

# For a two-class problem the flattened matrix unpacks as (tn, fp, fn, tp);
# the class that sorts last is the one treated as "positive".
cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
tn, fp, fn, tp = cm.ravel()

# False positive rate: share of actual negatives predicted positive.
# Falls back to 0 when the denominator is zero.
if (fp + tn) > 0:
    fpr = fp / (fp + tn)
else:
    fpr = 0

# False negative rate: share of actual positives predicted negative.
# Falls back to 0 when the denominator is zero.
if (fn + tp) > 0:
    fnr = fn / (fn + tp)
else:
    fnr = 0

print(f"False Positive Rate: {fpr:.4f}")
print(f"False Negative Rate: {fnr:.4f}")

## Classification report (`test`)

In [None]:
# Held-out predictions; note that `y_pred` now refers to the test split and
# replaces the training-split predictions stored under the same name.
y_pred = sc_svc.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Confusion matrix plot for the test-split predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred)

In [None]:
# For a two-class problem the flattened matrix unpacks as (tn, fp, fn, tp);
# the class that sorts last is the one treated as "positive".
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
tn, fp, fn, tp = cm.ravel()

# False positive rate: share of actual negatives predicted positive.
# Falls back to 0 when the denominator is zero.
if (fp + tn) > 0:
    fpr = fp / (fp + tn)
else:
    fpr = 0

# False negative rate: share of actual positives predicted negative.
# Falls back to 0 when the denominator is zero.
if (fn + tp) > 0:
    fnr = fn / (fn + tp)
else:
    fnr = 0

print(f"False Positive Rate: {fpr:.4f}")
print(f"False Negative Rate: {fnr:.4f}")

## Adjusted probability threshold

In [None]:
# Decision threshold on the predicted probability of the 'malicious' class.
# An email is labelled 'malicious' only when its probability is strictly above
# this value; tune it here to trade false positives against false negatives.
THRESHOLD = 0.5

y_pred_proba = sc_svc.predict_proba(X_test)

# predict_proba columns follow the order of `classes_`, so look the column up
# by label instead of assuming 'malicious' sits at index 1. This raises a
# ValueError if the fitted model has no 'malicious' class, rather than silently
# thresholding the wrong column.
malicious_col = list(sc_svc.classes_).index('malicious')

y_pred_custom = [
    'malicious' if proba > THRESHOLD else 'benign'
    for proba in y_pred_proba[:, malicious_col]
]
print(classification_report(y_test, y_pred_custom))

In [None]:
# Confusion matrix plot for the thresholded test-split predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred_custom)

In [None]:
# For a two-class problem the flattened matrix unpacks as (tn, fp, fn, tp);
# the class that sorts last is the one treated as "positive".
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_custom)
tn, fp, fn, tp = cm.ravel()

# False positive rate: share of actual negatives predicted positive.
# Falls back to 0 when the denominator is zero.
if (fp + tn) > 0:
    fpr = fp / (fp + tn)
else:
    fpr = 0

# False negative rate: share of actual positives predicted negative.
# Falls back to 0 when the denominator is zero.
if (fn + tp) > 0:
    fnr = fn / (fn + tp)
else:
    fnr = 0

print(f"False Positive Rate: {fpr:.4f}")
print(f"False Negative Rate: {fnr:.4f}")