In [None]:
import os
import sys
import pickle
import warnings

import numpy as np
import pandas as pd

from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
)
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
    FunctionTransformer,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, f1_score, classification_report

from xgboost import XGBClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    VotingClassifier,
    StackingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter

from scipy.stats import expon, lognorm, loguniform, randint, uniform, norm

# Custom feature extraction modules
username = os.environ.get('USER')
sys.path.append(f'/data/workspace/{username}')
sys.path.append(os.path.join(os.path.abspath("../../"), "src"))
from extract_header_features import *
from extract_text_features import *
from extract_url_features import *
from extract_text_keywords import *

# Hide warnings
warnings.filterwarnings('ignore')


In [None]:
# Code adapted from DSCI571: Lecture 4 
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Run cross-validation and summarise every returned metric as "mean (+/- std)".

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        Forwarded unchanged to ``cross_validate`` (e.g. ``cv``, ``scoring``,
        ``return_train_score``).

    Returns
    ----------
        pandas Series of strings, one per key returned by ``cross_validate``
        (fit/score times and each score), formatted as
        ``"<mean> (+/- <std>)"`` with three decimals. The std is the pandas
        sample standard deviation across folds.
    """

    # Build the per-fold table once and derive both statistics from it.
    fold_scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))
    mean_scores = fold_scores.mean()
    std_scores = fold_scores.std()

    # mean() and std() share the same index, so pairing them positionally is safe.
    formatted = [
        "%0.3f (+/- %0.3f)" % (mean_value, std_value)
        for mean_value, std_value in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=formatted, index=mean_scores.index)

In [None]:
# Load the raw training split of the full dataset.
# Swap in the commented-out line below to work with the smaller sampled dataset instead.
original_df = pd.read_parquet('/data/workspace/dataset/full-dataset/raw/train.parquet')
# original_df = pd.read_parquet('/data/workspace/dataset/sampled-dataset/raw/sample-large.parquet')
original_df.head()

In [None]:
# Load the processed training split of the full dataset.
# Swap in the commented-out line below to work with the smaller sampled dataset instead.
input_df = pd.read_parquet('/data/workspace/dataset/full-dataset/processed/train.parquet')
# input_df = pd.read_parquet('/data/workspace/dataset/sampled-dataset/processed/sample-large.parquet')
# url_count is derived here from the raw `urls` column; get_url_count presumably comes
# from one of the star-imported extract_* modules — TODO confirm which one.
# NOTE(review): this assumes original_df and input_df describe the same rows in the same order/index — confirm.
input_df['url_count'] = get_url_count(original_df.urls)
input_df.head()

In [None]:
# Join the raw columns (original_df) with the processed feature columns (input_df)
combined_df = original_df.join(input_df)

# Disabled experiment: relabel target_1 from benign to malicious when target_2 is spam
# combined_df['target_1'] = np.where(combined_df['target_2'] == 'spam', 'malicious', combined_df['target_1'])
combined_df

In [None]:
# Filter out self-phishing emails (rows whose level-3 target is 'self_phishing').
# .copy() detaches the result from combined_df.
df_without_sp = combined_df[combined_df['target_3'] != 'self_phishing'].copy()
df_without_sp.head()

# Train test split

In [None]:
# Hold out 30% of the filtered emails; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(df_without_sp, test_size=0.3, random_state=42)

# Work on explicit copies: the frames returned by the split are selections of
# df_without_sp, and assigning columns onto them can raise pandas'
# chained-assignment warning (silenced globally by the warnings filter above).
train_df = train_df.copy()
test_df = test_df.copy()

# Columns that hold a list/array of strings per row, and free-text columns that may be missing.
list_cols = ["Content_types", "attachment_types", "urls"]
nullable_text_cols = ["Subject", "text_preprocessed"]

# Apply the same cleaning to both splits so they cannot drift apart.
for split_df in (train_df, test_df):
    for text_col in nullable_text_cols:
        # CountVectorizer cannot handle missing values, so use empty strings.
        split_df[text_col] = split_df[text_col].fillna("")
    for col in list_cols:
        # Flatten list-like cells into one space-separated string for CountVectorizer;
        # anything else (including missing values) is stringified as-is.
        split_df[col] = split_df[col].apply(lambda x: " ".join(x) if isinstance(x, (list, np.ndarray)) else str(x))

# Level-1 target is the label; every other column (including target_2/target_3) stays in X
# and is dropped or ignored later by the column transformers.
X_train = train_df.drop(columns=['target_1'])
y_train = train_df['target_1']

X_test = test_df.drop(columns=['target_1'])
y_test = test_df['target_1']

# Encode the string labels as integers; the encoder is fitted on the training labels only.
label_encoder = LabelEncoder()
y_train_num = label_encoder.fit_transform(y_train)
y_test_num = label_encoder.transform(y_test)

# Preparing Preprocessor

## First approach:

In [None]:
# Continuous / count features (standard-scaled by the first-approach preprocessor).
numeric_feats = [
    "routing_length",
    "html_parsing_error",
    "word_count",
    "readable_proportion",
    "whitespace_ratio",
    "alphabet_proportion",
    "grammar_error_rate",
    "english_french_proportion",
    "url_count",
]

# Flag-like features (one-hot encoded).
binary_feats = [
    "is_multipart",
    "dmarc_authentication_present",
    "dkim_sender_domains_match",
    "attachments_present",
    "to_from_addresses_match",
    "sender_email_spf_match",
    "non_ascii_present",
    "hidden_text_present",
    "ip_addr_urls",
    "http_urls_present",
    "url_at_symbol",
    "url_port_number",
    "any_long_urls",
    "url_multiple_subdomains",
]

# Text features; each is vectorized separately. The order matters because
# later cells pick columns from this list by position.
text_feats = [
    "Content_types",
    "urls",
    "attachment_types",
    "Subject",
    "text_preprocessed",
]

# Categorical features (one-hot encoded).
categorical_feats = [
    "From_name",
    "From_email",
    "From_email_domain",
    "To_name",
    "To_email",
    "To_email_domain",
    "dkim_result",
    "spf_result",
    "dmarc_result",
    "Content-Language",
]

# Columns that are never fed to a model.
drop_feats = [
    "From",                    # Info extracted to From_name, From_email, From_email_domain
    "To",                      # Info extracted to To_name, To_email, To_email_domain
    "Received",                # Info extracted to routing_length
    "Authentication-Results",  # Info extracted to dmarc_authentication_present, dkim_result, spf_result, dmarc_result
    "received-spf",            # Info extracted to spf_result, sender_email_spf_match
    "DKIM-Signature",          # Info extracted to dkim_sender_domains_match
    "Reply-To",                # Mostly missing, not useful
    "Return-Path",             # Mostly missing, not useful
    "text_plain",
    "text_clean",
    "text_html",
    "target_2",                # Level 2 target variable
    "target_3",                # Level 3 target variable
]

## Second approach (after feature selection):

In [None]:
# Reduced feature sets used by fs_preprocessor (the "after feature selection" approach).
fs_numeric_feats = [
    "html_parsing_error",
    "whitespace_ratio",
    "grammar_error_rate",
    "english_french_proportion",
]

fs_binary_feats = [
    "is_multipart",
    "sender_email_spf_match",
]

fs_categorical_feats = [
    "From_email",
    "From_email_domain",
    "To_name",
    "To_email",
    "To_email_domain",
    "spf_result",
    "Content-Language",
]

In [None]:
# One single-step pipeline per feature family; these are reused by the later preprocessors.
numeric_transformer = make_pipeline(StandardScaler())
binary_transformer = make_pipeline(
    OneHotEncoder(handle_unknown='ignore', drop='if_binary')
)
categorical_transformer = make_pipeline(
    OneHotEncoder(handle_unknown='ignore', drop='if_binary')
)

# Full preprocessor: scaled numerics, one-hot binaries/categoricals and one
# independent bag-of-words vectorizer per text column (in text_feats order:
# Content_types, urls, attachment_types, Subject, text_preprocessed).
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_feats),
    (binary_transformer, binary_feats),
    (categorical_transformer, categorical_feats),
    *[(CountVectorizer(), text_col) for text_col in text_feats],
    ("drop", drop_feats + text_feats + categorical_feats + binary_feats),
)

# Single-family preprocessors: each keeps one feature family and lists the
# other families under "drop".
preprocessor_numeric = make_column_transformer(
    (numeric_transformer, numeric_feats),
    ("drop", drop_feats + text_feats + categorical_feats + binary_feats),
)

preprocessor_binary = make_column_transformer(
    (binary_transformer, binary_feats),
    ("drop", drop_feats + text_feats + categorical_feats + numeric_feats),
)

preprocessor_categorical = make_column_transformer(
    (categorical_transformer, categorical_feats),
    ("drop", drop_feats + text_feats + numeric_feats + binary_feats),
)

In [None]:
# Preprocessor for the reduced ("fs_") feature sets: reuses the shared numeric,
# binary and categorical transformers and drops every column not listed.
# No text columns are vectorized in this variant.
fs_preprocessor = make_column_transformer(
    (numeric_transformer, fs_numeric_feats),
    (binary_transformer, fs_binary_feats),
    (categorical_transformer, fs_categorical_feats),
    remainder='drop'
)

## Third approach:

### Header Preprocessor

In [None]:
# Feature groups derived from the email header.
header_numeric_feats = ["routing_length"]

header_binary_feats = [
    "is_multipart",
    "dmarc_authentication_present",
    "dkim_sender_domains_match",
    "attachments_present",
    "to_from_addresses_match",
    "sender_email_spf_match",
]

header_categorical_feats = [
    "From_name",
    "From_email",
    "From_email_domain",
    "To_name",
    "To_email",
    "To_email_domain",
    "dkim_result",
    "spf_result",
    "dmarc_result",
    "Content-Language",
]

header_text_feats = ["Subject"]

# Raw header/body columns that are not modelled directly. This list is not passed
# to preprocessor_header below, which relies on remainder='drop' instead.
header_drop_feats = [
    "From",                    # Info extracted to From_name, From_email, From_email_domain
    "To",                      # Info extracted to To_name, To_email, To_email_domain
    "Received",                # Info extracted to routing_length
    "Authentication-Results",  # Info extracted to dmarc_authentication_present, dkim_result, spf_result, dmarc_result
    "received-spf",            # Info extracted to spf_result, sender_email_spf_match
    "DKIM-Signature",          # Info extracted to dkim_sender_domains_match
    "Reply-To",                # Mostly missing, not useful
    "Return-Path",             # Mostly missing, not useful
    "text_plain",
    "text_clean",
    "text_html",
]

# Bag-of-words over the Subject line, wrapped in a pipeline.
subject_vectorizer = make_pipeline(CountVectorizer())

# Header preprocessor: routing_length is passed through unscaled, binaries and
# categoricals are one-hot encoded, Subject is vectorized; everything else is dropped.
preprocessor_header = make_column_transformer(
    ("passthrough", header_numeric_feats),
    (binary_transformer, header_binary_feats),
    (categorical_transformer, header_categorical_feats),
    (subject_vectorizer, header_text_feats[0]),  # Subject
    remainder='drop',
)

### Header without Subject preprocessor

In [None]:
# Same as preprocessor_header but without the Subject bag-of-words, so the
# subject text can be modelled by a separate estimator.
preprocessor_header_without_subject = make_column_transformer(
    ("passthrough", header_numeric_feats),
    (binary_transformer, header_binary_feats),
    (categorical_transformer, header_categorical_feats),
    remainder='drop'
)

### Body Preprocessor

In [None]:
# Feature groups derived from the email body.
body_numeric_feats = [
    "word_count",
    "readable_proportion",
    "whitespace_ratio",
    "alphabet_proportion",
    "grammar_error_rate",
    "english_french_proportion",
    "url_count",
]

body_binary_feats = [
    "non_ascii_present",
    "hidden_text_present",
    "all_urls_accessible",
    "urls_redirected",
    "ip_addr_urls",
    "http_urls_present",
    "url_at_symbol",
    "url_port_number",
    "any_long_urls",
    "url_multiple_subdomains",
]

# Note: html_parsing_error is one-hot encoded here, whereas the first-approach
# preprocessor treats it as numeric.
body_categorical_feats = ["html_parsing_error"]

# Order matters: the preprocessors pick these columns by position.
body_text_feats = [
    "Content_types",
    "attachment_types",
    "text_preprocessed",
    "urls",
]

# One independent bag-of-words pipeline per text column.
content_types_vectorizer = make_pipeline(CountVectorizer())
attachment_types_vectorizer = make_pipeline(CountVectorizer())
text_preprocessed_vectorizer = make_pipeline(CountVectorizer())
urls_vectorizer = make_pipeline(CountVectorizer())

# Body preprocessor: scaled numerics, one-hot binaries/categoricals and the
# four text vectorizers; every other column is dropped.
preprocessor_body = make_column_transformer(
    (numeric_transformer, body_numeric_feats),
    (binary_transformer, body_binary_feats),
    (categorical_transformer, body_categorical_feats),
    (content_types_vectorizer, body_text_feats[0]),      # content_types
    (attachment_types_vectorizer, body_text_feats[1]),   # attachment_types
    (text_preprocessed_vectorizer, body_text_feats[2]),  # text_preprocessed
    (urls_vectorizer, body_text_feats[3]),               # urls
    remainder='drop',
)

### Body without text_preprocessed preprocessor

In [None]:
# Body preprocessor without the text_preprocessed bag-of-words
# (body_text_feats[2] is deliberately skipped), so the main body text can be
# modelled by a separate estimator.
preprocessor_body_without_text_pp = make_column_transformer(
        (numeric_transformer, body_numeric_feats),
        (binary_transformer, body_binary_feats),
        (categorical_transformer, body_categorical_feats),
        (CountVectorizer(), body_text_feats[0]), # content_types
        (CountVectorizer(), body_text_feats[1]), # attachment_types
        (CountVectorizer(), body_text_feats[3]), # urls
        remainder='drop'
)

### Text Preprocessor

In [None]:
# Text-only preprocessor: separate bag-of-words vocabularies for the subject
# line and the preprocessed body text; every other column is dropped.
preprocessor_text = make_column_transformer(
    (CountVectorizer(), text_feats[3]), # Subject
    (CountVectorizer(), text_feats[4]), # Text Preprocessed
    remainder='drop'
)

# Oversampling

## SMOTE

In [None]:
# Fit the full preprocessor on the training split only, then apply it to the test split.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Oversample the training data only. The seed is fixed so that the synthetic
# samples, and every model trained on them, are reproducible across runs.
X_train_smote, y_train_smote = SMOTE(random_state=123).fit_resample(X_train_transformed, y_train_num)

In [None]:
# Same as above, for the reduced (feature-selected) preprocessor.
X_train_fs_transformed = fs_preprocessor.fit_transform(X_train)
X_test_fs_transformed = fs_preprocessor.transform(X_test)

# Seeded so the resampled training set is reproducible across runs.
X_train_fs_smote, y_train_fs_smote = SMOTE(random_state=123).fit_resample(X_train_fs_transformed, y_train_num)

## ADASYN

In [None]:
# ADASYN oversampling of the training data for both feature sets; seeded so the
# synthetic samples are reproducible across runs.
X_train_adasyn, y_train_adasyn = ADASYN(random_state=123).fit_resample(X_train_transformed, y_train_num)
X_train_fs_adasyn, y_train_fs_adasyn = ADASYN(random_state=123).fit_resample(X_train_fs_transformed, y_train_num)

# Modelling

## Dictionaries setup:

In [None]:
# Train evaluation dictionaries.
# All of these are keyed by model name and filled by evaluate_and_store_results.
trained_models = {}
train_predictions = {}
train_classification_report_dict = {}
train_confusion_matrices = {}
train_fpr_dict = {}
train_f1_benign_dict = {}
train_f1_malicious_dict = {}

# Validation evaluation dictionaries (computed on the held-out X_test / y_test split)
test_predictions = {}
test_classification_report_dict = {}
test_confusion_matrices = {}
test_fpr_dict = {}
test_f1_benign_dict = {}
test_f1_malicious_dict = {}

# CV results
# `scoring` is the metric list passed to cross_validate; results_dict maps a
# model label to its formatted cross-validation summary.
scoring = ['accuracy', 'f1', 'precision', 'recall']
# NOTE(review): results_df is not used in the visible part of the notebook — confirm it is still needed.
results_df = None
results_dict = {}

**Evaluation function:**

In [None]:
def evaluate_and_store_results(model_name, model, X_train, y_train, X_test, y_test, label_encoder):
    """
    Evaluate a fitted binary classifier on the train and test data and record the results.

    The results are written into the module-level dictionaries (``trained_models``,
    ``train_*`` and ``test_*``) under ``model_name``; calling this again with the
    same name overwrites the previous entry. Nothing is returned.

    Parameters
    ----------
    model_name : str
        Key under which every result is stored.
    model :
        Already-fitted estimator exposing ``predict``.
    X_train, y_train :
        Data the model is evaluated on for the "train" metrics (for the
        oversampled models this is the resampled training set).
    X_test, y_test :
        Held-out data for the "test" metrics.
    label_encoder : LabelEncoder
        Fitted encoder; its ``classes_`` provide the class names used in the
        classification reports.
    """
    # Training evaluation
    y_pred = model.predict(X_train)
    trained_models[model_name] = model
    train_predictions[model_name] = y_pred
    _store_split_metrics(
        model_name, y_train, y_pred, label_encoder,
        train_classification_report_dict,
        train_f1_benign_dict,
        train_f1_malicious_dict,
        train_confusion_matrices,
        train_fpr_dict,
    )

    # Test evaluation
    y_test_pred = model.predict(X_test)
    test_predictions[model_name] = y_test_pred
    _store_split_metrics(
        model_name, y_test, y_test_pred, label_encoder,
        test_classification_report_dict,
        test_f1_benign_dict,
        test_f1_malicious_dict,
        test_confusion_matrices,
        test_fpr_dict,
    )


def _store_split_metrics(model_name, y_true, y_pred, label_encoder,
                         report_dict, f1_benign_dict, f1_malicious_dict,
                         confusion_dict, fpr_dict):
    """Compute the metrics for one data split and store them under ``model_name``."""
    report_dict[model_name] = classification_report(
        y_true, y_pred, target_names=label_encoder.classes_, output_dict=True
    )
    # Per-class F1: label 0 is stored as "benign", label 1 as "malicious".
    f1_benign_dict[model_name] = f1_score(y_true, y_pred, pos_label=0)
    f1_malicious_dict[model_name] = f1_score(y_true, y_pred, pos_label=1)

    cm = confusion_matrix(y_true, y_pred)
    confusion_dict[model_name] = cm
    # For a binary (2x2) confusion matrix, ravel() yields TN, FP, FN, TP.
    # The unpacking fails if only one class is present in y_true and y_pred.
    tn, fp, _fn, _tp = cm.ravel()
    # False positive rate: share of label-0 samples predicted as label 1.
    fpr_dict[model_name] = fp / (fp + tn)

## 1. Decision Tree

In [None]:
model_name = "DecisionTreeClassifier"

# Decision tree on the full feature set, with balanced class weights.
model = DecisionTreeClassifier(class_weight='balanced', random_state=123).fit(
    X_train_transformed, y_train_num
)

evaluate_and_store_results(
    model_name, model,
    X_train_transformed, y_train_num,
    X_test_transformed, y_test_num,
    label_encoder,
)

In [None]:
model_name = "DecisionTreeClassifier_fs"

# Decision tree on the reduced (feature-selected) feature set.
model = DecisionTreeClassifier(class_weight='balanced', random_state=123).fit(
    X_train_fs_transformed, y_train_num
)

evaluate_and_store_results(
    model_name, model,
    X_train_fs_transformed, y_train_num,
    X_test_fs_transformed, y_test_num,
    label_encoder,
)

In [None]:
model_name = "DecisionTreeClassifier_SMOTE"

# Decision tree trained on the SMOTE-resampled full feature set;
# the test data is left un-resampled.
model = DecisionTreeClassifier(class_weight='balanced', random_state=123).fit(
    X_train_smote, y_train_smote
)

evaluate_and_store_results(
    model_name, model,
    X_train_smote, y_train_smote,
    X_test_transformed, y_test_num,
    label_encoder,
)

In [None]:
model_name = "DecisionTreeClassifier_SMOTE_fs"

# Decision tree trained on the SMOTE-resampled reduced feature set.
model = DecisionTreeClassifier(class_weight='balanced', random_state=123).fit(
    X_train_fs_smote, y_train_fs_smote
)

evaluate_and_store_results(
    model_name, model,
    X_train_fs_smote, y_train_fs_smote,
    X_test_fs_transformed, y_test_num,
    label_encoder,
)

In [None]:
model_name = "DecisionTreeClassifier_ADASYN"

# Decision tree trained on the ADASYN-resampled full feature set.
model = DecisionTreeClassifier(class_weight='balanced', random_state=123).fit(
    X_train_adasyn, y_train_adasyn
)

evaluate_and_store_results(
    model_name, model,
    X_train_adasyn, y_train_adasyn,
    X_test_transformed, y_test_num,
    label_encoder,
)

In [None]:
model_name = "DecisionTreeClassifier_ADASYN_fs"

# Decision tree trained on the ADASYN-resampled reduced feature set.
model = DecisionTreeClassifier(class_weight='balanced', random_state=123).fit(
    X_train_fs_adasyn, y_train_fs_adasyn
)

evaluate_and_store_results(
    model_name, model,
    X_train_fs_adasyn, y_train_fs_adasyn,
    X_test_fs_transformed, y_test_num,
    label_encoder,
)

## 2. Random Forest

In [None]:
model_name = "RandomForestClassifier"

# Random forest on the full feature set, with balanced class weights.
model = RandomForestClassifier(class_weight='balanced', random_state=123, n_jobs=-1).fit(
    X_train_transformed, y_train_num
)

evaluate_and_store_results(
    model_name, model,
    X_train_transformed, y_train_num,
    X_test_transformed, y_test_num,
    label_encoder,
)

In [None]:
model_name = "RandomForestClassifier_fs"

# Random forest on the reduced (feature-selected) feature set.
model = RandomForestClassifier(class_weight='balanced', random_state=123, n_jobs=-1).fit(
    X_train_fs_transformed, y_train_num
)

evaluate_and_store_results(
    model_name, model,
    X_train_fs_transformed, y_train_num,
    X_test_fs_transformed, y_test_num,
    label_encoder,
)

In [None]:
model_name = "RandomForestClassifier_SMOTE"

# Random forest trained on the SMOTE-resampled full feature set.
model = RandomForestClassifier(class_weight='balanced', random_state=123, n_jobs=-1).fit(
    X_train_smote, y_train_smote
)

evaluate_and_store_results(
    model_name, model,
    X_train_smote, y_train_smote,
    X_test_transformed, y_test_num,
    label_encoder,
)

In [None]:
model_name = "RandomForestClassifier_SMOTE_fs"

# Random forest trained on the SMOTE-resampled reduced feature set.
model = RandomForestClassifier(class_weight='balanced', random_state=123, n_jobs=-1).fit(
    X_train_fs_smote, y_train_fs_smote
)

evaluate_and_store_results(
    model_name, model,
    X_train_fs_smote, y_train_fs_smote,
    X_test_fs_transformed, y_test_num,
    label_encoder,
)

In [None]:
model_name = "RandomForestClassifier_ADASYN"

# Random forest trained on the ADASYN-resampled full feature set.
model = RandomForestClassifier(class_weight='balanced', random_state=123, n_jobs=-1).fit(
    X_train_adasyn, y_train_adasyn
)

evaluate_and_store_results(
    model_name, model,
    X_train_adasyn, y_train_adasyn,
    X_test_transformed, y_test_num,
    label_encoder,
)

In [None]:
model_name = "RandomForestClassifier_ADASYN_fs"

# Random forest trained on the ADASYN-resampled reduced feature set.
model = RandomForestClassifier(class_weight='balanced', random_state=123, n_jobs=-1).fit(
    X_train_fs_adasyn, y_train_fs_adasyn
)

evaluate_and_store_results(
    model_name, model,
    X_train_fs_adasyn, y_train_fs_adasyn,
    X_test_fs_transformed, y_test_num,
    label_encoder,
)

## 3. XGBoost

In [None]:
# Per-sample weights that balance the classes of the training labels.
# The XGBoost models fitted below on the original (non-resampled) training data
# receive these through `sample_weight`; the SMOTE/ADASYN variants are fitted without them.
from sklearn.utils import class_weight
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=train_df['target_1']
)

In [None]:
model_name = "XGBClassifier"

# XGBoost on the full feature set, with balanced per-sample weights.
model = XGBClassifier(n_jobs=-1, eval_metric="error", objective="binary:logistic").fit(
    X_train_transformed, y_train_num, sample_weight=classes_weights
)

evaluate_and_store_results(
    model_name, model,
    X_train_transformed, y_train_num,
    X_test_transformed, y_test_num,
    label_encoder,
)

In [None]:
model_name = "XGBClassifier_fs"

# XGBoost on the reduced feature set, with balanced per-sample weights.
model = XGBClassifier(n_jobs=-1, eval_metric="error", objective="binary:logistic").fit(
    X_train_fs_transformed, y_train_num, sample_weight=classes_weights
)

evaluate_and_store_results(
    model_name, model,
    X_train_fs_transformed, y_train_num,
    X_test_fs_transformed, y_test_num,
    label_encoder,
)

In [None]:
model_name = "XGBClassifier_SMOTE"

# XGBoost trained on the SMOTE-resampled full feature set (no sample weights).
model = XGBClassifier(n_jobs=-1, eval_metric="error", objective="binary:logistic").fit(
    X_train_smote, y_train_smote
)

evaluate_and_store_results(
    model_name, model,
    X_train_smote, y_train_smote,
    X_test_transformed, y_test_num,
    label_encoder,
)

In [None]:
model_name = "XGBClassifier_SMOTE_fs"

# XGBoost trained on the SMOTE-resampled reduced feature set (no sample weights).
model = XGBClassifier(n_jobs=-1, eval_metric="error", objective="binary:logistic").fit(
    X_train_fs_smote, y_train_fs_smote
)

evaluate_and_store_results(
    model_name, model,
    X_train_fs_smote, y_train_fs_smote,
    X_test_fs_transformed, y_test_num,
    label_encoder,
)

In [None]:
model_name = "XGBClassifier_ADASYN"

# XGBoost trained on the ADASYN-resampled full feature set (no sample weights).
model = XGBClassifier(n_jobs=-1, eval_metric="error", objective="binary:logistic").fit(
    X_train_adasyn, y_train_adasyn
)

evaluate_and_store_results(
    model_name, model,
    X_train_adasyn, y_train_adasyn,
    X_test_transformed, y_test_num,
    label_encoder,
)

In [None]:
model_name = "XGBClassifier_ADASYN_fs"

# XGBoost trained on the ADASYN-resampled reduced feature set (no sample weights).
model = XGBClassifier(n_jobs=-1, eval_metric="error", objective="binary:logistic").fit(
    X_train_fs_adasyn, y_train_fs_adasyn
)

evaluate_and_store_results(
    model_name, model,
    X_train_fs_adasyn, y_train_fs_adasyn,
    X_test_fs_transformed, y_test_num,
    label_encoder,
)

## 4. XGBoost (Header only)

In [None]:
model_name = "XGBClassifier_header"

xgb = XGBClassifier(n_jobs=-1, eval_metric="error", objective="binary:logistic")
# Header-only pipeline working on the raw frames. The sample weights are routed
# to the final step through its auto-generated step name, `xgbclassifier`.
model = make_pipeline(preprocessor_header, xgb).fit(
    X_train, y_train_num, xgbclassifier__sample_weight=classes_weights
)

evaluate_and_store_results(
    model_name, model,
    X_train, y_train_num,
    X_test, y_test_num,
    label_encoder,
)

## 5. XGBoost (Body only)

In [None]:
model_name = "XGBClassifier_body"

xgb = XGBClassifier(n_jobs=-1, eval_metric="error", objective="binary:logistic")
# Body-only pipeline working on the raw frames. The sample weights are routed
# to the final step through its auto-generated step name, `xgbclassifier`.
model = make_pipeline(preprocessor_body, xgb).fit(
    X_train, y_train_num, xgbclassifier__sample_weight=classes_weights
)

evaluate_and_store_results(
    model_name, model,
    X_train, y_train_num,
    X_test, y_test_num,
    label_encoder,
)

## 5. Stacking (XGBoost: header + body)

In [None]:
# Base learners for the ensembles below: one XGBoost pipeline over the header
# features and one over the body features. Unlike the standalone header/body
# models, no sample weights are passed when these are fitted inside an ensemble.
pipe_header = make_pipeline(
    preprocessor_header,
    XGBClassifier(n_jobs=-1, eval_metric="error", objective="binary:logistic")
)

pipe_body = make_pipeline(
    preprocessor_body,
    XGBClassifier(n_jobs=-1, eval_metric="error", objective="binary:logistic")
)

In [None]:
# Named base learners: the header-only and body-only XGBoost pipelines.
estimator = [("header", pipe_header), ("body", pipe_body)]

# Stack the base learners under an RBF-kernel SVC meta-learner with balanced class weights.
stacking = StackingClassifier(
    estimators=estimator,
    final_estimator=SVC(kernel='rbf', class_weight='balanced', random_state=123),
    n_jobs=-1
)

In [None]:
# 5-fold cross-validation of the stacking ensemble on the training split, using
# the metrics in `scoring` and reporting train scores alongside validation scores.
results_dict['Stacking (XGB_header + XGB_body)'] = mean_std_cross_val_scores(stacking, X_train, y_train_num, cv=5, scoring=scoring, return_train_score=True)

In [None]:
# One row per cross-validated model, one column per formatted "mean (+/- std)" metric.
cv_results = pd.DataFrame(results_dict).T
cv_results

In [None]:
model_name = "Stacking (XGB_header + XGB_body)"

# Fit the stacking ensemble on the whole training split (fit returns the estimator itself).
model = stacking.fit(X_train, y_train_num)

evaluate_and_store_results(
    model_name, model,
    X_train, y_train_num,
    X_test, y_test_num,
    label_encoder,
)

## 6. Voting (XGBoost: header + body)

In [None]:
# Soft-voting ensemble over `estimator`. When the notebook is run top to bottom,
# that is still the [header, body] XGBoost pipeline list defined for the first
# stacking model; a later cell rebinds `estimator`.
voting = VotingClassifier(
    estimators=estimator,
    voting='soft',
    n_jobs=-1,
)

In [None]:
# The voting ensemble is built from the header and body XGBoost pipelines only,
# so the label names exactly those two members (no logistic-regression models are included).
model_name = "Voting (XGB_header + XGB_body)"

model = voting
model.fit(X_train, y_train_num)

evaluate_and_store_results(model_name, model, X_train, y_train_num, X_test, y_test_num, label_encoder)

## 7. XGBoost (Text only: subject + text_preprocessed)

In [None]:
model_name = "XGBClassifier_text"

xgb = XGBClassifier(n_jobs=-1, eval_metric="error", objective="binary:logistic")
# Text-only pipeline (Subject + text_preprocessed bag-of-words). The sample
# weights are routed to the final step through its step name, `xgbclassifier`.
model = make_pipeline(preprocessor_text, xgb).fit(
    X_train, y_train_num, xgbclassifier__sample_weight=classes_weights
)

evaluate_and_store_results(
    model_name, model,
    X_train, y_train_num,
    X_test, y_test_num,
    label_encoder,
)

## 8. Stacking (3 different XGBoost)

In [None]:
# Three base learners over different column sets: header features without the
# Subject text, body features without text_preprocessed, and a text-only model
# over Subject + text_preprocessed.
pipe_header_without_subject = make_pipeline(
    preprocessor_header_without_subject,
    XGBClassifier(n_jobs=-1, eval_metric="error", objective="binary:logistic")
)

pipe_body_without_text_preprocessed = make_pipeline(
    preprocessor_body_without_text_pp,
    XGBClassifier(n_jobs=-1, eval_metric="error", objective="binary:logistic")
)

pipe_subject_text_pp = make_pipeline(
    preprocessor_text,
    XGBClassifier(n_jobs=-1, eval_metric="error", objective="binary:logistic")
)

In [None]:
# Rebinds `estimator` to the three-way split of base learners.
estimator = [
    ("header_without_subject", pipe_header_without_subject),
    ("body_without_text_pp", pipe_body_without_text_preprocessed),
    ("subject_text_pp", pipe_subject_text_pp),
]

# Same meta-learner as the first stacking model: an RBF-kernel SVC with balanced class weights.
meta_learner = SVC(kernel='rbf', class_weight='balanced', random_state=123)
stacking_v2 = StackingClassifier(
    estimators=estimator,
    final_estimator=meta_learner,
    n_jobs=-1,
)

In [None]:
model_name = "Stacking (XGB_header_x_subject + XGB_body_x_text_pp + XGB_subject_text_pp)"

# Fit the three-learner stacking ensemble on the whole training split.
model = stacking_v2.fit(X_train, y_train_num)

evaluate_and_store_results(
    model_name, model,
    X_train, y_train_num,
    X_test, y_test_num,
    label_encoder,
)

## Tuned Stacking

In [None]:
# Load the best parameters
stacking_best_param = pickle.load(open("ht_script_result.pkl", "rb"))
stacking_best_param

In [None]:
# Rebuild fresh header/body base learners for the tuned stacking model and
# rebind `estimator` to them (it was last bound to the three-way list above).
pipe_header = make_pipeline(
    preprocessor_header,
    XGBClassifier(n_jobs=-1, eval_metric="error", objective="binary:logistic")
)

pipe_body = make_pipeline(
    preprocessor_body,
    XGBClassifier(n_jobs=-1, eval_metric="error", objective="binary:logistic")
)
estimator = [("header", pipe_header), ("body", pipe_body)]

In [None]:
# Same architecture as the first stacking model, with the loaded parameters
# applied on top via set_params.
# NOTE(review): assumes the keys in stacking_best_param are valid parameter names
# for this StackingClassifier (e.g. nested `header__...` / `final_estimator__...`) — confirm.
stacking_tuned = StackingClassifier(
    estimators=estimator,
    final_estimator=SVC(kernel='rbf', class_weight='balanced', random_state=123),
    n_jobs=-1
).set_params(**stacking_best_param)

In [None]:
model_name = "Stacking_tuned (XGB_header + XGB_body)"

# Fit the tuned stacking ensemble on the whole training split.
model = stacking_tuned.fit(X_train, y_train_num)

evaluate_and_store_results(
    model_name, model,
    X_train, y_train_num,
    X_test, y_test_num,
    label_encoder,
)

# Feature selection

**Note:** This part's work has been commented out as we have found that the model performed better without feature selection.

In [None]:
# from sklearn.feature_selection import SelectFromModel
# from sklearn.linear_model import Lasso

In [None]:
# model_name = "feature_selection_decision_tree"

# pipe_lgr_dt = make_pipeline(
#     preprocessor_numeric,
#     SelectFromModel(LogisticRegression(solver="liblinear", penalty="l1", C=0.01)),
#     DecisionTreeClassifier(class_weight='balanced', random_state=123),
# )
# pipe_lgr_dt.fit(X_train, y_train_num)

# y_pred = pipe_lgr_dt.predict(X_train)
    
# trained_models[model_name] = pipe_lgr_dt
# train_predictions[model_name] = y_pred


# # Training evaluation
# # Classification report
# train_classification_report_dict[model_name] = classification_report(
#     y_train_num, y_pred, target_names=label_encoder.classes_
# )

# # F1 scores for each class
# train_f1_benign_dict[model_name] = f1_score(y_train_num, y_pred, pos_label=0)
# train_f1_malicious_dict[model_name] =  f1_score(y_train_num, y_pred, pos_label=1)

# # Confusion matrix
# cm = confusion_matrix(y_train_num, y_pred)
# train_confusion_matrices[model_name] = cm

# # False Positive Rate
# TN, FP, FN, TP = cm.ravel()
# fpr = FP / (FP + TN)
# train_fpr_dict[model_name] = fpr

# # Test evaluation
# y_test_pred = pipe_lgr_dt.predict(X_test)
# test_predictions[model_name] = y_test_pred

# test_classification_report_dict[model_name] = classification_report(
#     y_test_num, y_test_pred, target_names=label_encoder.classes_
# )
# test_f1_benign_dict[model_name] = f1_score(y_test_num, y_test_pred, pos_label=0)
# test_f1_malicious_dict[model_name] = f1_score(y_test_num, y_test_pred, pos_label=1)

# cm_test = confusion_matrix(y_test_num, y_test_pred)
# test_confusion_matrices[model_name] = cm_test
# TN, FP, FN, TP = cm_test.ravel()
# test_fpr_dict[model_name] = FP / (FP + TN)


In [None]:
# l1_coefs = pipe_lgr_dt.named_steps["selectfrommodel"].estimator_.coef_.flatten()
# fs_num = pd.DataFrame(l1_coefs, index=numeric_feats, columns=["l1_coefs"])

# useful_feats = fs_num[fs_num['l1_coefs'] != 0].index.tolist()

# fs_num

In [None]:
# model_name = "feature_selection_decision_tree"

# pipe_lgr_dt = make_pipeline(
#     preprocessor_binary,
#     SelectFromModel(LogisticRegression(solver="liblinear", penalty="l1", C=0.01)),
#     DecisionTreeClassifier(class_weight='balanced', random_state=123),
# )
# pipe_lgr_dt.fit(X_train, y_train_num)

# y_pred = pipe_lgr_dt.predict(X_train)
    
# trained_models[model_name] = pipe_lgr_dt
# train_predictions[model_name] = y_pred


# # Training evaluation
# # Classification report
# train_classification_report_dict[model_name] = classification_report(
#     y_train_num, y_pred, target_names=label_encoder.classes_
# )

# # F1 scores for each class
# train_f1_benign_dict[model_name] = f1_score(y_train_num, y_pred, pos_label=0)
# train_f1_malicious_dict[model_name] =  f1_score(y_train_num, y_pred, pos_label=1)

# # Confusion matrix
# cm = confusion_matrix(y_train_num, y_pred)
# train_confusion_matrices[model_name] = cm

# # False Positive Rate
# TN, FP, FN, TP = cm.ravel()
# fpr = FP / (FP + TN)
# train_fpr_dict[model_name] = fpr

# # Test evaluation
# y_test_pred = pipe_lgr_dt.predict(X_test)
# test_predictions[model_name] = y_test_pred

# test_classification_report_dict[model_name] = classification_report(
#     y_test_num, y_test_pred, target_names=label_encoder.classes_
# )
# test_f1_benign_dict[model_name] = f1_score(y_test_num, y_test_pred, pos_label=0)
# test_f1_malicious_dict[model_name] = f1_score(y_test_num, y_test_pred, pos_label=1)

# cm_test = confusion_matrix(y_test_num, y_test_pred)
# test_confusion_matrices[model_name] = cm_test
# TN, FP, FN, TP = cm_test.ravel()
# test_fpr_dict[model_name] = FP / (FP + TN)


In [None]:
# l1_coefs = pipe_lgr_dt.named_steps["selectfrommodel"].estimator_.coef_.flatten()
# fs_bin = pd.DataFrame(l1_coefs, index=binary_feats, columns=["l1_coefs"])

# # Add the useful features to the list
# # useful_feats += fs_bin[fs_bin['l1_coefs'] != 0].index.tolist()

# fs_bin

In [None]:
# model_name = "feature_selection_decision_tree"

# pipe_lgr_dt = make_pipeline(
#     preprocessor_categorical,
#     SelectFromModel(LogisticRegression(solver="liblinear", penalty="l1", C=0.01)),
#     DecisionTreeClassifier(class_weight='balanced', random_state=123),
# )
# pipe_lgr_dt.fit(X_train, y_train_num)

# y_pred = pipe_lgr_dt.predict(X_train)
    
# trained_models[model_name] = pipe_lgr_dt
# train_predictions[model_name] = y_pred


# # Training evaluation
# # Classification report
# train_classification_report_dict[model_name] = classification_report(
#     y_train_num, y_pred, target_names=label_encoder.classes_
# )

# # F1 scores for each class
# train_f1_benign_dict[model_name] = f1_score(y_train_num, y_pred, pos_label=0)
# train_f1_malicious_dict[model_name] =  f1_score(y_train_num, y_pred, pos_label=1)

# # Confusion matrix
# cm = confusion_matrix(y_train_num, y_pred)
# train_confusion_matrices[model_name] = cm

# # False Positive Rate
# TN, FP, FN, TP = cm.ravel()
# fpr = FP / (FP + TN)
# train_fpr_dict[model_name] = fpr

# # Test evaluation
# y_test_pred = pipe_lgr_dt.predict(X_test)
# test_predictions[model_name] = y_test_pred

# test_classification_report_dict[model_name] = classification_report(
#     y_test_num, y_test_pred, target_names=label_encoder.classes_
# )
# test_f1_benign_dict[model_name] = f1_score(y_test_num, y_test_pred, pos_label=0)
# test_f1_malicious_dict[model_name] = f1_score(y_test_num, y_test_pred, pos_label=1)

# cm_test = confusion_matrix(y_test_num, y_test_pred)
# test_confusion_matrices[model_name] = cm_test
# TN, FP, FN, TP = cm_test.ravel()
# test_fpr_dict[model_name] = FP / (FP + TN)


In [None]:
# ohe = pipe_lgr_dt.named_steps["columntransformer"].named_transformers_['pipeline'].named_steps['onehotencoder']

# # Get the encoded feature names
# feature_names = ohe.get_feature_names_out(categorical_feats)


In [None]:
# l1_coefs = pipe_lgr_dt.named_steps["selectfrommodel"].estimator_.coef_.flatten()
# fs_cat = pd.DataFrame(l1_coefs, index=feature_names, columns=["l1_coefs"])

# # Add the useful features to the list
# # useful_feats += fs_cat[fs_cat['l1_coefs'] != 0].index.tolist()

# # Filter out the features with non-zero coefficients
# fs_cat = fs_cat[fs_cat['l1_coefs'] != 0].index.tolist()
# fs_cat 

In [None]:
# useful_feats

# Result

## Train evaluation:

In [None]:
# Summarise the training-set metrics, one row per fitted model.
# Every metric is looked up by model name instead of being taken from
# dict.values(), so a row can never pair one model's name with another model's
# scores when the result dictionaries were filled in a different order
# (easy to end up with after re-running notebook cells out of order).
# NOTE(review): indexing the report by ["benign"]["precision"] assumes the
# reports were created with classification_report(..., output_dict=True) — TODO confirm.
train_results_df = pd.DataFrame({
    "Model": list(trained_models.keys()),
    "Precision Benign": [train_classification_report_dict[model_name]["benign"]["precision"] for model_name in trained_models],
    "Precision Malicious": [train_classification_report_dict[model_name]["malicious"]["precision"] for model_name in trained_models],
    "Recall Benign": [train_classification_report_dict[model_name]["benign"]["recall"] for model_name in trained_models],
    "Recall Malicious": [train_classification_report_dict[model_name]["malicious"]["recall"] for model_name in trained_models],
    "F1 Benign": [train_f1_benign_dict[model_name] for model_name in trained_models],
    "F1 Malicious": [train_f1_malicious_dict[model_name] for model_name in trained_models],
    "FPR": [train_fpr_dict[model_name] for model_name in trained_models],
    "confusion_matrix": [train_confusion_matrices[model_name] for model_name in trained_models],
})

# Round only the float columns; every other column is left as it is
float_cols = train_results_df.select_dtypes(include='float').columns
train_results_df[float_cols] = train_results_df[float_cols].round(2)
train_results_df = train_results_df.set_index("Model")
train_results_df

## Validation evaluation:

In [None]:
# Summarise the held-out metrics, one row per fitted model.
# Every metric is looked up by model name instead of being taken from
# dict.values(), so a row can never pair one model's name with another model's
# scores when the result dictionaries were filled in a different order
# (easy to end up with after re-running notebook cells out of order).
# NOTE(review): indexing the report by ['benign']['precision'] assumes the
# reports were created with classification_report(..., output_dict=True) — TODO confirm.
test_results_df = pd.DataFrame({
    "Model": list(trained_models.keys()),
    "Precision Benign": [test_classification_report_dict[model_name]['benign']['precision'] for model_name in trained_models],
    "Precision Malicious": [test_classification_report_dict[model_name]['malicious']['precision'] for model_name in trained_models],
    "Recall Benign": [test_classification_report_dict[model_name]['benign']['recall'] for model_name in trained_models],
    "Recall Malicious": [test_classification_report_dict[model_name]['malicious']['recall'] for model_name in trained_models],
    "F1 Benign": [test_f1_benign_dict[model_name] for model_name in trained_models],
    "F1 Malicious": [test_f1_malicious_dict[model_name] for model_name in trained_models],
    "FPR": [test_fpr_dict[model_name] for model_name in trained_models],
    "Confusion Matrix": [test_confusion_matrices[model_name] for model_name in trained_models]
})

# Round only the float columns; every other column is left as it is
float_cols = test_results_df.select_dtypes(include='float').columns
test_results_df[float_cols] = test_results_df[float_cols].round(2)
test_results_df = test_results_df.set_index("Model")
test_results_df

# Understanding the best model's prediction

The stacking model built from two XGBoost models seems to perform the best. Now let's take a look at its predictions and investigate where it performs poorly.

In [None]:
# Predictions stored for the stacking model under its display name
stacking_prediction = test_predictions['Stacking (XGB_header + XGB_body)']

# Annotate a copy so that test_df itself is not modified
test_df_with_predictions = test_df.copy()

# Pop target_1 and append it again so it ends up as the last original column,
# directly in front of the prediction columns added below
target_col = test_df_with_predictions.pop('target_1')
test_df_with_predictions['target_1'] = target_col

# Columns are added in the order listed: the raw prediction, its label form
# (0 -> 'benign', 1 -> 'malicious'), then one boolean flag per outcome type
test_df_with_predictions = (
    test_df_with_predictions
    .assign(predicted_target=stacking_prediction)
    .assign(predicted_target=lambda frame: frame['predicted_target'].replace({0: 'benign', 1: 'malicious'}))
    .assign(**{
        'correct_prediction': lambda frame: frame['predicted_target'] == frame['target_1'],
        'True Positive': lambda frame: (frame['predicted_target'] == 'malicious') & (frame['target_1'] == 'malicious'),
        'True Negative': lambda frame: (frame['predicted_target'] == 'benign') & (frame['target_1'] == 'benign'),
        'False Positive': lambda frame: (frame['predicted_target'] == 'malicious') & (frame['target_1'] == 'benign'),
        'False Negative': lambda frame: (frame['predicted_target'] == 'benign') & (frame['target_1'] == 'malicious'),
    })
)
# test_df_with_predictions.reset_index(inplace=True)
test_df_with_predictions.head()

In [None]:
# # Export the DataFrame as a pickle file
# test_df_with_predictions.to_pickle("/data/workspace/jiaquan/stacking_predictions.pkl")

## Analyse prediction mistakes

In [None]:
# Load the pickled predictions DataFrame from disk.
# NOTE(review): unpickling can execute arbitrary code, so only load files from
# a trusted source. The commented-out export cell above writes to a different
# directory (/data/workspace/jiaquan/) — confirm this file is the same export.
prediction_df = pd.read_pickle(
    filepath_or_buffer="/data/workspace/dataset/stacking_predictions.pkl"
)
prediction_df

# Feature importance

In [None]:
# Fitted 'header' estimator of the stacking model
header_pipe = (
    trained_models['Stacking (XGB_header + XGB_body)']
    .named_estimators_['header']
)

# Its 'xgbclassifier' step and that step's feature importances
xgb_model = header_pipe.named_steps['xgbclassifier']
importances = xgb_model.feature_importances_

# Get feature names from the preprocessor
def get_feature_names_from_column_transformer(column_transformer):
    """
    Returns the output feature names of a fitted column transformer

    NOTE(review): the same function is defined again in the body
    feature-importance cell further down; keep the two in sync or define it once.

    Parameters
    ----------
    column_transformer :
        fitted scikit-learn ColumnTransformer (any object exposing
        ``transformers_`` as (name, transformer, columns) tuples)

    Returns
    ----------
        list of feature names, in the order of ``transformers_``
    """
    feature_names = []
    for _name, transformer, cols in column_transformer.transformers_:
        # Dropped columns produce no output features, whatever the entry is called
        if isinstance(transformer, str) and transformer == "drop":
            continue
        # An empty column selection contributes nothing (scikit-learn leaves
        # such a transformer unfitted, so it must not be queried)
        if not isinstance(cols, str) and hasattr(cols, "__len__") and len(cols) == 0:
            continue
        # A single column given as a bare string is ONE name; extending the
        # result with the string itself would add it character by character
        input_names = [cols] if isinstance(cols, str) else cols
        if hasattr(transformer, 'get_feature_names_out'):
            names = transformer.get_feature_names_out(cols)
        elif hasattr(transformer, 'named_steps'):
            # For pipelines: ask the last step, else fall back to the input names
            last_step = list(transformer.named_steps.values())[-1]
            if hasattr(last_step, 'get_feature_names_out'):
                names = last_step.get_feature_names_out(cols)
            else:
                names = input_names
        else:
            # e.g. 'passthrough' or a transformer that cannot report names
            names = input_names
        feature_names.extend(names)
    return feature_names

# Feature names come from the pipeline's fitted 'columntransformer' step
fitted_column_transformer = header_pipe.named_steps['columntransformer']
feature_names = get_feature_names_from_column_transformer(fitted_column_transformer)

# One row per feature, highest importance first
header_feature_importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

header_feature_importance_df

In [None]:
# Fitted 'body' estimator of the stacking model
body_pipe = (
    trained_models['Stacking (XGB_header + XGB_body)']
    .named_estimators_['body']
)

# Its 'xgbclassifier' step and that step's feature importances
xgb_model = body_pipe.named_steps['xgbclassifier']
importances = xgb_model.feature_importances_

# Get feature names from the preprocessor
def get_feature_names_from_column_transformer(column_transformer):
    """
    Returns the output feature names of a fitted column transformer

    NOTE(review): this repeats the definition from the header
    feature-importance cell above and shadows it; keep the two in sync or
    define it once.

    Parameters
    ----------
    column_transformer :
        fitted scikit-learn ColumnTransformer (any object exposing
        ``transformers_`` as (name, transformer, columns) tuples)

    Returns
    ----------
        list of feature names, in the order of ``transformers_``
    """
    feature_names = []
    for _name, transformer, cols in column_transformer.transformers_:
        # Dropped columns produce no output features, whatever the entry is called
        if isinstance(transformer, str) and transformer == "drop":
            continue
        # An empty column selection contributes nothing (scikit-learn leaves
        # such a transformer unfitted, so it must not be queried)
        if not isinstance(cols, str) and hasattr(cols, "__len__") and len(cols) == 0:
            continue
        # A single column given as a bare string is ONE name; extending the
        # result with the string itself would add it character by character
        input_names = [cols] if isinstance(cols, str) else cols
        if hasattr(transformer, 'get_feature_names_out'):
            names = transformer.get_feature_names_out(cols)
        elif hasattr(transformer, 'named_steps'):
            # For pipelines: ask the last step, else fall back to the input names
            last_step = list(transformer.named_steps.values())[-1]
            if hasattr(last_step, 'get_feature_names_out'):
                names = last_step.get_feature_names_out(cols)
            else:
                names = input_names
        else:
            # e.g. 'passthrough' or a transformer that cannot report names
            names = input_names
        feature_names.extend(names)
    return feature_names

# Feature names come from the pipeline's fitted 'columntransformer' step
fitted_column_transformer = body_pipe.named_steps['columntransformer']
feature_names = get_feature_names_from_column_transformer(fitted_column_transformer)

# One row per feature, highest importance first
body_feature_importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

body_feature_importance_df