# Import

In [None]:
import os
import sys
import pickle
import warnings

import numpy as np
import pandas as pd

from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
)
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
    FunctionTransformer,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, f1_score, classification_report

from xgboost import XGBClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    VotingClassifier,
    StackingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter

from scipy.stats import expon, lognorm, loguniform, randint, uniform, norm

# Custom feature extraction modules
username = os.environ.get('USER')
sys.path.append(f'/data/workspace/{username}')
sys.path.append(os.path.join(os.path.abspath("../../"), "src"))
from extract_header_features import *
from extract_text_features import *
from extract_url_features import *
from extract_text_keywords import *

# Hide warnings
warnings.filterwarnings('ignore')


## Function for cross-validation

In [None]:
# Code adapted from DSCI571: Lecture 4 
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Summarise cross-validation results as "mean (+/- std)" strings.

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        Extra keyword arguments forwarded unchanged to ``cross_validate``
        (for example ``cv`` or ``scoring``).

    Returns
    ----------
        pandas Series indexed by the keys returned by ``cross_validate``;
        each value is the string ``"<mean> (+/- <std>)"`` with three
        decimals, where mean and std are taken across the folds.
    """

    # Build the per-fold table once and derive both statistics from it.
    fold_scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))
    mean_scores = fold_scores.mean()
    std_scores = fold_scores.std()

    # Both Series come from the same frame, so they share the same order.
    out_col = [
        f"{mean:0.3f} (+/- {std:0.3f})"
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

# Reading in the data

In [None]:
# Raw training split of the full dataset
original_df = pd.read_parquet(
    path='/data/workspace/dataset/full-dataset/raw/train.parquet',
)
original_df.head()

In [None]:
# Processed training split of the full dataset, extended with a URL count
# derived from the raw `urls` column.
# NOTE(review): assumes get_url_count returns values aligned with input_df's rows — confirm.
input_df = pd.read_parquet(
    path='/data/workspace/dataset/full-dataset/processed/train.parquet',
)
input_df['url_count'] = get_url_count(original_df['urls'])
input_df.head()

In [None]:
# Attach the processed features (input_df) to the raw columns (original_df);
# DataFrame.join aligns the two frames on their index.
combined_df = original_df.join(input_df)

# New feature: empty_email is True when the cleaned text is empty AND there
# are no attachment types AND no URLs.
combined_df['empty_email'] = (
    combined_df['text_clean'].eq('')
    & combined_df['attachment_types'].map(len).eq(0)
    & combined_df['urls'].map(len).eq(0)
)

combined_df.head()

In [None]:
# Exclude rows whose target_3 label is 'self_phishing'; copy so later column
# assignments do not write into a view of combined_df.
df_without_sp = combined_df.loc[combined_df['target_3'].ne('self_phishing')].copy()
df_without_sp.head()

In [None]:
# new_target starts from target_1, except that every empty email (text_clean == ''
# AND attachment_types == [] AND urls == []) is labelled 'malicious'.
# The emptiness test is the `empty_email` flag already computed on combined_df,
# so the feature and the label override share a single definition.
df_without_sp['new_target'] = np.where(
    df_without_sp['empty_email'],
    'malicious',
    df_without_sp['target_1'],
)

In [None]:
# Preview the filtered frame again now that it carries the new_target column.
df_without_sp.head()

# Train Test Split

In [None]:
# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(df_without_sp, test_size=0.3, random_state=42)

# Columns whose cells are expected to hold lists/arrays of tokens.
list_cols = ["Content_types", "attachment_types", "urls"]

# Apply the same clean-up to both splits.
for split_df in (train_df, test_df):
    # Flatten list/array cells into one space-separated string; anything else is str()-ed.
    for col in list_cols:
        split_df[col] = split_df[col].apply(
            lambda x: " ".join(x) if isinstance(x, (list, np.ndarray)) else str(x)
        )
    # Replace missing text with empty strings so the vectorized columns only contain strings.
    for text_col in ("Subject", "text_preprocessed"):
        split_df[text_col] = split_df[text_col].fillna("")

# Separate features from the label column.
X_train = train_df.drop(columns='new_target')
y_train = train_df['new_target']

X_test = test_df.drop(columns='new_target')
y_test = test_df['new_target']

# Integer-encode the labels; the encoder is fitted on the training labels only.
label_encoder = LabelEncoder()
y_train_num = label_encoder.fit_transform(y_train)
y_test_num = label_encoder.transform(y_test)

In [None]:
# Preview the training split after the list columns were flattened to strings.
train_df.head()

# Preparing Preprocessor

## Feature Transformers:

In [None]:
# Numeric columns: standardise.
numeric_transformer = make_pipeline(
    StandardScaler(),
)

# Binary columns: one-hot encode, keeping a single output column for
# two-category features and ignoring categories unseen during fit.
binary_transformer = make_pipeline(
    OneHotEncoder(drop='if_binary', handle_unknown='ignore'),
)

# Categorical columns: same encoder settings as the binary columns.
categorical_transformer = make_pipeline(
    OneHotEncoder(drop='if_binary', handle_unknown='ignore'),
)

# Text columns: bag-of-words counts, one vectorizer per text column.
subject_vectorizer = make_pipeline(
    CountVectorizer(),
)

text_preprocessed_vectorizer = make_pipeline(
    CountVectorizer(),
)

## Header features & preprocessor:

In [None]:
# Header feature groups; each group is routed to its own transformer below.
header_numeric_feats = [
    "new_routing_length"                    # Newly added by Danish
]

# NOTE(review): "internal_server_transfer_count" is named like a count but is
# one-hot encoded together with the binary features — confirm this is intended.
header_binary_feats = [
    "dmarc_authentication_present", 
    "dkim_sender_domains_match",
    "to_from_addresses_match", 
    "sender_email_spf_match",
    "from_reply_to_domain_match",           # Newly added by Danish
    "internal_server_transfer_count",       # Newly added by Danish
    "name_server_match"                     # Newly added by Danish
]  # closing bracket was missing, which made the whole cell a SyntaxError

header_categorical_feats = [
    "dkim_result",
    "spf_result", 
    "dmarc_result"
]

header_text_feats = [
    "Subject"
]

# Column transformer for the header features; columns not listed are dropped.
preprocessor_header = make_column_transformer(
    # Numeric header features are passed through unscaled.
    ("passthrough", header_numeric_feats),
    (binary_transformer, header_binary_feats),
    (categorical_transformer, header_categorical_feats),
    # The text column is given as a scalar name (not a list) so the vectorizer
    # receives a 1-D column of strings.
    (subject_vectorizer, header_text_feats[0]), # Subject
    remainder='drop'
)

## Body features & preprocessor:

In [None]:
# Body feature groups; each group is routed to its own transformer below.
# NOTE(review): "content_types" is standardised as a numeric column, while its
# comment describes categories (text, multimedia, others) — confirm it is numeric.
body_numeric_feats = [
    "word_count",
    "readable_proportion",
    "whitespace_ratio",
    "alphabet_proportion",
    "grammar_error_rate",
    "english_french_proportion",
    "url_count",      # Newly added urls: Danish
    "content_types",  # Newly added content type: Danish (text, multimedia, others)
]

body_binary_feats = [
    "non_ascii_present",
    "hidden_text_present",
    # Based on text_clean = empty, no non-text contents (no multimedia, no others), no URLs
    "empty_email",
]

body_categorical_feats = ["html_parsing_error"]

body_text_feats = ["text_preprocessed"]

# Column transformer for the body features; columns not listed are dropped.
preprocessor_body = make_column_transformer(
    (numeric_transformer, body_numeric_feats),
    (binary_transformer, body_binary_feats),
    (categorical_transformer, body_categorical_feats),
    # Scalar column name (not a list): the vectorizer receives a 1-D column.
    (text_preprocessed_vectorizer, body_text_feats[0]),  # text_preprocessed
    remainder='drop',
)

# Modelling

## Result dictionaries:

In [None]:
# Train evaluation: one independent dict per stored quantity
(
    trained_models,
    train_predictions,
    train_classification_report_dict,
    train_confusion_matrices,
    train_fpr_dict,
    train_f1_benign_dict,
    train_f1_malicious_dict,
) = ({} for _ in range(7))

# Validation evaluation: same layout for the held-out split
(
    test_predictions,
    test_classification_report_dict,
    test_confusion_matrices,
    test_fpr_dict,
    test_f1_benign_dict,
    test_f1_malicious_dict,
) = ({} for _ in range(6))

# CV results: metric names plus empty containers for the scores
scoring = ['accuracy', 'f1', 'precision', 'recall']
results_df, results_dict = None, {}

In [None]:
def evaluate_and_store_results(model_name, model, X_train, y_train, X_test, y_test, label_encoder):
    """
    Score a fitted binary classifier on the train and held-out splits and
    record the results in the module-level result dictionaries.

    All metrics for both splits are computed before anything is stored, so an
    error while scoring cannot leave the dictionaries with entries for only
    some of the metrics.

    Parameters
    ----------
    model_name : str
        Key under which every result is stored.
    model :
        Fitted estimator exposing ``predict``.
    X_train, X_test :
        Feature sets passed to ``model.predict``.
    y_train, y_test :
        Integer-encoded true labels. The stored names assume label 0 is the
        benign class and label 1 the malicious class.
    label_encoder :
        Fitted encoder; its ``classes_`` are used as the class names in the
        classification reports.

    Returns
    ----------
        None. Fills ``trained_models`` and the ``train_*`` / ``test_*``
        dictionaries under ``model_name``.
    """

    def _split_metrics(X, y_true):
        """Predict on one split and return its metrics as a dict."""
        y_pred = model.predict(X)
        report = classification_report(
            y_true, y_pred, target_names=label_encoder.classes_, output_dict=True
        )
        f1_benign = f1_score(y_true, y_pred, pos_label=0)
        f1_malicious = f1_score(y_true, y_pred, pos_label=1)
        cm = confusion_matrix(y_true, y_pred)
        # For a 2x2 matrix (rows = true, columns = predicted) ravel() yields
        # TN, FP, FN, TP; only the first two are needed for the FPR.
        tn, fp, _fn, _tp = cm.ravel()
        return {
            "predictions": y_pred,
            "report": report,
            "f1_benign": f1_benign,
            "f1_malicious": f1_malicious,
            "confusion_matrix": cm,
            "fpr": fp / (fp + tn),
        }

    # Compute everything first ...
    train = _split_metrics(X_train, y_train)
    test = _split_metrics(X_test, y_test)

    # ... then store it in one go.
    trained_models[model_name] = model

    # Training evaluation
    train_predictions[model_name] = train["predictions"]
    train_classification_report_dict[model_name] = train["report"]
    train_f1_benign_dict[model_name] = train["f1_benign"]
    train_f1_malicious_dict[model_name] = train["f1_malicious"]
    train_confusion_matrices[model_name] = train["confusion_matrix"]
    train_fpr_dict[model_name] = train["fpr"]

    # Test evaluation
    test_predictions[model_name] = test["predictions"]
    test_classification_report_dict[model_name] = test["report"]
    test_f1_benign_dict[model_name] = test["f1_benign"]
    test_f1_malicious_dict[model_name] = test["f1_malicious"]
    test_confusion_matrices[model_name] = test["confusion_matrix"]
    test_fpr_dict[model_name] = test["fpr"]

## Stacking (XGB_header + XGB_body)

In [None]:
# Header-only model: header preprocessor followed by an XGBoost classifier.
pipe_header = make_pipeline(
    preprocessor_header,
    XGBClassifier(
        objective="binary:logistic",
        eval_metric="error",
        n_jobs=-1,
    ),
)

# Body-only model: body preprocessor followed by an identically configured classifier.
pipe_body = make_pipeline(
    preprocessor_body,
    XGBClassifier(
        objective="binary:logistic",
        eval_metric="error",
        n_jobs=-1,
    ),
)

In [None]:
# Base learners: one pipeline per feature family.
estimator = [
    ("header", pipe_header),
    ("body", pipe_body),
]

# The final estimator (an RBF-kernel SVC with balanced class weights) is
# trained on the base learners' outputs.
stacking = StackingClassifier(
    estimators=estimator,
    final_estimator=SVC(
        kernel='rbf',
        class_weight='balanced',
        random_state=123,
    ),
    n_jobs=-1,
)

In [None]:
# Display the configured stacking ensemble (it is fitted in a later cell).
stacking

In [None]:
model_name = "Stacking (XGB_header + XGB_body)"

# fit() returns the estimator itself, so `model` is the fitted `stacking` object.
model = stacking.fit(X_train, y_train_num)

# Record train and held-out metrics under model_name.
evaluate_and_store_results(
    model_name=model_name,
    model=model,
    X_train=X_train,
    y_train=y_train_num,
    X_test=X_test,
    y_test=y_test_num,
    label_encoder=label_encoder,
)

# Result

## Train evaluation:

In [None]:
# Resolve the model order once and look every metric up by model name, so each
# row describes a single model regardless of the insertion order of the
# individual result dictionaries.
model_names = list(trained_models)

# NOTE(review): the last column is named "confusion_matrix" here but
# "Confusion Matrix" in the held-out table; kept as-is in case other code
# relies on the current name.
train_results_df = pd.DataFrame({
    "Model": model_names,
    "Precision Benign": [train_classification_report_dict[name]["benign"]["precision"] for name in model_names],
    "Precision Malicious": [train_classification_report_dict[name]["malicious"]["precision"] for name in model_names],
    "Recall Benign": [train_classification_report_dict[name]["benign"]["recall"] for name in model_names],
    "Recall Malicious": [train_classification_report_dict[name]["malicious"]["recall"] for name in model_names],
    "F1 Benign": [train_f1_benign_dict[name] for name in model_names],
    "F1 Malicious": [train_f1_malicious_dict[name] for name in model_names],
    "FPR": [train_fpr_dict[name] for name in model_names],
    "confusion_matrix": [train_confusion_matrices[name] for name in model_names],
})

# Round only the float-typed metric columns; other columns are left untouched.
float_cols = train_results_df.select_dtypes(include='float').columns
train_results_df[float_cols] = train_results_df[float_cols].round(2)
train_results_df = train_results_df.set_index("Model")
train_results_df

## Validation evaluation

In [None]:
# Resolve the model order once and look every metric up by model name, so each
# row describes a single model regardless of the insertion order of the
# individual result dictionaries.
model_names = list(trained_models)

test_results_df = pd.DataFrame({
    "Model": model_names,
    "Precision Benign": [test_classification_report_dict[name]['benign']['precision'] for name in model_names],
    "Precision Malicious": [test_classification_report_dict[name]['malicious']['precision'] for name in model_names],
    "Recall Benign": [test_classification_report_dict[name]['benign']['recall'] for name in model_names],
    "Recall Malicious": [test_classification_report_dict[name]['malicious']['recall'] for name in model_names],
    "F1 Benign": [test_f1_benign_dict[name] for name in model_names],
    "F1 Malicious": [test_f1_malicious_dict[name] for name in model_names],
    "FPR": [test_fpr_dict[name] for name in model_names],
    "Confusion Matrix": [test_confusion_matrices[name] for name in model_names],
})

# Round only the float-typed metric columns; other columns are left untouched.
float_cols = test_results_df.select_dtypes(include='float').columns
test_results_df[float_cols] = test_results_df[float_cols].round(2)
test_results_df = test_results_df.set_index("Model")
test_results_df