# Libraries

In [None]:
import pandas as pd
import numpy as np

import sys
import os
username = os.environ.get('USER')
sys.path.append(f'/data/workspace/{username}')

sys.path.append(os.path.join(os.path.abspath("../../"), "src"))
from extract_header_features import *
from extract_text_features import *
from extract_url_features import *
from extract_text_keywords import *

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)

from sklearn.preprocessing import FunctionTransformer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import make_column_transformer

from sklearn.metrics import confusion_matrix

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, lognorm, loguniform, randint, uniform, norm, randint

from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# Reading in dataset

In [None]:
# Full dataset: raw train.parquet file
original_df = pd.read_parquet('/data/workspace/dataset/full-dataset/raw/train.parquet')
original_df

In [None]:
# Full dataset: processed train.parquet file
input_df = pd.read_parquet('/data/workspace/dataset/full-dataset/processed/train.parquet')
input_df

In [None]:
# Join original_df (raw) with input_df (processed); DataFrame.join aligns rows on the index by default
combined_df = original_df.join(input_df)
combined_df

In [None]:
# Keep every row whose target_3 is not 'self_phishing'; .copy() gives an independent frame
combined_df_without_self_phishing = combined_df.loc[
    combined_df['target_3'].ne('self_phishing')
].copy()
combined_df_without_self_phishing

In [None]:
# # Filter out columns that are related to the subject and body
# filter_cols = ['Subject', 'text_preprocessed', 'target_1']
# filtered_combined_df = combined_df.copy()
# filtered_combined_df = combined_df[filter_cols]
# filtered_combined_df


In [None]:
# filtered_combined_df['Subject'] = filtered_combined_df['Subject'].fillna('')
# filtered_combined_df['subject_preprocessed'] = preprocess_text(filtered_combined_df['Subject'])

In [None]:
# 70/30 train/test split with a fixed seed for reproducibility.
# NOTE(review): no stratify= argument is passed, so class proportions of target_1
# are not guaranteed to match between the two splits.
train_df, test_df = train_test_split(combined_df_without_self_phishing, test_size=0.3, random_state=42)

In [None]:
# Separate the level-1 target from the features for both splits
X_train, y_train = train_df.drop(columns=['target_1']), train_df['target_1']
X_test, y_test = test_df.drop(columns=['target_1']), test_df['target_1']

# Learn the label -> integer mapping on the training labels only, then apply it to both splits
label_encoder = LabelEncoder().fit(y_train)
y_train_num = label_encoder.transform(y_train)
y_test_num = label_encoder.transform(y_test)

In [None]:
X_train

# Preprocessor

In [None]:
# numeric_feats = [
#     "routing_length", "html_parsing_error", "word_count", 
#     "readable_proportion", "whitespace_ratio", "alphabet_proportion",
#     "grammar_error_rate", "english_french_proportion",
#     "url_count"
# ]  # apply scaling

# binary_feats = [
#     "is_multipart",
#     "dmarc_authentication_present", "dkim_result",
#     "spf_result", "dmarc_result", "dkim_sender_domains_match",
#     "attachments_present", "to_from_addresses_match", "sender_email_spf_match",
#     "non_ascii_present", "hidden_text_present"
#     #"ip_addr_urls", "http_urls_present", "url_at_symbol",
#     #"url_port_number", "any_long_urls", "url_multiple_subdomains"
#     ]

# text_feats = [
#     "Content_types" ,
#     "urls",
#     "attachment_types",
#     "Subject",
#     "text_preprocessed",
#     ]

# categorical_feats = [
#     "From_name", "From_email", "From_email_domain", 
#     "To_name", "To_email", "To_email_domain",
#     "Content-Language"
#     ]  # apply one-hot encoding

# passthrough_feats = [""]  # do not apply any transformation

# drop_feats = [
#     "From",                         # Info extracted to From_name, From_email, From_email_domain
#     "To",                           # Info extracted to To_name, To_email, To_email_domain
#     "Received",                     # Info extracted to routing_length
#     "Authentication-Results",       # Info extracted to dmarc_authentication_present, dkim_result, spf_result, dmarc_result
#     "received-spf",                 # Info extracted to spf_result, sender_email_spf_match
#     "DKIM-Signature",               # Info extracted to dkim_sender_domains_match
#     "Reply-To",                     # Mostly missing, not useful
#     "Return-Path",                  # Mostly missing, not useful
#     "text_plain",                   
#     "text_clean", 
#     "text_html", 
#     # "attachment_types",             # Info extracted to attachments_present
#     # "urls",                         # Info extracted to url_count, http_urls_present, ip_addr_urls, url_at_symbol, url_port_number, any_long_urls, url_multiple_subdomains (not used yet)
#     "target_2",                     # Level 2 target variable
#     "target_3",                     # Level 3 target variable
#     # "Subject",                      # To be used later
#     # "text_preprocessed",            # To be used later
#     # "Content_types"                 # To be used later
# ]



In [None]:
# Column groups for the ColumnTransformer. In the preprocessor defined further down,
# numeric_feats, binary_feats, categorical_feats and drop_feats are all listed under
# its "drop" entry; text_feats and passthrough_feats are not referenced by it.
numeric_feats = [
    "routing_length", "html_parsing_error", "word_count", 
    "readable_proportion", "whitespace_ratio", "alphabet_proportion",
    "grammar_error_rate", "english_french_proportion",
    "url_count"
]  # apply scaling

binary_feats = [
    "is_multipart",
    "dmarc_authentication_present", "dkim_result",
    "spf_result", "dmarc_result", "dkim_sender_domains_match",
    "attachments_present", "to_from_addresses_match", "sender_email_spf_match",
    "non_ascii_present", "hidden_text_present"
    #"ip_addr_urls", "http_urls_present", "url_at_symbol",
    #"url_port_number", "any_long_urls", "url_multiple_subdomains"
    ]

text_feats = [
    "Content_types" ,
    "urls",
    "attachment_types",
    "Subject",
    "text_preprocessed",
    ]

categorical_feats = [
    "From_name", "From_email", "From_email_domain", 
    "To_name", "To_email", "To_email_domain",
    "Content-Language"
    ]  # apply one-hot encoding

# NOTE(review): this is a one-element list holding an empty column name, not an empty
# list; it would need to become [] (or real column names) before being passed to a transformer.
passthrough_feats = [""]  # do not apply any transformation

drop_feats = [
    "From",                         # Info extracted to From_name, From_email, From_email_domain
    "To",                           # Info extracted to To_name, To_email, To_email_domain
    "Received",                     # Info extracted to routing_length
    "Authentication-Results",       # Info extracted to dmarc_authentication_present, dkim_result, spf_result, dmarc_result
    "received-spf",                 # Info extracted to spf_result, sender_email_spf_match
    "DKIM-Signature",               # Info extracted to dkim_sender_domains_match
    "Reply-To",                     # Mostly missing, not useful
    "Return-Path",                  # Mostly missing, not useful
    "text_plain",                   
    "text_clean", 
    "text_html", 
    # "attachment_types",             # Info extracted to attachments_present
    # "urls",                         # Info extracted to url_count, http_urls_present, ip_addr_urls, url_at_symbol, url_port_number, any_long_urls, url_multiple_subdomains (not used yet)
    "target_2",                     # Level 2 target variable
    "target_3",                     # Level 3 target variable
    # "Subject",                      # To be used later
    # "text_preprocessed",            # To be used later
    # "Content_types"                 # To be used later
]



In [None]:
# Columns whose cells may hold a list/array of tokens: flatten each cell into one
# space-separated string (anything that is not a list/array is passed through str()).
cols = ["Content_types", "attachment_types", "urls"]

for col in cols:
    for frame in (X_train, X_test):
        frame[col] = frame[col].apply(
            lambda x: " ".join(x) if isinstance(x, (list, np.ndarray)) else str(x)
        )

# Replace missing subject / body text with empty strings in both splits
for frame in (X_train, X_test):
    for text_col in ("Subject", "text_preprocessed"):
        frame[text_col] = frame[text_col].fillna("")

# Per-group transformers
numeric_transformer = make_pipeline(StandardScaler())

binary_transformer = make_pipeline(
    OneHotEncoder(handle_unknown='ignore', drop='if_binary')
)

categorical_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="None"),
    OneHotEncoder(handle_unknown='ignore'),
)



In [None]:
# TF-IDF pipelines for the two free-text columns. Each one:
#   1. imputes missing values with an empty string,
#   2. flattens the (n_samples, 1) imputer output to 1-D, because TfidfVectorizer
#      expects an iterable of documents rather than a 2-D array,
#   3. vectorizes the text with TF-IDF.
# np.ravel is used instead of a lambda so the fitted pipeline can be pickled
# (a FunctionTransformer wrapping a lambda cannot be pickled with the standard pickle module).
tfidf_subject_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="")),
    ("flatten", FunctionTransformer(np.ravel, validate=False)),
    ("tfidf", TfidfVectorizer())
])

tfidf_text_preprocessed_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="")),
    ("flatten", FunctionTransformer(np.ravel, validate=False)),
    ("tfidf", TfidfVectorizer())
])

# Only the Subject and text_preprocessed TF-IDF features are active; every other
# column group is explicitly dropped.
preprocessor = make_column_transformer(
    # (numeric_transformer, numeric_feats),
    # (binary_transformer, binary_feats),
    # (categorical_transformer, categorical_feats),
    # (CountVectorizer(), "Content_types"),
    # (CountVectorizer(), "urls"),
    # (CountVectorizer(), "attachment_types"),
    (tfidf_subject_pipeline, ["Subject"]),
    (tfidf_text_preprocessed_pipeline, ["text_preprocessed"]),
    ("drop", drop_feats + numeric_feats + binary_feats + categorical_feats + ["Content_types", "urls", "attachment_types"])
)

# XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# Baseline XGBoost classifier on top of the shared preprocessor
xgb_clf = XGBClassifier(random_state=123, verbosity=0)
pipe_xgb = make_pipeline(preprocessor, xgb_clf)
pipe_xgb.fit(X_train, y_train_num)


In [None]:
# Code adapted from DSCI571: Lecture 4 
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Cross-validate a model and summarise every metric as "mean (+/- std)".

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        Forwarded unchanged to ``cross_validate`` (e.g. ``scoring``,
        ``return_train_score``).

    Returns
    ----------
        pandas Series of strings, one entry per column of the
        ``cross_validate`` output, each formatted as "mean (+/- std)"
        with three decimals and indexed by the column name.
    """
    # Build the per-fold table once and derive both statistics from it.
    fold_scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))
    mean_scores = fold_scores.mean()
    std_scores = fold_scores.std()

    formatted = [
        f"{mean:.3f} (+/- {std:.3f})"
        for mean, std in zip(mean_scores, std_scores)
    ]
    return pd.Series(data=formatted, index=mean_scores.index)

In [None]:
# Metrics reported for every model, plus the table that collects the CV summaries
scoring = ['accuracy', 'f1', 'precision', 'recall']
results_dict = {
    "xgb": mean_std_cross_val_scores(
        pipe_xgb, X_train, y_train_num, scoring=scoring, return_train_score=True
    )
}
pd.DataFrame(results_dict)

In [None]:
# TRAIN SET EVALUATION
y_pred = pipe_xgb.predict(X_train)
cm = confusion_matrix(y_train_num, y_pred)

# For binary labels the 2x2 matrix flattens row by row into TN, FP, FN, TP
TN, FP, FN, TP = cm.ravel()
# False positive rate: share of actual class-0 rows that were predicted as class 1
fpr = FP / (FP + TN)
print("False Positive Rate on Train Set:", fpr)

In [None]:
# Predictions of the fitted XGBoost pipeline on the held-out test split
y_pred_test = pipe_xgb.predict(X_test)

In [None]:
# TEST SET EVALUATION
# label_encoder.classes_ supplies the original label names for the encoded classes
print(classification_report(y_test_num, y_pred_test, target_names=label_encoder.classes_))

In [None]:
# Test-set confusion matrix; unpacking into four counts assumes binary labels (2x2 matrix)
cm_test = confusion_matrix(y_test_num, y_pred_test)
TN_test, FP_test, FN_test, TP_test = cm_test.ravel()

cm_test

In [None]:
# f1_score uses pos_label=1 by default, i.e. the class encoded as 1 by label_encoder
f1 = f1_score(y_test_num, y_pred_test)
print("F1 Score on Test Set:", f1)
# False positive rate from the test-set confusion matrix counts
fpr_test = FP_test / (FP_test + TN_test)
print("False Positive Rate on Test Set:", fpr_test)

In [None]:
# Per-class F1 on the test set.
# NOTE(review): assumes encoded label 0 is the benign class and 1 the malicious one —
# confirm against label_encoder.classes_.
f1_benign = f1_score(y_test_num, y_pred_test, pos_label=0)
f1_malicious = f1_score(y_test_num, y_pred_test, pos_label=1)

In [None]:
# Test-set summary table, one column per model
test_dict = {
    "xgb": {
        "F1_benign": f1_benign,
        "F1_malicious": f1_malicious,
        "FPR": fpr_test,
        "confusion_matrix": cm_test,
    }
}
pd.DataFrame(test_dict)

### Balanced class weight with wrapper

In [None]:
from sklearn.utils import class_weight

# One weight per training row, computed with the 'balanced' scheme from the target_1 labels
classes_weights = class_weight.compute_sample_weight('balanced', train_df['target_1'])


In [None]:
# Refit the same pipeline; the "xgbclassifier__" prefix routes sample_weight to the
# XGBClassifier step's fit() only
pipe_xgb.fit(X_train, y_train_num, xgbclassifier__sample_weight=classes_weights)

In [None]:
import inspect

# cross_validate clones the pipeline and refits it on every fold, so the sample weights
# passed to the earlier pipe_xgb.fit(...) call are NOT reused. Without forwarding them
# here, this entry would simply repeat the unweighted "xgb" scores.
# The fit-parameter keyword is `params` in newer scikit-learn releases and `fit_params`
# in older ones, so pick whichever this installation supports. Arrays with one entry
# per training row are sliced per fold by cross_validate.
_fit_kwarg = "params" if "params" in inspect.signature(cross_validate).parameters else "fit_params"

results_dict["xgb_balanced"] = mean_std_cross_val_scores(
    pipe_xgb,
    X_train,
    y_train_num,
    scoring=scoring,
    return_train_score=True,
    **{_fit_kwarg: {"xgbclassifier__sample_weight": classes_weights}},
)
pd.DataFrame(results_dict)

In [None]:
# TRAIN SET EVALUATION
# Uses pipe_xgb as last fitted (with sample weights, when cells are run in order)
y_pred = pipe_xgb.predict(X_train)
cm = confusion_matrix(y_train_num, y_pred)

# Binary case: flattened 2x2 matrix is TN, FP, FN, TP
TN, FP, FN, TP = cm.ravel()
fpr = FP / (FP + TN)
print("False Positive Rate on Train Set:", fpr)

In [None]:
# Test-split predictions from pipe_xgb as last fitted
y_pred_test = pipe_xgb.predict(X_test)

In [None]:
# TEST SET EVALUATION
# Per-class precision/recall/F1, labelled with the original class names
print(classification_report(y_test_num, y_pred_test, target_names=label_encoder.classes_))

In [None]:
# Test-set confusion matrix and its four counts (binary labels assumed)
cm_test = confusion_matrix(y_test_num, y_pred_test)
TN_test, FP_test, FN_test, TP_test = cm_test.ravel()

cm_test

In [None]:
# F1 for the class encoded as 1 (f1_score's default pos_label)
f1 = f1_score(y_test_num, y_pred_test)
print("F1 Score on Test Set:", f1)
# Share of actual class-0 test rows predicted as class 1
fpr_test = FP_test / (FP_test + TN_test)
print("False Positive Rate on Test Set:", fpr_test)

In [None]:
# Per-class F1 on the test set.
# NOTE(review): assumes encoded label 0 is benign and 1 is malicious — confirm
# against label_encoder.classes_.
f1_benign = f1_score(y_test_num, y_pred_test, pos_label=0)
f1_malicious = f1_score(y_test_num, y_pred_test, pos_label=1)

# Add the weighted model as a new column of the test-set summary table
test_dict["xgb_balanced"] = {
    "F1_benign": f1_benign,
    "F1_malicious": f1_malicious,
    "FPR": fpr_test,
    "confusion_matrix": cm_test
}
pd.DataFrame(test_dict)

# Hyperparameter Tuning for XGB

In [None]:
# Search space for the XGBClassifier step of pipe_xgb.
# scipy's uniform(loc, scale) samples from [loc, loc + scale]; randint(low, high)
# samples integers from low up to high - 1.
param_dist_xgb = {
    "xgbclassifier__n_estimators": randint(100, 1000),
    "xgbclassifier__learning_rate": uniform(0.01, 0.3),  # [0.01, 0.31]
    "xgbclassifier__max_depth": randint(3, 10),
    "xgbclassifier__min_child_weight": randint(1, 10),
    "xgbclassifier__subsample": uniform(0.5, 0.5),  # [0.5, 1.0]
    "xgbclassifier__colsample_bytree": uniform(0.5, 0.5),  # [0.5, 1.0]
    "xgbclassifier__gamma": uniform(0, 5),
    "xgbclassifier__reg_alpha": uniform(0, 5),
    "xgbclassifier__reg_lambda": uniform(0, 5)
}

# 5 sampled configurations x 5-fold CV, ranked by F1 of the class encoded as 1.
# NOTE(review): the search is later fitted without sample weights, so the tuned
# model is not class-balanced unless weights are passed to its fit() call.
random_search_xgb = RandomizedSearchCV(
    pipe_xgb,
    param_distributions=param_dist_xgb,
    n_iter=5,
    scoring="f1",
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=123,
)

In [None]:
# Run the randomized search on the training split (no sample weights are passed here)
random_search_xgb.fit(X_train, y_train_num)

In [None]:
# Cross-validate the best pipeline found by the search and add it to the CV summary table
results_dict["xgb_tuned"] = mean_std_cross_val_scores(random_search_xgb.best_estimator_, X_train, y_train_num, scoring=scoring, return_train_score=True)
pd.DataFrame(results_dict)

In [None]:
# TRAIN SET EVALUATION
# best_estimator_ is the best-scoring pipeline from the search
y_pred = random_search_xgb.best_estimator_.predict(X_train)
cm = confusion_matrix(y_train_num, y_pred)

# Binary case: flattened 2x2 matrix is TN, FP, FN, TP
TN, FP, FN, TP = cm.ravel()
fpr = FP / (FP + TN)
print("False Positive Rate on Train Set:", fpr)

In [None]:
# TEST SET EVALUATION
# Predict with the tuned pipeline and report per-class metrics with original label names
y_pred_test = random_search_xgb.best_estimator_.predict(X_test)
print(classification_report(y_test_num, y_pred_test, target_names=label_encoder.classes_))

In [None]:
# Test-set confusion matrix of the current y_pred_test (binary labels assumed)
cm_test = confusion_matrix(y_test_num, y_pred_test)
TN_test, FP_test, FN_test, TP_test = cm_test.ravel()

cm_test

In [None]:
# F1 for the class encoded as 1, then the false positive rate FP / (FP + TN)
f1 = f1_score(y_test_num, y_pred_test)
print("F1 Score on Test Set:", f1)
fpr_test = FP_test / (FP_test + TN_test)
print("False Positive Rate on test set:", fpr_test)

# Random Forest

In [None]:
# Small random-forest baseline: 3 trees of depth 2 with balanced class weights
rf_baseline = RandomForestClassifier(
    n_estimators=3,
    max_depth=2,
    class_weight="balanced",
    random_state=123,
)
pipe_rf = make_pipeline(preprocessor, rf_baseline)
pipe_rf.fit(X_train, y_train_num)


In [None]:
# Cross-validate the random-forest pipeline and add it to the CV summary table
results_dict["rf"] = mean_std_cross_val_scores(pipe_rf, X_train, y_train_num, scoring=scoring, return_train_score=True)
pd.DataFrame(results_dict)

In [None]:
# TRAIN SET EVALUATION
# Confusion matrix of the random-forest pipeline on the data it was fitted on
y_pred = pipe_rf.predict(X_train)
cm = confusion_matrix(y_train_num, y_pred)
cm

In [None]:
# Binary case: flattened 2x2 matrix is TN, FP, FN, TP
TN, FP, FN, TP = cm.ravel()
# False positive rate: FP / (FP + TN)
fpr = FP / (FP + TN)
print("False Positive Rate on Train set:", fpr)

In [None]:
# TEST SET EVALUATION
# Predict on the held-out split and report per-class metrics with original label names
y_pred_test = pipe_rf.predict(X_test)
print(classification_report(y_test_num, y_pred_test, target_names=label_encoder.classes_))


In [None]:
# F1 for the class encoded as 1 (f1_score's default pos_label)
f1 = f1_score(y_test_num, y_pred_test)
print("F1 Score on Test Set:", f1)

In [None]:
# Test-set confusion matrix and its four counts (binary labels assumed)
cm_test = confusion_matrix(y_test_num, y_pred_test)
TN_test, FP_test, FN_test, TP_test = cm_test.ravel()

cm_test

In [None]:
# Share of actual class-0 test rows predicted as class 1
fpr_test = FP_test / (FP_test + TN_test)
print("False Positive Rate on test set:", fpr_test)

# Hyperparameter Tuning for RF

In [None]:
# make_column_transformer names its entries automatically; repeated Pipeline objects
# become "pipeline-1", "pipeline-2", ... in order, so the names shift whenever
# transformers are commented in or out of the preprocessor. With only the two TF-IDF
# pipelines active, the previously hard-coded "pipeline-4"/"pipeline-5" do not exist
# and the search fails with an invalid-parameter error. Look the names up instead.
tfidf_transformer_names = [
    name
    for name, transformer, _ in preprocessor.transformers
    if isinstance(transformer, Pipeline) and "tfidf" in transformer.named_steps
]
tfidf_max_features_grid = [None, 1000, 5000, 10000, 15000, 20000]

# Search space for the RandomForestClassifier step.
# scipy's uniform(0.1, 0.9) samples max_features fractions from [0.1, 1.0].
param_dist = {
    "randomforestclassifier__criterion": ["gini", "entropy", "log_loss"],
    "randomforestclassifier__n_estimators": randint(10, 200),
    "randomforestclassifier__max_depth": randint(1, 20),
    "randomforestclassifier__min_samples_split": randint(2, 20),
    "randomforestclassifier__min_samples_leaf": randint(1, 20),
    "randomforestclassifier__max_features": uniform(0.1, 0.9),
    "randomforestclassifier__bootstrap": [True, False],
}
# Also tune the vocabulary cap of every TF-IDF vectorizer found in the preprocessor.
param_dist.update(
    {
        f"columntransformer__{name}__tfidf__max_features": tfidf_max_features_grid
        for name in tfidf_transformer_names
    }
)

pipe_rf = make_pipeline(preprocessor, RandomForestClassifier(random_state=123, class_weight="balanced", n_jobs=-1))

# 10 sampled configurations x 5-fold CV, ranked by F1 of the class encoded as 1
random_search_rf = RandomizedSearchCV(
    pipe_rf,
    param_distributions=param_dist,
    n_iter=10,
    scoring="f1",
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=123,
)

In [None]:
# Run the randomized search for the random-forest pipeline on the training split
random_search_rf.fit(X_train, y_train_num)

In [None]:
# Cross-validate the best pipeline found by the search and add it to the CV summary table
results_dict["rf_tuned"] = mean_std_cross_val_scores(random_search_rf.best_estimator_, X_train, y_train_num, scoring=scoring, return_train_score=True)
pd.DataFrame(results_dict)

In [None]:
# TRAIN SET EVALUATION
# best_estimator_ is the best-scoring pipeline from the search
y_pred = random_search_rf.best_estimator_.predict(X_train)
cm = confusion_matrix(y_train_num, y_pred)

# Binary case: flattened 2x2 matrix is TN, FP, FN, TP
TN, FP, FN, TP = cm.ravel()
fpr = FP / (FP + TN)
print("False Positive Rate on Train Set:", fpr)

In [None]:
# TEST SET EVALUATION
# Predict with the tuned pipeline and report per-class metrics with original label names
y_pred_test = random_search_rf.best_estimator_.predict(X_test)
print(classification_report(y_test_num, y_pred_test, target_names=label_encoder.classes_))

In [None]:
# F1 for the class encoded as 1 (f1_score's default pos_label)
f1 = f1_score(y_test_num, y_pred_test)
print("F1 Score on Test Set:", f1)

In [None]:
# Test-set confusion matrix and its four counts (binary labels assumed)
cm_test = confusion_matrix(y_test_num, y_pred_test)
TN_test, FP_test, FN_test, TP_test = cm_test.ravel()

cm_test

In [None]:
# Share of actual class-0 test rows predicted as class 1
fpr_test = FP_test / (FP_test + TN_test)
print("False Positive Rate on test set:", fpr_test)