In [None]:
import sys
import os 
# Add "<two directories above the working directory>/src" to the import search path.
# NOTE(review): presumably this is what makes the local `extract_text_keywords`
# module importable below — confirm against the repository layout.
sys.path.append(os.path.join(os.path.abspath("../../"), "src"))

from hashlib import sha1
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
)
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE, KMeansSMOTE , ADASYN,SVMSMOTE,KMeansSMOTE,BorderlineSMOTE
from imblearn.pipeline import Pipeline

import altair as alt
# Switch Altair's active data transformer to 'vegafusion'.
# NOTE(review): presumably enabled so charts can be built from large frames — confirm it is still needed.
alt.data_transformers.enable('vegafusion')

from extract_text_keywords import preprocess_text

In [None]:
# Load the raw sample, then the processed sample, and pull the subject line and
# preprocessed body text over from the raw frame (index-aligned join).
original_df = pd.read_parquet('/data/workspace/dataset/sampled-dataset/raw/sample-large.parquet')
input_df = pd.read_parquet(
    '/data/workspace/dataset/sampled-dataset/processed/sample-large.parquet'
).join(original_df[['Subject', 'text_preprocessed']])

# Preprocess the subject text -> this will take a while on the full dataset.
input_df['subject_preprocessed'] = preprocess_text(input_df['Subject'].fillna(""))

# Replace missing text with empty strings in both text columns.
for text_column in ('text_preprocessed', 'subject_preprocessed'):
    input_df[text_column] = input_df[text_column].fillna("")

# Attach the two target columns as the last columns of the frame.
target_columns_df = original_df[['target_1', 'target_3']]
input_df = input_df.join(target_columns_df)

# Drop self-phishing rows.
input_df = input_df.loc[input_df['target_3'].ne('self_phishing')]

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(input_df, test_size=0.3, random_state=42)

# Remove BOTH target columns from the feature frames by name. The previous
# positional slice (`iloc[:, :-1]`) only removed the last column ('target_3')
# and left the label 'target_1' inside X_train / X_test.
target_columns = ['target_1', 'target_3']

X_train = train_df.drop(columns=target_columns)
y_train = train_df['target_1']
X_test = test_df.drop(columns=target_columns)
y_test = test_df['target_1']

In [None]:
# List the numeric columns of the training frame.
train_df.select_dtypes(include='number').columns

## Preprocessor 

In [None]:
# Print the column names, non-null counts and dtypes of the training frame.
train_df.info()

In [None]:
# List the boolean / object columns of the training features.
# Each dtype must be its own list element: the single string 'bool, object'
# is not a valid dtype and makes select_dtypes raise.
X_train.select_dtypes(include=['bool', 'object']).columns

In [None]:
# List the boolean / object columns of the training features.
X_train.select_dtypes(include=['bool', 'object']).columns

In [None]:
# Columns that are standard-scaled.
numeric_features = [
    'routing_length',
    'word_count',
    'readable_proportion',
    'whitespace_ratio',
    'alphabet_proportion',
    'grammar_error_rate',
    'english_french_proportion',
    'url_count',
]

# Columns that are one-hot encoded.
categorical_features = [
    'spf_result',
    'non_ascii_present',
    'http_urls_present',
    'any_long_urls',
    'html_parsing_error',
]

# Free-text columns (each is vectorized separately).
text_features = [
    'text_preprocessed',
    'subject_preprocessed',
]

# Columns explicitly excluded from the model.
drop_features = [
    'dmarc_authentication_present',
    'dkim_result',
    'dmarc_result',
    'dkim_sender_domains_match',
    'attachments_present',
    'to_from_addresses_match',
    'sender_email_spf_match',
    'hidden_text_present',
    'ip_addr_urls',
    'url_at_symbol',
    'url_port_number',
    'url_multiple_subdomains',
    'Subject',
]

In [None]:
# Column-wise preprocessing:
#   - numeric features are standard-scaled,
#   - categorical features are one-hot encoded (binary ones collapse to one column),
#   - each text column gets its own bag-of-words count matrix,
#   - the columns in `drop_features` are discarded.
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    # handle_unknown='ignore': a category that is absent from the data the
    # encoder was fitted on (e.g. a rare value missing from a CV training fold)
    # is encoded as all zeros instead of raising at transform time.
    (OneHotEncoder(drop='if_binary', handle_unknown='ignore'), categorical_features),
    # The text columns are passed as plain strings (not lists) so that
    # CountVectorizer receives a 1-D sequence of documents.
    (CountVectorizer(binary=False), 'text_preprocessed'),
    (CountVectorizer(binary=False), 'subject_preprocessed'),
    ('drop', drop_features),
)

In [None]:
# Testing that the preprocessor works: fit it on the training features and keep the transformed matrix.
transformed = preprocessor.fit_transform(X_train)

In [None]:
# Show the fitted sub-transformers of the column transformer, keyed by name.
preprocessor.named_transformers_

In [None]:
# Output feature names produced by the fitted 'standardscaler' transformer.
preprocessor.named_transformers_['standardscaler'].get_feature_names_out()

In [None]:
# Output feature names produced by the fitted 'onehotencoder' transformer.
preprocessor.named_transformers_['onehotencoder'].get_feature_names_out()

#### Hyperparameter tuning - no SMOTE

In [None]:
# Baseline pipeline: preprocessing followed by a default SVC
# (the classifier step is addressed as "svc" in the parameter grid).
svc_pipe = make_pipeline(
    preprocessor,
    SVC(),
)

In [None]:
# Search grid for the SVC step: four values of C on a log scale and four values of gamma.
param_grid = dict(
    svc__C=np.logspace(-1, 2, 4),
    svc__gamma=np.array([1e-3, 1e-2, 1e-1, 1]),
)

In [None]:
# Exhaustive 5-fold search over `param_grid`, ranked by ROC AUC, using all CPU cores.
# Train scores are kept so they can be compared with the validation scores.
param_grid_search = GridSearchCV(
    estimator=svc_pipe,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

In [None]:
# Run the grid search on the training split.
param_grid_search.fit(X_train, y_train)

In [None]:
# Show which result fields the grid search recorded.
param_grid_search.cv_results_.keys()

In [None]:
# Top five parameter combinations by rank, with scores and timings.
summary_columns = [
    'rank_test_score',
    'mean_test_score',
    'mean_train_score',
    'mean_fit_time',
    'mean_score_time',
    'param_svc__C',
    'param_svc__gamma',
]

(
    pd.DataFrame(param_grid_search.cv_results_)
    .loc[:, summary_columns]
    .set_index('rank_test_score')
    .sort_index()
    .head()
)

In [None]:
# Pull the winning C and gamma out of the grid search.
best_C, best_gamma = (
    param_grid_search.best_params_[name] for name in ('svc__C', 'svc__gamma')
)

In [None]:
# Fit the preprocessor on the training split only, then apply it to both splits.
x_train_transformed = preprocessor.fit_transform(X_train)
x_test_transformed = preprocessor.transform(X_test)

# Train an SVC with the tuned hyperparameters and predict on the held-out test split.
svc = SVC(C=best_C, gamma=best_gamma).fit(x_train_transformed, y_train)
y_pred = svc.predict(x_test_transformed)

# Confusion matrix of true vs. predicted test labels.
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Render the confusion matrix computed above.
cm_plot = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_plot.plot()

In [None]:
# Unpack the 2x2 confusion matrix row by row and report the
# false-positive rate, FP / (FP + TN).
(tn, fp), (fn, tp) = cm
fpr = fp / (fp + tn)
print(fpr)

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

#### Hyperparameter tuning - with SMOTE

In [None]:
# Pipeline that oversamples after preprocessing and before the classifier.
# Uses the imbalanced-learn Pipeline so the sampler can be a pipeline step.
smote, svc = SMOTE(random_state=42), SVC()

svc_pipe_smote = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('svc', svc),
])

In [None]:
# Search grid: same SVC values as the no-SMOTE grid, plus the SMOTE
# neighbourhood size and sampling strategy.
param_grid_smote = dict(
    svc__C=np.logspace(-1, 2, 4),
    svc__gamma=np.array([1e-3, 1e-2, 1e-1, 1]),
    smote__k_neighbors=[3, 5, 7],
    smote__sampling_strategy=['auto', 0.8, 0.6, 0.4],
)


In [None]:
# Exhaustive 5-fold search over `param_grid_smote`, ranked by ROC AUC, using all CPU cores.
param_grid_search_smote = GridSearchCV(
    estimator=svc_pipe_smote,
    param_grid=param_grid_smote,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

In [None]:
# Run the SMOTE + SVC grid search on the training split.
param_grid_search_smote.fit(X_train, y_train)

In [None]:
# Top five SMOTE + SVC parameter combinations by rank, with scores and timings.
summary_columns_smote = [
    'rank_test_score',
    'mean_test_score',
    'mean_train_score',
    'mean_fit_time',
    'mean_score_time',
    'param_svc__C',
    'param_svc__gamma',
    'param_smote__k_neighbors',
    'param_smote__sampling_strategy',
]

(
    pd.DataFrame(param_grid_search_smote.cv_results_)
    .loc[:, summary_columns_smote]
    .set_index('rank_test_score')
    .sort_index()
    .head()
)

In [None]:
# Pull the winning SVC and SMOTE settings out of the grid search.
best_C, best_gamma, best_k, best_sample_strategy = (
    param_grid_search_smote.best_params_[name]
    for name in (
        'svc__C',
        'svc__gamma',
        'smote__k_neighbors',
        'smote__sampling_strategy',
    )
)

In [None]:
# Refit the tuned SMOTE + SVC configuration on the training split and
# evaluate it on the held-out test split.
svc = SVC(C=best_C, gamma=best_gamma)
# random_state matches the SMOTE instance used inside the grid search, so the
# synthetic samples (and therefore the reported test metrics) are reproducible.
smote = SMOTE(
    sampling_strategy=best_sample_strategy,
    k_neighbors=best_k,
    random_state=42,
)

# Fit the preprocessor on the training split only, then apply it to both splits.
x_train_transformed = preprocessor.fit_transform(X_train)
x_test_transformed = preprocessor.transform(X_test)

# Oversample the training data only; the test split is left as-is.
x_train_over, y_train_over = smote.fit_resample(x_train_transformed, y_train)

svc.fit(x_train_over, y_train_over)
y_pred = svc.predict(x_test_transformed)

# Confusion matrix of true vs. predicted test labels.
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Render the confusion matrix of the SMOTE-trained model.
cm_plot = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_plot.plot()

In [None]:
# Unpack the 2x2 confusion matrix row by row and report the
# false-positive rate, FP / (FP + TN), for the SMOTE-trained model.
(tn, fp), (fn, tp) = cm
fpr = fp / (fp + tn)
print(fpr)

In [None]:
# Per-class precision / recall / F1 on the test split for the SMOTE-trained model.
print(classification_report(y_test, y_pred))