In [None]:
import os
import sys

sys.path.append(os.path.join(os.path.abspath("../../"), "src"))

import IPython
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import HTML

from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.preprocessing import FunctionTransformer

%matplotlib inline
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC

from imblearn.over_sampling import SMOTE, KMeansSMOTE , ADASYN,SVMSMOTE,KMeansSMOTE,BorderlineSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import Pipeline

from extract_text_keywords import preprocess_text

In [None]:
# Read in the sampled data sets.
# The dataset root can be overridden with the CV_DATA_DIR environment variable so the
# notebook is not tied to one user's workspace; the default is the original location.
DATA_DIR = os.environ.get('CV_DATA_DIR', '/data/workspace/alexww14/2025-cv/data/sampled-dataset')

# NOTE(review): original_df is read from the "large" raw sample while inputs_df is read from
# the "small" processed sample — confirm this pairing is intentional.
original_df = pd.read_parquet(os.path.join(DATA_DIR, 'raw', 'sample-large.parquet'))
inputs_df = pd.read_parquet(os.path.join(DATA_DIR, 'processed', 'sample-small.parquet'))

In [None]:
# Attach the raw subject, the preprocessed body text and the label column from original_df
# to inputs_df (DataFrame.join aligns the rows on the index).
# NOTE(review): inputs_df is not referenced again in the visible cells, and re-running this
# cell presumably fails because the joined columns would already exist — confirm it is still needed.
inputs_df = inputs_df.join(original_df[['Subject', 'text_preprocessed', 'target_1']])

In [None]:
# train_df, test_df = train_test_split(inputs_df, test_size=0.3, random_state=40)
# X_train_body = train_df['text_preprocessed']
# X_train_subject = train_df['Subject']
# y_train = train_df['target_1']
# X_test_body = test_df['text_preprocessed']
# X_test_subject = test_df['Subject']
# y_test = test_df['target_1']

#### Clean up Subject line and create new feature by combining subject with body 

In [None]:
# Hold out 30% of the rows as a test set (fixed seed so the split is repeatable).
train_df, test_df = train_test_split(original_df, test_size=0.3, random_state=40)

# Work on explicit copies: the split frames are derived from original_df, and adding
# columns to them directly can trigger pandas' SettingWithCopyWarning.
train_df = train_df.copy()
test_df = test_df.copy()

# Apply identical feature engineering to both splits:
#   - subject_preprocessed: cleaned subject line (missing subjects become "")
#   - combined_text_preprocessed: cleaned subject + " " + preprocessed body
# NOTE(review): assumes preprocess_text returns one value per input row — confirm.
for split_df in (train_df, test_df):
    split_df['subject_preprocessed'] = preprocess_text(split_df['Subject'].fillna(""))
    split_df['combined_text_preprocessed'] = (
        split_df['subject_preprocessed'] + " " + split_df['text_preprocessed']
    )


In [None]:
# Three alternative text inputs (body only, subject only, subject + body);
# each model below is trained on exactly one of them.
X_train_body, X_train_subject, X_train_combined = (
    train_df['text_preprocessed'],
    train_df['subject_preprocessed'],
    train_df['combined_text_preprocessed'],
)
y_train = train_df['target_1']

# Matching views of the held-out test split.
X_test_body, X_test_subject, X_test_combined = (
    test_df['text_preprocessed'],
    test_df['subject_preprocessed'],
    test_df['combined_text_preprocessed'],
)
y_test = test_df['target_1']

#### EDA

#### Check class distribution

In [None]:
# Share of training rows per target class (class counts divided by the number of training rows).
y_train.value_counts() / len(y_train)

In [None]:
def mean_std_cross_val_scores(model, X_train, y_train, cv=5, **kwargs):
    """
    Run cross-validation and summarise each returned metric as "mean (+/- std)".

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    cv : default=5
        Passed straight through to ``cross_validate`` as its ``cv`` argument.
        Exposed as a parameter so callers can override the previous fixed value
        of 5 without hitting a duplicate-keyword error.
    **kwargs :
        Any other keyword arguments, forwarded unchanged to ``cross_validate``
        (e.g. ``scoring``, ``return_train_score``).

    Returns
    ----------
        pandas Series of strings formatted as "mean (+/- std)" with three
        decimals, indexed by the keys returned from ``cross_validate``
    """

    # Build the per-fold table once; columns are the cross_validate result keys.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, cv=cv, **kwargs))

    mean_scores = scores.mean()
    std_scores = scores.std()

    # mean_scores and std_scores share the same index order, so zip pairs them correctly.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [None]:
# Scorers used during cross-validation; each one treats 'malicious' as the positive class.
scoring = {
    metric_name: make_scorer(metric_fn, pos_label='malicious')
    for metric_name, metric_fn in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
}

## Bernoulli NB 

In [None]:
# Candidate training inputs keyed by a short label; the comparison loops iterate over these.
inputs_dict = dict(
    body=X_train_body,
    subject=X_train_subject,
    combined=X_train_combined,
)


#### Compare which training input (body, subject, or combined) gives the best result with Bernoulli NB

In [None]:
# Cross-validate the same CountVectorizer -> BernoulliNB pipeline on every candidate input
# and collect one "mean (+/- std)" column per input.
result_dict = dict()

for name, X in inputs_dict.items():
    count_vec, model = CountVectorizer(binary=False), BernoulliNB()
    pipe_nb = make_pipeline(count_vec, model)
    result_dict[name] = mean_std_cross_val_scores(
        pipe_nb, X, y_train, scoring=scoring, return_train_score=True
    )

pd.DataFrame(result_dict)


#### Hyperparameter tuning - email subject only

In [None]:
# Search space for the BernoulliNB step of the make_pipeline pipeline.
# NOTE(review): class_prior entries are presumably ordered like the fitted classes_ — confirm
# which class each weight applies to.
param_grid = dict(
    bernoullinb__alpha=[0.1, 0.5, 1.0, 2.0],
    bernoullinb__class_prior=[None, [0.5, 0.5], [0.4, 0.6], [0.6, 0.4], [0.3, 0.7], [0.7, 0.3]],
)

In [None]:
# Fresh vectorizer + classifier, chained into the pipeline that the grid search tunes.
count_vec, model = CountVectorizer(binary=False), BernoulliNB()
pipe_nb = make_pipeline(count_vec, model)

In [None]:
# 5-fold grid search over param_grid on the subject text, scored with every metric in
# `scoring`; the configuration with the best F1 is refit on the full training data.
param_grid_search = GridSearchCV(
    estimator=pipe_nb,
    param_grid=param_grid,
    scoring=scoring,
    refit='f1',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

param_grid_search.fit(X_train_subject, y_train)

In [None]:
# List the result columns available in cv_results_ (used to pick the columns shown next).
param_grid_search.cv_results_.keys()

In [None]:
# Five best configurations by F1 rank, with train/test F1, timings and the tuned parameters.
(
    pd.DataFrame(param_grid_search.cv_results_)
    .loc[:, [
        'rank_test_f1',
        'mean_test_f1',
        'mean_train_f1',
        'mean_fit_time',
        'mean_score_time',
        'param_bernoullinb__alpha',
        'param_bernoullinb__class_prior',
    ]]
    .set_index('rank_test_f1')
    .sort_index()
    .head(5)
)

#### Test best model

In [None]:
# Pull the winning hyperparameters out of the grid search.
best_nb_alpha, best_class_prior = (
    param_grid_search.best_params_[key]
    for key in ('bernoullinb__alpha', 'bernoullinb__class_prior')
)

# Vectorize the subject text: vocabulary is learned on the training split only.
count_vec = CountVectorizer(binary=False)
X_train_vec = count_vec.fit_transform(X_train_subject)
X_test_vec = count_vec.transform(X_test_subject)

# Refit BernoulliNB with the best settings and predict the held-out split.
best_nb = BernoulliNB(alpha=best_nb_alpha, class_prior=best_class_prior).fit(X_train_vec, y_train)
y_pred = best_nb.predict(X_test_vec)

In [None]:
# Confusion matrix for the tuned BernoulliNB on the test split.
# The label order is pinned to the classifier's classes_ and the same class names are shown
# on the axes, instead of relying on implicit ordering and bare 0/1 tick labels.
cm = ConfusionMatrixDisplay(
    confusion_matrix(y_test, y_pred, labels=best_nb.classes_),
    display_labels=best_nb.classes_,
)
cm.plot()

In [None]:
# Text summary of the test-split predictions (y_test vs. y_pred) for the tuned BernoulliNB.
print(classification_report(y_true=y_test, y_pred=y_pred))

## Bernoulli NB - with SMOTE

#### Hyperparameter tuning

In [None]:
# Pipeline here is imblearn's Pipeline (imported after sklearn's, so it shadows it);
# that variant is the one that accepts a sampler step such as SMOTE.
pipe_nb = Pipeline([
    ('count_vec', CountVectorizer(binary=False)),  # token counts from the text
    ('smote', SMOTE(random_state=42)),             # oversample the minority class (seeded)
    ('nb', BernoulliNB()),                         # classifier
])

param_grid = {
    'count_vec__max_df': [0.8, 0.9, 1.0],          # Filter out very common words
    'count_vec__min_df': [1, 3, 5],                # Filter out rare words
    'count_vec__max_features': [500, 1000, 5000],
    'smote__k_neighbors': [3, 5, 7],
    # For a binary target 'auto' already oversamples the minority class up to the size of
    # the majority class, i.e. a ratio of 1; listing 1 as well only repeated the same fits,
    # so it is left out here.
    'smote__sampling_strategy': ['auto', 0.8, 0.6],
}


In [None]:
# 5-fold grid search of the SMOTE + BernoulliNB pipeline on the subject text, ranked by ROC AUC.
param_grid_search = GridSearchCV(
    estimator=pipe_nb,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
param_grid_search.fit(X_train_subject, y_train)

In [None]:
# List the result columns available in cv_results_ for the SMOTE + BernoulliNB search.
param_grid_search.cv_results_.keys()

In [None]:
# All searched configurations ordered by test-score rank, with timings and tuned parameters.
(
    pd.DataFrame(param_grid_search.cv_results_)
    .loc[:, [
        'rank_test_score',
        'mean_test_score',
        'mean_train_score',
        'mean_fit_time',
        'mean_score_time',
        'param_smote__k_neighbors',
        'param_smote__sampling_strategy',
        'param_count_vec__max_df',
        'param_count_vec__max_features',
        'param_count_vec__min_df',
    ]]
    .set_index('rank_test_score')
    .sort_index()
)

#### Try best model 

In [None]:
# Show the parameter combination selected by the SMOTE + BernoulliNB grid search.
param_grid_search.best_params_

In [None]:
# Pull the winning settings out of the grid search, in the order of the keys below.
(
    best_count_vec__max_df,
    best_count_vec__max_features,
    best_count_vec__min_df,
    best_smote__k_neighbors,
    best_smote_sampling_strategy,
) = (
    param_grid_search.best_params_[key]
    for key in (
        'count_vec__max_df',
        'count_vec__max_features',
        'count_vec__min_df',
        'smote__k_neighbors',
        'smote__sampling_strategy',
    )
)

# Rebuild each stage by hand with the tuned settings.
count_vec = CountVectorizer(
    max_df=best_count_vec__max_df,
    min_df=best_count_vec__min_df,
    max_features=best_count_vec__max_features,
)
smote = SMOTE(
    sampling_strategy=best_smote_sampling_strategy,
    k_neighbors=best_smote__k_neighbors,
    random_state=42,
)
model = BernoulliNB()

# Vectorize: vocabulary is learned on the training split only.
X_train_subject_vec = count_vec.fit_transform(X_train_subject)
X_test_subject_vec = count_vec.transform(X_test_subject)

# Oversample the training split only; the test split is left untouched.
X_train_subject_vec_over, y_train_over = smote.fit_resample(X_train_subject_vec, y_train)

y_pred = model.fit(X_train_subject_vec_over, y_train_over).predict(X_test_subject_vec)

cm = confusion_matrix(y_test, y_pred)

In [None]:
# False-positive rate = FP / (FP + TN), read from the 2x2 confusion matrix.
# NOTE(review): assumes the positive class is the second label in cm's ordering — confirm.
(tn, fp), (fn, tp) = cm
fpr = fp / (tn + fp)
print(fpr)

In [None]:
# Text summary of the test-split predictions for the SMOTE + BernoulliNB model.
print(classification_report(y_true=y_test, y_pred=y_pred))

## Gaussian NB

In [None]:
# Cross-validate a TF-IDF -> dense -> GaussianNB pipeline on every candidate input
# and collect one "mean (+/- std)" column per input.
result_dict = dict()

for name, X in inputs_dict.items():
    tfidf_vec, model = TfidfVectorizer(), GaussianNB()
    # Convert the sparse TF-IDF matrix to a dense array before it reaches GaussianNB.
    to_dense = FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)
    pipe_gnb = make_pipeline(tfidf_vec, to_dense, model)
    result_dict[name] = mean_std_cross_val_scores(
        pipe_gnb, X, y_train, scoring=scoring, return_train_score=True
    )

pd.DataFrame(result_dict)

In [None]:
# TF-IDF on the combined subject + body text, converted to dense arrays;
# the vocabulary and IDF weights are learned on the training split only.
tfidf_vec = TfidfVectorizer()
x_train_vec = tfidf_vec.fit_transform(X_train_combined).toarray()
x_test_vec = tfidf_vec.transform(X_test_combined).toarray()

# NOTE(review): priors are presumably matched to the fitted classes_ order — confirm which
# class receives the 0.9 prior.
gnb = GaussianNB(priors=[0.1,0.9]).fit(x_train_vec, y_train)
y_pred = gnb.predict(x_test_vec)

In [None]:
# Confusion matrix for GaussianNB on the test split. The label order is pinned to the
# classifier's classes_ so the matrix layout is explicit and always covers every class.
cm = confusion_matrix(y_test, y_pred, labels=gnb.classes_)

# Show the class names on the axes instead of bare 0/1 tick labels.
cm_plot = ConfusionMatrixDisplay(cm, display_labels=gnb.classes_)
cm_plot.plot()

In [None]:
# False-positive rate = FP / (FP + TN), read from the 2x2 confusion matrix.
# NOTE(review): assumes the positive class is the second label in cm's ordering — confirm.
(tn, fp), (fn, tp) = cm
fpr = fp / (tn + fp)
print(fpr)

In [None]:
# Text summary of the test-split predictions for the GaussianNB model.
print(classification_report(y_true=y_test, y_pred=y_pred))

## Gaussian NB - with SMOTE 

In [None]:
tfidf_vec = TfidfVectorizer()

# TF-IDF on the combined text, converted to dense arrays; fitted on the training split only.
x_train_vec = tfidf_vec.fit_transform(X_train_combined).toarray()
x_test_vec = tfidf_vec.transform(X_test_combined).toarray()

# Upsample the training split with SMOTE to a 1:1 class ratio.
# random_state is fixed (42, as in the other SMOTE steps in this notebook) so the
# synthetic samples, and therefore the reported metrics, are reproducible across runs.
oversample = SMOTE(sampling_strategy=1, random_state=42)
x_train_vec_over, y_train_over = oversample.fit_resample(x_train_vec, y_train)

# Fit on the resampled training data; evaluate on the untouched test split.
gnb = GaussianNB()
gnb.fit(x_train_vec_over, y_train_over)
y_pred = gnb.predict(x_test_vec)

In [None]:
# Confusion matrix for GaussianNB + SMOTE on the test split. The label order is pinned to
# the classifier's classes_ so the matrix layout is explicit and always covers every class.
cm = confusion_matrix(y_test, y_pred, labels=gnb.classes_)

# Show the class names on the axes instead of bare 0/1 tick labels.
cm_plot = ConfusionMatrixDisplay(cm, display_labels=gnb.classes_)
cm_plot.plot()

In [None]:
# False-positive rate = FP / (FP + TN), read from the 2x2 confusion matrix.
# NOTE(review): assumes the positive class is the second label in cm's ordering — confirm.
(tn, fp), (fn, tp) = cm
fpr = fp / (tn + fp)
print(fpr)

In [None]:
# Text summary of the test-split predictions for the GaussianNB + SMOTE model.
print(classification_report(y_true=y_test, y_pred=y_pred))

#### Hyperparameter tuning - Gaussian NB with SMOTE

In [None]:
# Building blocks of the tuned pipeline. The named objects are the ones actually placed in
# the pipeline, so nothing is created and left unused, and the SMOTE step is seeded.
to_dense = FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)
tfidf_vec = TfidfVectorizer()
smote = SMOTE(random_state=42)
model = GaussianNB()

# Step names are referenced by the grid-search parameter keys (tfidf__*, smote__*).
pipe_gnb = Pipeline([
    ('tfidf', tfidf_vec),      # Step 1: TF-IDF
    ('to_dense', to_dense),    # Step 2: Make it dense for GNB
    ('smote', smote),          # Step 3: SMOTE oversampling
    ('gnb', model),            # Step 4: Gaussian Naive Bayes
])


In [None]:
# Search space for the TF-IDF and SMOTE steps of pipe_gnb.
param_grid = dict(
    tfidf__max_df=[0.7, 0.8, 0.9, 1.0],        # Filter out very common words
    tfidf__min_df=[1, 3, 5, 7],                # Filter out rare words
    tfidf__max_features=[500, 1000, 5000],
    smote__k_neighbors=[3, 5, 7],
    smote__sampling_strategy=['auto', 0.8, 0.6, 0.4],
)

In [None]:
# scoring = {
#     'precision': make_scorer(precision_score, pos_label='malicious'),
#     'recall': make_scorer(recall_score, pos_label='malicious'),
#     'f1': make_scorer(f1_score, pos_label='malicious'),
#     'roc-auc': make_scorer(roc_auc_score, pos_label='malicious')
# }

In [None]:
# 5-fold grid search of the TF-IDF + SMOTE + GaussianNB pipeline on the combined text,
# ranked by ROC AUC.
param_grid_search = GridSearchCV(
    estimator=pipe_gnb,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

param_grid_search.fit(X_train_combined, y_train)

In [None]:
# List the result columns available in cv_results_ for the GaussianNB + SMOTE search.
param_grid_search.cv_results_.keys()

In [None]:
# All searched configurations ordered by test-score rank, with timings and tuned parameters.
(
    pd.DataFrame(param_grid_search.cv_results_)
    .loc[:, [
        'rank_test_score',
        'mean_test_score',
        'mean_train_score',
        'mean_fit_time',
        'mean_score_time',
        'param_smote__k_neighbors',
        'param_smote__sampling_strategy',
        'param_tfidf__max_df',
        'param_tfidf__max_features',
        'param_tfidf__min_df',
    ]]
    .set_index('rank_test_score')
    .sort_index()
)

#### Try best model

In [None]:
# Winning settings from the GaussianNB + SMOTE grid search.
best_tfidf__max_df = param_grid_search.best_params_['tfidf__max_df']
best_tfidf__max_features = param_grid_search.best_params_['tfidf__max_features']
best_tfidf__min_df = param_grid_search.best_params_['tfidf__min_df']
best_smote__k_neighbors = param_grid_search.best_params_['smote__k_neighbors']
best_smote_sampling_strategy = param_grid_search.best_params_['smote__sampling_strategy']

# Rebuild the stages by hand with the tuned settings.
tfidf_vec = TfidfVectorizer(max_df=best_tfidf__max_df, max_features=best_tfidf__max_features, min_df=best_tfidf__min_df)
# random_state=42 matches the SMOTE step used inside the tuned pipeline; without it the
# refit would use different synthetic samples than the configuration that was selected,
# and the test metrics would change from run to run.
smote = SMOTE(sampling_strategy=best_smote_sampling_strategy, k_neighbors=best_smote__k_neighbors, random_state=42)

# TF-IDF on the combined text, converted to dense arrays; fitted on the training split only.
x_train_vec = tfidf_vec.fit_transform(X_train_combined).toarray()
x_test_vec = tfidf_vec.transform(X_test_combined).toarray()

# Upsample the training split only; the test split is left untouched.
x_train_vec_over, y_train_over = smote.fit_resample(x_train_vec, y_train)

model = GaussianNB()
model.fit(x_train_vec_over, y_train_over)
y_pred = model.predict(x_test_vec)

In [None]:
# Confusion matrix of the tuned GaussianNB + SMOTE model on the test split, followed by
# the false-positive rate FP / (FP + TN).
# NOTE(review): assumes the positive class is the second label in cm's ordering — confirm.
cm = confusion_matrix(y_test, y_pred)

(tn, fp), (fn, tp) = cm
fpr = fp / (tn + fp)
print(fpr)

In [None]:
# Plot the confusion matrix with the class names on the axes instead of bare 0/1 tick labels.
# NOTE(review): assumes cm's label order matches model.classes_ (both are expected to be
# the sorted class labels) — confirm.
cm_plot = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
cm_plot.plot()

In [None]:
# Text summary of the test-split predictions for the tuned GaussianNB + SMOTE model.
print(classification_report(y_true=y_test, y_pred=y_pred))