In [2]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix)

In [3]:
# Read data
df = pd.read_csv('dataset_phishing.csv')

In [4]:
features = ['shortest_word_path',
 'ratio_intMedia',
 'links_in_tags',
 'nb_hyphens',
 'page_rank',
 'avg_word_path',
 'ratio_extHyperlinks',
 'longest_words_raw',
 'google_index',
 'length_hostname',
 'longest_word_host',
 'domain_registration_length',
 'nb_www',
 'nb_underscore',
 'nb_dots',
 'ratio_extMedia',
 'phish_hints',
 'domain_in_title',
 'web_traffic',
 'safe_anchor',
 'nb_space',
 'shortening_service',
 'ip',
 'domain_age',
 'nb_qm',
 'nb_hyperlinks',
 'nb_slash']

In [5]:
X = df[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

le = LabelEncoder()
y = le.fit_transform(df["status"])

# Step 1: Split data into 70% train and 30% temp (validation + test)
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, random_state=42, stratify=y)

# Step 2: Split the temp set into 50% validation and 50% test (15% each of the original data)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

### Traditional Method 2: SVM Model

In [17]:
from sklearn.svm import SVC  # Import SVM classifier

# Hyperparameter tuning for SVM
svm_param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf']
}

svm_grid_search = GridSearchCV(SVC(probability=True, random_state=42), svm_param_grid, cv=5)
svm_grid_search.fit(X_train, y_train)

# Best parameters for SVM
print("Best parameters for SVM:", svm_grid_search.best_params_)

# Validate and test the best SVM model
svm_model = svm_grid_search.best_estimator_
y_val_pred_svm = svm_model.predict(X_val)

# Compute metrics for validation set
svm_val_accuracy = accuracy_score(y_val, y_val_pred_svm)
svm_val_precision = precision_score(y_val, y_val_pred_svm)
svm_val_recall = recall_score(y_val, y_val_pred_svm)
svm_val_f1_score = f1_score(y_val, y_val_pred_svm)

# Print validation results
print("SVM Validation Results:")
print("Validation Accuracy:", svm_val_accuracy)
print("Validation Precision:", svm_val_precision)
print("Validation Recall:", svm_val_recall)
print("Validation F1 Score:", svm_val_f1_score)

y_pred_svm = svm_model.predict(X_test)
svm_conf_matrix = confusion_matrix(y_test, y_pred_svm)

# Compute metrics for SVM
svm_tn, svm_fp, svm_fn, svm_tp = svm_conf_matrix.ravel()
svm_accuracy = (svm_tp + svm_tn) / (svm_tp + svm_tn + svm_fp + svm_fn)
svm_precision = svm_tp / (svm_tp + svm_fp)
svm_recall = svm_tp / (svm_tp + svm_fn)
svm_f1_score = (2 * (svm_precision * svm_recall)) / (svm_precision + svm_recall)

# Print SVM results
print("Confusion Matrix:\n", svm_conf_matrix)
print("Accuracy:", svm_accuracy)
print("Precision:", svm_precision)
print("Recall:", svm_recall)
print("F1 Score:", svm_f1_score)

Best parameters for SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
SVM Validation Results:
Validation Accuracy: 0.9574095682613769
Validation Precision: 0.951764705882353
Validation Recall: 0.9619500594530321
Validation F1 Score: 0.9568302779420461
Confusion Matrix:
 [[813  29]
 [ 38 835]]
Accuracy: 0.960932944606414
Precision: 0.9664351851851852
Recall: 0.9564719358533792
F1 Score: 0.9614277489925158


### Traditional Method 3: Decision Tree

In [18]:
from sklearn.tree import DecisionTreeClassifier
# Hyperparameter tuning for Decision Tree
dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

dt_grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), dt_param_grid, cv=5)
dt_grid_search.fit(X_train, y_train)

# Best parameters for Decision Tree
print("Best parameters for Decision Tree:", dt_grid_search.best_params_)

# Validate and test the best Decision Tree model
dt_model = dt_grid_search.best_estimator_
y_val_pred_dt = dt_model.predict(X_val)

# Compute metrics for validation set
dt_val_accuracy = accuracy_score(y_val, y_val_pred_dt)
dt_val_precision = precision_score(y_val, y_val_pred_dt)
dt_val_recall = recall_score(y_val, y_val_pred_dt)
dt_val_f1_score = f1_score(y_val, y_val_pred_dt)

# Print validation results
print("Decision Tree Validation Results:")
print("Validation Accuracy:", dt_val_accuracy)
print("Validation Precision:", dt_val_precision)
print("Validation Recall:", dt_val_recall)
print("Validation F1 Score:", dt_val_f1_score)

y_pred_dt = dt_model.predict(X_test)
dt_conf_matrix = confusion_matrix(y_test, y_pred_dt)

# Compute metrics for Decision Tree
dt_tn, dt_fp, dt_fn, dt_tp = dt_conf_matrix.ravel()
dt_accuracy = (dt_tp + dt_tn) / (dt_tp + dt_tn + dt_fp + dt_fn)
dt_precision = dt_tp / (dt_tp + dt_fp)
dt_recall = dt_tp / (dt_tp + dt_fn)
dt_f1_score = (2 * (dt_precision * dt_recall)) / (dt_precision + dt_recall)

# Print Decision Tree results
print("Decision Tree Test Results:")
print("Confusion Matrix:\n", dt_conf_matrix)
print("Accuracy:", dt_accuracy)
print("Precision:", dt_precision)
print("Recall:", dt_recall)
print("F1 Score:", dt_f1_score)

Best parameters for Decision Tree: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Decision Tree Validation Results:
Validation Accuracy: 0.9323220536756126
Validation Precision: 0.9239766081871345
Validation Recall: 0.93935790725327
Validation F1 Score: 0.9316037735849056
Decision Tree Test Results:
Confusion Matrix:
 [[796  46]
 [ 42 831]]
Accuracy: 0.9486880466472303
Precision: 0.9475484606613455
Recall: 0.9518900343642611
F1 Score: 0.9497142857142856


### ENSEMBLE METHOD 1: RANDOM FOREST MODEL

In [6]:
# Hyperparameter tuning for Random Forest
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=5)
rf_grid_search.fit(X_train, y_train)

# Best parameters for Random Forest
print("Best parameters for Random Forest:", rf_grid_search.best_params_)

# Validate and test the best Random Forest model
rf_model = rf_grid_search.best_estimator_
y_val_pred_rf = rf_model.predict(X_val)

# Compute metrics for validation set
rf_val_accuracy = accuracy_score(y_val, y_val_pred_rf)
rf_val_precision = precision_score(y_val, y_val_pred_rf)
rf_val_recall = recall_score(y_val, y_val_pred_rf)
rf_val_f1_score = f1_score(y_val, y_val_pred_rf)

# Print validation results
print("Random Forest Validation Results:")
print("Validation Accuracy:", rf_val_accuracy)
print("Validation Precision:", rf_val_precision)
print("Validation Recall:", rf_val_recall)
print("Validation F1 Score:", rf_val_f1_score)

y_pred_rf = rf_model.predict(X_test)
rf_conf_matrix = confusion_matrix(y_test, y_pred_rf)

# Compute metrics for Random Forest
rf_tn, rf_fp, rf_fn, rf_tp = rf_conf_matrix.ravel()
rf_accuracy = (rf_tp + rf_tn) / (rf_tp + rf_tn + rf_fp + rf_fn)
rf_precision = rf_tp / (rf_tp + rf_fp)
rf_recall = rf_tp / (rf_tp + rf_fn)
rf_f1_score = (2 * (rf_precision * rf_recall)) / (rf_precision + rf_recall)

# Print Random Forest results
print("Random Forest Test Results:")
print("Confusion Matrix:\n", rf_conf_matrix)
print("Accuracy:", rf_accuracy)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("F1 Score:", rf_f1_score)

Best parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest Validation Results:
Validation Accuracy: 0.9574095682613769
Validation Precision: 0.9507042253521126
Validation Recall: 0.9631391200951248
Validation F1 Score: 0.9568812758417011
Random Forest Test Results:
Confusion Matrix:
 [[812  30]
 [ 25 848]]
Accuracy: 0.967930029154519
Precision: 0.9658314350797267
Recall: 0.9713631156930126
F1 Score: 0.9685893774985722


# BAGGING: CLASSIFIER

In [14]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib

# Define the base estimator
base_estimator = DecisionTreeClassifier(random_state=42)

# Define the hyperparameter grid for Bagging
bagging_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_samples': [0.5, 0.75, 1.0],  # Fraction of samples to use for each base estimator
    'max_features': [0.5, 0.75, 1.0],  # Fraction of features to use for each base estimator
}

# Create the BaggingClassifier with the base_estimator
bagging_classifier = BaggingClassifier(random_state=42)

# Perform Grid Search for Bagging Classifier
bagging_grid_search = GridSearchCV(bagging_classifier, bagging_param_grid, cv=5)
bagging_grid_search.fit(X_train, y_train)

# Best parameters for Bagging Classifier
print("Best parameters for Bagging Classifier:", bagging_grid_search.best_params_)

# Validate and test the best Bagging model
bagging_model = bagging_grid_search.best_estimator_
y_val_pred_bagging = bagging_model.predict(X_val)

# Compute metrics for validation set
bagging_val_accuracy = accuracy_score(y_val, y_val_pred_bagging)
bagging_val_precision = precision_score(y_val, y_val_pred_bagging, average='weighted')
bagging_val_recall = recall_score(y_val, y_val_pred_bagging, average='weighted')
bagging_val_f1_score = f1_score(y_val, y_val_pred_bagging, average='weighted')

# Print validation results
print("Bagging Classifier Validation Results:")
print("Validation Accuracy:", bagging_val_accuracy)
print("Validation Precision:", bagging_val_precision)
print("Validation Recall:", bagging_val_recall)
print("Validation F1 Score:", bagging_val_f1_score)

# Test the best Bagging model
y_pred_bagging = bagging_model.predict(X_test)
bagging_conf_matrix = confusion_matrix(y_test, y_pred_bagging)

# Compute metrics for Bagging Classifier
bagging_tn, bagging_fp, bagging_fn, bagging_tp = bagging_conf_matrix.ravel()
bagging_accuracy = (bagging_tp + bagging_tn) / (bagging_tp + bagging_tn + bagging_fp + bagging_fn)
bagging_precision = bagging_tp / (bagging_tp + bagging_fp)
bagging_recall = bagging_tp / (bagging_tp + bagging_fn)
bagging_f1_score = (2 * (bagging_precision * bagging_recall)) / (bagging_precision + bagging_recall)

# Print Bagging Classifier results
print("Bagging Classifier Test Results:")
print("Confusion Matrix:\n", bagging_conf_matrix)
print("Accuracy:", bagging_accuracy)
print("Precision:", bagging_precision)
print("Recall:", bagging_recall)
print("F1 Score:", bagging_f1_score)


Best parameters for Bagging Classifier: {'max_features': 0.5, 'max_samples': 0.75, 'n_estimators': 100}
Bagging Classifier Validation Results:
Validation Accuracy: 0.9626604434072346
Validation Precision: 0.9628347237180526
Validation Recall: 0.9626604434072346
Validation F1 Score: 0.962663697465558
Bagging Classifier Test Results:
Confusion Matrix:
 [[810  32]
 [ 20 853]]
Accuracy: 0.9696793002915451
Precision: 0.9638418079096045
Recall: 0.97709049255441
F1 Score: 0.9704209328782707


# Bagging: Extra Tree

In [15]:
from sklearn.ensemble import ExtraTreesClassifier

# Define hyperparameter grid for Extra Trees
et_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Perform Grid Search for Extra Trees
et_grid_search = GridSearchCV(ExtraTreesClassifier(random_state=42), et_param_grid, cv=5)
et_grid_search.fit(X_train, y_train)

# Best parameters for Extra Trees
print("Best parameters for Extra Trees:", et_grid_search.best_params_)

# Validate and test the best Extra Trees model
et_model = et_grid_search.best_estimator_
y_val_pred_et = et_model.predict(X_val)

# Compute metrics for validation set
et_val_accuracy = accuracy_score(y_val, y_val_pred_et)
et_val_precision = precision_score(y_val, y_val_pred_et)
et_val_recall = recall_score(y_val, y_val_pred_et)
et_val_f1_score = f1_score(y_val, y_val_pred_et)

# Print validation results
print("Extra Trees Validation Results:")
print("Validation Accuracy:", et_val_accuracy)
print("Validation Precision:", et_val_precision)
print("Validation Recall:", et_val_recall)
print("Validation F1 Score:", et_val_f1_score)

y_pred_et = et_model.predict(X_test)
et_conf_matrix = confusion_matrix(y_test, y_pred_et)

# Compute metrics for Extra Trees
et_tn, et_fp, et_fn, et_tp = et_conf_matrix.ravel()
et_accuracy = (et_tp + et_tn) / (et_tp + et_tn + et_fp + et_fn)
et_precision = et_tp / (et_tp + et_fp)
et_recall = et_tp / (et_tp + et_fn)
et_f1_score = (2 * (et_precision * et_recall)) / (et_precision + et_recall)

# Print Extra Trees results
print("Extra Trees Test Results:")
print("Confusion Matrix:\n", et_conf_matrix)
print("Accuracy:", et_accuracy)
print("Precision:", et_precision)
print("Recall:", et_recall)
print("F1 Score:", et_f1_score)


Best parameters for Extra Trees: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Extra Trees Validation Results:
Validation Accuracy: 0.9579929988331388
Validation Precision: 0.9507620164126612
Validation Recall: 0.9643281807372176
Validation F1 Score: 0.9574970484061394
Extra Trees Test Results:
Confusion Matrix:
 [[816  26]
 [ 28 845]]
Accuracy: 0.9685131195335277
Precision: 0.9701492537313433
Recall: 0.9679266895761741
F1 Score: 0.9690366972477065


In [20]:
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib

# Assuming dt_model and svm_model are your best models from earlier code
# Create a voting classifier to combine the Decision Tree and SVM
ensemble_model = VotingClassifier(estimators=[
    ('decision_tree', dt_model),
    ('svm', svm_model)],
    voting='soft'  # 'soft' voting uses predicted probabilities
)

# Fit the ensemble model on the training data
ensemble_model.fit(X_train, y_train)

# Validate and test the ensemble model
y_val_pred_ensemble = ensemble_model.predict(X_val)

# Compute metrics for validation set
ensemble_val_accuracy = accuracy_score(y_val, y_val_pred_ensemble)
ensemble_val_precision = precision_score(y_val, y_val_pred_ensemble)
ensemble_val_recall = recall_score(y_val, y_val_pred_ensemble)
ensemble_val_f1_score = f1_score(y_val, y_val_pred_ensemble)

# Print validation results for the ensemble model
print("Ensemble Model Validation Results:")
print("Validation Accuracy:", ensemble_val_accuracy)
print("Validation Precision:", ensemble_val_precision)
print("Validation Recall:", ensemble_val_recall)
print("Validation F1 Score:", ensemble_val_f1_score)

# Test the ensemble model
y_pred_ensemble = ensemble_model.predict(X_test)
ensemble_conf_matrix = confusion_matrix(y_test, y_pred_ensemble)

# Compute metrics for the ensemble model
ensemble_tn, ensemble_fp, ensemble_fn, ensemble_tp = ensemble_conf_matrix.ravel()
ensemble_accuracy = (ensemble_tp + ensemble_tn) / (ensemble_tp + ensemble_tn + ensemble_fp + ensemble_fn)
ensemble_precision = ensemble_tp / (ensemble_tp + ensemble_fp)
ensemble_recall = ensemble_tp / (ensemble_tp + ensemble_fn)
ensemble_f1_score = (2 * (ensemble_precision * ensemble_recall)) / (ensemble_precision + ensemble_recall)

# Print ensemble model results
print("Ensemble Model Test Results:")
print("Confusion Matrix:\n", ensemble_conf_matrix)
print("Accuracy:", ensemble_accuracy)
print("Precision:", ensemble_precision)
print("Recall:", ensemble_recall)
print("F1 Score:", ensemble_f1_score)

Ensemble Model Validation Results:
Validation Accuracy: 0.9434072345390898
Validation Precision: 0.9325581395348838
Validation Recall: 0.9536266349583828
Validation F1 Score: 0.9429747207524986
Ensemble Model Test Results:
Confusion Matrix:
 [[804  38]
 [ 36 837]]
Accuracy: 0.9568513119533528
Precision: 0.9565714285714285
Recall: 0.9587628865979382
F1 Score: 0.9576659038901602


#  KNN

In [22]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix)

# Define KNN model and hyperparameter grid
knn = KNeighborsClassifier()
param_grid = {
    'n_neighbors': range(1, 21),           # Test k values from 1 to 20
    'weights': ['uniform', 'distance'],    # Uniform or distance-weighted voting
    'metric': ['euclidean', 'manhattan', 'minkowski'],  # Different distance metrics
    'p': [1, 2]                            # Power parameter for Minkowski (p=1 is Manhattan, p=2 is Euclidean)
}

# Perform GridSearchCV with validation set
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='recall', verbose=1)
grid_search.fit(X_train, y_train)

# Get the best model from the grid search
best_knn = grid_search.best_estimator_

# Evaluate on the validation set
y_val_pred = best_knn.predict(X_val)
y_val_pred_prob = best_knn.predict_proba(X_val)[:, 1]

# Calculate validation metrics
val_accuracy = accuracy_score(y_val, y_val_pred)
val_precision = precision_score(y_val, y_val_pred)
val_recall = recall_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)
val_roc_auc = roc_auc_score(y_val, y_val_pred_prob)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)

# Display validation results
print("\nValidation Results:")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"Precision: {val_precision:.4f}")
print(f"Recall: {val_recall:.4f}")
print(f"F1 Score: {val_f1:.4f}")
print(f"ROC-AUC Score: {val_roc_auc:.4f}")
print(f"Confusion Matrix:\n{val_conf_matrix}")

# Final evaluation on the test set
y_test_pred = best_knn.predict(X_test)
y_test_pred_prob = best_knn.predict_proba(X_test)[:, 1]

# Calculate test metrics
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_roc_auc = roc_auc_score(y_test, y_test_pred_prob)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)

# Display final test results
print("\nFinal Test Results:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")
print(f"ROC-AUC Score: {test_roc_auc:.4f}")
print(f"Confusion Matrix:\n{test_conf_matrix}")

Fitting 5 folds for each of 240 candidates, totalling 1200 fits

Validation Results:
Accuracy: 0.9516
Precision: 0.9397
Recall: 0.9631
F1 Score: 0.9513
ROC-AUC Score: 0.9890
Confusion Matrix:
[[821  52]
 [ 31 810]]

Final Test Results:
Accuracy: 0.9638
Precision: 0.9666
Recall: 0.9622
F1 Score: 0.9644
ROC-AUC Score: 0.9899
Confusion Matrix:
[[813  29]
 [ 33 840]]


# Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix)

# Define the Logistic Regression model and a hyperparameter grid for tuning
logreg = LogisticRegression(max_iter=1000)  # Default Logistic Regression
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'solver': ['liblinear', 'lbfgs']  # Solvers
}

# Perform GridSearchCV with validation set
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='recall', verbose=1)
grid_search.fit(X_train, y_train)

# Get the best model from the grid search
best_logreg = grid_search.best_estimator_

# Evaluate on the validation set
y_val_pred = best_logreg.predict(X_val)
y_val_pred_prob = best_logreg.predict_proba(X_val)[:, 1]

# Calculate validation metrics
val_accuracy = accuracy_score(y_val, y_val_pred)
val_precision = precision_score(y_val, y_val_pred)
val_recall = recall_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)
val_roc_auc = roc_auc_score(y_val, y_val_pred_prob)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)

# Display validation results
print("\nValidation Results:")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"Precision: {val_precision:.4f}")
print(f"Recall: {val_recall:.4f}")
print(f"F1 Score: {val_f1:.4f}")
print(f"ROC-AUC Score: {val_roc_auc:.4f}")
print(f"Confusion Matrix:\n{val_conf_matrix}")

# Final evaluation on the test set
y_test_pred = best_logreg.predict(X_test)
y_test_pred_prob = best_logreg.predict_proba(X_test)[:, 1]

# Calculate test metrics
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_roc_auc = roc_auc_score(y_test, y_test_pred_prob)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)

# Display final test results
print("\nFinal Test Results:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")
print(f"ROC-AUC Score: {test_roc_auc:.4f}")
print(f"Confusion Matrix:\n{test_conf_matrix}")


Fitting 5 folds for each of 8 candidates, totalling 40 fits

Validation Results:
Accuracy: 0.9347
Precision: 0.9224
Recall: 0.9465
F1 Score: 0.9343
ROC-AUC Score: 0.9819
Confusion Matrix:
[[806  67]
 [ 45 796]]

Final Test Results:
Accuracy: 0.9388
Precision: 0.9486
Recall: 0.9301
F1 Score: 0.9393
ROC-AUC Score: 0.9847
Confusion Matrix:
[[798  44]
 [ 61 812]]


# Naives Bayes

In [25]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix)
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold

# Define the models and hyperparameter grids
models = {
    'GaussianNB': (GaussianNB(), {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}),
    'BernoulliNB': (BernoulliNB(), {'alpha': [0.5, 1.0, 1.5, 2.0], 'binarize': [0.0, 0.5, 1.0]})
}

# Initialize results dictionary
results = {}

# Perform GridSearchCV and evaluate model on validation set function
def grid_search_model(model, param_grid, X_train, X_val, y_train, y_val):
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='recall', verbose=1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    y_val_pred = best_model.predict(X_val)
    y_val_pred_prob = best_model.predict_proba(X_val)[:, 1]  # For ROC-AUC score

    accuracy = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    f1 = f1_score(y_val, y_val_pred)
    roc_auc = roc_auc_score(y_val, y_val_pred_prob)
    conf_matrix = confusion_matrix(y_val, y_val_pred)

    return {
        'best_model': best_model,
        'best_params': grid_search.best_params_,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'roc_auc': roc_auc,
        'confusion_matrix': conf_matrix
    }

# Perform GridSearchCV and validate each model
for model_name, (model, param_grid) in models.items():
    print(f"Performing Grid Search on {model_name} with validation set...")
    results[model_name] = grid_search_model(model, param_grid, X_train, X_val, y_train, y_val)

# Test the best models on the test set
for model_name, metrics in results.items():
    best_model = metrics['best_model']
    y_test_pred = best_model.predict(X_test)
    y_test_pred_prob = best_model.predict_proba(X_test)[:, 1]
    
    accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred)
    recall = recall_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred)
    roc_auc = roc_auc_score(y_test, y_test_pred_prob)
    conf_matrix = confusion_matrix(y_test, y_test_pred)

    # Display final results for the test set
    print(f"\nFinal Test Results for {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC-AUC Score: {roc_auc:.4f}")
    print(f"Confusion Matrix:\n{conf_matrix}")


Performing Grid Search on GaussianNB with validation set...
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Performing Grid Search on BernoulliNB with validation set...
Fitting 5 folds for each of 12 candidates, totalling 60 fits

Final Test Results for GaussianNB:
Accuracy: 0.8880
Precision: 0.9316
Recall: 0.8419
F1 Score: 0.8845
ROC-AUC Score: 0.9544
Confusion Matrix:
[[788  54]
 [138 735]]

Final Test Results for BernoulliNB:
Accuracy: 0.9213
Precision: 0.9222
Recall: 0.9233
F1 Score: 0.9227
ROC-AUC Score: 0.9692
Confusion Matrix:
[[774  68]
 [ 67 806]]


# Combine SVM, Decision Tree using Bagging

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC  # Assuming you have SVC for the ensemble
from sklearn.tree import DecisionTreeClassifier  # Assuming you have Decision Tree
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix)
import joblib

# Define the models and hyperparameter grids for GridSearchCV
models = {
    'GaussianNB': (GaussianNB(), {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}),
    'BernoulliNB': (BernoulliNB(), {'alpha': [0.5, 1.0, 1.5, 2.0], 'binarize': [0.0, 0.5, 1.0]}),
    'LogisticRegression': (LogisticRegression(max_iter=1000), {'C': [0.1, 1, 10, 100], 'solver': ['liblinear', 'lbfgs']}),
    'DecisionTree': (DecisionTreeClassifier(), {}),  # Add hyperparameters if needed
    'SVM': (SVC(probability=True), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']})
}

# Initialize results dictionary
results = {}

# Perform GridSearchCV and evaluate model on validation set function
def grid_search_model(model, param_grid, X_train, X_val, y_train, y_val):
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='recall', verbose=1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    y_val_pred = best_model.predict(X_val)
    y_val_pred_prob = best_model.predict_proba(X_val)[:, 1]  # For ROC-AUC score

    accuracy = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    f1 = f1_score(y_val, y_val_pred)
    roc_auc = roc_auc_score(y_val, y_val_pred_prob)
    conf_matrix = confusion_matrix(y_val, y_val_pred)

    return {
        'best_model': best_model,
        'best_params': grid_search.best_params_,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'roc_auc': roc_auc,
        'confusion_matrix': conf_matrix
    }

# Perform GridSearchCV and validate each model
for model_name, (model, param_grid) in models.items():
    print(f"Performing Grid Search on {model_name} with validation set...")
    results[model_name] = grid_search_model(model, param_grid, X_train, X_val, y_train, y_val)

# Create a voting classifier to combine the best models
ensemble_model = VotingClassifier(estimators=[
    ('gaussian_nb', results['GaussianNB']['best_model']),
    ('bernoulli_nb', results['BernoulliNB']['best_model']),
    ('logistic_regression', results['LogisticRegression']['best_model']),
    ('decision_tree', results['DecisionTree']['best_model']),
    ('svm', results['SVM']['best_model'])
], voting='soft')  # 'soft' voting uses predicted probabilities

# Fit the ensemble model on the training data
ensemble_model.fit(X_train, y_train)

# Validate the ensemble model on the validation set
y_val_pred_ensemble = ensemble_model.predict(X_val)
y_val_pred_prob_ensemble = ensemble_model.predict_proba(X_val)[:, 1]

# Compute metrics for the validation set
ensemble_val_accuracy = accuracy_score(y_val, y_val_pred_ensemble)
ensemble_val_precision = precision_score(y_val, y_val_pred_ensemble)
ensemble_val_recall = recall_score(y_val, y_val_pred_ensemble)
ensemble_val_f1 = f1_score(y_val, y_val_pred_ensemble)
ensemble_val_conf_matrix = confusion_matrix(y_val, y_val_pred_ensemble)

# Print validation results for the ensemble model
print("\nEnsemble Model Validation Results:")
print("Validation Accuracy:", ensemble_val_accuracy)
print("Validation Precision:", ensemble_val_precision)
print("Validation Recall:", ensemble_val_recall)
print("Validation F1 Score:", ensemble_val_f1)
print("Validation Confusion Matrix:\n", ensemble_val_conf_matrix)

# Test the ensemble model on the test set
for model_name, metrics in results.items():
    best_model = metrics['best_model']
    y_test_pred = best_model.predict(X_test)
    y_test_pred_prob = best_model.predict_proba(X_test)[:, 1]
    
    accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred)
    recall = recall_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred)
    conf_matrix = confusion_matrix(y_test, y_test_pred)

    print(f"\nFinal Test Results for {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Confusion Matrix:\n{conf_matrix}")


# Test the ensemble model on the test set
y_test_pred_ensemble = ensemble_model.predict(X_test)
y_test_pred_prob_ensemble = ensemble_model.predict_proba(X_test)[:, 1]

# Compute metrics for the ensemble model on the test set
ensemble_test_accuracy = accuracy_score(y_test, y_test_pred_ensemble)
ensemble_test_precision = precision_score(y_test, y_test_pred_ensemble)
ensemble_test_recall = recall_score(y_test, y_test_pred_ensemble)
ensemble_test_f1 = f1_score(y_test, y_test_pred_ensemble)
ensemble_test_conf_matrix = confusion_matrix(y_test, y_test_pred_ensemble)

# Print ensemble model results on the test set
print("\nEnsemble Model Test Results:")
print("Test Accuracy:", ensemble_test_accuracy)
print("Test Precision:", ensemble_test_precision)
print("Test Recall:", ensemble_test_recall)
print("Test F1 Score:", ensemble_test_f1)
print("Test Confusion Matrix:", ensemble_test_conf_matrix)

Performing Grid Search on GaussianNB with validation set...
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Performing Grid Search on BernoulliNB with validation set...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Performing Grid Search on LogisticRegression with validation set...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Performing Grid Search on DecisionTree with validation set...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Performing Grid Search on SVM with validation set...
Fitting 5 folds for each of 6 candidates, totalling 30 fits

Ensemble Model Validation Results:
Validation Accuracy: 0.9439906651108518
Validation Precision: 0.9377203290246768
Validation Recall: 0.9488703923900119
Validation F1 Score: 0.9432624113475178
Validation Confusion Matrix:
 [[820  53]
 [ 43 798]]

Final Test Results for GaussianNB:
Accuracy: 0.8880
Precision: 0.9316
Recall: 0.8419
F1 Score: 0.8845
Confusion Matrix:
[[788  54]
 [138 735]]

Fin