# Hyperparameter Tuning

## Data Acquisition: Phishing Websites Dataset

In [1]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
from scipy.io import arff
import os

In [2]:
def fetch_data():
    """Fetch dataset id 327 through ``ucimlrepo`` and return it as one frame.

    Returns:
        pandas.DataFrame: the feature columns followed by the target
        column(s), joined side by side (``axis=1``).
    """
    dataset = fetch_ucirepo(id=327)

    # Features first, targets last, concatenated column-wise.
    frames = [dataset.data.features, dataset.data.targets]
    return pd.concat(frames, axis=1)

In [3]:
# Default location of the local ARFF copy of the dataset, relative to the
# working directory of the notebook.
DATASET_PATH = "../data/Training_Dataset.arff"


def _decode_nominal(column):
    """Turn a column of ARFF byte strings into numbers (or text as fallback).

    Non-object columns are returned untouched. Byte values are decoded as
    UTF-8; if every decoded value parses as a number the column becomes
    numeric, otherwise the decoded strings are kept.
    """
    if column.dtype != object:
        return column

    decoded = column.map(lambda v: v.decode("utf-8") if isinstance(v, bytes) else v)
    try:
        return pd.to_numeric(decoded)
    except (ValueError, TypeError):
        # Genuinely non-numeric nominal values: keep them as plain strings.
        return decoded


def fetch_data_local(path=None):
    """Load the dataset from a local ARFF file.

    ``scipy.io.arff.loadarff`` hands nominal attributes back as byte strings
    (e.g. ``b'-1'``), which are unusable as numeric features or labels. Each
    such column is decoded and, where possible, converted to a numeric dtype.

    Args:
        path: Optional path to the ARFF file. Defaults to ``DATASET_PATH``
            when omitted, which preserves the original zero-argument call.

    Returns:
        pandas.DataFrame: all feature columns followed by the ``Result``
        target as the last column.

    Raises:
        KeyError: if the file has no ``Result`` attribute.
    """
    arff_file_path = os.path.abspath(DATASET_PATH if path is None else path)
    records, _ = arff.loadarff(arff_file_path)

    df = pd.DataFrame(records)
    df = df.apply(_decode_nominal)

    X = df.drop('Result', axis=1)
    y = df['Result']

    # Re-join so that the target is always the final column.
    return pd.concat([X, y], axis=1)

## MLflow

In [3]:
import mlflow
import mlflow.sklearn

In [60]:
# Store all runs under the "mlruns" directory one level above the working
# directory.
mlflow.set_tracking_uri("../mlruns")

# Prefix shared by every experiment name built in set_experiment().
BASE_EXPERIMENT_NAME = "HPT"

In [61]:
def set_experiment(experiment_name):
    """Select the MLflow experiment for subsequent runs.

    The effective experiment name is ``BASE_EXPERIMENT_NAME`` and
    ``experiment_name`` joined by an underscore, e.g. ``"GRID_SEARCH"``
    becomes ``"HPT_GRID_SEARCH"``.

    Args:
        experiment_name: Suffix identifying the tuning strategy or study.
    """
    mlflow.set_experiment(f"{BASE_EXPERIMENT_NAME}_{experiment_name}")

## Hyperparameter Tuning Libraries

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

## Decision Tree

In [8]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Counter used to give each tuning run in this section a distinct name
# ("Best_0", "Best_1", ...).
run_count = 0

# All runs below are recorded under the "HPT_GRID_SEARCH" experiment.
set_experiment("GRID_SEARCH")

2024/11/13 12:25:04 INFO mlflow.tracking.fluent: Experiment with name 'HPT_GRID_SEARCH' does not exist. Creating a new experiment.


In [None]:
# Base estimator handed to the grid search; the seed is fixed for repeatability.
model = DecisionTreeClassifier(random_state=42)

# 5-fold stratified cross-validation, shuffled with a fixed seed.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [None]:
# Search space for the decision tree: 5 * 3 * 4 = 60 parameter combinations.
param_grid = dict(
    max_depth=[3, 5, 10, 20, None],       # None leaves the depth unbounded
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4, 5],
)

In [6]:
# Download dataset id 327 and separate the predictors from the labels.
phishing_websites = fetch_ucirepo(id=327)
X, y = phishing_websites.data.features, phishing_websites.data.targets

# Hold out 20% of the rows as a test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [38]:
# Exhaustive search over param_grid, scored by accuracy on the folds from `cv`.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,   # use every available core
    verbose=1,
)

In [40]:
import matplotlib.pyplot as plt
import seaborn as sns

In [64]:
# Run the grid search inside a single MLflow run and record the selected
# hyperparameters, the cross-validation score, held-out test metrics, the
# classification report, a confusion-matrix image and the fitted model.
with mlflow.start_run(run_name=f"Best_{run_count}"):
    run_count += 1
    mlflow.log_param("model", "DecisionTreeClassifier")

    grid_search.fit(X_train, y_train)

    # Log every selected hyperparameter in one call.
    best_params = grid_search.best_params_
    mlflow.log_params(best_params)

    # Mean cross-validated accuracy of the winning parameter combination.
    best_score = grid_search.best_score_
    mlflow.log_metric("best_accuracy", best_score)

    best_model = grid_search.best_estimator_

    # Evaluate the refitted best estimator on the held-out test split.
    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metrics({
        "test_accuracy": accuracy,
        "test_precision": precision,
        "test_recall": recall,
        "test_f1": f1,
    })

    # The report is a multi-line table, not a hyperparameter: store it as a
    # text artifact instead of squeezing it into a run parameter.
    class_report = classification_report(y_test, y_pred)
    mlflow.log_text(class_report, "classification_report.txt")

    # Use the labels the model was actually trained on so that the matrix
    # rows/columns and the heatmap ticks are guaranteed to line up.
    class_labels = list(best_model.classes_)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt='g',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

    cm_image_path = "./images/confusion_matrix.png"
    # savefig does not create missing directories; make sure the target exists.
    os.makedirs(os.path.dirname(cm_image_path), exist_ok=True)
    fig.savefig(cm_image_path)
    plt.close(fig)

    mlflow.log_artifact(cm_image_path, "confusion_matrix")

    mlflow.sklearn.log_model(best_model, "model")

    print("Best parameters found: ", best_params)
    print("Best cross-validation accuracy: {:.4f}".format(best_score))
    print("Test Accuracy: {:.4f}".format(accuracy))
    print("Test Precision: {:.4f}".format(precision))
    print("Test Recall: {:.4f}".format(recall))
    print("Test F1-Score: {:.4f}".format(f1))
    print("\nClassification Report:\n", class_report)
    print("\nConfusion Matrix:\n", cm)

Fitting 5 folds for each of 36 candidates, totalling 180 fits




Best parameters found:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best cross-validation accuracy: 0.9588
Test Accuracy: 0.9575
Test Precision: 0.9625
Test Recall: 0.9625
Test F1-Score: 0.9625

Classification Report:
               precision    recall  f1-score   support

          -1       0.95      0.95      0.95       956
           1       0.96      0.96      0.96      1255

    accuracy                           0.96      2211
   macro avg       0.96      0.96      0.96      2211
weighted avg       0.96      0.96      0.96      2211


Confusion Matrix:
 [[ 909   47]
 [  47 1208]]
