# Hyperparameter Tuning

In [18]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
from scipy.io import arff
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
def fetch_data():
    phishing_websites = fetch_ucirepo(id=327) 
    
    X = phishing_websites.data.features 
    y = phishing_websites.data.targets 

    return pd.concat([X, y], axis=1)

In [5]:
DATASET_PATH= "../data/Training_Dataset.arff"

def fetch_data_local():
    arff_file_path = os.path.abspath(DATASET_PATH)
    data, _ = arff.loadarff(arff_file_path)
    
    df = pd.DataFrame(data)
    X = df.drop('Result', axis=1)
    y = df['Result']

    return pd.concat([X, y], axis=1)

# Preprocess

In [7]:
raw_data = fetch_data_local()
X = raw_data.drop('Result', axis=1)
y = raw_data['Result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [20]:
encoder = OneHotEncoder()  # Set sparse=False for dense output

# Assume X_train is a pandas DataFrame
# If X_train is already numerical, apply one-hot encoding to each feature column
# The ColumnTransformer applies OneHotEncoder to all the columns
column_transformer = ColumnTransformer(
    transformers=[
        ('cat', encoder, range(X_train.shape[1]))  # Apply OneHotEncoder to all columns (0-indexed)
    ])

In [8]:
dt = DecisionTreeClassifier(random_state=42)

In [9]:
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

In [10]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=cv, n_jobs=-1, verbose=1)

In [11]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits




ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.

In [None]:
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
print("Test Set Score:", grid_search.score(X_test, y_test))