In [2]:
# Adjust the Python path to include the 'src' directory
import sys
import os

# Get the absolute path to the 'src' directory
src_path = os.path.abspath('../src')

# Add 'src' to the Python path
if src_path not in sys.path:
    sys.path.append(src_path)

# Import custom modules
from models.knn_models import KNNClassifierModel
from optimizers.grid_search import GridSearchOptimizer
from loss_functions.factory import LossFunctionFactory

# Import additional libraries
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import numpy as np
import json

%load_ext autoreload
%autoreload 2


In [1]:
# Create a custom Binary Cross-Entropy Loss function
loss_fn = LossFunctionFactory.create_loss_function(
    loss_type="binary_cross_entropy",
    weight_fp=1.0,   # Penalize false positives more
    weight_fn=3.0,   # Penalize false negatives even more
    threshold=0.6    # Set prediction threshold to 0.6
)

# Display the loss function parameters
print("Custom Loss Function Parameters:")
print(json.dumps(loss_fn.parameters, indent=4))


NameError: name 'LossFunctionFactory' is not defined

In [4]:
# Initialize the KNN Classifier with the custom loss function
knn_classifier = KNNClassifierModel(loss_function=loss_fn)

# Display model details
print("Initialized KNN Classifier:")
print(knn_classifier.model)


Initialized KNN Classifier:
KNeighborsClassifier()


In [5]:
# Define the hyperparameter grid for Grid Search
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

In [6]:
# Initialize the Grid Search Optimizer
optimizer = GridSearchOptimizer(
    model_instance=knn_classifier,
    param_grid=param_grid,
    loss_function=loss_fn,
    cv=5,                # 5-fold cross-validation
    scoring='accuracy'   # Default scoring metric
)

In [7]:
# Load the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Split the dataset into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training samples: {X_train.shape[0]}")
print(f"Validation samples: {X_val.shape[0]}")

Training samples: 455
Validation samples: 114


In [8]:
# Perform hyperparameter optimization using Grid Search
best_params, best_score = optimizer.optimize(X_train, y_train)

print("Best Hyperparameters Found:")
print(best_params)
print(f"Best Score (Negative Loss): {best_score:.4f}")


Best Hyperparameters Found:
{'algorithm': 'auto', 'n_neighbors': 11, 'weights': 'distance'}
Best Score (Negative Loss): -4.9342


In [9]:
# Evaluate the optimized model on the validation set
validation_loss = knn_classifier.score(X_val, y_val)

print(f"Validation Loss: {validation_loss:.4f}")


Validation Loss: 6.0595


In [10]:
# Serialize the custom loss function to a JSON string
serialized_loss = loss_fn.serialize()
print("Serialized Loss Function:")
print(serialized_loss)

# Optionally, save the serialized loss function to a file
with open('binary_cross_entropy_loss.json', 'w') as f:
    f.write(serialized_loss)


Serialized Loss Function:
{"name": "BinaryCrossEntropyLoss", "parameters": {"weight_fp": 2.0, "weight_fn": 3.0, "threshold": 0.6}}


In [11]:
# Load the serialized loss function from the file
with open('binary_cross_entropy_loss.json', 'r') as f:
    loaded_serialized_loss = f.read()

# Deserialize the loss function
deserialized_loss_fn = LossFunctionFactory.deserialize_loss_function(loaded_serialized_loss)

# Display deserialized loss function parameters
print("Deserialized Loss Function Parameters:")
print(json.dumps(deserialized_loss_fn.parameters, indent=4))


Deserialized Loss Function Parameters:
{
    "weight_fp": 2.0,
    "weight_fn": 3.0,
    "threshold": 0.6
}


In [12]:
from models.bayesian_models import GaussianNBModel
from optimizers.random_search import RandomSearchOptimizer

# Initialize the Gaussian Naive Bayes Model with the custom loss function
gnb_model = GaussianNBModel(loss_function=loss_fn)

# Define the hyperparameter distributions for Random Search
gnb_param_distributions = {
    'var_smoothing': np.logspace(0,-9, num=100)
}

# Initialize the Random Search Optimizer
gnb_random_optimizer = RandomSearchOptimizer(
    model_instance=gnb_model,
    param_distributions=gnb_param_distributions,
    loss_function=loss_fn,
    n_iter=20,
    scoring='accuracy',
    random_state=42
)

# Perform hyperparameter optimization using Random Search
gnb_best_params = gnb_random_optimizer.optimize(X_train, y_train)

print("GaussianNB Random Search Best Hyperparameters:")
print(gnb_best_params)

# Evaluate the optimized model on the validation set
gnb_validation_loss = gnb_model.score(X_val, y_val)
print(f"GaussianNB Validation Loss: {gnb_validation_loss:.4f}")


GaussianNB Random Search Best Hyperparameters:
{'var_smoothing': 5.336699231206302e-08}
GaussianNB Validation Loss: 4.5447


In [13]:
from models.ensemble_models import RandomForestModel
from optimizers.grid_search import GridSearchOptimizer

# Initialize the Random Forest Model with the custom loss function
rf_model = RandomForestModel(loss_function=loss_fn)

# Define the hyperparameter grid for Grid Search
rf_param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Grid Search Optimizer
rf_grid_optimizer = GridSearchOptimizer(
    model_instance=rf_model,
    param_grid=rf_param_grid,
    loss_function=loss_fn,
    cv=5,
    scoring='accuracy'
)

# Perform hyperparameter optimization using Grid Search
rf_best_params, rf_best_score = rf_grid_optimizer.optimize(X_train, y_train)

print("Random Forest Grid Search Best Hyperparameters:")
print(rf_best_params)
print(f"Random Forest Grid Search Best Score (Negative Loss): {rf_best_score:.4f}")

# Evaluate the optimized model on the validation set
rf_validation_loss = rf_model.score(X_val, y_val)
print(f"Random Forest Validation Loss: {rf_validation_loss:.4f}")


Random Forest Grid Search Best Hyperparameters:
{'max_depth': 50, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Random Forest Grid Search Best Score (Negative Loss): -2.9605
Random Forest Validation Loss: 3.6357


In [14]:
from models.linear_models import LogisticModel
from optimizers.random_search import RandomSearchOptimizer

# Initialize the Logistic Regression Model with the custom loss function
logistic_model = LogisticModel(loss_function=loss_fn)

# Define the hyperparameter distributions for Random Search
logistic_param_distributions = {
    'C': np.logspace(-4, 4, 20),
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['saga']  # 'saga' supports elasticnet penalty
}

# Initialize the Random Search Optimizer
logistic_random_optimizer = RandomSearchOptimizer(
    model_instance=logistic_model,
    param_distributions=logistic_param_distributions,
    loss_function=loss_fn,
    n_iter=20,
    scoring='accuracy',
    random_state=42
)

# Perform hyperparameter optimization using Random Search
logistic_best_params = logistic_random_optimizer.optimize(X_train, y_train)

print("Logistic Regression Random Search Best Hyperparameters:")
print(logistic_best_params)

# Evaluate the optimized model on the validation set
logistic_validation_loss = logistic_model.score(X_val, y_val)
print(f"Logistic Regression Validation Loss: {logistic_validation_loss:.4f}")


Logistic Regression Random Search Best Hyperparameters:
{'solver': 'saga', 'penalty': 'l1', 'C': 0.0018329807108324356}
Logistic Regression Validation Loss: 5.1506


25 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/christopherpuglisi/anaconda3/envs/py38_nlp/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/christopherpuglisi/anaconda3/envs/py38_nlp/lib/python3.8/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/christopherpuglisi/anaconda3/envs/py38_nlp/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1179, in fit
    raise ValueError("l1_ratio must be specified when penalty is