In [1]:
# Path (as a string) to a pip requirements.txt file or a conda environment.yml file.
# You can remove this variable or leave it blank to have nb2app attempt to determine the dependencies automatically.
# Examples:
# requirements_file = 'requirements.txt'
# requirements_file = 'my/path/environment.yml'
# NOTE(review): the value below is an absolute path on one developer's machine; adjust it before running elsewhere.
requirements_file: str = '/Users/christopherpuglisi/Netrias/Projects/hyperparameter_optimizer/src/environment.yml'
# Paths (as strings) to any files that the notebook will need in order to function when it is
# containerized.
# You can delete this variable or leave the list empty if there are no external files to include.
# Example:
# resource_files = ['/absolute/path/encoders/my_encoder.h5', '/absolute/path/decoders/my_decoder.h5']
resource_files: list = [
]
from pathlib import Path
# Pickled dataset that a later cell opens with pickle.load.
# NOTE(review): absolute, machine-specific path; adjust it before running elsewhere.
data_path: Path = Path('/Users/christopherpuglisi/Netrias/Projects/hyperparameter_optimizer/tests/cares_subset_aethr_features_overlapped_01312025.pkl')
# Selects the loss type in a later cell. The values handled there are
# 'Threat Detection Prediction' and 'Threat Severity Prediction'.
objective: str = 'Threat Detection Prediction'
# File name used when the loss function is serialized to JSON and read back.
serialized_loss_name: str = 'binary_cross_entropy_loss.json'

In [2]:
# Adjust the Python path to include the 'src' directory
import sys
import os
import ast
import warnings
import datetime
import numpy as np
import pickle
import pandas as pd
import json
# NOTE(review): this suppresses every warning category for the rest of the session,
# which can hide convergence or deprecation warnings raised during the model searches.
warnings.filterwarnings("ignore")
# '../src' is resolved against the current working directory, so the notebook
# must be started from a directory that is a sibling of 'src' for the
# project-local imports below to resolve.
src_path = os.path.abspath('../src')
if src_path not in sys.path:
    sys.path.append(src_path)
from models.knn_models import KNNClassifierModel
from optimizers.grid_search import GridSearchOptimizer
from loss_functions.factory import LossFunctionFactory
from models.bayesian_models import GaussianNBModel
from optimizers.random_search import RandomSearchOptimizer
from models.ensemble_models import RandomForestModel
from models.linear_models import LogisticRegression
from models.ensemble_models import RandomForestModel, GradientBoostingModel
from models.bayesian_models import GaussianNBModel, BernoulliNBModel
from models.knn_models import KNNClassifierModel
from models.svm_models import SVCModel
from models.bayesian_models import BayesianRidgeModel
from models.linear_models import LogisticModel, RidgeModel, LassoModel
from models.knn_models import KNNRegressorModel
from models.svm_models import SVRModel
from models.clustering_models import KMeansModel, DBSCANModel, AgglomerativeClusteringModel, SpectralClusteringModel
from sklearn.model_selection import train_test_split

In [3]:
# Translate the configured objective into the loss type name that is later
# passed to LossFunctionFactory.create_loss_function.
if objective == 'Threat Detection Prediction':
    loss_type_input: str = 'binary_cross_entropy'
elif objective == 'Threat Severity Prediction':
    loss_type_input: str = 'mean_squared_error'
else:
    # Fail here with a clear message. Without this branch loss_type_input is
    # simply never defined and a later cell dies with an unrelated NameError.
    raise ValueError(f"Unknown objective={objective}")

In [4]:
# Create the custom loss function selected by loss_type_input
# (binary cross-entropy for the default objective).
weight_fp = 1
weight_fn = 1
threshold = 0.5

loss_fn = LossFunctionFactory.create_loss_function(
    loss_type=loss_type_input,
    weight_fp= weight_fp,   # False-positive weight; equal to weight_fn here, so neither error type is favored
    weight_fn=weight_fn,   # False-negative weight; raise above weight_fp to penalize missed threats more
    threshold=threshold   # Prediction threshold (0.5)
)

# Display the loss function parameters
print("Custom Loss Function Parameters:")
print(json.dumps(loss_fn.parameters, indent=4))

Custom Loss Function Parameters:
{
    "weight_fp": 1,
    "weight_fn": 1,
    "threshold": 0.5
}


In [5]:
# SECURITY: unpickling runs arbitrary code embedded in the file, so data_path
# must only ever point at a trusted file.
with open(data_path, 'rb') as pickle_file:
    data = pickle.load(pickle_file)

# Target is the 'Threat' column; features are every column from position 3
# onward, with missing values replaced by 0.
y = data['Threat']
X = data.iloc[:, 3:].fillna(0)

In [6]:
# Show the labels of the feature columns selected above.
X.columns

Index(['A1A5D9', 'A6NDG6', 'O00233', 'O00299', 'O00391', 'O00410', 'O00442',
       'O00560', 'O00592', 'O00743',
       ...
       'Q15717', 'Q4V328', 'Q8N392', 'Q96CX2', 'Q96JP5', 'Q9BW30', 'Q9ULH7',
       'Q9Y4E1', 'Q9Y6H1', 'Q5T1J5'],
      dtype='object', length=561)

In [7]:
# Hold out 20% of the rows for validation. The split is stratified on y and
# uses a fixed seed so that it is repeatable.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = split_parts

n_train_rows = X_train.shape[0]
n_val_rows = X_val.shape[0]
print(f"Training samples: {n_train_rows}")
print(f"Validation samples: {n_val_rows}")

Training samples: 258
Validation samples: 65


In [8]:
def get_top_model(models_dict):
    """Return the entry of ``models_dict`` with the highest ``'best_score'``.

    Args:
        models_dict: Mapping of model name to a result dict. Every result
            dict must contain a ``'best_score'`` key whose values are
            mutually comparable.

    Returns:
        A ``(best_model_key, result_dict)`` tuple. When several entries share
        the highest score, the first one in iteration order is returned.

    Raises:
        ValueError: If ``models_dict`` is empty.
        KeyError: If an entry has no ``'best_score'`` key.
    """
    # Raise a descriptive error instead of the generic message max() gives
    # for an empty iterable; the exception type is unchanged.
    if len(models_dict) == 0:
        raise ValueError("models_dict is empty: no model results to rank")
    best_model_key = max(models_dict, key=lambda k: models_dict[k]['best_score'])
    return best_model_key, models_dict[best_model_key]

In [9]:
def _classification_experiments():
    """Return the classifier searches, in execution order.

    Each item is ``(result key, model class, optimizer class, optimizer kwargs)``.
    """
    return [
        ('RandomForestClassifier', RandomForestModel, GridSearchOptimizer, {
            'param_grid': {
                'n_estimators': [100, 250, 500],
                'max_depth': [None, 10, 25, 50],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
            },
            'cv': 5,
            'scoring': 'accuracy',
        }),
        ('GaussianNBClassifier', GaussianNBModel, RandomSearchOptimizer, {
            'param_distributions': {
                'var_smoothing': np.logspace(0, -9, num=100),
            },
            'n_iter': 20,
            'scoring': 'accuracy',
            'random_state': 42,
        }),
        ('BernoulliNBClassifier', BernoulliNBModel, GridSearchOptimizer, {
            'param_grid': {
                'alpha': [0.1, 1.0, 10.0],
                'binarize': [0.0, 0.5, 1.0],
                'fit_prior': [True, False],
            },
            'cv': 5,
            'scoring': 'accuracy',
        }),
        ('GradientBoostingModel', GradientBoostingModel, GridSearchOptimizer, {
            'param_grid': {
                'n_estimators': [50, 100],
                'learning_rate': [0.01, 0.1],
                'max_depth': [3, 5],
            },
            'cv': 5,
            'scoring': 'accuracy',
        }),
        ('KNNClassifier', KNNClassifierModel, GridSearchOptimizer, {
            'param_grid': {
                'n_neighbors': [3, 5, 7],
                'weights': ['uniform', 'distance'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree'],
            },
            'cv': 5,
            'scoring': 'accuracy',
        }),
        ('LogisticClassifier', LogisticModel, GridSearchOptimizer, {
            'param_grid': {
                'C': [0.1, 1.0, 10.0],  # Regularization strength
                'penalty': ['l1', 'l2'],  # Regularization type
                'solver': ['liblinear', 'saga'],  # Solvers that support both l1 and l2
            },
            'cv': 5,
            'scoring': 'accuracy',
        }),
        # The SVC search is currently disabled. To re-enable it, uncomment:
        # ('SVClassifier', SVCModel, GridSearchOptimizer, {
        #     'param_grid': {
        #         'C': [0.1, 1.0, 10.0],
        #         'kernel': ['linear', 'rbf'],
        #         'gamma': ['scale', 'auto'],
        #     },
        #     'cv': 5,
        #     'scoring': 'accuracy',
        # }),
    ]


def _regression_experiments():
    """Return the regressor searches, in execution order.

    Each item is ``(result key, model class, optimizer class, optimizer kwargs)``.
    """
    return [
        # NOTE(review): unlike every other entry, this search hands
        # loss_function=None to the optimizer and is scored with 'accuracy'
        # while the rest use 'r2'. The winner is picked by comparing
        # 'best_score' across all entries, so the two metrics end up being
        # compared directly. Kept as-is to preserve behaviour; confirm intent.
        ('LogisticRegression', LogisticModel, GridSearchOptimizer, {
            'param_grid': [
                {
                    'penalty': ['l1', 'l2'],
                    'C': [0.01, 0.1, 1.0, 10],
                    'max_iter': [2500, 5000, 10000],
                },
                {
                    'penalty': [None],
                    'max_iter': [1000, 10000],
                },
            ],
            'loss_function': None,
            'cv': 5,
            'scoring': 'accuracy',
        }),
        ('BayesianRidgeRegression', BayesianRidgeModel, GridSearchOptimizer, {
            'param_grid': {
                'max_iter': [100, 300],
                'alpha_1': [1e-6, 1e-5],
                'alpha_2': [1e-6, 1e-5],
            },
            'cv': 5,
            'scoring': 'r2',
        }),
        ('RidgeRegression', RidgeModel, GridSearchOptimizer, {
            'param_grid': {
                'alpha': [0.1, 1.0, 10.0],
            },
            'cv': 5,
            'scoring': 'r2',
        }),
        ('LassoRegression', LassoModel, GridSearchOptimizer, {
            'param_grid': {
                'alpha': [0.01, 0.1, 1.0, 10.0],
            },
            'cv': 5,
            'scoring': 'r2',
        }),
        ('KNNRegressor', KNNRegressorModel, RandomSearchOptimizer, {
            'param_distributions': {
                'n_neighbors': [2, 3, 5, 7, 10],
                'weights': ['uniform', 'distance'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree'],
            },
            'n_iter': 10,
            'scoring': 'r2',
            'random_state': 42,
        }),
        ('SVRegressor', SVRModel, GridSearchOptimizer, {
            'param_grid': {
                'C': [0.1, 1.0],
                'kernel': ['linear', 'rbf'],
                'gamma': ['scale', 'auto'],
            },
            'cv': 5,
            'scoring': 'r2',
        }),
    ]


def _tune_and_evaluate(model_cls, optimizer_cls, optimizer_kwargs, loss_fn,
                       X_train, y_train, X_val, y_val):
    """Run one hyperparameter search and score the returned estimator on the validation split."""
    model = model_cls(loss_function=loss_fn)
    # The optimizer receives loss_fn unless the spec overrides 'loss_function'.
    settings = {'loss_function': loss_fn, **optimizer_kwargs}
    optimizer = optimizer_cls(model_instance=model, **settings)
    best_params, best_score, best_estimator = optimizer.optimize(X_train, y_train)
    return {
        'model': best_estimator,     # estimator returned by the optimizer
        'best_params': best_params,
        'best_score': best_score,
        'validation_score': best_estimator.score(X_val, y_val)
    }


def _save_top_model(results_dict):
    """Pickle the highest-'best_score' model to a timestamped file and return its (key, result) pair."""
    top_model_key, top_model_value = get_top_model(results_dict)
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    model_filename = f"best_model_{top_model_key}_{timestamp}.pkl"
    print(model_filename)
    # Relative file name, so the pickle lands in the current working directory.
    with open(model_filename, 'wb') as model_file:
        pickle.dump(top_model_value['model'], model_file)

    print(f"Top model {top_model_key} saved as {model_filename}")
    return top_model_key, top_model_value


def run_model_experiment(objective,
                         X_train,
                         y_train,
                         X_val,
                         y_val,
                         loss_fn=None):
    """
    Tune a fixed set of models for the given objective and save the best one.

    Every configured model is tuned on the training split with its optimizer
    and then scored on the validation split. The entry with the highest
    'best_score' is pickled to ``best_model_<key>_<timestamp>.pkl`` in the
    current working directory, and the file name is printed.

    Args:
        objective: 'Threat Detection Prediction' (classifiers) or
            'Threat Severity Prediction' (regressors).
        X_train, y_train: Training features and target.
        X_val, y_val: Validation features and target.
        loss_fn: Optional loss function object passed to each model wrapper
            and, unless a spec overrides it, to each optimizer.

    Returns:
        ``(top_model_key, top_model_value, results_dict)``. ``results_dict``
        maps each model name to a dict with keys 'model', 'best_params',
        'best_score' and 'validation_score'. ``top_model_value`` is the entry
        for ``top_model_key``.

    Raises:
        ValueError: If ``objective`` is not one of the two supported values.
    """
    # Pick the experiment list first, so an unknown objective fails before
    # any model is constructed.
    if objective == 'Threat Detection Prediction':
        experiments = _classification_experiments()
    elif objective == 'Threat Severity Prediction':
        experiments = _regression_experiments()
    else:
        raise ValueError(f"Unknown objective={objective}")

    results_dict = {}
    for result_key, model_cls, optimizer_cls, optimizer_kwargs in experiments:
        results_dict[result_key] = _tune_and_evaluate(
            model_cls, optimizer_cls, optimizer_kwargs, loss_fn,
            X_train, y_train, X_val, y_val
        )

    top_model_key, top_model_value = _save_top_model(results_dict)
    return top_model_key, top_model_value, results_dict


In [10]:
print("--- Modeling Results Classification ---")
# NOTE(review): loss_fn is not passed here, so run_model_experiment uses its
# default loss_fn=None, whereas the regression run below passes loss_fn=loss_fn.
# The metadata cell later records the serialized custom loss for the model
# selected by this call. Confirm whether loss_fn=loss_fn was intended.
top_model_key, top_model_value, results_dict = run_model_experiment('Threat Detection Prediction', X_train, y_train, X_val, y_val)

--- Modeling Results Classification ---
Before GridSearchCV: {'n_estimators': 100, 'max_depth': None}


After GridSearchCV: {'n_estimators': 250, 'max_depth': 25, 'min_samples_leaf': 1, 'min_samples_split': 2}
Before GridSearchCV: {'alpha': 1.0, 'binarize': 0.0, 'fit_prior': True}
After GridSearchCV: {'alpha': 0.1, 'binarize': 1.0, 'fit_prior': True}
Before GridSearchCV: {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 3}
After GridSearchCV: {'n_estimators': 50, 'learning_rate': 0.1, 'max_depth': 3}
Before GridSearchCV: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'auto'}
After GridSearchCV: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'auto'}
Before GridSearchCV: {'max_iter': 200, 'penalty': 'l2', 'C': 1.0}




After GridSearchCV: {'max_iter': 200, 'penalty': 'l2', 'C': 10.0, 'solver': 'liblinear'}
best_model_LogisticClassifier_20250310_183120.pkl
Top model LogisticClassifier saved as best_model_LogisticClassifier_20250310_183120.pkl


In [11]:
# Report the winning classifier and its stored result entry.
print(f"Best Model: {top_model_key}\nDetails: {top_model_value}")

Best Model: LogisticClassifier
Details: {'model': LogisticModel(C=10.0, max_iter=200, penalty='l2', solver='liblinear'), 'best_params': {'C': 10.0, 'penalty': 'l2', 'solver': 'liblinear'}, 'best_score': 0.9805429864253394, 'validation_score': 0.9846153846153847}


In [12]:
print("--- Modeling Results Regression ---")
# run_model_experiment returns a 3-tuple (top_model_key, top_model_value, results_dict),
# so all_results holds that tuple rather than a dict of results.
# The classification run's top_model_key / top_model_value are left untouched.
all_results = run_model_experiment('Threat Severity Prediction', X_train, y_train, X_val, y_val, loss_fn=loss_fn)

--- Modeling Results Regression ---
Before GridSearchCV: {'max_iter': 200, 'penalty': 'l2', 'C': 1.0}
After GridSearchCV: {'max_iter': 1000, 'penalty': None, 'C': 1.0}
Before GridSearchCV: {'max_iter': 300, 'tol': 0.001, 'alpha_1': 1e-06, 'alpha_2': 1e-06, 'lambda_1': 1e-06, 'lambda_2': 1e-06}
After GridSearchCV: {'max_iter': 100, 'tol': 0.001, 'alpha_1': 1e-06, 'alpha_2': 1e-05, 'lambda_1': 1e-06, 'lambda_2': 1e-06}
Before GridSearchCV: {'alpha': 1.0}
After GridSearchCV: {'alpha': 10.0}
Before GridSearchCV: {'alpha': 1.0}
After GridSearchCV: {'alpha': 0.01}
Before GridSearchCV: {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale'}
After GridSearchCV: {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale'}
best_model_LogisticRegression_20250310_183121.pkl
Top model LogisticRegression saved as best_model_LogisticRegression_20250310_183121.pkl


In [13]:
#top_model_key, top_model_value = get_top_model(all_results)
#print(f"Best Model: {top_model_key}\nDetails: {top_model_value}")

In [14]:
# Convert the custom loss function to its serialized (JSON string) form and echo it.
serialized_loss = loss_fn.serialize()
print("Serialized Loss Function:", serialized_loss, sep="\n")

# Persist the serialized form under the configured file name.
with open(serialized_loss_name, 'w') as loss_file:
    loss_file.write(serialized_loss)


Serialized Loss Function:
{"name": "BinaryCrossEntropyLoss", "parameters": {"weight_fp": 1, "weight_fn": 1, "threshold": 0.5}}


In [15]:
# Read back the text that was written for the loss function.
with open(serialized_loss_name) as loss_file:
    loaded_serialized_loss = loss_file.read()

# Rebuild a loss function object from that text.
deserialized_loss_fn = LossFunctionFactory.deserialize_loss_function(loaded_serialized_loss)

# Print its parameters so the round trip can be compared with the original.
print("Deserialized Loss Function Parameters:")
restored_parameters = deserialized_loss_fn.parameters
print(json.dumps(restored_parameters, indent=4))


Deserialized Loss Function Parameters:
{
    "weight_fp": 1,
    "weight_fn": 1,
    "threshold": 0.5
}


In [16]:
import json
import datetime
import os

# Function to generate model metadata automatically
def generate_model_metadata(model_name, model_details, serialized_loss_name, X_train, y_train, X_val, y_val):
    """
    Build a JSON-serializable metadata record for a trained model.

    Args:
        model_name: Name used for the "name" field and as the model_id prefix.
        model_details: Result dict with keys 'model', 'best_params',
            'best_score' and 'validation_score'.
        serialized_loss_name: Path of the file holding the serialized loss
            function. It is read and deserialized to report the loss parameters.
        X_train, X_val: Feature tables; ``.columns`` and ``.shape`` are read.
        y_train, y_val: Targets; ``.value_counts()`` is used for the class balance.

    Returns:
        A nested dict with four sections: identification/versioning,
        parameters/architecture, training/performance metrics, and threat
        domain information.

    Note:
        Several fields are static placeholders rather than computed values:
        version, model_lineage, repo_link, associated_reports,
        baseline_model_score, and the whole "Threat Domain Information" section.
    """
    # Load the serialized loss function
    with open(serialized_loss_name, 'r') as f:
        loaded_serialized_loss = f.read()

    # Deserialize the loss function
    deserialized_loss_fn = LossFunctionFactory.deserialize_loss_function(loaded_serialized_loss)
    loss_parameters = deserialized_loss_fn.parameters

    # Take one timestamp so model_id and creation_date always describe the same instant.
    created_at = datetime.datetime.now()

    # Prefer the target's own name; fall back to the dataset's label column
    # when the target carries no name.
    target_column = getattr(y_train, "name", None) or "Threat"

    metadata = {
        "Model Identification and Versioning": {
            "model_id": f"{model_name}_{created_at.strftime('%Y%m%d_%H%M%S')}",
            "name": model_name,
            "version": "1.0",  # Can be dynamically set based on retraining
            "creation_date": str(created_at),
            "model_lineage": "BaselineModel_v0.9",  # If applicable, pull from model history
            "repo_link": "https://gitlab.com/path/to/models",  # If stored in a repository
            "associated_reports": ["Report ##: Model deployment metadata", "Report ##: Model tracking"]
        },
        "Model Parameters and Architecture": {
            "model_type": model_details['model'].__class__.__name__,  # Extract model type
            "input_features": list(X_train.columns),  # Features used in training
            "target_column": target_column,
            "loss_function": serialized_loss_name,  # Reference to stored loss function
            "optimizer": model_details['best_params'].get('solver', "Unknown"),  # Extract solver if exists
            "hyperparameters": model_details['best_params']  # Full hyperparameter set
        },
        "Training and Performance Metrics": {
            "dataset_used": {
                "train_samples": X_train.shape[0],
                "val_samples": X_val.shape[0],
                "feature_dim": X_train.shape[1]
            },
            "dataset_properties": {
                "train_target_balance": y_train.value_counts().to_dict(),  # Class distribution of the training target
                "val_target_balance": y_val.value_counts().to_dict()  # Class distribution of the validation target
            },
            "performance_metrics": {
                "best_score": model_details['best_score'],  # Best score from model tuning
                "validation_score": model_details['validation_score']  # Validation score
            },
            "benchmark_metrics": {
                "baseline_model_score": 0.95  # If a benchmark exists, pull dynamically
            },
            # Loss-function settings, with the same defaults as before when a key is absent.
            "uncertainty_quant": {
                "weight_fp": loss_parameters.get("weight_fp", 1),
                "weight_fn": loss_parameters.get("weight_fn", 1),
                "threshold": loss_parameters.get("threshold", 0.5)
            }
        },
        "Threat Domain Information": {
            "threat_type_modeled": "Biological Threat",  # Can be parameterized
            "use_cases": ["Threat classification", "Anomaly detection"],  # Example use cases
            "feature_explainability": {
                "SHAP_analysis": "Pending",  # Auto-compute if using SHAP later
                "Top_features": ["feature_1", "feature_2"]  # Placeholder; should compute SHAP feature importances
            },
            "compliance_metrics": ["Bias assessment: Passed", "Security audit: Passed"]  # Example compliance checks
        }
    }

    return metadata

# Build the metadata record for the top model selected by the classification run.
model_metadata = generate_model_metadata(
    top_model_key,
    top_model_value,
    serialized_loss_name,
    X_train,
    y_train,
    X_val,
    y_val,
)

# Write the record as indented JSON (relative file name, so it lands in the current working directory).
metadata_filename = f"model_metadata_{top_model_key}.json"
with open(metadata_filename, "w") as metadata_file:
    json.dump(model_metadata, metadata_file, indent=4)

print(f"Model metadata saved to {metadata_filename}")

Model metadata saved to model_metadata_LogisticClassifier.json


In [17]:
# Display the generated metadata. The output is long because
# "input_features" lists every training column.
model_metadata

{'Model Identification and Versioning': {'model_id': 'LogisticClassifier_20250310_183121',
  'name': 'LogisticClassifier',
  'version': '1.0',
  'creation_date': '2025-03-10 18:31:21.948939',
  'model_lineage': 'BaselineModel_v0.9',
  'repo_link': 'https://gitlab.com/path/to/models',
  'associated_reports': ['Report ##: Model deployment metadata',
   'Report ##: Model tracking']},
 'Model Parameters and Architecture': {'model_type': 'LogisticModel',
  'input_features': ['A1A5D9',
   'A6NDG6',
   'O00233',
   'O00299',
   'O00391',
   'O00410',
   'O00442',
   'O00560',
   'O00592',
   'O00743',
   'O14545',
   'O14579',
   'O14672',
   'O14734',
   'O14744',
   'O14841',
   'O14874',
   'O14879',
   'O14933',
   'O14976',
   'O14979',
   'O15066',
   'O15118',
   'O15162',
   'O15258',
   'O15355',
   'O15371',
   'O15400',
   'O15455',
   'O43148',
   'O43390',
   'O43490',
   'O43653',
   'O43657',
   'O43678',
   'O43747',
   'O43768',
   'O43809',
   'O43823',
   'O43837',
   'O438

    elif objective_type == 'Detect Latent Patterns':
        model_type = 'KMeans'
        if model_type == 'KMeans':
            km_model = KMeansModel(loss_function=loss_fn)
            km_param_grid = {
                'n_clusters': [2, 3, 5, 8],
                'init': ['k-means++', 'random'],
                'max_iter': [100, 300],
            }

            km_grid_optimizer = GridSearchOptimizer(
                model_instance=km_model,
                param_grid=km_param_grid,
                loss_function=loss_fn,  
                cv=None,
                scoring=None
            )
            # We assume 3 values returned:
            best_params, best_score, best_estimator = km_grid_optimizer.optimize(X_train, None)
            val_score = best_estimator.score(X_val, None)

            return {
                'model': best_estimator,
                'best_params': best_params,
                'best_score': best_score,
                'validation_score': val_score
            }

        elif model_type == 'DBSCAN':
            dbscan_model = DBSCANModel(loss_function=loss_fn)
            dbscan_param_distributions = {
                'eps': [0.1, 0.2, 0.5, 1.0],
                'min_samples': [3, 5, 10]
            }

            dbscan_random_optimizer = RandomSearchOptimizer(
                model_instance=dbscan_model,
                param_distributions=dbscan_param_distributions,
                loss_function=loss_fn,
                n_iter=5,
                scoring=None,
                random_state=42
            )
            best_params, best_score, best_estimator = dbscan_random_optimizer.optimize(X_train, None)
            val_score = best_estimator.score(X_val, None)

            return {
                'model': best_estimator,
                'best_params': best_params,
                'best_score': best_score,
                'validation_score': val_score
            }

        elif model_type == 'AgglomerativeClustering':
            agg_model = AgglomerativeClusteringModel(loss_function=loss_fn)
            agg_param_grid = {
                'n_clusters': [2, 3, 5, 8],
                'affinity': ['euclidean', 'manhattan'],
                'linkage': ['ward', 'complete', 'average']
            }

            agg_grid_optimizer = GridSearchOptimizer(
                model_instance=agg_model,
                param_grid=agg_param_grid,
                loss_function=loss_fn,
                cv=None,
                scoring=None
            )
            best_params, best_score, best_estimator = agg_grid_optimizer.optimize(X_train, None)
            val_score = best_estimator.score(X_val, None)

            return {
                'model': best_estimator,
                'best_params': best_params,
                'best_score': best_score,
                'validation_score': val_score
            }

        elif model_type == 'SpectralClustering':
            spectral_model = SpectralClusteringModel(loss_function=loss_fn)
            spectral_param_grid = {
                'n_clusters': [2, 3, 5, 8],
                'n_init': [5, 10],
                'gamma': [0.1, 1.0, 10.0]
            }

            spectral_grid_optimizer = GridSearchOptimizer(
                model_instance=spectral_model,
                param_grid=spectral_param_grid,
                loss_function=loss_fn,
                cv=None,
                scoring=None
            )
            best_params, best_score, best_estimator = spectral_grid_optimizer.optimize(X_train, None)
            val_score = best_estimator.score(X_val, None)

            return {
                'model': best_estimator,
                'best_params': best_params,
                'best_score': best_score,
                'validation_score': val_score
            }

        else:
            raise ValueError(f"Unknown unsupervised model_type={model_type}")