# TUNER PROTOTYPES

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from scipy.stats import  zscore
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from skopt.plots import plot_objective, plot_convergence, plot_histogram, plot_gaussian_process

import warnings

# Data Processing
digits = load_digits()

cal_housing = fetch_california_housing(as_frame=True)

# Work on a copy: assigning the 'y' column to `cal_housing.data` directly
# would add the target to the dataset's own feature frame.
cleanData = cal_housing.data.copy()
cleanData['y'] = cal_housing.target

# Geographic coordinates are left out of the modelling features.
cleanData = cleanData.drop(columns=['Longitude', 'Latitude'])

# Drop outlier rows one feature at a time: rows are kept only while
# |z-score| < 2.5, and each pass recomputes z-scores on the surviving rows.
for feature in ['AveBedrms', 'AveRooms', 'AveOccup', 'Population']:
    cleanData = cleanData[(np.abs(zscore(cleanData[feature])) < 2.5)]

# The target is extracted after filtering so it stays aligned with the rows.
cleanTarget = cleanData['y'].to_list()

# Re-bind rather than drop in place: `cleanData` is a filtered selection, and
# in-place mutation of such a frame can trigger pandas' SettingWithCopyWarning.
cleanData = cleanData.drop(columns=['y'])

modelData = cleanData.copy()


# TODO: Add GridSearchCV into Pipeline for testing scoringCriteria param space?
# TODO: Generate SKOPT Plots for regression models, Plot Feature Importance for all models
# TODO: Insert Outlier script into Pipeline

# Suppress warnings
warnings.filterwarnings("ignore")

class ModelTuner:
    """Tune classifiers inside a StandardScaler -> PCA -> model pipeline.

    Attributes:
        classifiers_param_spaces: mapping ``model name -> {search type -> search space}``.
        scoring_criteria: mapping with a ``'classification'`` list of scorer names.
        tuned_classifiers: mapping ``model name -> fitted search object``,
            filled in by ``tune_classifier``.
    """

    # Search strategies that `tune_classifier` knows how to build.
    _SUPPORTED_SEARCH_TYPES = ('bayesian',)

    def __init__(self, classifiers_param_spaces, scoring_criteria):
        self.classifiers_param_spaces = classifiers_param_spaces
        self.scoring_criteria = scoring_criteria
        self.tuned_classifiers = {}

    def tune_classifier(self, search_type, model_name, model, param_space, X_train, y_train, X_test, y_test):
        """Tune one classifier, refitting the search once per scoring metric.

        Args:
            search_type: search strategy; only ``'bayesian'`` is implemented.
            model_name: key under which the fitted search is stored.
            model: estimator class (not an instance); it is instantiated here.
            param_space: search space passed to ``BayesSearchCV``.
            X_train, y_train: data used to fit the search.
            X_test, y_test: data used for the reported test score and plots.

        Returns:
            The search object as left by the fit for the last scoring metric.

        Raises:
            ValueError: if ``search_type`` is not supported.
        """
        # Fail early with a clear message; previously an unsupported type left
        # `search` unbound and surfaced as an UnboundLocalError further down.
        if search_type not in self._SUPPORTED_SEARCH_TYPES:
            raise ValueError(
                f"Unsupported search_type {search_type!r}; "
                f"expected one of {self._SUPPORTED_SEARCH_TYPES}"
            )

        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA()),
            ('classify', model())
        ])

        search = BayesSearchCV(
            estimator=pipe,
            search_spaces=param_space,
            n_iter=50,
            cv=4,
            scoring=None,  # Replaced per metric in the loop below
            n_jobs=-1,
            random_state=42
        )

        for scoring_metric in self.scoring_criteria['classification']:
            print(f"Tuning {model_name} with {search_type} search and scoring metric: {scoring_metric}")
            search.set_params(scoring=scoring_metric)
            search.fit(X_train, y_train)

            print(f"Best Params: {search.best_params_}")
            print(f"Best Score (Train): {search.best_score_}")
            test_score = search.score(X_test, y_test)
            print(f"Test Score: {test_score}")
            print("-" * 80)

            self.plot_confusion_matrix(search.best_estimator_, X_test, y_test)
            self.plot_optimization_results(search)

            # self.plot_regression_results(search.best_estimator_, X_test, y_test)

        # The same search object is refit for every metric, so what is stored
        # reflects the last metric in the list.
        self.tuned_classifiers[model_name] = search

        return search

    def evaluate_model(self, model_name, X_test, y_test):
        """Print a classification report for a previously tuned model.

        Raises:
            ValueError: if ``model_name`` has not been tuned yet.
        """
        search = self.tuned_classifiers.get(model_name)
        # Compare against None explicitly instead of relying on the search
        # object's truthiness.
        if search is None:
            raise ValueError(f"Model {model_name} not tuned yet!")

        y_pred = search.best_estimator_.predict(X_test)
        print(f"Classification Report for {model_name}:\n{classification_report(y_test, y_pred)}")

    def tune_all_classifiers(self, search_type, X_train, y_train, X_test, y_test):
        """Tune every configured classifier that defines a space for ``search_type``."""
        for model_name, model_dict in self.classifiers_param_spaces.items():
            if search_type not in model_dict:
                continue
            param_space = model_dict[search_type]
            # Model names are resolved to classes through the module globals,
            # so each key must match an imported estimator class name.
            model = globals()[model_name]
            self.tune_classifier(search_type, model_name, model, param_space, X_train, y_train, X_test, y_test)

    def plot_confusion_matrix(self, best_estimator, X_test, y_test):
        """Plot the confusion matrix of ``best_estimator`` on the test data."""
        y_pred = best_estimator.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot(cmap=plt.cm.Blues)
        plt.title("Confusion Matrix")
        plt.show()

    def plot_optimization_results(self, search):
        """Plot optimization-related visualizations."""
        # Plot convergence
        # ax = plot_convergence(search.optimizer_results_[0])
        # plt.title("Optimization Convergence")
        # plt.subplots_adjust(wspace=0.5, hspace=0.5)
        # plt.show()

        # Plot objective function
        plot_objective(search.optimizer_results_[0], n_minimum_search=int(1e8))
        plt.title("Objective Function")
        plt.subplots_adjust(wspace=0.5, hspace=0.5)
        plt.show()

        # Plot hyperparameter distribution
        # ax = plot_histogram(dimension_identifier=search.optimizer_results_[0])
        # plt.title("Hyperparameter Sampling Distribution")
        # plt.show()

    # NOTE: the regression plot below is disabled by wrapping it in a string
    # literal; it is not a method of this class.
    '''
    def plot_regression_results(self, best_estimator, X_test, y_test):
        """Plot regression results."""
        y_pred = best_estimator.predict(X_test)
        plt.scatter(y_test, y_pred, alpha=0.7, edgecolors='k')
        plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
        plt.xlabel("Actual")
        plt.ylabel("Predicted")
        plt.title("Regression Results")
        plt.show()
    '''

# Define scoring criteria
# NOTE(review): 'average_precision' is applied here to a multi-class target;
# confirm the installed scikit-learn version supports that combination.
scoring_criteria = {
    'classification': ['accuracy', 'f1_macro', 'balanced_accuracy', 'precision_macro', 'recall_macro', 'average_precision']
}

# Number of features in the digits data, used as the PCA upper bound. Loaded
# once here instead of re-loading the dataset for every parameter space.
_N_DIGIT_FEATURES = load_digits().data.shape[1]

# Classifiers parameter spaces, keyed by estimator class name and then by
# search type. The 'classify__' / 'pca__' / 'scaler__' prefixes address the
# pipeline steps of the same names.
classifier_param_spaces = {
    'DecisionTreeClassifier': {
        'bayesian': {
            'classify__criterion': Categorical(['gini', 'entropy', 'log_loss']),
            'classify__splitter': Categorical(['best', 'random']),
            'classify__max_depth': Integer(1, 1000),
            'classify__min_samples_split': Real(0.01, 0.9),
            'classify__min_samples_leaf': Real(0.01, 0.9),
            'classify__max_features': Real(0.01, 0.9),
            'classify__max_leaf_nodes': Integer(2, 4000),
            'classify__min_impurity_decrease': Real(0.0, 1.0),
            'classify__ccp_alpha': Real(0.01, 0.9),
            'pca__n_components': Integer(1, _N_DIGIT_FEATURES),
            'scaler__with_mean': [True, False],
            'scaler__with_std': [True, False]
        }
    },
    'RandomForestClassifier': {
        'bayesian': {
            'classify__n_estimators': Integer(10, 2000),
            'classify__criterion': Categorical(['gini', 'entropy', 'log_loss']),
            'classify__max_depth': Integer(1, 1000),
            'classify__min_samples_split': Real(0.01, 0.9),
            'classify__min_samples_leaf': Real(0.01, 0.9),
            'classify__max_features': Real(0.01, 0.9),
            'classify__max_leaf_nodes': Integer(2, 4000),
            'classify__min_impurity_decrease': Real(0.0, 1.0),
            'classify__oob_score': Categorical([True, False]),
            'classify__warm_start': Categorical([True, False]),
            'classify__max_samples': Real(0.01, 0.9),
            'pca__n_components': Integer(1, _N_DIGIT_FEATURES),
            'scaler__with_mean': [True, False],
            'scaler__with_std': [True, False]
        }
    }
}

# Load the digits data and hold out 30% of it as a test set
digits = load_digits()

X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.3,
    random_state=42,
)

# Build the tuner and run the Bayesian search for every configured classifier
tuner = ModelTuner(classifier_param_spaces, scoring_criteria)
tuner.tune_all_classifiers(
    search_type='bayesian',
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

# Print the classification report for one of the tuned models
tuner.evaluate_model(model_name='DecisionTreeClassifier', X_test=X_test, y_test=y_test)


## OLD REG/CLF CODE FROM SUBMISSION

### CLASSIFICATION

In [None]:
'''

TODO: Build a class, model_tuner(), to run BayesianSearch optimization to test predefined param search criteria for each model
        --> return printed evaluation and scoring report, plot confusion matrix for all models

'''

# Define scoring criteria
scoring_criteria = {
    'classification': ['accuracy', 'f1_macro', 'balanced_accuracy', 'precision_macro', 'recall_macro', 'average_precision']
}

# Classifier parameter spaces for BayesSearchCV, keyed by estimator class name.
# Every estimator key uses the 'classify__' prefix because the pipeline built
# in tune_classifier names its final step 'classify'; a different prefix
# (e.g. 'model__') does not address any step of that pipeline.
classifier_param_spaces = {
    'DecisionTreeClassifier': {
        'bayesian': {
            'classify__criterion': Categorical(['gini', 'entropy', 'log_loss']),
            'classify__splitter': Categorical(['best', 'random']),
            'classify__max_depth': Integer(1, 1000),
            'classify__min_samples_split': Real(0.01, 0.9),
            'classify__min_samples_leaf': Real(0.01, 0.9),
            'classify__max_features': Real(0.01, 0.9),
            'classify__max_leaf_nodes': Integer(2, 4000),
            'classify__min_impurity_decrease': Real(0.0, 1.0),
            'classify__ccp_alpha': Real(0.01, 0.9),
            'pca__n_components': Integer(1, len(digits.data[0])),
            'scaler__with_mean': [True, False],
            'scaler__with_std': [True, False]
        }
    },

    'RandomForestClassifier': {
        'bayesian': {
            'classify__n_estimators': Integer(10, 2000),
            'classify__criterion': Categorical(['gini', 'entropy', 'log_loss']),
            'classify__max_depth': Integer(1, 1000),
            'classify__min_samples_split': Real(0.01, 0.9),
            'classify__min_samples_leaf': Real(0.01, 0.9),
            'classify__max_features': Real(0.01, 0.9),
            # max_leaf_nodes must be at least 2, so the lower bound cannot be 1.
            'classify__max_leaf_nodes': Integer(2, 2000),
            'classify__min_impurity_decrease': Real(0.01, 0.9),
            'classify__oob_score': Categorical([True, False]),
            'classify__warm_start': Categorical([True, False]),
            'classify__max_samples': Real(0.01, 0.9),
            'pca__n_components': Integer(1, len(digits.data[0])),
            'scaler__with_mean': [True, False],
            'scaler__with_std': [True, False]
        }
    }
}

def tune_classifier(search_type, model_name, model, param_space, X_train, y_train, X_test, y_test):
    """Tune a classifier pipeline, refitting once per classification metric.

    The scoring metrics are read from the module-level
    ``scoring_criteria['classification']`` list.

    Args:
        search_type: search strategy; only ``'bayesian'`` is implemented.
        model_name: label used in the printed progress messages.
        model: estimator class (not an instance); it is instantiated here.
        param_space: search space passed to ``BayesSearchCV``.
        X_train, y_train: data used to fit the search.
        X_test, y_test: data used for the reported test score.

    Returns:
        The search object as left by the fit for the last scoring metric.

    Raises:
        ValueError: if ``search_type`` is not ``'bayesian'``.
    """
    # Reject unsupported strategies up front; previously they left the search
    # class and its parameters unbound and failed with an UnboundLocalError.
    if search_type != 'bayesian':
        raise ValueError(
            f"Unsupported search_type {search_type!r}; only 'bayesian' is implemented"
        )

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('classify', model())
    ])

    search = BayesSearchCV(
        estimator=pipe,
        search_spaces=param_space,  # BayesSearchCV uses search_spaces
        n_iter=50,  # Number of iterations for Bayesian Optimization
        cv=4,
        scoring='accuracy',  # Placeholder; replaced per metric below
        n_jobs=-1,
        random_state=42,
    )

    # Loop over scoring metrics for evaluation
    for scoring_metric in scoring_criteria['classification']:
        print(f'Tuning {model_name} with {search_type} search and scoring metric: {scoring_metric}')
        search.set_params(scoring=scoring_metric)
        search.fit(X_train, y_train)

        print(f'Evaluation Metric: {scoring_metric}')
        print(f"Best Score (Train): {search.best_score_}")
        print(f"Test Score: {search.score(X_test, y_test)}")
        print('-' * 80)

    return search

# Hold out 30% of the digits data as a test set
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.3,
    random_state=42,
)

# Fitted search objects, keyed by model name
tuned_classifiers = {}

# Tune every classifier that defines a Bayesian search space
for model_name, model_dict in classifier_param_spaces.items():
    if 'bayesian' not in model_dict:
        continue
    param_space = model_dict['bayesian']
    # The model name doubles as the class name looked up in the module globals
    search = tune_classifier(
        'bayesian', model_name, globals()[model_name], param_space,
        X_train, y_train, X_test, y_test,
    )
    tuned_classifiers[model_name] = search


### REGRESSION

In [None]:
'''

TODO: Build a class, model_tuner(), to run BayesianSearch optimization to test predefined param search criteria for each model
        --> return printed evaluation and scoring report, plot confusion matrix for all models

'''

# Scorer names evaluated for every regression model
scoring_criteria = dict(
    regression=[
        'neg_mean_squared_error', 'r2', 'neg_mean_absolute_error', 'neg_mean_squared_log_error',
        'neg_median_absolute_error', 'max_error', 'explained_variance', 'neg_root_mean_squared_error'
    ],
)

# Search spaces per regressor class name and search type. The keyword names
# are the pipeline parameter keys ('regress__', 'pca__' and 'scaler__' steps).
regression_param_spaces = dict(
    DecisionTreeRegressor=dict(
        bayesian=dict(
            regress__criterion=['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
            regress__splitter=['best', 'random'],
            regress__max_depth=Integer(2, 101),
            regress__min_samples_split=Real(0.01, 0.89),
            regress__min_samples_leaf=Real(0.01, 0.89),
            regress__max_features=Real(0.01, 0.49),
            regress__ccp_alpha=Real(0.01, 0.89),
            pca__n_components=Integer(1, len(modelData.columns)),
            scaler__with_mean=[True, False],
            scaler__with_std=[True, False],
        ),
    ),
    RandomForestRegressor=dict(
        bayesian=dict(
            regress__criterion=['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
            regress__n_estimators=Integer(50, 501),
            regress__max_depth=Integer(2, 101),
            regress__min_samples_split=Real(0.01, 0.89),
            regress__min_samples_leaf=Real(0.01, 0.89),
            regress__max_features=Real(0.01, 0.89),
            regress__ccp_alpha=Real(0.01, 0.89),
            regress__max_samples=Real(0.01, 0.89),
            pca__n_components=Integer(1, len(modelData.columns)),
            scaler__with_mean=[True, False],
            scaler__with_std=[True, False],
        ),
    ),
)

def tune_classifier(search_type, model_name, model, param_space, X_train, y_train, X_test, y_test):
    """Tune a regressor pipeline, refitting once per regression metric.

    Despite its name, this variant builds a scaler -> PCA -> 'regress'
    pipeline and reads its metrics from the module-level
    ``scoring_criteria['regression']`` list.

    Args:
        search_type: search strategy; only ``'bayesian'`` is implemented.
        model_name: label used in the printed progress messages.
        model: estimator class (not an instance); it is instantiated here.
        param_space: search space passed to ``BayesSearchCV``.
        X_train, y_train: data used to fit the search.
        X_test, y_test: data used for the reported test score.

    Returns:
        The search object as left by the fit for the last scoring metric.

    Raises:
        ValueError: if ``search_type`` is not ``'bayesian'``.
    """
    # Reject unsupported strategies up front; previously they left the search
    # class and its parameters unbound and failed with an UnboundLocalError.
    if search_type != 'bayesian':
        raise ValueError(
            f"Unsupported search_type {search_type!r}; only 'bayesian' is implemented"
        )

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('regress', model())
    ])

    search = BayesSearchCV(
        estimator=pipe,
        search_spaces=param_space,  # BayesSearchCV uses search_spaces
        n_iter=50,  # Number of iterations for Bayesian Optimization
        cv=4,
        # No classification metric as a starting value: the scorer is always
        # set from the regression list before each fit.
        scoring=None,
        n_jobs=-1,
        random_state=42,
    )

    # Loop over scoring metrics for evaluation
    for scoring_metric in scoring_criteria['regression']:
        print(f'Tuning {model_name} with {search_type} search and scoring metric: {scoring_metric}')
        search.set_params(scoring=scoring_metric)
        search.fit(X_train, y_train)

        print(f'Evaluation Metric: {scoring_metric}')
        print(f"Best Score (Train): {search.best_score_}")
        print(f"Test Score: {search.score(X_test, y_test)}")
        print('-' * 80)

    return search

# Hold out 30% of the cleaned housing data as a test set
X_train, X_test, y_train, y_test = train_test_split(
    modelData,
    cleanTarget,
    test_size=0.3,
    random_state=42,
)

# Fitted search objects, keyed by model name
tuned_classifiers = {}

# Tune every regressor that defines a Bayesian search space
for model_name, model_dict in regression_param_spaces.items():
    if 'bayesian' not in model_dict:
        continue
    param_space = model_dict['bayesian']
    # The model name doubles as the class name looked up in the module globals
    search = tune_classifier(
        'bayesian', model_name, globals()[model_name], param_space,
        X_train, y_train, X_test, y_test,
    )
    tuned_classifiers[model_name] = search


## ATTEMPTED COMBINED TUNER (DOESN'T WORK)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, mean_squared_error, r2_score
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from scipy.stats import zscore
import warnings

# Silence every warning category for the rest of the session
warnings.simplefilter("ignore")

class ModelTuner:
    """Tune classifiers and regressors inside a scaler -> PCA -> model pipeline.

    Attributes:
        models_param_spaces: mapping ``model name -> {search type -> search space}``.
        scoring_criteria: mapping ``task type -> list of scorer names``.
        tuned_models: mapping ``model name -> fitted search object``,
            filled in by ``tune_model``.
    """

    def __init__(self, models_param_spaces, scoring_criteria):
        self.models_param_spaces = models_param_spaces
        self.scoring_criteria = scoring_criteria
        self.tuned_models = {}

    def tune_model(self, task_type, search_type, model_name, model, param_space, X_train, y_train, X_test, y_test):
        """Tune one model, refitting the search once per scoring metric.

        Args:
            task_type: key into ``scoring_criteria``; ``'classification'`` and
                ``'regression'`` also select which plot is drawn.
            search_type: search strategy; only ``'bayesian'`` is implemented.
            model_name: key under which the fitted search is stored.
            model: estimator class (not an instance); it is instantiated here.
            param_space: search space passed to ``BayesSearchCV``.
            X_train, y_train: data used to fit the search.
            X_test, y_test: data used for the reported test score and plots.

        Returns:
            The search object as left by the fit for the last scoring metric.

        Raises:
            ValueError: if ``search_type`` is not ``'bayesian'``.
        """
        # Fail early with a clear message; previously an unsupported type left
        # `search` unbound and surfaced as an UnboundLocalError further down.
        if search_type != 'bayesian':
            raise ValueError(
                f"Unsupported search_type {search_type!r}; only 'bayesian' is implemented"
            )

        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA()),
            ('model', model())
        ])

        search = BayesSearchCV(
            estimator=pipe,
            search_spaces=param_space,
            n_iter=50,
            cv=4,
            scoring=None,  # Set per metric in the loop below
            n_jobs=-1,
            random_state=42
        )

        # Loop over scoring metrics
        for scoring_metric in self.scoring_criteria[task_type]:
            print(f"Tuning {model_name} ({task_type}) with {search_type} search and scoring metric: {scoring_metric}")
            search.set_params(scoring=scoring_metric)
            search.fit(X_train, y_train)

            print(f"Best Params: {search.best_params_}")
            print(f"Best Score (Train): {search.best_score_}")
            test_score = search.score(X_test, y_test)
            print(f"Test Score: {test_score}")
            print("-" * 80)

            # A confusion matrix is only meaningful for class labels; drawing
            # it for continuous regression targets raises inside scikit-learn.
            if task_type == 'classification':
                self.plot_confusion_matrix(search.best_estimator_, X_test, y_test)
            elif task_type == 'regression':
                self.plot_regression_results(search.best_estimator_, X_test, y_test)

        # Save the tuned model (state after the last scoring metric)
        self.tuned_models[model_name] = search

        return search

    @staticmethod
    def _matches_task(model, task_type):
        """Return whether the estimator class ``model`` fits ``task_type``."""
        # Imported locally so the class does not depend on a file-level import.
        from sklearn.base import is_classifier, is_regressor

        estimator = model()
        if task_type == 'classification':
            return is_classifier(estimator)
        if task_type == 'regression':
            return is_regressor(estimator)
        # Unknown task types are passed through so tune_model reports them.
        return True

    def tune_all_models(self, task_type, search_type, X_train, y_train, X_test, y_test):
        """Tune every configured model that matches ``task_type``.

        Models without a space for ``search_type`` are skipped, as are models
        of the other task type: the parameter-space mapping holds classifiers
        and regressors side by side, and fitting a regressor with
        classification scorers (or the reverse) fails.
        """
        for model_name, model_dict in self.models_param_spaces.items():
            if search_type not in model_dict:
                continue
            # Model names are resolved to classes through the module globals.
            model = globals()[model_name]
            if not self._matches_task(model, task_type):
                continue
            param_space = model_dict[search_type]
            self.tune_model(task_type, search_type, model_name, model, param_space, X_train, y_train, X_test, y_test)

    def evaluate_model(self, model_name, task_type, X_test, y_test):
        """Print evaluation results for a previously tuned model.

        Prints a classification report for ``'classification'`` and MSE / R²
        for ``'regression'``; other task types print nothing.

        Raises:
            ValueError: if ``model_name`` has not been tuned yet.
        """
        search = self.tuned_models.get(model_name)
        # Compare against None explicitly instead of relying on truthiness.
        if search is None:
            raise ValueError(f"Model {model_name} not tuned yet!")

        y_pred = search.best_estimator_.predict(X_test)

        if task_type == 'classification':
            print(f"Classification Report for {model_name}:\n{classification_report(y_test, y_pred)}")
        elif task_type == 'regression':
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            print(f"Evaluation Metrics for {model_name}:\nMean Squared Error: {mse:.4f}\nR² Score: {r2:.4f}")

    def plot_confusion_matrix(self, best_estimator, X_test, y_test):
        """Plot confusion matrix for classification."""
        y_pred = best_estimator.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot(cmap=plt.cm.Blues)
        plt.title("Confusion Matrix")
        plt.show()

    def plot_regression_results(self, best_estimator, X_test, y_test):
        """Plot predicted against actual values with a y = x reference line."""
        # Accept plain lists as well as arrays/Series: lists have no
        # .min()/.max(), which the reference line needs.
        y_true = np.asarray(y_test)
        y_pred = best_estimator.predict(X_test)
        lower, upper = y_true.min(), y_true.max()
        plt.scatter(y_true, y_pred, alpha=0.7, edgecolors='k')
        plt.plot([lower, upper], [lower, upper], 'r--', lw=2)
        plt.xlabel("Actual")
        plt.ylabel("Predicted")
        plt.title("Regression Results")
        plt.show()


# Scoring criteria, keyed by task type
scoring_criteria = {
    'classification': ['accuracy', 'f1_macro', 'balanced_accuracy', 'precision_macro', 'recall_macro', 'average_precision'],
    'regression': [
        'neg_mean_squared_error', 'r2', 'neg_mean_absolute_error', 'neg_median_absolute_error', 
        'max_error', 'explained_variance', 'neg_root_mean_squared_error'
    ]
}

# Classifier and regressor parameter spaces, keyed by estimator class name and
# then by search type. The 'model__' / 'pca__' / 'scaler__' prefixes address
# the pipeline steps of the same names.
models_param_spaces = {
    'DecisionTreeClassifier': {
        'bayesian': {
            'model__criterion': Categorical(['gini', 'entropy', 'log_loss']),
            'model__splitter': Categorical(['best', 'random']),
            'model__max_depth': Integer(1, 1000),
            'model__min_samples_split': Real(0.01, 0.9),
            'model__min_samples_leaf': Real(0.01, 0.9),
            'model__min_weight_fraction_leaf': Real(0.0,0.5),
            'model__max_features': Real(0.01,0.9),
            'model__max_leaf_nodes': Integer(2, 4000), 
            'model__min_impurity_decrease': Real(0.0, 1.0),
            'model__ccp_alpha': Real(0.01, 0.9),
            'pca__n_components': Integer(1, len(digits.data[0])),
            'scaler__with_mean': [True, False],
            'scaler__with_std': [True, False],
        }
    },
    'RandomForestClassifier': {
        'bayesian': {
            'model__n_estimators': Integer(10, 2000),
            'model__criterion': Categorical(['gini', 'entropy', 'log_loss']),
            'model__max_depth': Integer(1, 1000), 
            'model__min_samples_split': Real(0.01, 0.9), 
            'model__min_samples_leaf': Real(0.01, 0.9),
            'model__min_weight_fraction_leaf': Real(0.01,0.5),
            # Was '_model_max_features', which is not a parameter of the pipeline.
            'model__max_features': Real(0.01,0.9),
            # max_leaf_nodes must be at least 2, so the lower bound cannot be 1.
            'model__max_leaf_nodes': Integer(2,2000),
            'model__min_impurity_decrease': Real(0.01,0.9),
            # 'classify__bootstrap': Categorical([True, False]),
            'model__oob_score': Categorical([True, False]),
            'model__warm_start': Categorical([True, False]),
            'model__max_samples':Real(0.01,0.9),
            'pca__n_components': Integer(1, len(digits.data[0])),
            'scaler__with_mean': [True, False],
            'scaler__with_std': [True, False],
        }
    },
    'DecisionTreeRegressor': {
        'bayesian': {
            'model__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
            'model__splitter': ['best', 'random'],
            'model__max_depth': Integer(2, 1000),
            'model__min_samples_split': Real(0.01, 0.9),
            'model__min_samples_leaf': Real(0.01, 0.9),
            'model__min_weight_fraction_leaf': Real(0.0, 0.5),
            'model__max_features': Real(0.01, 0.5),
            'model__max_leaf_nodes': Integer(2, 1000),
            'model__min_impurity_decrease': Real(0.0, 0.9),
            'model__ccp_alpha': Real(0.01, 0.9),
            'pca__n_components': Integer(1, len(modelData.columns)),
            'scaler__with_mean': [True, False],
            'scaler__with_std': [True, False],
        }
    },
    'RandomForestRegressor': {
        'bayesian': {
            'model__n_estimators': Integer(50, 500),
            'model__criterion': Categorical(['squared_error', 'friedman_mse', 'absolute_error', 'poisson']),
            'model__max_depth': Integer(2, 1000), 
            'model__min_samples_split': Real(0.01, 0.9),
            'model__min_samples_leaf': Real(0.01, 0.9),
            'model__min_weight_fraction_leaf': Real(0.01, 0.5),
            'model__max_features': Real(0.01,0.9),
            # 'regress__max_features': Categorical(['sqrt', 'log2']), 
            'model__max_leaf_nodes': Integer(2,1000),
            'model__min_impurity_decrease': Real(0.01, 0.9),
            # 'regress__bootstrap': [True, False],
            'model__oob_score': [True, False],
            'model__warm_start': [True, False],
            'model__ccp_alpha': Real(0.01, 0.9),
            'model__max_samples': Real(0.01, 0.9),
            'pca__n_components': Integer(1, len(modelData.columns)),
            'scaler__with_mean': [True, False],
            'scaler__with_std': [True, False],
        }
    }
}

# Separate train/test splits per task: digits for classification, the cleaned
# housing data for regression.
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(digits.data, digits.target, test_size=0.3, random_state=42)

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(modelData, cleanTarget, test_size=0.3, random_state=42)

tuner = ModelTuner(models_param_spaces, scoring_criteria)
tuner.tune_all_models('classification', 'bayesian', X_train_clf, y_train_clf, X_test_clf, y_test_clf)
# The combined tuner exposes tune_all_models only; tune_all_classifiers
# belongs to the earlier classification-only prototype.
tuner.tune_all_models('regression', 'bayesian', X_train_reg, y_train_reg, X_test_reg, y_test_reg)

# Evaluate each model on the test split of its own task; evaluate_model takes
# the task type as its second argument.
tuner.evaluate_model('DecisionTreeClassifier', 'classification', X_test_clf, y_test_clf)
tuner.evaluate_model('RandomForestClassifier', 'classification', X_test_clf, y_test_clf)
tuner.evaluate_model('DecisionTreeRegressor', 'regression', X_test_reg, y_test_reg)
tuner.evaluate_model('RandomForestRegressor', 'regression', X_test_reg, y_test_reg)

