Wybrane zbiory danych: 
- Abalone https://www.openml.org/search?type=data&status=active&id=720
- ada_prior https://www.openml.org/search?type=data&status=active&id=1037
- spambase https://www.openml.org/search?type=data&status=active&id=44
- phoneme https://www.openml.org/search?type=data&status=active&id=1489

Wybrane algorytmy:
* GradientBoosting
* RandomForest
* Sieci neuronowe

W przypadku sieci neuronowych startową siatkę hiperparametrów zaczerpnięto z artykułu https://www.degruyter.com/document/doi/10.1515/comp-2020-0227/html

Utworzenie mapy nazw zbiorów danych na ich identyfikatory OpenML oraz siatek hiperparametrów

In [None]:
import numpy as np
import csv
import openml
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [1]:
# Short dataset name -> metadata dict; 'id' is the OpenML dataset id
# (the same ids appear in the openml.org URLs listed at the top of the notebook).
# The trailing size notes are presumably approximate instance counts.
data_sets = dict(
    abalone=dict(id=720),      # 4.17k
    ada_prior=dict(id=1037),   # 4.56k
    phoneme=dict(id=1489),     # 5.4k
    spambase=dict(id=44),      # 4.6k
)

# Candidate hyperparameter values per algorithm.
# Every key carries the 'estimator__' prefix because the model is the step named
# 'estimator' inside the Pipeline built by HPO, and the search tunes the pipeline.
search_spaces = {
    'random_forest' : {
        "estimator__n_estimators" : np.arange(1, 2000),
        "estimator__max_depth" : np.arange(1, 20),
        # Fractions of the training set drawn for each tree.
        "estimator__max_samples" : np.linspace(0.1, 1, num=100),
        "estimator__max_features" : ['sqrt', 'log2'],
        # Floats, i.e. interpreted as a fraction of the samples.
        "estimator__min_samples_split" :  np.linspace(0.1, 0.5, num=100),
    },
    'neural_network' : {
        # A single integer, i.e. one hidden layer with 10..499 units.
        "estimator__hidden_layer_sizes" : np.arange(10, 500),
        "estimator__activation" : ['relu', 'identity', 'logistic', 'tanh'],
        # Log-spaced between 2**-6 and 2**1.
        "estimator__learning_rate_init" : np.logspace(-6, 1, base=2.0, num=1000),
        "estimator__alpha" : np.logspace(-6, 1, base=2.0, num=1000),
    },
    'gradient_boosting' : {
        "estimator__n_estimators" : np.arange(1, 5000, step=50),
        # Log-spaced between 2**-10 and 2**0.
        "estimator__learning_rate" : np.logspace(-10, 0, base=2.0, num=10),
        "estimator__subsample" : np.linspace(0.1, 1, num=10),
        "estimator__loss" : ['log_loss', 'exponential'],
        "estimator__max_depth" : np.arange(1, 16, step=2),
        # Powers of two: 2, 4, ..., 128.
        "estimator__min_samples_split" : [2**(i+1) for i in range(7)],
        # 0.05, 0.10, ..., 1.0.  linspace instead of arange with a float step:
        # arange's length and last element depend on floating-point rounding, and
        # a last value slightly above 1.0 is an invalid max_features fraction.
        "estimator__max_features" : np.linspace(0.05, 1.0, num=20),
    }
}

In [2]:
class HPO():
    """Hyperparameter-optimisation harness for a single estimator.

    Wraps the estimator in a preprocessing pipeline, loads an OpenML dataset,
    runs a random search and/or a Bayesian search over ``search_space`` and
    writes the per-iteration results of each search to a CSV file.

    Attributes set by ``__init__``: ``pipeline``, ``n_iter``, ``search_space``,
    ``random_state``, ``test_size``, ``n_jobs``, ``estimator_name``.
    Attributes set by ``load_dataset``: ``dataset_id``, ``X_train``, ``X_test``,
    ``y_train``, ``y_test``.
    """

    def __init__(self, estimator, search_space, random_state=0, test_size=0.2, n_iter=10, n_jobs=1):
        """Build the preprocessing + estimator pipeline and store the settings.

        Args:
            estimator: Model placed in the pipeline step named ``'estimator'``.
            search_space: Mapping of pipeline parameter names
                (``'estimator__<param>'``) to candidate values.
            random_state: Seed used for the searches and the train/test split;
                also part of the output file name.
            test_size: Share of the data held out by ``load_dataset``.
            n_iter: Number of configurations each search evaluates.
            n_jobs: Parallelism passed to the search objects.
        """
        # Numeric columns: fill gaps with the column mean, then scale to [0, 1].
        num_pipeline = Pipeline(steps=[
            ('impute', SimpleImputer(strategy='mean')),
            ('scale', MinMaxScaler())
        ])

        # Categorical columns: fill gaps with the mode, then one-hot encode.
        cat_pipeline = Pipeline(steps=[
            ('impute', SimpleImputer(strategy='most_frequent')),
            ('one-hot', self._dense_one_hot_encoder())
        ])

        preprocessing = ColumnTransformer(transformers=[
            ('num_pipeline', num_pipeline, make_column_selector(dtype_include=np.number)),
            # OpenML dataframes store nominal features as pandas 'category'
            # columns; selecting only `object` would let remainder='drop'
            # silently discard them.
            ('cat_pipeline', cat_pipeline, make_column_selector(dtype_include=[np.object_, 'category']))
            ],
            remainder='drop',
            n_jobs=-1)

        self.pipeline = Pipeline(steps=[
            ('preprocessing', preprocessing),
            ('estimator', estimator)
        ])

        self.n_iter = n_iter
        self.search_space = search_space
        self.random_state = random_state
        self.test_size = test_size
        self.n_jobs = n_jobs
        self.estimator_name = estimator.__class__.__name__

    @staticmethod
    def _dense_one_hot_encoder():
        """Return a dense-output OneHotEncoder on old and new scikit-learn."""
        try:
            # Keyword used by scikit-learn >= 1.2 (the old one was removed in 1.4).
            return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            # Older scikit-learn only knows the original keyword.
            return OneHotEncoder(handle_unknown='ignore', sparse=False)

    def load_dataset(self, data_set):
        """Download an OpenML dataset and split it into train and test parts.

        Args:
            data_set: Dict with an ``'id'`` key holding the OpenML dataset id.
        """
        dataset = openml.datasets.get_dataset(data_set['id'])
        self.dataset_id = data_set['id']

        frame, _, _, _ = dataset.get_data(dataset_format="dataframe")

        # The last column is taken as the target, all others as features.
        # NOTE(review): assumes the target is the last column of every dataset
        # used here - confirm when adding new datasets.
        y = frame.iloc[:, -1]
        X = frame.iloc[:, :-1]

        # Seed the split so a given random_state reproduces the same experiment.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state)

    def save_to_file(self, type, results):
        """Write one CSV row per evaluated configuration.

        The file is named ``<type>-<dataset_id>-<estimator_name>-<random_state>.csv``
        and uses ``;`` as the delimiter.

        Args:
            type: Search label used as the file-name prefix.  (The name shadows
                the builtin; it is kept so existing keyword callers still work.)
            results: ``cv_results_``-like mapping providing ``'params'``,
                ``'mean_test_score'`` and ``'mean_fit_time'``.
        """
        csv_file_path = '{0}-{1}-{2}-{3}.csv'.format(type, self.dataset_id, self.estimator_name, self.random_state)

        # Union of all parameter names, sorted so the column order is the same
        # on every run (iterating a set gives an arbitrary order).
        param_names = sorted({name for params in results['params'] for name in params})
        fieldnames = ['Iteracja'] + param_names + ['Srednia dokladnosc', 'Czas']

        with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=';')
            writer.writeheader()

            rows = zip(results['params'], results['mean_test_score'], results['mean_fit_time'])
            for iteration, (params, mean_score, fit_time) in enumerate(rows, start=1):
                row_data = {'Iteracja': iteration}
                row_data.update(params)
                row_data['Srednia dokladnosc'] = mean_score
                row_data['Czas'] = fit_time
                writer.writerow(row_data)

        print(f"Wyniki zostały zapisane do pliku CSV: {csv_file_path}")

    def _run_search(self, search_cls, label):
        """Fit ``search_cls`` on the training split, report and store results.

        Prints the score of the refitted best pipeline on the test split, saves
        the per-iteration results under ``label`` and returns that score.
        """
        search = search_cls(self.pipeline, self.search_space, n_iter=self.n_iter,
                            random_state=self.random_state, n_jobs=self.n_jobs)
        search.fit(self.X_train, self.y_train)
        score = search.score(self.X_test, self.y_test)
        print(score)
        self.save_to_file(label, search.cv_results_)
        return score

    def run_random_search(self):
        """Run RandomizedSearchCV; results go to ``random_search-*.csv``.

        Returns:
            The test-split score of the best configuration found.
        """
        return self._run_search(RandomizedSearchCV, 'random_search')

    def run_bayes_search(self):
        """Run BayesSearchCV; results go to ``bayes_search-*.csv``.

        Returns:
            The test-split score of the best configuration found.
        """
        return self._run_search(BayesSearchCV, 'bayes_search')

In [3]:
def run_all(n_iter=500, n_jobs=-1):
    """Run random and Bayesian search for every algorithm on every dataset.

    For each algorithm one HPO object is built with the matching entry of
    ``search_spaces``; it is then reused for all entries of ``data_sets``.

    Args:
        n_iter: Number of configurations evaluated by each search. The default
            keeps the original full-size budget; pass a small value for a
            quick smoke run.
        n_jobs: Parallelism handed to the searches (-1 = all cores).
    """
    # (key into search_spaces, estimator instance to tune)
    algorithms = [
        ('random_forest', RandomForestClassifier()),
        ('neural_network', MLPClassifier()),
        ('gradient_boosting', GradientBoostingClassifier())
    ]

    for space_name, estimator in algorithms:
        hpo = HPO(estimator, search_spaces[space_name], n_iter=n_iter, n_jobs=n_jobs)
        for data_set in data_sets.values():
            hpo.load_dataset(data_set)
            hpo.run_random_search()
            hpo.run_bayes_search()


In [None]:
# Launch the whole experiment: every algorithm on every dataset, using both
# random search and Bayesian search.
run_all()