Wybrane zbiory danych: 
- spambase https://www.openml.org/search?type=data&status=active&sort=runs&qualities.NumberOfInstances=between_1000_10000&id=44 
- phoneme https://www.openml.org/search?type=data&status=active&id=1489
- tic-tac-toe https://www.openml.org/search?type=data&sort=runs&status=active&id=50 
- SpeedDating https://www.openml.org/search?type=data&sort=runs&status=active&qualities.NumberOfClasses=%3D_2&id=40536

Wybrane algorytmy:
* GradientBoosting
* RandomForest
* Sieci neuronowe

W przypadku sieci neuronowych startową siatkę hiperparametrów zaczerpnięto z artykułu https://www.degruyter.com/document/doi/10.1515/comp-2020-0227/html

TODO: 
1. Sprawdzić dane - done
2. Uporządkować metody w ramach jednej klasy - done
3. Zapisywanie wyników - done
4. Wybranie siatek hiperparamterów
5. Obliczanie tuningu 
6. Wykresiki i docs 

In [3]:
import numpy as np

data_sets = {
    'sick' : {
        'id' : 38,
        'label_y' : 'class'
    },
    'speeddating' : {
        'id' : 40536,
        'label_y' : 'match'
    },
    'phoneme' : {
        'id' : 1489,
        'label_y' : 'class'
    },
    'spambase' : {
        'id' : 44,
        'label_y' : 'class'
    },
}

search_spaces = {
    'random_forest' : {
        "estimator__n_estimators " : np.arange(1, 2000),
        "estimator__max_depth" : np.arange(1, 100),
        "estimator__bootstrap" : [True, False],
        "estimator__max_samples" : np.arange(0.1, 1, 0.05),
        "estimator__max_features" : np.arange(0.1, 1, 0.05),
        "estimator__min_samples_split" :  np.arange(0.1, 1, 0.05),
    },
    'neural_network' : {
        "estimator__hidden_layer_sizes" : np.arange(10, 500),
        "estimator__activation" : ['relu', 'identity', 'logistic', 'tanh'],
        "estimator__learning_rate_init" : np.exp(np.arange(-6, 1, 0.1)),
        "estimator__alpha" : np.exp(np.arange(-6, 1, 0.1)),
    },
    'gradient_boosting' : {
        "estimator__n_estimators" : np.arange(1, 5000),
        "estimator__learning_rate" : np.arange(2**-10, 2**0),
        "estimator__subsample" : np.arange(0.1, 1, 0.05),
        "estimator__loss" : ['log_loss', 'exponential'],
        "estimator__max_depth" : np.arange(1, 15),
        "estimator__min_samples_split" : np.arange(2**0, 2**7),
        "estimator__max_features" : np.arange(0, 1, 0.05),
        "estimator__reg_lambda" : np.arange(2**-10, 2**10),
        "estimator__reg_alpha" : np.arange(2**-10, 2**10),
    }
}

In [4]:
import csv
import openml
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV

class HPO():
    def __init__(self, estimator, search_space, random_state=0, test_size=0.2, n_iter=10): 
        num_pipeline = Pipeline(steps=[
            ('impute', SimpleImputer(strategy='mean')),
            ('scale', MinMaxScaler())
        ])

        cat_pipeline = Pipeline(steps=[
            ('impute', SimpleImputer(strategy='most_frequent')),
            ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
        ])

        preprocessing = ColumnTransformer(transformers=[
            ('num_pipeline',num_pipeline, make_column_selector(dtype_include=np.number)),
            ('cat_pipeline',cat_pipeline, make_column_selector(dtype_include=np.object_))
            ],
            remainder='drop',
            n_jobs=-1)

        self.pipeline = Pipeline(steps=[
            ('preprocessing', preprocessing),
            ('estimator', estimator)
        ])

        self.n_iter = n_iter
        self.search_space = search_space
        self.random_state = random_state
        self.test_size = test_size
        self.estimator_name = estimator.__class__.__name__

    def load_dataset(self, data_set):
        dataset = openml.datasets.get_dataset(data_set['id'])
        self.dataset_id = data_set['id']

        X, _, _, _ = dataset.get_data(dataset_format="dataframe")

        y = X.iloc[:,-1]
        X = X.iloc[:,:-1]

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=self.test_size)

    def save_to_file(self, type, results):
        csv_file_path = '{0}-{1}-{2}-{3}.csv'.format(type, self.dataset_id, self.estimator_name, self.random_state)

        with open(csv_file_path, 'w', newline='') as csvfile:
            fieldnames = ['Iteracja', 'Parametry', 'Średnia dokładność', 'Czas']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            writer.writeheader()

            for i in range(len(results['params'])):
                writer.writerow({
                    'Iteracja': i + 1,
                    'Parametry': str(results['params'][i]),
                    'Średnia dokładność': results['mean_test_score'][i],
                    'Czas': results['mean_fit_time'][i]
                })

        print(f"Wyniki zostały zapisane do pliku CSV: {csv_file_path}")

    def run_random_search(self):
        rs = RandomizedSearchCV(self.pipeline, self.search_space, n_iter=self.n_iter, random_state=self.random_state)
        rs.fit(self.X_train, self.y_train)
        score = rs.score(self.X_test, self.y_test)
        print(score)
        self.save_to_file('random_search', rs.cv_results_)


    def run_bayes_search(self):
        bs = BayesSearchCV(self.pipeline, self.search_space, n_iter=self.n_iter, random_state=self.random_state)
        bs.fit(self.X_train, self.y_train)
        score = bs.score(self.X_test, self.y_test)
        print(score)
        self.save_to_file('bayes_search', bs.cv_results_)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier

def run_all():
    algorithms = [
        ('random_forest', RandomForestClassifier()),
        ('neural_network', MLPClassifier()),
        ('gradient_boosting', GradientBoostingClassifier())
    ]


    for algorithm in algorithms:
        hpo = HPO(algorithm[1], search_spaces[algorithm[0]])
        for data_set in data_sets:
            hpo.load_dataset(data_sets[data_set])
            hpo.run_random_search()
            hpo.run_bayes_search()


In [5]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier

hpo = HPO(MLPClassifier(), search_spaces['neural_network'], n_iter = 10)
hpo.load_dataset(data_sets['sick'])
hpo.run_random_search()

0.9483443708609272
Wyniki zostały zapisane do pliku CSV: random_search-38-MLPClassifier-0.csv
