Wybrane zbiory danych: 
- Abalone https://www.openml.org/search?type=data&status=active&id=720
- ada_prior https://www.openml.org/search?type=data&status=active&id=1037
- spambase https://www.openml.org/search?type=data&status=active&id=44
- phoneme https://www.openml.org/search?type=data&status=active&id=1489

Wybrane algorytmy:
* GradientBoosting
* RandomForest
* Sieci neuronowe

W przypadku sieci neuronowych startową siatkę hiperparametrów zaczerpnięto z artykułu https://www.degruyter.com/document/doi/10.1515/comp-2020-0227/html

TODO: 
1. Sprawdzić dane - done
2. Uporządkować metody w ramach jednej klasy - done
3. Zapisywanie wyników - done
4. Wybranie siatek hiperparametrów - done
5. Obliczanie tuningu 
6. Wykresiki i docs 

In [1]:
import numpy as np

# Registry of the OpenML data sets used in the experiments, keyed by a short
# name. Each entry stores the OpenML data set id and the configured label
# column name.
# NOTE(review): HPO.load_dataset reads only 'id' and takes the last column as
# the target, so 'label_y' is currently informational.
data_sets = {
    name: {'id': openml_id, 'label_y': label_column}
    for name, openml_id, label_column in (
        ('abalone', 720, 'target'),     # 4.17k (presumably number of rows)
        ('ada_prior', 1037, 'target'),  # 4.56k
        ('phoneme', 1489, 'class'),     # 5.4k
        ('spambase', 44, 'class'),      # 4.6k
    )
}

# Candidate hyperparameter values per algorithm. Every key carries the
# `estimator__` prefix so that it addresses the pipeline step named
# 'estimator' built in HPO.__init__.
search_spaces = {
    'random_forest': {
        # 1 .. 1999 trees
        "estimator__n_estimators": np.arange(1, 2000),
        "estimator__max_depth": np.arange(1, 20),
        # fraction of samples drawn per tree
        "estimator__max_samples": np.linspace(0.1, 1, num=100),
        "estimator__max_features": ['sqrt', 'log2'],
        # expressed as a fraction of the training samples
        "estimator__min_samples_split": np.linspace(0.1, 0.5, num=100),
    },
    'neural_network': {
        # a single integer, i.e. one hidden layer of 10 .. 499 units
        "estimator__hidden_layer_sizes": np.arange(10, 500),
        "estimator__activation": ['relu', 'identity', 'logistic', 'tanh'],
        # log-spaced between 2**-6 and 2**1
        "estimator__learning_rate_init": np.logspace(-6, 1, base=2.0, num=1000),
        "estimator__alpha": np.logspace(-6, 1, base=2.0, num=1000),
    },
    'gradient_boosting': {
        "estimator__n_estimators": np.arange(1, 5000, step=50),
        # log-spaced between 2**-10 and 2**0
        "estimator__learning_rate": np.logspace(-10, 0, base=2.0, num=10),
        "estimator__subsample": np.linspace(0.1, 1, num=10),
        "estimator__loss": ['log_loss', 'exponential'],
        "estimator__max_depth": np.arange(1, 16, step=2),
        # 2, 4, ..., 128
        "estimator__min_samples_split": [2 ** exponent for exponent in range(1, 8)],
        # 0.05, 0.10, ..., 1.00. linspace pins the endpoint at exactly 1.0,
        # whereas a float-step arange leaves the last value (and even the
        # element count) to rounding; a fraction above 1.0 is not a valid
        # max_features value.
        "estimator__max_features": np.linspace(0.05, 1.0, num=20),
    }
}

In [2]:
import csv
import openml
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV

class HPO():
    """Hyperparameter-search harness for a single estimator.

    Wraps the estimator in a preprocessing pipeline, loads an OpenML data
    set into a train/test split, runs a randomized or Bayesian search over
    ``search_space`` and writes the per-iteration results to a CSV file.
    """

    def __init__(self, estimator, search_space, random_state=0, test_size=0.2, n_iter=10, n_jobs=1):
        """Build the preprocessing + estimator pipeline and store the settings.

        Args:
            estimator: Estimator placed in the final pipeline step, which is
                named 'estimator' (search-space keys must use that prefix).
            search_space: Mapping passed unchanged to the search objects.
            random_state: Seed used for the searches and the train/test split;
                it is also part of the result file name.
            test_size: Passed to ``train_test_split``.
            n_iter: Number of search iterations.
            n_jobs: Parallelism passed to the search objects.
        """
        num_pipeline = Pipeline(steps=[
            ('impute', SimpleImputer(strategy='mean')),
            ('scale', MinMaxScaler())
        ])

        cat_pipeline = Pipeline(steps=[
            ('impute', SimpleImputer(strategy='most_frequent')),
            ('one-hot', self._make_one_hot_encoder())
        ])

        preprocessing = ColumnTransformer(transformers=[
            ('num_pipeline', num_pipeline, make_column_selector(dtype_include=np.number)),
            # Select pandas 'category' columns as well as object columns;
            # with remainder='drop' anything matched by neither selector is
            # silently discarded.
            ('cat_pipeline', cat_pipeline,
             make_column_selector(dtype_include=[np.object_, 'category']))
            ],
            remainder='drop',
            n_jobs=-1)

        self.pipeline = Pipeline(steps=[
            ('preprocessing', preprocessing),
            ('estimator', estimator)
        ])

        self.n_iter = n_iter
        self.search_space = search_space
        self.random_state = random_state
        self.test_size = test_size
        self.n_jobs = n_jobs
        self.estimator_name = estimator.__class__.__name__

    @staticmethod
    def _make_one_hot_encoder():
        """Return a dense-output OneHotEncoder that ignores unknown categories.

        Newer scikit-learn releases name the dense/sparse switch
        ``sparse_output``; older ones only know ``sparse``. Try the new
        spelling first and fall back when it is rejected.
        """
        try:
            return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            return OneHotEncoder(handle_unknown='ignore', sparse=False)

    def load_dataset(self, data_set):
        """Download an OpenML data set and split it into train and test parts.

        Args:
            data_set: Mapping with at least an 'id' key (OpenML data set id).

        The last column of the downloaded frame is used as the target and all
        preceding columns as features. Sets ``dataset_id``, ``X_train``,
        ``X_test``, ``y_train`` and ``y_test`` on the instance.
        """
        dataset = openml.datasets.get_dataset(data_set['id'])
        self.dataset_id = data_set['id']

        frame, _, _, _ = dataset.get_data(dataset_format="dataframe")

        features = frame.iloc[:, :-1]
        target = frame.iloc[:, -1]

        # Seed the split so that repeated runs with the same random_state
        # evaluate on the same hold-out rows.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            features, target, test_size=self.test_size, random_state=self.random_state)

    def save_to_file(self, type, results):
        """Write search results to '<type>-<dataset_id>-<estimator>-<seed>.csv'.

        Args:
            type: Label of the search method, used as the file-name prefix.
                (The name shadows the builtin but is kept for compatibility.)
            results: Mapping with 'params' (list of parameter dicts),
                'mean_test_score' and 'mean_fit_time' sequences of equal
                length, e.g. the ``cv_results_`` of a fitted search.

        The file is ';'-delimited with one row per iteration: the 1-based
        iteration number, one column per parameter name, then the mean test
        score and the mean fit time.
        """
        csv_file_path = '{0}-{1}-{2}-{3}.csv'.format(type, self.dataset_id, self.estimator_name, self.random_state)

        all_param_names = set()
        for params_dict in results['params']:
            all_param_names.update(params_dict.keys())

        # Sort the parameter columns: set iteration order is not stable
        # between interpreter runs, which would shuffle the CSV layout.
        fieldnames = ['Iteracja'] + sorted(all_param_names) + ['Srednia dokladnosc', 'Czas']

        with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=';')
            writer.writeheader()

            rows = zip(results['params'], results['mean_test_score'], results['mean_fit_time'])
            for iteration, (params, mean_score, fit_time) in enumerate(rows, start=1):
                row_data = {'Iteracja': iteration}
                row_data.update(params)
                row_data['Srednia dokladnosc'] = mean_score
                row_data['Czas'] = fit_time
                writer.writerow(row_data)

        print(f"Wyniki zostały zapisane do pliku CSV: {csv_file_path}")

    def _run_search(self, search, label):
        """Fit ``search`` on the training split, print its test score, save results."""
        search.fit(self.X_train, self.y_train)
        score = search.score(self.X_test, self.y_test)
        print(score)
        self.save_to_file(label, search.cv_results_)

    def run_random_search(self):
        """Run RandomizedSearchCV over the pipeline and save 'random_search' results."""
        rs = RandomizedSearchCV(self.pipeline, self.search_space, n_iter=self.n_iter,
                                random_state=self.random_state, n_jobs=self.n_jobs)
        self._run_search(rs, 'random_search')

    def run_bayes_search(self):
        """Run BayesSearchCV over the pipeline and save 'bayes_search' results."""
        bs = BayesSearchCV(self.pipeline, self.search_space, n_iter=self.n_iter,
                           random_state=self.random_state, n_jobs=self.n_jobs)
        self._run_search(bs, 'bayes_search')

In [57]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier

def run_all():
    """Run both search strategies for every algorithm on every data set.

    For each algorithm one HPO instance is created (500 iterations, all
    cores) with the matching entry of ``search_spaces``; it is then reused
    for each entry of ``data_sets``, running the random search first and the
    Bayesian search second.
    """
    estimators_by_space = {
        'random_forest': RandomForestClassifier(),
        'neural_network': MLPClassifier(),
        'gradient_boosting': GradientBoostingClassifier(),
    }

    for space_name, estimator in estimators_by_space.items():
        hpo = HPO(estimator, search_spaces[space_name], n_iter=500, n_jobs=-1)
        for data_set_config in data_sets.values():
            hpo.load_dataset(data_set_config)
            hpo.run_random_search()
            hpo.run_bayes_search()


In [None]:
# Launch the full sweep: every algorithm on every data set, with both the
# random and the Bayesian search (see run_all).
run_all()

In [3]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# One-off run: Bayesian search only, gradient boosting on the spambase data
# set, 500 iterations on all cores.
hpo = HPO(GradientBoostingClassifier(), search_spaces['gradient_boosting'], n_iter = 500, n_jobs=-1)
hpo.load_dataset(data_sets['spambase'])
hpo.run_bayes_search()

In [32]:
import glob
import os
import pandas as pd

# Estimator class names as they appear in the result file names.
algorithms = [
    "GradientBoostingClassifier",
    "MLPClassifier",
    "RandomForestClassifier"
]

# Search-method prefixes of the result file names.
optimalization_method = [
    "bayes_search",
    "random_search"
]

data_folder = 'data'

# data[algorithm][method] -> list of result frames, one per matching CSV file.
data = {}

for algorithm in algorithms:
    data[algorithm] = {}
    for method in optimalization_method:
        # Files are named '<method>-<dataset id>-<algorithm>-<seed>.csv';
        # only seed 0 is loaded.
        file_pattern = f'{method}-*-{algorithm}-0.csv'
        # glob returns paths in arbitrary order. Sorting makes position i
        # stable across re-runs and, when both methods have files for the
        # same data set ids, makes it refer to the same data set for both.
        file_paths = sorted(glob.glob(os.path.join(data_folder, file_pattern)))
        # Drop the last column of each file (the timing column as written by
        # HPO.save_to_file) so the score column ends up last.
        data[algorithm][method] = [
            pd.read_csv(file_path, sep=';').iloc[:, :-1]
            for file_path in file_paths
        ]


In [35]:
import pandas as pd
# For every algorithm: the row index of the random-search configuration with
# the highest score averaged over all loaded data sets. This relies on row i
# describing the same configuration in every random-search file.
best_configuration_index_per_algorithm = {}

for algorithm, results_by_method in data.items():
    # Use a dedicated name here; rebinding `data_sets` would overwrite the
    # data set registry defined at the top of the notebook.
    random_search_frames = results_by_method['random_search']
    if not random_search_frames:
        # No result files were loaded for this algorithm; nothing to rank.
        continue

    # One score column per data set, aligned on the row index. Selecting by
    # name keeps this correct even after extra columns are appended to the
    # frames.
    scores_per_data_set = pd.concat(
        [frame['Srednia dokladnosc'] for frame in random_search_frames],
        axis=1,
    )
    best_configuration_index_per_algorithm[algorithm] = scores_per_data_set.mean(axis=1).idxmax()

In [None]:
# Add a 'Differents' column to every result frame: the score of the reference
# row (best average random-search configuration) minus each row's score, so a
# negative value means the row beat the reference.
# NOTE(review): the reference index comes from the random-search files but is
# also applied to the Bayes-search frames, where the same row number is not
# necessarily the same configuration - confirm this is intended.
for algorithm, results_by_method in data.items():
    # `frames` rather than `data_sets`, so the data set registry defined at
    # the top of the notebook is not overwritten.
    for method, frames in results_by_method.items():
        for frame in frames:
            scores = frame['Srednia dokladnosc']
            reference_score = scores[best_configuration_index_per_algorithm[algorithm]]
            diffs = reference_score - scores
            print(algorithm, method)
            print(diffs)
            frame['Differents'] = diffs

In [46]:
import os

import matplotlib.pyplot as plt

# savefig does not create missing directories.
os.makedirs('plots', exist_ok=True)

# One box plot of the 'Differents' column per (algorithm, method, frame),
# saved as plots/<algorithm>-<method>-<frame position>.svg.
for algorithm, results_by_method in data.items():
    # `frames` rather than `data_sets`, so the data set registry defined at
    # the top of the notebook is not overwritten.
    for method, frames in results_by_method.items():
        for i, frame in enumerate(frames):
            plt.figure()
            plt.boxplot(frame['Differents'])
            plt.xlabel(algorithm)
            plt.ylabel('Difference')
            plt.title(f'{algorithm} - {method}')
            plt.savefig(f'plots/{algorithm}-{method}-{i}.svg')
            plt.close()


In [71]:

import os

import matplotlib.pyplot as plt

# savefig does not create missing directories.
os.makedirs('scatters', exist_ok=True)

# A point is plotted when it sets a new best score or lies within this
# relative distance of the best score seen so far.
tolerance = 0.005

# One scatter plot per Bayes-search result frame: iteration number against
# score, restricted to the near-best iterations.
for algorithm, results_by_method in data.items():
    # Iterate the frames directly instead of rebinding `data_sets`, which
    # would overwrite the data set registry defined at the top of the notebook.
    for i, frame in enumerate(results_by_method['bayes_search']):
        current_best = 0
        x = []
        y = []

        for iteration, value in enumerate(frame['Srednia dokladnosc'], start=1):
            near_best = abs(value - current_best) < tolerance * current_best
            if near_best or value > current_best:
                x.append(iteration)
                y.append(value)

            # Update only after the comparison so the point is judged
            # against the best of the previous iterations.
            if value > current_best:
                current_best = value

        plt.figure()
        plt.scatter(x, y, s=10)
        plt.xlabel('Iteration')
        # The plotted column is the mean cross-validated score written by the
        # searches (default classifier scoring), not an AUC.
        plt.ylabel('Mean CV accuracy')
        plt.title(f'{algorithm} - Dataset {i+1}')
        plt.savefig(f'scatters/{algorithm}-{i}.svg')
        plt.close()

