# Eksperyment

In [34]:
# Importowanie bibliotek
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import sklearn.model_selection as skm
from time import time
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.datasets import fetch_openml
import os

In [35]:
# Experiment configuration, collected in one place so parameters are easy to change.
RANDOM_STATE = 16  # seed shared by the CV splitter, the train/test split, the estimators and the searchers
N_JOBS = -1  # passed as n_jobs to every search object
CV_SPLITS = 5  # number of folds of the KFold splitter
SCORING = 'roc_auc'  # metric optimised by every searcher
N_ITER = 100  # candidates evaluated by BayesSearchCV and RandomizedSearchCV
TEST_SIZE = 0.3  # test_size passed to train_test_split
DATASET_IDS = [37]  # OpenML data ids passed to fetch_openml
# Full benchmark list (currently disabled):
# DATASET_IDS = [37, 1464, 1063, 1480, 50, 29, 23381, 6332, 40994, 1510, 15, 1494, 1068, 1462, 1049, 1050, 1067, 1487, 1485, 3]
# 37 - diabetes, 1464 - blood-transfusion-service-center, 1063 - kc2, 1480 - ilpd, 50 - tic-tac-toe
# 29 - credit-approval, 23381 - dresses-sales, 6332 - cylinder-bands, 40994 - climate-model-simulation-crashes, 1510 - wdbc
# 15 - breast-w, 1494 - qsar-biodeg, 1068 - pc1, 1462 - banknote-authentication, 1049 - pc4
# 1050 - pc3, 1067 - kc1, 1487 - ozone-level-8hr, 1485 - madelon,  3 - kr-vs-kp

#### Kroswalidacja

In [36]:
# Shuffled K-fold splitter reused by every hyperparameter search.
kfold = skm.KFold(n_splits=CV_SPLITS, shuffle=True, random_state=RANDOM_STATE)

### Surrogate models, acquisition functions

In [37]:
# Surrogate models: Gaussian Process, Random Forest, Extra Trees.
SURROGATE_MODELS = ['GP', 'RF', 'ET']
# Acquisition functions: Lower Confidence Bound, Expected Improvement,
# Probability of Improvement.
ACQUISITION_FUNCTIONS = ['LCB', 'EI', 'PI']

# Every (surrogate, acquisition) pairing, surrogate-major, as one config dict each.
SURROGATE_ACQ = [
    {
        'name': f'{surrogate_name}_{acq_name}',
        'base_estimator': surrogate_name,
        'acq_func': acq_name,
    }
    for surrogate_name in SURROGATE_MODELS
    for acq_name in ACQUISITION_FUNCTIONS
]

### Modele i hiperparametry

In [None]:
# Registry of the tuned models. For each model name:
#   'estimator' - the unfitted classifier placed in the pipeline's 'model' step,
#   'params'    - one search space per tuning method:
#       'bayes'  -> skopt dimensions for BayesSearchCV,
#       'random' -> candidate arrays for RandomizedSearchCV,
#       'grid'   -> explicit value lists for GridSearchCV.
# The 'bayes' and 'random' spaces of a model cover the same ranges so that the
# tuners are compared on the same domain; the grids are coarse subsets of them.
MODELS = {
    'XGBoost': {
        'estimator': XGBClassifier(
            random_state=RANDOM_STATE,
            enable_categorical=False,
            eval_metric='logloss'
        ),
        'params': {
            'bayes': {
                'model__learning_rate': Real(0.01, 0.3, prior='log-uniform'),
                'model__max_depth': Integer(3, 9),
                'model__n_estimators': Integer(50, 300),
                'model__subsample': Real(0.6, 1.0),
                'model__reg_lambda': Real(0, 10)
            },
            'random': {
                # Log-spaced in [0.01, 0.3], matching the Bayes range
                # (the upper exponent used to be -0.1, i.e. ~0.79).
                'model__learning_rate': np.logspace(-2, np.log10(0.3), 100),
                # 3..9 inclusive, matching Integer(3, 9) (was 3..10).
                'model__max_depth': np.arange(3, 10),
                # 50..300 inclusive, matching Integer(50, 300) (was 50..200).
                'model__n_estimators': np.arange(50, 301),
                'model__subsample': np.linspace(0.6, 1.0, 10),
                'model__reg_lambda': np.linspace(0, 10, 10)
            },
            'grid': {
                'model__learning_rate': [0.01, 0.1, 0.2],
                'model__max_depth': [3, 6, 9],
                'model__n_estimators': [100, 200],
                'model__subsample': [0.6, 0.8, 1.0],
                'model__reg_lambda': [0, 5, 10]
            }
        }
    },

    'LightGBM': {
        'estimator': LGBMClassifier(
            random_state=RANDOM_STATE,
            force_col_wise=True,
            min_gain_to_split=0.01,
            # LightGBM only applies `subsample` (bagging_fraction) when the
            # bagging frequency is non-zero; with the default of 0 the tuned
            # `model__subsample` values would have no effect.
            subsample_freq=1
        ),
        'params': {
            'bayes': {
                'model__learning_rate': Real(0.01, 0.3, prior='log-uniform'),
                'model__num_leaves': Integer(31, 100),
                'model__n_estimators': Integer(50, 300),
                'model__subsample': Real(0.6, 1.0),
                'model__reg_lambda': Real(0, 10)
            },
            'random': {
                # Log-spaced in [0.01, 0.3], matching the Bayes range
                # (the upper exponent used to be -0.1, i.e. ~0.79).
                'model__learning_rate': np.logspace(-2, np.log10(0.3), 100),
                'model__num_leaves': np.arange(31, 101),
                'model__n_estimators': np.arange(50, 301),
                'model__subsample': np.linspace(0.6, 1.0, 10),
                'model__reg_lambda': np.linspace(0, 10, 10)
            },
            'grid': {
                'model__learning_rate': [0.01, 0.1, 0.3],
                'model__num_leaves': [31, 50, 100],
                'model__n_estimators': [100, 200, 300],
                'model__subsample': [0.6, 0.8, 1.0],
                'model__reg_lambda': [0, 5, 10]
            }
        }
    },

    'Random Forest': {
        'estimator': RandomForestClassifier(
            random_state=RANDOM_STATE,
            bootstrap=True
        ),
        'params': {
            'bayes': {
                'model__n_estimators': Integer(50, 300),
                'model__max_depth': Integer(3, 20),
                'model__min_samples_split': Integer(2, 20),
                'model__min_samples_leaf': Integer(1, 20)
            },
            'random': {
                'model__n_estimators': np.arange(50, 301),
                'model__max_depth': np.arange(3, 21),
                'model__min_samples_split': np.arange(2, 21),
                'model__min_samples_leaf': np.arange(1, 21)
            },
            'grid': {
                'model__n_estimators': [100, 200, 300],
                # None lets the trees grow without a depth limit.
                'model__max_depth': [3, 6, 9, None],
                'model__min_samples_split': [2, 5, 10],
                'model__min_samples_leaf': [1, 2, 4]
            }
        }
    }
}

In [45]:
# Union of every hyperparameter name that appears in any model's search
# space, across the three tuning methods.
all_params_set = set()
for model_config in MODELS.values():
    spaces_by_method = model_config.get('params', {})
    for method in ('bayes', 'random', 'grid'):
        # Updating from a dict adds its keys, i.e. the parameter names.
        all_params_set.update(spaces_by_method.get(method, {}))

# Columns copied from each searcher's cv_results_ table.
CV_RESULTS_COLUMNS = [
    'mean_test_score', 'std_test_score',
    'mean_train_score', 'std_train_score',
    'rank_test_score', 'mean_fit_time', 'std_fit_time'
]

# Columns identifying a run. 'time' (wall-clock duration of the whole search)
# is listed here because the per-iteration table is reindexed to ALL_COLUMNS;
# without this entry the measured time would be silently dropped on export.
META_COLUMNS = ['dataset_id', 'model_name', 'searcher_name', 'n_iter', 'time']

# Final column order of the per-iteration results table: run metadata,
# CV metrics, then every hyperparameter name in alphabetical order.
ALL_COLUMNS = META_COLUMNS + CV_RESULTS_COLUMNS + sorted(all_params_set)

In [46]:
# Accumulator of per-search DataFrames (one row per evaluated candidate);
# filled by the experiment loop and concatenated before the CSV export.
all_iterations_data = []

In [48]:
# Accumulator of summary dicts, one per (dataset, model, search method) run;
# filled by the experiment loop and turned into results_df afterwards.
results = []

In [53]:
def _build_preprocessor(X_train):
    """Build the unfitted ColumnTransformer for one dataset's feature frame.

    Columns whose dtype is ``object`` or ``category`` are imputed with the most
    frequent value and one-hot encoded; all remaining columns are mean-imputed
    and standardised.
    """
    # Compute the dtype mask once and derive both column lists from it.
    is_categorical = (X_train.dtypes == 'object') | (X_train.dtypes == 'category')
    categorical_features = list(is_categorical[is_categorical].index)
    numerical_features = list(is_categorical[~is_categorical].index)

    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ])

    numerical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    return ColumnTransformer([
        ('numerical', numerical_transformer, numerical_features),
        ('categorical', categorical_transformer, categorical_features)
    ])


def _build_searchers(pipeline, param_spaces):
    """Return ``(name, searcher)`` pairs for every tuning strategy.

    ``param_spaces`` is a model's ``'params'`` dict with the keys ``'bayes'``,
    ``'random'`` and ``'grid'``. The result holds one BayesSearchCV per entry
    of SURROGATE_ACQ, followed by one RandomizedSearchCV and one GridSearchCV,
    all unfitted.
    """
    # Settings shared by every searcher.
    shared_kwargs = {
        'cv': kfold,
        'scoring': SCORING,
        'n_jobs': N_JOBS,
        'return_train_score': True,
    }

    searchers = [
        (
            f"Bayes_{sur_acq['name']}",
            BayesSearchCV(
                estimator=pipeline,
                search_spaces=param_spaces['bayes'],
                n_iter=N_ITER,
                random_state=RANDOM_STATE,
                optimizer_kwargs={
                    'base_estimator': sur_acq['base_estimator'],
                    'acq_func': sur_acq['acq_func']
                },
                **shared_kwargs
            )
        )
        for sur_acq in SURROGATE_ACQ
    ]

    searchers.append((
        'RandomSearch',
        RandomizedSearchCV(
            estimator=pipeline,
            param_distributions=param_spaces['random'],
            n_iter=N_ITER,
            random_state=RANDOM_STATE,
            **shared_kwargs
        )
    ))

    searchers.append((
        'GridSearch',
        GridSearchCV(
            estimator=pipeline,
            param_grid=param_spaces['grid'],
            **shared_kwargs
        )
    ))

    return searchers


def _collect_iterations(searcher, dataset_id, model_name, searcher_name, elapsed_time):
    """Turn a fitted searcher's ``cv_results_`` into a table with ALL_COLUMNS.

    Each row is one evaluated candidate. ``n_iter`` is the 1-based position of
    the candidate in ``cv_results_``; hyperparameters a model does not use are
    left as NaN.
    """
    cv_table = pd.DataFrame(searcher.cv_results_)
    cv_table['n_iter'] = np.arange(1, len(cv_table) + 1)

    # Expand the per-candidate params dicts into one column per hyperparameter,
    # aligned to the union of parameter names across all models.
    params_df = pd.json_normalize(cv_table['params'])
    params_df = params_df.reindex(columns=sorted(all_params_set), fill_value=np.nan)

    iterations = pd.concat([
        cv_table[CV_RESULTS_COLUMNS + ['n_iter']],
        params_df
    ], axis=1)

    iterations['dataset_id'] = dataset_id
    iterations['model_name'] = model_name
    iterations['searcher_name'] = searcher_name
    # NOTE: the reindex below keeps only the columns listed in ALL_COLUMNS,
    # so 'time' survives only if ALL_COLUMNS contains it.
    iterations['time'] = elapsed_time

    return iterations.reindex(columns=ALL_COLUMNS, fill_value=np.nan)


# Runs already recorded in `results` (e.g. by an earlier, interrupted execution
# of this cell) are skipped, so re-running the cell resumes the experiment
# instead of repeating finished searches and duplicating their rows.
completed_runs = {(row['dataset'], row['model'], row['method']) for row in results}

for dataset_id in DATASET_IDS:
    ######################## Data ########################
    openml_data = fetch_openml(data_id=dataset_id)
    y = openml_data.target
    X = openml_data.data

    # NOTE(review): the split is not stratified; consider stratify=y for
    # strongly imbalanced datasets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # Encode class labels as integers; the encoder is fitted on the training
    # labels only.
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    preprocessor = _build_preprocessor(X_train)

    ######################## Models ########################
    for model_name, model_config in MODELS.items():
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model_config['estimator'])
        ])

        searchers = _build_searchers(pipeline, model_config['params'])

        ######################## Recording results ########################
        for searcher_name, searcher in searchers:
            run_key = (dataset_id, model_name, searcher_name)
            if run_key in completed_runs:
                continue

            start_time = time()
            searcher.fit(X_train, y_train)
            elapsed_time = time() - start_time

            df_iterations = _collect_iterations(
                searcher, dataset_id, model_name, searcher_name, elapsed_time
            )

            # Evaluate the refitted best pipeline on the held-out test set.
            # Column 1 of predict_proba is used as the positive-class score,
            # which assumes a binary target.
            best_searcher = searcher.best_estimator_
            auc_test = roc_auc_score(y_test, best_searcher.predict_proba(X_test)[:, 1])

            # Both accumulators are updated only after the run has fully
            # finished, so an interrupt cannot leave them out of sync.
            all_iterations_data.append(df_iterations)
            results.append({
                'dataset': dataset_id,
                'model': model_name,
                'method': searcher_name,
                'best_score': round(searcher.best_score_, 4),
                'score_test': round(auc_test, 4),
                'time': round(elapsed_time, 4),
                'best_params': str(searcher.best_params_)
            })
            completed_runs.add(run_key)

KeyboardInterrupt: 

In [None]:
# Summary table: one row per (dataset, model, search method) run.
results_df = pd.DataFrame(results)
results_df

In [None]:
# Write every recorded search iteration to a single CSV file.
# Nothing is written when no search has completed yet.
if all_iterations_data:
    all_iterations_df = pd.concat(all_iterations_data, ignore_index=True)
    # Calling the DataFrame itself raises TypeError; to_csv does the export.
    all_iterations_df.to_csv('all_iterations_results.csv', index=False)

In [None]:
# Read the exported CSV back to check that the results were saved correctly.
all_iterations_df = pd.read_csv("all_iterations_results.csv")

all_iterations_df