# Eksperyment

In [10]:
# Importowanie bibliotek
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import sklearn.model_selection as skm
from time import time
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.datasets import fetch_openml

In [11]:
# Experiment settings gathered in one place so they are easy to tweak.

# Reproducibility / parallelism
RANDOM_STATE = 16  # seed passed to the split, the CV folds, the model and the searchers
N_JOBS = -1        # forwarded as n_jobs to every searcher

# Evaluation protocol
TEST_SIZE = 0.3    # hold-out fraction given to train_test_split
CV_SPLITS = 5      # number of cross-validation folds
SCORING = 'roc_auc'

# Search budget: n_iter for the Bayesian and random searches
N_ITER = 100

# OpenML dataset ids used in the experiment
DATASET_IDS = [
    37,    # diabetes
    1464,  # blood-transfusion-service-center
    1063,  # kc2
    1480,  # ilpd
    50,    # tic-tac-toe
]

### Kroswalidacja

In [12]:
# Shuffled, seeded K-fold splitter; passed as `cv` to every searcher.
# NOTE: plain KFold does not stratify by class.
kfold = skm.KFold(
    n_splits=CV_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE,
)

### Surrogate models, acquisition functions

In [13]:
# Surrogate models: Gaussian Process, Random Forest, Extra Trees
SURROGATE_MODELS = ['GP', 'RF', 'ET']
# Acquisition functions: Lower Confidence Bound, Expected Improvement, Probability of Improvement
ACQUISITION_FUNCTIONS = ['LCB', 'EI', 'PI']

# Every surrogate/acquisition pairing, iterated surrogate-first.
# Each entry carries a display name plus the two optimizer settings.
SURROGATE_ACQ = [
    {
        'name': f'{surrogate}_{acq}',
        'base_estimator': surrogate,
        'acq_func': acq,
    }
    for surrogate in SURROGATE_MODELS
    for acq in ACQUISITION_FUNCTIONS
]

### Modele i hiperparametry

In [14]:
# XGBoost search spaces, one per search strategy.
# The 'model__' prefix routes each parameter to the pipeline step named 'model'.

# skopt dimensions explored by BayesSearchCV
_XGB_BAYES_SPACE = {
    'model__learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    'model__max_depth': Integer(3, 9),
    'model__n_estimators': Integer(50, 300),
    'model__subsample': Real(0.6, 1.0),
    'model__reg_lambda': Real(0, 10)
}

# Candidate values sampled by RandomizedSearchCV.
# NOTE: covers only three of the five parameters tuned by the Bayesian search,
# and with different ranges.
_XGB_RANDOM_SPACE = {
    'model__learning_rate': np.logspace(-2, -0.1, 100),
    'model__max_depth': np.arange(3, 11),
    'model__n_estimators': np.arange(50, 201),
}

# Exhaustive grid for GridSearchCV (3 * 3 * 2 = 18 combinations)
_XGB_GRID = {
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 6, 9],
    'model__n_estimators': [100, 200],
}

# Registry: model name -> estimator plus its search spaces, keyed by
# 'bayes' / 'random' / 'grid'.
MODELS = {
    'XGBoost': {
        'estimator': XGBClassifier(
            random_state=RANDOM_STATE,
            enable_categorical=False,
            eval_metric='logloss'
        ),
        'params': {
            'bayes': _XGB_BAYES_SPACE,
            'random': _XGB_RANDOM_SPACE,
            'grid': _XGB_GRID,
        }
    },

    # LightGBM emits a lot of warnings that I don't really know how to silence,
    # and I'm not sure whether they matter.
    # NOTE: this draft defines only a 'bayes' space; 'random' and 'grid' entries
    # must be added before enabling it, because the experiment loop reads all three.
    # 'LightGBM': {
    #     'estimator': LGBMClassifier(
    #         random_state=RANDOM_STATE,
    #         force_col_wise=True
    #     ),
    #     'params': {
    #         'bayes': {
    #             # Core parameters controlling overall performance
    #             'model__learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    #             'model__num_leaves': Integer(31, 100),  # raise the lower bound to 31
    #             'model__n_estimators': Integer(50, 300),

    #             # Most important regularisation parameters
    #             'model__subsample': Real(0.6, 1.0),      # Stochastic Gradient Descent
    #             'model__feature_fraction': Real(0.5, 1.0),  # random feature selection
    #             'model__reg_lambda': Real(0, 10)          # L2
    #         }
    #     }
    # }
}

In [15]:
# Collects one dict per fitted searcher; the experiment loop appends to it and
# it is converted to a DataFrame afterwards.
# Re-run this cell before re-running the loop, otherwise rows from the previous
# run stay in the list and get duplicated.
results = []

In [16]:
# Main experiment: for every dataset and every model, run each hyperparameter
# search strategy (9 Bayesian variants, random search, grid search), time the
# fit, score the best estimator on the hold-out split and append one record
# to `results`.
for dataset_id in DATASET_IDS:
    ######################## Data ########################
    # More preprocessing would be useful, but it is postponed for now because
    # it is not certain that these particular datasets will stay in the study.
    # as_frame=True makes the pandas return type explicit: the dtype-based
    # column split below needs a DataFrame.
    df = fetch_openml(data_id=dataset_id, as_frame=True)  # a Bunch, not a DataFrame
    y = df.target
    X = df.data

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # Encode class labels as integers; the encoder is fitted on training labels only.
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    # Split columns by dtype: object/category -> categorical, everything else -> numerical.
    is_categorical = (X_train.dtypes == 'object') | (X_train.dtypes == 'category')
    numerical_features = list(X_train.dtypes[~is_categorical].index)
    categorical_features = list(X_train.dtypes[is_categorical].index)

    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ])

    numerical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    preprocessor = ColumnTransformer([
        ('numerical', numerical_transformer, numerical_features),
        ('categorical', categorical_transformer, categorical_features)
    ])

    ######################## Models ########################
    for model_name, model_config in MODELS.items():
        # The step name 'model' must match the 'model__' prefix of the search-space keys.
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model_config['estimator'])
        ])

        # (label, searcher) pairs for this model, rebuilt on every iteration.
        searchers = []

        # One Bayesian search per surrogate-model / acquisition-function pairing.
        for sur_acq in SURROGATE_ACQ:
            searchers.append((
                f"Bayes_{sur_acq['name']}",
                BayesSearchCV(
                    estimator=pipeline,
                    search_spaces=model_config['params']['bayes'],
                    n_iter=N_ITER,
                    cv=kfold,
                    scoring=SCORING,
                    n_jobs=N_JOBS,
                    random_state=RANDOM_STATE,
                    optimizer_kwargs={
                        'base_estimator': sur_acq['base_estimator'],
                        'acq_func': sur_acq['acq_func']
                    }
                )
            ))

        searchers.append((
            'RandomSearch',
            RandomizedSearchCV(
                estimator=pipeline,
                param_distributions=model_config['params']['random'],
                n_iter=N_ITER,
                cv=kfold,
                scoring=SCORING,
                n_jobs=N_JOBS,
                random_state=RANDOM_STATE
            )
        ))

        searchers.append((
            'GridSearch',
            GridSearchCV(
                estimator=pipeline,
                param_grid=model_config['params']['grid'],
                cv=kfold,
                scoring=SCORING,
                n_jobs=N_JOBS
            )
        ))

        # Fit and evaluate inside the model loop: `searchers` is rebuilt per
        # model, so every model's searchers must run here and be recorded
        # under their own `model_name`.
        for searcher_name, searcher in searchers:
            start_time = time()
            searcher.fit(X_train, y_train)
            elapsed_time = time() - start_time

            # Hold-out AUC of the refitted best pipeline; column 1 of
            # predict_proba is the probability of the class encoded as 1.
            best_searcher = searcher.best_estimator_
            auc_test = roc_auc_score(y_test, best_searcher.predict_proba(X_test)[:, 1])

            results.append({
                'dataset': dataset_id,
                'model': model_name,
                'method': searcher_name,
                'best_score': round(searcher.best_score_, 4),   # mean CV score of the best configuration
                'score_test': round(auc_test, 4),
                'time': round(elapsed_time, 4),                 # wall-clock seconds for fit()
                'best_params': str(searcher.best_params_)
            })

In [17]:
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,dataset,model,method,best_score,score_test,time,best_params
0,37,XGBoost,Bayes_GP_LCB,0.8016,0.8639,90.9568,"OrderedDict([('model__learning_rate', 0.012879..."
1,37,XGBoost,Bayes_GP_EI,0.8026,0.8633,89.3627,"OrderedDict([('model__learning_rate', 0.092919..."
2,37,XGBoost,Bayes_GP_PI,0.8008,0.8631,106.671,"OrderedDict([('model__learning_rate', 0.021094..."
3,37,XGBoost,Bayes_RF_LCB,0.8016,0.8681,44.5606,"OrderedDict([('model__learning_rate', 0.049005..."
4,37,XGBoost,Bayes_RF_EI,0.8011,0.863,47.5409,"OrderedDict([('model__learning_rate', 0.017090..."
5,37,XGBoost,Bayes_RF_PI,0.8033,0.8657,46.7319,"OrderedDict([('model__learning_rate', 0.041188..."
6,37,XGBoost,Bayes_ET_LCB,0.8016,0.8649,44.8176,"OrderedDict([('model__learning_rate', 0.013649..."
7,37,XGBoost,Bayes_ET_EI,0.8005,0.8696,45.2184,"OrderedDict([('model__learning_rate', 0.103952..."
8,37,XGBoost,Bayes_ET_PI,0.8018,0.8684,43.7881,"OrderedDict([('model__learning_rate', 0.024522..."
9,37,XGBoost,RandomSearch,0.799,0.8641,3.8981,"{'model__n_estimators': 174, 'model__max_depth..."
