# Eksperyment

In [25]:
# Importowanie bibliotek
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import sklearn.model_selection as skm
from time import time
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
# Dla łatwiejszej modyfikacji parametrów
RANDOM_STATE = 16
N_JOBS = -1
CV_SPLITS = 5
SCORING = 'roc_auc'
DATASET_IDS = [1, 2, 3]
N_ITER = 100
TEST_SIZE = 0.3

### TUTAJ PREPROCESSING - wstępna wersja

In [3]:
# diabetes, id = 37
from sklearn.datasets import fetch_openml
df = fetch_openml(data_id = 37)
y = df.target
X = df.data

In [4]:
X.isnull().sum()

preg    0
plas    0
pres    0
skin    0
insu    0
mass    0
pedi    0
age     0
dtype: int64

In [5]:
y.isnull().sum()

0

In [6]:
# Jak wygląda zbalansowanie danych:
print('sum(y = tested_positive) / sum(y = tested_negative):', round(sum(y == 'tested_positive') / (len(y) - sum(y == 'tested_positive')), 4))

sum(y = tested_positive) / sum(y = tested_negative): 0.536


In [7]:
y = pd.get_dummies(y).iloc[:, 0]

In [8]:
y

0      False
1       True
2      False
3       True
4      False
       ...  
763     True
764     True
765     True
766    False
767     True
Name: tested_negative, Length: 768, dtype: bool

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

In [10]:
numerical_features = list(X.dtypes[(X.dtypes != 'object') & (X.dtypes != 'category')].index)
categorical_features = list(X.dtypes[(X.dtypes == 'object') | (X.dtypes == 'category')].index)

print('Numerical features:', numerical_features)
print('Categorical features:', categorical_features)

Numerical features: ['preg', 'plas', 'pres', 'skin', 'insu', 'mass', 'pedi', 'age']
Categorical features: []


In [11]:
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')), 
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  
    ('scaler', StandardScaler())  
])

In [12]:
preprocessor = ColumnTransformer([
    ('numerical', numerical_transformer, numerical_features),
    ('categorical', categorical_transformer, categorical_features)
])

#### Kroswalidacja

In [13]:
kfold = skm.KFold(CV_SPLITS, random_state = RANDOM_STATE, shuffle=True)

### Surrogate models, acquisition functions

In [14]:
SURROGATE_MODELS = ['GP', 'RF', 'ET']  # Gaussian Process, Random Forest, Extra Trees
ACQUISITION_FUNCTIONS = ['LCB', 'EI', 'PI']  # Lower Confidence Bound, Expected Improvement, Probability of Improvement

# Wszystkie kombinacje
SURROGATE_ACQ = []
for surrogate in SURROGATE_MODELS:
    for acq in ACQUISITION_FUNCTIONS:
        SURROGATE_ACQ.append({
            'name': f'{surrogate}_{acq}',
            'base_estimator': surrogate,
            'acq_func': acq
        })

Modele i hiperparametry

In [30]:
MODELS = {
    'XGBoost': {
        'estimator': XGBClassifier(
            random_state=RANDOM_STATE,
            enable_categorical=False,
            eval_metric='logloss'
        ),
        'params': {
            'bayes': {
                # Podstawowe parametry kontrolujące ogólną wydajność
                'model__learning_rate': Real(0.01, 0.3, prior='log-uniform'),
                'model__max_depth': Integer(3, 9),
                'model__n_estimators': Integer(50, 300),
                
                # Najważniejsze parametry regularyzacji
                'model__subsample': Real(0.6, 1.0),  # Zapobiega przetrenowaniu
                'model__reg_lambda': Real(0, 10)      # L2 na wagach
            },
            'random': {
                'model__learning_rate': np.logspace(-2, -0.1, 100),
                'model__max_depth': np.arange(3, 11),
                'model__n_estimators': np.arange(50, 201),
            },
            'grid': {
                'model__learning_rate': [0.01, 0.1, 0.2],
                'model__max_depth': [3, 6, 9],
                'model__n_estimators': [100, 200],
            }
        }
    },
    
    # z LightGBM jest bardzo dużo warningów, których niezbyt wiem jak się pozbyć, ale też nie wiem, czy są istotne
    # 'LightGBM': {
    #     'estimator': LGBMClassifier(
    #         random_state=RANDOM_STATE,
    #         force_col_wise=True
    #     ),
    #     'params': {
    #         'bayes': {
    #             # Podstawowe parametry kontrolujące ogólną wydajność
    #             'model__learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    #             'model__num_leaves': Integer(31, 100),  # Zwiększ dolny limit do 31
    #             'model__n_estimators': Integer(50, 300),
                
    #             # Najważniejsze parametry regularyzacji
    #             'model__subsample': Real(0.6, 1.0),      # Stochastic Gradient Descent
    #             'model__feature_fraction': Real(0.5, 1.0),  # Losowy wybór cech
    #             'model__reg_lambda': Real(0, 10)          # L2
    #         }
    #     }
    # }
}

Etap 1: Testowanie wszystkich kombinacji Surrogate + Acquisition Function


In [18]:
surrogate_results = []

In [19]:
for model_name, model_config in MODELS.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model_config['estimator'])
    ])
    
    for sur_acq in SURROGATE_ACQ:
        searcher = BayesSearchCV(
            estimator=pipeline,
            search_spaces=model_config['params']['bayes'],
            n_iter=N_ITER,
            cv=kfold,
            scoring=SCORING,
            n_jobs=N_JOBS,
            random_state=RANDOM_STATE,
            optimizer_kwargs={
            'base_estimator': sur_acq['base_estimator'],
            'acq_func': sur_acq['acq_func']
        }
        )
        
        start_time = time()
        searcher.fit(X_train, y_train)
        elapsed_time = time() - start_time

        best_searcher = searcher.best_estimator_
        auc_test = roc_auc_score(y_test, best_searcher.predict_proba(X_test)[:, 1])
        
        surrogate_results.append({
            #'dataset': dataset_id,
            'model': model_name,
            'method': f"Bayes_{sur_acq['base_estimator']}_{sur_acq['acq_func']}",
            'best_score': round(searcher.best_score_, 4),
            'score_test': round(auc_test, 4),
            'time': round(elapsed_time, 4),
            'best_params': str(searcher.best_params_)
        })
        

In [20]:
surrogate_results_df = pd.DataFrame(surrogate_results)
surrogate_results_df

Unnamed: 0,model,method,best_score,score_test,time,best_params
0,XGBoost,Bayes_GP_LCB,0.8017,0.8633,89.198,"OrderedDict([('model__learning_rate', 0.012920..."
1,XGBoost,Bayes_GP_EI,0.8027,0.8671,91.7039,"OrderedDict([('model__learning_rate', 0.022964..."
2,XGBoost,Bayes_GP_PI,0.7988,0.8669,104.475,"OrderedDict([('model__learning_rate', 0.018960..."
3,XGBoost,Bayes_RF_LCB,0.8024,0.8661,42.1657,"OrderedDict([('model__learning_rate', 0.025069..."
4,XGBoost,Bayes_RF_EI,0.8005,0.8693,46.6366,"OrderedDict([('model__learning_rate', 0.017681..."
5,XGBoost,Bayes_RF_PI,0.7996,0.8648,45.1479,"OrderedDict([('model__learning_rate', 0.079829..."
6,XGBoost,Bayes_ET_LCB,0.8016,0.8649,42.3899,"OrderedDict([('model__learning_rate', 0.013649..."
7,XGBoost,Bayes_ET_EI,0.8033,0.8668,44.027,"OrderedDict([('model__learning_rate', 0.070285..."
8,XGBoost,Bayes_ET_PI,0.8018,0.8684,43.7163,"OrderedDict([('model__learning_rate', 0.024522..."


Najlepsze: ET, PI

In [23]:
SURROGATE_ACQ

[{'name': 'GP_LCB', 'base_estimator': 'GP', 'acq_func': 'LCB'},
 {'name': 'GP_EI', 'base_estimator': 'GP', 'acq_func': 'EI'},
 {'name': 'GP_PI', 'base_estimator': 'GP', 'acq_func': 'PI'},
 {'name': 'RF_LCB', 'base_estimator': 'RF', 'acq_func': 'LCB'},
 {'name': 'RF_EI', 'base_estimator': 'RF', 'acq_func': 'EI'},
 {'name': 'RF_PI', 'base_estimator': 'RF', 'acq_func': 'PI'},
 {'name': 'ET_LCB', 'base_estimator': 'ET', 'acq_func': 'LCB'},
 {'name': 'ET_EI', 'base_estimator': 'ET', 'acq_func': 'EI'},
 {'name': 'ET_PI', 'base_estimator': 'ET', 'acq_func': 'PI'}]

In [24]:
SURROGATE_ACQ_CHOSEN = [{'name': 'ET_PI', 'base_estimator': 'ET', 'acq_func': 'PI'}]

In [35]:
results = []

In [36]:
for model_name, model_config in MODELS.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model_config['estimator'])
    ])
    
    # Definicja metod optymalizacji
    searchers = [
        ('Bayes_ET_PI', BayesSearchCV(
            estimator=pipeline,
            search_spaces=model_config['params']['bayes'],
            n_iter=N_ITER,
            cv=kfold,
            scoring=SCORING,
            n_jobs=N_JOBS,
            random_state=RANDOM_STATE,
            optimizer_kwargs={
            'base_estimator': SURROGATE_ACQ_CHOSEN[0]['base_estimator'],
            'acq_func': SURROGATE_ACQ_CHOSEN[0]['acq_func']
            }
        )),
        ('RandomSearch', RandomizedSearchCV(
            estimator=pipeline,
            param_distributions=model_config['params']['random'],
            n_iter=N_ITER,
            cv=kfold,
            scoring=SCORING,
            n_jobs=N_JOBS,
            random_state=RANDOM_STATE
        )),
        ('GridSearch', GridSearchCV(
            estimator=pipeline,
            param_grid=model_config['params']['grid'],
            cv=kfold,
            scoring=SCORING,
            n_jobs=N_JOBS
        ))
    ]

for searcher_name, searcher in searchers:
    start_time = time()
    searcher.fit(X_train, y_train)
    elapsed_time = time() - start_time

    best_searcher = searcher.best_estimator_
    auc_test = roc_auc_score(y_test, best_searcher.predict_proba(X_test)[:, 1])
    
    results.append({
        #'dataset': dataset_id,
        'model': model_name,
        'method': searcher_name,
        'best_score': round(searcher.best_score_, 4),
        'score_test': round(auc_test, 4),
        'time': round(elapsed_time, 4),
        'best_params': str(searcher.best_params_)
    })




In [37]:
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,model,method,best_score,score_test,time,best_params
0,XGBoost,Bayes_ET_PI,0.7986,0.8714,47.8538,"OrderedDict([('model__learning_rate', 0.047213..."
1,XGBoost,RandomSearch,0.799,0.8641,4.0509,"{'model__n_estimators': 174, 'model__max_depth..."
2,XGBoost,GridSearch,0.7951,0.8606,0.9027,"{'model__learning_rate': 0.01, 'model__max_dep..."
