# Wczytywanie paczek i danych
Dane zostały przygotowane w ten sposób, aby macierz eksperymentów była macierzą rzadką. 
Wynika to z faktu, że po zastosowaniu OneHotEncodera powstało wiele kolumn z dużą liczbą zer.

In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from scipy.sparse import load_npz

In [2]:
# Seed NumPy's legacy global random generator before any other work is done.
np.random.seed(321)

# Feature matrix (stored as .npz) and label vector (stored as .npy).
X, y = load_npz('data/x.npz'), np.load('data/y.npy')
(type(X), type(y))

(scipy.sparse.csr.csr_matrix, numpy.ndarray)

Poprzednio y został zakodowany w ten sposób, że wartości 0 i 1 oznaczały ponowne przyjęcie pacjenta (przed upływem 30 dni lub po 30 dniach).
Natomiast 2 oznaczała brak ponownego przyjęcia.

In [3]:
# Derive the binary target from the labels stored on disk, not from the current
# value of `y`. Rewriting `y` in terms of itself is not idempotent: a second run
# of this cell would see only 0/1 values and map every sample to 1.
y_raw = np.load('data/y.npy')

# Raw classes 0 and 1 become 1; every other raw class becomes 0.
y = np.where(np.isin(y_raw, (0, 1)), 1, 0)
np.unique(y)

array([0, 1])

W nowym kodowaniu 0 oznacza brak ponownego przyjęcia, natomiast 1 oznacza ponowne przyjęcie.

# Default

In [4]:
from sklearn.model_selection import cross_validate

# Evaluation setup shared by the experiments further down the notebook.
metrics = ['roc_auc', 'f1', 'accuracy']
cv = 5

# Baseline: XGBoost with no hyperparameters tuned.
xgb_default = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
score_default = cross_validate(
    estimator=xgb_default,
    X=X,
    y=y,
    scoring=metrics,
    cv=cv,
)

# One row per fold: fit/score times plus one column per requested metric.
res = pd.DataFrame(score_default)
res

Unnamed: 0,fit_time,score_time,test_roc_auc,test_f1,test_accuracy
0,3.781112,0.099984,0.68545,0.500264,0.627868
1,3.536879,0.083776,0.688002,0.544257,0.632683
2,4.702118,0.077793,0.676557,0.567136,0.620646
3,4.432848,0.097249,0.702413,0.6287,0.644408
4,4.238864,0.071316,0.702484,0.625487,0.645489


## Wyniki

In [5]:
# Summary statistics (count, mean, std, quartiles) of the per-fold results held in `res`.
res.describe()

Unnamed: 0,fit_time,score_time,test_roc_auc,test_f1,test_accuracy
count,5.0,5.0,5.0,5.0,5.0
mean,4.138364,0.086024,0.690981,0.573169,0.634219
std,0.475409,0.012349,0.011297,0.054791,0.010698
min,3.536879,0.071316,0.676557,0.500264,0.620646
25%,3.781112,0.077793,0.68545,0.544257,0.627868
50%,4.238864,0.083776,0.688002,0.567136,0.632683
75%,4.432848,0.097249,0.702413,0.625487,0.644408
max,4.702118,0.099984,0.702484,0.6287,0.645489


# Ręczna zmiana

In [6]:
# Hand-picked values for eta, gamma and max_depth; everything else stays as in
# the baseline model.
xgb = XGBClassifier(eta=0.2, gamma=1.5, max_depth=8, use_label_encoder=False, eval_metric='logloss')

# Kept under its own name so the baseline scores in `score_default` are not
# overwritten by the results of a different model.
score_manual = cross_validate(xgb, X, y, cv=cv, scoring=metrics)

# One row per fold, same layout as the baseline table.
res = pd.DataFrame(score_manual)
res

Unnamed: 0,fit_time,score_time,test_roc_auc,test_f1,test_accuracy
0,5.078144,0.114693,0.686062,0.502916,0.627328
1,4.827207,0.10472,0.687109,0.542394,0.632732
2,4.695625,0.104228,0.680447,0.573923,0.625903
3,5.287105,0.120677,0.704349,0.631181,0.6449
4,5.854117,0.175038,0.702415,0.626907,0.645588


## Wyniki

In [7]:
# Summary statistics (count, mean, std, quartiles) of the per-fold results held in `res`.
res.describe()

Unnamed: 0,fit_time,score_time,test_roc_auc,test_f1,test_accuracy
count,5.0,5.0,5.0,5.0,5.0
mean,5.14844,0.123871,0.692077,0.575464,0.63529
std,0.455626,0.029433,0.010649,0.055023,0.00944
min,4.695625,0.104228,0.680447,0.502916,0.625903
25%,4.827207,0.10472,0.686062,0.542394,0.627328
50%,5.078144,0.114693,0.687109,0.573923,0.632732
75%,5.287105,0.120677,0.702415,0.626907,0.6449
max,5.854117,0.175038,0.704349,0.631181,0.645588


# Grid search

In [8]:
from sklearn.model_selection import GridSearchCV

# Two values for each of five hyperparameters -> 32 candidates, each evaluated
# with `cv` folds. The single-element lists pin settings that are not tuned.
parameters = {
    'eta': [0.1, 0.4],
    'gamma': [1, 2],
    'max_depth': [6, 10],
    'min_child_weight': [0.5, 1],
    'subsample': [0.7, 1],
    'use_label_encoder': [False],
    'eval_metric': ['logloss']
}

# `scoring` must be passed by keyword: everything after `param_grid` is
# keyword-only in scikit-learn >= 1.0, so a positional `metrics` raises
# TypeError there. With several metrics, `refit` names the one used to pick
# the best candidate.
gridsearch = GridSearchCV(
    XGBClassifier(),
    parameters,
    scoring=metrics,
    refit='roc_auc',
    cv=cv,
)
gridsearch.fit(X, y)



GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     enable_categorical=False, gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,...
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, v

## Najlepsze parametry  i wyniki

In [9]:
# Parameter combination selected by the grid search (the search refits on 'roc_auc').
gridsearch.best_params_

{'eta': 0.1,
 'eval_metric': 'logloss',
 'gamma': 2,
 'max_depth': 6,
 'min_child_weight': 1,
 'subsample': 0.7,
 'use_label_encoder': False}

In [10]:
# Full cross-validation log of the grid search, one row per candidate.
res = pd.DataFrame(gridsearch.cv_results_)

# Rank candidates by mean ROC AUC and keep only the three mean test scores.
(
    res
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .loc[:, ['mean_test_roc_auc', 'mean_test_accuracy', 'mean_test_f1']]
)

Unnamed: 0,mean_test_roc_auc,mean_test_accuracy,mean_test_f1
10,0.693639,0.636833,0.567763
2,0.693585,0.637177,0.567641
0,0.693444,0.636489,0.566524
3,0.693366,0.635683,0.566078
8,0.693269,0.636332,0.56611
11,0.693102,0.635555,0.565522
12,0.693022,0.636794,0.573996
1,0.692943,0.635899,0.566177
14,0.692841,0.6353,0.572667
9,0.692786,0.635349,0.565113


In [11]:
# Mean cross-validated score of the best candidate for the refit metric ('roc_auc').
gridsearch.best_score_

0.6936390084278511

# Randomized search

In [12]:
import scipy.stats

# Sampling distributions for the randomized search. The single-element lists
# pin settings that are not tuned.
parameters_dict = dict(
    # Uniform on [0, 1].
    eta=scipy.stats.uniform(loc=0, scale=1),
    # Exponential with mean 2.
    gamma=scipy.stats.expon(scale=2),
    # Poisson(5) shifted by 2, i.e. integers >= 2.
    max_depth=scipy.stats.poisson(mu=5, loc=2),
    # Uniform on [0.2, 2.2].
    min_child_weight=scipy.stats.uniform(loc=0.2, scale=2),
    # Uniform on [0.4, 1.0].
    subsample=scipy.stats.uniform(loc=0.4, scale=0.6),
    use_label_encoder=[False],
    eval_metric=['logloss'],
)


In [13]:
from sklearn.model_selection import RandomizedSearchCV

# Draw 15 candidates from `parameters_dict` and evaluate each with `cv` folds.
# An explicit `random_state` makes the sampled candidates depend only on this
# cell, not on how much of the global NumPy random stream earlier cells (or a
# previous run of this cell) have already consumed.
random_search = RandomizedSearchCV(XGBClassifier(),
                                   parameters_dict,
                                   n_iter=15,
                                   scoring=metrics,
                                   cv=cv,
                                   refit='roc_auc',
                                   random_state=321)
random_search.fit(X, y)

RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           enable_categorical=False, gamma=None,
                                           gpu_id=None, importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints...
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000198EEC042B0>,
                                        'max_depth': <sc

## Najlepsze parametry  i wyniki

In [14]:
# Parameter combination selected by the randomized search (the search refits on 'roc_auc').
random_search.best_params_

{'eta': 0.18578928224710567,
 'eval_metric': 'logloss',
 'gamma': 5.241902239056384,
 'max_depth': 7,
 'min_child_weight': 2.090460476921252,
 'subsample': 0.8188008141658946,
 'use_label_encoder': False}

In [15]:
# Full cross-validation log of the randomized search, one row per candidate.
res = pd.DataFrame(random_search.cv_results_)

# Rank candidates by mean ROC AUC and keep only the three mean test scores.
(
    res
    .sort_values(by='mean_test_roc_auc', ascending=False)
    .loc[:, ['mean_test_roc_auc', 'mean_test_accuracy', 'mean_test_f1']]
)

Unnamed: 0,mean_test_roc_auc,mean_test_accuracy,mean_test_f1
9,0.695282,0.637865,0.575026
2,0.695128,0.637688,0.569668
11,0.694413,0.63643,0.568677
5,0.694307,0.636224,0.568199
12,0.693608,0.636538,0.572562
10,0.684113,0.630671,0.574304
4,0.683699,0.630475,0.57711
1,0.682123,0.627733,0.573198
13,0.678541,0.625915,0.572993
6,0.677432,0.626986,0.546774


In [16]:
# Mean cross-validated score of the best candidate for the refit metric ('roc_auc').
random_search.best_score_

0.6952823219899329

# Bayesian search

In [17]:
import skopt

# Search space for the Bayesian optimisation: continuous ranges for the float
# hyperparameters, an integer range for the tree depth, and single-element
# lists for the settings that are not tuned.
search_spaces = dict(
    eta=skopt.space.Real(0.1, 1),
    gamma=skopt.space.Real(0.5, 6),
    max_depth=skopt.space.Integer(2, 15),
    min_child_weight=skopt.space.Real(0.5, 6),
    subsample=skopt.space.Real(0.4, 1),
    use_label_encoder=[False],
    eval_metric=['logloss'],
)

In [18]:
# Bayesian optimisation over `search_spaces`: 15 iterations, each evaluated
# with `cv` folds and scored by ROC AUC only.
# An explicit `random_state` makes the explored points depend only on this
# cell, not on how much of the global NumPy random stream earlier cells have
# already consumed.
opt = skopt.BayesSearchCV(
        estimator=XGBClassifier(),
        search_spaces=search_spaces,
        scoring='roc_auc',
        n_iter=15,
        verbose=0,
        cv=cv,
        random_state=321
)
opt.fit(X, y)

BayesSearchCV(cv=5,
              estimator=XGBClassifier(base_score=None, booster=None,
                                      colsample_bylevel=None,
                                      colsample_bynode=None,
                                      colsample_bytree=None,
                                      enable_categorical=False, gamma=None,
                                      gpu_id=None, importance_type=None,
                                      interaction_constraints=None,
                                      learning_rate=None, max_delta_step=None,
                                      max_depth=None, min_child_weight=None,
                                      missing=nan, monotone_constraints=None...
              search_spaces={'eta': Real(low=0.1, high=1, prior='uniform', transform='normalize'),
                             'eval_metric': ['logloss'],
                             'gamma': Real(low=0.5, high=6, prior='uniform', transform='normalize'),
                 

## Najlepsze parametry  i wyniki

In [19]:
# Parameter combination selected by the Bayesian search.
opt.best_params_

OrderedDict([('eta', 0.1),
             ('eval_metric', 'logloss'),
             ('gamma', 6.0),
             ('max_depth', 9),
             ('min_child_weight', 0.601287375759972),
             ('subsample', 0.5567221893212329),
             ('use_label_encoder', False)])

In [20]:
# Mean cross-validated ROC AUC of the best candidate (the search's only scoring metric).
opt.best_score_

0.6955181467398923

In [None]:
# The Bayesian search was scored on ROC AUC only, so re-evaluate its best
# estimator with the full metric list to make it comparable with the other
# experiments. Stored as `score_bayes` because these are not the scores of the
# default model.
score_bayes = cross_validate(opt.best_estimator_, X, y, cv=cv, scoring=metrics)
pd.DataFrame(score_bayes).describe()

# Podsumowanie
Jak widzimy, wraz z kolejnymi etapami dostawaliśmy lepsze wyniki. Jednak różnica pomiędzy wynikami była niewielka, ponieważ wyniki wahały się w zakresie
$0.69 - 0.6955$.