# PD2 - Adam Frej

## Import danych

In [135]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = 60
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
import scipy.stats, skopt
np.random.seed(123)

In [2]:
data = pd.read_csv("diabetic_data_preprocessed.csv")

In [19]:
data.head()

Unnamed: 0,race,gender,age,payer_code,medical_specialty,diag_1,diag_2,diag_3,A1Cresult,metformin,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_diagnoses,readmitted
0,3,0,0,0,16,138,122,126,2,1,1,1,1,1,1,1,0,5,23,0,1,41,0,1,1,0
1,3,0,1,0,0,28,11,138,2,1,1,1,1,1,3,0,1,0,0,6,3,59,0,18,9,1
2,1,0,2,0,0,104,10,138,2,1,2,1,1,1,1,1,1,0,0,6,2,11,5,13,6,0
3,3,1,3,0,0,124,133,48,2,1,1,1,1,1,3,0,1,0,0,6,2,44,1,16,7,0
4,3,1,4,0,0,8,133,8,2,1,2,1,1,1,2,0,1,0,0,6,1,51,0,8,5,0


In [66]:
x_data = data.drop(columns='readmitted')
y_data = data['readmitted']

## Modelowanie - GradientBoostingClassifier

### Domyślne parametry

In [142]:
gbc = GradientBoostingClassifier()
gbc_def = cross_validate(gbc, x_data, y_data, cv=5, scoring=['roc_auc', 'accuracy'])
gbc_def

{'fit_time': array([12.1364255 , 11.83867073, 12.31958342, 12.23300886, 11.99380374]),
 'score_time': array([0.07156181, 0.07156086, 0.06755829, 0.06805873, 0.06505585]),
 'test_roc_auc': array([0.63688693, 0.65501016, 0.63206664, 0.63698394, 0.64235791]),
 'test_accuracy': array([0.59015427, 0.5996659 , 0.59146072, 0.59268904, 0.59431042])}

In [144]:
gbc_def['test_roc_auc'].mean()

0.6406611174237551

In [145]:
gbc_def['test_accuracy'].mean()

0.5936560688422402

### Ręczna zmiana hiperparametrów

In [146]:
gbc = GradientBoostingClassifier(n_estimators = 150, max_depth=5)
gbc_adj = cross_validate(gbc, x_data, y_data, cv=5, scoring=['roc_auc', 'accuracy'])
gbc_adj

{'fit_time': array([29.60343051, 29.85214424, 29.86065197, 30.25198793, 30.9716053 ]),
 'score_time': array([0.15213084, 0.15863657, 0.14412403, 0.13811874, 0.13961959]),
 'test_roc_auc': array([0.6484925 , 0.66165558, 0.65003123, 0.6577971 , 0.65746475]),
 'test_accuracy': array([0.6052373 , 0.61013118, 0.60305606, 0.60875547, 0.60620056])}

In [147]:
gbc_adj['test_roc_auc'].mean()

0.6550882315090395

In [148]:
gbc_adj['test_accuracy'].mean()

0.6066761142111747

### Random search

In [129]:
parameters_distribution = {'n_estimators':scipy.stats.randint(50,150), 'max_depth':scipy.stats.randint(2,4)}
gbc = GradientBoostingClassifier()
clf_random = RandomizedSearchCV(gbc, parameters_distribution, scoring=['roc_auc', 'accuracy'], refit='roc_auc', cv=5)
clf_random.fit(x_data, y_data)

RandomizedSearchCV(cv=5, estimator=GradientBoostingClassifier(),
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000021618C88708>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000021618C8B608>},
                   refit='roc_auc', scoring=['roc_auc', 'accuracy'])

In [131]:
clf_random.cv_results_

{'mean_fit_time': array([10.1672338 ,  4.93984337, 10.71910801,  5.31376438,  9.35043173,
         9.5068666 ,  9.68071594,  8.95138927,  4.48875537, 10.43726573]),
 'std_fit_time': array([0.36567493, 0.12561238, 0.16550311, 0.07868958, 0.16116392,
        0.16069498, 0.18194392, 0.05396719, 0.02718094, 0.23601792]),
 'mean_score_time': array([0.06015191, 0.03913388, 0.06105285, 0.04093547, 0.0578505 ,
        0.05905137, 0.05795031, 0.05704937, 0.03773327, 0.06115274]),
 'std_score_time': array([0.00267453, 0.00165682, 0.00327406, 0.00177352, 0.0021137 ,
        0.00238939, 0.00290749, 0.00295189, 0.00326789, 0.00402085]),
 'param_max_depth': masked_array(data=[3, 2, 2, 2, 3, 3, 2, 3, 2, 2],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[84, 60, 127, 65, 77, 80, 120, 76, 56, 125],
              mask=[False, False, False, False, F

In [159]:
clf_random.cv_results_['mean_test_roc_auc']

array([0.62174191, 0.62248   , 0.62306423, 0.630174  , 0.63035782,
       0.63083794, 0.63771775, 0.63793066, 0.63799115, 0.63849355])

Najlepszy wynik: 0.63849355, dla parametrów: {'max_depth': 2, 'n_estimators': 125}.

### Grid search

In [80]:
parameters = {'n_estimators':[50,100,150], 'max_depth':[2,3,4]}
gbc = GradientBoostingClassifier()
clf = GridSearchCV(gbc, parameters, scoring=['roc_auc', 'accuracy'], return_train_score=True, refit='roc_auc', cv=5)
clf.fit(x_data, y_data)

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(),
             param_grid={'max_depth': [2, 3, 4],
                         'n_estimators': [50, 100, 150]},
             refit='roc_auc', return_train_score=True,
             scoring=['roc_auc', 'accuracy'])

In [81]:
clf.cv_results_

{'mean_fit_time': array([ 4.07339921,  7.98115549, 11.985396  ,  5.97273078, 12.04774909,
        17.9623302 ,  8.0598206 , 16.12905421, 23.73298736]),
 'std_fit_time': array([0.06352266, 0.03539926, 0.09432218, 0.08670149, 0.39907465,
        0.09776181, 0.0671331 , 0.33270422, 0.46449541]),
 'mean_score_time': array([0.0362309 , 0.05164437, 0.06805854, 0.04433794, 0.06955948,
        0.0905777 , 0.05474687, 0.09047813, 0.11609979]),
 'std_score_time': array([0.00244355, 0.00250052, 0.00313354, 0.00265915, 0.00232618,
        0.00327385, 0.00269689, 0.00480433, 0.00425815]),
 'param_max_depth': masked_array(data=[2, 2, 2, 3, 3, 3, 4, 4, 4],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[50, 100, 150, 50, 100, 150, 50, 100, 150],
              mask=[False, False, False, False, False, False, False, False,
                    False],
     

In [160]:
clf.cv_results_['mean_test_roc_auc']

array([0.62065615, 0.62887269, 0.63350562, 0.63241274, 0.64066036,
       0.64541672, 0.64007185, 0.64852722, 0.65279393])

Najlepszy wynik: 0.65279393, dla parametrów: {'max_depth': 4, 'n_estimators': 150}.

### Optymalizacja bayesowska

In [140]:
search_spaces = {'n_estimators':skopt.space.space.Integer(5, 150), 'max_depth':skopt.space.space.Integer(2,5)}
gbc = GradientBoostingClassifier()
opt = skopt.BayesSearchCV(gbc, search_spaces=search_spaces, n_iter=10, scoring='roc_auc', cv=5, verbose=0)
opt.fit(x_data, y_data)

BayesSearchCV(cv=5, estimator=GradientBoostingClassifier(), n_iter=10,
              scoring='roc_auc',
              search_spaces={'max_depth': Integer(low=2, high=5, prior='uniform', transform='normalize'),
                             'n_estimators': Integer(low=5, high=150, prior='uniform', transform='normalize')})

In [141]:
opt.cv_results_

{'mean_fit_time': array([10.46999445, 17.72082319,  1.44354043,  5.67687593,  8.25609255,
         8.26840377,  1.3180326 , 10.56147308, 18.65792828,  6.62119184]),
 'std_fit_time': array([0.60462205, 0.29039039, 0.02382977, 0.16001597, 0.20189207,
        0.13751133, 0.02002269, 0.3500312 , 0.32389133, 0.10498034]),
 'mean_score_time': array([0.03352838, 0.04734049, 0.01401181, 0.0236238 , 0.02922482,
        0.03112659, 0.01371164, 0.03292818, 0.05214467, 0.02541823]),
 'std_score_time': array([0.00223749, 0.00186175, 0.00126652, 0.00146838, 0.00150473,
        0.00285556, 0.00116686, 0.0011586 , 0.00182931, 0.00124469]),
 'param_max_depth': masked_array(data=[3, 4, 3, 2, 4, 4, 2, 3, 5, 3],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[83, 105, 11, 67, 48, 48, 15, 86, 92, 55],
              mask=[False, False, False, False, Fal

In [161]:
opt.cv_results_['mean_test_score']

array([0.63834644, 0.6489053 , 0.60887977, 0.62327735, 0.63917617,
       0.63917675, 0.60308095, 0.63894039, 0.6518811 , 0.63347987])

Najlepszy wynik: 0.6518811 , dla parametrów: ('max_depth', 5), ('n_estimators', 92).

## Wnioski
Generalnie wyniki wszędzie były podobne. Nie jest zaskoczeniem, że powiększanie parametrów ('max_depth', 'n_estimators') polepsza wyniki, bo zwiększają one złożoność modelu. Jednak jest to bardzo kosztowne. Dla większych modeli z kroswalidacją czasy dochodziły do 10 minut.