# Warsztaty Badawcze
## Praca domowa 2
*Marcel Witas*

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate
import scipy, skopt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

## Wczytanie danych

Wczytujemy zbiór danych, na którym został wykonany preprocessing podczas poprzedniej pracy domowej.

In [2]:
diabetes_df = pd.read_csv("diabetes_preprocessed.csv")
diabetes_df.head()

Unnamed: 0,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,...,discharge_disposition_id_23,discharge_disposition_id_30,admission_source_id_0,admission_source_id_1,admission_source_id_2,admission_source_id_4,admission_source_id_5,admission_source_id_6,admission_source_id_7,admission_source_id_30
0,0,5,1,41,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
1,0,15,3,59,0,18,0,0,0,9,...,0,0,0,0,0,0,0,0,1,0
2,0,25,2,11,5,13,2,0,1,6,...,0,0,0,0,0,0,0,0,1,0
3,1,35,2,44,1,16,0,0,0,7,...,0,0,0,0,0,0,0,0,1,0
4,1,45,1,51,0,8,0,0,0,5,...,0,0,0,0,0,0,0,0,1,0


In [3]:
X = diabetes_df.drop('readmitted', axis=1)
y = diabetes_df['readmitted']

## Podział na zbiory treningowe i testowe

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    random_state=13, train_size=0.7)

## Model
Użyty zostanie ExtraTreesClassifier. Ta klasa implementuje meta estymator, który dopasowuje liczbę losowych drzew decyzyjnych (tzw. dodatkowych drzew) do różnych podpróbek zbioru danych i używa uśredniania w celu poprawy dokładności predykcyjnej i kontroli nadmiernego dopasowania.
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html

## Model z domyślnymi hiperparametrami

In [5]:
xTree = ExtraTreesClassifier()
xTree.fit(X_train, y_train)


ExtraTreesClassifier()

In [6]:
pr = xTree.predict(X_test)

In [7]:
prp = xTree.predict_proba(X_test)

In [8]:
accuracy_score(y_test, pr)

0.6240254209526306

In [9]:
roc_auc_score(y_test, prp[:, 1])

0.6729110563586784

## Ręczna zmiana hiperparametrów


Zdecydowałem się na zmianę następujących hiperparametrów:
* n_estimators 
* criterion
* min_samples_split
* max_features 
* min_samples_leaf

In [10]:
xTree_man = ExtraTreesClassifier(n_estimators=500,
                                 criterion="entropy",
                                 min_samples_split=3,
                                 max_features = 'sqrt',
                                 min_samples_leaf = 3
                                )


In [11]:
xTree_man.fit(X_train, y_train)

ExtraTreesClassifier(criterion='entropy', max_features='sqrt',
                     min_samples_leaf=3, min_samples_split=3, n_estimators=500)

In [12]:
pr = xTree_man.predict(X_test)

In [13]:
prp = xTree_man.predict_proba(X_test)

In [14]:
accuracy_score(y_test, pr)

0.6375876302168643

In [15]:
roc_auc_score(y_test, prp[:, 1])

0.693390865494723

Teraz spróbujemy zoptymalizować te hiperparametry.

## Random search (RS)


In [5]:
xTree_rs = ExtraTreesClassifier()

parameters_distribution = {'n_estimators': scipy.stats.randint(80, 600), 
                          'min_samples_split' : scipy.stats.randint(2,6), 
                           'max_features' : scipy.stats.randint(5, 11), 'criterion':['gini', 'entropy'],
                           'min_samples_leaf' : scipy.stats.randint(1, 8)}




In [6]:
rs_clf = RandomizedSearchCV(xTree_rs, parameters_distribution, random_state=13, cv=3)

In [7]:
rs_clf.fit(X_train, y_train)

RandomizedSearchCV(cv=3, estimator=ExtraTreesClassifier(),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000029898EDC550>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000029898EDC7F0>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000029898EDC3D0>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000029898EDC0D0>},
                   random_state=13)

In [8]:
rs_clf.best_params_

{'criterion': 'entropy',
 'max_features': 10,
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'n_estimators': 550}

In [9]:
pr = rs_clf.predict(X_test)


In [10]:
prp = rs_clf.predict_proba(X_test)

In [11]:
accuracy_score(y_test, pr)

0.6403066238616262

In [12]:
roc_auc_score(y_test, prp[:, 1])

0.6939242173741684

## Grid search (GS)


In [13]:
parameters = {'criterion':('gini', 'entropy'), 
              'min_samples_split':[4,5], 
               'max_features' : [6,7,8]}
xTree_grid = ExtraTreesClassifier()

In [14]:
grid_clf = GridSearchCV(xTree_grid, parameters, scoring = 'roc_auc',  cv=3)


In [15]:
grid_clf.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=ExtraTreesClassifier(),
             param_grid={'criterion': ('gini', 'entropy'),
                         'max_features': [6, 7, 8],
                         'min_samples_split': [4, 5]},
             scoring='roc_auc')

In [16]:
grid_clf.best_params_

{'criterion': 'entropy', 'max_features': 7, 'min_samples_split': 5}

In [17]:
pr = grid_clf.predict(X_test)

In [18]:
prp = grid_clf.predict_proba(X_test)

In [19]:
accuracy_score(y_test, pr)

0.6297254799187578

In [20]:
roc_auc_score(y_test, prp[:, 1])

0.6800297269855862

## Optymalizacja bayesowska (BO)


In [21]:
search_spaces = {
        'criterion': ['gini', 'entropy'],
        'n_estimators': skopt.space.space.Integer(10, 500),
        'max_features': skopt.space.space.Integer(5, 12),
    'min_samples_leaf':skopt.space.space.Integer(1, 10),
}

xTree_bo = ExtraTreesClassifier()

In [22]:
bo_clf = skopt.BayesSearchCV(
        estimator=xTree_bo,
        search_spaces=search_spaces,
        scoring='roc_auc',
        n_jobs=4,
        n_iter=15,
        verbose=0,
        random_state=13
    )

In [23]:
bo_clf.fit(X_train, y_train)

BayesSearchCV(estimator=ExtraTreesClassifier(), n_iter=15, n_jobs=4,
              random_state=13, scoring='roc_auc',
              search_spaces={'criterion': ['gini', 'entropy'],
                             'max_features': Integer(low=5, high=12, prior='uniform', transform='normalize'),
                             'min_samples_leaf': Integer(low=1, high=10, prior='uniform', transform='normalize'),
                             'n_estimators': Integer(low=10, high=500, prior='uniform', transform='normalize')})

In [24]:
bo_clf.best_params_

OrderedDict([('criterion', 'entropy'),
             ('max_features', 12),
             ('min_samples_leaf', 10),
             ('n_estimators', 390)])

In [25]:
pr = bo_clf.predict(X_test)

In [26]:
prp = bo_clf.predict_proba(X_test)

In [27]:
accuracy_score(y_test, pr)

0.6420756076787001

In [28]:
roc_auc_score(y_test, prp[:, 1])

0.6958901121776782

## Podsumowanie

<table>
<thead>
  <tr>
    <th></th>  
    <th>Ręczna zmiana</th>
    <th>default</th>
    <th>GS</th>
    <th>RS</th>
    <th>BO</th>   
  </tr>
</thead>
<tbody>
   <tr>
    <td>AUC</td>
    <td>0.6934</td>
    <td>0.6729</td>
    <td>0.6800</td>
    <td>0.6939</td>
    <td>0.6959</td>
      </tr>
   <tr>
    <td>Accuracy</td>
    <td>0.6376</td>
    <td>0.6240</td>
    <td>0.6297</td>
    <td>0.6403</td>
    <td>0.6420</td>
      </tr>
  </tbody>
</table>



Różnice w AUC score nie okazały się zbyt duże. Wszystkie modele z optymalizacją hiperparametrów były lepsze, niż domyślne. Najlepiej sprawdziła się Optymalizacja Bayesowska, zarówno pod względem metryki AUC, jak i accuracy. Niewiele gorsze była RandomSearch. Wadą jest oczywiście o wiele dłuższy czas wykonanania

n_estimators, criterion, min_samples_leaf, min_samples_split, max_features