# WB AutoML - praca domowa 2
## Trenowanie modeli i tuning parametrów
### Michał Tomczyk

##### wstępne operacje

In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.utils.fixes import loguniform
df = pd.read_csv("df_from_hw1.csv")
df = df.drop([df.columns[0]],axis = 1)
df = df.astype(int)
df.head(20)

Unnamed: 0,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,15,1,1,7,3,59,0,18,0,0,...,0,3,1,0,0,0,0,0,1,1
1,25,1,1,7,2,11,5,13,2,0,...,0,1,1,0,0,0,0,1,1,2
2,35,1,1,7,2,44,1,16,0,0,...,0,3,1,0,0,0,0,0,1,2
3,45,1,1,7,1,51,0,8,0,0,...,0,2,1,0,0,0,0,0,1,2
4,55,2,1,2,3,31,6,16,0,0,...,0,2,1,0,0,0,0,1,1,1
5,65,3,1,2,4,70,1,21,0,0,...,0,2,1,0,0,0,0,0,1,2
6,75,1,1,7,5,73,0,12,0,0,...,0,1,1,0,0,0,0,1,1,1
7,85,2,1,4,13,68,2,28,0,0,...,0,2,1,0,0,0,0,0,1,2
8,95,3,3,4,12,33,3,18,0,0,...,0,2,1,0,0,0,0,0,1,2
9,45,1,1,7,9,47,2,17,0,0,...,0,2,1,0,0,0,0,1,1,1


##### zamiana zmiennej objaśnianej na binarną

In [2]:
df.loc[df['readmitted'] != 2, 'readmitted'] = 1
df.loc[df['readmitted'] == 2, 'readmitted'] = 0

In [3]:
df_X = df.drop(['readmitted'], axis = 1)
df_Y = df.readmitted
df_X_train, df_X_test, df_Y_train, df_Y_test = train_test_split(df_X, df_Y, test_size = 0.3)
df_Y_train

57643    1
58612    0
71505    1
56448    1
60119    1
        ..
2012     0
93172    1
22741    0
75824    0
20162    1
Name: readmitted, Length: 68637, dtype: int32

### Trenowanie modeli
##### domyślne parametry

Jako model wybierzmy Random Forest Classifier. Jako dodatkową metrykę wybierzmy F1 score

In [4]:
model = RandomForestClassifier()
model.fit(df_X_train, df_Y_train)

RandomForestClassifier()

In [5]:
preds_prob = model.predict_proba(df_X_test)
preds = model.predict(df_X_test)
preds

array([1, 1, 1, ..., 1, 1, 0])

In [6]:
metrics.roc_auc_score(df_Y_test, preds_prob[:,1])

0.6827298416196539

In [7]:
metrics.f1_score(df_Y_test, preds)

0.5722433460076045

##### ręczna zmiana parametrów

Zwiększmy wartość n_estimators oraz min_samples split, zmieńmy typ parametru criterion 

In [8]:
model1 = RandomForestClassifier(n_estimators = 250, criterion = 'entropy', min_samples_split = 10)
model1.fit(df_X_train, df_Y_train)

RandomForestClassifier(criterion='entropy', min_samples_split=10,
                       n_estimators=250)

In [9]:
preds1 = model1.predict(df_X_test)
preds1_prob = model1.predict_proba(df_X_test)

In [10]:
metrics.roc_auc_score(df_Y_test, preds1_prob[:,1])

0.6935137273450676

In [11]:
metrics.f1_score(df_Y_test, preds1)

0.5785711464792069

##### random search

In [12]:
modelRS = RandomForestClassifier()
params = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [13]:
random_search = RandomizedSearchCV(estimator = modelRS, param_distributions = params,
                                   cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [14]:
random_search.fit(df_X_train, df_Y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 16.4min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [15]:
random_search.best_params_

{'n_estimators': 600,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 100,
 'bootstrap': False}

In [16]:
best_rs = random_search.best_estimator_
best_rs.fit(df_X_train, df_Y_train)
preds_best_rs = best_rs.predict(df_X_test)
preds_best_rs_prob = best_rs.predict_proba(df_X_test)
print(metrics.roc_auc_score(df_Y_test, preds_best_rs_prob[:,1]))
print(metrics.f1_score(df_Y_test, preds_best_rs))

0.6946164624570876
0.5759336099585063


##### grid search

In [24]:
grid_search = GridSearchCV(estimator = modelRS, param_grid =  {'bootstrap': [True, False],
 'max_depth': [90, 100, 110],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [2, 4, 6],
 'min_samples_split': [5, 10],
 'n_estimators': [200, 400, 600]}, cv=3, n_jobs = -1, verbose = 2)

In [25]:
grid_search.fit(df_X_train, df_Y_train)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 21.4min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 50.3min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 104.8min
[Parallel(n_jobs=-1)]: Done 648 out of 648 | elapsed: 107.9min finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'max_depth': [90, 100, 110],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [2, 4, 6],
                         'min_samples_split': [5, 10],
                         'n_estimators': [200, 400, 600]},
             verbose=2)

In [28]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 90,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 400}

In [29]:
best_gs = grid_search.best_estimator_
best_gs.fit(df_X_train, df_Y_train)
preds_best_gs = best_gs.predict(df_X_test)
preds_best_gs_prob = best_gs.predict_proba(df_X_test)
print(metrics.roc_auc_score(df_Y_test, preds_best_gs_prob[:,1]))
print(metrics.f1_score(df_Y_test, preds_best_gs))

0.6954017735210856
0.5770854911959207


##### optymalizacja bayesowska

In [35]:
modelBO = RandomForestClassifier()
from skopt import BayesSearchCV
bay_opt = BayesSearchCV(model, {'bootstrap': [True, False],
 'max_depth': [90, 100, 110],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [2, 4, 6],
 'min_samples_split': [5, 10],
 'n_estimators': [200, 400, 600]}, n_iter = 10, cv = 3, n_jobs = -1, verbose = 2)

In [37]:
bay_opt.fit(df_X_train, df_Y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   42.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   39.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   58.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   55.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   55.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   19.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   30.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   39.7s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   58.5s finished


BayesSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=10, n_jobs=-1,
              search_spaces={'bootstrap': [True, False],
                             'max_depth': [90, 100, 110],
                             'max_features': ['auto', 'sqrt'],
                             'min_samples_leaf': [2, 4, 6],
                             'min_samples_split': [5, 10],
                             'n_estimators': [200, 400, 600]},
              verbose=2)

In [38]:
bay_opt.best_params_

OrderedDict([('bootstrap', False),
             ('max_depth', 90),
             ('max_features', 'sqrt'),
             ('min_samples_leaf', 6),
             ('min_samples_split', 6),
             ('n_estimators', 400)])

In [41]:
best_bo = bay_opt.best_estimator_
best_bo.fit(df_X_train, df_Y_train)
preds_best_bo = best_bo.predict(df_X_test)
preds_best_bo_prob = best_bo.predict_proba(df_X_test)
print(metrics.roc_auc_score(df_Y_test, preds_best_bo_prob[:,1]))
print(metrics.f1_score(df_Y_test, preds_best_bo))

0.6951475922361665
0.5775189191330877


### Podsumowanie

Każda z metod tuningu hiperparametrów dała nam lekką poprawę wyniku AUC. Najlepszy wynik uzyskaliśmy korzystając z metody Grid Search.