In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

## 학습과 예측에 사용할 데이터셋 만들기

In [2]:
df = pd.read_csv("data/diabetes_feature.csv")
df.shape

(768, 16)

In [3]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Pregnancies_high',
       'Age_low', 'Age_middle', 'Age_high', 'Insulin_nan', 'Insulin_log',
       'low_glu_insulin'],
      dtype='object')

In [4]:
# feature engineering 결과 더 잘 나왔던 변수들만 사용하도록 - 예측변수

X = df[[ 'Glucose', 'BloodPressure', 'SkinThickness',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Pregnancies_high',
       'Insulin_nan', 'low_glu_insulin']]
X.shape

(768, 9)

In [5]:
y = df['Outcome']
y.shape

(768,)

In [6]:
# 사이킷런에서 제공하는 model_selection의 train_test_split으로 만든다.
# random state 는 매번 실행때마다 결과가 달라지는 것을 방지.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 여러 개의 알고리즘을 사용해서 비교하기

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

estimators = [DecisionTreeClassifier(random_state=42),
             RandomForestClassifier(random_state=42),
             GradientBoostingClassifier(random_state=42)]
estimators

[DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=42, splitter='best'),
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=42, verbose=0,
                        warm_start=False),
 GradientBoost

In [27]:
results = []
for estimator in estimators:
    result=[]
    result.append(estimator.__class__.__name__)
    results.append(result)

results

[['DecisionTreeClassifier'],
 ['RandomForestClassifier'],
 ['GradientBoostingClassifier']]

In [41]:
from sklearn.model_selection import RandomizedSearchCV

max_depth = np.random.randint(2, 20, 10)

max_features = np.random.uniform(0.3, 1.0, 10)

param_distributions = {'max_depth':max_depth, 'max_features':max_features}
# 그런데 알고리즘에 따라 파라미터가 다 다를 수 있다. 그래서 밑에 if 조건 걸어줌

# verbose ; 로그 찍냐 마냐

results = []
for estimator in estimators:
    result = []
    if estimator.__class__.__name__ != 'DecisionTreeClassifier':
        param_distributions['n_estimators'] = np.random.randint(100, 200, 10)
        
    clf = RandomizedSearchCV(estimator, param_distributions=param_distributions, n_iter=100,
                  scoring ='accuracy', n_jobs=-1, cv=5, verbose=2)
    clf.fit(X_train, y_train)
    result.append(estimator.__class__.__name__)
    result.append(clf.best_params_)
    result.append(clf.best_score_)
    result.append(clf.score(X_test, y_test))
    result.append(clf.cv_results_)
    results.append(result)

results

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    4.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.3min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   20.7s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   51.9s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.3min finished


[['DecisionTreeClassifier',
  {'max_features': 0.6740581159285474, 'max_depth': 7},
  0.871398107423697,
  0.8311688311688312,
  {'mean_fit_time': array([0.01084738, 0.01097064, 0.0095758 , 0.01446538, 0.01586123,
          0.01099262, 0.01205101, 0.01337323, 0.0157866 , 0.01204987,
          0.01176291, 0.01318069, 0.01018939, 0.00973935, 0.01189513,
          0.00993381, 0.01179681, 0.01309471, 0.0143631 , 0.01303563,
          0.01375961, 0.01176453, 0.01075549, 0.01276703, 0.01276484,
          0.01040068, 0.01176829, 0.01276917, 0.01117029, 0.0106792 ,
          0.01156902, 0.01965952, 0.02594781, 0.02485819, 0.02164741,
          0.01717477, 0.03178778, 0.02716961, 0.02530041, 0.01629004,
          0.02249274, 0.01696591, 0.01471753, 0.01546378, 0.01649652,
          0.0216156 , 0.01356349, 0.01219006, 0.01175995, 0.01222229,
          0.01097083, 0.01077967, 0.01137042, 0.0116909 , 0.01108422,
          0.01201067, 0.01336417, 0.01176872, 0.01056533, 0.0094718 ,
          0.0125

In [43]:
df = pd.DataFrame(results, columns=['estimator', 'best_params', 'train_score', 'test_score', 'cv_result'])

In [47]:
pd.DataFrame(df.loc[1, "cv_result"]).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,1.596668,0.090850,0.100720,0.017349,192,0.353563,11,"{'n_estimators': 192, 'max_features': 0.353562...",0.837398,0.943089,0.853659,0.878049,0.918033,0.886046,0.039387,1
94,1.274922,0.018251,0.072850,0.005895,192,0.425645,11,"{'n_estimators': 192, 'max_features': 0.425645...",0.837398,0.943089,0.853659,0.878049,0.918033,0.886046,0.039387,1
30,1.503013,0.113102,0.099602,0.011875,192,0.367922,11,"{'n_estimators': 192, 'max_features': 0.367921...",0.837398,0.943089,0.853659,0.878049,0.918033,0.886046,0.039387,1
18,0.869584,0.033572,0.056495,0.012315,113,0.353563,12,"{'n_estimators': 113, 'max_features': 0.353562...",0.837398,0.943089,0.869919,0.869919,0.909836,0.886032,0.036624,4
36,1.659901,0.074912,0.082662,0.007245,188,0.425645,14,"{'n_estimators': 188, 'max_features': 0.425645...",0.837398,0.943089,0.861789,0.878049,0.909836,0.886032,0.036983,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,0.857662,0.156168,0.077880,0.054002,113,0.789841,3,"{'n_estimators': 113, 'max_features': 0.789841...",0.821138,0.934959,0.853659,0.853659,0.893443,0.871371,0.039195,94
31,1.347146,0.068845,0.099326,0.017622,171,0.866617,3,"{'n_estimators': 171, 'max_features': 0.866617...",0.813008,0.934959,0.845528,0.853659,0.901639,0.869759,0.043208,97
4,1.074814,0.074598,0.066092,0.022162,132,0.866617,3,"{'n_estimators': 132, 'max_features': 0.866617...",0.821138,0.934959,0.845528,0.853659,0.893443,0.869745,0.040056,98
52,1.672917,0.138466,0.091912,0.032250,188,0.789841,3,"{'n_estimators': 188, 'max_features': 0.789841...",0.813008,0.934959,0.845528,0.845528,0.901639,0.868133,0.043930,99


In [18]:
clf.best_params_

{'max_features': 0.7342518428169433, 'max_depth': 7}

In [19]:
clf.best_score_

0.871398107423697