In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [6]:
df= pd.read_csv('data/diabetes_feature.csv')
df.shape

(768, 16)

In [7]:
df_insulin = pd.read_csv('data/diabetes_fill_insulin.csv')
df['Insulin'] = df_insulin['Insulin']
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Pregnancies_high,Age_low,Age_middle,Age_high,Insulin_nan,Insulin_log,low_glu_insulin
0,6,148,72,35,188.652003,33.6,0.627,50,1,False,False,True,False,206.0,5.332719,False
1,1,85,66,29,53.467915,26.6,0.351,31,0,False,False,True,False,130.0,4.875197,False
2,8,183,64,0,184.168548,23.3,0.672,32,1,True,False,True,False,206.0,5.332719,False
3,1,89,66,23,94.0,28.1,0.167,21,0,False,True,False,False,94.0,4.553877,True
4,0,137,40,35,168.0,43.1,2.288,33,1,False,False,True,False,168.0,5.129899,False


In [8]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Pregnancies_high,Age_low,Age_middle,Age_high,Insulin_nan,Insulin_log,low_glu_insulin
0,6,148,72,35,188.652003,33.6,0.627,50,1,False,False,True,False,206.0,5.332719,False
1,1,85,66,29,53.467915,26.6,0.351,31,0,False,False,True,False,130.0,4.875197,False
2,8,183,64,0,184.168548,23.3,0.672,32,1,True,False,True,False,206.0,5.332719,False
3,1,89,66,23,94.0,28.1,0.167,21,0,False,True,False,False,94.0,4.553877,True
4,0,137,40,35,168.0,43.1,2.288,33,1,False,False,True,False,168.0,5.129899,False


## 학습과 예측에 사용할 데이터셋 만들기

In [9]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Pregnancies_high',
       'Age_low', 'Age_middle', 'Age_high', 'Insulin_nan', 'Insulin_log',
       'low_glu_insulin'],
      dtype='object')

In [10]:
X = df[['Glucose', 'BloodPressure', 'SkinThickness',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Pregnancies_high',
       'Insulin']]
X.shape

(768, 8)

In [12]:
y = df['Outcome']
y.shape

(768,)

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [14]:
X_train.shape, y_train.shape

((614, 8), (614,))

In [15]:
X_test.shape, y_test.shape

((154, 8), (154,))

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

estimators = [DecisionTreeClassifier(random_state=42),
             RandomForestClassifier(random_state=42),
             GradientBoostingClassifier(random_state=42)
            ]
estimators

[DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=42, splitter='best'),
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=42, verbose=0,
                        warm_start=False),
 GradientBoost

In [17]:
max_depth = np.random.randint(2, 20, 10)
max_depth

array([ 9, 12,  3, 10, 17, 14, 13, 13, 17,  3])

In [18]:
max_features = np.random.uniform(0.3, 1.0, 10)
max_features

array([0.8986524 , 0.37644312, 0.47559584, 0.50752648, 0.35661647,
       0.94361256, 0.50218039, 0.30828287, 0.86673051, 0.40734975])

In [19]:
results = []
for estimator in estimators:
    result = []
    result.append(estimator.__class__.__name__)
    results.append(result)
results

[['DecisionTreeClassifier'],
 ['RandomForestClassifier'],
 ['GradientBoostingClassifier']]

In [21]:
from sklearn.model_selection import RandomizedSearchCV

max_depth = np.random.randint(2, 20, 10)
max_features = np.random.uniform(0.3, 1.0, 10)

param_distributions = {"max_depth": max_depth, 
                       "max_features": max_features}

results = []
for estimator in estimators:
    result = []
    if estimator.__class__.__name__ != 'DecisionTreeClassifier':
        param_distributions["n_estimators"] = np.random.randint(100, 200, 10)
        
    clf = RandomizedSearchCV(estimator, 
                       param_distributions, 
                       n_iter=100,
                       scoring="accuracy",
                       n_jobs=-1,
                       cv=5, 
                       verbose=2
                      )

    clf.fit(X_train, y_train)
    result.append(estimator.__class__.__name__)
    result.append(clf.best_params_)
    result.append(clf.best_score_)
    result.append(clf.score(X_test, y_test))
    result.append(clf.cv_results_)
    results.append(result)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 485 out of 500 | elapsed:   11.4s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   11.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   27.6s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.4min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   29.9s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.6min finished


In [22]:
df = pd.DataFrame(results, 
             columns=["estimator", "best_params", "train_score", "test_score", "cv_result"])
df

Unnamed: 0,estimator,best_params,train_score,test_score,cv_result
0,DecisionTreeClassifier,"{'max_features': 0.7748502351689125, 'max_dept...",0.763761,0.727273,"{'mean_fit_time': [0.014754772186279297, 0.024..."
1,RandomForestClassifier,"{'n_estimators': 160, 'max_features': 0.691876...",0.789924,0.753247,"{'mean_fit_time': [1.127849006652832, 1.251852..."
2,GradientBoostingClassifier,"{'n_estimators': 189, 'max_features': 0.691876...",0.794695,0.75974,"{'mean_fit_time': [1.7169142723083497, 2.28343..."


In [23]:
pd.DataFrame(df.loc[1, "cv_result"]).sort_values(by="rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
95,1.509215,0.120999,0.073190,0.009832,160,0.691877,11,"{'n_estimators': 160, 'max_features': 0.691876...",0.804878,0.829268,0.764228,0.747967,0.803279,0.789924,0.029566,1
72,1.215665,0.068078,0.065505,0.006644,157,0.548617,7,"{'n_estimators': 157, 'max_features': 0.548617...",0.788618,0.829268,0.764228,0.739837,0.819672,0.788325,0.033439,2
34,1.429534,0.073684,0.073994,0.004826,178,0.681566,6,"{'n_estimators': 178, 'max_features': 0.681566...",0.813008,0.821138,0.764228,0.739837,0.803279,0.788298,0.031117,3
18,1.422449,0.180563,0.076763,0.020356,160,0.405229,7,"{'n_estimators': 160, 'max_features': 0.405228...",0.813008,0.829268,0.747967,0.731707,0.811475,0.786685,0.039096,4
36,1.241658,0.056580,0.066458,0.008541,160,0.548617,7,"{'n_estimators': 160, 'max_features': 0.548617...",0.788618,0.837398,0.756098,0.739837,0.811475,0.786685,0.035554,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,1.194828,0.033292,0.062958,0.005051,160,0.405229,19,"{'n_estimators': 160, 'max_features': 0.405228...",0.756098,0.788618,0.715447,0.756098,0.811475,0.765547,0.032650,96
43,1.290422,0.084057,0.071064,0.012039,146,0.405229,19,"{'n_estimators': 146, 'max_features': 0.405228...",0.747967,0.796748,0.707317,0.756098,0.811475,0.763921,0.037037,97
85,1.166922,0.086083,0.060991,0.016389,132,0.574643,19,"{'n_estimators': 132, 'max_features': 0.574643...",0.756098,0.813008,0.739837,0.731707,0.778689,0.763868,0.029348,98
64,0.863855,0.035318,0.065001,0.007531,132,0.329469,4,"{'n_estimators': 132, 'max_features': 0.329468...",0.747967,0.788618,0.747967,0.723577,0.795082,0.760642,0.027070,99


* 인슐린은 중앙값/평균으로 대체한 것보다 회귀로 예측했을 때 정확도가 더 떨어지는 결과가 나온다.