In [3]:
import pandas as pd

# Load the wine data from the remote CSV into a DataFrame (requires network access).
wine = pd.read_csv('https://bit.ly/wine_csv_data')

In [4]:
# Print the column dtypes and non-null counts to check for missing values.
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   alcohol  6497 non-null   float64
 1   sugar    6497 non-null   float64
 2   pH       6497 non-null   float64
 3   class    6497 non-null   float64
dtypes: float64(4)
memory usage: 203.2 KB


In [5]:
# List the column labels.
wine.columns

Index(['alcohol', 'sugar', 'pH', 'class'], dtype='object')

In [6]:
# Preview the first few rows of the table.
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [7]:
# Distinct label values present in the 'class' column.
wine['class'].unique()

array([0., 1.])

In [8]:
# Number of rows per class label; the two classes are not balanced.
wine['class'].value_counts()

class
1.0    4898
0.0    1599
Name: count, dtype: int64

In [9]:
# Hold-out method

In [10]:
# View the whole table as a 2-D NumPy array (rows x columns).
wine.values

array([[ 9.4 ,  1.9 ,  3.51,  0.  ],
       [ 9.8 ,  2.6 ,  3.2 ,  0.  ],
       [ 9.8 ,  2.3 ,  3.26,  0.  ],
       ...,
       [ 9.4 ,  1.2 ,  2.99,  1.  ],
       [12.8 ,  1.1 ,  3.34,  1.  ],
       [11.8 ,  0.8 ,  3.26,  1.  ]], shape=(6497, 4))

In [22]:
# Feature matrix: select the three input columns by name rather than by
# position, so the selection does not silently change if the column order of
# the CSV ever changes. to_numpy() is the pandas-recommended replacement for
# .values and yields the same float array here.
data = wine[['alcohol', 'sugar', 'pH']].to_numpy()

In [12]:
# The 'class' column as a 1-D NumPy array of labels.
wine['class'].values

array([0., 0., 0., ..., 1., 1., 1.], shape=(6497,))

In [13]:
# Target vector: the class label of every row as a 1-D NumPy array.
# to_numpy() is the pandas-recommended replacement for .values.
target = wine['class'].to_numpy()

In [14]:
from sklearn.model_selection import train_test_split

In [16]:
# 6497 rows

In [17]:
# Expected (train, test) sizes for an 80/20 split of the 6497 rows.
6497*0.8, 6497*0.2

(5197.6, 1299.4)

In [23]:
# Hold out 20% of the samples as the test set; the fixed seed keeps the
# split repeatable between runs.
(train_input, test_input,
 train_target, test_target) = train_test_split(
    data, target, random_state=42, test_size=0.2)

In [18]:
# Actual number of samples in the training and test portions.
len(train_input),len(test_input)

(5197, 1300)

In [25]:
# (samples, features) of the training portion.
train_input.shape

(5197, 3)

In [24]:
# Split the training portion once more (80/20) to carve out a validation set;
# the test set stays untouched.
(sub_input, val_input,
 sub_target, val_target) = train_test_split(
    train_input, train_target, random_state=42, test_size=0.2)

In [20]:
# Expected (sub-train, validation) sizes for an 80/20 split of the 5197 training rows.
5197*0.8, 5197*0.2

(4157.6, 1039.4)

In [21]:
# Actual number of samples in the sub-training and validation portions.
len(sub_input), len(val_input)

(4157, 1040)

In [26]:
# (samples, features) of the sub-training portion.
sub_input.shape

(4157, 3)

In [27]:
from sklearn.tree import DecisionTreeClassifier

In [28]:
# Train a decision tree with default hyperparameters on the sub-training set.
# fit() returns the estimator itself, so construction and training can be chained.
dt = DecisionTreeClassifier(random_state=42).fit(sub_input, sub_target)
dt

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [29]:
# Score on the data the tree was trained on, then on the validation set
# (one value per line); a large gap between the two indicates overfitting.
print(dt.score(sub_input, sub_target),
      dt.score(val_input, val_target),
      sep='\n')

0.9971133028626413
0.864423076923077


In [30]:
# Cross-validation

In [31]:
from sklearn.model_selection import cross_validate

In [32]:
# Cross-validate the tree on the whole training set (no separate validation
# split needed) and show the per-fold timings and test scores.
scores = cross_validate(estimator=dt, X=train_input, y=train_target)
print(scores)

{'fit_time': array([0.00705624, 0.00539732, 0.00529695, 0.00562096, 0.0050261 ]), 'score_time': array([0.00130725, 0.00071883, 0.00081921, 0.00112319, 0.00088787]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [36]:
# The purpose of keeping a validation set is hyperparameter tuning.

In [34]:
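# DecisionTreeClassifier has quite a few hyperparameters that we have to choose:
# criterion: {"gini", "entropy", "log_loss"}, default="gini"
# splitter: {"best", "random"}, default="best"
# max_depth: int
# min_samples_split: int or float, default=2. The minimum number of samples required to split an internal node.
# min_samples_leaf: int or float, default=1. The minimum number of samples required at a leaf node.
# max_features
# min_impurity_decrease: float, default=0.0.
# A node is split if the split decreases the impurity by at least this value.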

In [35]:
from sklearn.model_selection import GridSearchCV

# Candidate values of min_impurity_decrease for the grid search to try.
params = dict(
    min_impurity_decrease=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
)

In [37]:
# Grid search over `params` with a fresh decision tree as the base model.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,  # run the candidate fits in parallel on all available cores
)

In [38]:
# Run the cross-validated grid search on the training set.
gs.fit(train_input, train_target)

0,1,2
,estimator,DecisionTreeC...ndom_state=42)
,param_grid,"{'min_impurity_decrease': [0.0001, 0.0002, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0001


In [39]:
# Take the model built with the best parameter value found by the grid search
# and report its score on the training data the search was run on.
dt = gs.best_estimator_
print(dt.score(train_input, train_target))

0.9615162593804117


In [40]:
# Mean cross-validation score of each candidate in the grid.
print(gs.cv_results_['mean_test_score'])

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [43]:
import numpy as np

In [44]:
# Highest mean cross-validation score among the grid candidates.
print(np.max(gs.cv_results_['mean_test_score']))

0.8681929740134745


In [41]:
# Uniform distribution

In [48]:
# Use NumPy when you only need random number generation; use SciPy when you need statistical analysis or integration with scikit-learn.

In [None]:
from scipy.stats import uniform, randint

In [46]:
# Frozen discrete uniform distribution over the integers 0 through 9
# (the upper bound 10 is exclusive); draw ten samples from it.
# No random_state is passed, so the samples differ from run to run.
rgen = randint(0, 10)
rgen.rvs(10)

array([5, 6, 8, 9, 1, 5, 6, 3, 7, 9])

In [49]:
# Draw 1000 samples and count how often each integer appears;
# the counts should come out roughly even.
np.unique(rgen.rvs(1000), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 91, 106,  90,  86,  97, 118,  97, 115, 106,  94]))

In [50]:
# Frozen continuous uniform distribution with loc=0 and scale=1, i.e. on [0, 1];
# draw ten real-valued samples from it.
ugen = uniform(0, 1)
ugen.rvs(10)

array([0.3564721 , 0.28594553, 0.20289235, 0.13921984, 0.14438943,
       0.25722408, 0.43083501, 0.54576914, 0.31403514, 0.91677834])

In [51]:
# Sampling distributions for the randomized search.
# scipy's uniform(loc, scale) covers [loc, loc + scale], so min_impurity_decrease
# is drawn from [0.0001, 0.0011]; randint(low, high) excludes `high`.
# NOTE(review): if the intended upper bound is 0.001, the scale should be 0.0009.
params = dict(
    min_impurity_decrease=uniform(0.0001, 0.001),
    max_depth=randint(20, 50),
    min_samples_split=randint(2, 25),
    min_samples_leaf=randint(1, 25),
)

In [52]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: sample 100 parameter combinations from `params` and
# cross-validate each one on the training set. The fixed random_state makes
# the sampled combinations repeatable.
gs = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_distributions=params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
gs.fit(train_input, train_target)

0,1,2
,estimator,DecisionTreeC...ndom_state=42)
,param_distributions,"{'max_depth': <scipy.stats....00243F7272490>, 'min_impurity_decrease': <scipy.stats....00243F7270F50>, 'min_samples_leaf': <scipy.stats....00243F7281F30>, 'min_samples_split': <scipy.stats....00243F7272210>}"
,n_iter,100
,scoring,
,n_jobs,-1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,39
,min_samples_split,13
,min_samples_leaf,7
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,np.float64(0....2546602601173)


In [54]:
# Best mean cross-validation score after tuning with the grid search (for comparison with the randomized search): 0.8681929740134745

In [53]:
# Parameter combination selected by the randomized search.
print(gs.best_params_)

{'max_depth': 39, 'min_impurity_decrease': np.float64(0.00034102546602601173), 'min_samples_leaf': 7, 'min_samples_split': 13}


In [55]:
# Highest mean cross-validation score among the sampled candidates.
print(np.max(gs.cv_results_['mean_test_score']))

0.8695428296438884


In [56]:
# Take the best model found by the randomized search.
dt = gs.best_estimator_

# Final check on the held-out test set, which none of the fits or searches above used.
print(dt.score(test_input, test_target))

0.86
