In [1]:
# pandas 라이브러리
import pandas as pd

In [4]:
wine = pd.read_csv('winequality-white.csv', sep=';')
wine.head(2)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6


In [5]:
wine.shape

(4898, 12)

# 와인등급 평가 예측

In [6]:
# 독립변수, 종속변수 분리
wine_x = wine.iloc[:, :-1].copy()
wine_y = wine['quality'].copy()

In [8]:
wine_x.shape

(4898, 11)

In [10]:
wine_y.unique()

array([6, 5, 7, 8, 4, 3, 9], dtype=int64)

In [11]:
# 학습용/테스트 데이터 분리
# 8 대 2
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(wine_x, wine_y, test_size=.2)

In [14]:
train_x.shape

(3918, 11)

In [15]:
test_x.shape

(980, 11)

In [16]:
# 의사결정나무(또는 다른분류기) 객체 생성
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()

In [17]:
dt.fit(train_x, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [18]:
dt.score(test_x, test_y)

0.6091836734693877

### 파라미터 최적화

In [23]:
param_grid = {
                'max_features' : [None,'sqrt'],
                'criterion' : ['gini', 'entropy'],
                'max_depth' : [3,5,7,9],
                'min_samples_leaf' : [0.001, 0.01, 0.1, 0.5]
}

In [19]:
# Grid Search 수행 = cv 기본값 (3)
from sklearn.model_selection import GridSearchCV

In [21]:
dt2 = DecisionTreeClassifier()

In [24]:
grid1 = GridSearchCV(dt2, param_grid)

In [30]:
dt3 = DecisionTreeClassifier()

In [31]:
# Grid Search 수행 = cv (5)
grid2 = GridSearchCV(dt3, param_grid,cv=5)

In [26]:
# 학습
grid1.fit(train_x, train_y)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_features': [None, 'sqrt'], 'criterion': ['gini', 'entropy'], 'max_depth': [3, 5, 7, 9], 'min_samples_leaf': [0.001, 0.01, 0.1, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [32]:
grid2.fit(train_x, train_y)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_features': [None, 'sqrt'], 'criterion': ['gini', 'entropy'], 'max_depth': [3, 5, 7, 9], 'min_samples_leaf': [0.001, 0.01, 0.1, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [33]:
grid1.best_params_

{'criterion': 'gini',
 'max_depth': 5,
 'max_features': None,
 'min_samples_leaf': 0.001}

In [34]:
grid1.best_score_

0.5329249617151608

In [35]:
grid2.best_params_

{'criterion': 'gini',
 'max_depth': 9,
 'max_features': None,
 'min_samples_leaf': 0.001}

In [36]:
grid2.best_score_

0.5428790199081164

In [37]:
# 평가
grid1.score(test_x, test_y)

0.4846938775510204

In [38]:
grid2.score(test_x, test_y)

0.5346938775510204

In [39]:
# Cross-validation : 교차검증
from sklearn.model_selection import cross_validate

In [41]:
dt5 = DecisionTreeClassifier()

In [44]:
result = cross_validate(dt5, wine_x, wine_y, cv=5, return_train_score=True)

In [47]:
result['test_score'].mean()

0.413473812631561

In [46]:
result['train_score']

array([1., 1., 1., 1., 1.])

### 성능 평가

In [48]:
test_y

2471    7
2754    5
669     5
1747    6
2161    7
4054    7
4527    6
1035    6
2687    5
799     8
3495    7
1911    6
3956    5
4090    6
1376    6
917     6
2817    7
2483    6
2464    5
4426    6
4877    5
860     8
4390    5
4842    6
2651    5
1817    4
694     6
3576    5
114     5
1780    5
       ..
3647    7
4605    6
3661    6
3104    5
2586    7
3355    6
1065    5
490     6
426     6
3981    7
1049    6
2744    6
4707    6
3066    5
382     6
1870    6
4470    6
3564    6
3597    6
77      7
804     6
651     7
424     7
450     5
3755    6
3758    5
4296    6
2593    5
3414    7
311     8
Name: quality, Length: 980, dtype: int64

In [49]:
pred_y = grid2.predict(test_x)

In [50]:
# confusion_matrix 결과 값
from sklearn.metrics import confusion_matrix

In [54]:
test_y.unique()

array([7, 5, 6, 8, 4, 3, 9], dtype=int64)

In [55]:
label = range(3,10)
confusion_matrix(test_y, pred_y, labels=label)

array([[  0,   0,   3,   0,   1,   0,   0],
       [  0,   8,  14,   6,   2,   0,   0],
       [  0,  14, 145,  93,  13,   1,   0],
       [  0,   4,  98, 295,  54,   6,   0],
       [  0,   0,  14,  98,  70,   0,   0],
       [  0,   0,   1,  17,  16,   6,   0],
       [  0,   0,   0,   1,   0,   0,   0]], dtype=int64)

In [61]:
test_y.value_counts()

6    457
5    266
7    182
8     40
4     30
3      4
9      1
Name: quality, dtype: int64

In [60]:
import numpy as np
np.unique(pred_y)

array([4, 5, 6, 7, 8], dtype=int64)

In [57]:
# pd.cross-table(예측값, 실제값)
pd.crosstab(pred_y, test_y)

quality,3,4,5,6,7,8,9
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4,0,8,14,4,0,0,0
5,3,14,145,98,14,1,0
6,0,6,93,295,98,17,1
7,1,2,13,54,70,16,0
8,0,0,1,6,0,6,0
