In [10]:
from sklearn.datasets import load_wine
wine = load_wine()

In [11]:
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0.98  3.88    2.29  0.63
    Fl

In [12]:
import numpy as np
import pandas as pd
df = pd.DataFrame(wine.data, columns=wine.feature_names)
df['target'] = wine.target
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [13]:
df.shape

(178, 14)

In [14]:
# 와인의 등급
df.target.value_counts()

1    71
0    59
2    48
Name: target, dtype: int64

In [15]:
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    wine.data, wine.target, stratify=wine.target,
    test_size=0.2, random_state=2023
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((142, 13), (36, 13), (142,), (36,))

In [17]:
# y 값의 분포
np.unique(y_train, return_counts=True)

(array([0, 1, 2]), array([47, 57, 38], dtype=int64))

In [18]:
np.unique(y_test, return_counts=True)

(array([0, 1, 2]), array([12, 14, 10], dtype=int64))

#### 3.학습

In [19]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=2023)
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2023,
 'splitter': 'best'}

In [22]:
dtc.fit(X_train, y_train)

#### 4.예측

In [24]:
pred = dtc.predict(X_test)

In [25]:
rf = pd.DataFrame({'y 실제값': y_test, 'y 예측값': pred})
rf.head()

Unnamed: 0,y 실제값,y 예측값
0,2,2
1,2,2
2,2,2
3,0,0
4,1,1


#### 5.평가

In [26]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.9166666666666666

In [27]:
dtc.score(X_test, y_test)

0.9166666666666666

#### GridsearchCV 적용

- 학습/훈련시 사용

In [28]:
params = {
    'max_depth' : [2, 5, 8],            
    'min_samples_split': [2, 3, 4]      # 분류하는 최소 단위가 2
}

In [29]:
from sklearn.model_selection import GridSearchCV

grid_dt = GridSearchCV(
    dtc,                    # estimator, Decision Tree Classfier
    param_grid=params,      # 파라메터 조합
    scoring='accuracy',     # 평가방법 - 정확도
    cv = 5                  # 교차검증 세트 수
)
# 총(max_depth) 3 x 3 x 5 = 45회 훈련

In [30]:
# 학습 실행
grid_dt.fit(X_train, y_train)

In [31]:
# 베스트 파라메터 조합
grid_dt.best_params_

{'max_depth': 8, 'min_samples_split': 4}

In [32]:
# 베스트 스코어
grid_dt.best_score_

0.9014778325123153

In [33]:
# 베스트 모델로 평가
grid_dt.best_estimator_.score(X_test, y_test)

0.9166666666666666

- 파라메터의 범위를 좁혀가면서 계속 수행

In [34]:
params = {
    'max_depth' : [2, 5, 8],            
    'min_samples_split': [2, 3, 4]      # 분류하는 최소 단위가 2
}
grid_dt = GridSearchCV(
    dtc,                    # estimator, Decision Tree Classfier
    param_grid=params,      # 파라메터 조합
    scoring='accuracy',     # 평가방법 - 정확도
    cv = 5                  # 교차검증 세트 수
)
grid_dt.fit(X_train, y_train)

In [35]:
grid_dt.best_params_

{'max_depth': 8, 'min_samples_split': 4}

In [36]:
best_dt = grid_dt.best_estimator_
best_dt.score(X_test, y_test)

0.9166666666666666