# [Project 2] load_wine

#### * 목표
와인 종류 분류

### 1. 모듈 import

In [1]:
from sklearn.datasets import load_wine
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix # 오차행렬

print("Import Success")

Import Success


### 2. 데이터 준비

In [2]:
wine = load_wine()

print(wine.keys()) # 6개 key

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])


### 3. 데이터 이해

#### 1) Feature Data 지정

In [3]:
wine_feature = wine.data
print(wine_feature.shape) #178개 데이터 x 13개 feature

(178, 13)


#### 2) Label Data 지정

In [4]:
wine_label = wine.target
print(wine_label.shape) # 178개 데이터

(178,)


In [5]:
# DataFrame으로 Feature, Label Data 확인
wine_df = pd.DataFrame(data = wine_feature, columns = wine.feature_names)
wine_df["label"] = wine.target
wine_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,label
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


#### 3) Target Names 출력

In [6]:
print(wine.target_names) # 0 : class_0, 1 : class_1, 2 : class_2

['class_0' 'class_1' 'class_2']


#### 4) 데이터 Describe

In [7]:
print(wine.DESCR) # 데이터셋 설명

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

## 4. train, test 데이터 분리

In [8]:
x_train, x_test, y_train, y_test = train_test_split(wine_feature, 
                                                    wine_label, 
                                                    test_size = 0.2, 
                                                    random_state = 7)

print('x_train 개수: ', len(x_train), ', x_test 개수: ', len(x_test))
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

x_train 개수:  142 , x_test 개수:  36
(142, 13) (142,)
(36, 13) (36,)


## 5. 다양한 모델 학습

#### 1) Decision Tree

In [9]:
decision_tree = DecisionTreeClassifier(random_state = 32) # 변수 모델 저장, random_state - 난수 설정
decision_tree.fit(x_train, y_train) # 모델 학습
y_pred = decision_tree.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 7  0  0]
 [ 0 17  0]
 [ 0  2 10]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.89      1.00      0.94        17
           2       1.00      0.83      0.91        12

    accuracy                           0.94        36
   macro avg       0.96      0.94      0.95        36
weighted avg       0.95      0.94      0.94        36



#### 2) Random Forest

In [10]:
random_forest = RandomForestClassifier(random_state = 32) # 변수 모델 저장, random_state - 난수 설정
random_forest.fit(x_train, y_train) # 모델 학습
y_pred = random_forest.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 7  0  0]
 [ 0 17  0]
 [ 0  0 12]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00        17
           2       1.00      1.00      1.00        12

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



#### 3) SVM

In [11]:
_svm = svm.SVC() # 변수 모델 저장
_svm.fit(x_train, y_train) # 모델 학습
y_pred = _svm.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 6  0  1]
 [ 1 15  1]
 [ 0 11  1]]
              precision    recall  f1-score   support

           0       0.86      0.86      0.86         7
           1       0.58      0.88      0.70        17
           2       0.33      0.08      0.13        12

    accuracy                           0.61        36
   macro avg       0.59      0.61      0.56        36
weighted avg       0.55      0.61      0.54        36



#### 4) SGD Classifier

In [12]:
sgd = SGDClassifier() # 변수 모델 저장
sgd.fit(x_train, y_train) # 모델 학습
y_pred = sgd.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 7  0  0]
 [ 1 10  6]
 [ 0  1 11]]
              precision    recall  f1-score   support

           0       0.88      1.00      0.93         7
           1       0.91      0.59      0.71        17
           2       0.65      0.92      0.76        12

    accuracy                           0.78        36
   macro avg       0.81      0.83      0.80        36
weighted avg       0.82      0.78      0.77        36



#### 5) Logistic Regression

In [13]:
logistic_regression = LogisticRegression(max_iter = 5000) # 변수 모델 저장, max_iter - 학습 반복 횟수 지정
logistic_regression.fit(x_train, y_train) # 모델 학습
y_pred = logistic_regression.predict(x_test)


print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 7  0  0]
 [ 0 17  0]
 [ 0  1 11]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.94      1.00      0.97        17
           2       1.00      0.92      0.96        12

    accuracy                           0.97        36
   macro avg       0.98      0.97      0.98        36
weighted avg       0.97      0.97      0.97        36



## 6. 모델 평가

#### * 예측 결과 (classification_report)
* confusion matrix을 통해 성능을 비교한 결과,    
5가지 모델 중 Random Forest 모델이 100%의 정확도로 가장 높은 분류 성능을 보였다.    


* Random Forest 모델은 많은 의사 결정 트리를 함께 결합하여 분류하거나 예측한다.     
특성에 따라 다수 결정을 토대로 와인의 종류을 예측 할 수 있어 효과적이라 생각된다.


#### * 성능 평가 지표 (Confusion Matrix)

* 실제 와인과 예측한 와인이 다를 때의 상황을 고려해야 하므로, Precision 으로 평가하였다.