In [78]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Data Load / EDA

In [79]:
wine = load_wine()
print(dir(wine))

['DESCR', 'data', 'feature_names', 'frame', 'target', 'target_names']


In [80]:
wine.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])

In [81]:
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [82]:
wine.data.shape

(178, 13)

In [83]:
data = wine.data

In [84]:
data[0]

array([1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
       3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
       1.065e+03])

In [85]:
label = wine.target

In [86]:
label.shape

(178,)

In [87]:
label[0]

0

In [88]:
import numpy as np

np.unique(label, return_counts = True)

(array([0, 1, 2]), array([59, 71, 48]))

In [89]:
wine.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

In [90]:
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [91]:
label[:3]

array([0, 0, 0])

# Train / Test Split

In [92]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, 
                                                    label, 
                                                    test_size=0.2, 
                                                    random_state=7)

print('X_train 개수: ', len(X_train),', X_test 개수: ', len(X_test))

X_train 개수:  142 , X_test 개수:  36


# Decision Tree

In [93]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(random_state=32)
decision_tree.fit(X_train, y_train)
y_pred_dt = decision_tree.predict(X_test)

# Random Forest

In [94]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(random_state=32)
random_forest.fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)

# SVM

In [95]:
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(data, 
                                                                    label, 
                                                                    test_size=0.3, 
                                                                    random_state=7)

In [96]:
from sklearn import svm
svm_model = svm.SVC()
svm_model.fit(X_train_svm, y_train_svm)
y_pred_svm = svm_model.predict(X_test_svm)

# SGD Classifier

In [97]:
X_train_sgd, X_test_sgd, y_train_sgd, y_test_sgd = train_test_split(data, 
                                                                    label, 
                                                                    test_size=0.45, 
                                                                    random_state=7)

In [98]:
from sklearn.linear_model import SGDClassifier
sgd_model = SGDClassifier()
sgd_model.fit(X_train_sgd, y_train_sgd)
y_pred_sgd = sgd_model.predict(X_test_sgd)

In [99]:
y_pred_sgd

array([0, 0, 0, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0,
       2, 2, 0, 0, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2])

In [100]:
y_test_sgd

array([2, 0, 2, 2, 1, 2, 1, 0, 1, 2, 0, 1, 2, 1, 1, 1, 1, 2, 0, 0, 1, 1,
       1, 1, 0, 2, 1, 2, 2, 2, 1, 0, 2, 1, 1, 1, 2, 2, 0, 2, 0, 1, 2, 2,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 2, 0, 2, 1, 2, 0, 2, 1,
       1, 1, 0, 1, 0, 0, 2, 0, 2, 1, 1, 2, 1, 0, 1])

# Logistic Regression

In [105]:
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression(max_iter=4000)
logistic_model.fit(X_train, y_train)
y_pred_lr = logistic_model.predict(X_test)

# Performance Evaluation

### Decision Tree

In [106]:
print(classification_report(y_test, y_pred_dt))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.89      1.00      0.94        17
           2       1.00      0.83      0.91        12

    accuracy                           0.94        36
   macro avg       0.96      0.94      0.95        36
weighted avg       0.95      0.94      0.94        36



### Random Forest

In [107]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00        17
           2       1.00      1.00      1.00        12

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



### SVM

In [108]:
print(classification_report(y_test_svm, y_pred_svm))

              precision    recall  f1-score   support

           0       0.81      1.00      0.90        13
           1       0.58      0.92      0.71        24
           2       0.00      0.00      0.00        17

    accuracy                           0.65        54
   macro avg       0.46      0.64      0.54        54
weighted avg       0.45      0.65      0.53        54



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [109]:
np.unique(y_test_svm, return_counts = True)

(array([0, 1, 2]), array([13, 24, 17]))

In [110]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test_svm, y_pred_svm)

array([[13,  0,  0],
       [ 2, 22,  0],
       [ 1, 16,  0]])

confusion matrix를 확인해보면, 2를 2라고 예측을 못함..

### SGD Classifier

In [111]:
print(classification_report(y_test_sgd, y_pred_sgd))

              precision    recall  f1-score   support

           0       0.41      1.00      0.58        21
           1       1.00      0.08      0.15        36
           2       0.30      0.33      0.31        24

    accuracy                           0.40        81
   macro avg       0.57      0.47      0.35        81
weighted avg       0.64      0.40      0.31        81



In [112]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test_sgd, y_pred_sgd)

array([[21,  0,  0],
       [14,  3, 19],
       [16,  0,  8]])

얘는 1을 예측을 못함..

### Logistic Regression

In [113]:
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.94      1.00      0.97        17
           2       1.00      0.92      0.96        12

    accuracy                           0.97        36
   macro avg       0.98      0.97      0.98        36
weighted avg       0.97      0.97      0.97        36



# Conclusion

#### SVM 모델과 SGD 모델의 문제

SVM 모델과 SGD 모델에서, 자꾸 예측을 레이블 두 개[0,1]로만 하는 문제가 생김 (원래 정답 레이블은 세 개[0,1,2]).
와인 데이터의 크기가 너무 작아서...라고 추측해봄..!
구글링을 해 보다 보니 train/test 비율을 조정해보라는 이야기도 있어서 해 보았지만 큰 효과는 없었음..
SVM과 SGD 모델이 어떻게 돌아가는 지 이해를 해야 문제를 해결할 수 있을 것 같음.
일단 여기에선 두 모델은 제외하는걸로...

#### 평가척도 정하기

wine 데이터는 label의 개수가 세 개이고, 각 레이블 별 데이터의 분포가 일정한 편으로 불균형데이터 혹은 imbalanced data의 문제가 보이지 않기 때문에 평가 척도로 accuracy를 사용하는 것이 적합하다고 판단하였다. 

각 모델 별 accuracy는 다음과 같다:

In [114]:
from sklearn.metrics import accuracy_score

print("Decision Tree:", accuracy_score(y_test, y_pred_dt))
print("Random Forest:", accuracy_score(y_test, y_pred_rf))
#print("SVM:", accuracy_score(y_test_svm, y_pred_svm))
#print("SGD Classifier:", accuracy_score(y_test_sgd, y_pred_sgd))
print("Logistic Regression:", accuracy_score(y_test, y_pred_lr))
print("(SVM과 SGD모델은 제외하였음)")

Decision Tree: 0.9444444444444444
Random Forest: 1.0
Logistic Regression: 0.9722222222222222
(SVM과 SGD모델은 제외하였음)


#### 결과: RF가 1.00의 Accuracy로 가장 높은 성능을 보였다.

아마도 test data의 개수가 전체 데이터의 0.2로 36개밖에 되지 않았기 때문에 1.0의 어큐러시가 가능하지 않았을까,,?