In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

%matplotlib inline

UCI Dataset 중, Wine dataset을 사용하여 ML model에 학습시켜보겠다.
먼저, sklearn 라이브러리의 load_wine을 이용하여 dataset을 불러온다.
ML model을 이용하여 classification 하는 task를 해볼 것이다.
wine의 여러 feature를 학습하여 와인의 품질을 분류하는 것이 충분히 가능하기 때문이다.

In [2]:
wine = load_wine()
wine_data = wine.data
df_wine = pd.DataFrame(data = wine_data, columns=[wine.feature_names])
df_wine['target'] = wine.target
df_wine

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


UCI dataset의 wine dataset 에는 총 13가지의 feature class가 있으며, target값을 데이터프레임에 붙여 총 14개의 columns을 가진 데이터 프레임이 되었다.

In [3]:
df_wine.columns.tolist()

[('alcohol',),
 ('malic_acid',),
 ('ash',),
 ('alcalinity_of_ash',),
 ('magnesium',),
 ('total_phenols',),
 ('flavanoids',),
 ('nonflavanoid_phenols',),
 ('proanthocyanins',),
 ('color_intensity',),
 ('hue',),
 ('od280/od315_of_diluted_wines',),
 ('proline',),
 ('target',)]

wine의 target 값은 wine의 품질이며, {0, 1, 2}의 세가지 class로 분류되어 있다. class 분포는 약간 imbalance한 편이다.

In [4]:
df_wine['target'].value_counts()

(target,)
1            71
0            59
2            48
dtype: int64

Gaussian Naive Bayes, DecisionTree, RandomForest 총 3가지 model을 사용하여 학습시킬 것 이다.
sklearn 라이브러리에서 알맞는 model들을 불러온다.

In [5]:
nb = GaussianNB()
dtc = DecisionTreeClassifier()
rf = RandomForestClassifier()

sklearn 라이브러리의 train_test_split을 사용하여 8:2의 비율로 train과 test를 나누어서 학습과 추론과정을 진행할 것 이다.

In [6]:
train_X, test_X, train_y, test_y = train_test_split(wine.data, wine.target, stratify = df_wine['target'], random_state=42, test_size=0.2)

각각의 모델들에 대해서 Train dataset으로 학습시킨 뒤, Test dataset으로 검증한다.

In [7]:
# Train
nb.fit(train_X, train_y)
dtc.fit(train_X, train_y)
rf.fit(train_X, train_y)

# Inference
pred_nb = nb.predict(test_X)
pred_dtc = dtc.predict(test_X)
pred_rf = rf.predict(test_X)

학습된 결과에 대해서 Confusion Matrix(혼동 행렬)을 뽑아봐서 전체 class 중 어느 정도로 제대로 맞추었는지 확인해보자.

In [8]:
confusion_matrix(test_y, pred_nb)

array([[12,  0,  0],
       [ 1, 13,  0],
       [ 0,  0, 10]])

In [9]:
confusion_matrix(test_y, pred_dtc)

array([[11,  1,  0],
       [ 1, 13,  0],
       [ 0,  0, 10]])

In [10]:
confusion_matrix(test_y, pred_rf)

array([[12,  0,  0],
       [ 0, 14,  0],
       [ 0,  0, 10]])

class imbalance가 어느 정도 있는 dataset을 사용하여 학습하였기 때문에, 학습된 결과에 대해서 precision과 recall의 조화평균인 f1-score를 사용하였다.

f1-score를 사용할 때는 class imbalance 문제를 어느 정도 반영하기 위해, macro average보다는 micro average를 선택하였다.

In [11]:
print("F1-Score")
print("naive bayes: {0:.2f}".format(f1_score(test_y, pred_nb, average='micro')))
print("decision tree: {0:.2f}".format(f1_score(test_y, pred_dtc, average='micro')))
print("random forest: {0:.2f}".format(f1_score(test_y, pred_rf, average='micro')))

F1-Score
naive bayes: 0.97
decision tree: 0.94
random forest: 1.00


실험 결과, RandomForest 방식의 분류가 가장 높은 성능을 보였음을 알 수 있다.

이번에는 수업시간에 배운 StratifiedKFold 방식을 사용하여 Cross validation을 사용해보겠다. 반복횟수는 5번으로 지정하고, K번마다 K개의 train dataset을 나누어 평가를 진행한다.

In [12]:
skf = StratifiedKFold(n_splits=5)
n_iter = 0
logr_score = []
nb_score = []
dtc_score = []
rf_score = []
for train_idx, test_idx in skf.split(df_wine, df_wine['target']):
    n_iter += 1
    train_X, test_X = wine_data[train_idx], wine_data[test_idx]
    train_y, test_y = wine.target[train_idx], wine.target[test_idx]

    nb.fit(train_X, train_y)
    dtc.fit(train_X, train_y)
    rf.fit(train_X, train_y)

    pred_nb = nb.predict(test_X)
    pred_dtc = dtc.predict(test_X)
    pred_rf = rf.predict(test_X)

    nb_score.append(f1_score(test_y, pred_nb, average='micro'))
    dtc_score.append(f1_score(test_y, pred_dtc, average='micro'))
    rf_score.append(f1_score(test_y, pred_rf, average='micro'))
    print('--------------- Cross Validation - {} ---------------'.format(n_iter))
    print("naive bayes: {0:.2f}".format(f1_score(test_y, pred_nb, average='micro')))
    print("decision tree: {0:.2f}".format(f1_score(test_y, pred_dtc, average='micro')))
    print("random forest: {0:.2f}".format(f1_score(test_y, pred_rf, average='micro')))

--------------- Cross Validation - 1 ---------------
naive bayes: 0.94
decision tree: 0.94
random forest: 0.94
--------------- Cross Validation - 2 ---------------
naive bayes: 0.97
decision tree: 0.78
random forest: 0.94
--------------- Cross Validation - 3 ---------------
naive bayes: 0.97
decision tree: 0.89
random forest: 0.97
--------------- Cross Validation - 4 ---------------
naive bayes: 0.94
decision tree: 0.91
random forest: 1.00
--------------- Cross Validation - 5 ---------------
naive bayes: 1.00
decision tree: 0.86
random forest: 1.00


In [13]:
print('--------------- 평균 F1 Score - {} ---------------')
print("logistic regression: {0:.2f}".format(np.mean(nb_score)))
print("logistic regression: {0:.2f}".format(np.mean(dtc_score)))
print("logistic regression: {0:.2f}".format(np.mean(rf_score)))

--------------- 평균 F1 Score - {} ---------------
logistic regression: 0.97
logistic regression: 0.88
logistic regression: 0.97
