#### 머신러닝 개념(ML)

- 데이터를 기반으로 패턴을 학습하고 결과를 추론하는 알고리즘 기법
- 지도학습(Supervised Learning), 비지도학습(Un-Supervised Learning) -> 데이터의 정답 유무로 기준을 나눔
- 지도학습(분류, 회귀), 비지도학습(군집화, 차원축소)

#### 머신러닝 용어(데이터형식: DataFrame)
- 피처(feature): 데이터의 일반 속성
- 레이블, 클래스, 타겟값, 결정값: 정답 데이터

In [1]:
import numpy as np
import pandas as pd

print('numpy version - ',np.__version__)
print('pandas version - ',pd.__version__)

numpy version -  1.20.3
pandas version -  1.3.4


In [2]:
import sklearn
from sklearn.datasets import load_iris

print('sklearn version - ',sklearn.__version__)

sklearn version -  0.24.2


In [4]:
iris = load_iris()
print('type - ', type(iris))
print('keys - ',iris.keys())

type -  <class 'sklearn.utils.Bunch'>
keys -  dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [10]:
print('data - ',iris.data)
print('data type - ',type(iris.data))
print('data len - ',len(iris.data))

data -  [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.

In [11]:
print('target - ',iris.target)
print('target type - ',type(iris.target))
print('target len - ',len(iris.target))

target -  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
target type -  <class 'numpy.ndarray'>
target len -  150


In [12]:
print('target_names - ',iris.target_names)
print('target_names type - ',type(iris.target_names))

target_names -  ['setosa' 'versicolor' 'virginica']
target_names type -  <class 'numpy.ndarray'>


In [13]:
print('feature_names - ',iris.feature_names)
print('feature_names type - ',type(iris.feature_names))

feature_names -  ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
feature_names type -  <class 'list'>


In [112]:
print('feature, target를 이용해서 데이터프레임을 만들어 보자 - ')
print()
iris_frm = pd.DataFrame(data=iris.data,
                       columns = iris.feature_names)
iris_frm['target'] = iris.target
display(iris_frm)
print('iris_frm type - ',type(iris_frm))

feature, target를 이용해서 데이터프레임을 만들어 보자 - 



Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


iris_frm type -  <class 'pandas.core.frame.DataFrame'>


In [126]:
print('데이터 프레임 형식에서 학습데이터와 테스트데이터를 분리한다면? - ')
print()
print('target - ')
#print(iris_frm['target'], type(iris_frm['target']))
print('iloc() 이용해서 피쳐와 타겟을 추출한다면 - ')
iris_feature_frm = iris_frm.iloc[:,:-1]
iris_target_frm = iris_frm.iloc[:,-1]
display(iris_feature_frm)
display(iris_target_frm)

데이터 프레임 형식에서 학습데이터와 테스트데이터를 분리한다면? - 

target - 
iloc() 이용해서 피쳐와 타겟을 추출한다면 - 


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: target, Length: 150, dtype: int32

#### 지도학습 - 분류(Classification)
 - step 01. 데이터 분리(training data, test data)
 - step 02. 학습데이터를 기반으로 ML 알고리즘 적용해 학습 모델을 생성
 - step 03. 테스트데이터를 기반으로 분류예측을 수행
 - step 04. 모델의 성능 평가

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [127]:
print('step 01.')
print()
X_train, X_test, Y_train, Y_test = train_test_split(iris.data,
                                                   iris.target,
                                                   test_size=0.2,
                                                   shuffle=True,
                                                   random_state = 100)


step 01.



In [103]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((118, 4), (32, 4), (118,), (32,))

In [104]:
print('train data - ',X_train)
print()
print('train target - ',Y_train)

train data -  [[6.  2.2 5.  1.5]
 [4.8 3.  1.4 0.1]
 [5.4 3.9 1.3 0.4]
 [6.4 3.2 4.5 1.5]
 [5.1 3.8 1.6 0.2]
 [5.5 2.5 4.  1.3]
 [6.3 3.4 5.6 2.4]
 [5.8 2.8 5.1 2.4]
 [4.5 2.3 1.3 0.3]
 [5.5 2.6 4.4 1.2]
 [7.1 3.  5.9 2.1]
 [7.2 3.6 6.1 2.5]
 [4.9 3.6 1.4 0.1]
 [4.6 3.4 1.4 0.3]
 [5.  3.  1.6 0.2]
 [5.1 3.7 1.5 0.4]
 [5.8 2.6 4.  1.2]
 [4.9 3.1 1.5 0.1]
 [5.1 3.3 1.7 0.5]
 [5.  3.2 1.2 0.2]
 [6.5 2.8 4.6 1.5]
 [7.9 3.8 6.4 2. ]
 [6.1 3.  4.9 1.8]
 [5.4 3.  4.5 1.5]
 [6.4 2.7 5.3 1.9]
 [5.7 2.9 4.2 1.3]
 [7.7 3.8 6.7 2.2]
 [6.5 3.2 5.1 2. ]
 [5.8 2.7 3.9 1.2]
 [4.6 3.6 1.  0.2]
 [6.9 3.1 5.4 2.1]
 [6.7 3.3 5.7 2.1]
 [6.3 2.8 5.1 1.5]
 [5.5 4.2 1.4 0.2]
 [4.4 3.2 1.3 0.2]
 [5.8 2.7 5.1 1.9]
 [5.4 3.9 1.7 0.4]
 [5.5 3.5 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [6.9 3.1 4.9 1.5]
 [6.5 3.  5.8 2.2]
 [6.7 3.3 5.7 2.5]
 [6.1 2.6 5.6 1.4]
 [5.4 3.7 1.5 0.2]
 [6.  3.4 4.5 1.6]
 [5.9 3.2 4.8 1.8]
 [4.6 3.1 1.5 0.2]
 [6.8 2.8 4.8 1.4]
 [4.9 2.4 3.3 1. ]
 [6.2 2.8 4.8 1.8]
 [5.1 3.5 1.4 0.2]
 [6.  2.9 4.5 1.5

In [105]:
print('test data - ',X_test)
print()
print('test target - ',Y_test)

test data -  [[6.4 2.8 5.6 2.1]
 [4.8 3.4 1.6 0.2]
 [7.7 2.6 6.9 2.3]
 [5.7 4.4 1.5 0.4]
 [6.3 2.7 4.9 1.8]
 [7.7 3.  6.1 2.3]
 [5.2 4.1 1.5 0.1]
 [4.9 3.  1.4 0.2]
 [6.5 3.  5.5 1.8]
 [4.8 3.  1.4 0.3]
 [5.  3.5 1.3 0.3]
 [6.4 3.2 5.3 2.3]
 [5.  3.4 1.6 0.4]
 [5.2 3.4 1.4 0.2]
 [6.7 3.  5.2 2.3]
 [6.2 2.9 4.3 1.3]
 [6.  2.2 4.  1. ]
 [6.7 3.  5.  1.7]
 [7.7 2.8 6.7 2. ]
 [6.8 3.  5.5 2.1]
 [7.2 3.2 6.  1.8]
 [5.4 3.4 1.5 0.4]
 [6.3 2.5 5.  1.9]
 [4.7 3.2 1.6 0.2]
 [5.6 2.5 3.9 1.1]
 [5.9 3.  5.1 1.8]
 [6.6 3.  4.4 1.4]
 [5.4 3.4 1.7 0.2]
 [6.1 2.8 4.7 1.2]
 [6.9 3.2 5.7 2.3]
 [5.5 2.4 3.7 1. ]
 [5.7 2.8 4.1 1.3]]

test target -  [2 0 2 0 2 2 0 0 2 0 0 2 0 0 2 1 1 1 2 2 2 0 2 0 1 2 1 0 1 2 1 1]


In [106]:
print('step 02. fit()')
print()
iris_dtc_model = DecisionTreeClassifier()
iris_dtc_model.fit(X_train,Y_train)

step 02. fit()



DecisionTreeClassifier()

In [107]:
print("step 03. predict()")
print()
Y_pred = iris_dtc_model.predict(X_test)
print('Y_test - ',Y_test)
print('Y_pred - ',Y_pred)

step 03. predict()

Y_test -  [2 0 2 0 2 2 0 0 2 0 0 2 0 0 2 1 1 1 2 2 2 0 2 0 1 2 1 0 1 2 1 1]
Y_pred -  [2 0 2 0 2 2 0 0 2 0 0 2 0 0 2 1 1 2 2 2 2 0 2 0 1 2 1 0 1 2 1 1]


In [108]:
print('step 04. 예측정확도 - ')
print()
print('acc - ',accuracy_score(Y_test,Y_pred))

step 04. 예측정확도 - 

acc -  0.96875


#### 교차검증(corss validation) - 회귀 X, 예측 
- 과적합(overfitting)을 방지하기 위한 방법
- 데이터의 편중을 막기 위한 방법
- KFold 방식

In [128]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, cross_validate

In [140]:
fold_iris = load_iris()
features = fold_iris.data
labels = fold_iris.target

In [137]:
features

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [141]:
labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [212]:
print('5개의 폴더 세트를 분리하여 각 폴더 세트별 정확도를 확인해보자 - ')
cv_acc = []
kfold = KFold(n_splits=5)

fold_dct_model = DecisionTreeClassifier()

for train_idx,test_idx in kfold.split(features):
    #print('train idx - ',train_idx)
    #print('test idx - ',test_idx)
    X_train,X_val = features[train_idx],features[test_idx]
    Y_train,Y_val = label[train_idx],label[test_idx]
    #print('X_train - ',X_train)
    #print('X_val - ',X_val)
    fold_dct_model.fit(X_train,Y_train)
    fold_pred = fold_dct_model.predict(X_val)
    
    acc = accuracy_score(Y_val,fold_pred)    
    print('acc - ',acc)
    print()
    
    cv_acc.append(acc)

5개의 폴더 세트를 분리하여 각 폴더 세트별 정확도를 확인해보자 - 
acc -  0.8666666666666667

acc -  1.0

acc -  0.9666666666666667

acc -  0.9

acc -  0.9666666666666667



In [195]:
print('교차검증 평균 정확도 - ',np.mean(cv_acc))

교차검증 평균 정확도 -  0.9133333333333333


In [198]:
print('기존 KFold 방식의 문제점 확인 - ')
print()
fold_iris_frm = pd.DataFrame(data = fold_iris.data,
                            columns = fold_iris.feature_names)
fold_iris_frm['target'] = fold_iris.target
display(fold_iris_frm)


기존 KFold 방식의 문제점 확인 - 



Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [199]:
fold_iris_frm['target'].value_counts()

0    50
1    50
2    50
Name: target, dtype: int64

In [223]:
bad_fold_iris = KFold(n_splits=3,shuffle=True)
n_iter = 0

fold_dct_model = DecisionTreeClassifier()
for train_idx,test_idx in bad_fold_iris.split(fold_iris_frm):
    n_iter += 1
    
    label_train = fold_iris_frm['target'].iloc[train_idx]
    label_val = fold_iris_frm['target'].iloc[test_idx]
    print('교차검증 횟수 - ',n_iter)
    print()
    print('학습 레이블 데이터 분포 - ',label_train) 
    print('검증 레이블 데이터 분포 - ',label_val)
    print()
    
#     fold_dct_model.fit(X_train,Y_train)
#     fold_pred = fold_dct_model.predict(X_val)
    
#     acc = accuracy_score(Y_val,fold_pred)    
#     #print('acc - ',acc)
#     cv_acc.append(acc)

교차검증 횟수 -  1

학습 레이블 데이터 분포 -  0      0
2      0
3      0
4      0
5      0
      ..
142    2
145    2
146    2
147    2
148    2
Name: target, Length: 100, dtype: int32
검증 레이블 데이터 분포 -  1      0
6      0
8      0
10     0
12     0
15     0
17     0
19     0
24     0
27     0
37     0
39     0
41     0
44     0
48     0
52     1
55     1
60     1
69     1
72     1
75     1
80     1
81     1
83     1
84     1
87     1
90     1
93     1
97     1
101    2
102    2
106    2
108    2
112    2
113    2
116    2
117    2
120    2
121    2
122    2
124    2
125    2
126    2
133    2
135    2
136    2
139    2
143    2
144    2
149    2
Name: target, dtype: int32

교차검증 횟수 -  2

학습 레이블 데이터 분포 -  1      0
4      0
6      0
7      0
8      0
      ..
143    2
144    2
146    2
147    2
149    2
Name: target, Length: 100, dtype: int32
검증 레이블 데이터 분포 -  0      0
2      0
3      0
5      0
9      0
11     0
20     0
21     0
22     0
23     0
25     0
30     0
31     0
34     0
45     0
47     0
49  

#### [실습]
- 아이리스 데이터를 이용하여 StratifiedKFold 교차검증을 진행해 보자
- random_state = 200

- StratifiedKFold(3,5) 평균 정확도 확인

In [222]:
fold_iris = load_iris()

features = fold_iris.data
labels = fold_iris.target

cv_acc = []
n_iter = 0
strat_kfold = StratifiedKFold(n_splits=3)

strat_kfold_dct_model = DecisionTreeClassifier(
                             random_state=200)

for train_idx, test_idx in strat_kfold.split(features,labels):
    n_iter += 1
    
    #print('train idx - ',train_idx)
    #print('test idx - ',test_idx)
    
    X_train, X_val = features[train_idx],features[test_idx]
    Y_train, Y_val = labels[train_idx],labels[test_idx]
    
    strat_kfold_dct_model.fit(X_train,Y_train)
    strat_kfold_pred = strat_kfold_dct_model.predict(X_val)
    
    acc = accuracy_score(Y_val,strat_kfold_pred)
    print('교차검증 횟수 - ',n_iter)
    print('acc - ',acc)
    print()
    
    cv_acc.append(acc)

print('평균 acc - ',np.mean(cv_acc))
    
    

교차검증 횟수 -  1
acc -  0.98

교차검증 횟수 -  2
acc -  0.92

교차검증 횟수 -  3
acc -  1.0

평균 acc -  0.9666666666666667


- 위 과정을 한번에 수행하는 함수: cross_val_score()
- 인자로 예측 모델, 피처 세트, 레이블, 성능평가 지표, 폴더 수

In [226]:
fold_iris = load_iris()

features = fold_iris.data
labels = fold_iris.target

dt_model = DecisionTreeClassifier(random_state=100)

In [227]:
print('성능 평가 acc, 교차검증 5회 수행 - ')
print('cross_val_score() - ')
print()

scores = cross_val_score(dt_model,
                         features,
                         labels,
                         scoring='accuracy',
                         cv=5)

성능 평가 acc, 교차검증 5회 수행 - 
cross_val_score() - 



In [233]:
print('type - ',type(scores))
print('data - ',scores)
print('mean - ',np.round(np.mean(scores),2))

type -  <class 'numpy.ndarray'>
data -  [0.96666667 0.96666667 0.9        0.93333333 1.        ]
mean -  0.95


- cross_validate

In [235]:
scores = cross_validate(dt_model,
                         features,
                         labels,
                         scoring='accuracy',
                         cv=5)

print('type - ',type(scores))
print('data - ',scores)

type -  <class 'dict'>
data -  {'fit_time': array([0.00199556, 0.00043035, 0.00099683, 0.00099754, 0.00099778]), 'score_time': array([0., 0., 0., 0., 0.]), 'test_score': array([0.96666667, 0.96666667, 0.9       , 0.93333333, 1.        ])}


In [242]:
for key in scores.keys():
    print(key)

print('fit_time - ',scores['fit_time'])
print('score_time - ',scores['score_time'])
print('test_score - ',scores['test_score'])
print()
print('mean - ',np.round(np.mean(scores['test_score']),2))

fit_time
score_time
test_score
fit_time -  [0.00199556 0.00043035 0.00099683 0.00099754 0.00099778]
score_time -  [0. 0. 0. 0. 0.]
test_score -  [0.96666667 0.96666667 0.9        0.93333333 1.        ]

mean -  0.95
