#### 머신러닝 개념

- 데이터를 기반으로 패턴을 학습하고 결과를 추론하는 알고리즘 기법
- 지도학습(Supervised Learning), 비지도학습(Unsupervised Learning)
- 지도학습(분류, 회귀), 비지도학습(클러스터링, 차원축소)

#### 머신러닝 용어(데이터의 형식 : DataFrame)
- 피처(Feature) : 데이터의 일반 속성
- 레이블, 클래스, 타겟 값, 결정값 : 정답 데이터

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import sklearn
from sklearn.datasets import load_iris

print(np.__version__)
print(pd.__version__)
print(sklearn.__version__)

1.20.3
1.3.4
0.24.2


In [38]:
iris = load_iris()

print('type - ',type(iris))   # bunch 타입은 dict와 유사한 타입
print('keys - ',iris.keys())

type -  <class 'sklearn.utils.Bunch'>
keys -  dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [39]:
print('data type - ', type(iris.data))
print('data - ', iris.data)

data type -  <class 'numpy.ndarray'>
data -  [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7. 

In [40]:
print('target type - ', type(iris.target))
print('target - ', iris.target)

target type -  <class 'numpy.ndarray'>
target -  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [41]:
print('target_names type - ', type(iris.target_names))
print('target_names - ', iris.target_names)

target_names type -  <class 'numpy.ndarray'>
target_names -  ['setosa' 'versicolor' 'virginica']


In [42]:
print('feature type - ', type(iris.feature_names))
print('feature - ', iris.feature_names)

feature type -  <class 'list'>
feature -  ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [43]:
# data, feature 데이터프레임 만들기

df = pd.DataFrame(data=iris.data, columns= iris.feature_names)
df['target'] = iris.target
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


#### 지도학습 - 분류(Classification)

- step 01. 데이터 분리(학습 데이터, 테스트 데이터로 나눈다.)
- step 02. 학습데이터를 기반으로 ML 알고리즘을 적용해 학습 모델을 생성
- step 03. 테스트데이터를 기반으로 분류예측
- step 04. 모델 성능 평가

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [45]:
# train_X, train_Y, test_X, test_Y = df.iloc[:100, :-1], df.iloc[:100, -1], df.iloc[100:, :-1], df.iloc[100:, -1] 

print('step 01. ------------------------------------------------\n')
X_train, X_test, Y_train, Y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    test_size = 0.2,
                                                    shuffle= True,
                                                    random_state= 100
                                                   )

X_train.shape, X_test.shape,  Y_train.shape, Y_test.shape

step 01. ------------------------------------------------



((120, 4), (30, 4), (120,), (30,))

In [46]:
print('train data - ', X_train)
print('train target - ', Y_train)
print()
print('test data - ', X_test)
print('test target - ', Y_test)

train data -  [[5.5 2.4 3.7 1. ]
 [5.7 2.8 4.1 1.3]
 [6.  2.2 5.  1.5]
 [4.8 3.  1.4 0.1]
 [5.4 3.9 1.3 0.4]
 [6.4 3.2 4.5 1.5]
 [5.1 3.8 1.6 0.2]
 [5.5 2.5 4.  1.3]
 [6.3 3.4 5.6 2.4]
 [5.8 2.8 5.1 2.4]
 [4.5 2.3 1.3 0.3]
 [5.5 2.6 4.4 1.2]
 [7.1 3.  5.9 2.1]
 [7.2 3.6 6.1 2.5]
 [4.9 3.6 1.4 0.1]
 [4.6 3.4 1.4 0.3]
 [5.  3.  1.6 0.2]
 [5.1 3.7 1.5 0.4]
 [5.8 2.6 4.  1.2]
 [4.9 3.1 1.5 0.1]
 [5.1 3.3 1.7 0.5]
 [5.  3.2 1.2 0.2]
 [6.5 2.8 4.6 1.5]
 [7.9 3.8 6.4 2. ]
 [6.1 3.  4.9 1.8]
 [5.4 3.  4.5 1.5]
 [6.4 2.7 5.3 1.9]
 [5.7 2.9 4.2 1.3]
 [7.7 3.8 6.7 2.2]
 [6.5 3.2 5.1 2. ]
 [5.8 2.7 3.9 1.2]
 [4.6 3.6 1.  0.2]
 [6.9 3.1 5.4 2.1]
 [6.7 3.3 5.7 2.1]
 [6.3 2.8 5.1 1.5]
 [5.5 4.2 1.4 0.2]
 [4.4 3.2 1.3 0.2]
 [5.8 2.7 5.1 1.9]
 [5.4 3.9 1.7 0.4]
 [5.5 3.5 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [6.9 3.1 4.9 1.5]
 [6.5 3.  5.8 2.2]
 [6.7 3.3 5.7 2.5]
 [6.1 2.6 5.6 1.4]
 [5.4 3.7 1.5 0.2]
 [6.  3.4 4.5 1.6]
 [5.9 3.2 4.8 1.8]
 [4.6 3.1 1.5 0.2]
 [6.8 2.8 4.8 1.4]
 [4.9 2.4 3.3 1. ]
 [6.2 2.8 4.8 1.8

In [47]:
print('step 02. ------------------------------------------------\n')

iris_dtc_model = DecisionTreeClassifier()

# 학습
iris_dtc_model.fit(X_train, Y_train)

step 02. ------------------------------------------------



DecisionTreeClassifier()

In [53]:
print('step 03. ------------------------------------------------\n')

# 예측
y_pred = iris_dtc_model.predict(X_test)

print('y_test - ', Y_test)
print('y_pred - ', y_pred)


step 03. ------------------------------------------------

y_test -  [2 0 2 0 2 2 0 0 2 0 0 2 0 0 2 1 1 1 2 2 2 0 2 0 1 2 1 0 1 2]
y_pred -  [2 0 2 0 2 2 0 0 2 0 0 2 0 0 2 1 1 2 2 2 2 0 2 0 1 2 1 0 1 2]


In [54]:
print('step 04. ------------------------------------------------\n')

# 예측 정확도
print('acc - ', accuracy_score(Y_test, y_pred))

step 04. ------------------------------------------------

acc -  0.9666666666666667


In [55]:
train_X, train_Y, test_X, test_Y = df.iloc[:100, :-1], df.iloc[:100, -1], df.iloc[100:, :-1], df.iloc[100:, -1] 


#### 교차 검증(cross validation) - 회귀 X, 분류할 때 사용
- 과적합(overffiting)을 방지하기 위한 방법
- 데이터의 편중을 막기위해서
- KFold 방식

In [69]:
from sklearn.model_selection import KFold, StratifiedKFold 
from sklearn.model_selection import cross_validate, cross_val_score

In [70]:
fold_iris = load_iris()

features = fold_iris.data
label = fold_iris.target

In [75]:
print('5개의 폴더 세트를 분리하여 각 폴더 세트별 정확도를 확인해보자 - \n')

cv_acc = []
kfold = KFold(n_splits=5)

fold_dct_model = DecisionTreeClassifier()

# kfold는 셔플까지 되지 않기 때문에 정규분포를 따르지 않을 수 있음
for train_idx, test_idx in kfold.split(features):
#     print('train idx - ', train_idx)
#     print('test idx - ', test_idx)
#     print()
    X_train, X_val = features[train_idx], features[test_idx]
    y_train, y_val = label[train_idx], label[test_idx]
#     print(y_val)

    fold_dct_model.fit(X_train, y_train)
    fold_dct_pred = fold_dct_model.predict(X_val)
    
    acc = accuracy_score(y_val, fold_dct_pred)
    cv_acc.append(acc)

print('교차검증 평균 정확도 - ', np.mean(cv_acc))

5개의 폴더 세트를 분리하여 각 폴더 세트별 정확도를 확인해보자 - 

final acc -  0.9066666666666666


In [80]:
print("기존 KFord 방식의 문제점 확인")
print()

fold_iris_frm = pd.DataFrame(data = fold_iris.data,
                            columns= fold_iris.feature_names)

fold_iris_frm['target'] = iris.target

기존 KFord 방식의 문제점 확인



In [81]:
fold_iris_frm['target'].value_counts()

0    50
1    50
2    50
Name: target, dtype: int64

In [85]:
bad_fold_iris = KFold(n_splits=3)

In [89]:
print('KFold 워스트 케이스를 확인하기 위한 예제 - \n')

n_iter = 0
cv_acc = []
bad_fold_iris = KFold(n_splits=3)

fold_dct_model = DecisionTreeClassifier()

# kfold는 셔플까지 되지 않기 때문에 정규분포를 따르지 않음
for train_idx, test_idx in bad_fold_iris.split(fold_iris_frm):

    n_iter += 1
    
    label_train = fold_iris_frm['target'].iloc[train_idx]
    label_val = fold_iris_frm['target'].iloc[test_idx]

    print('교차검증 횟수 - \n', n_iter)
    print('학습 레이블 데이터 분포 - ', label_train)
    print('검증 레이블 데이터 분포 - ', label_val)
    
    fold_dct_model.fit(X_train, y_train)
    fold_dct_pred = fold_dct_model.predict(X_val)


KFold 워스트 케이스를 확인하기 위한 예제 - 

교차검증 횟수 - 
 1
학습 레이블 데이터 분포 -  50     1
51     1
52     1
53     1
54     1
      ..
145    2
146    2
147    2
148    2
149    2
Name: target, Length: 100, dtype: int32
검증 레이블 데이터 분포 -  0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
40    0
41    0
42    0
43    0
44    0
45    0
46    0
47    0
48    0
49    0
Name: target, dtype: int32
교차검증 횟수 - 
 2
학습 레이블 데이터 분포 -  0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: target, Length: 100, dtype: int32
검증 레이블 데이터 분포 -  50    1
51    1
52    1
53    1
54    1
55    1
56    1
57    1
58    1
59    1
60    1
61    1
62    1
63    1
64    1
65    1
66    1
67    1
68    1
69    1
70    1
7

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
