## 데이터 셋 불러오기
- iris 데이터 셋을 불러온다(Scikit-learn 내에 내장)
- 원본 dataset source: https://archive.ics.uci.edu/ml/datasets/iris

In [1]:
import numpy as np
from sklearn import datasets
iris = datasets.load_iris()

In [2]:
print(type(iris))

<class 'sklearn.datasets.base.Bunch'>


- x data: 입력변수(종속변수)
- y data: 출력변수(독립변수)

In [3]:
x_data = iris.data
print(type(x_data))

<class 'numpy.ndarray'>


In [4]:
y_data = iris.target
print(type(y_data))

<class 'numpy.ndarray'>


## 데이터 탐색하기
x data는 총 150개의 인스턴스(관측치)와 4개의 feature(변수)로 이루어짐

In [5]:
print('type of x_data: ', type(x_data))
print('shape of x_data: ', x_data.shape)

type of x_data:  <class 'numpy.ndarray'>
shape of x_data:  (150, 4)


In [6]:
x_data

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2],
       [ 5.4,  3.9,  1.7,  0.4],
       [ 4.6,  3.4,  1.4,  0.3],
       [ 5. ,  3.4,  1.5,  0.2],
       [ 4.4,  2.9,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 5.4,  3.7,  1.5,  0.2],
       [ 4.8,  3.4,  1.6,  0.2],
       [ 4.8,  3. ,  1.4,  0.1],
       [ 4.3,  3. ,  1.1,  0.1],
       [ 5.8,  4. ,  1.2,  0.2],
       [ 5.7,  4.4,  1.5,  0.4],
       [ 5.4,  3.9,  1.3,  0.4],
       [ 5.1,  3.5,  1.4,  0.3],
       [ 5.7,  3.8,  1.7,  0.3],
       [ 5.1,  3.8,  1.5,  0.3],
       [ 5.4,  3.4,  1.7,  0.2],
       [ 5.1,  3.7,  1.5,  0.4],
       [ 4.6,  3.6,  1. ,  0.2],
       [ 5.1,  3.3,  1.7,  0.5],
       [ 4.8,  3.4,  1.9,  0.2],
       [ 5. ,  3. ,  1.6,  0.2],
       [ 5. ,  3.4,  1.6,  0.4],
       [ 5.2,  3.5,  1.5,  0.2],
       [ 5.2,  3.4,  1.4,  0.2],
       [ 4.7,  3.2,  1.6,  0.2],
       [ 4

#### y data는 각 클래스에 따라 각각 0,1,2로 인코딩된 정수로 이루어짐
- 0: Iris-setosa
- 1: Iris-cersicolor
- 2: Iris-virginica

In [7]:
print('type of y_data: ', type(y_data))
print('shape of y_data: ', y_data.shape)

type of y_data:  <class 'numpy.ndarray'>
shape of y_data:  (150,)


In [8]:
y_data

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [9]:
from collections import Counter

In [10]:
Counter(y_data) # 0,1,2 각각의 클래스가 50개씩 골고루 분포되어 있다

Counter({0: 50, 1: 50, 2: 50})

### 데이터 전처리
로지스틱 회귀분석 모델은 두 가지 클래스를 나누는 이진 분류(binary classification) 문제에만 적용 가능
- 처음 100개의 관측치만 불러와 클래스 0,1을 분류해보자.


In [9]:
x_data = x_data[:100, :]
y_data = y_data[:100]

In [10]:
x_data.shape

(100, 4)

In [11]:
y_data.shape

(100,)

In [13]:
from collections import Counter
Counter(y_data)

Counter({0: 50, 1: 50})

#### 학습데이터/검증데이터 분류하기

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# 학습데이터와 검증 데이터의 비율은 7:3으로 설정
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.3, \
random_state = 7)

In [16]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(70, 4)
(70,)
(30, 4)
(30,)


### 데이터 학습하기
분류기(classifier)객체를 생성하고, fit함수를 통해 데이터를 학습 

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
classifier = LogisticRegression()

In [19]:
classifier.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### 모델 평가하기
학습된 분류기(classifier)로 데이터를 예측하고 정확도(accuracy)를 계산 

In [20]:
from sklearn.metrics import accuracy_score

In [21]:
y_pred = classifier.predict(x_test) # 새로운 x값을 넣어주면 학습된 모델을 가지고 y값을 예측해 줌

In [22]:
# accuracy_score() 함수는 전체 검증 데이터에서 얼마나 맞추었는지(비율)를 계산
print('Accuracy: ', accuracy_score(y_pred, y_test))

Accuracy:  1.0


In [24]:
from sklearn.metrics import confusion_matrix

In [25]:
confusion_matrix(y_pred, y_test)

array([[16,  0],
       [ 0, 14]])

## 실습 3-1-1. 로지스틱 회귀분석
Iris 데이터 셋에서 클래스 1 (versicolor)과 2 (virginica)를 분류하는 모델을 세워 본다.

In [26]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

iris = datasets.load_iris()

x_data = iris.data
y_data = iris.target

## TODO 

x_data = x_data[50:, :]
y_data = y_data[50:]
print(Counter(y_data))

## END

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.3, random_state = 7)

classifier = LogisticRegression()
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)
print('Accuracy: ', accuracy_score(y_pred, y_test))

Counter({1: 50, 2: 50})
Accuracy:  0.9


In [27]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_pred, y_test)

array([[13,  0],
       [ 3, 14]])

## 실습 3-1-2. 다른 분류 모델 생성하기
Iris 데이터 셋을 활용하여 3개의 클래스를 분류하는 모델을 세워본다.

- 서포트 벡터 머신(support vector machines) : 
sklearn.svm.SVC()
- 의사결정나무(decision trees) : 
sklearn.tree.DecisionTreeClassifier()

In [28]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

iris = datasets.load_iris()

x_data = iris.data
y_data = iris.target

x_data = x_data[:100, :]
y_data = y_data[:100]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.3, random_state = 7)


## Support Vector Classifer

classifier = SVC()
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)
print('Accuracy of Support Vector Classifier: ', accuracy_score(y_pred, y_test))


## Decision Tree Classifier

classifier = DecisionTreeClassifier()
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)
print('Accuracy of Decision Tree Classifier: ', accuracy_score(y_pred, y_test))

Accuracy of Support Vector Classifier:  1.0
Accuracy of Decision Tree Classifier:  1.0


## 3-1-3. k-폴드 교차검증
Iris 데이터 셋을 활용하여 세 개의 클래스를 분류하는 모델을 세운 후 교차검증(cross-validation)을 통해 모델의 정확도를 평가해 본다.
- 넘파이의 array_split() 함수를 활용한다
- 각각의 fold를 학습한 후 평가한 결과를 출력해 본다
- numpy implementation of k-fold cross validation

In [29]:
import numpy as np
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.svm import SVC

iris = datasets.load_iris()
# print(type(iris))

x_data = iris.data
y_data = iris.target

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2, random_state = 7)

# print(type(x_data))
# print(type(y_data))
# print('type of x_data: ', type(x_data))
# print('shape of x_data: ', x_data.shape)

# print('type of y_data: ', type(y_data))
# print('shape of y_data: ', y_data.shape)


## TODO 

k = 5
accuracies = []

for i in range(k):
	x_train_folds = np.array_split(x_train, k)
	y_train_folds = np.array_split(y_train, k)

	x_te = x_train_folds[i]
	y_te = y_train_folds[i]
	
	x_tr = x_train_folds.pop(i)
	y_tr = y_train_folds.pop(i)
	
	classifier = SVC()
	classifier.fit(x_tr, y_tr)
	y_pred = classifier.predict(x_te)
	acc = accuracy_score(y_pred, y_te)
	accuracies.append(acc)

	print('{}th fold train finished!'.format(i+1))

print('Accuracies for each fold')
for i in range(len(accuracies)):
	print('{}th fold accuracy: '.format(i+1), accuracies[i])

## END

1th fold train finished!
2th fold train finished!
3th fold train finished!
4th fold train finished!
5th fold train finished!
Accuracies for each fold
1th fold accuracy:  1.0
2th fold accuracy:  1.0
3th fold accuracy:  1.0
4th fold accuracy:  0.916666666667
5th fold accuracy:  1.0


## 3-1-4. k-폴드 교차검증-2
Iris 데이터 셋을 활용하여 세 개의 클래스를 분류하는 모델을 세운 후 교차검증(cross-validation)을 통해 모델의 정확도를 평가해 본다
- Scikit learn의 cross_val_score를 활용

In [30]:
import numpy as np
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

iris = datasets.load_iris()
# print(type(iris))

x_data = iris.data
y_data = iris.target

# print(type(x_data))
# print(type(y_data))
# print('type of x_data: ', type(x_data))
# print('shape of x_data: ', x_data.shape)

# print('type of y_data: ', type(y_data))
# print('shape of y_data: ', y_data.shape)


## TODO 

classifier = SVC()
scores = cross_val_score(classifier, x_data, y_data, cv = 5)  # 5-fold cross validation을 수행

for i in range(len(scores)):
	print('{}th fold accuracy: '.format(i+1), scores[i])
    
## END

1th fold accuracy:  0.966666666667
2th fold accuracy:  1.0
3th fold accuracy:  0.966666666667
4th fold accuracy:  0.966666666667
5th fold accuracy:  1.0


## 3-1-5. Classification with Zoo data
- 아래 데이터 셋을 다운받아 분류 작업을 수행해 본다
- https://archive.ics.uci.edu/ml/datasets/Zoo 

In [31]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

data = pd.read_table('https://archive.ics.uci.edu/ml/machine-learning-databases/zoo/zoo.data', sep = ',', header = None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [32]:
del data[0]
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [33]:
data = data.values
print(data.shape)

(101, 17)


In [34]:
x_data = data[ : , :-1]
y_data = data[ : , -1]
print(x_data.shape)
print(y_data.shape)

(101, 16)
(101,)


In [35]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.3, random_state = 7)

In [36]:
k = 5
accuracies = []

for i in range(k):
	x_train_folds = np.array_split(x_train, k)
	y_train_folds = np.array_split(y_train, k)

	x_te = x_train_folds[i]
	y_te = y_train_folds[i]
	
	x_tr = x_train_folds.pop(i)
	y_tr = y_train_folds.pop(i)
	
	classifier = SVC()
	classifier.fit(x_tr, y_tr)
	y_pred = classifier.predict(x_te)
	acc = accuracy_score(y_pred, y_te)
	accuracies.append(acc)

	print('{}th fold train finished!'.format(i+1))

print('Accuracies for each fold')
for i in range(len(accuracies)):
	print('{}th fold accuracy: '.format(i+1), accuracies[i])

1th fold train finished!
2th fold train finished!
3th fold train finished!
4th fold train finished!
5th fold train finished!
Accuracies for each fold
1th fold accuracy:  0.857142857143
2th fold accuracy:  1.0
3th fold accuracy:  0.785714285714
4th fold accuracy:  0.857142857143
5th fold accuracy:  0.857142857143
