# Regression with scikit-learn

## 1. 데이터 셋 불러오기
Boston Housing dataset : 1978년 보스턴 지역의 주택 가격을 예측하기 위한 데이터 셋 
- 범죄율(CRIM), 방의 개수(RM), 세율(TAX) 등을 통해 주택의 가격을 예측
- Boston 데이터 셋은 Scikit-learn 내에 내장되어 있다. Import 문을 통해 손쉽게 불러올 수 있다.
- 원본 dataset source: http://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html 


In [1]:
import numpy as np
from sklearn.datasets import load_boston
boston = load_boston()

In [2]:
type(boston)

sklearn.datasets.base.Bunch

In [3]:
x_data = boston.data
print(type(x_data))

<class 'numpy.ndarray'>


In [4]:
y_data = boston.target
print(type(y_data))

<class 'numpy.ndarray'>


## 2. 데이터 탐색하기
x data는 총 506개의 인스턴스(관측치)와 13개의 feature(변수)로 이루어짐

In [5]:
print('type of x_data:', type(x_data))
print('shape of x_data:', x_data.shape)

type of x_data: <class 'numpy.ndarray'>
shape of x_data: (506, 13)


In [6]:
print(x_data)

[[  6.32000000e-03   1.80000000e+01   2.31000000e+00 ...,   1.53000000e+01
    3.96900000e+02   4.98000000e+00]
 [  2.73100000e-02   0.00000000e+00   7.07000000e+00 ...,   1.78000000e+01
    3.96900000e+02   9.14000000e+00]
 [  2.72900000e-02   0.00000000e+00   7.07000000e+00 ...,   1.78000000e+01
    3.92830000e+02   4.03000000e+00]
 ..., 
 [  6.07600000e-02   0.00000000e+00   1.19300000e+01 ...,   2.10000000e+01
    3.96900000e+02   5.64000000e+00]
 [  1.09590000e-01   0.00000000e+00   1.19300000e+01 ...,   2.10000000e+01
    3.93450000e+02   6.48000000e+00]
 [  4.74100000e-02   0.00000000e+00   1.19300000e+01 ...,   2.10000000e+01
    3.96900000e+02   7.88000000e+00]]


In [7]:
print('type of y_data:', type(y_data))
print('shape of y_data:', y_data.shape)

type of y_data: <class 'numpy.ndarray'>
shape of y_data: (506,)


In [8]:
y_data

array([ 24. ,  21.6,  34.7,  33.4,  36.2,  28.7,  22.9,  27.1,  16.5,
        18.9,  15. ,  18.9,  21.7,  20.4,  18.2,  19.9,  23.1,  17.5,
        20.2,  18.2,  13.6,  19.6,  15.2,  14.5,  15.6,  13.9,  16.6,
        14.8,  18.4,  21. ,  12.7,  14.5,  13.2,  13.1,  13.5,  18.9,
        20. ,  21. ,  24.7,  30.8,  34.9,  26.6,  25.3,  24.7,  21.2,
        19.3,  20. ,  16.6,  14.4,  19.4,  19.7,  20.5,  25. ,  23.4,
        18.9,  35.4,  24.7,  31.6,  23.3,  19.6,  18.7,  16. ,  22.2,
        25. ,  33. ,  23.5,  19.4,  22. ,  17.4,  20.9,  24.2,  21.7,
        22.8,  23.4,  24.1,  21.4,  20. ,  20.8,  21.2,  20.3,  28. ,
        23.9,  24.8,  22.9,  23.9,  26.6,  22.5,  22.2,  23.6,  28.7,
        22.6,  22. ,  22.9,  25. ,  20.6,  28.4,  21.4,  38.7,  43.8,
        33.2,  27.5,  26.5,  18.6,  19.3,  20.1,  19.5,  19.5,  20.4,
        19.8,  19.4,  21.7,  22.8,  18.8,  18.7,  18.5,  18.3,  21.2,
        19.2,  20.4,  19.3,  22. ,  20.3,  20.5,  17.3,  18.8,  21.4,
        15.7,  16.2,

y data는 각 관측치의 '주택 가격'을 나타내는 실수 값으로 이루어져 있다
- y 값의 평균은 22.5328이다.
- y 값의 표준편차는 9.189이다.

In [9]:
print('average of y_data:', np.mean(y_data))
print('standard deviation of y_data:', np.std(y_data))

average of y_data: 22.5328063241
standard deviation of y_data: 9.18801154528


In [10]:
np.mean(x_data, axis = 0) # 13개의 변수(행)의 평균

array([  3.59376071e+00,   1.13636364e+01,   1.11367787e+01,
         6.91699605e-02,   5.54695059e-01,   6.28463439e+00,
         6.85749012e+01,   3.79504269e+00,   9.54940711e+00,
         4.08237154e+02,   1.84555336e+01,   3.56674032e+02,
         1.26530632e+01])

In [11]:
# np.mean(x_data, axis = 1) # 506개 관측치(열)의 평균 => 여기서는 의미 없음

### 데이터 전처리
모델을 학습하기 위한 학습 데이터와 모델의 성능을 평가하기 위한 검증 데이터로 나누어 본다
- Scikit-learn의 train_test_split 함수를 사용한다
- 학습 데이터와 검증 데이터의 비율은 7:3으로 설정한다
 

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.3, \
random_state = 7)

In [14]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(354, 13)
(354,)
(152, 13)
(152,)


## 3. 데이터 학습하기
회귀 모델(regressor) 객체를 생성하고, fit 함수를 통해 데이터를 학습한다
- fit 함수의 파라미터로는 x, y 학습 데이터를 순서대로 넣어준다

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
regressor = LinearRegression()

In [17]:
regressor.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

## 4. 모델 평가하기
학습된 회귀 모델(regressor)로 데이터를 예측하고 오차(mean squared error)를 계산해 본다
- mean_squared_error 함수는 예측값에서 실제 값이 떨어져 있는 정도를 제곱하여 모두 합한 후 평균한 값을 계산한다
 

In [18]:
from sklearn.metrics import mean_squared_error , mean_absolute_error

In [19]:
y_pred = regressor.predict(x_test)

In [20]:
print('MSE: ', mean_squared_error(y_pred, y_test))
print('MAE: ', mean_absolute_error(y_pred, y_test))

MSE:  25.7478927444
MAE:  3.30638059656


## 실습 3-2-1 선형 회귀분석
Scikit-learn의 diabetes 데이터 셋을 불러와 회귀 모델을 세워 본다
- sklearn.diabetes.load_diabetes
- 참고: http://scikit-learn.org/stable/datasets/index.html 


In [23]:
import numpy as np
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()

x_data = diabetes.data
y_data = diabetes.target

print('type of x_data:', type(x_data))
print('shape of x_data:', x_data.shape)
print('-'*50)
print('type of y_data:', type(y_data))
print('shape of y_data:', y_data.shape)
print('-'*50)
print('average of y_data:', np.mean(y_data))
print('standard deviation of y_data:', np.std(y_data))

type of x_data: <class 'numpy.ndarray'>
shape of x_data: (442, 10)
--------------------------------------------------
type of y_data: <class 'numpy.ndarray'>
shape of y_data: (442,)
--------------------------------------------------
average of y_data: 152.133484163
standard deviation of y_data: 77.0057458695


In [24]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.3, \
random_state = 7)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(309, 10)
(309,)
(133, 10)
(133,)


In [25]:
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [26]:
from sklearn.metrics import mean_squared_error , mean_absolute_error

y_pred = regressor.predict(x_test)

print('MSE: ', mean_squared_error(y_pred, y_test))
print('MAE: ', mean_absolute_error(y_pred, y_test))

MSE:  2785.93354031
MAE:  40.7344363014


## 실습 3-2-2 다른 회귀 모델 생성하기
Boston housing 데이터 셋을 활용하여 다른 회귀 알고리즘을 구현해 본다
- 회귀모형 예시
- 서포트 벡터 머신(support vector machines) : sklearn.svm.SVR()
- 의사결정나무(decision trees): sklearn.tree.DecisionTreeRegressor()


In [30]:
## Support Vector regressor
from sklearn.svm import SVR

regressor = SVR()
regressor.fit(x_train, y_train)

y_pred = regressor.predict(x_test)
print('MSE: ', mean_squared_error(y_pred, y_test))


## Decision Tree regressor
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(x_train, y_train)

y_pred = regressor.predict(x_test)
print('MSE: ', mean_squared_error(y_pred, y_test))

MSE:  5817.17468716
MSE:  5910.17293233


## 3-2-3. 교차 검증
Boston housing 데이터 셋을 활용하여 교차검증을 수행해 본다
- 넘파이의 array_split() 함수를 활용한다
- 각각의 fold를 학습한 후 평가한 결과를 출력해 본다

In [31]:
import numpy as np
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.svm import SVR

boston = datasets.load_boston()
# print(type(iris))

x_data = boston.data
# print(type(x_data))

y_data = boston.target

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2, random_state = 7)
# print(type(y_data))
# print('type of x_data: ', type(x_data))
# print('shape of x_data: ', x_data.shape)

# print('type of y_data: ', type(y_data))
# print('shape of y_data: ', y_data.shape)

## TODO 

k = 5
accuracies = []

for i in range(k):
	x_train_folds = np.array_split(x_train, k)
	y_train_folds = np.array_split(y_train, k)

	x_te = x_train_folds[i]
	y_te = y_train_folds[i]
	
	x_tr = x_train_folds.pop(i)
	y_tr = y_train_folds.pop(i)
	
	classifier = SVR()
	classifier.fit(x_tr, y_tr)
	y_pred = classifier.predict(x_te)
	acc = mean_squared_error(y_pred, y_te)
	accuracies.append(acc)

	print('{}th fold train finished!'.format(i+1))

print('Errors for each fold')
for i in range(len(accuracies)):
	print('{}th fold mse: '.format(i+1), accuracies[i])

## END

1th fold train finished!
2th fold train finished!
3th fold train finished!
4th fold train finished!
5th fold train finished!
Errors for each fold
1th fold mse:  64.586655831
2th fold mse:  88.2721298532
3th fold mse:  74.5563287218
4th fold mse:  72.9295605364
5th fold mse:  72.9532605056


## 3-2-3. 교차 검증-2
Boston housing 데이터 셋을 사용하여 교차검증을 수행해 본다
- Cross_val_score 함수를 활용한다

In [32]:
import numpy as np
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.svm import SVR

boston = datasets.load_boston()
# print(type(iris))

x_data = boston.data
# print(type(x_data))

y_data = boston.target

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2, random_state = 7)

classifier = SVR()
scores = cross_val_score(classifier, x_data, y_data, scoring = 'neg_mean_squared_error', cv = 5)  # 5-fold cross validation을 수행

for i in range(len(scores)):
	print('{}th fold accuracy: '.format(i+1), scores[i])


1th fold accuracy:  -36.2138313984
2th fold accuracy:  -101.179273587
3th fold accuracy:  -175.80268172
4th fold accuracy:  -89.1095963766
5th fold accuracy:  -64.9999073642
