# 지도학습(1) -회귀


사이킷런은 다항 회귀를 위한 클래스를 제공하지 않음. 다항 회귀 역시 선형회귀 이므로 비선형 함수를 선형 모델에 적용시키는 방법을 구현해 사용한다.

-> PolunomaialFeature 클래스를 통해서 피처를 다항식(Polynomial) 피처로 변환한다. 

-> PolynomaialFeatures 클래스는 degree 파라미터를 통해 입력받은 단항식 피처를 degree에 해당하는 다항식 피처로 변환한다. -> 그러나 차수가 높아지면 과적합 문제 발생


지도 학습은 1) 회귀와 2) 분류로 나눌 수 있다. 


In [3]:
from pandas import read_excel
from sklearn.linear_model import LinearRegression # 선형회귀분석 모듈

### 단순선형회귀

In [4]:
# 자동차의 속도에 따른 제동거리(dist) 조사 데이터
origin = read_excel('https://data.hossam.kr/E04/cars.xlsx')
origin.head()

Unnamed: 0,speed,dist
0,4,2
1,4,10
2,7,4
3,7,22
4,8,16


#### 분석 알고리즘을 포함하고 있는 객체 생성

In [5]:
model = LinearRegression()

#### 분석모델을 학습시킴

fit(독립변수x, 종속변수y) 형태로 사용

직접 데이터를 파라미터로 전달하는 경우
1. 2차원 배열 numpy
2. 완전한 데이터 프레임 형태로 전달

In [6]:
fit = model.fit(origin[['speed']],origin[['dist']]) # 2차원 배열이므로 list 2개
fit

### 학습결과 확인

In [7]:
# 기울기
coef = fit.coef_
coef

array([[3.93240876]])

In [9]:
# 절편
intercept = fit.intercept_
intercept

array([-17.57909489])

### 회귀식 추정

In [11]:
print('y={}*x+{}'.format(coef[0][0],intercept[0]))

y=3.932408759124089*x+-17.579094890510973


In [14]:
# 반올림
print('y={0:.2f}*x+{1:.2f}'.format(coef[0][0],intercept[0]))

y=3.93*x+-17.58


### 설명력

In [15]:
rscore = fit.score(origin[['speed']],origin[['dist']])
rscore # R-squared 값

0.6510793807582509

### 회귀분석 결과 활용

In [20]:
# 모델을 활용한 결과 값 얻기
speed = []
for i in range(10,41,5) :
    speed.append([i])
pred = fit.predict(speed)
pred



array([[ 21.7449927 ],
       [ 41.4070365 ],
       [ 61.06908029],
       [ 80.73112409],
       [100.39316788],
       [120.05521168],
       [139.71725547]])

## 03. 다중선형회귀

In [21]:
origin = read_excel("https://data.hossam.kr/E04/boston.xlsx")
origin

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,CAT. MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0,0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6,0
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,1
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,1
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4,0
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6,0
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9,0
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0,0


### 데이터 전처리

머신러닝에서 데이터 전처리는 독립변수와 종속변수를 분리하는 과정을 뜻함

In [22]:
origin.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV', 'CAT. MEDV'],
      dtype='object')

In [32]:
# 독립변수 추출
x_train = origin[["CRIM", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "TAX", "PTRATIO", "B", "LSTAT"]]
x_train.head()

Unnamed: 0,CRIM,INDUS,CHAS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT
0,0.00632,2.31,0,0.538,6.575,65.2,4.09,296,15.3,396.9,4.98
1,0.02731,7.07,0,0.469,6.421,78.9,4.9671,242,17.8,396.9,9.14
2,0.02729,7.07,0,0.469,7.185,61.1,4.9671,242,17.8,392.83,4.03
3,0.03237,2.18,0,0.458,6.998,45.8,6.0622,222,18.7,394.63,2.94
4,0.06905,2.18,0,0.458,7.147,54.2,6.0622,222,18.7,396.9,5.33


In [34]:
# 종속변수 추출

y_train = origin['MEDV']
# y_train.head()

In [35]:
# 분석수행
model = LinearRegression()
fit = model.fit(x_train, y_train)

print("계수: ", fit.coef_)
print("절편: ", fit.intercept_)
print("설명력: ", fit.score(x_train, y_train))

print("y = {0:.2f} * X + {1:.2f}".format(coef[0][0], intercept[0]))

계수:  [-6.18750142e-02 -7.22462346e-02  3.12288964e+00 -1.58299544e+01
  4.28468563e+00 -8.28081464e-03 -1.25415352e+00  2.87273861e-03
 -9.65514253e-01  8.56492285e-03 -5.09092893e-01]
절편:  30.558531448421274
설명력:  0.7249622391093025
y = 3.93 * X + -17.58
