## 선형회귀 분석

미국 보스턴 부동산 시세 데이터 (sklearn 패키지 제공).

#### 필요한 패키지를 불러온다:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston

#### 데이터를 불러온다:

In [None]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell (and its import) only runs on older releases — confirm the
# pinned scikit-learn version before relying on it.
data = load_boston()

In [None]:
# Show which entries the loaded dataset object exposes.
data.keys()

In [None]:
# Print the dataset's 'DESCR' entry (presumably its textual description — verify).
print(data['DESCR'])

In [None]:
# Feature matrix: used below as the regression input.
X = data['data']

In [None]:
# Feature names: reused below as column labels for the data frames.
header = data['feature_names']

In [None]:
# Regression target, reshaped into a single-column 2-D array so it can be
# joined column-wise with the feature matrix later on.
Y = data['target'].reshape(-1, 1)

In [None]:
# Display the feature names.
header

#### 데이터 프레임을 만들어 본다:

In [None]:
# Build one table with the features followed by the target as the last column.
df = pd.DataFrame(np.concatenate((X, Y), axis=1))

In [None]:
# Label the columns: the feature names, then 'PRICE' for the appended target.
df.columns = np.concatenate((header, ['PRICE']))

In [None]:
# Preview the first 5 rows of the table.
df.head(5)

In [None]:
# Preview the last 5 rows of the table.
df.tail(5)

#### 여러 통계치를 계산해 본다:

In [None]:
# Display pandas' summary statistics for the table.
df.describe()

In [None]:
# Pairwise correlation matrix of all columns, rounded to 3 decimals for display.
df.corr().round(3)

#### 선형회귀 적용:

In [None]:
# Create a linear regression estimator with default settings (not yet fitted).
lm = LinearRegression()

In [None]:
# Fit on the full dataset: all features X against the target column Y.
lm.fit(X,Y)

In [None]:
# Display the fitted coefficients.
lm.coef_

In [None]:
# Show the fitted coefficients in a table labelled with the feature names.
pd.DataFrame(data=lm.coef_, columns=header)

In [None]:
# Display the fitted intercept term.
lm.intercept_

In [None]:
# Scatter the feature in column 5 (labelled 'RM' on the x-axis) against the
# target price; Y is a column vector, so its only column is selected.
plt.scatter(
    X[:, 5],
    Y[:, 0],
    c='g',
    s=15,
    alpha=0.5,
)
plt.xlabel('RM')
plt.ylabel('PRICE')
plt.show()

#### 예측 가격 (in-sample testing):

In [None]:
# In-sample prediction: predict on the same data the model was fitted on.
Y_pred = lm.predict(X)

In [None]:
# Compare actual prices (x-axis) with the model's in-sample predictions (y-axis).
plt.scatter(
    Y[:, 0],
    Y_pred,
    c='blue',
    s=15,
    alpha=0.5,
)
plt.xlabel('REAL PRICE')
plt.ylabel('PREDICTED PRICE')
plt.show()

#### 결정계수 ($R^2$):

In [None]:
# Score the fitted model on the data it was trained on (in-sample R^2).
lm.score(X,Y)

#### 학습과 시험 (in and out-of-sample testing):

In [None]:
# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=5,
)

In [None]:
# Print the shape of each split, one per line: train/test features, then
# train/test targets.
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape, sep='\n')

In [None]:
# Refit a fresh model on the training split only (this rebinds `lm`, replacing
# the model fitted on the full dataset), then predict on both splits.
lm = LinearRegression().fit(X_train, Y_train)
Y_pred_train, Y_pred_test = lm.predict(X_train), lm.predict(X_test)

In [None]:
# Mean squared error on each split: the average of the squared differences
# between actual and predicted prices.
print('Training MSE is:', np.mean(np.square(Y_train - Y_pred_train)), sep='')
print('Testing MSE is:', np.mean(np.square(Y_test - Y_pred_test)), sep='')

#### 잔차:

In [None]:
# Residual of every training sample against its position in the split.
# The residual follows the standard definition, observed minus predicted
# (the difference was previously taken the other way round, which flips the
# sign of every point). The difference of the two column vectors is flattened
# to 1-D so it pairs one-to-one with the sample index on the x-axis.
plt.scatter(
    np.arange(Y_train.size),
    (Y_train - Y_pred_train).ravel(),
    c='red',
    s=15,
    alpha=0.5,
)
plt.xlabel('N#')
plt.ylabel('Train Residual')
plt.show()

In [None]:
# Residual of every held-out test sample against its position in the split.
# The residual follows the standard definition, observed minus predicted
# (the difference was previously taken the other way round, which flips the
# sign of every point). The difference of the two column vectors is flattened
# to 1-D so it pairs one-to-one with the sample index on the x-axis.
plt.scatter(
    np.arange(Y_test.size),
    (Y_test - Y_pred_test).ravel(),
    c='red',
    s=15,
    alpha=0.5,
)
plt.xlabel('N#')
plt.ylabel('Test Residual')
plt.show()

#### 신규 데이터와 가격 예측:      
        - CRIM     : 0.03
        - ZN       : 0.0
        - INDUS    : 13.0
        - CHAS     : 0.0
        - NOX      : 0.4
        - RM       : 4.3
        - AGE      : 23.5
        - DIS      : 1.9
        - RAD      : 1.0
        - TAX      : 273.0
        - PTRATIO  : 18.0 
        - B        : 380.0
        - LSTAT    : 7.5

In [None]:
# One new observation as a single-row 2-D array (the shape predict() is given
# here), with values in the feature order listed in the section above.
X_new = np.array([[
    0.03,   # CRIM
    0.0,    # ZN
    13.0,   # INDUS
    0.0,    # CHAS
    0.4,    # NOX
    4.3,    # RM
    23.5,   # AGE
    1.9,    # DIS
    1.0,    # RAD
    273.0,  # TAX
    18.0,   # PTRATIO
    380.0,  # B
    7.5,    # LSTAT
]])
# `lm` is the model most recently fitted, i.e. on the training split.
Y_pred_new = lm.predict(X_new)
# The prediction is 2-D (one row, one target column); print the scalar.
print(Y_pred_new[0, 0])