# 선형 회귀식의 계수를 찾는 법 - OLS VS. SGD
- 보스턴 집값 데이터 활용(RM VS Price)

### 필요한 모듈 import

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# 필요한 라이브러리 import 

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from score import eval_score


# 1. LinearRegression 모델을 사용한 경우

In [2]:
# Load the room-count / price table used throughout the notebook; the later
# cells read its `RM` and `price` columns.
boston = pd.read_csv('./data/boston_room_price.csv')
# Bare expression as the last statement of the cell: the notebook renders the frame.
boston

Unnamed: 0,RM,price
0,6.575,24.0
1,6.421,21.6
2,7.185,34.7
3,6.998,33.4
4,7.147,36.2
...,...,...
501,6.593,22.4
502,6.120,20.6
503,6.976,23.9
504,6.794,22.0


In [10]:
# Feature matrix: the RM column copied into a 2-D array of shape (n_samples, 1),
# which is the layout scikit-learn estimators expect for X.
X = np.array(boston["RM"]).reshape(-1, 1)
# Target: the price column, kept as a pandas Series.
y = boston["price"]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [11]:
# Ordinary least squares on the single RM feature.
lr = LinearRegression()
lr.fit(X_train, y_train)
# fit() returns the estimator itself, so `reg` is just another name for `lr`.
reg = lr

# Fitted regression line
print(f"y = {reg.coef_[0]:.3f}X(Room) + ({reg.intercept_:.3f})")

# Predictions on the held-out split
y_pred = reg.predict(X_test)

# Evaluation report for the fitted model on the held-out split
# (the recorded run shows mse, rmse and r2).
eval_score(reg, X_test, y_test)

y = 8.461X(Room) + (-30.571)
mse = 36.517
rmse = 6.043
r2 = 0.602


# 2. SGDRegressor with default hyperparameters (no scaling)

In [13]:
# Baseline: SGDRegressor with default hyperparameters on the raw (unscaled)
# RM feature. SGDRegressor is already imported in the first cell; the fixed
# seed keeps the run reproducible.
reg = SGDRegressor(random_state=42)
reg.fit(X_train, y_train)

# Fitted regression line. For SGDRegressor `intercept_` is a length-1 array,
# so take element 0 and format it to three decimals like the OLS cell does;
# interpolating the array itself prints its bracketed repr.
print(f"y = {reg.coef_[0]:.3f}X(Room) + ({reg.intercept_[0]:.3f})")

# Predictions on the held-out split
y_pred = reg.predict(X_test)

# Evaluation report for the fitted model on the held-out split
eval_score(reg, X_test, y_test)

y = 4.177X(Room) + ([-3.58747787])
mse = 55.130
rmse = 7.425
r2 = 0.399


In [16]:
# Display the DataFrame again for reference before moving on to scaling.
boston

Unnamed: 0,RM,price
0,6.575,24.0
1,6.421,21.6
2,7.185,34.7
3,6.998,33.4
4,7.147,36.2
...,...,...
501,6.593,22.4
502,6.120,20.6
503,6.976,23.9
504,6.794,22.0


# 3. SGDRegressor with scaling

In [39]:
# Standardization (mean 0, variance 1). The statistics come from the training
# split only and are reused for the test split, so the test data does not
# influence the transform.
train_mean = np.mean(X_train, axis=0)
train_std = np.std(X_train, axis=0)
X_train_scaled = (X_train - train_mean) / train_std
X_test_scaled = (X_test - train_mean) / train_std

# Same default SGDRegressor as in the unscaled cell, now fitted on the
# standardized feature. SGDRegressor is already imported in the first cell.
reg = SGDRegressor(random_state=42)
reg.fit(X_train_scaled, y_train)

# Fitted regression line (the slope applies to the standardized RM values).
# `intercept_` is a length-1 array for SGDRegressor, so index it and format it
# to three decimals instead of printing the array's bracketed repr.
print(f"y = {reg.coef_[0]:.3f}X(Room) + ({reg.intercept_[0]:.3f})")

# Predictions on the held-out split (scaled with the training statistics)
y_pred = reg.predict(X_test_scaled)

# Evaluation report for the fitted model on the scaled held-out split
eval_score(reg, X_test_scaled, y_test)

y = 5.848X(Room) + ([22.31897879])
mse = 36.523
rmse = 6.043
r2 = 0.602


# 4. SGDRegressor with StandardScaler() + hyperparameter tuning

In [69]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same transform to
# the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SGDRegressor (already imported in the first cell) with a larger iteration
# budget and a much smaller initial learning rate.
# NOTE(review): the recorded run of this cell scored r2 = 0.418, versus 0.602
# for the default settings on scaled data in section 3. Presumably eta0 is too
# small for the fit to converge before the stopping criterion ends training;
# confirm before reusing these values.
reg = SGDRegressor(max_iter=1_000_000, random_state=42, eta0=0.00001)
reg.fit(X_train_scaled, y_train)

# Fitted regression line (the slope applies to the standardized RM values).
# `intercept_` is a length-1 array for SGDRegressor, so index it and format it
# to three decimals instead of printing the array's bracketed repr.
print(f"y = {reg.coef_[0]:.3f}X(Room) + ({reg.intercept_[0]:.3f})")

# Predictions on the held-out split
y_pred = reg.predict(X_test_scaled)

# Evaluation report for the fitted model on the scaled held-out split
eval_score(reg, X_test_scaled, y_test)


y = 4.887X(Room) + ([18.65786642])
mse = 53.343
rmse = 7.304
r2 = 0.418


# 참고 : 파이프라인으로 데이터 변환하기


In [71]:
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline

# reg = make_pipeline(StandardScaler(),
#                     SGDRegressor(max_iter=1000000, eta0=0.01,
#                                  tol=0.0001, random_state=42, loss='squared_error'))
# reg.fit(X_train, y_train)

# # 계수 및 절편 확인: _속성은 학습을 통해 결정되는 속성
# print(reg[1].coef_, reg[1].intercept_)

# #회귀식 - pipeline()을 사용했기 때문에 SGDRegressor의 parameter가 reg객체의 1번 인덱스에 들어감
# print("y = {:.2f}X + {:.3f}".format(reg[1].coef_[0], reg[1].intercept_[0]))

# # 예측 수행
# y_pred = reg.predict(X_test)
# from sklearn.metrics import mean_squared_error, r2_score

# # MSE, RMSE, r2_score
# mse = mean_squared_error(y_test, y_pred)
# rmse = np.sqrt(mse)
# r2 = r2_score(y_test, y_pred)

# print("MSE:", np.round(mse, 3))
# print("RMSE: ", np.round(rmse, 3))
# print("R2: ", np.round(r2, 3))


