# 보스톤 집값 예측

## 패키지 로딩

In [1]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
# 총 네가지 모델의 선형회기 분석
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error # 오차에 제곱후 평균
from sklearn.metrics import mean_absolute_error # 절댓값을 씌운 후 평균

import numpy as np
import pandas as pd

## 데이터 로드

In [6]:
boston = load_boston()
boston_x = boston.data
boston_y = boston.target


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [7]:
boston_columns = boston.feature_names

In [8]:
boston_x.shape, boston_y.shape

((506, 13), (506,))

## train, test 데이터 분할

In [9]:
scaled_data = StandardScaler().fit_transform(boston_x)
#데이터 스케일링
x_train, x_test, y_train, y_test = train_test_split(scaled_data, boston_y, test_size=0.3)

## 알파값 변화에 따른 회귀계수 및 상관계수 값 계산
- 알파값은 규제 강도를 제어하는 상수 [0,inf) 범위의 부동 소수점 값을 사용할 수 있다.
- alpha = 0 --> 일반적 LinearRegression에서 사용되는 일반적인 최소제곱법과 같아진다.

In [14]:
def regulation_compare(alpha, model_name):
    df = pd.DataFrame()
    for a in alpha:
        print('회기모형:', model_name)
        print('알파값:', a)
        
        if model_name == 'Ridge':
            model = Ridge(alpha = a)
        elif model_name == 'Lasso':
            model = Lasso(alpha = a)
        elif model_name == 'ElasticNet':
            model = ElasticNet(alpha = a, l1_ratio= 0.5)
        else:
            model = LinearRegression()
        
        model.fit(x_train,y_train)
        r_square = model.score(x_test, y_test)
        print(f'결정계수: {r_square:.2f}')
        y_hat = model.predict(x_test)
        print(f'MAE: {mean_absolute_error(y_test,y_hat)}' )
        
        weight = { f:w for f,w in zip(boston.feature_names, model.coef_)}
        df['alpha'+str(a)] = pd.Series(weight)
        
        print('-'*40)
        
    return df
        

## 라쏘 회기 모델 생성 (L1 규제)

In [16]:
alpha = [0.07, 0.1, 0.5, 1, 3]
regulation_compare(alpha, 'Lasso')

# 알파 값이 커질수록 제거되는 항이 많아지면서 결정계수가 낮아지고 있음
#즉, 독립변수로 사용되는 13개의 특성값이 모두 중요한 역할을 하고 있는것으로 판단됨

회기모형: Lasso
알파값: 0.07
결정계수: 0.65
MAE: 3.60437468320981
----------------------------------------
회기모형: Lasso
알파값: 0.1
결정계수: 0.64
MAE: 3.5901367332030367
----------------------------------------
회기모형: Lasso
알파값: 0.5
결정계수: 0.62
MAE: 3.6266817948068475
----------------------------------------
회기모형: Lasso
알파값: 1
결정계수: 0.61
MAE: 3.7048673713509035
----------------------------------------
회기모형: Lasso
알파값: 3
결정계수: 0.48
MAE: 4.478519249252879
----------------------------------------


Unnamed: 0,alpha0.07,alpha0.1,alpha0.5,alpha1,alpha3
CRIM,-0.740197,-0.638377,-0.0,-0.0,-0.0
ZN,0.802758,0.693398,0.0,0.0,0.0
INDUS,-0.055578,-0.086713,-0.0,-0.0,-0.0
CHAS,0.065109,0.052768,0.0,0.0,0.0
NOX,-2.054213,-1.930364,-0.340138,-0.0,-0.0
RM,3.073556,3.13311,3.414215,3.197529,2.35348
AGE,-0.196219,-0.152556,-0.0,-0.0,-0.0
DIS,-2.983595,-2.764185,-0.336649,-0.0,0.0
RAD,1.688635,1.234727,-0.0,-0.0,-0.0
TAX,-1.328284,-0.954174,-0.136882,-0.058003,-0.0


## 릿지 회기 모델 생성 (L2 생성)

In [18]:
alpha = [0, 1, 5, 10, 100]
regulation_compare(alpha, 'Ridge')

회기모형: Ridge
알파값: 0
결정계수: 0.65
MAE: 3.672898346252991
----------------------------------------
회기모형: Ridge
알파값: 1
결정계수: 0.65
MAE: 3.662777180023433
----------------------------------------
회기모형: Ridge
알파값: 5
결정계수: 0.65
MAE: 3.631665279548606
----------------------------------------
회기모형: Ridge
알파값: 10
결정계수: 0.65
MAE: 3.6016460859341666
----------------------------------------
회기모형: Ridge
알파값: 100
결정계수: 0.63
MAE: 3.5838359497772685
----------------------------------------


Unnamed: 0,alpha0,alpha1,alpha5,alpha10,alpha100
CRIM,-0.977985,-0.961908,-0.906387,-0.851679,-0.568684
ZN,1.057633,1.036039,0.961731,0.888876,0.519106
INDUS,0.017626,-0.011261,-0.104068,-0.185455,-0.48615
CHAS,0.093889,0.100025,0.120796,0.140785,0.250341
NOX,-2.344059,-2.29984,-2.139087,-1.968149,-0.848834
RM,2.934798,2.948153,2.991586,3.029131,2.923292
AGE,-0.298551,-0.3032,-0.317347,-0.328058,-0.306548
DIS,-3.495597,-3.452362,-3.289396,-3.106656,-1.604693
RAD,2.748531,2.640838,2.285303,1.958133,0.465308
TAX,-2.201558,-2.105706,-1.798336,-1.530232,-0.622211


## Elastic 회귀모델 생성( L1+L2 규제)

In [19]:
alpha = [0, 1, 5, 10, 100]
regulation_compare(alpha, 'ElasticNet')

회기모형: ElasticNet
알파값: 0
결정계수: 0.65
MAE: 3.672898346252994
----------------------------------------
회기모형: ElasticNet
알파값: 1
결정계수: 0.58
MAE: 3.894251760007903
----------------------------------------
회기모형: ElasticNet
알파값: 5
결정계수: 0.30
MAE: 5.453358840066965
----------------------------------------
회기모형: ElasticNet
알파값: 10
결정계수: 0.06
MAE: 6.531035880374551
----------------------------------------
회기모형: ElasticNet
알파값: 100
결정계수: -0.02
MAE: 6.819907820398453
----------------------------------------


  model.fit(x_train,y_train)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,alpha0,alpha1,alpha5,alpha10,alpha100
CRIM,-0.977985,-0.261746,-0.0,-0.0,-0.0
ZN,1.057633,0.080526,0.0,0.0,0.0
INDUS,0.017626,-0.314847,-0.219548,-0.0,-0.0
CHAS,0.093889,0.0,0.0,0.0,0.0
NOX,-2.344059,-0.285263,-0.045273,-0.0,-0.0
RM,2.934798,2.600251,0.983469,0.286327,0.0
AGE,-0.298551,-0.0,-0.0,-0.0,-0.0
DIS,-3.495597,-0.108167,0.0,0.0,0.0
RAD,2.748531,-0.0,-0.0,-0.0,-0.0
TAX,-2.201558,-0.395434,-0.185102,-0.0,-0.0
