##### 보스턴 집값 예측 모델
- 데이터셋:boston.csv
- 학습방법: 지도학습(회귀)
- 피쳐/독립: 13개
- 타겟/종속: 1개


[1] 데이터 준비

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split


In [4]:
# 데이터
DATA_FILE= '../data/boston.csv'
DF= pd.read_csv(DATA_FILE)
DF.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


[2] 전처리
- [2-1] 데이터 정제

In [5]:
# 결측, 중복, 이상.....(컬럼별 고유값 추출로 체크)

[2-2] 표준화 & 정규화 (진행여부에 따라 성능차이는 상이)  
    * 정규분포 데이터셋을 기반으로 한 모델 -> StandardScaler, MinMaxScaler, Log변환  
    * 피쳐의 값의 범위 차이를 줄이기-> MinMaxScaler, RobusScaler....  
    * 범주형 피쳐-> 수치화 인코딩 => OneHotEncoder, OrdinalEncoder  
    * 문자열 타겟-> 정수 라벨인코딩 => LabelEncoder

[2-3] 피쳐와 타겟 분리

In [8]:
featureDF= DF.iloc[: , :-1]
targetSR= DF['MEDV']

In [11]:
print(f'featureDF: {featureDF.shape}, \ntargetSR= {targetSR.shape}')

featureDF: (506, 13), 
targetSR= (506,)


[3] 학습준비    
[3-1] 데이터셋 분리

In [12]:
X_train, X_test, Y_train, Y_test= train_test_split(featureDF, targetSR, random_state=10)

In [13]:
print(f'X_train: {X_train.shape}, Y_train: {Y_train.shape}')
print(f'X_test: {X_test.shape}, Y_test: {Y_test.shape}')

X_train: (379, 13), Y_train: (379,)
X_test: (127, 13), Y_test: (127,)


[3-2] 스케일러 생성


In [14]:
# 수치 피쳐 범위차가 크기 때문에 스케일링 진행
ssScaler= StandardScaler()

ssScaler.fit(X_train)

In [15]:
X_train_scaled= ssScaler.transform(X_train)
X_test_scaled= ssScaler.transform(X_test)

[4] 학습진행 -> 교차 검증으로 진행


In [47]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_validate
from sklearn.linear_model import Lasso

In [21]:


# 인스턴스 생성
ridge_model= Ridge(alpha=1.0) #기본값= 1

In [24]:
# 학습 진행
# cv=3개  (얘는 hyperparameter아님!)
# scoring= 'MSE', 'r2'
# return_train_score

result= cross_validate(ridge_model, X_train_scaled, Y_train,
                       cv=3, scoring=['neg_mean_squared_error', 'r2'],
                       return_train_score=True)

In [28]:
resultDF= pd.DataFrame(result)
resultDF

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,train_neg_mean_squared_error,test_r2,train_r2
0,0.002328,0.0,-17.320297,-20.143636,0.748283,0.755663
1,0.0,0.0,-22.582566,-18.210772,0.756292,0.740039
2,0.015002,0.00102,-22.657585,-17.293662,0.680991,0.786097


In [56]:
# 하이퍼파라미터 제어 (튜닝)  -> alpha, max_iter ...
alpha_values= [1,10,100]

# 인스턴스 생성
for alpha in alpha_values:
    ridge_model= Lasso(alpha=alpha) #기본값= 1
    
    result= cross_validate(ridge_model, X_train_scaled, Y_train,
                       cv=3, scoring=['neg_mean_squared_error', 'r2'],
                       return_train_score=True, return_estimator=True) #점수, 기울기+절편 가져오기

    
    
    resultDF=pd.DataFrame(result)[['test_r2', 'train_r2']]
    
    resultDF['Diff']=abs(resultDF['test_r2']-resultDF['train_r2'])
    best_idx=resultDF['Diff'].sort_values()[0]
    print(result['estimator'][0].coef_)
    print(f'alpha={alpha}: ')
    print(f'{resultDF}\n\n')
    print(best_idx)

[-0.18119516  0.         -0.          0.         -0.          2.6706524
 -0.         -0.         -0.         -0.1542158  -1.17708874  0.36943757
 -3.33718723]
alpha=1: 
    test_r2  train_r2      Diff
0  0.712820  0.665907  0.046913
1  0.677096  0.650001  0.027094
2  0.602241  0.720850  0.118609


0.04691254131246558
[-0.  0. -0.  0. -0.  0. -0.  0. -0. -0. -0.  0. -0.]
alpha=10: 
    test_r2  train_r2      Diff
0 -0.000010       0.0  0.000010
1 -0.014817       0.0  0.014817
2 -0.018473       0.0  0.018473


9.961709551475906e-06
[-0.  0. -0.  0. -0.  0. -0.  0. -0. -0. -0.  0. -0.]
alpha=100: 
    test_r2  train_r2      Diff
0 -0.000010       0.0  0.000010
1 -0.014817       0.0  0.014817
2 -0.018473       0.0  0.018473


9.961709551475906e-06


In [57]:
# 하이퍼파라미텨 튜닝과 검증을 동시에 진행

In [58]:
from sklearn.model_selection import GridSearchCV

In [59]:
#Ridge의 Hyper-parameter 값 설정
params={'alpha': [0,0.1, 0.5, 1], 
        'max_iter':[3,5]}
# 총 경우의 수는 8개 (8개의 모델 생성)


In [60]:
# 인스턴스 생성
r_model=Ridge()
searchCV= GridSearchCV(r_model, param_grid=params, cv=3, verbose=True, return_train_score=True)


In [62]:
# 학습 진행
searchCV.fit(X_train_scaled, Y_train)
#하이퍼 파라미터 조합 8개 * cv3개=> 24개 fit

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [66]:
searchCV.best_params_, searchCV.best_estimator_, searchCV.best_score_

({'alpha': 1, 'max_iter': 3}, Ridge(alpha=1, max_iter=3), 0.7285219517985944)

In [68]:
# best 모델 추출
bsetCV=searchCV.best_estimator_

In [70]:
pd.DataFrame(searchCV.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.001325,0.0004713195,0.000665,0.00094,0.0,3,"{'alpha': 0, 'max_iter': 3}",0.747022,0.756482,0.680801,0.728101,0.033669,7,0.75572,0.740082,0.786156,0.760653,0.019131
1,0.000664,0.0004696845,0.000664,0.00047,0.0,5,"{'alpha': 0, 'max_iter': 5}",0.747022,0.756482,0.680801,0.728101,0.033669,7,0.75572,0.740082,0.786156,0.760653,0.019131
2,0.000997,5.947204e-07,0.000106,0.00015,0.1,3,"{'alpha': 0.1, 'max_iter': 3}",0.747159,0.756462,0.680831,0.728151,0.033675,5,0.75572,0.740081,0.786156,0.760652,0.019131
3,0.0,0.0,0.001998,0.002826,0.1,5,"{'alpha': 0.1, 'max_iter': 5}",0.747159,0.756462,0.680831,0.728151,0.033675,5,0.75572,0.740081,0.786156,0.760652,0.019131
4,0.000908,0.0001373893,0.000332,0.000469,0.5,3,"{'alpha': 0.5, 'max_iter': 3}",0.747682,0.756385,0.680927,0.728331,0.033708,3,0.755705,0.74007,0.786141,0.760639,0.019129
5,0.0,0.0,0.0,0.0,0.5,5,"{'alpha': 0.5, 'max_iter': 5}",0.747682,0.756385,0.680927,0.728331,0.033708,3,0.755705,0.74007,0.786141,0.760639,0.019129
6,0.0,0.0,0.0,0.0,1.0,3,"{'alpha': 1, 'max_iter': 3}",0.748283,0.756292,0.680991,0.728522,0.033768,1,0.755663,0.740039,0.786097,0.7606,0.019124
7,0.000361,0.0005104826,0.004057,0.005738,1.0,5,"{'alpha': 1, 'max_iter': 5}",0.748283,0.756292,0.680991,0.728522,0.033768,1,0.755663,0.740039,0.786097,0.7606,0.019124


In [72]:
bsetCV.score(X_test_scaled, Y_test)

0.6744468321019557