# 보스턴 집값 예측

## 패키지 로딩

In [14]:
import warnings
# Silence every warning category for the rest of the session.
# Passing 'default' as the action instead makes warnings visible again.
warnings.filterwarnings("ignore")

In [29]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

In [16]:
# Load the Boston housing data as a (features, target) pair of arrays.
# NOTE(review): per the scikit-learn release notes, load_boston was deprecated
# in 1.0 and removed in 1.2, so this cell only runs on older releases.
x, y = load_boston(return_X_y=True)

## 학습, 평가 데이터 분할

In [24]:
# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# mean/variance of the held-out rows leak into the preprocessing step.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=10
)

# Learn the standardisation statistics from the training rows only, then apply
# the same transformation to both partitions.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Kept so any later cell that still references `scaled_x` keeps working; it is
# now scaled with the training-set statistics.
scaled_x = scaler.transform(x)

## 모델 생성 및 검증

In [25]:
# Number of trees grown by the random forest.
n_estimators = 20

# Fixed random_state keeps the fitted forest reproducible between runs.
model = RandomForestRegressor(
    random_state=0,
    n_estimators=n_estimators,
)
# Left as the last expression so the notebook echoes the fitted estimator.
model.fit(x_train, y_train)

RandomForestRegressor(n_estimators=20, random_state=0)

In [30]:
# Predict on the held-out rows and report the regression metrics.
y_hat = model.predict(x_test)

# Compute each metric once; RMSE is derived from the MSE value.
mse = mean_squared_error(y_test, y_hat)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_hat)

print(f'n_estimators: {n_estimators}개')
print(f'MSE : {mse:.3f}')
print(f'RMSE: {rmse:.3f}')
print(f'R**2: {r2:.3f}')

n_estimators: 20개
MSE : 12.050
RMSE: 3.471
R**2: 0.877


## 하이퍼파라미터

In [32]:
# Show the names in scikit-learn's scorer registry (e.g. the 'r2' string that
# is passed as `scoring` to GridSearchCV further down).
# NOTE(review): newer scikit-learn releases appear to have dropped SCORERS in
# favour of sklearn.metrics.get_scorer_names() -- confirm against the
# installed version before upgrading.
from sklearn.metrics import SCORERS
SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [33]:
from sklearn.model_selection import GridSearchCV

# max_features: how many features each tree may consider when looking for the
#               best split.
# bootstrap:    whether rows are sampled with replacement; False turns that off.
#
# Two separate grids are searched: one with the default bootstrap setting and
# a smaller one with bootstrapping disabled.
params = [
    dict(
        n_estimators=[3, 10, 20, 30, 40, 50],
        max_features=[2, 4, 6, 8],
    ),
    dict(
        n_estimators=[3, 10],
        max_features=[2, 3, 4],
        bootstrap=[False],
    ),
]

# 5-fold cross-validated search scored by R^2 on the training partition.
gs = GridSearchCV(estimator=model, param_grid=params, scoring='r2', cv=5)
gs.fit(x_train, y_train)

print('best params', gs.best_params_)

best params {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}


In [35]:
# Score the best estimator found by the grid search on the held-out rows.
best_model = gs.best_estimator_
y_hat = best_model.predict(x_test)

# Compute each metric once; RMSE is derived from the MSE value.
mse = mean_squared_error(y_test, y_hat)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_hat)

print('best params', gs.best_params_)
print(f'MSE : {mse:.3f}')
print(f'RMSE: {rmse:.3f}')
print(f'R**2: {r2:.3f}')

best params {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}
MSE : 10.648
RMSE: 3.263
R**2: 0.891
