Для выполнения домашнего задания необходимо взять boston house-prices datase (sklearn.datasets.load_boston) и сделать тоже самое для задачи регрессии (попробовать разные алгоритмы, поподбирать параметры, вывести итоговое качество).

In [224]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter('ignore')
from sklearn.model_selection import train_test_split
%config InlineBackend.figure_format = 'svg'
from sklearn.datasets import load_boston
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor

In [225]:
df = load_boston()
df = pd.concat([pd.DataFrame(df['data'], columns=df['feature_names']), pd.Series(df['target'],name='MEDV')],axis=1)

In [226]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [227]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null float64
TAX        506 non-null float64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
MEDV       506 non-null float64
dtypes: float64(14)
memory usage: 55.5 KB


In [228]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [229]:
models, predictions, scores = {}, {}, {}

### Gradient Boosting

In [230]:
params = {'n_estimators': np.linspace(1, 100, dtype=int),
          'max_depth': np.linspace(1, 100, dtype=int)}

rand_gb = RandomizedSearchCV(GradientBoostingRegressor(), param_distributions=params, cv=10)
rand_gb.fit(X_train,y_train)

RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                       criterion='friedman_mse',
                                                       init=None,
                                                       learning_rate=0.1,
                                                       loss='ls', max_depth=3,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                            

In [231]:
print(rand_gb.best_score_)
print(rand_gb.best_params_)

0.8379255669754316
{'n_estimators': 23, 'max_depth': 7}


In [232]:
params = {'n_estimators': [10,50,100,150],
          'max_depth': np.linspace(20, 100, 5, dtype=int)}

grid_gb = GridSearchCV(GradientBoostingRegressor(), param_grid=params, cv=10)
grid_gb.fit(X_train,y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_no_change=None,
                

In [233]:
print(grid_gb.best_score_)
print(grid_gb.best_params_)

0.8109112941029288
{'max_depth': 60, 'n_estimators': 100}


In [294]:
models['GradientBoosting'] = grid_gb.best_estimator_
predictions['GradientBoosting'] = grid_gb.predict(X_test)
scores['GradientBoosting'] = {'MAE': mean_absolute_error(y_test, predictions['GradientBoosting']), 
                          'MSE': mean_squared_error(y_test, predictions['GradientBoosting']),
                          'R2': r2_score(y_test, predictions['GradientBoosting'])}

print(f"GradientBoosting\n{10*'-'}\nMAE = {scores['GradientBoosting']['MAE']:.2f}\n"\
      f"MSE = {scores['GradientBoosting']['MSE']:.2f}\n"\
      f"R2 = {scores['GradientBoosting']['R2']:.2f}")

GradientBoosting
----------
MAE = 3.54
MSE = 33.08
R2 = 0.59


### Random Forest

In [235]:
params = {'n_estimators': np.linspace(1, 100, dtype=int),
          'max_depth': np.linspace(1, 100, dtype=int)}

rand_rf = RandomizedSearchCV(RandomForestRegressor(), param_distributions=params, cv=10)
rand_rf.fit(X_train,y_train)

RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                             

In [236]:
print(rand_rf.best_score_)
print(rand_rf.best_params_)

0.8660015311061103
{'n_estimators': 41, 'max_depth': 9}


In [237]:
params = {'n_estimators': [10,50,100,150],
          'max_depth': np.linspace(20, 100, 5, dtype=int)}

grid_rf = GridSearchCV(RandomForestRegressor(), param_grid=params, cv=10)
grid_rf.fit(X_train,y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_j

In [238]:
print(grid_rf.best_score_)
print(grid_rf.best_params_)

0.8697113154443104
{'max_depth': 80, 'n_estimators': 100}


In [295]:
models['RandomForest'] = grid_rf.best_estimator_
predictions['RandomForest'] = grid_rf.predict(X_test)
scores['RandomForest'] = {'MAE': mean_absolute_error(y_test, predictions['RandomForest']), 
                          'MSE': mean_squared_error(y_test, predictions['RandomForest']),
                          'R2': r2_score(y_test, predictions['RandomForest'])}

print(f"RandomForest\n{10*'-'}\nMAE = {scores['RandomForest']['MAE']:.2f}\n"\
      f"MSE = {scores['RandomForest']['MSE']:.2f}\n"\
      f"R2 = {scores['RandomForest']['R2']:.2f}")

RandomForest
----------
MAE = 2.69
MSE = 20.25
R2 = 0.75


### Extra Trees Regressor

In [240]:
params = {'n_estimators': np.linspace(1, 100, dtype=int),
          'max_depth': np.linspace(1, 100, dtype=int)}

rand_et = RandomizedSearchCV(ExtraTreesRegressor(), param_distributions=params, cv=10)
rand_et.fit(X_train,y_train)

RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0,
                                                 criterion='mse',
                                                 max_depth=None,
                                                 max_features='auto',
                                                 max_leaf_nodes=None,
                                                 max_samples=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100, n_jobs=None,
                                                 oob_score=Fals...
        27,  29,  31,  

In [241]:
print(rand_et.best_score_)
print(rand_et.best_params_)

0.9101128525728219
{'n_estimators': 83, 'max_depth': 43}


In [242]:
params = {'n_estimators': [10,50,100,150],
          'max_depth': np.linspace(20, 100, 5, dtype=int)}

grid_et = GridSearchCV(ExtraTreesRegressor(), param_grid=params, cv=10)
grid_et.fit(X_train,y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0,
                                           criterion='mse', max_depth=None,
                                           max_features='auto',
                                           max_leaf_nodes=None,
                                           max_samples=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                           min_samples_split=2,
                                           min_weight_fraction_leaf=0.0,
                                           n_estimators=100, n_jobs=None,
                                           oob_score=False, random_state=None,
                                           verbose=0, warm_start=False),
             iid='deprecated', n_jobs=None,
             pa

In [243]:
print(grid_et.best_score_)
print(grid_et.best_params_)

0.916146042415275
{'max_depth': 40, 'n_estimators': 100}


In [296]:
models['ExtraTrees'] = grid_et.best_estimator_
predictions['ExtraTrees'] = grid_et.predict(X_test)
scores['ExtraTrees'] = {'MAE': mean_absolute_error(y_test, predictions['ExtraTrees']), 
                          'MSE': mean_squared_error(y_test, predictions['ExtraTrees']),
                          'R2': r2_score(y_test, predictions['ExtraTrees'])}

print(f"ExtraTrees\n{10*'-'}\nMAE = {scores['ExtraTrees']['MAE']:.2f}\n"\
      f"MSE = {scores['ExtraTrees']['MSE']:.2f}\n"\
      f"R2 = {scores['ExtraTrees']['R2']:.2f}")

ExtraTrees
----------
MAE = 2.71
MSE = 21.54
R2 = 0.74


### Сравним результаты

In [264]:
for model in scores:
    print(f"{model:<16}\tMAE={scores[model]['MAE']:.2f}\tMSE={scores[model]['MSE']:.2f}\tR2={scores[model]['R2']:.2f}")

GradientBoosting	MAE=3.54	MSE=33.08	R2=0.59
RandomForest    	MAE=2.69	MSE=20.25	R2=0.75
ExtraTrees      	MAE=2.71	MSE=21.54	R2=0.74
