In [17]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

In [18]:
# Path of the input CSV file read by the loading cell below.
csv_in = "ai-end2-5.csv"

In [19]:
# Load the dataset; the first row of the CSV supplies the column names.
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" after the summary.
df.info()
display(df.head())

(506, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   MEDV     506 non-null    float64
 1   CRIM     506 non-null    float64
 2   ZN       506 non-null    float64
 3   INDUS    506 non-null    float64
 4   CHAS     506 non-null    float64
 5   NOX      506 non-null    float64
 6   RM       506 non-null    float64
 7   AGE      506 non-null    float64
 8   DIS      506 non-null    float64
 9   RAD      506 non-null    float64
 10  TAX      506 non-null    float64
 11  PTRATIO  506 non-null    float64
 12  B        506 non-null    float64
 13  LSTAT    506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB
None


Unnamed: 0,MEDV,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,24.0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,21.6,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,34.7,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,33.4,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,36.2,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [20]:
# Separate features and target by column NAME rather than by column position:
# slicing from 'CRIM' onward only excludes the target while MEDV happens to
# come before CRIM in the file, and would leak it into X if the CSV were reordered.
target_col = 'MEDV'
X = df.drop(columns=target_col)  # explanatory variables: every column except the target
y = df[target_col]  # objective variable
print('X:', X.shape)
display(X.head())
print('y:', y.shape)
print(y.head())

X: (506, 13)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


y: (506,)
0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: MEDV, dtype: float64


In [21]:
# Base estimator; the fixed seed keeps the forest reproducible between runs.
rfr = RandomForestRegressor(random_state=20)

# Candidate values explored by the grid search (3 x 3 = 9 combinations).
param_grid = dict(
    n_estimators=[50, 100, 500],
    max_depth=[4, 6, None],
)

In [22]:
# Two independently seeded, shuffled 4-fold splitters for nested cross-validation.
# Inner splitter: used by the grid search to select hyperparameters.
grid_cv = KFold(
    n_splits=4,
    shuffle=True,
    random_state=21,
)
# Outer splitter: used to estimate generalization performance.
gen_cv = KFold(
    n_splits=4,
    shuffle=True,
    random_state=22,
)

In [23]:
# Grid search over param_grid, scored by negated MSE on the inner folds.
gs = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=grid_cv,
)

In [24]:
%%time
nested_score = cross_val_score(gs, X=X, y=y, cv=gen_cv,
                               scoring='neg_mean_squared_error')
print(nested_score)
print(np.sqrt(-nested_score.mean()))

[-14.80570986  -8.79561684  -9.78868283 -10.19374566]
3.300899695207208
Wall time: 1min 8s


In [25]:
%%time
gs.fit(X, y)
gs_best = gs.best_estimator_

Wall time: 20.9 s


In [26]:
# Show the estimator selected by the grid search.
print(gs_best)

RandomForestRegressor(n_estimators=500, random_state=20)


In [27]:
# Label each importance score of the selected model with its feature's column name.
importances = pd.Series(data=gs_best.feature_importances_, index=X.columns)
print(importances)

CRIM       0.038219
ZN         0.001114
INDUS      0.006242
CHAS       0.000749
NOX        0.024441
RM         0.429374
AGE        0.013591
DIS        0.066676
RAD        0.003771
TAX        0.013692
PTRATIO    0.016565
B          0.011536
LSTAT      0.374032
dtype: float64
