In [285]:
from pandas import read_csv
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

df = read_csv("./data/boston.csv")
df.tail()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.12,76.7,2.2875,1,273.0,21.0,396.9,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.9,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0
505,0.04741,0.0,11.93,0,0.573,6.03,80.8,2.505,1,273.0,21.0,396.9,7.88,11.9


In [286]:
X = df.drop("MEDV",axis=1)
y = df.MEDV

In [287]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [288]:
steps = Pipeline([
    ("Poly", PolynomialFeatures()),
    ("Scaler", StandardScaler()),
    ("RF", RandomForestRegressor())
])

#n_estimators is number of decision_tree that will be averaged to get the ensembled decision tree or random forest function
#min_samples_leaf is the minimum number of sample in the leaf, higher num of samples will create more general model
parameters = {
    "Poly__interaction_only" : [True, False],
    "RF__n_estimators" : [10, 20, 40, 60],
    "RF__max_features" : [0.1, 0.5, 'sqrt'],
    "RF__min_samples_leaf" : [1, 3] 
}

RF = GridSearchCV(steps, param_grid=parameters, cv=3)
RF.fit(X_train,y_train)
RF.best_score_

0.8364243219938771

In [289]:
RF.best_params_

{'Poly__interaction_only': True,
 'RF__max_features': 'sqrt',
 'RF__min_samples_leaf': 1,
 'RF__n_estimators': 20}

In [290]:
RF.grid_scores_



[mean: 0.82056, std: 0.03071, params: {'Poly__interaction_only': True, 'RF__max_features': 0.1, 'RF__min_samples_leaf': 1, 'RF__n_estimators': 10},
 mean: 0.79863, std: 0.02808, params: {'Poly__interaction_only': True, 'RF__max_features': 0.1, 'RF__min_samples_leaf': 1, 'RF__n_estimators': 20},
 mean: 0.81638, std: 0.02098, params: {'Poly__interaction_only': True, 'RF__max_features': 0.1, 'RF__min_samples_leaf': 1, 'RF__n_estimators': 40},
 mean: 0.82669, std: 0.02035, params: {'Poly__interaction_only': True, 'RF__max_features': 0.1, 'RF__min_samples_leaf': 1, 'RF__n_estimators': 60},
 mean: 0.79379, std: 0.06335, params: {'Poly__interaction_only': True, 'RF__max_features': 0.1, 'RF__min_samples_leaf': 3, 'RF__n_estimators': 10},
 mean: 0.80512, std: 0.01834, params: {'Poly__interaction_only': True, 'RF__max_features': 0.1, 'RF__min_samples_leaf': 3, 'RF__n_estimators': 20},
 mean: 0.80652, std: 0.03379, params: {'Poly__interaction_only': True, 'RF__max_features': 0.1, 'RF__min_samples

In [291]:
RF.score(X_train, y_train), RF.score(X_test, y_test)

(0.9740373953408598, 0.8372710850782765)

In [292]:
import numpy as np
RMSE = np.sqrt(np.mean((y_test-RF.predict(X_test))**2))
RMSE

3.4544937343675