In [5]:
import pandas as pd
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split, GridSearchCV

# import models
from sklearn.ensemble import RandomForestRegressor

# Seed shared by the train/test split and the forest so reruns are repeatable.
SEED = 1

# Location of the CSV file loaded below.
WBC_URL = "https://assets.datacamp.com/production/repositories/1796/datasets/0eb6987cb9633e4d6aa6cfd11e00993d2387caa4/wbc.csv"

# read data
wbc = pd.read_csv(WBC_URL)

# Drop the two columns that are not used as features, then one-hot encode
# whatever non-numeric columns remain.  `drop_first=True` keeps one indicator
# per two-level column, which is where the `diagnosis_M` column used as the
# target comes from.  `dtype=int` makes the indicators plain 0/1 integers
# regardless of the pandas version's default indicator dtype.
wbc_df = wbc.drop(columns=['id', 'Unnamed: 32'])
wbc_df = pd.get_dummies(wbc_df, drop_first=True, dtype=int)

# Feature matrix = every column except the target; target = the indicator.
X = wbc_df.drop(columns='diagnosis_M').values
y = wbc_df['diagnosis_M'].values

# split data: hold out 30% of the rows for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=SEED,
)

# NOTE(review): `diagnosis_M` is a 0/1 indicator, so this fits a regressor to
# a binary label and reports RMSE on it.  A RandomForestClassifier with a
# classification metric may be the better fit -- confirm the intent before
# changing, since the recorded output below depends on the regressor.
rf = RandomForestRegressor(random_state=SEED)

# Hyperparameter grid: 3 * 3 * 2 * 2 = 36 candidate combinations.
# Float values for `min_samples_leaf` are interpreted by scikit-learn as a
# fraction of the training samples rather than an absolute count.
params_rf = {
    'n_estimators': [300, 400, 500],
    'max_depth': [4, 6, 8],
    'min_samples_leaf': [0.1, 0.2],
    'max_features': ['log2', 'sqrt'],
}

# Exhaustive search with 3-fold cross-validation (36 candidates * 3 folds =
# 108 fits).  GridSearchCV maximises the score, hence the *negated* MSE.
# n_jobs=-1 runs the fits on all available cores.
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=params_rf,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1,
)

grid_rf.fit(X_train, y_train)

best_hyperparams = grid_rf.best_params_
print('Best hyperparameters:\n', best_hyperparams)

# With the default refit behaviour, best_estimator_ is the winning
# configuration retrained on the full training split.
best_model = grid_rf.best_estimator_

y_pred = best_model.predict(X_test)

# RMSE is the square root of the mean squared error on the held-out rows.
rmse_test = MSE(y_test, y_pred) ** 0.5

print(f'Test set RMSE of rf: {rmse_test:.2f}')


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best hyperparameters:
 {'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'n_estimators': 500}
Test set RMSE of rf: 0.26
