In [22]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error as MSE

In [4]:
ames_preprocessed = pd.read_csv("https://assets.datacamp.com/production/repositories/943/datasets/4dbcaee889ef06fb0763e4a8652a4c1f268359b2/ames_housing_trimmed_processed.csv")

X = ames_preprocessed.drop('SalePrice', axis = 1)
y = ames_preprocessed['SalePrice']

In [6]:
# Create your housing DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary for each tree (boosting round)
params = {"objective":"reg:squarederror", "max_depth":3}

# Create list of eta values and empty list to store final round rmse per xgboost model
eta_vals = [0.001, 0.01, 0.1]
best_rmse = []

# Systematically vary the eta 
for curr_val in eta_vals:

    params["eta"] = curr_val
    
    # Perform cross-validation: cv_results
    cv_results = xgb.cv(dtrain = housing_dmatrix,
                               params = params,
                               nfold = 3,
                               num_boost_round = 10,
                               early_stopping_rounds=10,
                               metrics = "rmse",
                               as_pandas = True,
                               seed = 123)
    
    
    
    # Append the final round rmse to best_rmse
    best_rmse.append(cv_results["test-rmse-mean"].tail().values[-1])

# Print the resultant DataFrame
print(pd.DataFrame(list(zip(eta_vals, best_rmse)), columns=["eta","best_rmse"]))

     eta     best_rmse
0  0.001  78903.745397
1  0.010  74293.709019
2  0.100  47136.241898


In [24]:
# using GridSearchCV

X, y = ames_preprocessed[ames_preprocessed.columns.tolist()[:-1]], ames_preprocessed[ames_preprocessed.columns.tolist()[-1]]

housing_dmatrix = xgb.DMatrix(data = X, label = y)

gbm_param_grid = {'learning_rate': [0.01, 0.1, 0.5, 0.9],
                  'n_estimators': [200],
                  'subsample': [0.3, 0.5, 0.9]}

gbm = xgb.XGBRegressor()

grid_mse = GridSearchCV(estimator = gbm,
                        param_grid = gbm_param_grid,
                        scoring = 'neg_mean_squared_error',
                        cv = 4,
                        verbose = 1)

grid_mse.fit(X, y)

print("Best parameters found: ", grid_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))

Fitting 4 folds for each of 12 candidates, totalling 48 fits
Best parameters found:  {'learning_rate': 0.1, 'n_estimators': 200, 'subsample': 0.5}
Lowest RMSE found:  28509.31498929468


In [28]:
# test random search

from sklearn.model_selection import RandomizedSearchCV

gbm_param_grid = {'learning_rate': np.arange(0.05, 1.05, 0.05),
                  'n_estimators': [200],
                  'subsample': np.arange(0.05, 1.05, 0.05)}

gbm = xgb.XGBRegressor()

randomized_mse = RandomizedSearchCV(estimator = gbm,
                        param_distributions = gbm_param_grid,
                        scoring = 'neg_mean_squared_error',
                        n_iter = 25,
                        cv = 4,
                        verbose = 1)

randomized_mse.fit(X, y)

print("Best parameters found: ", randomized_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

Fitting 4 folds for each of 25 candidates, totalling 100 fits
Best parameters found:  {'subsample': 0.7000000000000001, 'n_estimators': 200, 'learning_rate': 0.3}
Lowest RMSE found:  29654.069497378503
