In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

In [5]:
# Location of the processed Ames housing CSV.
ames_csv_url = "https://assets.datacamp.com/production/repositories/943/datasets/4dbcaee889ef06fb0763e4a8652a4c1f268359b2/ames_housing_trimmed_processed.csv"
ames_preprocessed = pd.read_csv(ames_csv_url)

# Split the frame into the regression target (SalePrice) and every other column.
target_column = 'SalePrice'
y = ames_preprocessed[target_column]
X = ames_preprocessed.drop(columns=[target_column])

In [9]:
# Cross-validate the tuned parameter set and report the mean test RMSE
# reached after the final boosting round.
housing_dmatrix = xgb.DMatrix(data=X, label=y)
tuned_params = {"objective": "reg:squarederror",
                'colsample_bytree': 0.3,
                'learning_rate': 0.1,
                'max_depth': 5}

# 4-fold CV over 200 boosting rounds; as_pandas=True returns one row per round.
tuned_cv_results_rmse = xgb.cv(dtrain=housing_dmatrix,
                               params=tuned_params,
                               nfold=4,
                               num_boost_round=200,
                               metrics="rmse",
                               as_pandas=True,
                               seed=123)

# Pull the last row out as a scalar with .iloc[-1]. Formatting the one-element
# Series returned by .tail(1) with "%f" relies on implicit Series -> float
# conversion, which recent pandas versions deprecate.
final_tuned_rmse = tuned_cv_results_rmse["test-rmse-mean"].iloc[-1]
print("Tuned rmse: %f" % final_tuned_rmse)

Tuned rmse: 29641.548619


In [11]:
# Compare cross-validated RMSE across several boosting-round counts

# DMatrix built from the feature columns and the target
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Tree parameters shared by every run
params = {"objective": "reg:squarederror", "max_depth": 3}

# Boosting-round counts to evaluate
num_rounds = [5, 10, 15]

# Collects the last-round RMSE of each run, in the same order as num_rounds
final_rmse_per_round = []

for curr_num_rounds in num_rounds:
    # 3-fold cross-validation with a fixed seed for this round count
    cv_results = xgb.cv(
        params=params,
        dtrain=housing_dmatrix,
        num_boost_round=curr_num_rounds,
        nfold=3,
        metrics="rmse",
        as_pandas=True,
        seed=123,
    )

    # Keep only the mean test RMSE from the last boosting round
    last_round_rmse = cv_results["test-rmse-mean"].iloc[-1]
    final_rmse_per_round.append(last_round_rmse)

# Pair each round count with its final RMSE and print the table
num_rounds_rmses = list(zip(num_rounds, final_rmse_per_round))
rmse_table = pd.DataFrame(num_rounds_rmses, columns=["num_boosting_rounds", "rmse"])
print(rmse_table)

   num_boosting_rounds          rmse
0                    5  40350.042785
1                   10  34222.544068
2                   15  32537.190260
