In [24]:
!pip install xgboost



In [25]:
#Import libraries:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

In [15]:
#Read the data:
data = pd.read_csv("../data/Melbourne_housing_FULL.csv")


In [18]:


#drop all the null vlaues based on the Price column:
data.dropna(subset=["Price"],inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27247 entries, 1 to 34856
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         27247 non-null  object 
 1   Address        27247 non-null  object 
 2   Rooms          27247 non-null  int64  
 3   Type           27247 non-null  object 
 4   Price          27247 non-null  float64
 5   Method         27247 non-null  object 
 6   SellerG        27247 non-null  object 
 7   Date           27247 non-null  object 
 8   Distance       27246 non-null  float64
 9   Postcode       27246 non-null  float64
 10  Bedroom2       20806 non-null  float64
 11  Bathroom       20800 non-null  float64
 12  Car            20423 non-null  float64
 13  Landsize       17982 non-null  float64
 14  BuildingArea   10656 non-null  float64
 15  YearBuilt      12084 non-null  float64
 16  CouncilArea    27244 non-null  object 
 17  Lattitude      20993 non-null  float64
 18  Longti

In [19]:
#using certain columns:
cols_to_use = ["Rooms", "Distance", "Landsize", "BuildingArea", "YearBuilt"]
X = data[cols_to_use]

In [20]:
#Select target:
y = data.Price

In [21]:
#Seperate data into training and validation sets:
X_train, X_valid, y_train, y_valid = train_test_split(X, y)

In this notebook, we will be exploring XGBoost library. XGboost stands for extreme gradient boosting which is an implementation of gradient boosting with several additional features focused on performance and speed.

In [22]:
model = XGBRegressor()
model.fit(X_train, y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [23]:
help(XGBRegressor)

Help on class XGBRegressor in module xgboost.sklearn:

class XGBRegressor(XGBModel, sklearn.base.RegressorMixin)
 |  XGBRegressor(*, objective: Union[str, Callable[[numpy.ndarray, numpy.ndarray], Tuple[numpy.ndarray, numpy.ndarray]], NoneType] = 'reg:squarederror', **kwargs: Any) -> None
 |  
 |  Implementation of the scikit-learn API for XGBoost regression.
 |  
 |  
 |  Parameters
 |  ----------
 |  
 |      n_estimators : int
 |          Number of gradient boosted trees.  Equivalent to number of boosting
 |          rounds.
 |  
 |      max_depth :  Optional[int]
 |          Maximum tree depth for base learners.
 |      max_leaves :
 |          Maximum number of leaves; 0 indicates no limit.
 |      max_bin :
 |          If using histogram-based algorithm, maximum number of bins per feature
 |      grow_policy :
 |          Tree growing policy. 0: favor splitting at nodes closest to the node, i.e. grow
 |          depth-wise. 1: favor splitting at nodes with highest loss change.
 | 

In [26]:
predictions = model.predict(X_valid)

In [28]:
print("Mean absolute error:",mean_absolute_error(y_true= y_valid, y_pred= predictions))

Mean absolute error: 233543.7948186564


In [29]:
#using hyper parameter tuning concept:
model = XGBRegressor(n_estimators = 500)
model.fit(X_train, y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=500, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [31]:
#using early stopping : it indicates the number of iterations as where to stop
model = XGBRegressor(n_estimators = 500)
model.fit(X_train, y_train,
          early_stopping_rounds = 5,
          eval_set = [(X_valid, y_valid)],
          verbose = True
          )

[0]	validation_0-rmse:930233.96475
[1]	validation_0-rmse:729387.88152
[2]	validation_0-rmse:601873.96149
[3]	validation_0-rmse:524734.76561
[4]	validation_0-rmse:480463.24049
[5]	validation_0-rmse:455242.98085
[6]	validation_0-rmse:440291.23348
[7]	validation_0-rmse:431241.10005
[8]	validation_0-rmse:425534.53403
[9]	validation_0-rmse:420260.47598
[10]	validation_0-rmse:415404.51754
[11]	validation_0-rmse:412354.36725
[12]	validation_0-rmse:411347.01764
[13]	validation_0-rmse:407462.40742
[14]	validation_0-rmse:404887.64221
[15]	validation_0-rmse:404360.31942




[16]	validation_0-rmse:403613.93647
[17]	validation_0-rmse:403261.80371
[18]	validation_0-rmse:402084.47104
[19]	validation_0-rmse:400969.56995
[20]	validation_0-rmse:398978.17846
[21]	validation_0-rmse:398299.32400
[22]	validation_0-rmse:398386.68008
[23]	validation_0-rmse:397391.58782
[24]	validation_0-rmse:397665.11123
[25]	validation_0-rmse:397683.45255
[26]	validation_0-rmse:397675.85862
[27]	validation_0-rmse:396279.39797
[28]	validation_0-rmse:396204.57983
[29]	validation_0-rmse:396327.72699
[30]	validation_0-rmse:396051.49242
[31]	validation_0-rmse:396164.07517
[32]	validation_0-rmse:395867.91283
[33]	validation_0-rmse:395585.64529
[34]	validation_0-rmse:395133.81752
[35]	validation_0-rmse:394682.58238
[36]	validation_0-rmse:394703.85290
[37]	validation_0-rmse:394513.19498
[38]	validation_0-rmse:393071.91983
[39]	validation_0-rmse:393464.62724
[40]	validation_0-rmse:392466.29536
[41]	validation_0-rmse:392424.28729
[42]	validation_0-rmse:392547.24056
[43]	validation_0-rmse:39204

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=500, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [32]:
#using learning rate: 
#Note: If you have lower learning rate, then you need more number of iterations to converge.
model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4)
model.fit(X_train, y_train,
          early_stopping_rounds=5,
          eval_set=[(X_valid, y_valid)],
          verbose=True,)

[0]	validation_0-rmse:1186456.99992
[1]	validation_0-rmse:1136861.12036
[2]	validation_0-rmse:1089996.84471
[3]	validation_0-rmse:1045925.06906
[4]	validation_0-rmse:1004498.13112
[5]	validation_0-rmse:965098.46242
[6]	validation_0-rmse:927785.01898
[7]	validation_0-rmse:892977.76100
[8]	validation_0-rmse:860183.32457
[9]	validation_0-rmse:829510.44454
[10]	validation_0-rmse:800864.91401
[11]	validation_0-rmse:773704.81771
[12]	validation_0-rmse:748066.91890
[13]	validation_0-rmse:724255.25526
[14]	validation_0-rmse:701955.19119
[15]	validation_0-rmse:681341.78306
[16]	validation_0-rmse:662043.39758
[17]	validation_0-rmse:643721.87088
[18]	validation_0-rmse:626496.34006
[19]	validation_0-rmse:610978.38965
[20]	validation_0-rmse:596589.80819




[21]	validation_0-rmse:583279.29968
[22]	validation_0-rmse:570491.60726
[23]	validation_0-rmse:558740.93422
[24]	validation_0-rmse:548236.17190
[25]	validation_0-rmse:538094.37986
[26]	validation_0-rmse:528864.57893
[27]	validation_0-rmse:520111.41018
[28]	validation_0-rmse:511962.48867
[29]	validation_0-rmse:504788.95230
[30]	validation_0-rmse:498018.68851
[31]	validation_0-rmse:491895.79388
[32]	validation_0-rmse:486156.08218
[33]	validation_0-rmse:480837.01349
[34]	validation_0-rmse:475722.87882
[35]	validation_0-rmse:471255.17110
[36]	validation_0-rmse:466922.02652
[37]	validation_0-rmse:462892.97316
[38]	validation_0-rmse:459351.75532
[39]	validation_0-rmse:455804.28534
[40]	validation_0-rmse:452875.52803
[41]	validation_0-rmse:450219.21507
[42]	validation_0-rmse:447541.30922
[43]	validation_0-rmse:444913.14230
[44]	validation_0-rmse:442640.32020
[45]	validation_0-rmse:440609.39392
[46]	validation_0-rmse:438718.51532
[47]	validation_0-rmse:436771.53990
[48]	validation_0-rmse:43490

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.05, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=500, n_jobs=4, num_parallel_tree=None, predictor=None,
             random_state=None, ...)

Note:
    1- Having high learning rate is not good
    2- Having low learning rate is also not good
    3- Having very high estimators are also not good

We need to find these values by building our own function