In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [4]:

import zipfile

# Unpack the dataset archive next to the notebook so the CSV can be read below.
with zipfile.ZipFile(file='./final_dataset.zip', mode='r') as archive:
    archive.extractall(path='./')

In [5]:
# Load the CSV from the working directory into a DataFrame.
df = pd.read_csv('./final_dataset.csv')

In [6]:
# Separate the regression target (`ars_price`) from the predictor columns.
y = df['ars_price']
X = df.drop(columns=['ars_price'])

# Only the first five predictor columns are rescaled; the remaining columns
# are carried through untouched.
a = X.iloc[:, :5].to_numpy()
b = X.iloc[:, 5:].to_numpy()

# Standardise the leading columns, then stitch the untouched columns back on.
# NOTE(review): the scaler is fitted on every row, before any train/test split.
scaler = StandardScaler()
scaled_a = scaler.fit_transform(a)
scaled_X = np.hstack((scaled_a, b))

In [7]:
# Min-max rescale the same leading feature columns (`a`) with the scaler's
# default settings, then re-attach the untouched columns (`b`).
minmax = MinMaxScaler()
minmax_a = minmax.fit(a).transform(a)
minmax_X = np.hstack((minmax_a, b))

In [8]:
# Split every feature representation in ONE call so that the raw, standardised
# and min-max matrices share the same row partition and all stay aligned with
# y_train / y_test. Separate unseeded calls would each draw a different random
# partition, leaving only the last one consistent with the surviving y_* names.
# A fixed random_state makes the 90/10 split reproducible across kernel restarts.
(
    X_train, X_test,
    scaled_X_train, scaled_X_test,
    minmax_X_train, minmax_X_test,
    y_train, y_test,
) = train_test_split(
    X, scaled_X, minmax_X, y,
    test_size=0.10,
    random_state=42,
)

In [9]:
# Hyper-parameter search space handed to the grid search.
# 6 * 8 * 4 * 5 * 4 = 3840 candidate combinations.
param_grid = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth=[3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
)


In [10]:
# Base XGBoost regressor with library-default hyper-parameters; used as the
# estimator template for the grid search.
reg = xgb.XGBRegressor()

In [13]:
# Exhaustive search over `param_grid`, scored by negative MSE (higher is better).
# `cv` is pinned explicitly: the recorded run relied on the then-default 3 folds,
# and that default changed to 5 in scikit-learn 0.22, so leaving it unset would
# silently change the search depending on the installed version.
gs = GridSearchCV(
    reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,  # use every available core
    verbose=10,
    return_train_score=True,
)

In [14]:
# Run the grid search on the min-max-scaled training split.
gs.fit(minmax_X_train, y_train)

Fitting 3 folds for each of 3840 candidates, totalling 11520 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 96 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 229 tasks      | elapsed:   20.2s
[Parallel(n_jobs=-1)]: Done 258 tasks      | elapsed:   22.5s
[Parallel(n_jobs=-1)]: Done 289 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done 320 tasks      | elapsed:   25.5s
[Parallel(n_jobs=-1)]: Done 353 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  

[Parallel(n_jobs=-1)]: Done 10321 tasks      | elapsed: 14.8min
[Parallel(n_jobs=-1)]: Done 10466 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done 10613 tasks      | elapsed: 15.4min
[Parallel(n_jobs=-1)]: Done 10760 tasks      | elapsed: 15.6min
[Parallel(n_jobs=-1)]: Done 10909 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done 11058 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done 11209 tasks      | elapsed: 16.5min
[Parallel(n_jobs=-1)]: Done 11520 out of 11520 | elapsed: 17.1min finished
  if getattr(data, 'base', None) is not None and \




GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_sta...
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='warn', n_jobs=-1,
             param_grid={'colsample_bytree': [0.3, 0.4, 0.5, 0.7],
                         'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
                         'learning_rate': [0.05, 0.1, 0.

In [18]:
# Tabulate the per-candidate search results (fit/score timings, parameter
# values, per-split and mean train/test scores, and rank) for inspection.
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_gamma,param_learning_rate,param_max_depth,param_min_child_weight,params,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,3.447040,0.013734,0.122639,0.014716,0.3,0,0.05,3,1,"{'colsample_bytree': 0.3, 'gamma': 0.0, 'learn...",...,-1.843521e+13,-2.078455e+13,-1.942399e+13,9.944758e+11,3826,-1.689567e+13,-1.742835e+13,-1.694313e+13,-1.708905e+13,2.406998e+11
1,3.079478,0.207396,0.090537,0.021950,0.3,0,0.05,3,3,"{'colsample_bytree': 0.3, 'gamma': 0.0, 'learn...",...,-1.850119e+13,-2.079028e+13,-1.942497e+13,9.852711e+11,3831,-1.746133e+13,-1.750361e+13,-1.694992e+13,-1.730495e+13,2.516401e+11
2,3.347731,0.224032,0.103387,0.015666,0.3,0,0.05,3,5,"{'colsample_bytree': 0.3, 'gamma': 0.0, 'learn...",...,-1.829217e+13,-2.085712e+13,-1.937571e+13,1.084255e+12,3821,-1.747608e+13,-1.766879e+13,-1.699569e+13,-1.738019e+13,2.830351e+11
3,3.314419,0.226211,0.100815,0.022268,0.3,0,0.05,3,7,"{'colsample_bytree': 0.3, 'gamma': 0.0, 'learn...",...,-1.825262e+13,-2.119684e+13,-1.949965e+13,1.243386e+12,3836,-1.751297e+13,-1.773444e+13,-1.721367e+13,-1.748702e+13,2.133943e+11
4,3.849596,0.266123,0.113878,0.003661,0.3,0,0.05,4,1,"{'colsample_bytree': 0.3, 'gamma': 0.0, 'learn...",...,-1.396827e+13,-1.525678e+13,-1.472701e+13,5.503800e+11,3801,-1.181482e+13,-1.232840e+13,-1.177175e+13,-1.197165e+13,2.528685e+11
5,3.592087,0.248242,0.103675,0.012995,0.3,0,0.05,4,3,"{'colsample_bytree': 0.3, 'gamma': 0.0, 'learn...",...,-1.410406e+13,-1.536897e+13,-1.479624e+13,5.232546e+11,3806,-1.210899e+13,-1.245858e+13,-1.189951e+13,-1.215569e+13,2.306159e+11
6,3.616576,0.188657,0.101241,0.028130,0.3,0,0.05,4,5,"{'colsample_bytree': 0.3, 'gamma': 0.0, 'learn...",...,-1.393973e+13,-1.576273e+13,-1.483858e+13,7.444393e+11,3811,-1.245039e+13,-1.270567e+13,-1.212446e+13,-1.242684e+13,2.378650e+11
7,3.593857,0.045626,0.095285,0.006857,0.3,0,0.05,4,7,"{'colsample_bytree': 0.3, 'gamma': 0.0, 'learn...",...,-1.404102e+13,-1.633387e+13,-1.512027e+13,9.408456e+11,3816,-1.256996e+13,-1.283204e+13,-1.222285e+13,-1.254162e+13,2.495089e+11
8,4.147263,0.280801,0.154677,0.073901,0.3,0,0.05,5,1,"{'colsample_bytree': 0.3, 'gamma': 0.0, 'learn...",...,-1.223281e+13,-1.338400e+13,-1.301886e+13,5.562707e+11,3771,-9.700959e+12,-9.979989e+12,-9.513081e+12,-9.731343e+12,1.918215e+11
9,4.038531,0.087954,0.110041,0.002659,0.3,0,0.05,5,3,"{'colsample_bytree': 0.3, 'gamma': 0.0, 'learn...",...,-1.231342e+13,-1.364950e+13,-1.309151e+13,5.672132e+11,3781,-1.029230e+13,-1.018517e+13,-9.646012e+12,-1.004116e+13,2.828148e+11


In [21]:
# Hyper-parameter combination selected by the grid search.
opt_params = gs.best_params_

In [22]:
# Rebind `reg` to a fresh regressor built from the selected hyper-parameters
# and fit it on the min-max-scaled training split.
reg = xgb.XGBRegressor(**opt_params)
reg.fit(minmax_X_train, y_train)

  if getattr(data, 'base', None) is not None and \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0.0,
             importance_type='gain', learning_rate=0.2, max_delta_step=0,
             max_depth=8, min_child_weight=3, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [25]:
# Predict the target for the held-out min-max-scaled test split.
reg_pred = reg.predict(minmax_X_test)

In [26]:
# Root-mean-squared error of the hold-out predictions against the true targets.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=reg_pred))
print("RMSE: %f" % rmse)

RMSE: 780609.469161


In [27]:
# 5-fold cross-validation of the tuned regressor over the full min-max-scaled
# matrix, scored by negative MSE, keeping train scores as well.
# NOTE(review): minmax_X contains the rows used for the hyper-parameter search
# and was scaled using statistics from all rows, so these scores may be
# optimistic -- confirm this is the intended evaluation.
cv = cross_validate(reg, minmax_X, y, cv=5, n_jobs=-1, scoring='neg_mean_squared_error', verbose=1, return_train_score=True)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 96 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    8.9s finished


In [33]:
# The test scores are negative MSE values, so negate them before taking the
# square root to get one RMSE per fold.
# Note: this rebinds `rmse`, which previously held the scalar hold-out RMSE.
rmse = np.sqrt(-cv['test_score'])
rmse

array([1163284.27626112, 1062191.71949845, 1162591.46742052,
        681796.08301283,  860760.19773682])

In [37]:
import joblib

# Persist the fitted regressor to disk; the returned list of written file
# names is displayed as the cell output.
joblib.dump(reg, filename='xgb_model.pkl')

['xgb_model.pkl']