# Setup the enviroment

In [1]:
import os.path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import scipy.stats as stats
from sklearn.metrics import mean_absolute_percentage_error
from sklearn import metrics
warnings.simplefilter('ignore')

# Read the data

In [2]:
print(os.path.exists("rgr_preprocessed.csv"))

True


In [3]:
df = pd.read_csv("rgr_preprocessed.csv")

### Splitting the data

In [4]:
from sklearn.model_selection import train_test_split

X = df.drop(['Price', 'Unnamed: 0'], axis=1)
Y = df['Price']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.9)

# Baseline modeling
As a baseline model I'll use Desicion Tree(because it is model, that has less than 100% accuracy).

In [5]:
from xgboost import XGBRegressor

model = XGBRegressor(verbosity=0)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
print('test set metrics: ', mean_absolute_percentage_error(Y_test, Y_pred))

test set metrics:  1.1610079719196602


# Hyperparameter tuning

In [6]:
model.get_params()

{'objective': 'reg:squarederror',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': 0}

In [7]:
%%time

from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'max_depth': stats.randint(3, 10),
    'learning_rate': stats.uniform(0.01, 0.001),
    'subsample': stats.uniform(0.5, 0.1),
    'n_estimators':stats.randint(50, 200)
}

xgb_model = XGBRegressor()

random_search = RandomizedSearchCV(xgb_model, param_distributions=param_dist, n_iter=50, cv=5, scoring='accuracy', verbose=3, return_train_score=True, refit='f1_weighted')

random_search.fit(X_train, Y_train)

print("The best parameters are %s with a score of %0.2f"
      % (random_search.best_params_, random_search.best_score_))

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END learning_rate=0.010677814918166561, max_depth=7, n_estimators=178, subsample=0.5789094950420522;, score=(train=nan, test=nan) total time=   0.0s
[CV 2/5] END learning_rate=0.010677814918166561, max_depth=7, n_estimators=178, subsample=0.5789094950420522;, score=(train=nan, test=nan) total time=   0.0s
[CV 3/5] END learning_rate=0.010677814918166561, max_depth=7, n_estimators=178, subsample=0.5789094950420522;, score=(train=nan, test=nan) total time=   0.0s
[CV 4/5] END learning_rate=0.010677814918166561, max_depth=7, n_estimators=178, subsample=0.5789094950420522;, score=(train=nan, test=nan) total time=   0.0s
[CV 5/5] END learning_rate=0.010677814918166561, max_depth=7, n_estimators=178, subsample=0.5789094950420522;, score=(train=nan, test=nan) total time=   0.0s
[CV 1/5] END learning_rate=0.010365902479881712, max_depth=7, n_estimators=57, subsample=0.544424687200449;, score=(train=nan, test=nan) total time=

In [11]:
Y_pred1 = model.predict(X_test)
print('Original model MAPE: ', mean_absolute_percentage_error(Y_test, Y_pred1))

better_model = XGBRegressor(learning_rate=0.010934661031570524, max_depth=8, n_estimators=166, subsample=0.5431552041024764)
better_model.fit(X_train, Y_train)
Y_pred2 = better_model.predict(X_test)
print('Better model MAPE: ', mean_absolute_percentage_error(Y_test, Y_pred2))

Original model MAPE:  1.1610079719196602
Better model MAPE: 0.871054964205105
