In [106]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import pickle
import joblib

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.linear_model import Ridge
from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

Loading CSVs
==

In [107]:
d_city = pd.read_csv('../data/diamonds_city.csv')
d_clarity = pd.read_csv('../data/diamonds_clarity.csv')
d_color = pd.read_csv('../data/diamonds_color.csv')
d_cut = pd.read_csv('../data/diamonds_cut.csv')
d_dimensions = pd.read_csv('../data/diamonds_dimensions.csv')
d_properties = pd.read_csv('../data/diamonds_properties.csv')
d_transactional = pd.read_csv('../data/diamonds_transactional.csv')
d_test = pd.read_csv('../data/diamonds_test.csv')

In [108]:
diamonds = d_dimensions.merge(d_properties, how='inner', on='index_id')
diamonds = diamonds.merge(d_cut, how='inner', on='cut_id')
diamonds = diamonds.merge(d_color, how='inner', on='color_id')
diamonds = diamonds.merge(d_clarity, how='inner', on='clarity_id')
diamonds = diamonds.merge(d_transactional, how='inner', on='index_id')
diamonds = diamonds.merge(d_city, how='inner', on='city_id')
diamonds = diamonds.drop(columns=['cut_id', 'color_id', 'clarity_id', 'city_id'])
diamonds

Unnamed: 0,index_id,depth,table,x,y,z,cut,color,clarity,price,carat,city
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,62.4,58.0,6.83,6.79,4.25,Premium,J,VS2,4268,1.21,Kimberly
1,248aa2bdd0032920ac9e5f6ad36c350549da067efeaf7b...,60.8,60.0,6.85,6.89,4.18,Premium,J,VS2,4839,1.20,Kimberly
2,72b31cf00f8ab3967588fad4a32f61622cb162f9b7bc2c...,60.6,59.0,4.34,4.38,2.64,Premium,J,VS2,368,0.30,Kimberly
3,98c53df687f2e9b94da80eef5b9049f1fac456b4c41c80...,62.6,57.0,6.80,6.72,4.23,Premium,J,VS2,5053,1.20,Kimberly
4,5dfe43a321c6834c7de273c73aeadc705d919a5869e0f5...,59.4,62.0,6.66,6.58,3.93,Premium,J,VS2,3593,1.05,Kimberly
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,e03a231c5b52635043e7dc5f0c6c9f16722e14dbcc98bb...,61.8,56.0,4.42,4.46,2.74,Ideal,F,IF,978,0.33,Zurich
40451,90dcb905e13140ff99770039b843fb62fb179ab4a3bae9...,61.6,56.0,4.43,4.47,2.74,Ideal,F,IF,929,0.32,Zurich
40452,97d3c3344c245422ee7fa4f448b2cace9940121620df22...,62.4,55.0,4.20,4.17,2.61,Ideal,F,IF,828,0.28,Zurich
40453,e4dc4e0761ccc6fbb4c064517e40f3582522c325f9a4e5...,62.0,58.0,6.44,6.49,4.01,Ideal,F,IF,11116,1.02,Zurich


In [109]:
diamonds

Unnamed: 0,index_id,depth,table,x,y,z,cut,color,clarity,price,carat,city
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,62.4,58.0,6.83,6.79,4.25,Premium,J,VS2,4268,1.21,Kimberly
1,248aa2bdd0032920ac9e5f6ad36c350549da067efeaf7b...,60.8,60.0,6.85,6.89,4.18,Premium,J,VS2,4839,1.20,Kimberly
2,72b31cf00f8ab3967588fad4a32f61622cb162f9b7bc2c...,60.6,59.0,4.34,4.38,2.64,Premium,J,VS2,368,0.30,Kimberly
3,98c53df687f2e9b94da80eef5b9049f1fac456b4c41c80...,62.6,57.0,6.80,6.72,4.23,Premium,J,VS2,5053,1.20,Kimberly
4,5dfe43a321c6834c7de273c73aeadc705d919a5869e0f5...,59.4,62.0,6.66,6.58,3.93,Premium,J,VS2,3593,1.05,Kimberly
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,e03a231c5b52635043e7dc5f0c6c9f16722e14dbcc98bb...,61.8,56.0,4.42,4.46,2.74,Ideal,F,IF,978,0.33,Zurich
40451,90dcb905e13140ff99770039b843fb62fb179ab4a3bae9...,61.6,56.0,4.43,4.47,2.74,Ideal,F,IF,929,0.32,Zurich
40452,97d3c3344c245422ee7fa4f448b2cace9940121620df22...,62.4,55.0,4.20,4.17,2.61,Ideal,F,IF,828,0.28,Zurich
40453,e4dc4e0761ccc6fbb4c064517e40f3582522c325f9a4e5...,62.0,58.0,6.44,6.49,4.01,Ideal,F,IF,11116,1.02,Zurich


Features
==

In [110]:
def super_feature(df):
    return  (df['depth'] * df['table'])/1000
def super_feature2(df):

    x = df['x']*df['y']*df['z']/10000
    x = df['carat']/x

    return np.where(x != np.inf , x, 61)

diamonds['super_feature'] = super_feature(diamonds)
#diamonds['super_feature2'] = super_feature2(diamonds)

diamonds

Unnamed: 0,index_id,depth,table,x,y,z,cut,color,clarity,price,carat,city,super_feature
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,62.4,58.0,6.83,6.79,4.25,Premium,J,VS2,4268,1.21,Kimberly,3.6192
1,248aa2bdd0032920ac9e5f6ad36c350549da067efeaf7b...,60.8,60.0,6.85,6.89,4.18,Premium,J,VS2,4839,1.20,Kimberly,3.6480
2,72b31cf00f8ab3967588fad4a32f61622cb162f9b7bc2c...,60.6,59.0,4.34,4.38,2.64,Premium,J,VS2,368,0.30,Kimberly,3.5754
3,98c53df687f2e9b94da80eef5b9049f1fac456b4c41c80...,62.6,57.0,6.80,6.72,4.23,Premium,J,VS2,5053,1.20,Kimberly,3.5682
4,5dfe43a321c6834c7de273c73aeadc705d919a5869e0f5...,59.4,62.0,6.66,6.58,3.93,Premium,J,VS2,3593,1.05,Kimberly,3.6828
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,e03a231c5b52635043e7dc5f0c6c9f16722e14dbcc98bb...,61.8,56.0,4.42,4.46,2.74,Ideal,F,IF,978,0.33,Zurich,3.4608
40451,90dcb905e13140ff99770039b843fb62fb179ab4a3bae9...,61.6,56.0,4.43,4.47,2.74,Ideal,F,IF,929,0.32,Zurich,3.4496
40452,97d3c3344c245422ee7fa4f448b2cace9940121620df22...,62.4,55.0,4.20,4.17,2.61,Ideal,F,IF,828,0.28,Zurich,3.4320
40453,e4dc4e0761ccc6fbb4c064517e40f3582522c325f9a4e5...,62.0,58.0,6.44,6.49,4.01,Ideal,F,IF,11116,1.02,Zurich,3.5960


In [111]:
x_columns = ['depth', 'table','cut', 'color', 'clarity', 'carat', 'super_feature']
X = diamonds[x_columns]

y = diamonds['price']

In [112]:
#diamonds['super_feature2'].mean()

Encoding
==

In [113]:
X_encoding = pd.get_dummies(X, columns=[ 'cut', 'color', 'clarity'], drop_first=True)
X_encoding

Unnamed: 0,depth,table,carat,super_feature,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,color_F,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,62.4,58.0,1.21,3.6192,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
1,60.8,60.0,1.20,3.6480,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,60.6,59.0,0.30,3.5754,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,62.6,57.0,1.20,3.5682,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,59.4,62.0,1.05,3.6828,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,61.8,56.0,0.33,3.4608,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
40451,61.6,56.0,0.32,3.4496,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
40452,62.4,55.0,0.28,3.4320,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
40453,62.0,58.0,1.02,3.5960,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0


In [114]:
def myEncoding(df):
    return pd.get_dummies(df, columns=['cut', 'color', 'clarity'], drop_first=True)

Scaling
==

In [115]:
def scaling(df):
    
    #scaler = MinMaxScaler()
    #scaler = StandardScaler()
    scaler = RobustScaler()
    scaling = scaler.fit_transform(df)
    scaled_df = pd.DataFrame(scaling)
    
    return scaled_df

In [116]:
x_scaling = scaling(X_encoding)

Train, Test, Split
==

In [117]:
X_train, X_test, y_train, y_test = train_test_split(x_scaling, y, test_size=0.2, random_state=42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")

X_train: (32364, 21), X_test: (8091, 21), y_train: (32364,), y_test: (8091,)


In [118]:
#regressor = LinearRegression()
#regressor = RandomForestRegressor()
#regressor = SVR(gamma=3)
regressor = XGBRegressor()
hyperparameters = regressor.get_params()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

print('Model:', regressor, '\n')
print('Model hyperparameters:', hyperparameters, '\n')
print('Ground truth target:', y_test, '\n')
print('Predicted target:', y_pred, '\n')

Model: XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...) 

Model hyperparameters: {'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'gamma': None, 'gpu_id': None

Predictions
==

In [119]:

d_test['super_feature'] = super_feature(d_test)
#d_test['super_feature2'] = super_feature2(d_test)

d_test = d_test[x_columns]

x_pred = myEncoding(d_test)
x_pred = scaling(x_pred)

In [120]:
d_predictions = regressor.predict(x_pred).clip(0, 30000)
d_predictions

array([2826.9023, 5418.4336, 9849.939 , ..., 3220.1785, 2123.2998,
        888.8553], dtype=float32)

In [121]:
rmse = mean_squared_error(y_test, y_pred, squared=False)
rmse

576.370605300724

In [122]:

predictions = pd.DataFrame(d_predictions).rename(columns={0:'price'})
predictions.index.names = ['id']
predictions

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,2826.902344
1,5418.433594
2,9849.939453
3,4107.329590
4,1595.610596
...,...
13480,1685.029541
13481,2641.312500
13482,3220.178467
13483,2123.299805


In [123]:
predictions.to_csv('../results/predictions.csv')

In [124]:
r2 = r2_score(y_test, y_pred)
r2

0.9784823707579583

## Hyperparameter Tunning and Cross Validation
---

In [125]:
gridsearch_params = {
    (max_depth, min_child_weight, subsample, colsample, eta)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
    for eta in [.3, .2, .1, .05, .01, .005]
}

In [126]:
param_grid = {'min_child_weight': [i for i in range(9,12)],
              'max_depth': [i for i in range(5,8)],
             'subsample': [i/10. for i in range(7,11)],
              'colsample': [i/10. for i in range(7,11)],
             'eta': [.3, .2, .1, .05, .01, .005]}


In [127]:
param_grid

{'min_child_weight': [9, 10, 11],
 'max_depth': [5, 6, 7],
 'subsample': [0.7, 0.8, 0.9, 1.0],
 'colsample': [0.7, 0.8, 0.9, 1.0],
 'eta': [0.3, 0.2, 0.1, 0.05, 0.01, 0.005]}

In [128]:
grid_search = RandomizedSearchCV(regressor,
                           param_grid,
                           cv=5,
                           verbose=3,
                           scoring='neg_root_mean_squared_error',
                           error_score='raise',
                           
                           n_jobs=-1)

In [129]:
%%time

grid_search.fit(x_scaling,y)

print('\n')
print('Best hyperparameters: ', grid_search.best_params_, '\n')
print('Best score: ', -grid_search.best_score_, '\n')

Fitting 5 folds for each of 10 candidates, totalling 50 fits




Parameters: { "colsample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




Best hyperparameters:  {'subsample': 1.0, 'min_child_weight': 10, 'max_depth': 6, 'eta': 0.05, 'colsample': 0.9} 

Best score:  590.0004912436336 

CPU times: user 6.31 s, sys: 1.8 s, total: 8.11 s
Wall time: 20.3 s


In [103]:
X.head()

Unnamed: 0,depth,table,cut,color,clarity,carat,super_feature
0,62.4,58.0,Premium,J,VS2,1.21,3.6192
1,60.8,60.0,Premium,J,VS2,1.2,3.648
2,60.6,59.0,Premium,J,VS2,0.3,3.5754
3,62.6,57.0,Premium,J,VS2,1.2,3.5682
4,59.4,62.0,Premium,J,VS2,1.05,3.6828
