In [45]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import pickle
import joblib

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.linear_model import Ridge
from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold

import category_encoders
from category_encoders import TargetEncoder

import catboost as cb

Loading CSVs
==

In [46]:
d_city = pd.read_csv('../data/diamonds_city.csv')
d_clarity = pd.read_csv('../data/diamonds_clarity.csv')
d_color = pd.read_csv('../data/diamonds_color.csv')
d_cut = pd.read_csv('../data/diamonds_cut.csv')
d_dimensions = pd.read_csv('../data/diamonds_dimensions.csv')
d_properties = pd.read_csv('../data/diamonds_properties.csv')
d_transactional = pd.read_csv('../data/diamonds_transactional.csv')
d_test = pd.read_csv('../data/diamonds_test.csv')

In [47]:
diamonds = d_dimensions.merge(d_properties, how='inner', on='index_id')
diamonds = diamonds.merge(d_cut, how='inner', on='cut_id')
diamonds = diamonds.merge(d_color, how='inner', on='color_id')
diamonds = diamonds.merge(d_clarity, how='inner', on='clarity_id')
diamonds = diamonds.merge(d_transactional, how='inner', on='index_id')
diamonds = diamonds.merge(d_city, how='inner', on='city_id')
diamonds = diamonds.drop(columns=['cut_id', 'color_id', 'clarity_id', 'city_id'])
diamonds

Unnamed: 0,index_id,depth,table,x,y,z,cut,color,clarity,price,carat,city
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,62.4,58.0,6.83,6.79,4.25,Premium,J,VS2,4268,1.21,Kimberly
1,248aa2bdd0032920ac9e5f6ad36c350549da067efeaf7b...,60.8,60.0,6.85,6.89,4.18,Premium,J,VS2,4839,1.20,Kimberly
2,72b31cf00f8ab3967588fad4a32f61622cb162f9b7bc2c...,60.6,59.0,4.34,4.38,2.64,Premium,J,VS2,368,0.30,Kimberly
3,98c53df687f2e9b94da80eef5b9049f1fac456b4c41c80...,62.6,57.0,6.80,6.72,4.23,Premium,J,VS2,5053,1.20,Kimberly
4,5dfe43a321c6834c7de273c73aeadc705d919a5869e0f5...,59.4,62.0,6.66,6.58,3.93,Premium,J,VS2,3593,1.05,Kimberly
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,e03a231c5b52635043e7dc5f0c6c9f16722e14dbcc98bb...,61.8,56.0,4.42,4.46,2.74,Ideal,F,IF,978,0.33,Zurich
40451,90dcb905e13140ff99770039b843fb62fb179ab4a3bae9...,61.6,56.0,4.43,4.47,2.74,Ideal,F,IF,929,0.32,Zurich
40452,97d3c3344c245422ee7fa4f448b2cace9940121620df22...,62.4,55.0,4.20,4.17,2.61,Ideal,F,IF,828,0.28,Zurich
40453,e4dc4e0761ccc6fbb4c064517e40f3582522c325f9a4e5...,62.0,58.0,6.44,6.49,4.01,Ideal,F,IF,11116,1.02,Zurich


In [48]:
diamonds

Unnamed: 0,index_id,depth,table,x,y,z,cut,color,clarity,price,carat,city
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,62.4,58.0,6.83,6.79,4.25,Premium,J,VS2,4268,1.21,Kimberly
1,248aa2bdd0032920ac9e5f6ad36c350549da067efeaf7b...,60.8,60.0,6.85,6.89,4.18,Premium,J,VS2,4839,1.20,Kimberly
2,72b31cf00f8ab3967588fad4a32f61622cb162f9b7bc2c...,60.6,59.0,4.34,4.38,2.64,Premium,J,VS2,368,0.30,Kimberly
3,98c53df687f2e9b94da80eef5b9049f1fac456b4c41c80...,62.6,57.0,6.80,6.72,4.23,Premium,J,VS2,5053,1.20,Kimberly
4,5dfe43a321c6834c7de273c73aeadc705d919a5869e0f5...,59.4,62.0,6.66,6.58,3.93,Premium,J,VS2,3593,1.05,Kimberly
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,e03a231c5b52635043e7dc5f0c6c9f16722e14dbcc98bb...,61.8,56.0,4.42,4.46,2.74,Ideal,F,IF,978,0.33,Zurich
40451,90dcb905e13140ff99770039b843fb62fb179ab4a3bae9...,61.6,56.0,4.43,4.47,2.74,Ideal,F,IF,929,0.32,Zurich
40452,97d3c3344c245422ee7fa4f448b2cace9940121620df22...,62.4,55.0,4.20,4.17,2.61,Ideal,F,IF,828,0.28,Zurich
40453,e4dc4e0761ccc6fbb4c064517e40f3582522c325f9a4e5...,62.0,58.0,6.44,6.49,4.01,Ideal,F,IF,11116,1.02,Zurich


Features
==

In [49]:
def super_feature(df):
    return  (df['carat'] / df['table'] * df['depth'])


diamonds['super_feature'] = super_feature(diamonds)
#diamonds['super_feature2'] = super_feature2(diamonds)

diamonds

Unnamed: 0,index_id,depth,table,x,y,z,cut,color,clarity,price,carat,city,super_feature
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,62.4,58.0,6.83,6.79,4.25,Premium,J,VS2,4268,1.21,Kimberly,1.301793
1,248aa2bdd0032920ac9e5f6ad36c350549da067efeaf7b...,60.8,60.0,6.85,6.89,4.18,Premium,J,VS2,4839,1.20,Kimberly,1.216000
2,72b31cf00f8ab3967588fad4a32f61622cb162f9b7bc2c...,60.6,59.0,4.34,4.38,2.64,Premium,J,VS2,368,0.30,Kimberly,0.308136
3,98c53df687f2e9b94da80eef5b9049f1fac456b4c41c80...,62.6,57.0,6.80,6.72,4.23,Premium,J,VS2,5053,1.20,Kimberly,1.317895
4,5dfe43a321c6834c7de273c73aeadc705d919a5869e0f5...,59.4,62.0,6.66,6.58,3.93,Premium,J,VS2,3593,1.05,Kimberly,1.005968
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,e03a231c5b52635043e7dc5f0c6c9f16722e14dbcc98bb...,61.8,56.0,4.42,4.46,2.74,Ideal,F,IF,978,0.33,Zurich,0.364179
40451,90dcb905e13140ff99770039b843fb62fb179ab4a3bae9...,61.6,56.0,4.43,4.47,2.74,Ideal,F,IF,929,0.32,Zurich,0.352000
40452,97d3c3344c245422ee7fa4f448b2cace9940121620df22...,62.4,55.0,4.20,4.17,2.61,Ideal,F,IF,828,0.28,Zurich,0.317673
40453,e4dc4e0761ccc6fbb4c064517e40f3582522c325f9a4e5...,62.0,58.0,6.44,6.49,4.01,Ideal,F,IF,11116,1.02,Zurich,1.090345


In [50]:
x_columns = ['depth', 'table','cut', 'color', 'clarity', 'carat', 'super_feature']
X = diamonds[x_columns]

y = diamonds['price']

Encoding
==

## Label Encoding
---

In [51]:
cut_encoding = {'Premium':40, 'Very Good':30, 'Fair':20, 'Good': 10, 'Ideal':5}
def labelEncodingForCut(x):
    for key in cut_encoding:
        if x == key:
            return cut_encoding[key]

In [52]:
color_encoding = {'J':60, 'I':50, 'H':40, 'G': 30, 'F':20, 'E':10, 'D':5}
def labelEncodingForColor(x):
    for key in color_encoding:
        if x == key:
            return color_encoding[key]

In [53]:
clarity_encoding = {'I1':5, 'SI2':10, 'SI1':20, 'VS2': 30, 'VS1':40, 'VVS2':50, 'VVS1':60, 'IF':70}
def labelEncodingForClarity(x):
    for key in clarity_encoding:
        if x == key:
            return clarity_encoding[key]

---

In [54]:
X['cut'] = X['cut'].apply(labelEncodingForCut)
X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['cut'] = X['cut'].apply(labelEncodingForCut)


Unnamed: 0,depth,table,cut,color,clarity,carat,super_feature
0,62.4,58.0,40,J,VS2,1.21,1.301793
1,60.8,60.0,40,J,VS2,1.20,1.216000
2,60.6,59.0,40,J,VS2,0.30,0.308136
3,62.6,57.0,40,J,VS2,1.20,1.317895
4,59.4,62.0,40,J,VS2,1.05,1.005968
...,...,...,...,...,...,...,...
40450,61.8,56.0,5,F,IF,0.33,0.364179
40451,61.6,56.0,5,F,IF,0.32,0.352000
40452,62.4,55.0,5,F,IF,0.28,0.317673
40453,62.0,58.0,5,F,IF,1.02,1.090345


In [55]:
X['color'] = X['color'].apply(labelEncodingForColor)
X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['color'] = X['color'].apply(labelEncodingForColor)


Unnamed: 0,depth,table,cut,color,clarity,carat,super_feature
0,62.4,58.0,40,60,VS2,1.21,1.301793
1,60.8,60.0,40,60,VS2,1.20,1.216000
2,60.6,59.0,40,60,VS2,0.30,0.308136
3,62.6,57.0,40,60,VS2,1.20,1.317895
4,59.4,62.0,40,60,VS2,1.05,1.005968
...,...,...,...,...,...,...,...
40450,61.8,56.0,5,20,IF,0.33,0.364179
40451,61.6,56.0,5,20,IF,0.32,0.352000
40452,62.4,55.0,5,20,IF,0.28,0.317673
40453,62.0,58.0,5,20,IF,1.02,1.090345


In [56]:
X['clarity'] = X['clarity'].apply(labelEncodingForClarity)
X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['clarity'] = X['clarity'].apply(labelEncodingForClarity)


Unnamed: 0,depth,table,cut,color,clarity,carat,super_feature
0,62.4,58.0,40,60,30,1.21,1.301793
1,60.8,60.0,40,60,30,1.20,1.216000
2,60.6,59.0,40,60,30,0.30,0.308136
3,62.6,57.0,40,60,30,1.20,1.317895
4,59.4,62.0,40,60,30,1.05,1.005968
...,...,...,...,...,...,...,...
40450,61.8,56.0,5,20,70,0.33,0.364179
40451,61.6,56.0,5,20,70,0.32,0.352000
40452,62.4,55.0,5,20,70,0.28,0.317673
40453,62.0,58.0,5,20,70,1.02,1.090345


---

Scaling
==

In [57]:
X['table'] = np.log(X['table'])
X['cut'] = np.log(X['cut'])
X['color'] = np.log(X['color'])
X['carat'] = np.log(X['carat'])
X['super_feature'] = np.log(X['super_feature'])
X['clarity'] = np.log(X['clarity'])
X['depth'] = np.log(X['depth'])
X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['table'] = np.log(X['table'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['cut'] = np.log(X['cut'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['color'] = np.log(X['color'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = valu

Unnamed: 0,depth,table,cut,color,clarity,carat,super_feature,clarity2
0,4.133565,4.060443,3.688879,4.094345,3.401197,0.190620,0.263743,1.224128
1,4.107590,4.094345,3.688879,4.094345,3.401197,0.182322,0.195567,1.224128
2,4.104295,4.077537,3.688879,4.094345,3.401197,-1.203973,-1.177215,1.224128
3,4.136765,4.043051,3.688879,4.094345,3.401197,0.182322,0.276036,1.224128
4,4.084294,4.127134,3.688879,4.094345,3.401197,0.048790,0.005950,1.224128
...,...,...,...,...,...,...,...,...
40450,4.123903,4.025352,1.609438,2.995732,4.248495,-1.108663,-1.010111,1.446565
40451,4.120662,4.025352,1.609438,2.995732,4.248495,-1.139434,-1.044124,1.446565
40452,4.133565,4.007333,1.609438,2.995732,4.248495,-1.272966,-1.146734,1.446565
40453,4.127134,4.060443,1.609438,2.995732,4.248495,0.019803,0.086494,1.446565


In [58]:
X

Unnamed: 0,depth,table,cut,color,clarity,carat,super_feature,clarity2
0,4.133565,4.060443,3.688879,4.094345,3.401197,0.190620,0.263743,1.224128
1,4.107590,4.094345,3.688879,4.094345,3.401197,0.182322,0.195567,1.224128
2,4.104295,4.077537,3.688879,4.094345,3.401197,-1.203973,-1.177215,1.224128
3,4.136765,4.043051,3.688879,4.094345,3.401197,0.182322,0.276036,1.224128
4,4.084294,4.127134,3.688879,4.094345,3.401197,0.048790,0.005950,1.224128
...,...,...,...,...,...,...,...,...
40450,4.123903,4.025352,1.609438,2.995732,4.248495,-1.108663,-1.010111,1.446565
40451,4.120662,4.025352,1.609438,2.995732,4.248495,-1.139434,-1.044124,1.446565
40452,4.133565,4.007333,1.609438,2.995732,4.248495,-1.272966,-1.146734,1.446565
40453,4.127134,4.060443,1.609438,2.995732,4.248495,0.019803,0.086494,1.446565


---

Train, Test, Split
==

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")

X_train: (36409, 8), X_test: (4046, 8), y_train: (36409,), y_test: (4046,)


---

In [65]:

d_test['super_feature'] = super_feature(d_test)


d_test = d_test[x_columns]
d_test['cut'] = d_test['cut'].apply(labelEncodingForCut)
d_test['color'] = d_test['color'].apply(labelEncodingForColor)
d_test['clarity'] = d_test['clarity'].apply(labelEncodingForClarity)

d_test['table'] = np.log(d_test['table'])
d_test['cut'] = np.log(d_test['cut'])
d_test['color'] = np.log(d_test['color'])
d_test['carat'] = np.log(d_test['carat'])
d_test['super_feature'] = np.log(d_test['super_feature'])
d_test['clarity'] = np.log(d_test['clarity'])
d_test['depth'] = np.log(d_test['depth'])
x_pred = d_test

d_test

Unnamed: 0,depth,table,cut,color,clarity,carat,super_feature,clarity2
0,4.138361,4.094345,3.401197,2.995732,2.995732,-0.235722,-0.191705,1.097189
1,4.110874,4.043051,1.609438,4.094345,3.688879,0.182322,0.250144,1.305323
2,4.130355,4.110874,3.688879,3.688879,2.995732,0.451076,0.470557,1.097189
3,4.155753,3.988984,3.401197,2.995732,2.995732,-0.105361,0.061409,1.097189
4,4.141546,4.060443,3.401197,2.995732,3.688879,-0.693147,-0.612044,1.305323
...,...,...,...,...,...,...,...,...
13480,4.125520,4.025352,1.609438,2.302585,2.995732,-0.562119,-0.461950,1.097189
13481,4.130355,4.007333,1.609438,3.912023,3.401197,-0.342490,-0.219468,1.224128
13482,4.120662,4.007333,1.609438,2.995732,3.688879,-0.356675,-0.243346,1.305323
13483,4.074142,4.043051,3.401197,2.995732,2.302585,-0.356675,-0.325584,0.834032


Predictions
==

## CatBoost
---

In [60]:
train_dataset = cb.Pool(X_train, y_train) 
test_dataset = cb.Pool(X_test, y_test)

In [61]:
model = cb.CatBoostRegressor(random_strength=0.2 , learning_rate=0.035, iterations=1000, depth=8 ,loss_function='RMSE')

In [62]:
model.fit( X_train, y_train)

0:	learn: 3868.0568871	total: 3.11ms	remaining: 3.11s
1:	learn: 3743.7597114	total: 6.21ms	remaining: 3.1s
2:	learn: 3624.3766874	total: 9.59ms	remaining: 3.19s
3:	learn: 3509.1288827	total: 13.2ms	remaining: 3.28s
4:	learn: 3397.7023696	total: 17ms	remaining: 3.39s
5:	learn: 3290.7515822	total: 20.5ms	remaining: 3.39s
6:	learn: 3188.0323927	total: 24.1ms	remaining: 3.41s
7:	learn: 3088.2469629	total: 27.3ms	remaining: 3.38s
8:	learn: 2992.7079322	total: 30.6ms	remaining: 3.37s
9:	learn: 2899.2727920	total: 34.2ms	remaining: 3.39s
10:	learn: 2811.0863550	total: 37.6ms	remaining: 3.38s
11:	learn: 2724.8858725	total: 41.1ms	remaining: 3.38s
12:	learn: 2641.6353798	total: 44.6ms	remaining: 3.39s
13:	learn: 2562.8208019	total: 47.9ms	remaining: 3.37s
14:	learn: 2485.7146835	total: 51.4ms	remaining: 3.38s
15:	learn: 2411.8076230	total: 54.9ms	remaining: 3.38s
16:	learn: 2340.8188880	total: 58.4ms	remaining: 3.37s
17:	learn: 2272.2769850	total: 61.8ms	remaining: 3.37s
18:	learn: 2205.7416843

161:	learn: 547.0235672	total: 600ms	remaining: 3.1s
162:	learn: 546.6113681	total: 604ms	remaining: 3.1s
163:	learn: 546.2359665	total: 608ms	remaining: 3.1s
164:	learn: 545.8301277	total: 612ms	remaining: 3.1s
165:	learn: 545.4396514	total: 615ms	remaining: 3.09s
166:	learn: 544.9862508	total: 619ms	remaining: 3.08s
167:	learn: 544.6003947	total: 622ms	remaining: 3.08s
168:	learn: 544.0955724	total: 625ms	remaining: 3.07s
169:	learn: 543.5487419	total: 628ms	remaining: 3.07s
170:	learn: 543.1020063	total: 632ms	remaining: 3.06s
171:	learn: 542.6860747	total: 635ms	remaining: 3.06s
172:	learn: 542.3339548	total: 639ms	remaining: 3.05s
173:	learn: 541.8882222	total: 642ms	remaining: 3.05s
174:	learn: 541.5735002	total: 645ms	remaining: 3.04s
175:	learn: 541.2428293	total: 648ms	remaining: 3.03s
176:	learn: 540.9270877	total: 652ms	remaining: 3.03s
177:	learn: 540.5888446	total: 655ms	remaining: 3.02s
178:	learn: 540.2644704	total: 658ms	remaining: 3.02s
179:	learn: 539.9289229	total: 6

319:	learn: 510.5682523	total: 1.21s	remaining: 2.58s
320:	learn: 510.4064973	total: 1.22s	remaining: 2.57s
321:	learn: 510.2915795	total: 1.22s	remaining: 2.57s
322:	learn: 510.1056021	total: 1.22s	remaining: 2.57s
323:	learn: 509.9910155	total: 1.23s	remaining: 2.56s
324:	learn: 509.7935225	total: 1.23s	remaining: 2.56s
325:	learn: 509.7029233	total: 1.24s	remaining: 2.55s
326:	learn: 509.6111419	total: 1.24s	remaining: 2.55s
327:	learn: 509.4443408	total: 1.24s	remaining: 2.54s
328:	learn: 509.2996323	total: 1.25s	remaining: 2.54s
329:	learn: 509.2012066	total: 1.25s	remaining: 2.54s
330:	learn: 509.1078275	total: 1.25s	remaining: 2.53s
331:	learn: 508.8425333	total: 1.25s	remaining: 2.53s
332:	learn: 508.6167495	total: 1.26s	remaining: 2.52s
333:	learn: 508.3948970	total: 1.26s	remaining: 2.52s
334:	learn: 508.3489036	total: 1.27s	remaining: 2.52s
335:	learn: 508.0764626	total: 1.27s	remaining: 2.51s
336:	learn: 507.9002740	total: 1.27s	remaining: 2.51s
337:	learn: 507.6508985	tota

526:	learn: 485.7598170	total: 2.01s	remaining: 1.81s
527:	learn: 485.6373245	total: 2.02s	remaining: 1.8s
528:	learn: 485.4740926	total: 2.02s	remaining: 1.8s
529:	learn: 485.4134552	total: 2.03s	remaining: 1.8s
530:	learn: 485.2879964	total: 2.03s	remaining: 1.79s
531:	learn: 485.1983540	total: 2.03s	remaining: 1.79s
532:	learn: 485.0297927	total: 2.04s	remaining: 1.78s
533:	learn: 484.9578741	total: 2.04s	remaining: 1.78s
534:	learn: 484.8754710	total: 2.04s	remaining: 1.78s
535:	learn: 484.6559292	total: 2.05s	remaining: 1.77s
536:	learn: 484.5954174	total: 2.05s	remaining: 1.77s
537:	learn: 484.5382291	total: 2.06s	remaining: 1.76s
538:	learn: 484.4182607	total: 2.06s	remaining: 1.76s
539:	learn: 484.3585420	total: 2.06s	remaining: 1.76s
540:	learn: 484.2372338	total: 2.07s	remaining: 1.75s
541:	learn: 484.1789880	total: 2.07s	remaining: 1.75s
542:	learn: 484.0644700	total: 2.07s	remaining: 1.75s
543:	learn: 483.9535809	total: 2.08s	remaining: 1.74s
544:	learn: 483.8169656	total: 

720:	learn: 467.8030203	total: 2.74s	remaining: 1.06s
721:	learn: 467.7182348	total: 2.75s	remaining: 1.06s
722:	learn: 467.6097937	total: 2.75s	remaining: 1.05s
723:	learn: 467.5246194	total: 2.75s	remaining: 1.05s
724:	learn: 467.5023526	total: 2.76s	remaining: 1.05s
725:	learn: 467.4676935	total: 2.76s	remaining: 1.04s
726:	learn: 467.4483888	total: 2.77s	remaining: 1.04s
727:	learn: 467.2832698	total: 2.77s	remaining: 1.03s
728:	learn: 467.2235746	total: 2.77s	remaining: 1.03s
729:	learn: 467.2045198	total: 2.77s	remaining: 1.03s
730:	learn: 467.1161873	total: 2.78s	remaining: 1.02s
731:	learn: 467.0890290	total: 2.78s	remaining: 1.02s
732:	learn: 466.9023048	total: 2.79s	remaining: 1.01s
733:	learn: 466.8646812	total: 2.79s	remaining: 1.01s
734:	learn: 466.8312791	total: 2.79s	remaining: 1.01s
735:	learn: 466.8145889	total: 2.79s	remaining: 1s
736:	learn: 466.8007387	total: 2.8s	remaining: 999ms
737:	learn: 466.7775190	total: 2.8s	remaining: 995ms
738:	learn: 466.7480046	total: 2.

885:	learn: 456.1160052	total: 3.34s	remaining: 430ms
886:	learn: 456.0144676	total: 3.35s	remaining: 426ms
887:	learn: 455.9405458	total: 3.35s	remaining: 423ms
888:	learn: 455.8438589	total: 3.35s	remaining: 419ms
889:	learn: 455.7618589	total: 3.36s	remaining: 415ms
890:	learn: 455.6930983	total: 3.36s	remaining: 411ms
891:	learn: 455.6455733	total: 3.36s	remaining: 407ms
892:	learn: 455.6197771	total: 3.37s	remaining: 403ms
893:	learn: 455.5623660	total: 3.37s	remaining: 400ms
894:	learn: 455.4792642	total: 3.37s	remaining: 396ms
895:	learn: 455.4679738	total: 3.38s	remaining: 392ms
896:	learn: 455.3864279	total: 3.38s	remaining: 388ms
897:	learn: 455.3127739	total: 3.38s	remaining: 384ms
898:	learn: 455.2901011	total: 3.39s	remaining: 381ms
899:	learn: 455.1595906	total: 3.39s	remaining: 377ms
900:	learn: 455.1378329	total: 3.39s	remaining: 373ms
901:	learn: 455.0147770	total: 3.4s	remaining: 369ms
902:	learn: 455.0056222	total: 3.4s	remaining: 365ms
903:	learn: 454.9282778	total:

<catboost.core.CatBoostRegressor at 0x14a64ada0>

In [63]:
pred = model.predict(X_test)
rmse = (np.sqrt(mean_squared_error(y_test, pred)))
r2 = r2_score(y_test, pred)
print('Testing performance')
print('RMSE: {:.2f}'.format(rmse))
print('R2: {:.2f}'.format(r2))

Testing performance
RMSE: 476.42
R2: 0.99


In [66]:
d_predictions = model.predict(x_pred).clip(0, 30000)
d_predictions

array([2961.20925151, 5588.56757766, 9631.57257292, ..., 3205.85090327,
       2059.55056367,  797.75002267])

## Save predictions
---

In [67]:
predictions = pd.DataFrame(d_predictions).rename(columns={0:'price'})
predictions.index.names = ['id']
predictions

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,2961.209252
1,5588.567578
2,9631.572573
3,4003.286181
4,1634.669882
...,...
13480,1708.118401
13481,2325.847838
13482,3205.850903
13483,2059.550564


In [68]:
predictions.to_csv('../results/predictions.csv')


---