In [1]:
import numpy as np
import pandas as pd
import catboost as cb
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, AdaBoostRegressor, BaggingRegressor

In [2]:
# Read the dataset CSV (its file name suggests missing values were already
# handled and categorical columns label-encoded) and preview the first rows.
data = pd.read_csv(filepath_or_buffer='data_no_mising_value_label_encoder_col.csv')
data.head(n=5)

Unnamed: 0,brand,3G,4G,5G,Announced,Weight,Length,Width,Diameter,Display Type,...,3.5mm jack,Chipset,RAM,Storage,Micro-SIM,Mini-SIM,Nano-SIM,eSIM,ratio_1,ratio_2
0,0,True,True,False,2022,172.0,146.7,71.9,10.0,7,...,True,4,2.0,32.0,False,False,True,False,18.0,9.0
1,0,True,True,False,2021,190.0,156.4,74.8,9.7,1,...,True,7,2.0,32.0,False,False,True,False,5.0,9.0
2,0,True,True,False,2021,134.0,137.6,65.7,9.8,7,...,True,4,1.0,8.0,False,False,True,False,18.0,9.0
3,0,True,True,False,2021,194.0,165.6,75.6,8.7,1,...,True,4,4.0,64.0,False,False,True,False,20.0,9.0
4,0,True,True,False,2021,190.0,165.6,75.6,8.8,1,...,True,4,3.0,32.0,False,False,True,False,20.0,9.0


In [3]:
# Remove the columns that are not used further on.
# The drop happens in place, so running this cell a second time raises a
# KeyError because the columns are already gone.
data.drop(
    columns=[
        'ratio_1',
        'ratio_2',
        'Loudspeaker',
        '3.5mm jack',
        'Length',
        'Width',
        'Diameter',
    ],
    inplace=True,
)
data.head(n=5)

Unnamed: 0,brand,3G,4G,5G,Announced,Weight,Display Type,Display Size,ppi,body ratio,...,Price,CPU,pixel,Chipset,RAM,Storage,Micro-SIM,Mini-SIM,Nano-SIM,eSIM
0,0,True,True,False,2022,172.0,7,5.5,293.0,74.0,...,100.0,4.0,1036800.0,4,2.0,32.0,False,False,True,False
1,0,True,True,False,2021,190.0,1,6.1,282.0,78.1,...,110.0,8.0,1123200.0,7,2.0,32.0,False,False,True,False
2,0,True,True,False,2021,134.0,7,5.0,215.0,71.4,...,60.0,4.0,460800.0,4,1.0,8.0,False,False,True,False
3,0,True,True,False,2021,194.0,1,6.52,269.0,82.0,...,330.0,8.0,1152000.0,4,4.0,64.0,False,False,True,False
4,0,True,True,False,2021,190.0,1,6.52,269.0,82.0,...,130.0,8.0,1152000.0,4,3.0,32.0,False,False,True,False


In [4]:
# Show the column labels currently present in `data`; 'Price' is later used
# as the regression target, every other column as a feature.
data.columns

Index(['brand', '3G', '4G', '5G', 'Announced', 'Weight', 'Display Type',
       'Display Size', 'ppi', 'body ratio', 'OS', 'battery_capacity', 'Price',
       'CPU', 'pixel', 'Chipset', 'RAM', 'Storage', 'Micro-SIM', 'Mini-SIM',
       'Nano-SIM', 'eSIM'],
      dtype='object')

In [5]:
# Separate the regression target ('Price') from the feature columns.
y = data['Price']
X = data.drop(columns='Price')

# Two-stage split with a fixed seed: first hold out 15% of the rows as the
# test set, then hold out 15% of the remaining rows as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.15, random_state=42
)

In [6]:
# Work on explicit copies of the split frames so the column assignments below
# never write into (or warn about writing into) a view of `data`.
X_train = X_train.copy()
X_val = X_val.copy()
X_test = X_test.copy()

# One MinMaxScaler per continuous feature. The individual names are kept
# because `preprocessing` looks the fitted scalers up by these names.
Announced_scaler = MinMaxScaler()
Weight_scaler = MinMaxScaler()
Size_scaler = MinMaxScaler()
ppi_scaler = MinMaxScaler()
ratio_scaler = MinMaxScaler()
battery_scaler = MinMaxScaler()
CPU_scaler = MinMaxScaler()
pixel_scaler = MinMaxScaler()
Storage_scaler = MinMaxScaler()
RAM_scaler = MinMaxScaler()

column_scalers = {
    'Announced': Announced_scaler,
    'Weight': Weight_scaler,
    'Display Size': Size_scaler,
    'ppi': ppi_scaler,
    'body ratio': ratio_scaler,
    'battery_capacity': battery_scaler,
    'CPU': CPU_scaler,
    'pixel': pixel_scaler,
    'Storage': Storage_scaler,
    'RAM': RAM_scaler,
}

# Fit each scaler on the training split only, then apply the same fitted
# transform to the validation and test splits (no statistics leak from them).
for column, scaler in column_scalers.items():
    X_train[column] = scaler.fit_transform(X_train[[column]])
    X_val[column] = scaler.transform(X_val[[column]])
    X_test[column] = scaler.transform(X_test[[column]])

# Model the natural log of the target. Later cells undo this with np.exp.
# NOTE: this rebinds y_train / y_val / y_test, so re-running the cell without
# re-running the split applies the log a second time.
y_train = np.log(y_train)
y_val = np.log(y_val)
y_test = np.log(y_test)

In [7]:
def preprocessing(X, y):
    """Apply the already-fitted feature scalers and the log target transform.

    The scalers are the module-level ``*_scaler`` objects; they must have
    been fitted before this function is called.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing at least the columns 'Announced', 'Weight',
        'Display Size', 'ppi', 'body ratio', 'battery_capacity', 'CPU',
        'pixel', 'Storage' and 'RAM'. It is not modified; a scaled copy is
        returned instead.
    y : array-like
        Target values; returned as their natural logarithm.

    Returns
    -------
    tuple
        ``(X_scaled, y_log)``.
    """
    # Copy first: the original version overwrote the caller's columns in
    # place, so calling it twice on the same frame scaled the data twice.
    X = X.copy()

    # Built inside the function so the scalers are resolved at call time.
    scalers_by_column = {
        'Announced': Announced_scaler,
        'Weight': Weight_scaler,
        'Display Size': Size_scaler,
        'ppi': ppi_scaler,
        'body ratio': ratio_scaler,
        'battery_capacity': battery_scaler,
        'CPU': CPU_scaler,
        'pixel': pixel_scaler,
        'Storage': Storage_scaler,
        'RAM': RAM_scaler,
    }
    for column, scaler in scalers_by_column.items():
        X[column] = scaler.transform(X[[column]])

    y = np.log(y)
    return (X, y)

In [8]:
# Inspect the training features after scaling.
X_train

Unnamed: 0,brand,3G,4G,5G,Announced,Weight,Display Type,Display Size,ppi,body ratio,...,battery_capacity,CPU,pixel,Chipset,RAM,Storage,Micro-SIM,Mini-SIM,Nano-SIM,eSIM
1568,13,True,True,False,0.428571,0.047897,1,0.213115,0.252187,0.614925,...,0.212858,0.250000,0.102804,4,0.117459,0.030991,False,False,True,False
1035,8,True,True,False,0.571429,0.072430,1,0.303279,0.644315,0.832836,...,0.256299,0.750000,0.537383,6,0.243536,0.062250,False,False,True,False
1498,13,True,True,False,0.714286,0.077103,7,0.250000,0.253644,0.691045,...,0.226759,0.250000,0.116822,4,0.117459,0.062250,False,False,True,False
1224,9,True,True,False,0.785714,0.129322,1,0.336066,0.217201,0.808955,...,0.425717,0.750000,0.130841,7,0.180497,0.062250,False,False,True,False
830,7,True,True,False,0.857143,0.427570,1,0.631148,0.150146,0.756716,...,0.438749,0.750000,0.271028,7,0.180497,0.062250,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
533,5,True,True,False,1.000000,0.132009,7,0.356557,0.202624,0.850746,...,0.516942,0.750000,0.130841,3,0.495691,0.249800,False,False,True,False
868,7,True,True,False,0.571429,0.082944,1,0.311475,0.409621,0.843284,...,0.282363,0.750000,0.285835,6,0.369613,0.124766,False,False,True,False
982,7,True,False,False,0.214286,0.357477,7,0.377049,0.071429,0.438806,...,0.304083,0.480976,0.065421,4,0.054420,0.030991,False,True,False,False
1256,9,True,True,False,0.642857,0.051869,1,0.250000,0.110787,0.716418,...,0.212858,0.250000,0.046729,4,0.054420,0.015362,False,False,True,False


In [9]:
# CatBoost baseline: R2 is the tracked metric and the validation split is
# passed as the evaluation set during fitting.
cat_r = cb.CatBoostRegressor(eval_metric='R2', depth=6, learning_rate=0.05)
cat_r.fit(
    X_train,
    y_train,
    # Positional indices of the categorical columns in X_train; with the
    # column order shown by `data.columns` (minus 'Price') these correspond
    # to brand, Display Type, OS and Chipset.
    cat_features=[0, 6, 10, 14],
    eval_set=(X_val, y_val),
    plot=True,
)

# Validation metrics are in log-target space (the targets were log-transformed).
y_val_pred = cat_r.predict(X_val)
print('r2_score:', r2_score(y_true=y_val, y_pred=y_val_pred))
print('mean_absolute_error:', mean_absolute_error(y_true=y_val, y_pred=y_val_pred))
print(y_val.std())

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.0447692	test: 0.0440292	best: 0.0440292 (0)	total: 179ms	remaining: 2m 58s
1:	learn: 0.0922386	test: 0.0899886	best: 0.0899886 (1)	total: 247ms	remaining: 2m 3s
2:	learn: 0.1325764	test: 0.1289301	best: 0.1289301 (2)	total: 313ms	remaining: 1m 43s
3:	learn: 0.1777320	test: 0.1705128	best: 0.1705128 (3)	total: 372ms	remaining: 1m 32s
4:	learn: 0.2173382	test: 0.2090027	best: 0.2090027 (4)	total: 434ms	remaining: 1m 26s
5:	learn: 0.2488512	test: 0.2419989	best: 0.2419989 (5)	total: 492ms	remaining: 1m 21s
6:	learn: 0.2814521	test: 0.2730832	best: 0.2730832 (6)	total: 556ms	remaining: 1m 18s
7:	learn: 0.3105664	test: 0.3023399	best: 0.3023399 (7)	total: 619ms	remaining: 1m 16s
8:	learn: 0.3358520	test: 0.3277826	best: 0.3277826 (8)	total: 679ms	remaining: 1m 14s
9:	learn: 0.3612420	test: 0.3524918	best: 0.3524918 (9)	total: 736ms	remaining: 1m 12s
10:	learn: 0.3832500	test: 0.3742371	best: 0.3742371 (10)	total: 792ms	remaining: 1m 11s
11:	learn: 0.4060743	test: 0.3943174	best:

In [10]:
# XGBoost baseline with 1000 boosting rounds, fitted on the training split.
xgb_r = xgb.XGBRegressor(n_estimators=1000)
xgb_r.fit(X=X_train, y=y_train)

# Validation metrics in log-target space, plus the target's spread for scale.
y_val_pred = xgb_r.predict(X_val)
print('r2_score:', r2_score(y_true=y_val, y_pred=y_val_pred))
print('mean_absolute_error:', mean_absolute_error(y_true=y_val, y_pred=y_val_pred))
print(y_val.std())

r2_score: 0.6999965512512991
mean_absolute_error: 0.24943778994393037
0.5968479260266182


In [11]:
# k-nearest-neighbours baseline (k = 5), fitted on the training split.
knn_reg = KNeighborsRegressor(n_neighbors=5)
knn_reg.fit(X=X_train, y=y_train)

# Validation metrics in log-target space, plus the target's spread for scale.
y_val_pred = knn_reg.predict(X_val)
print('r2_score:', r2_score(y_true=y_val, y_pred=y_val_pred))
print('mean_absolute_error:', mean_absolute_error(y_true=y_val, y_pred=y_val_pred))
print(y_val.std())

r2_score: 0.47495901579852307
mean_absolute_error: 0.31976831211991374
0.5968479260266182


In [12]:
# Support-vector regression baseline with default hyper-parameters.
svr_reg = SVR()
svr_reg.fit(X=X_train, y=y_train)

# Validation metrics in log-target space, plus the target's spread for scale.
y_val_pred = svr_reg.predict(X_val)
print('r2_score:', r2_score(y_true=y_val, y_pred=y_val_pred))
print('mean_absolute_error:', mean_absolute_error(y_true=y_val, y_pred=y_val_pred))
print(y_val.std())

r2_score: 0.5478786245376944
mean_absolute_error: 0.3126197164350887
0.5968479260266182


In [13]:
# Random-forest baseline. The forest is a stochastic estimator, so it is
# seeded (same seed as the data splits) to make the reported metrics
# reproducible across kernel restarts.
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X_train, y_train)

# Validation metrics in log-target space, plus the target's spread for scale.
y_val_pred = rf_reg.predict(X_val)
print('r2_score:', r2_score(y_val, y_val_pred))
print('mean_absolute_error:', mean_absolute_error(y_val, y_val_pred))
print(y_val.std())

r2_score: 0.6941419050203513
mean_absolute_error: 0.2506083796771379
0.5968479260266182


In [14]:
# AdaBoost baseline. Seeded (same seed as the data splits) so the reported
# metrics are reproducible across kernel restarts.
ab_reg = AdaBoostRegressor(random_state=42)
ab_reg.fit(X_train, y_train)

# Validation metrics in log-target space, plus the target's spread for scale.
y_val_pred = ab_reg.predict(X_val)
print('r2_score:', r2_score(y_val, y_val_pred))
print('mean_absolute_error:', mean_absolute_error(y_val, y_val_pred))
print(y_val.std())

r2_score: 0.6052598247872876
mean_absolute_error: 0.29718560824092616
0.5968479260266182


In [15]:
# Stacked ensemble: XGBoost and random-forest base learners feed a CatBoost
# final estimator.
# NOTE(review): xgb_r and rf_reg each appear twice under different names;
# confirm whether distinct base models were intended.
models = [
    ('xgb_reg1', xgb_r),
    ('rf_reg1', rf_reg),
    ('rf_reg2', rf_reg),
    ('xgb_reg2', xgb_r)
]
sr = StackingRegressor(estimators=models, final_estimator=cat_r)
sr.fit(X_train, y_train)

# Evaluate the stacked model itself. The original cell predicted with
# `ab_reg`, so the printed metrics belonged to AdaBoost, not to the stack.
y_val_pred = sr.predict(X_val)
print('r2_score:', r2_score(y_val, y_val_pred))
print('mean_absolute_error:', mean_absolute_error(y_val, y_val_pred))
print(y_val.std())

0:	learn: 0.0680349	total: 8.99ms	remaining: 8.98s
1:	learn: 0.1309587	total: 16.7ms	remaining: 8.31s
2:	learn: 0.1859660	total: 26.7ms	remaining: 8.88s
3:	learn: 0.2375165	total: 31.9ms	remaining: 7.95s
4:	learn: 0.2851457	total: 36.6ms	remaining: 7.28s
5:	learn: 0.3279201	total: 40.3ms	remaining: 6.68s
6:	learn: 0.3656947	total: 44.3ms	remaining: 6.28s
7:	learn: 0.4008930	total: 48.1ms	remaining: 5.97s
8:	learn: 0.4328147	total: 51.4ms	remaining: 5.66s
9:	learn: 0.4624739	total: 55.5ms	remaining: 5.49s
10:	learn: 0.4897655	total: 58.6ms	remaining: 5.27s
11:	learn: 0.5139109	total: 61.8ms	remaining: 5.09s
12:	learn: 0.5353845	total: 65.2ms	remaining: 4.95s
13:	learn: 0.5558388	total: 68.9ms	remaining: 4.85s
14:	learn: 0.5742848	total: 72.7ms	remaining: 4.78s
15:	learn: 0.5910248	total: 76ms	remaining: 4.67s
16:	learn: 0.6059158	total: 79.8ms	remaining: 4.62s
17:	learn: 0.6196233	total: 83.4ms	remaining: 4.55s
18:	learn: 0.6323971	total: 87.1ms	remaining: 4.5s
19:	learn: 0.6441842	tota

In [16]:
# Bagging ensemble that uses the CatBoost regressor as its base estimator.
br = BaggingRegressor(cat_r)
br.fit(X_train, y_train)

# Evaluate the bagging model itself. The original cell predicted with `sr`,
# so the bagging ensemble fitted above was never actually scored.
y_val_pred = br.predict(X_val)
print('r2_score:', r2_score(y_val, y_val_pred))
print('mean_absolute_error:', mean_absolute_error(y_val, y_val_pred))
print(y_val.std())

0:	learn: 0.0526836	total: 5.49ms	remaining: 5.48s
1:	learn: 0.0999908	total: 11.9ms	remaining: 5.95s
2:	learn: 0.1431066	total: 16.4ms	remaining: 5.45s
3:	learn: 0.1820529	total: 21.2ms	remaining: 5.27s
4:	learn: 0.2185190	total: 96.5ms	remaining: 19.2s
5:	learn: 0.2528125	total: 100ms	remaining: 16.6s
6:	learn: 0.2843401	total: 105ms	remaining: 14.9s
7:	learn: 0.3125044	total: 109ms	remaining: 13.6s
8:	learn: 0.3382614	total: 114ms	remaining: 12.6s
9:	learn: 0.3617415	total: 124ms	remaining: 12.3s
10:	learn: 0.3871850	total: 131ms	remaining: 11.8s
11:	learn: 0.4104590	total: 139ms	remaining: 11.4s
12:	learn: 0.4301090	total: 143ms	remaining: 10.8s
13:	learn: 0.4487986	total: 146ms	remaining: 10.3s
14:	learn: 0.4636814	total: 151ms	remaining: 9.91s
15:	learn: 0.4802993	total: 155ms	remaining: 9.52s
16:	learn: 0.4947290	total: 159ms	remaining: 9.2s
17:	learn: 0.5081377	total: 163ms	remaining: 8.88s
18:	learn: 0.5182162	total: 170ms	remaining: 8.77s
19:	learn: 0.5297444	total: 175ms	rem

In [17]:
# Score the CatBoost model on the held-out test split, first in log-target
# space (the space the model was trained in).
y_test_pred = cat_r.predict(X_test)
print('r2_score:', r2_score(y_test, y_test_pred))
print('mean_absolute_error:', mean_absolute_error(y_test, y_test_pred))

# Undo the log transform on both the true and predicted targets in one
# vectorized step (instead of a per-row Python loop) and score again in the
# original target units.
pred = pd.DataFrame({
    'y_test': np.exp(y_test.values),
    'y_test_pred': np.exp(y_test_pred),
})
print('r2_score:', r2_score(pred.y_test, pred.y_test_pred))
print('mean_absolute_error:', mean_absolute_error(pred.y_test, pred.y_test_pred))
print('y_test std:', pred.y_test.std())
pred

r2_score: 0.7417059009287954
mean_absolute_error: 0.2224495637973066
r2_score: 0.7453734345449767
mean_absolute_error: 53.14186920568911
y_test std: 164.26058565961893


Unnamed: 0,y_test,y_test_pred
0,180.0,175.274413
1,600.0,755.444311
2,263.0,337.593054
3,140.0,97.886605
4,130.0,139.953717
...,...,...
348,160.0,139.489456
349,280.0,193.852962
350,280.0,383.965331
351,160.0,107.150754


In [18]:
# Rank the features by the importance CatBoost assigns them, highest first.
(
    cat_r
    .get_feature_importance(prettified=True)
    .sort_values(by='Importances', ascending=False)
)

Unnamed: 0,Feature Id,Importances
0,pixel,18.710514
1,Announced,12.882344
2,RAM,10.403206
3,Chipset,8.36184
4,brand,6.846074
5,Storage,6.792641
6,Display Type,6.730191
7,Weight,6.50661
8,ppi,4.671103
9,body ratio,4.235061


In [19]:
# Rebind `cat_r` to a fresh CatBoost regressor (the previously fitted model is
# replaced) and search over learning rate, tree depth and L2 regularisation.
# NOTE(review): unlike the earlier baseline fit, no cat_features are passed
# here - confirm that treating every column as numeric is intended.
cat_r = cb.CatBoostRegressor(eval_metric='R2')
grid = dict(
    learning_rate=[0.03, 0.1],
    depth=[4, 6, 10],
    l2_leaf_reg=[1, 3, 5, 7, 9],
)

grid_search_result = cat_r.grid_search(
    param_grid=grid,
    X=X_train,
    y=y_train,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: -65.8987062	test: -72.8978313	best: -72.8978313 (0)	total: 2.15ms	remaining: 2.15s
1:	learn: -62.0230689	test: -68.6021575	best: -68.6021575 (1)	total: 4.5ms	remaining: 2.24s
2:	learn: -58.3642311	test: -64.5404339	best: -64.5404339 (2)	total: 6.74ms	remaining: 2.24s
3:	learn: -54.9189877	test: -60.7063497	best: -60.7063497 (3)	total: 10ms	remaining: 2.49s
4:	learn: -51.6834641	test: -57.1085603	best: -57.1085603 (4)	total: 11.7ms	remaining: 2.33s
5:	learn: -48.6483783	test: -53.7509104	best: -53.7509104 (5)	total: 13.4ms	remaining: 2.23s
6:	learn: -45.7854895	test: -50.5851159	best: -50.5851159 (6)	total: 15.5ms	remaining: 2.2s
7:	learn: -43.0863102	test: -47.5924818	best: -47.5924818 (7)	total: 17.4ms	remaining: 2.15s
8:	learn: -40.5366609	test: -44.7670513	best: -44.7670513 (8)	total: 18.8ms	remaining: 2.07s
9:	learn: -38.1460824	test: -42.1258028	best: -42.1258028 (9)	total: 20.7ms	remaining: 2.04s
10:	learn: -35.8957356	test: -39.6304741	best: -39.6304741 (10)	total: 24.

In [20]:
# Score the current `cat_r` model on the held-out test split, first in
# log-target space (the space the model was trained in).
y_test_pred = cat_r.predict(X_test)
print('r2_score:', r2_score(y_test, y_test_pred))
print('mean_absolute_error:', mean_absolute_error(y_test, y_test_pred))

# Undo the log transform on both the true and predicted targets in one
# vectorized step (instead of a per-row Python loop) and score again in the
# original target units.
pred = pd.DataFrame({
    'y_test': np.exp(y_test.values),
    'y_test_pred': np.exp(y_test_pred),
})
print('r2_score:', r2_score(pred.y_test, pred.y_test_pred))
print('mean_absolute_error:', mean_absolute_error(pred.y_test, pred.y_test_pred))
print('y_test std:', pred.y_test.std())
pred

r2_score: 0.7531404863264658
mean_absolute_error: 0.2196667769728868
r2_score: 0.7552144344542486
mean_absolute_error: 52.549356874606524
y_test std: 164.26058565961893


Unnamed: 0,y_test,y_test_pred
0,180.0,173.802287
1,600.0,615.504017
2,263.0,306.193100
3,140.0,113.520281
4,130.0,151.998879
...,...,...
348,160.0,141.185915
349,280.0,199.923163
350,280.0,345.430824
351,160.0,120.745099


In [21]:
# Rank the features by the importance the current `cat_r` model assigns
# them, highest first.
(
    cat_r
    .get_feature_importance(prettified=True)
    .sort_values(by='Importances', ascending=False)
)

Unnamed: 0,Feature Id,Importances
0,pixel,25.014478
1,RAM,14.079752
2,Announced,13.656389
3,Storage,8.214932
4,ppi,7.703129
5,Weight,5.847947
6,Chipset,4.77478
7,brand,4.717252
8,battery_capacity,3.74779
9,body ratio,3.566165
