In [1]:
import matplotlib.pyplot as plt

plt.style.use('seaborn-pastel')
plt.rcParams['figure.figsize'] = (8,5)
plt.rcParams['figure.dpi'] = 500
plt.rcParams['text.color'] = 'black'
plt.rcParams['axes.labelcolor'] = 'black'
plt.rcParams['xtick.color'] = 'black'
plt.rcParams['ytick.color'] = 'black'
plt.rcParams['axes.titlepad'] = 10
plt.rcParams['axes.titleweight'] = 1000
plt.rcParams['axes.labelpad'] = 5
plt.rcParams['font.family'] = 'serif'
plt.rcParams['axes.facecolor'] = 'white'

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

us = pd.read_csv("us_clocks.csv")
uk = pd.read_csv("uk_clocks.csv")

independentvariables = ['listingIsTopRated', 'sellerFeedbackScore', 'sellerPositivePercent', 'sellerIsTopRated',
                        'endAtWeekend','endAtEvening','length','isBroken','isUsed','isLarge','freeShipping']

X_uk = uk[independentvariables] 
y_uk = uk.price
X_train, X_test, y_train, y_test = train_test_split(X_uk, y_uk, test_size=0.3)

X_us = us[independentvariables] 
y_us = us.price
X_train_us, X_test_us, y_train_us, y_test_us = train_test_split(X_us, y_us, test_size=0.3)

In [3]:
random_forest_us = RandomForestRegressor(max_depth=1, min_samples_leaf=1, min_samples_split=1.0)
model = random_forest_us.fit(X_train_us,y_train_us)
y_pred = model.predict(X_test_us)

AE2_us = abs(y_test_us-y_pred)*100/y_test_us

In [4]:
random_forest_uk = RandomForestRegressor(max_depth=1, min_samples_leaf=1, min_samples_split=1.0)
model = random_forest_uk.fit(X_train,y_train)
y_pred = model.predict(X_test)

AE2_uk = abs(y_test-y_pred)*100/y_test

In [5]:
svm_us = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
model = svm_us.fit(X_train_us, y_train_us)
y_pred = model.predict(X_test_us)

AE3_us = abs(y_test_us-y_pred)*100/y_test_us

In [6]:
svm_uk = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
model = svm_us.fit(X_train, y_train)
y_pred = model.predict(X_test)

AE3_uk = abs(y_test-y_pred)*100/y_test

In [7]:
knn_us = KNeighborsRegressor(n_neighbors=5)
model = knn_us.fit(X_train_us, y_train_us)
y_pred = model.predict(X_test_us)

AE1_us = abs(y_test_us-y_pred)*100/y_test_us

In [8]:
knn_uk = KNeighborsRegressor(n_neighbors=5)
model = knn_us.fit(X_train, y_train)
y_pred = model.predict(X_test)

AE1_uk = abs(y_test-y_pred)*100/y_test

In [9]:
lin_us = LinearRegression()
model = lin_us.fit(X_train_us, y_train_us)
y_pred = model.predict(X_test_us)

AE0_us = abs(y_test_us-y_pred)*100/y_test_us

In [10]:
lin_uk = LinearRegression()
model = lin_uk.fit(X_train, y_train)
y_pred = model.predict(X_test)

AE0_uk = abs(y_test-y_pred)*100/y_test

In [11]:
AE4_uk = abs(y_test-y_test.median())*100/y_test.median()

In [12]:
AE4_us = abs(y_test_us-y_test_us.median())*100/y_test_us.median()

In [17]:
fig, ax = plt.subplots()

ax.boxplot([AE0_uk,AE1_uk,AE2_uk,AE3_uk,AE4_uk], whis=(5, 95), patch_artist=True, flierprops={'markersize':3},
           boxprops={'facecolor':'lightgreen'}, medianprops={'color':'black'})
ax.set_xticklabels(['Linear \n regression', 'KNN \n regression', 'Random forest \n regression', 'SVM \n regression', 'Baseline \n (Median)'])
ax.set_ylabel('Absolute error (%)')
ax.set_title('Distribution of absolute errors on testing set (UK Clocks)')
ax.set_ylim(0,200)

plt.tight_layout()
fig.savefig('AE-uk.png', dpi=fig.dpi, bbox_inches='tight')
plt.close(fig)

In [16]:
fig, ax = plt.subplots()

ax.boxplot([AE0_us,AE1_us,AE2_us,AE3_us,AE4_us], whis=(5, 95), patch_artist=True, flierprops={'markersize':3},
           boxprops={'facecolor':'lightgreen'}, medianprops={'color':'black'})
ax.set_xticklabels(['Linear \n regression', 'KNN \n regression', 'Random forest \n regression', 'SVM \n regression', 'Baseline \n (Median)'])
ax.set_ylabel('Absolute error (%)')
ax.set_title('Distribution of absolute errors on testing set (US Clocks)')
ax.set_ylim(0,200)

plt.tight_layout()
fig.savefig('AE-us.png', dpi=fig.dpi, bbox_inches='tight')
plt.close(fig)

In [19]:
AE0_uk.describe()

count     132.000000
mean      316.182402
std       616.162009
min         0.687344
25%        54.342809
50%       126.036472
75%       316.761520
max      5000.578609
Name: price, dtype: float64

In [20]:
AE1_uk.describe()

count      132.000000
mean       420.228030
std       1499.273983
min          0.360000
25%         35.656755
50%         85.915618
75%        295.061158
max      16090.600000
Name: price, dtype: float64

In [21]:
AE2_uk.describe()

count     132.000000
mean      328.900024
std       602.528642
min         0.500428
25%        49.552746
50%       121.733793
75%       303.433022
max      3919.179685
Name: price, dtype: float64

In [22]:
AE3_uk.describe()

count     132.000000
mean      133.107825
std       248.888852
min         0.827683
25%        29.399107
50%        58.758339
75%        95.849483
max      1853.799411
Name: price, dtype: float64

In [23]:
AE4_uk.describe()

count     132.000000
mean      135.913237
std       274.086421
min         0.250766
25%        33.098217
50%        63.782669
75%       111.186960
max      2673.140150
Name: price, dtype: float64

In [24]:
AE0_us.describe()

count      125.000000
mean       583.626287
std       1826.890949
min          1.279678
25%         49.009147
50%         99.246179
75%        385.272594
max      15781.372867
Name: price, dtype: float64

In [25]:
AE1_us.describe()

count     125.000000
mean      391.643009
std      1117.824888
min         1.785714
25%        34.718491
50%        75.990977
75%       235.952000
max      9364.646465
Name: price, dtype: float64

In [26]:
AE2_us.describe()

count      125.000000
mean       616.767445
std       1731.910117
min          2.368643
25%         53.729214
50%        109.269843
75%        424.899771
max      14544.703622
Name: price, dtype: float64

In [27]:
AE3_us.describe()

count     125.000000
mean      273.059859
std       823.715621
min         0.250192
25%        32.765437
50%        71.117092
75%       147.310920
max      6907.779790
Name: price, dtype: float64

In [28]:
AE4_us.describe()

count     125.000000
mean      162.812461
std       284.016501
min         0.000000
25%        37.182052
50%        67.133467
75%       114.346956
max      1703.372392
Name: price, dtype: float64