In [27]:
import pandas as pd
import numpy as np

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score

#### load data

In [3]:
df_train = pd.read_csv("../data/processed/us-train.csv")
df_val = pd.read_csv("../data/processed/us-val.csv")
df_test = pd.read_csv("../data/processed/us-test.csv")

In [4]:
# df_train = df_train.fillna(method='ffill')
# df_val = df_val.fillna(method='ffill')
# df_test = df_test.fillna(method='ffill')

In [5]:
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(44466, 105)
(14822, 105)
(14823, 105)


#### train-test split

In [6]:
y_train = df_train.log_price
X_train = df_train.drop('log_price', axis=1)

y_val = df_val.log_price
X_val = df_val.drop('log_price', axis=1)

y_test = df_test.log_price
X_test = df_test.drop('log_price', axis=1)

y_train_final = pd.concat([y_train, y_val], axis=0)
X_train_final = pd.concat([X_train, X_val], axis=0)

y_val_exp = np.exp(y_val)
y_test_exp = np.exp(y_test)

In [7]:
X_train_final.shape

(59288, 104)

### Neural Network

**response: original price**

#### default model:

In [8]:
# defualt hyperparameters
default_mlp = MLPRegressor().fit(X_train, y_train)

In [9]:
print(median_absolute_error(y_val_exp, np.exp(default_mlp.predict(X_val))))
print(mean_absolute_error(y_val_exp, np.exp(default_mlp.predict(X_val))))

42.16969163194098
72.45569510909544


#### Random Search

In [10]:
mlp = MLPRegressor(max_iter=200, early_stopping=True, n_iter_no_change=5)
# parameter
layers = [(200,), (220,), (300,), (200, 200), (120, 120)]

param_dist_mlp = dict(hidden_layer_sizes=layers, 
                      learning_rate=['constant','adaptive'],
                      learning_rate_init = [0.001, 0.0001, 0.01],
                      alpha=[0.0001, 0.002 ,0.05])

# randomized search
rand_mlp = RandomizedSearchCV(mlp, param_dist_mlp, random_state=0, 
                              n_iter=10, cv=3, verbose=0, n_jobs=-1)

In [11]:
best_model_mlp = rand_mlp.fit(X_train, y_train)

In [12]:
print('Best hidden_layer_sizes:', best_model_mlp.best_estimator_.get_params()['hidden_layer_sizes'])
print('Best learning_rate:', best_model_mlp.best_estimator_.get_params()['learning_rate'])
print('Best alpha:', best_model_mlp.best_estimator_.get_params()['alpha'])
print('Best learning_rate_init:', best_model_mlp.best_estimator_.get_params()['learning_rate_init'])

Best hidden_layer_sizes: (300,)
Best learning_rate: constant
Best alpha: 0.002
Best learning_rate_init: 0.0001


In [13]:
best_mlp_exp = MLPRegressor(hidden_layer_sizes=(200, 200), 
                         learning_rate='adaptive', learning_rate_init=0.01, alpha=0.002, 
                         max_iter=200, early_stopping=True, 
                         n_iter_no_change=5).fit(X_train, y_train)

In [14]:
print(median_absolute_error(y_val_exp, np.exp(best_mlp_exp.predict(X_val))))
print(mean_absolute_error(y_val_exp, np.exp(best_mlp_exp.predict(X_val))))

26.958491956908063
59.75541539754243


### Training neural net using the best model

In [15]:
final_mlp = MLPRegressor(hidden_layer_sizes=(200, 200), 
                         learning_rate='adaptive', learning_rate_init=0.0001, alpha=0.02, 
                         max_iter=200, early_stopping=True, 
                         n_iter_no_change=5).fit(X_train_final, y_train_final)

**Training error**

In [32]:
y_train_final_exp = np.exp(y_train_final)

medianAE_train_us = median_absolute_error(y_train_final_exp, np.exp(final_mlp.predict(X_train_final)))
meanAE_train_us = mean_absolute_error(y_train_final_exp, np.exp(final_mlp.predict(X_train_final)))
RMSE_train_us = mean_squared_error(y_train_final_exp, np.exp(final_mlp.predict(X_train_final)), squared=False)
R2_train_us = r2_score(y_train_final_exp, np.exp(final_mlp.predict(X_train_final)))

In [33]:
print('Median Absolute Error (train):', medianAE_train_us)
print('Mean Absolute Error (Train):', meanAE_train_us)
print('RMSE (Train):', RMSE_train_us)
print('R2 (Train):', R2_train_us)

Median Absolute Error (train): 30.234438930834358
Mean Absolute Error (Train): 61.02095566022959
RMSE (Train): 130.39419346009564
R2 (Train): 0.4023927931578193


**US testing error**

In [34]:
medianAE_test_us = median_absolute_error(y_test_exp, np.exp(final_mlp.predict(X_test)))
meanAE_test_us = mean_absolute_error(y_test_exp, np.exp(final_mlp.predict(X_test)))
RMSE_test_us = mean_squared_error(y_test_exp, np.exp(final_mlp.predict(X_test)), squared=False)
R2_test_us = r2_score(y_test_exp, np.exp(final_mlp.predict(X_test)))

In [35]:
print('Median Absolute Error (Test):', medianAE_test_us)
print('Mean Absolute Error (Test):', meanAE_test_us)
print('RMSE (Test):', RMSE_test_us)
print('R2 (Test):', R2_test_us)

Median Absolute Error (Test): 30.669548154009675
Mean Absolute Error (Test): 62.1781519862981
RMSE (Test): 134.7233984529619
R2 (Test): 0.3583993176562267


## Test on International data

#### Load data

In [68]:
madrid = pd.read_csv('../data/processed/madrid-test.csv')
london = pd.read_csv('../data/processed/london-test.csv')
paris = pd.read_csv('../data/processed/paris-test.csv')

In [69]:
print(madrid.shape)
print(london.shape)
print(paris.shape)

(20149, 105)
(76510, 105)
(64628, 105)


In [70]:
y_madrid = madrid.log_price
X_madrid = madrid.drop('log_price', axis=1)

y_london = london.log_price
X_london = london.drop('log_price', axis=1)

y_paris = paris.log_price
X_paris = paris.drop('log_price', axis=1)

#### Madrid

In [71]:
y_madrid_pred = final_mlp.predict(X_madrid)

In [72]:
# cost-index transform
madrid_col = 100/75.8
y_madrid_pred_adj = np.log(np.exp(y_madrid_pred) / madrid_col)
y_madrid_exp = np.exp(y_madrid)

In [73]:
medianAE_test_madrid = median_absolute_error(y_madrid_exp, np.exp(y_madrid_pred_adj))
meanAE_test_madrid = mean_absolute_error(y_madrid_exp, np.exp(y_madrid_pred_adj))
RMSE_test_madrid = mean_squared_error(y_madrid_exp, np.exp(y_madrid_pred_adj), squared=False)
R2_test_madrid = r2_score(y_madrid_exp, np.exp(y_madrid_pred_adj))

In [74]:
print('Median Absolute Error (Madrid):', medianAE_test_madrid)
print('Mean Absolute Error (Madrid):', meanAE_test_madrid)
print('RMSE (Madrid):', RMSE_test_madrid)
print('R2 (Madrid):', R2_test_madrid)

Median Absolute Error (Madrid): 53.869180570776294
Mean Absolute Error (Madrid): 116.1035986895079
RMSE (Madrid): 465.3253105565991
R2 (Madrid): -0.038544649238528184


#### London

In [75]:
y_london_pred = final_mlp.predict(X_london)

# cost-index transform
london_col = 100/95
y_london_pred_adj = np.log(np.exp(y_london_pred) / london_col)

y_london_exp = np.exp(y_london)

In [76]:
medianAE_test_london = median_absolute_error(y_london_exp, np.exp(y_london_pred_adj))
meanAE_test_london = mean_absolute_error(y_london_exp, np.exp(y_london_pred_adj))
RMSE_test_london = mean_squared_error(y_london_exp, np.exp(y_london_pred_adj), squared=False)
R2_test_london = r2_score(y_london_exp, np.exp(y_london_pred_adj))

In [77]:
print('Median Absolute Error (london):', medianAE_test_london)
print('Mean Absolute Error (london):', meanAE_test_london)
print('RMSE (london):', RMSE_test_london)
print('R2 (london):', R2_test_london)

Median Absolute Error (london): 44.65812144170613
Mean Absolute Error (london): 296.07003568203504
RMSE (london): 60475.68350784525
R2 (london): -37123.29218845762


#### Paris

In [78]:
y_paris_pred = final_mlp.predict(X_paris)

# cost-index transform
paris_col = 100/89.3
y_paris_pred_adj = np.log(np.exp(y_paris_pred) / paris_col)

y_paris_exp = np.exp(y_paris)

In [79]:
medianAE_test_paris = median_absolute_error(y_paris_exp, np.exp(y_paris_pred_adj))
meanAE_test_paris = mean_absolute_error(y_paris_exp, np.exp(y_paris_pred_adj))
RMSE_test_paris = mean_squared_error(y_paris_exp, np.exp(y_paris_pred_adj), squared=False)
R2_test_paris = r2_score(y_paris_exp, np.exp(y_paris_pred_adj))

In [80]:
print('Median Absolute Error (Paris):', medianAE_test_paris)
print('Mean Absolute Error (Paris):', meanAE_test_paris)
print('RMSE (Paris):', RMSE_test_paris)
print('R2 (Paris):', R2_test_paris)

Median Absolute Error (Paris): 58.458503038943974
Mean Absolute Error (Paris): 482.1758467822378
RMSE (Paris): 65298.29468895368
R2 (Paris): -92665.74778219625


#### Combined international

In [81]:
X_international = pd.concat([X_madrid, X_london, X_paris])
y_international = pd.concat([y_madrid, y_london, y_paris])
print(X_international.shape)
print(y_international.shape)

(161287, 104)
(161287,)


In [82]:
y_inter_pred = final_mlp.predict(X_international)

# cost-index transform
# weighted avg
eu_col = 100/90.32
y_inter_pred_adj = np.log(np.exp(y_inter_pred) / eu_col)

y_eu_exp = np.exp(y_international)

In [83]:
medianAE_test_eu = median_absolute_error(y_eu_exp, np.exp(y_inter_pred_adj))
meanAE_test_eu = mean_absolute_error(y_eu_exp, np.exp(y_inter_pred_adj))
RMSE_test_eu = mean_squared_error(y_eu_exp, np.exp(y_inter_pred_adj), squared=False)
R2_test_eu = r2_score(y_eu_exp, np.exp(y_inter_pred_adj))

In [84]:
print('Median Absolute Error (Paris):', medianAE_test_eu)
print('Mean Absolute Error (Paris):', meanAE_test_eu)
print('RMSE (Paris):', RMSE_test_eu)
print('R2 (Paris):', R2_test_eu)

Median Absolute Error (Paris): 51.38537679335537
Mean Absolute Error (Paris): 345.5028172301994
RMSE (Paris): 57584.90854968894
R2 (Paris): -36344.35753380213
