In [27]:
import pandas as pd
import numpy as np

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score

#### load data

In [3]:
df_train = pd.read_csv("../data/processed/us-train.csv")
df_val = pd.read_csv("../data/processed/us-val.csv")
df_test = pd.read_csv("../data/processed/us-test.csv")

In [4]:
# df_train = df_train.fillna(method='ffill')
# df_val = df_val.fillna(method='ffill')
# df_test = df_test.fillna(method='ffill')

In [5]:
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(44466, 105)
(14822, 105)
(14823, 105)


#### train-test split

In [6]:
y_train = df_train.log_price
X_train = df_train.drop('log_price', axis=1)

y_val = df_val.log_price
X_val = df_val.drop('log_price', axis=1)

y_test = df_test.log_price
X_test = df_test.drop('log_price', axis=1)

y_train_final = pd.concat([y_train, y_val], axis=0)
X_train_final = pd.concat([X_train, X_val], axis=0)

y_val_exp = np.exp(y_val)
y_test_exp = np.exp(y_test)

In [7]:
X_train_final.shape

(59288, 104)

### Neural Network

**response: original price**

#### default model:

In [8]:
# defualt hyperparameters
default_mlp = MLPRegressor().fit(X_train, y_train)

In [9]:
print(median_absolute_error(y_val_exp, np.exp(default_mlp.predict(X_val))))
print(mean_absolute_error(y_val_exp, np.exp(default_mlp.predict(X_val))))

42.16969163194098
72.45569510909544


#### Random Search

In [10]:
mlp = MLPRegressor(max_iter=200, early_stopping=True, n_iter_no_change=5)
# parameter
layers = [(200,), (220,), (300,), (200, 200), (120, 120)]

param_dist_mlp = dict(hidden_layer_sizes=layers, 
                      learning_rate=['constant','adaptive'],
                      learning_rate_init = [0.001, 0.0001, 0.01],
                      alpha=[0.0001, 0.002 ,0.05])

# randomized search
rand_mlp = RandomizedSearchCV(mlp, param_dist_mlp, random_state=0, 
                              n_iter=10, cv=3, verbose=0, n_jobs=-1)

In [11]:
best_model_mlp = rand_mlp.fit(X_train, y_train)

In [12]:
print('Best hidden_layer_sizes:', best_model_mlp.best_estimator_.get_params()['hidden_layer_sizes'])
print('Best learning_rate:', best_model_mlp.best_estimator_.get_params()['learning_rate'])
print('Best alpha:', best_model_mlp.best_estimator_.get_params()['alpha'])
print('Best learning_rate_init:', best_model_mlp.best_estimator_.get_params()['learning_rate_init'])

Best hidden_layer_sizes: (300,)
Best learning_rate: constant
Best alpha: 0.002
Best learning_rate_init: 0.0001


In [13]:
best_mlp_exp = MLPRegressor(hidden_layer_sizes=(200, 200), 
                         learning_rate='adaptive', learning_rate_init=0.01, alpha=0.002, 
                         max_iter=200, early_stopping=True, 
                         n_iter_no_change=5).fit(X_train, y_train)

In [14]:
print(median_absolute_error(y_val_exp, np.exp(best_mlp_exp.predict(X_val))))
print(mean_absolute_error(y_val_exp, np.exp(best_mlp_exp.predict(X_val))))

26.958491956908063
59.75541539754243


### Training neural net using the best model

In [15]:
final_mlp = MLPRegressor(hidden_layer_sizes=(200, 200), 
                         learning_rate='adaptive', learning_rate_init=0.0001, alpha=0.02, 
                         max_iter=200, early_stopping=True, 
                         n_iter_no_change=5).fit(X_train_final, y_train_final)

**Training error**

In [32]:
y_train_final_exp = np.exp(y_train_final)

medianAE_train_us = median_absolute_error(y_train_final_exp, np.exp(final_mlp.predict(X_train_final)))
meanAE_train_us = mean_absolute_error(y_train_final_exp, np.exp(final_mlp.predict(X_train_final)))
RMSE_train_us = mean_squared_error(y_train_final_exp, np.exp(final_mlp.predict(X_train_final)), squared=False)
R2_train_us = r2_score(y_train_final_exp, np.exp(final_mlp.predict(X_train_final)))

In [33]:
print('Median Absolute Error (train):', medianAE_train_us)
print('Mean Absolute Error (Train):', meanAE_train_us)
print('RMSE (Train):', RMSE_train_us)
print('R2 (Train):', R2_train_us)

Median Absolute Error (train): 30.234438930834358
Mean Absolute Error (Train): 61.02095566022959
RMSE (Train): 130.39419346009564
R2 (Train): 0.4023927931578193


**US testing error**

In [34]:
medianAE_test_us = median_absolute_error(y_test_exp, np.exp(final_mlp.predict(X_test)))
meanAE_test_us = mean_absolute_error(y_test_exp, np.exp(final_mlp.predict(X_test)))
RMSE_test_us = mean_squared_error(y_test_exp, np.exp(final_mlp.predict(X_test)), squared=False)
R2_test_us = r2_score(y_test_exp, np.exp(final_mlp.predict(X_test)))

In [35]:
print('Median Absolute Error (Test):', medianAE_test_us)
print('Mean Absolute Error (Test):', meanAE_test_us)
print('RMSE (Test):', RMSE_test_us)
print('R2 (Test):', R2_test_us)

Median Absolute Error (Test): 30.669548154009675
Mean Absolute Error (Test): 62.1781519862981
RMSE (Test): 134.7233984529619
R2 (Test): 0.3583993176562267


## Test on International data

#### Load data

In [25]:
madrid = pd.read_csv('../data/processed/madrid-test.csv')
london = pd.read_csv('../data/processed/london-test.csv')
paris = pd.read_csv('../data/processed/paris-test.csv')

In [26]:
print(madrid.shape)
print(london.shape)
print(paris.shape)

(20149, 105)
(76510, 105)
(64628, 105)


#### Transformation

#### Madrid

In [None]:
medianAE_test_madrid = median_absolute_error(madrid, np.exp(final_mlp.predict(X_test)))
meanAE_test_madrid = mean_absolute_error(madrid, np.exp(final_mlp.predict(X_test)))
RMSE_test_madrid = mean_squared_error(madrid, np.exp(final_mlp.predict(X_test)))
R2_test_madrid = final_mlp.score(X_test, y_test)

In [None]:
print('Median Absolute Error (Madrid):', medianAE_test_madrid)
print('Mean Absolute Error (Madrid):', meanAE_test_madrid)
print('RMSE (Madrid):', RMSE_test_madrid)
print('R2 (Madrid):', R2_test_madrid)

#### London

In [None]:
medianAE_test_madrid = median_absolute_error(y_test_exp, np.exp(final_mlp.predict(X_test)))
meanAE_test_madrid = mean_absolute_error(y_test_exp, np.exp(final_mlp.predict(X_test)))
RMSE_test_madrid = mean_squared_error(y_test_exp, np.exp(final_mlp.predict(X_test)))
R2_test_madrid = final_mlp.score(X_test, y_test)

In [None]:
print('Median Absolute Error (Madrid):', medianAE_test_madrid)
print('Mean Absolute Error (Madrid):', meanAE_test_madrid)
print('RMSE (Madrid):', RMSE_test_madrid)
print('R2 (Madrid):', R2_test_madrid)

#### Paris

In [None]:
medianAE_test_madrid = median_absolute_error(y_test_exp, np.exp(final_mlp.predict(X_test)))
meanAE_test_madrid = mean_absolute_error(y_test_exp, np.exp(final_mlp.predict(X_test)))
RMSE_test_madrid = mean_squared_error(y_test_exp, np.exp(final_mlp.predict(X_test)))
R2_test_madrid = final_mlp.score(X_test, y_test)

In [None]:
print('Median Absolute Error (Madrid):', medianAE_test_madrid)
print('Mean Absolute Error (Madrid):', meanAE_test_madrid)
print('RMSE (Madrid):', RMSE_test_madrid)
print('R2 (Madrid):', R2_test_madrid)