In [1]:
import pandas as pd
import numpy as np

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error

#### load data

In [2]:
df = pd.read_csv("../data/processed/cleaned_train.csv")

In [3]:
df.head()

Unnamed: 0,log_price,accommodates,bathrooms,host_has_profile_pic,host_identity_verified,host_response_rate,instant_bookable,number_of_reviews,review_scores_rating,thumbnail_url,...,x0_Tipi,x0_Townhouse,x0_Train,x0_Treehouse,x0_Vacation home,x0_Villa,x0_Yurt,x1_Entire home/apt,x1_Private room,x1_Shared room
0,5.010635,3,1.0,1,1,100.0,0,2,100.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,5.129899,7,1.0,1,0,100.0,1,6,93.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,4.976734,5,1.0,1,1,100.0,1,10,92.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,6.620073,4,1.0,1,1,100.0,0,0,40.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4.744932,2,1.0,1,1,100.0,1,4,40.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [4]:
df.shape

(74111, 105)

#### train-test split

In [5]:
from sklearn.model_selection import train_test_split

y = df.log_price
X = df.drop('log_price', axis=1)

r_state = 2021
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=r_state)

### Neural Network

**response: original price**

#### default model:

In [6]:
y_test_exp = np.exp(y_test)

In [7]:
# defualt hyperparameters
default_mlp = MLPRegressor().fit(X_train, y_train)

In [9]:
print(median_absolute_error(y_test_exp, np.exp(default_mlp.predict(X_test))))
print(mean_absolute_error(y_test_exp, np.exp(default_mlp.predict(X_test))))

74.91952592930951
121.94548732812079


#### Random Search

In [10]:
mlp = MLPRegressor(max_iter=200, early_stopping=True, n_iter_no_change=5)
# parameter
layers = [(200,), (220,), (300,), (200, 200), (120, 120)]

param_dist_mlp = dict(hidden_layer_sizes=layers, 
                      learning_rate=['constant','adaptive'],
                      learning_rate_init = [0.001, 0.0001, 0.01],
                      alpha=[0.0001, 0.002 ,0.05])

# randomized search
rand_mlp = RandomizedSearchCV(mlp, param_dist_mlp, random_state=0, 
                              n_iter=10, cv=3, verbose=0, n_jobs=-1)

In [11]:
best_model_mlp = rand_mlp.fit(X_train, y_train)

In [16]:
print('Best hidden_layer_sizes:', best_model_mlp.best_estimator_.get_params()['hidden_layer_sizes'])
print('Best learning_rate:', best_model_mlp.best_estimator_.get_params()['learning_rate'])
print('Best alpha:', best_model_mlp.best_estimator_.get_params()['alpha'])
print('Best learning_rate_init:', best_model_mlp.best_estimator_.get_params()['learning_rate_init'])

Best hidden_layer_sizes: (200, 200)
Best learning_rate: adaptive
Best alpha: 0.002
Best learning_rate_init: 0.01


In [17]:
best_mlp_exp = MLPRegressor(hidden_layer_sizes=(200, 200), 
                         learning_rate='adaptive', learning_rate_init=0.01, alpha=0.02, 
                         max_iter=200, early_stopping=True, 
                         n_iter_no_change=5).fit(X_train, y_train)

In [18]:
print(median_absolute_error(y_test_exp, np.exp(best_mlp_exp.predict(X_test))))
print(mean_absolute_error(y_test_exp, np.exp(best_mlp_exp.predict(X_test))))

29.037170460319558
59.76801048386642


In [19]:
df_test = pd.read_csv("../data/processed/cleaned_test_us.csv")

In [20]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,accommodates,bathrooms,host_response_rate,number_of_reviews,review_scores_rating,thumbnail_url,bedrooms,beds,cc_dist,...,x3_f,x3_t,x4_f,x4_t,x0_Casa particular,x0_Hut,x0_Lighthouse,x0_Parking Space,x0_Island,log_price
0,0,2,1.0,100.0,6,97.0,1,1.0,1.0,12.474817,...,1.0,0.0,1.0,0.0,0,0,0,0,0,0
1,1,3,1.0,100.0,2,80.0,1,1.0,1.0,3.250882,...,0.0,1.0,1.0,0.0,0,0,0,0,0,0
2,2,1,1.0,100.0,2,100.0,1,1.0,1.0,4.039575,...,0.0,1.0,1.0,0.0,0,0,0,0,0,0
3,3,1,1.0,100.0,7,94.0,1,0.0,1.0,16.195648,...,0.0,1.0,1.0,0.0,0,0,0,0,0,0
4,4,2,1.0,100.0,0,98.0,1,1.0,1.0,8.232762,...,0.0,1.0,1.0,0.0,0,0,0,0,0,0
