In [1]:
import pandas as pd
import numpy as np
pd.set_option('max.rows',200)

## Train & Evaluate Model in US

### Load data

In [2]:
train = pd.read_csv('../data/processed/us-train.csv')
val = pd.read_csv('../data/processed/us-val.csv')
us = pd.read_csv('../data/processed/us-test.csv')

In [3]:
y_train = train.log_price
y_val = val.log_price

In [4]:
X_train = train.drop(columns=['log_price'])
X_val = val.drop(columns=['log_price'])

In [None]:
# X_train_mean = X_train.copy().mean()
# X_train_stdev = X_train.copy().std()

# # standardize data
# X_train = (X_train - X_train_mean)/X_train_stdev
# X_val = (X_val - X_train_mean)/X_train_stdev

In [None]:
# X_train.fillna(0, inplace=True)
# X_train.replace([np.inf, -np.inf], 0, inplace=True)
# X_val.fillna(0, inplace=True)
# X_val.replace([np.inf, -np.inf], 0, inplace=True)

### Train Random Forest

### Default Parameters

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error

In [6]:
rf = RandomForestRegressor().fit(X_train, y_train)

In [8]:
y_pred_train = rf.predict(X_train)
y_pred = rf.predict(X_val)

acc_train = rf.score(X_train, y_train)
acc = rf.score(X_val, y_val)

rmse_train = mean_squared_error(np.exp(y_train), np.exp(y_pred_train), squared=False)
rmse = mean_squared_error(np.exp(y_val), np.exp(y_pred), squared=False)

mae_train = mean_absolute_error(np.exp(y_train), np.exp(y_pred_train))
mae = mean_absolute_error(np.exp(y_val), np.exp(y_pred))

medae_train = median_absolute_error(np.exp(y_train), np.exp(y_pred_train))
medae = median_absolute_error(np.exp(y_val), np.exp(y_pred))

In [9]:
print('R^2 (Train):', round(acc_train,2)) 
print('R^2 (Val):', round(acc,2))
print('')
print('RMSE (Train):', round(rmse_train,2))
print('RMSE (Val):', round(rmse,2))
print('')
print('Mean Abs Error (Train):', round(mae_train,2))
print('Mean Abs Error (Val):', round(mae,2))
print('')
print('Median Absolute Error (Train):', round(medae_train,2))
print('Median Absolute Error (Val):', round(medae,2))

R^2 (Train): 0.95
R^2 (Val): 0.67

RMSE (Train): 59.11
RMSE (Val): 116.66

Mean Abs Error (Train): 21.97
Mean Abs Error (Val): 52.53

Median Absolute Error (Train): 8.71
Median Absolute Error (Val): 24.43


### Random Search Parameters

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# rf parameters input into random search
n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# produce random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# random search best hyper parameters of model
rf_r = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf_r, param_distributions = random_grid, n_iter = 15, cv = 5, 
                               verbose=2, random_state=0, n_jobs = -1)

In [None]:
# fit the random search model on 3 cross validations with 20 iters each
rf_random.fit(X_train, y_train)

In [None]:
# get best random search parameters
rf_random.best_params_

In [10]:
rf_opt = RandomForestRegressor(n_estimators=100, min_samples_split=2, 
                                min_samples_leaf=2, max_features='auto',
                               max_depth=15, bootstrap=True).fit(X_train, y_train)

In [11]:
y_pred_train = rf_opt.predict(X_train)
y_pred = rf_opt.predict(X_val)

acc_train = rf.score(X_train, y_train)
acc = rf.score(X_val, y_val)

rmse_train = mean_squared_error(np.exp(y_train), np.exp(y_pred_train), squared=False)
rmse = mean_squared_error(np.exp(y_val), np.exp(y_pred), squared=False)

mae_train = mean_absolute_error(np.exp(y_train), np.exp(y_pred_train))
mae = mean_absolute_error(np.exp(y_val), np.exp(y_pred))

medae_train = median_absolute_error(np.exp(y_train), np.exp(y_pred_train))
medae = median_absolute_error(np.exp(y_val), np.exp(y_pred))

In [12]:
print('R^2 (Train):', round(acc_train,2)) 
print('R^2 (Val):', round(acc,2))
print('')
print('RMSE (Train):', round(rmse_train,2))
print('RMSE (Val):', round(rmse,2))
print('')
print('Mean Abs Error (Train):', round(mae_train,2))
print('Mean Abs Error (Val):', round(mae,2))
print('')
print('Median Absolute Error (Train):', round(medae_train,2))
print('Median Absolute Error (Val):', round(medae,2))

R^2 (Train): 0.95
R^2 (Val): 0.67

RMSE (Train): 82.39
RMSE (Val): 117.44

Mean Abs Error (Train): 36.86
Mean Abs Error (Val): 53.22

Median Absolute Error (Train): 18.28
Median Absolute Error (Val): 24.95


## Plotting & Interpretting Training & Validation Errors

In [13]:
import matplotlib.pyplot as plt

In [None]:
diff_train = y_train - y_pred_train
diff_val = y_val - y_pred

In [None]:
plt.hist(diff_train, bins=40, label='train')
plt.hist(diff_val, bins=40, label='val')
plt.title('Distribution of model errors')
plt.ylabel('Number of observations')
plt.xlabel('Log Price')
plt.legend()
plt.show()

## US & International Training Results

In [14]:
# create full train set (val + existing train)
X = pd.concat([X_train, X_val]).reset_index(drop=True)
y = pd.concat([y_train, y_val]).reset_index(drop=True)

In [15]:
# create test set
y_test = us.log_price
X_test = us.drop(columns=['log_price'])

In [None]:
# standardize data
# X_test = (X_test - X_train_mean)/X_train_stdev
# X_test.fillna(0, inplace=True)
# X_test.replace([np.inf, -np.inf], 0, inplace=True)

In [17]:
rf_final = RandomForestRegressor(n_estimators=100, min_samples_split=2, 
                                min_samples_leaf=2, max_features='auto',
                               max_depth=15, bootstrap=True).fit(X, y)

In [19]:
y_pred_train = rf_final.predict(X)
y_pred = rf_final.predict(X_test)

acc_train = rf.score(X, y)
acc = rf.score(X_test, y_test)

rmse_train = mean_squared_error(np.exp(y), np.exp(y_pred_train), squared=False)
rmse = mean_squared_error(np.exp(y_test), np.exp(y_pred), squared=False)

mae_train = mean_absolute_error(np.exp(y), np.exp(y_pred_train))
mae = mean_absolute_error(np.exp(y_test), np.exp(y_pred))

medae_train = median_absolute_error(np.exp(y), np.exp(y_pred_train))
medae = median_absolute_error(np.exp(y_test), np.exp(y_pred))

In [20]:
print('R^2 (Train):', round(acc_train,2)) 
print('R^2 (Test):', round(acc,2))
print('')
print('RMSE (Train):', round(rmse_train,2))
print('RMSE (Test):', round(rmse,2))
print('')
print('Mean Abs Error (Train):', round(mae_train,2))
print('Mean Abs Error (Test):', round(mae,2))
print('')
print('Median Absolute Error (Train):', round(medae_train,2))
print('Median Absolute Error (Test):', round(medae,2))

R^2 (Train): 0.88
R^2 (Test): 0.66

RMSE (Train): 83.0
RMSE (Test): 119.46

Mean Abs Error (Train): 37.66
Mean Abs Error (Test): 53.63

Median Absolute Error (Train): 18.82
Median Absolute Error (Test): 24.94


In [None]:
plt.hist(y_test, bins=40, label='actual')
plt.hist(y_pred, bins=40, label='predictions')
plt.title('Distribution of model errors')
plt.ylabel('Number of observations')
plt.xlabel('Log Price')
plt.legend()
plt.show()

In [None]:
diff_train = y - y_pred_train
diff_test = y_test - y_pred

plt.hist(diff_train, bins=40, label='train')
plt.hist(diff_val, bins=40, label='test')
plt.title('Distribution of model errors')
plt.ylabel('Number of observations')
plt.xlabel('Log Price')
plt.legend()
plt.show()

## Import international data

In [21]:
madrid = pd.read_csv('../data/processed/madrid_test.csv')
london = pd.read_csv('../data/processed/london_test.csv')
paris = pd.read_csv('../data/processed/paris_test.csv')

In [22]:
madrid_y = madrid.log_price
madrid_X = madrid.drop(columns=['log_price', 'price'])

london_y = london.log_price
london_X = london.drop(columns=['log_price', 'price'])

paris_y = paris.log_price
paris_X = paris.drop(columns=['log_price', 'price'])

In [23]:
# standardize madrid data
madrid_X = (madrid_X - X_train_mean)/X_train_stdev
madrid_X.fillna(0, inplace=True)
madrid_X.replace([np.inf, -np.inf], 0, inplace=True)

# standardize london data
london_X = (london_X - X_train_mean)/X_train_stdev
london_X.fillna(0, inplace=True)
london_X.replace([np.inf, -np.inf], 0, inplace=True)

# standardize madrid data
paris_X = (paris_X - X_train_mean)/X_train_stdev
paris_X.fillna(0, inplace=True)
paris_X.replace([np.inf, -np.inf], 0, inplace=True)

NameError: name 'X_train_mean' is not defined

#### Madrid

In [24]:
y_pred = rf_final.predict(madrid_X)

acc = r2_score(np.exp(madrid_y), np.exp(y_pred))

rmse = mean_squared_error(np.exp(madrid_y), np.exp(y_pred), squared=False)

mae = mean_absolute_error(np.exp(madrid_y), np.exp(y_pred))

medae = median_absolute_error(np.exp(madrid_y), np.exp(y_pred))

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
print('R^2 (Madrid):', round(acc,2))
print('')
print('RMSE (Madrid):', round(rmse,2))
print('')
print('Mean Abs Error (Madrid):', round(mae,2))
print('')
print('Median Absolute Error (Madrid):', round(medae,2))

#### London

In [None]:
y_pred = rf_final.predict(london_X)

acc = r2_score(np.exp(london_y), np.exp(y_pred))

rmse = mean_squared_error(np.exp(london_y), np.exp(y_pred), squared=False)

mae = mean_absolute_error(np.exp(london_y), np.exp(y_pred))

medae = median_absolute_error(np.exp(london_y), np.exp(y_pred))

In [None]:
print('R^2 (London):', round(acc,2))
print('')
print('RMSE (London):', round(rmse,2))
print('')
print('Mean Abs Error (London):', round(mae,2))
print('')
print('Median Absolute Error (London):', round(medae,2))

#### Paris

In [None]:
y_pred = rf_final.predict(paris_X)

acc = r2_score(np.exp(paris_y), np.exp(y_pred))

rmse = mean_squared_error(np.exp(paris_y), np.exp(y_pred), squared=False)

mae = mean_absolute_error(np.exp(paris_y), np.exp(y_pred))

medae = median_absolute_error(np.exp(paris_y), np.exp(y_pred))

In [None]:
print('R^2 (Paris):', round(acc,2))
print('')
print('RMSE (Paris):', round(rmse,2))
print('')
print('Mean Abs Error (Paris):', round(mae,2))
print('')
print('Median Absolute Error (Paris):', round(medae,2))