In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

For comparing across ML models (i.e. RF, GB, LR), sample 20% and do 10-fold CV. Use default parameters for this.

For the top 2 models, use 60% of data to perform hyperparameter search.

For each model, we want the best set of parameters.

Output: Optimal parameters, as well as loss for each combination of parameters. Output: 10 different loss values/RMSE per model (boxplot)

In [5]:
train = pd.read_csv("../data/processed/train.csv")
train = train.drop(columns=['avg', 'Unnamed: 0'])
print(train.shape)
train.head()

(431732, 81)


Unnamed: 0,floor_area_sqm,resale_price,lease_duration,storey,flat_model_adjoined flat,flat_model_apartment,flat_model_dbss,flat_model_improved,flat_model_improved maisonette,flat_model_maisonette,...,flat_type_4 room,flat_type_5 room,flat_type_executive,flat_type_multi generation,nbr_ssch_1k,nbr_psch_1k,nbr_mall_1k,nbr_hawker_1k,nbr_comm_1k,nbr_station_1k
0,118.0,209700.0,12,2.0,0,0,0,0,0,0,...,1,0,0,0,5,6,1,2,1,1
1,110.0,402300.0,11,11.0,0,0,0,1,0,0,...,0,1,0,0,6,8,4,0,0,2
2,112.0,351000.0,16,2.0,0,0,0,0,0,0,...,0,1,0,0,1,2,2,0,0,0
3,67.0,151200.0,20,8.0,0,0,0,0,0,0,...,0,0,0,0,2,2,3,2,0,1
4,73.0,318600.0,28,8.0,0,0,0,0,0,0,...,0,0,0,0,2,2,1,1,0,1


In [6]:
test = pd.read_csv("../data/processed/test.csv")
test = test.drop(columns=['avg', 'Unnamed: 0'])
print(test.shape)
test.head()

(107934, 80)


Unnamed: 0,floor_area_sqm,lease_duration,storey,flat_model_adjoined flat,flat_model_apartment,flat_model_dbss,flat_model_improved,flat_model_improved maisonette,flat_model_maisonette,flat_model_model a,...,flat_type_4 room,flat_type_5 room,flat_type_executive,flat_type_multi generation,nbr_ssch_1k,nbr_psch_1k,nbr_mall_1k,nbr_hawker_1k,nbr_comm_1k,nbr_station_1k
0,94.0,15,5.0,0,0,0,0,0,0,0,...,1,0,0,0,4,3,2,2,1,102
1,122.0,4,5.0,0,0,0,1,0,0,0,...,0,1,0,0,3,2,0,0,0,117
2,67.0,20,2.0,0,0,0,0,0,0,0,...,0,0,0,0,1,2,4,1,1,116
3,82.0,34,5.0,0,0,0,0,0,0,0,...,0,0,0,0,3,2,3,3,0,72
4,117.0,26,2.0,0,0,0,0,0,0,0,...,0,1,0,0,5,2,3,2,0,101


In [7]:
y = train.resale_price
X = train.drop(columns=['resale_price'])
predictors = X.columns
print(X.shape)
print(y.shape)

(431732, 80)
(431732,)


In [12]:
seed = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

(345385, 80)
(345385,)
(86347, 80)
(86347,)


In [13]:
seed = 42
X_train, _, y_train, _ = train_test_split(X_train, y_train, test_size=0.33, random_state=seed)
print(X_train.shape) # about 240k rows
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

(231407, 80)
(231407,)
(86347, 80)
(86347,)


In [14]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [20]:
%%time

grid = {
    'max_depth' : [1, 10, 100, 1000, 10000],
    'n_estimators' : [1, 10, 100, 1000, 10000]
}

est=RandomForestRegressor()
print(est.get_params())

model = GridSearchCV(estimator=est,
                   param_grid=grid, scoring='neg_root_mean_squared_error',
                    verbose=3, n_jobs=-1)
model.fit(X_train, y_train)

# Store the parameters of the best model
best_params = model.best_params_

# Predict class labels of test data on the model with the best found parameters
y_pred = model.predict(X_test)

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
CPU times: user 441 µs, sys: 102 µs, total: 543 µs
Wall time: 494 µs


In [None]:
print(best_params)
print('Root Mean Squared Error RF:', np.sqrt(mean_squared_error(y_test, y_pred)))
pickle.dumps(model.best_estimator_)

In [19]:
%%time

grid = {
    'max_depth' : [1, 10, 100, 1000, 10000],
    'n_estimators' : [1, 10, 100, 1000, 10000]
}

est=GradientBoostingRegressor()
print(est.get_params())

model2 = GridSearchCV(estimator=est,
                   param_grid=grid, scoring='neg_root_mean_squared_error',
                    verbose=3, n_jobs=-1)
model2.fit(X_train, y_train)

# Store the parameters of the best model
best_params2 = model2.best_params_

# Predict class labels of test data on the model with the best found parameters
y_pred2 = model2.predict(X_test)

{'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'ls', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
CPU times: user 481 µs, sys: 107 µs, total: 588 µs
Wall time: 530 µs


In [None]:
print(best_params)
print('Root Mean Squared Error GB:', np.sqrt(mean_squared_error(y_test, y_pred2)))
pickle.dumps(model2.best_estimator_)