In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from joblib import dump, load

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import mean_squared_error

To compare ML models (i.e. RF, GB, LR), sample 20% of the data and run 10-fold CV, using the default parameters of each model.

For the top 2 models, use roughly 60% of the data for the hyperparameter search (the cells below keep 67% of the 80% training split, i.e. about 54% of all rows).

For each model, we want the best set of parameters.

Outputs: (1) the optimal parameters, together with the loss for each combination of parameters; (2) 10 loss (RMSE) values per model, shown as a boxplot.

In [5]:
# Load the processed training set. The shape is printed before the
# 'Unnamed: 0' column (apparently a saved row index) is removed.
train = pd.read_csv("../data/processed/train.csv")
print(train.shape)
train = train.drop(columns=['Unnamed: 0'])
train.head()

(431732, 88)


Unnamed: 0,floor_area_sqm,resale_price,lease_duration,storey,flat_model_adjoined flat,flat_model_apartment,flat_model_dbss,flat_model_improved,flat_model_improved maisonette,flat_model_maisonette,...,mall_nearest_dist,hawker_nearest_dist,comm_nearest_dist,station_nearest_dist,nbr_ssch_1k,nbr_psch_1k,nbr_mall_1k,nbr_hawker_1k,nbr_comm_1k,nbr_station_1k
0,118.0,209700.0,12,2.0,0,0,0,0,0,0,...,1.032567,0.944966,2.312122,1.136808,5,6,1,2,1,1
1,110.0,402300.0,11,11.0,0,0,0,1,0,0,...,0.805534,3.780019,1.461516,0.823036,6,8,4,0,0,2
2,112.0,351000.0,16,2.0,0,0,0,0,0,0,...,0.452272,1.802808,1.444013,2.229299,1,2,2,0,0,0
3,67.0,151200.0,20,8.0,0,0,0,0,0,0,...,0.456213,0.617904,2.385591,0.423055,2,2,3,2,0,1
4,73.0,318600.0,28,8.0,0,0,0,0,0,0,...,0.763692,0.761443,1.805955,0.773734,2,2,1,1,0,1


In [6]:
# Load the processed test set (no resale_price column, per the preview below).
# The shape is printed before the 'Unnamed: 0' column is removed.
test = pd.read_csv("../data/processed/test.csv")
print(test.shape)
test = test.drop(columns=['Unnamed: 0'])
test.head()

(107934, 87)


Unnamed: 0,floor_area_sqm,lease_duration,storey,flat_model_adjoined flat,flat_model_apartment,flat_model_dbss,flat_model_improved,flat_model_improved maisonette,flat_model_maisonette,flat_model_model a,...,mall_nearest_dist,hawker_nearest_dist,comm_nearest_dist,station_nearest_dist,nbr_ssch_1k,nbr_psch_1k,nbr_mall_1k,nbr_hawker_1k,nbr_comm_1k,nbr_station_1k
0,94.0,15,5.0,0,0,0,0,0,0,0,...,0.693506,0.792309,1.749017,0.66713,4,3,2,2,1,2
1,122.0,4,5.0,0,0,0,1,0,0,0,...,1.932765,2.069274,1.818631,1.832348,3,2,0,0,0,0
2,67.0,20,2.0,0,0,0,0,0,0,0,...,0.524283,0.787872,0.928551,0.517931,1,2,4,1,1,1
3,82.0,34,5.0,0,0,0,0,0,0,0,...,0.828608,0.957,2.026039,0.546579,3,2,3,3,0,1
4,117.0,26,2.0,0,0,0,0,0,0,0,...,0.604285,0.112928,1.826417,0.515554,5,2,3,2,0,1


In [7]:
# Separate the regression target (resale_price) from the feature matrix and
# keep the feature names around for later reference.
y = train['resale_price']
X = train.drop(columns='resale_price')
predictors = X.columns
print(X.shape, y.shape, sep='\n')

(431732, 86)
(431732,)


In [8]:
# Hold out 20% of the rows for final evaluation; the fixed seed makes the
# split reproducible.
seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=seed,
)
print(X_train.shape, y_train.shape, sep='\n')

print(X_test.shape, y_test.shape, sep='\n')

(345385, 86)
(345385,)
(86347, 86)
(86347,)


In [9]:
seed = 42

# Rebuild the 80/20 split from X and y rather than subsampling X_train from
# itself: the previous form overwrote X_train/y_train with a 67% sample of
# their current value, so every re-run of this cell shrank the training data
# again. With the same inputs and seed this reproduces the earlier split
# exactly, and it also resets X_test/y_test to their raw (unscaled) state.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

# Keep 67% of the training split for the hyperparameter search
# (about 54% of all rows); the remaining 33% is discarded.
X_train, _, y_train, _ = train_test_split(
    X_train_full, y_train_full, test_size=0.33, random_state=seed
)
print(X_train.shape) # about 231k rows
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

(231407, 86)
(231407,)
(86347, 86)
(86347,)


In [114]:
# Standardise the features: the scaler learns its statistics from the training
# subsample only and is then applied to both the training and hold-out data.
# NOTE: X_train and X_test are rebound to the transformed outputs, so
# re-running this cell on its own would refit the scaler on already-scaled
# data; re-run the split cells first.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
%%time

grid = {
    'max_depth' : [1, 10, 100, 250, None],
    'n_estimators' : [1, 10, 100, 250]
}

est=RandomForestRegressor(random_state=seed)
print(est.get_params())

model = GridSearchCV(estimator=est,
                   param_grid=grid, scoring='neg_root_mean_squared_error',
                    verbose=3, n_jobs=-1)
model.fit(X_train, y_train)

# Store the parameters of the best model
best_params = model.best_params_

# Predict class labels of test data on the model with the best found parameters
y_pred = model.predict(X_test)

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  9.5min


In [None]:
# Report the selected hyperparameters and the hold-out RMSE, then persist the
# best forest. NOTE(review): the model was fit on standardised features, but
# the scaler `sc` is not saved here -- confirm downstream users re-create it.
print(best_params)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error RF:', rmse_rf)
dump(model.best_estimator_, 'rf.joblib')

### GBM CV
Follow https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/

1. Choose a relatively high learning rate. The default value of 0.1 generally works, but anything between 0.05 and 0.2 can work depending on the problem.

2. Determine the optimum number of trees for this learning rate. This should be around 40-70. Remember to choose a value your system can train fairly fast, because it will be reused when testing various scenarios and determining the tree parameters.

3. Tune the tree-specific parameters for the chosen learning rate and number of trees. Different parameters can be chosen to define a tree; the search below tunes `max_depth` and `min_samples_split`.

4. Lower the learning rate and increase the number of estimators proportionally to get a more robust model.

In [58]:
%%time

grid = {
    'min_samples_split': [2, 4, 8, 16, 32],
    'max_depth': [1, 5, 10, 25, 50],
}

est=GradientBoostingRegressor(n_estimators = 50, random_state=seed)
print(est.get_params())

model2 = GridSearchCV(estimator=est,
                   param_grid=grid, scoring='neg_root_mean_squared_error',
                    verbose=3, n_jobs=-1)
model2.fit(X_train, y_train)

# Store the parameters of t3he best model
best_params2 = model2.best_params_

# Predict class labels of test data on the model with the best found parameters
y_pred2 = model2.predict(X_test)

{'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'ls', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_iter_no_change': None, 'random_state': 42, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
Fitting 5 folds for each of 25 candidates, totalling 125 fits
CPU times: user 3min 13s, sys: 1.36 s, total: 3min 14s
Wall time: 2h 50min 28s


In [59]:
# Display the estimator selected by the tree-parameter grid search (model2).
model2.best_estimator_

GradientBoostingRegressor(max_depth=25, min_samples_split=32, n_estimators=50,
                          random_state=42)

In [62]:
# Report the stage-1 GBM hyperparameters and their hold-out RMSE.
print(best_params2)
rmse_gb = np.sqrt(mean_squared_error(y_test, y_pred2))
print('Root Mean Squared Error GB:', rmse_gb)

{'max_depth': 25, 'min_samples_split': 32}
Root Mean Squared Error GB: 22054.93448333611


In [63]:
%%time

grid = {
    'learning_rate': [0.01, 0.001, 0.1, 0.05, 0.005],
    'n_estimators': [10, 25, 50, 100, 200],
}

est=GradientBoostingRegressor(max_depth=25, min_samples_split=32, random_state=seed)
print(est.get_params())

model2_2 = GridSearchCV(estimator=est,
                   param_grid=grid, scoring='neg_root_mean_squared_error',
                    verbose=3, n_jobs=-1)
model2_2.fit(X_train, y_train)

# Store the parameters of t3he best model
best_params2_2 = model2_2.best_params_

# Predict class labels of test data on the model with the best found parameters
y_pred2_2 = model2_2.predict(X_test)

{'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'ls', 'max_depth': 25, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 32, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': 42, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
Fitting 5 folds for each of 25 candidates, totalling 125 fits
CPU times: user 13min 40s, sys: 3.16 s, total: 13min 43s
Wall time: 7h 43min 3s


In [64]:
# Report the stage-2 GBM hyperparameters and their hold-out RMSE.
print(best_params2_2)
rmse_gb_2 = np.sqrt(mean_squared_error(y_test, y_pred2_2))
print('Root Mean Squared Error GB:', rmse_gb_2)
# Saving is currently disabled. To persist the stage-2 tuned model, use:
# dump(model2_2.best_estimator_, 'gbm.joblib')

{'learning_rate': 0.05, 'n_estimators': 200}
Root Mean Squared Error GB: 21703.547344987855
