### Import Libraries

In [72]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree, ensemble, model_selection, metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder

### Import Datasets

In [82]:
# features
x_train = pd.read_csv('training_set.csv')
x_test = pd.read_csv('testing_set.csv')

# labels
y_train = pd.read_csv('training_labels.csv')
y_test = pd.read_csv('testing_labels.csv')

print(x_train.columns)
print(x_test.columns)
print(y_train.columns)
print(y_test.columns)

Index(['district', 'street', 'propertyType', 'remaining_lease', 'school',
       'hawkercentre', 'supermarkets', 'Bus Stops Nearby', 'crime_number',
       'latitude', 'longitude', 'floor_area_sqm', 'floor_range', 'sentiment'],
      dtype='object')
Index(['district', 'street', 'propertyType', 'remaining_lease', 'school',
       'hawkercentre', 'supermarkets', 'Bus Stops Nearby', 'crime_number',
       'latitude', 'longitude', 'floor_area_sqm', 'floor_range', 'sentiment'],
      dtype='object')
Index(['price'], dtype='object')
Index(['price'], dtype='object')


In [89]:
# temporary until sentiment no longer has null values
x_train['sentiment'] = x_train['sentiment'].fillna(0)
x_test['sentiment'] = x_test['sentiment'].fillna(0)
# x_train['sentiment'].isnull().values.any()

In [88]:
x_train.head(2)

Unnamed: 0,district,street,propertyType,remaining_lease,school,hawkercentre,supermarkets,Bus Stops Nearby,crime_number,latitude,longitude,floor_area_sqm,floor_range,sentiment
0,20,1487,8,-0.028291,37,11,2,1,122.714286,1.384051,103.900061,0.333809,1,0.056981
1,21,668,6,0.130119,40,1,2,0,-1.0,1.358962,103.763673,-0.261162,2,


### Summary of Results
*Goal: Low RMSE, Low MAE, R_squared close to 1*

Decision Tree
- (a) Basic Decision Tree with Default Parameters
    - RMSE: 0.2806782357476952
    - MAE: 0.04629039317307514
    - R-Squared: 0.912337161235727
- (b) Decision Tree using Best Estimators from Grid Search
    - RMSE: 0.32020141166895727
    - MAE: 0.07018170145875972
    - R-Squared: 0.8859107990001199        
    
Random Forest
- (a) Basic Random Forest with Default Parameters
    - RMSE: 0.16367380664894712
    - MAE: 0.04204705728830404
    - R-Squared: 0.9701903813340523
- (b) Random Forest using Best Estimators from Grid Search
    - RMSE: 0.15622200217196733
    - MAE: 0.040458029531220296
    - R-Squared: 0.9728429586751479
    
Conclusions:
1. Random Forest better performance
2. For random forest, using Grid Search helped us in finding better estimators to improve the model slightly
3. For decision tree, the model with default parameters surprisingly had better results (dk why?)

### Decision Tree

#### (a) Basic Decision Tree with Default Parameters

In [117]:
# train model
dt = tree.DecisionTreeRegressor()
dt.fit(x_train,y_train)

# make predictions on test set
y_pred = dt.predict(x_test)
print(y_pred)

# compute metrics 
mse = mean_squared_error(y_test,y_pred)
rmse = np.sqrt(mse) # scale back
mae = mean_absolute_error(y_test,y_pred)
r_squared = r2_score(y_test,y_pred)


print('RMSE: {}'.format(rmse))
print('MAE: {}'.format(mae))
print('R-Squared: {}'.format(r_squared))

[ 0.03943235 -0.39462907 -0.30695848 ... -0.41731308 -0.39095059
 -0.3222855 ]
RMSE: 0.2806782357476952
MAE: 0.04629039317307514
R-Squared: 0.912337161235727


#### (b) Decision Tree using Best Estimators from Grid Search

In [160]:
params = {'max_leaf_nodes': [200,400,600,800], 'min_samples_split': [7,12,15,20]}
gsearch_dt = GridSearchCV(tree.DecisionTreeRegressor(random_state=42), 
                          params, 
                          scoring = 'neg_mean_squared_error',
                          verbose=1, 
                          cv=3, 
                          n_jobs=-1)
gsearch_dt.fit(x_train, y_train)
print('best score:', gsearch_dt.best_score_)
print('best parameters: ', gsearch_dt.best_params_)
print('best estimators: ', gsearch_dt.best_estimator_)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    5.4s finished


best score: -0.12369633605268354
best parameters:  {'max_leaf_nodes': 800, 'min_samples_split': 20}
best estimators:  DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=800, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=20, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=42, splitter='best')


In [161]:
# make predictions on test set
y_pred = gsearch_dt.predict(x_test)
print(y_pred)

# compute metrics 
mse = mean_squared_error(y_test,y_pred)
rmse = np.sqrt(mse) # scale back
mae = mean_absolute_error(y_test,y_pred)
r_squared = r2_score(y_test,y_pred)


print('RMSE: {}'.format(rmse))
print('MAE: {}'.format(mae))
print('R-Squared: {}'.format(r_squared))

# RMSE: 0.3219627489519811
# MAE: 0.07401432473228167
# R-Squared: 0.8846522021339777

[ 0.13546561 -0.39172244 -0.26942208 ... -0.41950762 -0.33568281
 -0.2747952 ]
RMSE: 0.32020141166895727
MAE: 0.07018170145875972
R-Squared: 0.8859107990001199


### Random Forest

#### (a) Basic Random Forest with Default Parameters

In [105]:
# train model
rforest = ensemble.RandomForestRegressor()
rforest.fit(x_train,y_train)

# make predictions on test set
y_pred = rforest.predict(x_test)
print(y_pred)

# compute metrics 
mse = mean_squared_error(y_test,y_pred)
rmse = np.sqrt(mse) # scale back
mae = mean_absolute_error(y_test,y_pred)
r_squared = r2_score(y_test,y_pred)


print('RMSE: {}'.format(rmse))
print('MAE: {}'.format(mae))
print('R-Squared: {}'.format(r_squared))

  This is separate from the ipykernel package so we can avoid doing imports until


[ 0.03640324 -0.40529669 -0.29708787 ... -0.41139684 -0.36526249
 -0.32182569]
RMSE: 0.16367380664894712
MAE: 0.04204705728830404
R-Squared: 0.9701903813340523


#### (b) Random Forest using Best Estimators from Grid Search

In [103]:
params = {'n_estimators': [10,50,100,150]}
grid_search_cv = GridSearchCV(ensemble.RandomForestRegressor(random_state=42), 
                              params, 
                              scoring = 'neg_mean_squared_error', 
                              verbose=1, 
                              cv=3,
                              n_jobs = -1)
grid_search_cv.fit(x_train, y_train)
print('best score:', grid_search_cv.best_score_)
print('best parameters: ', grid_search_cv.best_params_)
print('best estimators: ', grid_search_cv.best_estimator_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  2.9min finished
  self.best_estimator_.fit(X, y, **fit_params)


best score: -0.0944694264782742
best parameters:  {'n_estimators': 100}
best estimators:  RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)


In [104]:
# make predictions on test set
y_pred = grid_search_cv.predict(x_test)
print(y_pred)

# compute metrics 
mse = mean_squared_error(y_test,y_pred)
rmse = np.sqrt(mse) # scale back
mae = mean_absolute_error(y_test,y_pred)
r_squared = r2_score(y_test,y_pred)


print('RMSE: {}'.format(rmse))
print('MAE: {}'.format(mae))
print('R-Squared: {}'.format(r_squared))

[ 0.0389028  -0.40613798 -0.29271456 ... -0.41608578 -0.36667258
 -0.32493317]
RMSE: 0.15622200217196733
MAE: 0.040458029531220296
R-Squared: 0.9728429586751479
