### Import libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree, ensemble, model_selection, metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

### Import data

In [2]:
# Load the combined property dataset. The path is relative to the notebook's
# working directory so the cell does not depend on one machine's folder layout.
df = pd.read_csv('combined.csv')

# Show the available column names (only a cell's last expression is displayed).
df.columns

Index(['district', 'street', 'propertyType', 'remaining_lease', 'price',
       'school', 'hawkercentre', 'supermarkets', 'Bus Stops Nearby',
       'crime_number', 'latitude', 'longitude', 'floor_area_sqm',
       'floor_range', 'sentiment'],
      dtype='object')

In [3]:
# Rescale price to units of 100,000; the metric cells multiply RMSE by 100000 to undo this.
# NOTE: df['price'] is overwritten from its own current value, so running this cell
# twice divides twice. Reload the data before re-running it.
df['price'] = df['price'].div(100000)

In [7]:
# Fill missing values column by column:
#   'crime_number' -> -1
#   'sentiment'    -> 0
for column, fill_value in (('crime_number', -1), ('sentiment', 0)):
    df[column] = df[column].fillna(fill_value)

### Version 1: Using ALL variables as features
- Columns: ['district', 'street', 'propertyType', 'remaining_lease', 'price', 'school', 'hawkercentre', 'supermarkets', 'Bus Stops Nearby', 'crime_number', 'latitude', 'longitude', 'floor_area_sqm', 'floor_range', 'sentiment'] — `price` is the prediction target; every other column is used as a feature.

#### (a) One-Hot Encoding

In [8]:
# Split the frame into predictors and target: everything except 'price' is a feature.
features = df.drop(columns='price')
y = df['price']

# One-hot encode the non-numeric feature columns (pd.get_dummies defaults).
x = pd.get_dummies(features)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2019,
)

#### (b) Decision Tree Regressor

In [9]:
# Fit a single decision tree on the one-hot encoded training split.
# random_state is pinned (same seed as the train/test split) because scikit-learn
# permutes the features at every split, so an unseeded tree can change between runs.
dt = tree.DecisionTreeRegressor(random_state=2019)
dt.fit(x_train, y_train)

# Predict the held-out rows; values are in the scaled price units (price / 100000).
y_pred = dt.predict(x_test)
print(y_pred)

# Test-set metrics. RMSE is multiplied by 100000 to undo the earlier price scaling;
# R-squared is scale-free and needs no adjustment.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse) * 100000
r_squared = r2_score(y_test, y_pred)

print('RMSE: {}'.format(rmse))
print('R-Squared: {}'.format(r_squared))

[ 4.4    4.88  33.    ...  3.    21.544  4.6  ]
RMSE: 400191.1336473157
R-Squared: 0.9363929910719793


#### (c) Random Forest Regressor

In [24]:
# Random forest: 100 trees, default max_features.
# (Open question carried over from the original notes: use max_features=1 as in the lecture?)
# oob_score=True also computes an out-of-bag score during fit; this cell does not read it.
rforest = ensemble.RandomForestRegressor(
    n_estimators=100,
    oob_score=True,
    random_state=2019,
)
rforest.fit(x_train, y_train)

# Predictions for the held-out rows, in scaled price units.
y_pred = rforest.predict(x_test)
print(y_pred)

# Test-set metrics; the factor 100000 converts RMSE back to the unscaled price.
mse = mean_squared_error(y_test, y_pred)
rmse = 100000 * np.sqrt(mse)
r_squared = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}')
print(f'R-Squared: {r_squared}')

[ 4.39130203  4.9044     26.77847167 ...  2.96226667 19.683616
  4.5270386 ]
RMSE: 323792.58764054347
R-Squared: 0.9583606584066989


In [27]:
# Random forest: 50 trees, max_features=1 (one candidate feature per split).
# oob_score=True also computes an out-of-bag score during fit; this cell does not read it.
rforest = ensemble.RandomForestRegressor(
    n_estimators=50,
    max_features=1,
    oob_score=True,
    random_state=2019,
)
rforest.fit(x_train, y_train)

# Predictions for the held-out rows, in scaled price units.
y_pred = rforest.predict(x_test)
print(y_pred)

# Test-set metrics; the factor 100000 converts RMSE back to the unscaled price.
mse = mean_squared_error(y_test, y_pred)
rmse = 100000 * np.sqrt(mse)
r_squared = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}')
print(f'R-Squared: {r_squared}')

[ 4.47224267  4.9724     31.80016143 ...  2.9497328  18.1793328
  4.5428    ]
RMSE: 464593.0618490013
R-Squared: 0.9142734219387728


#### Best Performance of Version 1

In [10]:
# Random forest: 50 trees, default max_features.
# oob_score=True also computes an out-of-bag score during fit; this cell does not read it.
rforest = ensemble.RandomForestRegressor(
    n_estimators=50,
    oob_score=True,
    random_state=2019,
)
rforest.fit(x_train, y_train)

# Predictions for the held-out rows, in scaled price units.
y_pred = rforest.predict(x_test)
print(y_pred)

# Test-set metrics; the factor 100000 converts RMSE back to the unscaled price.
mse = mean_squared_error(y_test, y_pred)
rmse = 100000 * np.sqrt(mse)
r_squared = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}')
print(f'R-Squared: {r_squared}')

[ 4.38421333  4.8928     27.88497333 ...  2.96183333 19.7380592
  4.5484    ]
RMSE: 315879.81362467987
R-Squared: 0.9603709378184906


In [11]:
# Random forest: 10 trees, default max_features.
# oob_score is deliberately not requested: this cell never reads rforest.oob_score_,
# and with only 10 trees some training rows are never out-of-bag, which makes
# scikit-learn warn "Some inputs do not have OOB scores". The fitted trees and the
# test-set predictions do not depend on that option.
rforest = ensemble.RandomForestRegressor(n_estimators=10, random_state=2019)
rforest.fit(x_train, y_train)

# Predictions for the held-out rows, in scaled price units.
y_pred = rforest.predict(x_test)
print(y_pred)

# Test-set metrics; the factor 100000 converts RMSE back to the unscaled price.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse) * 100000
r_squared = r2_score(y_test, y_pred)

print('RMSE: {}'.format(rmse))
print('R-Squared: {}'.format(r_squared))

  warn("Some inputs do not have OOB scores. "


[ 4.377       4.904      26.59353333 ...  2.96166667 19.790888
  4.6       ]
RMSE: 315905.56007362035
R-Squared: 0.9603644774550032


### Version 2: Keep important variables, drop the rest
- Drop `street`, `remaining_lease`, `hawkercentre` and `crime_number`.
- Remaining columns (`price` is the prediction target, the rest are features): ['district', 'propertyType', 'price', 'school', 'supermarkets', 'Bus Stops Nearby', 'latitude', 'longitude', 'floor_area_sqm', 'floor_range', 'sentiment']

In [12]:
# Version 2 frame: copy of df without four columns (df itself is not modified).
df_v2 = df.drop(
    ['street', 'remaining_lease', 'hawkercentre', 'crime_number'],
    axis='columns',
)
df_v2.head()

Unnamed: 0,district,propertyType,price,school,supermarkets,Bus Stops Nearby,latitude,longitude,floor_area_sqm,floor_range,sentiment
0,5,Semi-detached,55.0,"['HILLGROVE SECONDARY SCHOOL', 'QUEENSTOWN SEC...","['NTUC FAIRPRICE CO-OPERATIVE LTD', 'COLD STOR...",0,1.28213,103.786879,524.3,-,0.129813
1,5,Semi-detached,50.0,"['HILLGROVE SECONDARY SCHOOL', 'QUEENSTOWN SEC...","['NTUC FAIRPRICE CO-OPERATIVE LTD', 'COLD STOR...",0,1.28213,103.786879,308.0,-,0.064396
2,5,Semi-detached,47.5,"['HILLGROVE SECONDARY SCHOOL', 'QUEENSTOWN SEC...","['NTUC FAIRPRICE CO-OPERATIVE LTD', 'COLD STOR...",0,1.28213,103.786879,314.0,-,0.064396
3,5,Terrace,26.3,"['HILLGROVE SECONDARY SCHOOL', 'QUEENSTOWN SEC...","['NTUC FAIRPRICE CO-OPERATIVE LTD', 'COLD STOR...",2,1.292047,103.768591,159.3,-,0.129813
4,4,Condominium,28.0,"['FAIRFIELD METHODIST SCHOOL (SECONDARY)', 'QU...","['NTUC FAIRPRICE CO-OPERATIVE LTD', 'THE QUALI...",0,1.244208,103.827487,223.0,01-05,0.074654


In [13]:
# Display the columns that remain in the Version 2 frame.
df_v2.columns

Index(['district', 'propertyType', 'price', 'school', 'supermarkets',
       'Bus Stops Nearby', 'latitude', 'longitude', 'floor_area_sqm',
       'floor_range', 'sentiment'],
      dtype='object')

#### (a) One-Hot Encoding

In [14]:
# Split the Version 2 frame into predictors and target ('price').
features2 = df_v2.drop(columns='price')
y2 = df_v2['price']

# One-hot encode the non-numeric feature columns (pd.get_dummies defaults).
x2 = pd.get_dummies(features2)

# Same 80/20 split settings and seed as Version 1.
x2_train, x2_test, y2_train, y2_test = model_selection.train_test_split(
    x2,
    y2,
    test_size=0.2,
    random_state=2019,
)

#### (b) Decision Tree Regressor

In [15]:
# Fit a single decision tree on the Version 2 training split.
# random_state is pinned (same seed as the train/test split) because scikit-learn
# permutes the features at every split, so an unseeded tree can change between runs.
dt2 = tree.DecisionTreeRegressor(random_state=2019)
dt2.fit(x2_train, y2_train)

# Predict the held-out rows; values are in the scaled price units (price / 100000).
y2_pred = dt2.predict(x2_test)
print(y2_pred)

# Test-set metrics. RMSE is multiplied by 100000 to undo the earlier price scaling;
# R-squared is scale-free and needs no adjustment.
mse = mean_squared_error(y2_test, y2_pred)
rmse = np.sqrt(mse) * 100000
r_squared = r2_score(y2_test, y2_pred)

print('RMSE: {}'.format(rmse))
print('R-Squared: {}'.format(r_squared))

[ 4.38        4.79       45.765      ...  2.99296    21.544
  4.93251368]
RMSE: 679435.8762341171
R-Squared: 0.8166558746205472


#### (c) Random Forest Regressor

In [25]:
# Version 2 random forest: 100 trees, default max_features.
# oob_score=True also computes an out-of-bag score during fit; this cell does not read it.
rforest2 = ensemble.RandomForestRegressor(
    n_estimators=100,
    oob_score=True,
    random_state=2019,
)
rforest2.fit(x2_train, y2_train)

# Predictions for the held-out rows, in scaled price units.
y2_pred = rforest2.predict(x2_test)
print(y2_pred)

# Test-set metrics; the factor 100000 converts RMSE back to the unscaled price.
mse = mean_squared_error(y2_test, y2_pred)
rmse = 100000 * np.sqrt(mse)
r_squared = r2_score(y2_test, y2_pred)

print(f'RMSE: {rmse}')
print(f'R-Squared: {r_squared}')

[ 4.34542684  4.82392    43.73329722 ...  2.98915666 19.8719988
  4.9384801 ]
RMSE: 348566.0376505628
R-Squared: 0.9517452368978099


In [28]:
# Version 2 random forest: 50 trees, max_features=1 (one candidate feature per split).
# oob_score=True also computes an out-of-bag score during fit; this cell does not read it.
rforest2 = ensemble.RandomForestRegressor(
    n_estimators=50,
    max_features=1,
    oob_score=True,
    random_state=2019,
)
rforest2.fit(x2_train, y2_train)

# Predictions for the held-out rows, in scaled price units.
y2_pred = rforest2.predict(x2_test)
print(y2_pred)

# Test-set metrics; the factor 100000 converts RMSE back to the unscaled price.
mse = mean_squared_error(y2_test, y2_pred)
rmse = 100000 * np.sqrt(mse)
r_squared = r2_score(y2_test, y2_pred)

print(f'RMSE: {rmse}')
print(f'R-Squared: {r_squared}')

[ 4.47558888  4.7937632  36.04833476 ...  2.99847337 17.7379088
  4.95928658]
RMSE: 411809.8891707914
R-Squared: 0.9326459691099857


#### Best Performance of Version 2

In [18]:
# Version 2 random forest: 50 trees, default max_features.
# oob_score=True also computes an out-of-bag score during fit; this cell does not read it.
rforest2 = ensemble.RandomForestRegressor(
    n_estimators=50,
    oob_score=True,
    random_state=2019,
)
rforest2.fit(x2_train, y2_train)

# Predictions for the held-out rows, in scaled price units.
y2_pred = rforest2.predict(x2_test)
print(y2_pred)

# Test-set metrics; the factor 100000 converts RMSE back to the unscaled price.
mse = mean_squared_error(y2_test, y2_pred)
rmse = 100000 * np.sqrt(mse)
r_squared = r2_score(y2_test, y2_pred)

print(f'RMSE: {rmse}')
print(f'R-Squared: {r_squared}')

[ 4.34330118  4.83344    43.04468111 ...  2.99367337 19.8255376
  4.95928658]
RMSE: 343927.83448579506
R-Squared: 0.9530208992871184


### Version 3: Variables Vicki used
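- Columns (`street` and `sentiment` are dropped; `price` is the prediction target, the rest are features): ['district', 'propertyType', 'remaining_lease', 'price', 'school', 'hawkercentre', 'supermarkets', 'Bus Stops Nearby', 'crime_number', 'latitude', 'longitude', 'floor_area_sqm', 'floor_range']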

In [19]:
# Version 3 frame: copy of df without 'street' and 'sentiment' (df itself is not modified).
df_v3 = df.drop(
    ['street', 'sentiment'],
    axis='columns',
)
df_v3.head()

Unnamed: 0,district,propertyType,remaining_lease,price,school,hawkercentre,supermarkets,Bus Stops Nearby,crime_number,latitude,longitude,floor_area_sqm,floor_range
0,5,Semi-detached,999.0,55.0,"['HILLGROVE SECONDARY SCHOOL', 'QUEENSTOWN SEC...","['Pasir Panjang Food Centre', 'Clementi West S...","['NTUC FAIRPRICE CO-OPERATIVE LTD', 'COLD STOR...",0,-1.0,1.28213,103.786879,524.3,-
1,5,Semi-detached,999.0,50.0,"['HILLGROVE SECONDARY SCHOOL', 'QUEENSTOWN SEC...","['Pasir Panjang Food Centre', 'Clementi West S...","['NTUC FAIRPRICE CO-OPERATIVE LTD', 'COLD STOR...",0,-1.0,1.28213,103.786879,308.0,-
2,5,Semi-detached,999.0,47.5,"['HILLGROVE SECONDARY SCHOOL', 'QUEENSTOWN SEC...","['Pasir Panjang Food Centre', 'Clementi West S...","['NTUC FAIRPRICE CO-OPERATIVE LTD', 'COLD STOR...",0,-1.0,1.28213,103.786879,314.0,-
3,5,Terrace,999.0,26.3,"['HILLGROVE SECONDARY SCHOOL', 'QUEENSTOWN SEC...","['Pasir Panjang Food Centre', 'Clementi West S...","['NTUC FAIRPRICE CO-OPERATIVE LTD', 'COLD STOR...",2,-1.0,1.292047,103.768591,159.3,-
4,4,Condominium,84.92,28.0,"['FAIRFIELD METHODIST SCHOOL (SECONDARY)', 'QU...",['Telok Blangah Drive Blk 82 (Telok Blangah Ma...,"['NTUC FAIRPRICE CO-OPERATIVE LTD', 'THE QUALI...",0,-1.0,1.244208,103.827487,223.0,01-05


In [20]:
# Display the columns that remain in the Version 3 frame.
df_v3.columns

Index(['district', 'propertyType', 'remaining_lease', 'price', 'school',
       'hawkercentre', 'supermarkets', 'Bus Stops Nearby', 'crime_number',
       'latitude', 'longitude', 'floor_area_sqm', 'floor_range'],
      dtype='object')

#### (a) One-Hot Encoding

In [21]:
# Split the Version 3 frame into predictors and target ('price').
features3 = df_v3.drop(columns='price')
y3 = df_v3['price']

# One-hot encode the non-numeric feature columns (pd.get_dummies defaults).
x3 = pd.get_dummies(features3)

# Same 80/20 split settings and seed as the other versions.
x3_train, x3_test, y3_train, y3_test = model_selection.train_test_split(
    x3,
    y3,
    test_size=0.2,
    random_state=2019,
)

#### (b) Decision Tree Regressor

In [22]:
# Fit a single decision tree on the Version 3 training split.
# random_state is pinned (same seed as the train/test split) because scikit-learn
# permutes the features at every split, so an unseeded tree can change between runs.
dt3 = tree.DecisionTreeRegressor(random_state=2019)
dt3.fit(x3_train, y3_train)

# Predict the held-out rows; values are in the scaled price units (price / 100000).
y3_pred = dt3.predict(x3_test)
print(y3_pred)

# Test-set metrics. RMSE is multiplied by 100000 to undo the earlier price scaling;
# R-squared is scale-free and needs no adjustment.
mse = mean_squared_error(y3_test, y3_pred)
rmse = np.sqrt(mse) * 100000
r_squared = r2_score(y3_test, y3_pred)

print('RMSE: {}'.format(rmse))
print('R-Squared: {}'.format(r_squared))

[ 4.38   5.05  17.425 ...  3.    18.     4.6  ]
RMSE: 664326.0905462931
R-Squared: 0.8247198779435705


#### (c) Random Forest Regressor

In [26]:
# Version 3 random forest: 100 trees, default max_features.
# oob_score=True also computes an out-of-bag score during fit; this cell does not read it.
rforest3 = ensemble.RandomForestRegressor(
    n_estimators=100,
    oob_score=True,
    random_state=2019,
)
rforest3.fit(x3_train, y3_train)

# Predictions for the held-out rows, in scaled price units.
y3_pred = rforest3.predict(x3_test)
print(y3_pred)

# Test-set metrics; the factor 100000 converts RMSE back to the unscaled price.
mse = mean_squared_error(y3_test, y3_pred)
rmse = 100000 * np.sqrt(mse)
r_squared = r2_score(y3_test, y3_pred)

print(f'RMSE: {rmse}')
print(f'R-Squared: {r_squared}')

[ 4.39182903  4.8936     23.53893667 ...  2.9691     19.6756696
  4.71787466]
RMSE: 315411.1557761954
R-Squared: 0.9604884425947657


In [29]:
# Version 3 random forest: 50 trees, max_features=1 (one candidate feature per split).
# oob_score=True also computes an out-of-bag score during fit; this cell does not read it.
rforest3 = ensemble.RandomForestRegressor(
    n_estimators=50,
    max_features=1,
    oob_score=True,
    random_state=2019,
)
rforest3.fit(x3_train, y3_train)

# Predictions for the held-out rows, in scaled price units.
y3_pred = rforest3.predict(x3_test)
print(y3_pred)

# Test-set metrics; the factor 100000 converts RMSE back to the unscaled price.
mse = mean_squared_error(y3_test, y3_pred)
rmse = 100000 * np.sqrt(mse)
r_squared = r2_score(y3_test, y3_pred)

print(f'RMSE: {rmse}')
print(f'R-Squared: {r_squared}')

[ 4.78177322  4.9558     28.39278667 ...  2.9735776  17.8267456
  4.65455121]
RMSE: 382307.16001179384
R-Squared: 0.9419509773325448


#### Best Performance of Version 3 and Best Performance overall

In [23]:
# Version 3 random forest: 50 trees, default max_features.
# oob_score=True also computes an out-of-bag score during fit; this cell does not read it.
rforest3 = ensemble.RandomForestRegressor(
    n_estimators=50,
    oob_score=True,
    random_state=2019,
)
rforest3.fit(x3_train, y3_train)

# Predictions for the held-out rows, in scaled price units.
y3_pred = rforest3.predict(x3_test)
print(y3_pred)

# Test-set metrics; the factor 100000 converts RMSE back to the unscaled price.
mse = mean_squared_error(y3_test, y3_pred)
rmse = 100000 * np.sqrt(mse)
r_squared = r2_score(y3_test, y3_pred)

print(f'RMSE: {rmse}')
print(f'R-Squared: {r_squared}')

[ 4.3904776   4.9012     23.91300667 ...  2.9688     19.8086816
  4.66287881]
RMSE: 308166.85234000505
R-Squared: 0.9622825873178068
