In [30]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn import neural_network
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [2]:
## Importing the dataset: 
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Datasets/ResaleFlatPrice.csv")




## If done on local machine, uncomment this line of code:
## df = pd.read_csv("/Users/junlongng/Desktop/NTU/Year_2/Semester 2/BC3415 AI in ACC & Finance/Week 4 Regression/Homework/Loan Default (Employed).csv")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df.columns

Index(['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range',
       'floor_area_sqm', 'flat_model', 'lease_commence_date',
       'remaining_lease', 'resale_price'],
      dtype='object')

In [4]:
## Checking for null values
df.isnull().any()

month                  False
town                   False
flat_type              False
block                  False
street_name            False
storey_range           False
floor_area_sqm         False
flat_model             False
lease_commence_date    False
remaining_lease        False
resale_price           False
dtype: bool

In [5]:
df["flat_model"].value_counts()

Model A                   7664
Improved                  5560
New Generation            3056
Premium Apartment         2122
Simplified                 930
Apartment                  816
Maisonette                 662
Standard                   647
DBSS                       310
Model A2                   254
Model A-Maisonette          46
Adjoined flat               46
Type S1                     37
Type S2                     16
Terrace                     14
Multi Generation            14
Premium Apartment Loft       4
Premium Maisonette           4
Improved-Maisonette          2
Name: flat_model, dtype: int64

In [6]:
df = df.loc[:, ['town', 'flat_type','floor_area_sqm','flat_model', 'lease_commence_date', "resale_price"]]

In [7]:
df

Unnamed: 0,town,flat_type,floor_area_sqm,flat_model,lease_commence_date,resale_price
0,ANG MO KIO,3 ROOM,68.0,New Generation,1981,270000.0
1,ANG MO KIO,3 ROOM,73.0,New Generation,1976,295000.0
2,ANG MO KIO,3 ROOM,67.0,New Generation,1978,270000.0
3,ANG MO KIO,3 ROOM,67.0,New Generation,1978,230000.0
4,ANG MO KIO,3 ROOM,68.0,New Generation,1981,262500.0
...,...,...,...,...,...,...
22199,YISHUN,EXECUTIVE,142.0,Apartment,1988,580000.0
22200,YISHUN,EXECUTIVE,146.0,Maisonette,1988,565000.0
22201,YISHUN,EXECUTIVE,164.0,Apartment,1992,633000.0
22202,YISHUN,EXECUTIVE,164.0,Apartment,1992,788888.0


In [8]:
## Normalizing Z score value for floor area / SQM
df['floor_area_sqm'] = stats.zscore(df['floor_area_sqm'])

In [9]:
## Encoding variables
d = pd.get_dummies(df['town'])
df = df.merge(d,left_index=True, right_index = True).drop(columns ="town")

In [10]:
d = pd.get_dummies(df['flat_model'])
df = df.merge(d,left_index=True, right_index = True).drop(columns ="flat_model")

In [11]:
d = pd.get_dummies(df['flat_type'])
df = df.merge(d,left_index=True, right_index=True).drop(columns = "flat_type")

In [12]:
df.head()

Unnamed: 0,floor_area_sqm,lease_commence_date,resale_price,ANG MO KIO,BEDOK,BISHAN,BUKIT BATOK,BUKIT MERAH,BUKIT PANJANG,BUKIT TIMAH,...,Terrace,Type S1,Type S2,1 ROOM,2 ROOM,3 ROOM,4 ROOM,5 ROOM,EXECUTIVE,MULTI-GENERATION
0,-1.193788,1981,270000.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,-0.988498,1976,295000.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,-1.234846,1978,270000.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,-1.234846,1978,230000.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,-1.193788,1981,262500.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [13]:
## Target Variable (Y) shall be the resale price:
X = df.drop(columns ="resale_price")
Y = df['resale_price']

In [14]:
## Train test split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y)

In [15]:
## Using Linear Model since Target Y is a numerical attribute
model = linear_model.LinearRegression()
model.fit(X_train,Y_train)
pred = model.predict(X_test)
print("RMSE using linear model is: " , mean_squared_error(Y_test, pred)**0.5)

RMSE using linear model is:  63052.52863449865


In [16]:
## Using a CART Model:

model = tree.DecisionTreeRegressor(random_state=42)
model.fit(X_train,Y_train)
pred = model.predict(X_test)
print("RMSE using CART model is: ", mean_squared_error(Y_test, pred)**0.5)

RMSE using CART model is:  42929.253467955474


In [21]:
## Using a Random Forest Model:
model = ensemble.RandomForestRegressor(random_state = 42)
model.fit(X_train,Y_train)
pred = model.predict(X_test)
print("RMSE for RF model is ", mean_squared_error(Y_test,pred)**0.5)

RMSE for RF model is  40198.72176582685


In [26]:
## Using a XGBoost Model:
model = ensemble.GradientBoostingRegressor(random_state=42)
model.fit(X_train,Y_train)
pred = model.predict(X_test)
print("RMSE for XG Boost is ", mean_squared_error(Y_test, pred)**0.05)

RMSE for XG Boost is  3.0225404054780953


In [28]:
## Using a Neural Network Model:
model = neural_network.MLPRegressor(random_state =42)
model.fit(X_train,Y_train)
pred = model.predict(X_test)
print("RMSE for NN is ", mean_squared_error(Y_test, pred)**0.05)

RMSE for NN is  3.300545055040365




In [44]:
## Using Keras to make NN, with activation function of relu & linear
model = Sequential()
model.add(Dense(54,input_dim = 54, activation ="relu"))
model.add(Dropout(0.2))
model.add(Dense(54,activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(1,activation="linear"))
model.add(Dropout(0.2))
model.compile(loss="mse", optimizer="adam", metrics=['mse'])
history = model.fit(X_train,Y_train,batch_size = 20, epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [45]:
result = model.evaluate(X_test,Y_test)




In [46]:
result[1]**0.5

135230.44769577598