In [1]:
import numpy as np
import pandas as pd
from math import radians
import sklearn.metrics



In [2]:
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

print(f"Train size: {df_train.shape[0]}")
print(f"Test size: {df_test.shape[0]}")

Train size: 20254
Test size: 6966


Primary School

In [3]:
# count the number of primary schools per subzone
df_pri_sch = pd.read_csv('../data/auxiliary-data/sg-primary-schools.csv')
df_pri_sch = df_pri_sch.drop(["name", "lat", "lng", 'planning_area'], axis=1)
pri_sch_cleaned = df_pri_sch.value_counts().to_frame(name="pri_sch")
print(df_pri_sch.isnull().sum())

subzone    0
dtype: int64


Secondary School

In [4]:
# count the number of secondary schools per subzone
df_sec_sch = pd.read_csv('../data/auxiliary-data/sg-secondary-schools.csv')
df_sec_sch = df_sec_sch.drop(["name", "lat", "lng", 'planning_area'], axis=1)
sec_sch_cleaned = df_sec_sch.value_counts().to_frame(name="sec_sch")

Population Density of Subzone

In [5]:
# get population density per subzone
df_subzone = pd.read_csv('../data/auxiliary-data/sg-subzones.csv')

## replace 0 population with 10 to give some non-zero population density
# df_subzone.loc[df_subzone['population'] == 0, 'population'] = 10

df_subzone["population_density"] = df_subzone['population']/df_subzone["area_size"].replace(0, np.nan)
df_subzone = df_subzone.drop(['area_size', 'population', 'planning_area'],axis=1).set_index("name")
print(df_subzone.isnull().sum())

population_density    0
dtype: int64


In [6]:
df_dirty_test = df_test.copy()
df_dirty_train = df_train.copy()

Remove duplicates and invalid data (Rows)


In [7]:
df_cleaned = df_dirty_train.drop_duplicates()
df_cleaned = df_cleaned[df_cleaned.size_sqft > 0].copy()
df_cleaned.dropna(subset=['price', 'size_sqft', 'tenure'], inplace = True)
df_cleaned = df_cleaned[(df_cleaned.price > 0)].copy()
print(f'Records dropped :{df_dirty_train.shape[0] - df_cleaned.shape[0]}')
#num_beds, num_baths, furnishing, built_year


Records dropped :1824


#c.	Standardize capitalization


In [8]:
df_cleaned['property_type'] = df_cleaned['property_type'].str.lower()
df_cleaned['tenure'] = df_cleaned['tenure'].str.lower()
df_cleaned['furnishing'] = df_cleaned['furnishing'].str.lower()
df_cleaned['subzone'] = df_cleaned['subzone'].str.lower()
df_cleaned['planning_area'] = df_cleaned['planning_area'].str.lower()

#df_cleaned['built_year'] = df_cleaned['built_year'].astype(int)
#df_cleaned['num_beds'] = df_cleaned['num_beds'].astype(int)
#df_cleaned['num_baths'] = df_cleaned['num_baths'].astype(int)
df_cleaned['lng'] = df_cleaned['lng'].astype(np.float64)
df_cleaned['lat'] = df_cleaned['lat'].astype(np.float64)

df_test_cleaned = df_dirty_test.copy()
df_test_cleaned['property_type'] = df_test_cleaned['property_type'].str.lower()
df_test_cleaned['tenure'] = df_test_cleaned['tenure'].str.lower()
df_test_cleaned['furnishing'] = df_test_cleaned['furnishing'].str.lower()
df_test_cleaned['subzone'] = df_test_cleaned['subzone'].str.lower()
df_test_cleaned['planning_area'] = df_test_cleaned['planning_area'].str.lower()

#TODO: Fix comments
#df_test_cleaned['built_year'] = df_test_cleaned['built_year'].astype(int)
#df_test_cleaned['num_beds'] = df_test_cleaned['num_beds'].astype(int)
#df_test_cleaned['num_baths'] = df_test_cleaned['num_baths'].astype(int)
df_test_cleaned['lng'] = df_test_cleaned['lng'].astype(np.float64)
df_test_cleaned['lat'] = df_test_cleaned['lat'].astype(np.float64)


Clean Lease tenure column, Property_Type Column

In [9]:
#TENURE COLUMN
mask_999 = ['947-year leasehold', '929-year leasehold', '946-year leasehold', '956-year leasehold']
mask_99 = ['100-year leasehold', '102-year leasehold', '110-year leasehold', '103-year leasehold']
df_cleaned = df_cleaned.replace(mask_999, '999-year leasehold')
df_cleaned = df_cleaned.replace(mask_99, '99-year leasehold')

df_test_cleaned = df_test_cleaned.replace(mask_999, '999-year leasehold')
df_test_cleaned = df_test_cleaned.replace(mask_99, '99-year leasehold')

print(f"Train size: {df_cleaned.shape[0]}")
print(f"Test size: {df_test_cleaned.shape[0]}")


Train size: 18430
Test size: 6966


In [10]:
#PROPERTY TYPE
# changing "hdb 3 rooms", "hdb 4 rooms" and likewise to "hdb" since the number of rooms info can
# be obtained from "num_beds"
df_cleaned['property_type'].mask(df_cleaned['property_type'].str.contains("hdb"), "hdb", inplace=True)
df_cleaned.drop(df_cleaned[df_cleaned['property_type']  == 'land only'].index, inplace = True)

df_test_cleaned['property_type'].mask(df_test_cleaned['property_type'].str.contains("hdb"), "hdb", inplace=True)

#TODO: Reduce number of property types with less properties. Maybe do an EDA and figure out best way to remove them. Perhaps club 
#them in different category.

df_cleaned.loc[(df_cleaned['property_type']=="apartment") & 
       (df_cleaned['title'].str.contains('condo')),['property_type']] = "condo"

df_cleaned.loc[(df_cleaned['property_type']=="walk-up") & 
       (df_cleaned['title'].str.contains('condo')),['property_type']] = "condo"

df_cleaned.loc[(df_cleaned['property_type']=="good class bungalow"),['property_type']] = "bungalow"

#Test
df_test_cleaned.loc[(df_test_cleaned['property_type']=="apartment") & 
       (df_test_cleaned['title'].str.contains('condo')),['property_type']] = "condo"

df_test_cleaned.loc[(df_test_cleaned['property_type']=="walk-up") & 
       (df_test_cleaned['title'].str.contains('condo')),['property_type']] = "condo"

df_test_cleaned.loc[(df_test_cleaned['property_type']=="good class bungalow"),['property_type']] = "bungalow"

df_test_cleaned.loc[(df_test_cleaned['property_type']=="conservation house") & 
       (df_test_cleaned['property_details_url'].str.contains('condo')),['property_type']] = "condo"

df_test_cleaned.loc[(df_test_cleaned['property_type']=="good class bungalow"),['property_type']] = "bungalow"

#The following results have been picked manually from the website 99.co
df_test_cleaned.loc[(df_test_cleaned['property_type']=="conservation house") & 
       (df_test_cleaned['property_details_url'].str.contains('blair-plain-conservation-area')),['property_type']] = "landed"

df_test_cleaned.loc[(df_test_cleaned['property_type']=="conservation house") & 
       (df_test_cleaned['property_details_url'].str.contains('beng-tong-mansion')),['property_type']] = "landed"

print(f"Train size: {df_cleaned.shape[0]}")
print(f"Test size: {df_test_cleaned.shape[0]}")


Train size: 18428
Test size: 6966


Handle Missing Values

In [11]:
#h. Handle missing values

#built_year
dfmap = df_cleaned.dropna(subset = ['built_year'])[['built_year', 'property_name']].drop_duplicates()
dfmap = dfmap.sort_values(['property_name', 'built_year']).drop_duplicates(subset = ['property_name'], keep='first')
df_cleaned = df_cleaned.drop(columns=['built_year']).merge(dfmap, on=['property_name'], how='left')

# fill nan built year of train dataset with median built year of the same subzone in the training dataset
dftemp1 = df_train.groupby('subzone')['built_year'].median().reset_index(name='subzone_built_year')
df_cleaned = df_cleaned.merge(dftemp1, how='left', on=['subzone'])

mask = df_cleaned['built_year'].isnull()
df_cleaned.loc[mask, 'built_year'] = df_cleaned.loc[mask, 'subzone_built_year']
df_cleaned = df_cleaned.drop(columns=['subzone_built_year'])

# check how many nans left in df_test
print(df_cleaned['built_year'].isnull().sum())

# fill nan built year of train dataset with median built year of the same subzone in the training dataset
dftemp1 = df_train.groupby('planning_area')['built_year'].median().reset_index(name='planning_area_built_year')
df_cleaned = df_cleaned.merge(dftemp1, how='left', on=['planning_area'])

mask = df_cleaned['built_year'].isnull()
df_cleaned.loc[mask, 'built_year'] = df_cleaned.loc[mask, 'planning_area_built_year']
df_cleaned = df_cleaned.drop(columns=['planning_area_built_year'])

# check how many nans left in df_test
print(df_cleaned['built_year'].isnull().sum())

df_cleaned['built_year'] = df_cleaned['built_year'].fillna(df_cleaned['built_year'].median())

print(df_cleaned['built_year'].isnull().sum())


37
28
0


In [12]:
# fill nan built year of test dataset with median built year of the same subzone in the training dataset
dftemp1 = df_train.groupby('subzone')['built_year'].median().reset_index(name='subzone_built_year')
df_test_cleaned = df_test_cleaned.merge(dftemp1, how='left', on=['subzone'])

mask = df_test_cleaned['built_year'].isnull()
df_test_cleaned.loc[mask, 'built_year'] = df_test_cleaned.loc[mask, 'subzone_built_year']
df_test_cleaned = df_test_cleaned.drop(columns=['subzone_built_year'])

# check how many nans left in df_test
print("We can see that there are still 20 properties without built year.")
print(df_test_cleaned[df_test_cleaned['built_year'].isnull()]['property_name'].unique())

# quick search of data online


df_test_cleaned.loc[df_test_cleaned["property_name"] == 'copen grand', 'built_year'] = 2027
df_test_cleaned.loc[df_test_cleaned["property_name"] == "jurong park", 'built_year'] = 1971
df_test_cleaned.loc[df_test_cleaned["property_name"] == "ponggol park", 'built_year'] = 2011

We can see that there are still 20 properties without built year.
['copen grand' 'jurong park' 'ponggol park']


In [13]:
df_cleaned1 = df_cleaned.copy()
df_test_cleaned1 = df_test_cleaned.copy()

In [14]:
df_cleaned1.groupby('num_beds')['size_sqft'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
num_beds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1.0,1669.0,540.603355,125.571278,129.0,474.0,506.0,560.0,1539.0
2.0,4146.0,778.088037,283.17047,65.0,667.0,721.0,807.0,8449.0
3.0,6947.0,1349.038146,14211.133365,92.0,990.0,1098.0,1249.0,1185000.0
4.0,3170.0,2516.93123,26563.911725,103.0,1313.0,1518.0,2292.75,1496000.0
5.0,1494.0,4144.005355,2283.154536,377.0,2045.0,3945.5,5661.0,20000.0
6.0,713.0,6704.269285,4620.196437,1174.0,4865.0,6337.0,7670.0,86080.0
7.0,134.0,7231.559701,3187.888448,1851.0,4800.0,6790.0,8970.0,20000.0
8.0,40.0,8747.2,4273.548902,3300.0,5918.5,8000.0,11500.0,27500.0
9.0,12.0,7350.75,2170.445875,4800.0,6125.0,6600.0,9015.75,11515.0
10.0,26.0,12035.269231,4780.932474,5285.0,8800.0,12000.0,14250.0,30000.0


In [15]:
#h. Handle missing values
#print(df_test_cleaned.isnull().sum())
print("----------------------------------------------------")

mask = df_cleaned1.num_baths.isnull()
#Fill in number of baths from number of bedrooms.
df_cleaned1.loc[mask & (df_cleaned1.num_beds == 1), ['num_baths']] = 1
df_cleaned1.loc[mask & (df_cleaned1.num_beds == 2), ['num_baths']] = 2
df_cleaned1.loc[mask & (df_cleaned1.num_beds > 2), ['num_baths']] = df_cleaned1.loc[mask & (df_cleaned1.num_beds > 2), 'num_beds'] - 1

#Fill in number of beds from number of bathrooms.
mask = df_cleaned1.num_beds.isnull()
df_cleaned1.loc[mask & (df_cleaned1.num_baths == 1), ['num_beds']] = 1
df_cleaned1.loc[mask & (df_cleaned1.num_baths == 2), ['num_beds']] = 2
df_cleaned1.loc[mask & (df_cleaned1.num_baths > 2), ['num_beds']] = df_cleaned1.loc[mask & (df_cleaned1.num_baths > 2), 'num_baths'] + 1

#There are 3 properties which have both Nan values for bed and baths. Estimate their beds and baths from area and quick search online
df_cleaned1[df_cleaned1.num_beds.isnull()]
#https://www.99.co/singapore/sale/property/camborne-road-landed-3TPbVtLAGHNX8kMDYqMV38
df_cleaned1.loc[(df_cleaned1.property_name == 'dunearn estate') & mask, ['num_beds']] = 5
df_cleaned1.loc[(df_cleaned1.property_name == 'dunearn estate') & mask, ['num_baths']] = 5

df_cleaned1.loc[(df_cleaned1.property_name == 'one pearl bank') & mask, ['num_beds']] = 1
df_cleaned1.loc[(df_cleaned1.property_name == 'one pearl bank') & mask, ['num_baths']] = 1

df_cleaned1.loc[(df_cleaned1.property_name == 'gombak view') & mask, ['num_beds']] = 2
df_cleaned1.loc[(df_cleaned1.property_name == 'gombak view') & mask, ['num_baths']] = 2

df_cleaned = df_cleaned1.copy()

----------------------------------------------------


In [16]:
mask = df_test_cleaned1.num_baths.isnull()
df_test_cleaned1.loc[mask & (df_test_cleaned1.num_beds == 1), ['num_baths']] = 1
df_test_cleaned1.loc[mask & (df_test_cleaned1.num_beds == 2), ['num_baths']] = 2
df_test_cleaned1.loc[mask & (df_test_cleaned1.num_beds > 2), ['num_baths']] = df_test_cleaned1.loc[mask & (df_test_cleaned1.num_beds > 2), 'num_beds'] - 1

#Fill in number of beds from number of bathrooms.
mask = df_test_cleaned1.num_beds.isnull()
df_test_cleaned1.loc[mask & (df_test_cleaned1.num_baths == 1), ['num_beds']] = 1
df_test_cleaned1.loc[mask & (df_test_cleaned1.num_baths == 2), ['num_beds']] = 2
df_test_cleaned1.loc[mask & (df_test_cleaned1.num_baths > 2), ['num_beds']] = df_test_cleaned1.loc[mask & (df_test_cleaned1.num_baths > 2), 'num_baths'] + 1

#There are 3 properties which have both Nan values for bed and baths. Estimate their beds and baths from area and quick search online
df_cleaned1[df_cleaned1.num_beds.isnull()]

#https://www.99.co/singapore/sale/property/48-strathmore-avenue-hdb-3pepL7Q8CQFJhbD53xx9w5
df_test_cleaned1.loc[(df_test_cleaned1.property_name == 'forfar heights') & mask, ['num_beds']] = 3
df_test_cleaned1.loc[(df_test_cleaned1.property_name == 'forfar heights') & mask, ['num_baths']] = 2

df_test_cleaned1.loc[(df_test_cleaned1.property_name == 'the carrara') & mask, ['num_beds']] = 4
df_test_cleaned1.loc[(df_test_cleaned1.property_name == 'the carrara') & mask, ['num_baths']] = 5

#https://www.99.co/singapore/sale/property/170-bishan-street-13-hdb-G5to4naWPjPMQgRjT48wAi
df_test_cleaned1.loc[(df_test_cleaned1.property_name == '170 bishan street 13') & mask, ['num_beds']] = 3
df_test_cleaned1.loc[(df_test_cleaned1.property_name == '170 bishan street 13') & mask, ['num_baths']] = 2

df_test_cleaned1.loc[(df_test_cleaned1.property_name == 'hdb-toa payoh') & mask, ['num_beds']] = 3
df_test_cleaned1.loc[(df_test_cleaned1.property_name == 'hdb-toa payoh') & mask, ['num_baths']] = 2

#https://www.99.co/singapore/sale/property/735-yishun-street-72-hdb-HJ4QUT6cTLghnmehNtmcT6
df_test_cleaned1.loc[(df_test_cleaned1.property_name == 'nee soon central vista') & mask, ['num_beds']] = 2
df_test_cleaned1.loc[(df_test_cleaned1.property_name == 'nee soon central vista') & mask, ['num_baths']] = 2

df_test_cleaned = df_test_cleaned1.copy()

In [17]:
# get most frequent tenure per property_type and num_beds
data = list(df_cleaned.groupby(['property_type', 'num_beds', 'tenure']).size().groupby(level=[0, 1]).idxmax().values)
dfmap = pd.DataFrame(data, columns=['property_type', 'num_beds', 'train_tenure'])
# fill nan test set tenures with the most frequent tenure in the training set for the corresponding property_type and num_beds
df_test_cleaned = df_test_cleaned.merge(dfmap, on=['property_type', 'num_beds'], how='left')
mask = df_test_cleaned['tenure'].isnull()
df_test_cleaned.loc[mask, 'tenure'] = df_test_cleaned.loc[mask, 'train_tenure']
df_test_cleaned = df_test_cleaned.drop(columns=['train_tenure'])
print(df_test_cleaned['tenure'].isnull().sum())


# for the remaining nan tenure
df_test_cleaned.loc[df_test_cleaned['tenure'].isnull(), 'tenure'] = 'freehold'

1


In [18]:
# there are only 2 records which appear to be incorrect as num_beds =1, whereas num_baths = 10 and price is > 3e8
# hence we can drop these two records so that the results are not skewed

df_cleaned = df_cleaned[df_cleaned['price']<3e8]
df_cleaned = df_cleaned[df_cleaned['size_sqft']< 1e5]

Singapore has the following latitude and longitude coordinates in its extreme ends:
1. left-most (Tuas) :  1.30871,103.64287
2. right-most (Changi) : 1.34538,104.00270
3. top-most (Sembawang) : 1.46227,103.79487
4. bottom-most (Bukit Merah) : 1.28762,103.82467


#### Min latitude - 1.28762       Max latitude - 1.46227

#### Min longitude - 103.64         Max longitude - 104.00

But we can see that in the data, min longitude is -77.065364 and max latitude is 69.486768 which are out of the range of latitude and longitude values 

<img src="images/singapore-lat-long-map.jpeg" width=600 height=600 />

In [19]:
df_max_lng = df_cleaned[df_cleaned.lng > 121.0]
df_min_lng = df_cleaned[df_cleaned.lng < -77.0]
df_max_lat = df_cleaned[df_cleaned.lat > 69.0]
df_wrong_coordinates = pd.concat([df_max_lng, df_min_lng, df_max_lat])

print("It is interesting to note that in all the records where latitude and longitude have incorrect coordinates, planning_area and subzone have missing values, this can also be verified by checking for count of missing values")
print(df_wrong_coordinates["address"].value_counts())
# coordinates are incorrect for 5 'address'


# using the 'address' we can manually correct the latitude, longitude coordinates along with 
# filling of values for sub zone and planning_area

df_cleaned.loc[df_cleaned.address == "1 tessensohn road", 
               ['property_type', 'lat', 'lng', 'subzone', 'planning_area']] = 'condo', 1.3164313, 103.8575321, 'balestier', 'novena'
df_cleaned.loc[df_cleaned.address == "38 lorong 32 geylang", 
               ['property_type', 'lat', 'lng', 'subzone', 'planning_area']] = 'condo', 1.31262, 103.88686, 'aljunied', 'geylang'
df_cleaned.loc[df_cleaned.address == "5 jalan mutiara", 
               ['property_type', 'lat', 'lng', 'subzone', 'planning_area']] = 'condo', 1.29565, 103.82887, 'leonie hill', 'river valley'
df_cleaned.loc[df_cleaned.address == "17 farrer drive", 
               ['property_type', 'lat', 'lng', 'subzone', 'planning_area']] = 'condo', 1.313259, 103.806622, 'holland road', 'bukit timah'
df_cleaned.loc[df_cleaned.address == "15 farrer drive", 
               ['property_type', 'lat', 'lng', 'subzone', 'planning_area']] = 'condo', 1.313259, 103.806622, 'holland road', 'bukit timah'

It is interesting to note that in all the records where latitude and longitude have incorrect coordinates, planning_area and subzone have missing values, this can also be verified by checking for count of missing values
1 tessensohn road       97
38 lorong 32 geylang     6
5 jalan mutiara          5
17 farrer drive          3
15 farrer drive          2
Name: address, dtype: int64


In [20]:
df_max_lng = df_test_cleaned[df_test_cleaned.lng > 121.0]
df_min_lng = df_test_cleaned[df_test_cleaned.lng < -77.0]
df_max_lat = df_test_cleaned[df_test_cleaned.lat > 69.0]
df_wrong_coordinates = pd.concat([df_max_lng, df_min_lng, df_max_lat])

print("It is interesting to note that in all the records where latitude and longitude have incorrect coordinates, planning_area and subzone have missing values, this can also be verified by checking for count of missing values")
print(df_wrong_coordinates["address"].value_counts())
# coordinates are incorrect for 5 'address'


# using the 'address' we can manually correct the latitude, longitude coordinates along with 
# filling of values for sub zone and planning_area

df_test_cleaned.loc[df_test_cleaned.address == "38 lorong 32 geylang", 
               ['property_type', 'lat', 'lng', 'subzone', 'planning_area']] = 'condo', 1.31262, 103.88686, 'aljunied', 'geylang'
df_test_cleaned.loc[df_test_cleaned.address == "17 farrer drive", 
               ['property_type', 'lat', 'lng', 'subzone', 'planning_area']] = 'condo', 1.313259, 103.806622, 'holland road', 'bukit timah'
        

It is interesting to note that in all the records where latitude and longitude have incorrect coordinates, planning_area and subzone have missing values, this can also be verified by checking for count of missing values
1 tessensohn road       29
38 lorong 32 geylang     2
5 jalan mutiara          1
17 farrer drive          1
Name: address, dtype: int64


In [21]:
df_test_cleaned[df_test_cleaned['subzone'].isnull()]['address'].unique()

array(['1 tessensohn road', '5 jalan mutiara'], dtype=object)

In [22]:
# fill nan subzones and planning areas on the basis of address and online lookup
df_test_cleaned.loc[df_test_cleaned["address"].str.contains("1 tessensohn road", na=False), 'subzone'] = 'lavender'
df_test_cleaned.loc[df_test_cleaned["address"].str.contains("1 tessensohn road", na=False), 'planning_area'] = 'kallang'

df_test_cleaned.loc[df_test_cleaned["address"].str.contains("38 lorong 32 geylang", na=False), 'subzone'] = 'aljunied'
df_test_cleaned.loc[df_test_cleaned["address"].str.contains("38 lorong 32 geylang", na=False), 'planning_area'] = 'geylang'

df_test_cleaned.loc[df_test_cleaned["address"].str.contains("17 farrer drive", na=False), 'subzone'] = 'leonie hill'
df_test_cleaned.loc[df_test_cleaned["address"].str.contains("17 farrer drive", na=False), 'planning_area'] = 'river valley'

df_test_cleaned.loc[df_test_cleaned["address"].str.contains("5 jalan mutiara", na=False), 'subzone'] = 'central subzone'
df_test_cleaned.loc[df_test_cleaned["address"].str.contains("5 jalan mutiara", na=False), 'planning_area'] = 'downtown core'

df_test_cleaned.isnull().sum()

listing_id                 0
title                      0
address                    2
property_name              0
property_type              0
tenure                     0
built_year                 0
num_beds                   0
num_baths                  0
size_sqft                  0
floor_level             5810
furnishing                 0
available_unit_types     520
total_num_units         1900
property_details_url       0
lat                        0
lng                        0
elevation                  0
subzone                    0
planning_area              0
dtype: int64

Merging With Auxiliary Data

In [23]:
df_pri_sch = pd.read_csv('../data/auxiliary-data/sg-primary-schools.csv').drop(["name", "subzone", 'planning_area'], axis=1).to_numpy()
pri_sch_coor = np.array([[radians(_) for _ in coor] for coor in df_pri_sch])

In [24]:
df_sec_sch = pd.read_csv('../data/auxiliary-data/sg-secondary-schools.csv').drop(["name", "subzone", 'planning_area'], axis=1).to_numpy()
sec_sch_coor = np.array([[radians(_) for _ in coor] for coor in df_sec_sch])

In [25]:
df_commercial_centres = pd.read_csv('../data/auxiliary-data/sg-commerical-centres.csv').drop(["name", "type", "subzone", 'planning_area'], axis=1).to_numpy()
commercial_centres_coor = np.array([[radians(_) for _ in coor] for coor in df_commercial_centres])

df_shopping_malls = pd.read_csv('../data/auxiliary-data/sg-shopping-malls.csv').drop(["name", "subzone", 'planning_area'], axis=1).to_numpy()
shopping_malls_coor = np.array([[radians(_) for _ in coor] for coor in df_shopping_malls])

In [26]:
df_mrt_station = pd.read_csv('../data/auxiliary-data/sg-mrt-stations.csv')
df_mrt_station = df_mrt_station.drop(["name", "lat", "lng", 'planning_area', 'code', 'line', 'opening_year'], axis=1)
mrt_station_cleaned = df_mrt_station.value_counts().to_frame(name="mrt_station")

mrt_station_coor = pd.read_csv('../data/auxiliary-data/sg-mrt-stations.csv').drop(["name", "subzone", 'planning_area', 'code', 'line', 'opening_year'], axis=1).to_numpy()
mrt_station_coor = np.array([[radians(_) for _ in coor] for coor in mrt_station_coor])

df_train_coor = df_cleaned[["lat", "lng"]].to_numpy()
df_train_coor = np.array([[radians(_) for _ in coor] for coor in df_train_coor])

df_test_coor = df_test_cleaned[["lat", "lng"]].to_numpy()
df_test_coor = np.array([[radians(_) for _ in coor] for coor in df_test_coor])

dist_matrix_train_mrt = sklearn.metrics.pairwise.haversine_distances(mrt_station_coor, df_train_coor)
# multiply to get meters
closest_dist_to_mrt_train = pd.DataFrame(np.amin(dist_matrix_train_mrt, axis=0)* 6371000, columns=["closest_dist_to_mrt"])

dist_matrix_test_mrt = sklearn.metrics.pairwise.haversine_distances(mrt_station_coor, df_test_coor)
# multiply to get meters
closest_dist_to_mrt_test = pd.DataFrame(np.amin(dist_matrix_test_mrt, axis=0)* 6371000, columns=["closest_dist_to_mrt"])

dist_matrix_train_pri = sklearn.metrics.pairwise.haversine_distances(pri_sch_coor, df_train_coor)
closest_dist_to_pri_train = pd.DataFrame(np.amin(dist_matrix_train_pri, axis=0)* 6371000, columns=["closest_dist_to_pri"])

dist_matrix_test_pri = sklearn.metrics.pairwise.haversine_distances(pri_sch_coor, df_test_coor)
closest_dist_to_pri_test = pd.DataFrame(np.amin(dist_matrix_test_pri, axis=0)* 6371000, columns=["closest_dist_to_pri"])

dist_matrix_train_sec = sklearn.metrics.pairwise.haversine_distances(sec_sch_coor, df_train_coor)
closest_dist_to_sec_train = pd.DataFrame(np.amin(dist_matrix_train_sec, axis=0)* 6371000, columns=["closest_dist_to_sec"])

dist_matrix_test_sec = sklearn.metrics.pairwise.haversine_distances(sec_sch_coor, df_test_coor)
closest_dist_to_sec_test = pd.DataFrame(np.amin(dist_matrix_test_sec, axis=0)* 6371000, columns=["closest_dist_to_sec"])

dist_matrix_test_com = sklearn.metrics.pairwise.haversine_distances(commercial_centres_coor, df_test_coor)
closest_dist_to_com_test = pd.DataFrame(np.amin(dist_matrix_test_com, axis=0)* 6371000, columns=["closest_dist_to_com"])

dist_matrix_test_shop = sklearn.metrics.pairwise.haversine_distances(shopping_malls_coor, df_test_coor)
closest_dist_to_shop_test = pd.DataFrame(np.amin(dist_matrix_test_shop, axis=0)* 6371000, columns=["closest_dist_to_shop"])

dist_matrix_train_com = sklearn.metrics.pairwise.haversine_distances(commercial_centres_coor, df_train_coor)
closest_dist_to_com_train = pd.DataFrame(np.amin(dist_matrix_train_com, axis=0)* 6371000, columns=["closest_dist_to_com"])

dist_matrix_train_shop = sklearn.metrics.pairwise.haversine_distances(shopping_malls_coor, df_train_coor)
closest_dist_to_shop_train = pd.DataFrame(np.amin(dist_matrix_train_shop, axis=0)* 6371000, columns=["closest_dist_to_shop"])

#during pri sch registration exercise, homeowners within 1km will be given priority
test = pd.DataFrame(dist_matrix_test_pri * 6371000)
test_near_pri_sch = test[test <= 1000].count().rename("close_pri_sch")

test = pd.DataFrame(dist_matrix_train_pri * 6371000)
train_near_pri_sch = test[test <= 1000].count().rename("close_pri_sch")

#no such priority for sec sch, put as 1km for now
#change 1000 to desired distance if necessary
test = pd.DataFrame(dist_matrix_test_sec * 6371000)
test_near_sec_sch = test[test <= 1000].count().rename("close_sec_sch")

test = pd.DataFrame(dist_matrix_train_sec * 6371000)
train_near_sec_sch = test[test <= 1000].count().rename("close_sec_sch")

df_cleaned = df_cleaned.merge(pri_sch_cleaned, how='left',left_on="subzone",right_on="subzone")\
    .merge(sec_sch_cleaned, how='left',left_on="subzone",right_on="subzone")\
    .merge(mrt_station_cleaned, how='left',left_on="subzone",right_on="subzone")\
    .merge(df_subzone, how='left',left_on="subzone",right_on="name")\
    .join(closest_dist_to_mrt_train)\
    .join(closest_dist_to_pri_train)\
    .join(closest_dist_to_sec_train)\
    .join(train_near_pri_sch)\
    .join(train_near_sec_sch)\
    .join(closest_dist_to_shop_train)\
    .join(closest_dist_to_com_train)\
    .fillna({'pri_sch':0, 'sec_sch':0, 'mrt_station':0})

df_test_cleaned = df_test_cleaned.merge(pri_sch_cleaned, how='left',left_on="subzone",right_on="subzone")\
    .merge(sec_sch_cleaned, how='left',left_on="subzone",right_on="subzone")\
    .merge(mrt_station_cleaned, how='left',left_on="subzone",right_on="subzone")\
    .merge(df_subzone, how='left',left_on="subzone",right_on="name")\
    .join(closest_dist_to_mrt_test)\
    .join(closest_dist_to_pri_test)\
    .join(closest_dist_to_sec_test)\
    .join(test_near_pri_sch)\
    .join(test_near_sec_sch)\
    .join(closest_dist_to_shop_test)\
    .join(closest_dist_to_com_test)\
    .fillna({'pri_sch':0, 'sec_sch':0, 'mrt_station':0})

Furnishing

In [27]:
df_test_cleaned['furnishing'].replace('na', 'unspecified', inplace = True)
df_test_cleaned['furnishing'].unique()

df_test_cleaned['furnishing'].replace('na', 'unspecified', inplace = True)
df_test_cleaned['furnishing'].unique()

array(['unfurnished', 'unspecified', 'fully', 'partial'], dtype=object)

Outliers in Price Per SQFT

In [28]:
# Flooring and capping price per sqft at 1% and 99%ile respectively
df_cleaned["price_per_sqft"] = df_cleaned["price"]/df_cleaned["size_sqft"]
low, high = df_cleaned['price_per_sqft'].quantile([0.01, 0.99])
df_cleaned['price_per_sqft'] = np.clip(df_cleaned['price_per_sqft'], low, high)


Drop Columns at the end

In [29]:
df_cleaned = df_cleaned.drop(['title', 'address','property_name', 'property_details_url', 'listing_id', 'elevation', 'total_num_units', 'floor_level', 'available_unit_types'], axis = 1)
df_test_cleaned = df_test_cleaned.drop(['title','address','property_name', 'property_details_url', 'listing_id', 'elevation', 'total_num_units', 'floor_level', 'available_unit_types'], axis = 1)

In [30]:
print(df_cleaned.isnull().sum().sum())
print(df_test_cleaned.isnull().sum().sum())
print(df_cleaned.shape)
print(df_test_cleaned.shape)

0
0
(18424, 24)
(6966, 22)


Saving the data

In [31]:
df_test_cleaned.to_csv('../data/test_cleaned.csv', index = False)
df_cleaned.to_csv('../data/train_cleaned.csv', index = False)