In [1]:
import numpy as np
import pandas as pd
from math import radians
import sklearn.metrics

In [2]:
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

print(f"Train size: {df_train.shape[0]}")
print(f"Test size: {df_test.shape[0]}")

Train size: 20254
Test size: 6966


Primary School

In [3]:
df_pri_sch = pd.read_csv('../data/auxiliary-data/sg-primary-schools.csv')
df_pri_sch = df_pri_sch.drop(["name", "lat", "lng", 'planning_area'], axis=1)
pri_sch_cleaned = df_pri_sch.value_counts().to_frame(name="pri_sch")

#TODO: Get schools in vicinity instead of just subzone.

Secondary School

In [4]:
df_sec_sch = pd.read_csv('../data/auxiliary-data/sg-secondary-schools.csv')
df_sec_sch = df_sec_sch.drop(["name", "lat", "lng", 'planning_area'], axis=1)
sec_sch_cleaned = df_sec_sch.value_counts().to_frame(name="sec_sch")

#TODO: Get schools in vicinity instead of just subzone.

Population Density of Subzone

In [5]:
df_subzone = pd.read_csv('../data/auxiliary-data/sg-subzones.csv')
df_subzone["population_density"] = df_subzone['population']/df_subzone["area_size"]
df_subzone = df_subzone.drop(['area_size', 'population', 'planning_area'],axis=1).set_index("name")

In [6]:
df_dirty_test = df_test
df_dirty_train = df_train

Remove duplicates and invalid data (Rows)


In [7]:
df_cleaned = df_dirty_train.drop_duplicates()
df_cleaned = df_cleaned[df_cleaned.size_sqft > 0]
df_cleaned.dropna(subset=['num_beds', 'num_baths', 'price', 'size_sqft', 'built_year', 'available_unit_types', 'tenure'], inplace = True)
df_cleaned = df_cleaned[df_cleaned.price > 0]
df_cleaned = df_cleaned[df_cleaned.furnishing != "na"]
print(f'Records dropped :{df_dirty_train.shape[0] - df_cleaned.shape[0]}' )


Records dropped :3034


#c.	Standardize capitalization


In [8]:
df_cleaned['property_type'] = df_cleaned['property_type'].str.lower()
df_cleaned['tenure'] = df_cleaned['tenure'].str.lower()
df_cleaned['furnishing'] = df_cleaned['furnishing'].str.lower()
df_cleaned['subzone'] = df_cleaned['subzone'].str.lower()
df_cleaned['planning_area'] = df_cleaned['planning_area'].str.lower()

df_cleaned['built_year'] = df_cleaned['built_year'].astype(int)
df_cleaned['num_beds'] = df_cleaned['num_beds'].astype(int)
df_cleaned['num_baths'] = df_cleaned['num_baths'].astype(int)
df_cleaned['lng'] = df_cleaned['lng'].astype(np.float16)
df_cleaned['lat'] = df_cleaned['lat'].astype(np.float16)

df_test_cleaned = df_dirty_test
df_test_cleaned['property_type'] = df_test_cleaned['property_type'].str.lower()
df_test_cleaned['tenure'] = df_test_cleaned['tenure'].str.lower()
df_test_cleaned['furnishing'] = df_test_cleaned['furnishing'].str.lower()
df_test_cleaned['subzone'] = df_test_cleaned['subzone'].str.lower()
df_test_cleaned['planning_area'] = df_test_cleaned['planning_area'].str.lower()

#TODO: Fix comments
#df_test_cleaned['built_year'] = df_test_cleaned['built_year'].astype(int)
#df_test_cleaned['num_beds'] = df_test_cleaned['num_beds'].astype(int)
#df_test_cleaned['num_baths'] = df_test_cleaned['num_baths'].astype(int)
df_test_cleaned['lng'] = df_test_cleaned['lng'].astype(np.float16)
df_test_cleaned['lat'] = df_test_cleaned['lat'].astype(np.float16)


Clean Lease tenure column, Property_Type Column

In [9]:
#TENURE COLUMN
mask_999 = ['947-year leasehold', '929-year leasehold', '946-year leasehold',
'956-year leasehold']
mask_99 = ['100-year leasehold', '102-year leasehold', '110-year leasehold', '103-year leasehold']
df_cleaned = df_cleaned.replace(mask_999, '999-year leasehold')
df_cleaned = df_cleaned.replace(mask_99, '99-year leasehold')

df_test_cleaned = df_test_cleaned.replace(mask_999, '999-year leasehold')
df_test_cleaned = df_test_cleaned.replace(mask_99, '99-year leasehold')

print(f"Train size: {df_cleaned.shape[0]}")
print(f"Test size: {df_test_cleaned.shape[0]}")


Train size: 17220
Test size: 6966


In [10]:
#PROPERTY TYPE
# changing "hdb 3 rooms", "hdb 4 rooms" and likewise to "hdb" since the number of rooms info can
# be obtained from "num_beds"
df_cleaned['property_type'].mask(df_cleaned['property_type'].str.contains("hdb"), "hdb", inplace=True)
df_cleaned.drop(df_cleaned[df_cleaned['property_type']  == 'land only'].index, inplace = True)

df_test_cleaned['property_type'].mask(df_test_cleaned['property_type'].str.contains("hdb"), "hdb", inplace=True)

#TODO: Reduce number of property types with less properties. Maybe do an EDA and figure out best way to remove them. Perhaps club 
#them in different category.

df_cleaned.loc[(df_cleaned['property_type']=="apartment") & 
       (df_cleaned['title'].str.contains('condo')),['property_type']] = "condo"

df_cleaned.loc[(df_cleaned['property_type']=="walk-up") & 
       (df_cleaned['title'].str.contains('condo')),['property_type']] = "condo"

df_cleaned.loc[(df_cleaned['property_type']=="good class bungalow"),['property_type']] = "bungalow"

#Test
df_test_cleaned.loc[(df_test_cleaned['property_type']=="apartment") & 
       (df_test_cleaned['title'].str.contains('condo')),['property_type']] = "condo"

df_test_cleaned.loc[(df_test_cleaned['property_type']=="walk-up") & 
       (df_test_cleaned['title'].str.contains('condo')),['property_type']] = "condo"

df_test_cleaned.loc[(df_test_cleaned['property_type']=="good class bungalow"),['property_type']] = "bungalow"

df_test_cleaned.loc[(df_test_cleaned['property_type']=="conservation house") & 
       (df_test_cleaned['property_details_url'].str.contains('condo')),['property_type']] = "condo"

df_test_cleaned.loc[(df_test_cleaned['property_type']=="good class bungalow"),['property_type']] = "bungalow"

#The following results have been picked manually from the website 99.co
df_test_cleaned.loc[(df_test_cleaned['property_type']=="conservation house") & 
       (df_test_cleaned['property_details_url'].str.contains('blair-plain-conservation-area')),['property_type']] = "landed"

df_test_cleaned.loc[(df_test_cleaned['property_type']=="conservation house") & 
       (df_test_cleaned['property_details_url'].str.contains('beng-tong-mansion')),['property_type']] = "landed"

print(f"Train size: {df_cleaned.shape[0]}")
print(f"Test size: {df_test_cleaned.shape[0]}")


Train size: 17219
Test size: 6966


Handle Missing Values

In [11]:
#h. Handle missing values

#built_year
dfmap = df_cleaned.dropna(subset = ['built_year'])[['built_year', 'property_name']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['property_name'])
df_cleaned = df_cleaned.drop(columns=['built_year']).merge(dfmap, on=['property_name'], how='left')

dfmap = df_cleaned.dropna(subset = ['tenure'])[['tenure', 'property_name']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['property_name'])
df_cleaned = df_cleaned.drop(columns=['tenure']).merge(dfmap, on=['property_name'], how='left')

dfmap = df_cleaned.dropna(subset = ['tenure'])[['tenure', 'address']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['address'])
df_cleaned = df_cleaned.drop(columns=['tenure']).merge(dfmap, on=['address'], how='left')

#df_cleaned.isnull().sum()



In [12]:
#h. Handle missing values
#print(df_test_cleaned.isnull().sum())
print("----------------------------------------------------")

#built_year
dfmap = df_test_cleaned.dropna(subset = ['built_year'])[['built_year', 'property_name']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['property_name'])
df_test_cleaned = df_test_cleaned.drop(columns=['built_year']).merge(dfmap, on=['property_name'], how='left')


#df_test_cleaned = df_test_cleaned.dropna(subset=['built_year'])

dfmap = df_test_cleaned.dropna(subset = ['tenure'])[['tenure', 'property_name']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['property_name'])
df_test_cleaned = df_test_cleaned.drop(columns=['tenure']).merge(dfmap, on=['property_name'], how='left')

dfmap = df_test_cleaned.dropna(subset = ['tenure'])[['tenure', 'address']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['address'])
df_test_cleaned = df_test_cleaned.drop(columns=['tenure']).merge(dfmap, on=['address'], how='left')

#Fill in number of baths from number of bedrooms.
df_test_cleaned.loc[(df_test_cleaned.num_baths.isnull() & df_test_cleaned.num_beds == 1), ['num_baths']] = 1
df_test_cleaned.loc[(df_test_cleaned.num_baths.isnull() & df_test_cleaned.num_beds == 2), ['num_baths']] = 2
df_test_cleaned.loc[(df_test_cleaned.num_baths.isnull() & df_test_cleaned.num_beds > 2), ['num_baths']] = df_test_cleaned.num_beds -1

#Fill in number of beds from number of bathrooms.
df_test_cleaned.loc[(df_test_cleaned.num_beds.isnull() & df_test_cleaned.num_baths == 1), ['num_beds']] = 1
df_test_cleaned.loc[(df_test_cleaned.num_beds.isnull() & df_test_cleaned.num_baths == 2), ['num_beds']] = 2
df_test_cleaned.loc[(df_test_cleaned.num_beds.isnull() & df_test_cleaned.num_baths > 2), ['num_beds']] = df_test_cleaned.num_baths + 1

#There are 5 properties which have both Nan values for bed and baths. Estimate their beds from area.
df_test_cleaned.loc[(df_test_cleaned.num_beds.isnull() & df_test_cleaned.size_sqft < 600), ['num_beds']] = 1
df_test_cleaned.loc[(df_test_cleaned.num_beds.isnull() & df_test_cleaned.size_sqft < 1000), ['num_beds']] = 2
df_test_cleaned.loc[(df_test_cleaned.num_beds.isnull() & df_test_cleaned.size_sqft < 1700), ['num_beds']] = 3
df_test_cleaned.loc[(df_test_cleaned.num_beds.isnull() & df_test_cleaned.size_sqft < 3000), ['num_beds']] = 4

#Estimate their baths from beds.
df_test_cleaned.loc[(df_test_cleaned.num_baths.isnull() & df_test_cleaned.num_beds == 1), ['num_baths']] = 1
df_test_cleaned.loc[(df_test_cleaned.num_baths.isnull() & df_test_cleaned.num_beds == 2), ['num_baths']] = 2
df_test_cleaned.loc[(df_test_cleaned.num_baths.isnull() & df_test_cleaned.num_beds > 2), ['num_baths']] = df_test_cleaned.num_beds -1


----------------------------------------------------


In [13]:
#fill in missing tenure type for HDB

df_test_cleaned.loc[(df_test_cleaned["property_type"] == "hdb") & (df_test_cleaned["tenure"].isnull()), "tenure"] = "99-year leasehold"
df_test_cleaned

Unnamed: 0,listing_id,title,address,property_name,property_type,num_beds,num_baths,size_sqft,floor_level,furnishing,available_unit_types,total_num_units,property_details_url,lat,lng,elevation,subzone,planning_area,built_year,tenure
0,777912,1 bed condo for sale in the gazania,17 how sun drive,the gazania,condo,4.0,1.0,463,,unfurnished,"studio, 1, 2, 3, 4, 5 br",250.0,https://www.99.co/singapore/condos-apartments/...,1.344727,103.8750,0,upper paya lebar,serangoon,2022,freehold
1,936612,3 bed condo for sale in vue 8 residence,95 pasir ris heights,vue 8 residence,condo,4.0,3.0,1033,high,unspecified,"studio, 1, 2, 3, 4, 5 br",463.0,https://www.99.co/singapore/condos-apartments/...,1.379883,103.9375,0,pasir ris west,pasir ris,2017,99-year leasehold
2,995264,1 bed condo for sale in icon,10 gopeng street,icon,condo,4.0,1.0,570,,fully,"studio, 1, 2, 3 br",646.0,https://www.99.co/singapore/condos-apartments/...,1.294922,103.8750,0,bras basah,museum,2007,99-year leasehold
3,477435,hdb flat for sale in 812b choa chu kang avenue 7,bukit batok / bukit panjang / choa chu kang (d23),keat hong colours,hdb,4.0,2.0,1216,,unspecified,"1, 2, 3, 4, 5 br",968.0,https://www.99.co/singapore/hdb/keat-hong-colo...,1.373047,103.7500,0,keat hong,choa chu kang,2017,99-year leasehold
4,222529,hdb flat for sale in 204 toa payoh north,balestier / toa payoh (d12),toa payoh spring,hdb,4.0,2.0,936,,unspecified,"1, 2, 3, 4 br",,https://www.99.co/singapore/hdb/toa-payoh-spri...,1.341797,103.8750,0,braddell,toa payoh,1973,99-year leasehold
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6961,289879,5 bed house for sale in paradise island,paradise island,paradise island,bungalow,4.0,6.0,8000,,unspecified,"4, 5, 6, 7, 8 br",29.0,https://www.99.co/singapore/houses/paradise-is...,1.250000,103.8750,0,sentosa,southern islands,2009,99-year leasehold
6962,396404,5 bed house for sale in orchid village,vanda drive,orchid village,semi-detached house,4.0,7.0,6974,,unspecified,"4, 5, 6, 7, 8 br",,https://www.99.co/singapore/houses/orchid-vill...,1.332031,103.8125,0,hillcrest,bukit timah,2010,freehold
6963,620343,3 bed condo for sale in the avenir,8 river valley close,the avenir,condo,4.0,2.0,1141,,partial,"studio, 1, 2, 3, 4, 5 br",376.0,https://www.99.co/singapore/condos-apartments/...,1.293945,103.8125,0,institution hill,river valley,2025,freehold
6964,364805,hdb flat for sale in 31 marine crescent,east coast / marine parade (d15),marine crescent ville,hdb,4.0,1.0,818,,unspecified,"1, 2, 3, 4 br",,https://www.99.co/singapore/hdb/marine-crescen...,1.303711,103.9375,0,marine parade,marine parade,1975,99-year leasehold


In [14]:
# find the most probable type of tenure from the train data, use them to fill in the unknown tenure type
df_cleaned[df_cleaned["property_type"] == "condo"]["tenure"].value_counts()

99-year leasehold     4822
freehold              4628
999-year leasehold     180
Name: tenure, dtype: int64

In [15]:
df_cleaned[df_cleaned["property_type"] == "terraced house"]["tenure"].value_counts()

freehold              187
99-year leasehold      88
999-year leasehold     14
Name: tenure, dtype: int64

In [16]:
df_cleaned[df_cleaned["property_type"] == "semi-detached house"]["tenure"].value_counts()

freehold              530
99-year leasehold     123
999-year leasehold     31
Name: tenure, dtype: int64

In [17]:
df_cleaned[df_cleaned["property_type"] == "bungalow"]["tenure"].value_counts()

freehold              343
99-year leasehold     181
999-year leasehold     14
Name: tenure, dtype: int64

In [18]:
df_cleaned[df_cleaned["property_type"] == "corner terrace"]["tenure"].value_counts()

freehold              117
99-year leasehold      18
999-year leasehold     17
Name: tenure, dtype: int64

In [19]:
df_cleaned[df_cleaned["property_type"] == "landed"]["tenure"].value_counts()

99-year leasehold     9
freehold              6
999-year leasehold    1
Name: tenure, dtype: int64

In [20]:
df_cleaned[df_cleaned["property_type"] == "cluster house"]["tenure"].value_counts()

99-year leasehold     106
freehold               43
999-year leasehold     15
Name: tenure, dtype: int64

In [21]:
# there are only 2 records which appear to be incorrect as num_beds =1, whereas num_baths = 10 and price is > 3e8
# hence we can drop these two records so that the results are not skewed

df_cleaned = df_cleaned[df_cleaned['price']<3e8]

Singapore has the following latitude and longitude coordinates in its extreme ends:
1. left-most (Tuas) :  1.30871,103.64287
2. right-most (Changi) : 1.34538,104.00270
3. top-most (Sembawang) : 1.46227,103.79487
4. bottom-most (Bukit Merah) : 1.28762,103.82467


#### Min latitude - 1.28762       Max latitude - 1.46227

#### Min longitude - 103.64         Max longitude - 104.00

But we can see that in the data, min longitude is -77.065364 and max latitude is 69.486768 which are out of the range of latitude and longitude values 

<img src="images/singapore-lat-long-map.jpeg" width=600 height=600 />

In [22]:
df_max_lng = df_cleaned[df_cleaned.lng > 121.0]
df_min_lng = df_cleaned[df_cleaned.lng < -77.0]
df_max_lat = df_cleaned[df_cleaned.lat > 69.0]
df_wrong_coordinates = pd.concat([df_max_lng, df_min_lng, df_max_lat])

print("It is interesting to note that in all the records where latitude and longitude have incorrect coordinates, planning_area and subzone have missing values, this can also be verified by checking for count of missing values")
print(df_wrong_coordinates["address"].value_counts())
# coordinates are incorrect for 5 'address'


# using the 'address' we can manually correct the latitude, longitude coordinates along with 
# filling of values for sub zone and planning_area

df_cleaned.loc[df_cleaned.address == "1 tessensohn road", 
               ['property_type', 'lat', 'lng', 'subzone', 'planning_area']] = 'condo', 1.3164313, 103.8575321, 'balestier', 'novena'
df_cleaned.loc[df_cleaned.address == "38 lorong 32 geylang", 
               ['property_type', 'lat', 'lng', 'subzone', 'planning_area']] = 'condo', 1.31262, 103.88686, 'aljunied', 'geylang'
df_cleaned.loc[df_cleaned.address == "5 jalan mutiara", 
               ['property_type', 'lat', 'lng', 'subzone', 'planning_area']] = 'condo', 1.29565, 103.82887, 'leonie hill', 'river valley'
df_cleaned.loc[df_cleaned.address == "17 farrer drive", 
               ['property_type', 'lat', 'lng', 'subzone', 'planning_area']] = 'condo', 1.313259, 103.806622, 'holland road', 'bukit timah'
df_cleaned.loc[df_cleaned.address == "15 farrer drive", 
               ['property_type', 'lat', 'lng', 'subzone', 'planning_area']] = 'condo', 1.313259, 103.806622, 'holland road', 'bukit timah'

It is interesting to note that in all the records where latitude and longitude have incorrect coordinates, planning_area and subzone have missing values, this can also be verified by checking for count of missing values
38 lorong 32 geylang    6
17 farrer drive         3
15 farrer drive         2
Name: address, dtype: int64


In [23]:
df_max_lng = df_test_cleaned[df_test_cleaned.lng > 121.0]
df_min_lng = df_test_cleaned[df_test_cleaned.lng < -77.0]
df_max_lat = df_test_cleaned[df_test_cleaned.lat > 69.0]
df_wrong_coordinates = pd.concat([df_max_lng, df_min_lng, df_max_lat])

print("It is interesting to note that in all the records where latitude and longitude have incorrect coordinates, planning_area and subzone have missing values, this can also be verified by checking for count of missing values")
print(df_wrong_coordinates["address"].value_counts())
# coordinates are incorrect for 5 'address'


# using the 'address' we can manually correct the latitude, longitude coordinates along with 
# filling of values for sub zone and planning_area

df_test_cleaned.loc[df_test_cleaned.address == "38 lorong 32 geylang", 
               ['property_type', 'lat', 'lng', 'subzone', 'planning_area']] = 'condo', 1.31262, 103.88686, 'aljunied', 'geylang'
df_test_cleaned.loc[df_test_cleaned.address == "17 farrer drive", 
               ['property_type', 'lat', 'lng', 'subzone', 'planning_area']] = 'condo', 1.313259, 103.806622, 'holland road', 'bukit timah'
        

It is interesting to note that in all the records where latitude and longitude have incorrect coordinates, planning_area and subzone have missing values, this can also be verified by checking for count of missing values
38 lorong 32 geylang    2
17 farrer drive         1
Name: address, dtype: int64


Merging With Auxiliary Data

In [24]:
df_pri_sch = pd.read_csv('../data/auxiliary-data/sg-primary-schools.csv').drop(["name", "subzone", 'planning_area'], axis=1).to_numpy()
pri_sch_coor = np.array([[radians(_) for _ in coor] for coor in df_pri_sch])
pri_sch_coor

array([[0.02518408, 1.81165779],
       [0.02502538, 1.81222266],
       [0.02399856, 1.81265384],
       [0.02253715, 1.81207179],
       [0.02426671, 1.81317637],
       [0.02416302, 1.81237461],
       [0.02301657, 1.81226397],
       [0.023799  , 1.81407625],
       [0.02389531, 1.81233873],
       [0.02313785, 1.81242773],
       [0.02415736, 1.81119459],
       [0.02310915, 1.81405736],
       [0.02307876, 1.81278752],
       [0.02227748, 1.81180553],
       [0.02343782, 1.81013216],
       [0.02397065, 1.8111182 ],
       [0.02334411, 1.81107318],
       [0.02348691, 1.81084051],
       [0.02532212, 1.81192393],
       [0.0231518 , 1.81307993],
       [0.02226716, 1.81234795],
       [0.02396002, 1.81439591],
       [0.02264075, 1.81255653],
       [0.02330942, 1.81296922],
       [0.02338433, 1.81430947],
       [0.02280164, 1.81358615],
       [0.02224708, 1.81214142],
       [0.02369771, 1.81276756],
       [0.0239653 , 1.81335677],
       [0.02384901, 1.81109583],
       [0.

In [25]:
df_sec_sch = pd.read_csv('../data/auxiliary-data/sg-secondary-schools.csv').drop(["name", "subzone", 'planning_area'], axis=1).to_numpy()
sec_sch_coor = np.array([[radians(_) for _ in coor] for coor in df_sec_sch])
sec_sch_coor

array([[0.02523592, 1.81170252],
       [0.02506458, 1.81217632],
       [0.02400556, 1.81255449],
       [0.02322382, 1.81413586],
       [0.02303435, 1.81228105],
       [0.02273813, 1.81131498],
       [0.02387036, 1.81239431],
       [0.02390286, 1.81106569],
       [0.02339885, 1.81308864],
       [0.02341857, 1.81256411],
       [0.02328376, 1.81402294],
       [0.02313538, 1.8142877 ],
       [0.02312108, 1.81415707],
       [0.02311665, 1.81270744],
       [0.02343555, 1.80993565],
       [0.02392644, 1.81300617],
       [0.02281608, 1.81316822],
       [0.02353421, 1.81061705],
       [0.02244361, 1.81187903],
       [0.02411551, 1.81058399],
       [0.02348688, 1.81081736],
       [0.02364606, 1.8124395 ],
       [0.02537261, 1.81190009],
       [0.02328645, 1.81294771],
       [0.02339603, 1.81432179],
       [0.02280866, 1.81370116],
       [0.02325952, 1.81239958],
       [0.0242945 , 1.81344042],
       [0.02398345, 1.81224768],
       [0.02228446, 1.81204633],
       [0.

In [26]:
df_mrt_station = pd.read_csv('../data/auxiliary-data/sg-mrt-stations.csv')
df_mrt_station = df_mrt_station.drop(["name", "lat", "lng", 'planning_area', 'code', 'line', 'opening_year'], axis=1)
mrt_station_cleaned = df_mrt_station.value_counts().to_frame(name="mrt_station")

mrt_station_coor = pd.read_csv('../data/auxiliary-data/sg-mrt-stations.csv').drop(["name", "subzone", 'planning_area', 'code', 'line', 'opening_year'], axis=1).to_numpy()
mrt_station_coor = np.array([[radians(_) for _ in coor] for coor in mrt_station_coor])

df_train_coor = df_cleaned[["lat", "lng"]].to_numpy()
df_train_coor = np.array([[radians(_) for _ in coor] for coor in df_train_coor])

df_test_coor = df_test_cleaned[["lat", "lng"]].to_numpy()
df_test_coor = np.array([[radians(_) for _ in coor] for coor in df_test_coor])

dist_matrix_train_mrt = sklearn.metrics.pairwise.haversine_distances(mrt_station_coor, df_train_coor)
# multiply to get meters
closest_dist_to_mrt_train = pd.DataFrame(np.amin(dist_matrix_train_mrt, axis=0)* 6371000, columns=["closest_dist_to_mrt"])

dist_matrix_test_mrt = sklearn.metrics.pairwise.haversine_distances(mrt_station_coor, df_test_coor)
# multiply to get meters
closest_dist_to_mrt_test = pd.DataFrame(np.amin(dist_matrix_test_mrt, axis=0)* 6371000, columns=["closest_dist_to_mrt"])

dist_matrix_train_pri = sklearn.metrics.pairwise.haversine_distances(pri_sch_coor, df_train_coor)
closest_dist_to_pri_train = pd.DataFrame(np.amin(dist_matrix_train_pri, axis=0)* 6371000, columns=["closest_dist_to_pri"])

dist_matrix_test_pri = sklearn.metrics.pairwise.haversine_distances(pri_sch_coor, df_test_coor)
closest_dist_to_pri_test = pd.DataFrame(np.amin(dist_matrix_test_pri, axis=0)* 6371000, columns=["closest_dist_to_pri"])

dist_matrix_train_sec = sklearn.metrics.pairwise.haversine_distances(sec_sch_coor, df_train_coor)
closest_dist_to_sec_train = pd.DataFrame(np.amin(dist_matrix_train_sec, axis=0)* 6371000, columns=["closest_dist_to_sec"])

dist_matrix_test_sec = sklearn.metrics.pairwise.haversine_distances(sec_sch_coor, df_test_coor)
closest_dist_to_sec_test = pd.DataFrame(np.amin(dist_matrix_test_sec, axis=0)* 6371000, columns=["closest_dist_to_sec"])

#during pri sch registration exercise, homeowners within 1km will be given priority
test = pd.DataFrame(dist_matrix_test_pri * 6371000)
test_near_pri_sch = test[test <= 1000].count().rename("close_pri_sch")

test = pd.DataFrame(dist_matrix_train_pri * 6371000)
train_near_pri_sch = test[test <= 1000].count().rename("close_pri_sch")

#no such priority for sec sch, put as 1km for now
#change 1000 to desired distance if necessary
test = pd.DataFrame(dist_matrix_test_sec * 6371000)
test_near_sec_sch = test[test <= 1000].count().rename("close_sec_sch")

test = pd.DataFrame(dist_matrix_train_sec * 6371000)
train_near_sec_sch = test[test <= 1000].count().rename("close_sec_sch")

df_cleaned = df_cleaned.merge(pri_sch_cleaned, how='left',left_on="subzone",right_on="subzone")\
    .merge(sec_sch_cleaned, how='left',left_on="subzone",right_on="subzone")\
    .merge(mrt_station_cleaned, how='left',left_on="subzone",right_on="subzone")\
    .merge(df_subzone, how='left',left_on="subzone",right_on="name")\
    .join(closest_dist_to_mrt_train)\
    .join(closest_dist_to_pri_train)\
    .join(closest_dist_to_sec_train)\
    .fillna({'pri_sch':0, 'sec_sch':0, 'mrt_station':0})

df_test_cleaned = df_test_cleaned.merge(pri_sch_cleaned, how='left',left_on="subzone",right_on="subzone")\
    .merge(sec_sch_cleaned, how='left',left_on="subzone",right_on="subzone")\
    .merge(mrt_station_cleaned, how='left',left_on="subzone",right_on="subzone")\
    .merge(df_subzone, how='left',left_on="subzone",right_on="name")\
    .join(closest_dist_to_mrt_test)\
    .join(closest_dist_to_pri_test)\
    .join(closest_dist_to_sec_test)\
    .join(test_near_pri_sch)\
    .join(test_near_sec_sch)\
    .fillna({'pri_sch':0, 'sec_sch':0, 'mrt_station':0})

Drop Columns at the end

In [27]:
df_cleaned = df_cleaned.drop(['title', 'address','property_name', 'property_details_url', 'listing_id', 'elevation', 'total_num_units', 'floor_level', 'available_unit_types'], axis = 1)
df_test_cleaned = df_test_cleaned.drop(['title','address','property_name', 'property_details_url', 'listing_id', 'elevation', 'total_num_units', 'floor_level', 'available_unit_types'], axis = 1)

In [28]:
print(df_cleaned.isnull().sum())
print(df_test_cleaned.isnull().sum())
print(df_cleaned.shape)
print(df_test_cleaned.shape)

property_type          0
num_beds               0
num_baths              0
size_sqft              0
furnishing             0
lat                    0
lng                    0
subzone                0
planning_area          0
price                  0
built_year             0
tenure                 0
pri_sch                0
sec_sch                0
mrt_station            0
population_density     0
closest_dist_to_mrt    0
closest_dist_to_pri    0
closest_dist_to_sec    0
dtype: int64
property_type          0
num_beds               0
num_baths              0
size_sqft              0
furnishing             0
lat                    0
lng                    0
subzone                0
planning_area          0
built_year             0
tenure                 0
pri_sch                0
sec_sch                0
mrt_station            0
population_density     0
closest_dist_to_mrt    0
closest_dist_to_pri    0
closest_dist_to_sec    0
close_pri_sch          0
close_sec_sch          0
dtype: int64

Saving the data

In [29]:
df_test_cleaned.to_csv('../data/test_cleaned.csv', index = False)
df_cleaned.to_csv('../data/train_cleaned.csv', index = False)

In [30]:
df_cleaned.isnull().sum()

property_type          0
num_beds               0
num_baths              0
size_sqft              0
furnishing             0
lat                    0
lng                    0
subzone                0
planning_area          0
price                  0
built_year             0
tenure                 0
pri_sch                0
sec_sch                0
mrt_station            0
population_density     0
closest_dist_to_mrt    0
closest_dist_to_pri    0
closest_dist_to_sec    0
dtype: int64

In [31]:
df_test_cleaned.isnull().sum()
df_test_cleaned.shape


(6966, 20)