For subzone: calculate population density
For pri sch, sec sch, shopping malls: calculate number of pri sch, sec schs, shopping malls in the subzone
For mrt-station: calculate distance to the nearest mrt station for each property

In [45]:
import numpy as np
import pandas as pd
from math import radians
import sklearn.metrics

In [2]:
df_pri_sch = pd.read_csv('../data/auxiliary-data/sg-primary-schools.csv')
df_pri_sch = df_pri_sch.drop(["name", "lat", "lng", 'planning_area'], axis=1)
print(df_pri_sch.head())

           subzone
0   woodlands east
1      yishun west
2        cheng san
3  alexandra north
4       anchorvale


In [3]:
pri_sch_cleaned = df_pri_sch.value_counts().to_frame(name="pri_sch")
print(pri_sch_cleaned)

                     pri_sch
subzone                     
woodlands east             6
tampines east              6
yunnan                     5
punggol town centre        4
bedok north                4
...                      ...
keat hong                  1
katong                     1
kampong ubi                1
kampong java               1
midview                    1

[109 rows x 1 columns]


In [4]:
df_sec_sch = pd.read_csv('../data/auxiliary-data/sg-secondary-schools.csv')
df_sec_sch = df_sec_sch.drop(["name", "lat", "lng", 'planning_area'], axis=1)
print(df_sec_sch.head())

          subzone
0  woodlands east
1     yishun west
2       cheng san
3     bedok north
4         malcolm


In [5]:
sec_sch_cleaned = df_sec_sch.value_counts().to_frame(name="sec_sch")

In [6]:
print(sec_sch_cleaned)

                sec_sch
subzone                
tampines east         5
marymount             3
bedok south           3
tampines west         3
woodlands east        3
...                 ...
joo seng              1
jelebu                1
hougang east          1
henderson hill        1
yunnan                1

[101 rows x 1 columns]


In [7]:
df_subzone = pd.read_csv('../data/auxiliary-data/sg-subzones.csv')
df_subzone["population_density"] = df_subzone['population']/df_subzone["area_size"]
df_subzone = df_subzone.drop(['area_size', 'population', 'planning_area'],axis=1).set_index("name")
print(df_subzone.head())

                        population_density
name                                      
ang mo kio town centre        15178.289681
cheng san                     29371.141572
chong boon                    24568.885592
kebun bahru                   21616.972477
sembawang hills                7657.909447


In [16]:
df_mrt_station = pd.read_csv('../data/auxiliary-data/sg-mrt-stations.csv')
df_mrt_station = df_mrt_station.drop(["name", "lat", "lng", 'planning_area', 'code', 'line', 'opening_year'], axis=1)
print(df_mrt_station.head())

         subzone
0    dhoby ghaut
1     bras basah
2      city hall
3  marina centre
4         nicoll


In [17]:
mrt_station_cleaned = df_mrt_station.value_counts().to_frame(name="mrt_station")
print(mrt_station_cleaned.head())

                   mrt_station
subzone                       
maritime square              4
aljunied                     3
serangoon central            3
geylang east                 3
city hall                    3


In [51]:
mrt_station_coor = pd.read_csv('../data/auxiliary-data/sg-mrt-stations.csv').drop(["name", "subzone", 'planning_area', 'code', 'line', 'opening_year'], axis=1).to_numpy()
mrt_station_coor = np.array([[radians(_) for _ in coor] for coor in mrt_station_coor])
mrt_station_coor

array([[0.02267028, 1.81245972],
       [0.02263451, 1.81253608],
       [0.02257859, 1.81261311],
       [0.02258453, 1.81270507],
       [0.02268521, 1.81276244],
       [0.02273837, 1.81296665],
       [0.02279752, 1.81309215],
       [0.02283847, 1.81320624],
       [0.0229903 , 1.81326202],
       [0.02314569, 1.81321032],
       [0.02330262, 1.81319445],
       [0.02343107, 1.81305113],
       [0.023573  , 1.81292187],
       [0.02358867, 1.81278295],
       [0.02358491, 1.81250942],
       [0.02353938, 1.81233983],
       [0.02333934, 1.81235272],
       [0.02308001, 1.81192029],
       [0.02299365, 1.81178055],
       [0.02290291, 1.81158892],
       [0.02279881, 1.81148357],
       [0.02268509, 1.81143286],
       [0.02257642, 1.81138251],
       [0.02237437, 1.81133741],
       [0.02227413, 1.8115008 ],
       [0.02220702, 1.81171139],
       [0.02217801, 1.81182214],
       [0.02208521, 1.81202754],
       [0.02237292, 1.8126829 ],
       [0.02227583, 1.8126048 ],
       [0.

In [52]:
df_train_coor = pd.read_csv('../data/train.csv')[["lat", "lng"]].to_numpy()
df_train_coor = np.array([[radians(_) for _ in coor] for coor in df_train_coor])
df_train_coor

array([[0.02468593, 1.81230095],
       [0.02395633, 1.81297166],
       [0.02266786, 1.81332375],
       ...,
       [0.02296785, 1.81229488],
       [0.02514589, 1.8117682 ],
       [0.02296785, 1.81229488]])

In [63]:
dist_matrix_train = sklearn.metrics.pairwise.haversine_distances(mrt_station_coor, df_train_coor)
# multiply to get meters
closest_dist_to_mrt_train = pd.DataFrame(np.amin(dist_matrix_train, axis=0)* 6371000, columns=["closest_dist_to_mrt"])
print(closest_dist_to_mrt_train)

       closest_dist_to_mrt
0               574.204611
1              1734.306081
2              1319.767117
3               726.004555
4               371.115948
...                    ...
20249           149.940377
20250           443.604873
20251           424.243848
20252           631.719872
20253           424.243848

[20254 rows x 1 columns]


In [64]:
df_test_coor = pd.read_csv('../data/test.csv')[["lat", "lng"]].to_numpy()
df_test_coor = np.array([[radians(_) for _ in coor] for coor in df_test_coor])
df_test_coor

array([[0.02346305, 1.81302517],
       [0.02409045, 1.81416291],
       [0.02259623, 1.81252571],
       ...,
       [0.02258641, 1.8122929 ],
       [0.02275449, 1.81359673],
       [0.02273763, 1.81286363]])

In [65]:
dist_matrix_test = sklearn.metrics.pairwise.haversine_distances(mrt_station_coor, df_test_coor)
# multiply to get meters
closest_dist_to_mrt_test = pd.DataFrame(np.amin(dist_matrix_test, axis=0)* 6371000, columns=["closest_dist_to_mrt"])
print(closest_dist_to_mrt_test)

      closest_dist_to_mrt
0              262.436752
1             1004.449411
2              252.644179
3             1374.809476
4              273.508629
...                   ...
6995          3009.557197
6996           389.284983
6997           732.925850
6998          1931.187202
6999           656.186344

[7000 rows x 1 columns]


In [56]:
df_train = pd.read_csv('../data/train_cleaned.csv')
df_test = pd.read_csv('../data/test_cleaned.csv')

In [66]:
df_train.merge(pri_sch_cleaned, how='left',left_on="subzone",right_on="subzone")\
    .merge(sec_sch_cleaned, how='left',left_on="subzone",right_on="subzone")\
    .merge(mrt_station_cleaned, how='left',left_on="subzone",right_on="subzone")\
    .merge(df_subzone, how='left',left_on="subzone",right_on="name")\
    .join(closest_dist_to_mrt_train)\
    .fillna({'pri_sch':0, 'sec_sch':0, 'mrt_station':0})

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,address,property_name,property_type,num_beds,num_baths,size_sqft,furnishing,...,planning_area,price,built_year,tenure,available_unit_types,pri_sch,sec_sch,mrt_station,population_density,closest_dist_to_mrt
0,0,0,hdb flat for sale in 866 yishun street 81,sembawang / yishun (d27),866 yishun street 81,hdb,3.0,2.0,1115,unspecified,...,yishun,514500.0,1988.0,99-year leasehold,,2.0,2.0,0.0,31517.683928,574.204611
1,1,1,hdb flat for sale in 506b serangoon north aven...,hougang / punggol / sengkang (d19),hdb-serangoon estate,hdb,4.0,2.0,1575,unspecified,...,serangoon,995400.0,1992.0,99-year leasehold,"1, 2, 3, 4, 5, 6 br",1.0,0.0,0.0,23280.268731,1734.306081
2,2,2,4 bed condo for sale in meyerhouse,128 meyer road,meyerhouse,condo,4.0,6.0,3070,partial,...,marine parade,8485000.0,2022.0,freehold,"studio, 3, 4, 5, 6 br",0.0,1.0,0.0,5829.779777,1319.767117
3,3,3,3 bed condo for sale in leedon green,26 leedon heights,leedon green,condo,3.0,2.0,958,partial,...,bukit timah,2626000.0,2023.0,freehold,"studio, 1, 2, 3, 4 br",1.0,1.0,1.0,11059.413028,726.004555
4,4,4,2 bed condo for sale in one bernam,1 bernam street,one bernam,condo,2.0,1.0,732,unspecified,...,downtown core,1764000.0,2026.0,99-year leasehold,"studio, 1, 2, 3, 4, 5 br",0.0,0.0,0.0,775.193798,371.115948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19632,19641,19641,2 bed condo for sale in lentor modern,lentor central,lentor modern,condo,2.0,2.0,635,unspecified,...,ang mo kio,1050000.0,2026.0,99-year leasehold,"1, 2, 3, 4 br",0.0,0.0,1.0,2384.380061,370.288233
19633,19642,19642,2 bed condo for sale in mori,223 guillemard road,mori,condo,2.0,2.0,883,unspecified,...,kallang,2087400.0,2026.0,freehold,"1, 2, 3, 4 br",0.0,0.0,2.0,12802.219580,220.711188
19634,19643,19643,4 bed condo for sale in pullman residences newton,18 dunearn road,pullman residences newton,condo,4.0,4.0,1378,unspecified,...,novena,4193700.0,2023.0,freehold,"studio, 1, 2, 3, 4, 5 br",1.0,0.0,1.0,6901.669759,1391.456418
19635,19644,19644,hdb flat for sale in 691d woodlands drive 73,admiralty / woodlands (d25),admiralty flora,hdb,3.0,2.0,1205,unfurnished,...,woodlands,754800.0,2017.0,99-year leasehold,"2, 3, 4 br",6.0,3.0,1.0,38762.482867,732.641452


In [67]:
df_train.merge(pri_sch_cleaned, how='left',left_on="subzone",right_on="subzone")\
    .merge(sec_sch_cleaned, how='left',left_on="subzone",right_on="subzone")\
    .merge(mrt_station_cleaned, how='left',left_on="subzone",right_on="subzone")\
    .merge(df_subzone, how='left',left_on="subzone",right_on="name")\
    .join(closest_dist_to_mrt_test)\
    .fillna({'pri_sch':0, 'sec_sch':0, 'mrt_station':0})

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,address,property_name,property_type,num_beds,num_baths,size_sqft,furnishing,...,planning_area,price,built_year,tenure,available_unit_types,pri_sch,sec_sch,mrt_station,population_density,closest_dist_to_mrt
0,0,0,hdb flat for sale in 866 yishun street 81,sembawang / yishun (d27),866 yishun street 81,hdb,3.0,2.0,1115,unspecified,...,yishun,514500.0,1988.0,99-year leasehold,,2.0,2.0,0.0,31517.683928,262.436752
1,1,1,hdb flat for sale in 506b serangoon north aven...,hougang / punggol / sengkang (d19),hdb-serangoon estate,hdb,4.0,2.0,1575,unspecified,...,serangoon,995400.0,1992.0,99-year leasehold,"1, 2, 3, 4, 5, 6 br",1.0,0.0,0.0,23280.268731,1004.449411
2,2,2,4 bed condo for sale in meyerhouse,128 meyer road,meyerhouse,condo,4.0,6.0,3070,partial,...,marine parade,8485000.0,2022.0,freehold,"studio, 3, 4, 5, 6 br",0.0,1.0,0.0,5829.779777,252.644179
3,3,3,3 bed condo for sale in leedon green,26 leedon heights,leedon green,condo,3.0,2.0,958,partial,...,bukit timah,2626000.0,2023.0,freehold,"studio, 1, 2, 3, 4 br",1.0,1.0,1.0,11059.413028,1374.809476
4,4,4,2 bed condo for sale in one bernam,1 bernam street,one bernam,condo,2.0,1.0,732,unspecified,...,downtown core,1764000.0,2026.0,99-year leasehold,"studio, 1, 2, 3, 4, 5 br",0.0,0.0,0.0,775.193798,273.508629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19632,19641,19641,2 bed condo for sale in lentor modern,lentor central,lentor modern,condo,2.0,2.0,635,unspecified,...,ang mo kio,1050000.0,2026.0,99-year leasehold,"1, 2, 3, 4 br",0.0,0.0,1.0,2384.380061,
19633,19642,19642,2 bed condo for sale in mori,223 guillemard road,mori,condo,2.0,2.0,883,unspecified,...,kallang,2087400.0,2026.0,freehold,"1, 2, 3, 4 br",0.0,0.0,2.0,12802.219580,
19634,19643,19643,4 bed condo for sale in pullman residences newton,18 dunearn road,pullman residences newton,condo,4.0,4.0,1378,unspecified,...,novena,4193700.0,2023.0,freehold,"studio, 1, 2, 3, 4, 5 br",1.0,0.0,1.0,6901.669759,
19635,19644,19644,hdb flat for sale in 691d woodlands drive 73,admiralty / woodlands (d25),admiralty flora,hdb,3.0,2.0,1205,unfurnished,...,woodlands,754800.0,2017.0,99-year leasehold,"2, 3, 4 br",6.0,3.0,1.0,38762.482867,
