For subzones: calculate the population density of each subzone.
For primary schools, secondary schools and shopping malls: count the number of primary schools, secondary schools and shopping malls in each subzone.
For MRT stations: calculate the distance from each property to its nearest MRT station.

In [None]:
import numpy as np
import pandas as pd
from math import radians
import sklearn.metrics

In [None]:
# Load the primary-school listing and discard the name, coordinate and
# planning-area columns; only the remaining columns are kept for counting.
df_pri_sch = pd.read_csv(
    '../data/auxiliary-data/sg-primary-schools.csv'
).drop(columns=["name", "lat", "lng", 'planning_area'])
# Preview the first rows of the trimmed table.
print(df_pri_sch.head())

In [None]:
# Tally the rows of df_pri_sch per distinct combination of its remaining
# columns and expose the tally as a one-column frame named "pri_sch".
pri_sch_cleaned = df_pri_sch.value_counts().rename("pri_sch").to_frame()
print(pri_sch_cleaned)

In [None]:
# Load the secondary-school listing and discard the name, coordinate and
# planning-area columns; only the remaining columns are kept for counting.
df_sec_sch = pd.read_csv(
    '../data/auxiliary-data/sg-secondary-schools.csv'
).drop(columns=["name", "lat", "lng", 'planning_area'])
# Preview the first rows of the trimmed table.
print(df_sec_sch.head())

In [None]:
# Tally the rows of df_sec_sch per distinct combination of its remaining
# columns and expose the tally as a one-column frame named "sec_sch".
sec_sch_cleaned = df_sec_sch.value_counts().rename("sec_sch").to_frame()

In [None]:
# Show the secondary-school counts held in sec_sch_cleaned.
print(sec_sch_cleaned)

In [None]:
# Load the subzone table and derive population density as
# population divided by area_size.
df_subzone = pd.read_csv('../data/auxiliary-data/sg-subzones.csv')
df_subzone = df_subzone.assign(
    population_density=df_subzone['population'].div(df_subzone["area_size"])
)
# Drop the raw inputs and the planning area, then index the table by
# subzone name so it can be matched against a subzone key.
df_subzone = (
    df_subzone
    .drop(columns=['area_size', 'population', 'planning_area'])
    .set_index("name")
)
print(df_subzone.head())

In [None]:
# Load the MRT station listing and discard the descriptive and coordinate
# columns; only the remaining columns are kept for counting.
df_mrt_station = pd.read_csv(
    '../data/auxiliary-data/sg-mrt-stations.csv'
).drop(
    columns=["name", "lat", "lng", 'planning_area', 'code', 'line', 'opening_year']
)
# Preview the first rows of the trimmed table.
print(df_mrt_station.head())

In [None]:
# Tally the rows of df_mrt_station per distinct combination of its remaining
# columns and expose the tally as a one-column frame named "mrt_station".
mrt_station_cleaned = df_mrt_station.value_counts().rename("mrt_station").to_frame()
print(mrt_station_cleaned.head())

In [None]:
# MRT station coordinates as an (n_stations, 2) array in radians, ready for
# the haversine distance computation.
#
# "lat" and "lng" are selected explicitly (as the train/test coordinate cells
# do) instead of dropping every other column: haversine_distances expects
# [latitude, longitude] in that order, and an explicit selection guarantees
# the order regardless of how the CSV columns are laid out.
# np.radians converts the whole array at once instead of looping in Python.
mrt_station_coor = np.radians(
    pd.read_csv('../data/auxiliary-data/sg-mrt-stations.csv')[["lat", "lng"]].to_numpy()
)
mrt_station_coor

In [None]:
# Training-set property coordinates as an (n_rows, 2) [lat, lng] array in
# radians. Despite the df_ prefix this is a NumPy array, not a DataFrame.
# np.radians converts the whole array in one vectorised call rather than
# invoking math.radians once per element in a nested Python loop.
df_train_coor = np.radians(
    pd.read_csv('../data/train.csv')[["lat", "lng"]].to_numpy()
)
df_train_coor

In [None]:
# Pairwise great-circle distances: one row per MRT station, one column per
# training property (inputs are [lat, lng] in radians).
dist_matrix_train = sklearn.metrics.pairwise.haversine_distances(
    mrt_station_coor, df_train_coor
)
# Take the minimum over stations (axis 0) for each property, then scale by
# 6371000 to get meters.
closest_dist_to_mrt_train = pd.DataFrame(
    {"closest_dist_to_mrt": dist_matrix_train.min(axis=0) * 6371000}
)
print(closest_dist_to_mrt_train)

In [None]:
# Test-set property coordinates as an (n_rows, 2) [lat, lng] array in
# radians. Despite the df_ prefix this is a NumPy array, not a DataFrame.
# np.radians converts the whole array in one vectorised call rather than
# invoking math.radians once per element in a nested Python loop.
df_test_coor = np.radians(
    pd.read_csv('../data/test.csv')[["lat", "lng"]].to_numpy()
)
df_test_coor

In [None]:
# Pairwise great-circle distances: one row per MRT station, one column per
# test property (inputs are [lat, lng] in radians).
dist_matrix_test = sklearn.metrics.pairwise.haversine_distances(
    mrt_station_coor, df_test_coor
)
# Take the minimum over stations (axis 0) for each property, then scale by
# 6371000 to get meters.
closest_dist_to_mrt_test = pd.DataFrame(
    {"closest_dist_to_mrt": dist_matrix_test.min(axis=0) * 6371000}
)
print(closest_dist_to_mrt_test)

In [None]:
# Load the previously cleaned train and test tables that the auxiliary
# features will be attached to.
df_train, df_test = map(
    pd.read_csv,
    ('../data/train_cleaned.csv', '../data/test_cleaned.csv'),
)

In [None]:
# Attach the auxiliary features to the training set: per-subzone school and
# MRT-station counts, subzone population density, and the distance to the
# nearest MRT station. Subzones with no matching count row get 0.
#
# The result is assigned back to df_train. Previously the merged frame was
# only displayed and then discarded, so the CSV export wrote df_train without
# any of these columns. Because the frame is now updated in place, reload
# df_train before re-running this cell.
#
# NOTE(review): closest_dist_to_mrt_train was computed from train.csv, while
# df_train comes from train_cleaned.csv. The join is by default integer
# index, so it assumes both files hold the same rows in the same order --
# confirm.
df_train = (
    df_train
    .merge(pri_sch_cleaned, how='left', left_on="subzone", right_on="subzone")
    .merge(sec_sch_cleaned, how='left', left_on="subzone", right_on="subzone")
    .merge(mrt_station_cleaned, how='left', left_on="subzone", right_on="subzone")
    .merge(df_subzone, how='left', left_on="subzone", right_on="name")
    .join(closest_dist_to_mrt_train)
    .fillna({'pri_sch': 0, 'sec_sch': 0, 'mrt_station': 0})
)
# Keep the notebook display of the enriched frame.
df_train

In [None]:
# Attach the auxiliary features to the test set: per-subzone school and
# MRT-station counts, subzone population density, and the distance to the
# nearest MRT station. Subzones with no matching count row get 0.
#
# Two fixes over the previous version: the chain now starts from df_test
# (it used df_train, pairing training rows with test-set distances), and the
# result is assigned back instead of being displayed and discarded. Because
# the frame is now updated in place, reload df_test before re-running this
# cell.
#
# NOTE(review): closest_dist_to_mrt_test was computed from test.csv, while
# df_test comes from test_cleaned.csv. The join is by default integer index,
# so it assumes both files hold the same rows in the same order -- confirm.
df_test = (
    df_test
    .merge(pri_sch_cleaned, how='left', left_on="subzone", right_on="subzone")
    .merge(sec_sch_cleaned, how='left', left_on="subzone", right_on="subzone")
    .merge(mrt_station_cleaned, how='left', left_on="subzone", right_on="subzone")
    .merge(df_subzone, how='left', left_on="subzone", right_on="name")
    .join(closest_dist_to_mrt_test)
    .fillna({'pri_sch': 0, 'sec_sch': 0, 'mrt_station': 0})
)
# Keep the notebook display of the enriched frame.
df_test

In [None]:
# Persist both frames without the index column. Previously only the training
# frame was written, so the test frame never reached disk.
# NOTE(review): the test output filename mirrors the train one -- confirm it
# matches what downstream notebooks expect.
df_train.to_csv('../data/train_cleaned_1.csv', index=False)
df_test.to_csv('../data/test_cleaned_1.csv', index=False)