In [None]:
import numpy as np
import pandas as pd
from math import radians
import sklearn.metrics

In [None]:
# Load the raw train and test listings, then report the row count of each split.
df_train, df_test = pd.read_csv('../data/train.csv'), pd.read_csv('../data/test.csv')

print(f"Train size: {len(df_train)}")
print(f"Test size: {len(df_test)}")

Primary School

In [None]:
# Spot check: median built year of the test listings in one example subzone.
in_serangoon_garden = df_test["subzone"] == "serangoon garden"
np.median(df_test.loc[in_serangoon_garden, "built_year"].dropna())

In [None]:
# Fill each missing built_year with the median built_year of the listings in
# the same subzone.
# groupby().transform computes every subzone median once and broadcasts it back
# to the rows, instead of re-filtering the whole frame for each missing row.
# Rows with a missing subzone, or in a subzone with no known built_year, stay
# NaN here.
subzone_median_year = df_test.groupby("subzone")["built_year"].transform("median")
df_test["built_year"] = df_test["built_year"].fillna(subzone_median_year)

print(df_test.isnull().sum())

In [None]:
# List the test listings whose built_year is still missing after the subzone fill.
print("We can see that there are still 20 properties without built year.")
df_test[df_test['built_year'].isna()]

In [None]:
# Built years looked up manually online, keyed by a fragment of the listing title.
# na=False treats a missing title as "no match" instead of leaving NaN in the
# boolean mask (which .loc refuses to index with); regex=False matches the
# fragment as a plain substring.
manual_built_years = {
    "copen grand": 2027,
    "jurong park": 1971,
    "ponggol park": 2011,
}
for title_fragment, built_year in manual_built_years.items():
    title_matches = df_test["title"].str.contains(title_fragment, na=False, regex=False)
    df_test.loc[title_matches, 'built_year'] = built_year

In [None]:
# Test listings that have no subzone recorded.
df_test[df_test['subzone'].isna()]

In [None]:
# Patch subzone / planning_area by address for the test listings that lack them.
# "17 farrer drive" and "5 jalan mutiara" use the same subzone / planning_area
# that the coordinate-correction cells further down assign to these addresses
# (previously the two entries here disagreed with those cells).
# NOTE(review): "1 tessensohn road" is lavender/kallang here but balestier/novena
# in the train-set coordinate correction - confirm which one is right.
location_by_address = {
    "1 tessensohn road": ('lavender', 'kallang'),
    "38 lorong 32 geylang": ('aljunied', 'geylang'),
    "17 farrer drive": ('holland road', 'bukit timah'),
    "5 jalan mutiara": ('leonie hill', 'river valley'),
}
for address_fragment, (subzone, planning_area) in location_by_address.items():
    # Substring match, so every address containing the fragment is updated.
    at_address = df_test["address"].str.contains(address_fragment, na=False, regex=False)
    df_test.loc[at_address, 'subzone'] = subzone
    df_test.loc[at_address, 'planning_area'] = planning_area

df_test.isnull().sum()

In [None]:
# Number of primary schools per remaining key (presumably just the subzone,
# since every other known column is dropped before counting).
df_pri_sch = (
    pd.read_csv('../data/auxiliary-data/sg-primary-schools.csv')
    .drop(columns=["name", "lat", "lng", 'planning_area'])
)
pri_sch_cleaned = df_pri_sch.value_counts().to_frame(name="pri_sch")

Secondary School

In [None]:
# Number of secondary schools per remaining key (presumably just the subzone,
# since every other known column is dropped before counting).
df_sec_sch = (
    pd.read_csv('../data/auxiliary-data/sg-secondary-schools.csv')
    .drop(columns=["name", "lat", "lng", 'planning_area'])
)
sec_sch_cleaned = df_sec_sch.value_counts().to_frame(name="sec_sch")

Population Density of Subzone

In [None]:
df_subzone = pd.read_csv('../data/auxiliary-data/sg-subzones.csv')

# Subzones recorded with zero population get a nominal population of 10, so
# their density is small but non-zero. One vectorised assignment replaces the
# row-by-row loop (0 + 10 == 10, so the result is the same).
df_subzone.loc[df_subzone["population"] == 0, "population"] = 10

# Density = population / area_size; afterwards only the density is kept,
# indexed by subzone name for the later merge.
df_subzone["population_density"] = df_subzone['population'] / df_subzone["area_size"]
df_subzone = df_subzone.drop(['area_size', 'population', 'planning_area'], axis=1).set_index("name")

In [None]:
# Work on copies: the cleaning cells below assign columns in place, and a plain
# alias would silently modify the frames loaded from disk as well.
df_dirty_test = df_test.copy()
df_dirty_train = df_train.copy()

Remove duplicates and invalid data (Rows)


In [None]:
# Remove exact duplicate rows, then every train row that cannot be used:
# non-positive size or price, a missing value in any required column, or a
# furnishing value of "na".
required_columns = ['num_beds', 'num_baths', 'price', 'size_sqft', 'built_year', 'available_unit_types', 'tenure']

df_cleaned = df_dirty_train.drop_duplicates()
df_cleaned = df_cleaned.loc[df_cleaned['size_sqft'] > 0]
df_cleaned = df_cleaned.dropna(subset=required_columns)
df_cleaned = df_cleaned.loc[df_cleaned['price'] > 0]
df_cleaned = df_cleaned.loc[df_cleaned['furnishing'] != "na"]
print(f'Records dropped :{len(df_dirty_train) - len(df_cleaned)}' )


#c.	Standardize capitalization


In [None]:
# Lower-case the categorical text columns so that equal categories compare equal.
text_columns = ['property_type', 'tenure', 'furnishing', 'subzone', 'planning_area']
# Coordinates stay in float64. float16 keeps only 11 significant bits: around
# lng ~ 103.8 neighbouring representable values are 0.0625 degrees apart
# (several kilometres) and around lat ~ 1.3 about 0.001 degrees (~100 m), which
# would corrupt every distance feature computed from lat/lng further down.
coordinate_columns = ['lng', 'lat']

for column in text_columns:
    df_cleaned[column] = df_cleaned[column].str.lower()

# Train rows with missing values in these columns were dropped during row
# filtering, so the integer cast cannot hit NaN.
for column in ['built_year', 'num_beds', 'num_baths']:
    df_cleaned[column] = df_cleaned[column].astype(int)
for column in coordinate_columns:
    df_cleaned[column] = df_cleaned[column].astype(np.float64)

df_test_cleaned = df_dirty_test
for column in text_columns:
    df_test_cleaned[column] = df_test_cleaned[column].str.lower()

# built_year / num_beds / num_baths are deliberately not cast to int for the
# test set here: cells further down still impute missing values in these
# columns, and NaN cannot be represented in a plain int column.
for column in coordinate_columns:
    df_test_cleaned[column] = df_test_cleaned[column].astype(np.float64)


Clean Lease tenure column, Property_Type Column

In [None]:
#TENURE COLUMN
# Rare lease lengths are folded into the two standard leasehold buckets.
mask_999 = ['947-year leasehold', '929-year leasehold', '946-year leasehold',
'956-year leasehold']
mask_99 = ['100-year leasehold', '102-year leasehold', '110-year leasehold', '103-year leasehold']
tenure_buckets = {
    **dict.fromkeys(mask_999, '999-year leasehold'),
    **dict.fromkeys(mask_99, '99-year leasehold'),
}

# Replace only inside the tenure column; a frame-wide replace would scan every
# column for these strings. assign() returns a new frame, as replace() did.
df_cleaned = df_cleaned.assign(tenure=df_cleaned['tenure'].replace(tenure_buckets))
df_test_cleaned = df_test_cleaned.assign(tenure=df_test_cleaned['tenure'].replace(tenure_buckets))

print(f"Train size: {df_cleaned.shape[0]}")
print(f"Test size: {df_test_cleaned.shape[0]}")


In [None]:
#PROPERTY TYPE
def _consolidate_property_types(frame):
    """Merge fine-grained property types into broader ones, modifying ``frame`` in place.

    - any type containing "hdb" ("hdb 3 rooms", "hdb 4 rooms", ...) becomes "hdb",
      since the number of rooms is already available in num_beds;
    - "apartment" / "walk-up" listings whose title mentions "condo" become "condo";
    - "good class bungalow" becomes "bungalow".

    Uses .loc assignment rather than ``frame[col].mask(..., inplace=True)``: the
    chained in-place form updates a temporary Series and does not reach the
    frame when pandas Copy-on-Write is active. na=False treats missing text as
    "no match". Returns the same frame for convenience.
    """
    is_hdb = frame['property_type'].str.contains("hdb", na=False)
    frame.loc[is_hdb, 'property_type'] = "hdb"

    title_mentions_condo = frame['title'].str.contains('condo', na=False)
    is_flat_like = frame['property_type'].isin(["apartment", "walk-up"])
    frame.loc[is_flat_like & title_mentions_condo, 'property_type'] = "condo"

    frame.loc[frame['property_type'] == "good class bungalow", 'property_type'] = "bungalow"
    return frame


df_cleaned = _consolidate_property_types(df_cleaned)
# "land only" listings are removed from the train set.
df_cleaned = df_cleaned[df_cleaned['property_type'] != 'land only']

#TODO: Reduce number of property types with less properties. Maybe do an EDA and figure out best way to remove them. Perhaps club 
#them in different category.

#Test
df_test_cleaned = _consolidate_property_types(df_test_cleaned)

# Test-only: re-label "conservation house" listings from their details URL.
# The rules run in order; a listing re-labelled by one rule is no longer a
# "conservation house" and is skipped by the following rules.
# The two "landed" entries have been picked manually from the website 99.co
conservation_house_rules = [
    ('condo', "condo"),
    ('blair-plain-conservation-area', "landed"),
    ('beng-tong-mansion', "landed"),
]
for url_fragment, new_type in conservation_house_rules:
    is_conservation_house = df_test_cleaned['property_type'] == "conservation house"
    url_matches = df_test_cleaned['property_details_url'].str.contains(url_fragment, na=False)
    df_test_cleaned.loc[is_conservation_house & url_matches, 'property_type'] = new_type

print(f"Train size: {df_cleaned.shape[0]}")
print(f"Test size: {df_test_cleaned.shape[0]}")


Handle Missing Values

In [None]:
#h. Handle missing values
# Each step below builds a lookup table (dfmap) with one value per key, drops
# the column from df_cleaned and left-merges the lookup back in. Consequences
# visible in the code:
#   * every row receives the value of the first row seen for its key, not only
#     the rows where the value was missing;
#   * the re-attached column ends up as the last column of the frame;
#   * merge returns a new frame with a fresh 0..n-1 index.
# NOTE(review): train rows with missing built_year / tenure were already dropped
# during row filtering, so these merges appear to have nothing left to fill in
# the train set - confirm they are still wanted.

#built_year
# One built_year per property_name (first occurrence wins).
dfmap = df_cleaned.dropna(subset = ['built_year'])[['built_year', 'property_name']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['property_name'])
df_cleaned = df_cleaned.drop(columns=['built_year']).merge(dfmap, on=['property_name'], how='left')

# tenure, keyed by property_name (first occurrence wins).
dfmap = df_cleaned.dropna(subset = ['tenure'])[['tenure', 'property_name']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['property_name'])
df_cleaned = df_cleaned.drop(columns=['tenure']).merge(dfmap, on=['property_name'], how='left')

# tenure again, this time keyed by address. The lookup is built from the tenure
# column produced by the previous step, which is then dropped and rebuilt.
dfmap = df_cleaned.dropna(subset = ['tenure'])[['tenure', 'address']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['address'])
df_cleaned = df_cleaned.drop(columns=['tenure']).merge(dfmap, on=['address'], how='left')

#df_cleaned.isnull().sum()



In [None]:
#h. Handle missing values
# Same lookup-and-merge pattern as for the train set: build one value per key,
# drop the column, left-merge the lookup back. Rows whose key has a known value
# somewhere in the test set get that value (first occurrence wins, and existing
# values are replaced by it too); rows whose key has none stay NaN. Each merge
# returns a new frame with a fresh 0..n-1 index and the re-attached column last.
#print(df_test_cleaned.isnull().sum())
print("----------------------------------------------------")

#built_year
# One built_year per property_name.
dfmap = df_test_cleaned.dropna(subset = ['built_year'])[['built_year', 'property_name']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['property_name'])
df_test_cleaned = df_test_cleaned.drop(columns=['built_year']).merge(dfmap, on=['property_name'], how='left')


#df_test_cleaned = df_test_cleaned.dropna(subset=['built_year'])

# tenure, keyed by property_name.
dfmap = df_test_cleaned.dropna(subset = ['tenure'])[['tenure', 'property_name']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['property_name'])
df_test_cleaned = df_test_cleaned.drop(columns=['tenure']).merge(dfmap, on=['property_name'], how='left')

# tenure again, keyed by address; built from the result of the previous step.
dfmap = df_test_cleaned.dropna(subset = ['tenure'])[['tenure', 'address']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['address'])
df_test_cleaned = df_test_cleaned.drop(columns=['tenure']).merge(dfmap, on=['address'], how='left')

def _fill_baths_from_beds(frame):
    """Fill missing num_baths from num_beds, in place.

    1 bed -> 1 bath, 2 beds -> 2 baths, more than 2 beds -> beds - 1.
    Every comparison is parenthesised: `&` binds tighter than `==`, `>` and `<`,
    so `a.isnull() & b == 1` would be evaluated as `(a.isnull() & b) == 1`.
    """
    baths_missing = frame['num_baths'].isnull()
    frame.loc[baths_missing & (frame['num_beds'] == 1), 'num_baths'] = 1
    frame.loc[baths_missing & (frame['num_beds'] == 2), 'num_baths'] = 2
    many_beds = baths_missing & (frame['num_beds'] > 2)
    frame.loc[many_beds, 'num_baths'] = frame.loc[many_beds, 'num_beds'] - 1


def _fill_beds_from_baths(frame):
    """Fill missing num_beds from num_baths, in place.

    1 bath -> 1 bed, 2 baths -> 2 beds, more than 2 baths -> baths + 1.
    """
    beds_missing = frame['num_beds'].isnull()
    frame.loc[beds_missing & (frame['num_baths'] == 1), 'num_beds'] = 1
    frame.loc[beds_missing & (frame['num_baths'] == 2), 'num_beds'] = 2
    many_baths = beds_missing & (frame['num_baths'] > 2)
    frame.loc[many_baths, 'num_beds'] = frame.loc[many_baths, 'num_baths'] + 1


#Fill in number of baths from number of bedrooms.
_fill_baths_from_beds(df_test_cleaned)

#Fill in number of beds from number of bathrooms.
_fill_beds_from_baths(df_test_cleaned)

#There are 5 properties which have both Nan values for bed and baths. Estimate their beds from area.
# Thresholds are applied smallest first; the missing-mask is recomputed on each
# pass so a listing is assigned the first size band it falls into. Listings of
# 3000 sqft or more keep a missing num_beds.
for max_size_sqft, estimated_beds in ((600, 1), (1000, 2), (1700, 3), (3000, 4)):
    beds_unresolved = df_test_cleaned['num_beds'].isnull() & (df_test_cleaned['size_sqft'] < max_size_sqft)
    df_test_cleaned.loc[beds_unresolved, 'num_beds'] = estimated_beds

#Estimate their baths from beds.
_fill_baths_from_beds(df_test_cleaned)


In [None]:
# HDB listings with no tenure recorded are set to a 99-year leasehold.
hdb_without_tenure = df_test_cleaned["tenure"].isnull() & (df_test_cleaned["property_type"] == "hdb")
df_test_cleaned.loc[hdb_without_tenure, "tenure"] = "99-year leasehold"
df_test_cleaned

In [None]:
# Tenure distribution of train condos; the most frequent tenure per property
# type is used below to fill unknown tenures in the test set.
df_cleaned.loc[df_cleaned["property_type"] == "condo", "tenure"].value_counts()

In [None]:
# Tenure distribution of train terraced houses.
df_cleaned.loc[df_cleaned["property_type"] == "terraced house", "tenure"].value_counts()

In [None]:
# Tenure distribution of train semi-detached houses.
df_cleaned.loc[df_cleaned["property_type"] == "semi-detached house", "tenure"].value_counts()

In [None]:
# Tenure distribution of train bungalows.
df_cleaned.loc[df_cleaned["property_type"] == "bungalow", "tenure"].value_counts()

In [None]:
# Tenure distribution of train corner terraces.
df_cleaned.loc[df_cleaned["property_type"] == "corner terrace", "tenure"].value_counts()

In [None]:
# Tenure distribution of train landed properties.
df_cleaned.loc[df_cleaned["property_type"] == "landed", "tenure"].value_counts()

In [None]:
# Tenure distribution of train cluster houses.
df_cleaned.loc[df_cleaned["property_type"] == "cluster house", "tenure"].value_counts()

In [None]:
# Default tenure assigned to test listings of each property type when their
# tenure is still unknown.
default_tenure_by_type = {
    "condo": "99-year leasehold",
    "landed": "99-year leasehold",
    "cluster house": "99-year leasehold",
    "terraced house": "freehold",
    "semi-detached house": "freehold",
    "bungalow": "freehold",
    "corner terrace": "freehold",
}
for property_type, default_tenure in default_tenure_by_type.items():
    needs_default = df_test_cleaned["tenure"].isnull() & (df_test_cleaned["property_type"] == property_type)
    df_test_cleaned.loc[needs_default, "tenure"] = default_tenure

In [None]:
# Two train records look wrong (num_beds = 1 with num_baths = 10 and a price
# above 3e8); keep only listings priced below 3e8 so they do not skew results.
price_is_plausible = df_cleaned['price'].lt(3e8)
df_cleaned = df_cleaned[price_is_plausible]

The extreme points of Singapore have the following (latitude, longitude) coordinates:
1. left-most (Tuas) :  1.30871,103.64287
2. right-most (Changi) : 1.34538,104.00270
3. top-most (Sembawang) : 1.46227,103.79487
4. bottom-most (Bukit Merah) : 1.28762,103.82467


#### Min latitude - 1.28762       Max latitude - 1.46227

#### Min longitude - 103.64         Max longitude - 104.00

However, the data contains a minimum longitude of -77.065364 and a maximum latitude of 69.486768, both far outside the valid range for Singapore, so those coordinates must be wrong.

<img src="images/singapore-lat-long-map.jpeg" width=600 height=600 />

In [None]:
# Collect the train listings whose coordinates fall outside the thresholds below.
df_max_lng = df_cleaned.loc[df_cleaned['lng'] > 121.0]
df_min_lng = df_cleaned.loc[df_cleaned['lng'] < -77.0]
df_max_lat = df_cleaned.loc[df_cleaned['lat'] > 69.0]
df_wrong_coordinates = pd.concat([df_max_lng, df_min_lng, df_max_lat])

print("It is interesting to note that in all the records where latitude and longitude have incorrect coordinates, planning_area and subzone have missing values, this can also be verified by checking for count of missing values")
print(df_wrong_coordinates["address"].value_counts())

# Manual corrections keyed by exact address: property type, latitude,
# longitude, subzone and planning area are all overwritten for matching rows.
corrected_columns = ['property_type', 'lat', 'lng', 'subzone', 'planning_area']
train_corrections = {
    "1 tessensohn road": ('condo', 1.3164313, 103.8575321, 'balestier', 'novena'),
    "38 lorong 32 geylang": ('condo', 1.31262, 103.88686, 'aljunied', 'geylang'),
    "5 jalan mutiara": ('condo', 1.29565, 103.82887, 'leonie hill', 'river valley'),
    "17 farrer drive": ('condo', 1.313259, 103.806622, 'holland road', 'bukit timah'),
    "15 farrer drive": ('condo', 1.313259, 103.806622, 'holland road', 'bukit timah'),
}
for address, corrected_values in train_corrections.items():
    df_cleaned.loc[df_cleaned.address == address, corrected_columns] = corrected_values

In [None]:
# Collect the test listings whose coordinates fall outside the thresholds below.
df_max_lng = df_test_cleaned.loc[df_test_cleaned['lng'] > 121.0]
df_min_lng = df_test_cleaned.loc[df_test_cleaned['lng'] < -77.0]
df_max_lat = df_test_cleaned.loc[df_test_cleaned['lat'] > 69.0]
df_wrong_coordinates = pd.concat([df_max_lng, df_min_lng, df_max_lat])

print("It is interesting to note that in all the records where latitude and longitude have incorrect coordinates, planning_area and subzone have missing values, this can also be verified by checking for count of missing values")
print(df_wrong_coordinates["address"].value_counts())

# Manual corrections keyed by exact address: property type, latitude,
# longitude, subzone and planning area are all overwritten for matching rows.
corrected_columns = ['property_type', 'lat', 'lng', 'subzone', 'planning_area']
test_corrections = {
    "38 lorong 32 geylang": ('condo', 1.31262, 103.88686, 'aljunied', 'geylang'),
    "17 farrer drive": ('condo', 1.313259, 103.806622, 'holland road', 'bukit timah'),
}
for address, corrected_values in test_corrections.items():
    df_test_cleaned.loc[df_test_cleaned.address == address, corrected_columns] = corrected_values
        

Merging With Auxiliary Data

In [None]:
# Primary-school coordinates as rows of [lat, lng].
# The two columns are selected by name, so the [lat, lng] order expected by the
# haversine distance computation does not depend on the CSV's column order or on
# which other columns the file happens to contain.
df_pri_sch = pd.read_csv('../data/auxiliary-data/sg-primary-schools.csv')[["lat", "lng"]].to_numpy(dtype=np.float64)
# Degrees -> radians for the whole array in one vectorised call.
pri_sch_coor = np.radians(df_pri_sch)
# Preview only the first rows instead of printing the full array.
pri_sch_coor[:5]

In [None]:
# Secondary-school coordinates as rows of [lat, lng].
# The two columns are selected by name, so the [lat, lng] order expected by the
# haversine distance computation does not depend on the CSV's column order or on
# which other columns the file happens to contain.
df_sec_sch = pd.read_csv('../data/auxiliary-data/sg-secondary-schools.csv')[["lat", "lng"]].to_numpy(dtype=np.float64)
# Degrees -> radians for the whole array in one vectorised call.
sec_sch_coor = np.radians(df_sec_sch)
# Preview only the first rows instead of printing the full array.
sec_sch_coor[:5]

In [None]:
# Coordinates of commercial centres and shopping malls, converted from degrees
# to radians. The descriptive columns are dropped; the remaining columns are
# presumably lat and lng, in that order - verify against the CSVs.
df_commercial_centres = (
    pd.read_csv('../data/auxiliary-data/sg-commerical-centres.csv')
    .drop(columns=["name", "type", "subzone", 'planning_area'])
    .to_numpy()
)
commercial_centres_coor = np.array([list(map(radians, point)) for point in df_commercial_centres])

df_shopping_malls = (
    pd.read_csv('../data/auxiliary-data/sg-shopping-malls.csv')
    .drop(columns=["name", "subzone", 'planning_area'])
    .to_numpy()
)
shopping_malls_coor = np.array([list(map(radians, point)) for point in df_shopping_malls])

In [None]:
# Read the MRT station file once and derive both features from it.
mrt_stations_raw = pd.read_csv('../data/auxiliary-data/sg-mrt-stations.csv')

# Number of station rows per remaining key (presumably just the subzone).
# NOTE(review): if the CSV lists an interchange once per line, such stations are
# counted more than once here - confirm whether that is intended.
df_mrt_station = mrt_stations_raw.drop(["name", "lat", "lng", 'planning_area', 'code', 'line', 'opening_year'], axis=1)
mrt_station_cleaned = df_mrt_station.value_counts().to_frame(name="mrt_station")

# Station coordinates as rows of [lat, lng] in radians; the columns are picked
# by name so the order does not depend on the CSV layout.
mrt_station_coor = np.radians(mrt_stations_raw[["lat", "lng"]].to_numpy(dtype=np.float64))

# Listing coordinates as rows of [lat, lng] in radians.
# The values are widened to float64 before the conversion so np.radians never
# computes in a narrower dtype, and the whole array is converted in one
# vectorised call instead of a Python loop over every element.
df_train_coor = np.radians(df_cleaned[["lat", "lng"]].to_numpy(dtype=np.float64))

df_test_coor = np.radians(df_test_cleaned[["lat", "lng"]].to_numpy(dtype=np.float64))

def _closest_distance(poi_coor, listing_coor, column):
    """Compute haversine distances between points of interest and listings.

    Both inputs are arrays of [lat, lng] rows in radians. Returns a tuple of
    (distance matrix with one row per point of interest and one column per
    listing, one-column DataFrame named ``column`` holding each listing's
    smallest distance). The matrix is left as returned by haversine_distances;
    the minimum is multiplied by 6371000 to get meters.
    """
    dist_matrix = sklearn.metrics.pairwise.haversine_distances(poi_coor, listing_coor)
    closest_in_meters = dist_matrix.min(axis=0) * 6371000
    return dist_matrix, pd.DataFrame(closest_in_meters, columns=[column])


# MRT stations
dist_matrix_train_mrt, closest_dist_to_mrt_train = _closest_distance(mrt_station_coor, df_train_coor, "closest_dist_to_mrt")
dist_matrix_test_mrt, closest_dist_to_mrt_test = _closest_distance(mrt_station_coor, df_test_coor, "closest_dist_to_mrt")

# Primary schools
dist_matrix_train_pri, closest_dist_to_pri_train = _closest_distance(pri_sch_coor, df_train_coor, "closest_dist_to_pri")
dist_matrix_test_pri, closest_dist_to_pri_test = _closest_distance(pri_sch_coor, df_test_coor, "closest_dist_to_pri")

# Secondary schools
dist_matrix_train_sec, closest_dist_to_sec_train = _closest_distance(sec_sch_coor, df_train_coor, "closest_dist_to_sec")
dist_matrix_test_sec, closest_dist_to_sec_test = _closest_distance(sec_sch_coor, df_test_coor, "closest_dist_to_sec")

# Commercial centres
dist_matrix_test_com, closest_dist_to_com_test = _closest_distance(commercial_centres_coor, df_test_coor, "closest_dist_to_com")
dist_matrix_train_com, closest_dist_to_com_train = _closest_distance(commercial_centres_coor, df_train_coor, "closest_dist_to_com")

# Shopping malls
dist_matrix_test_shop, closest_dist_to_shop_test = _closest_distance(shopping_malls_coor, df_test_coor, "closest_dist_to_shop")
dist_matrix_train_shop, closest_dist_to_shop_train = _closest_distance(shopping_malls_coor, df_train_coor, "closest_dist_to_shop")

def _count_within_radius(dist_matrix, name, radius_m=1000, earth_radius_m=6371000):
    """Count, for every listing, the points of interest within ``radius_m`` meters.

    ``dist_matrix`` holds haversine distances with one row per point of interest
    and one column per listing; it is multiplied by ``earth_radius_m`` to get
    meters. Returns a Series named ``name`` with one count per listing (column).
    """
    meters = pd.DataFrame(dist_matrix * earth_radius_m)
    # Distances beyond the radius become NaN, and count() skips NaN.
    return meters.where(meters <= radius_m).count().rename(name)


#during pri sch registration exercise, homeowners within 1km will be given priority
test_near_pri_sch = _count_within_radius(dist_matrix_test_pri, "close_pri_sch")
train_near_pri_sch = _count_within_radius(dist_matrix_train_pri, "close_pri_sch")

#no such priority for sec sch, put as 1km for now
#pass a different radius_m to change the distance if necessary
test_near_sec_sch = _count_within_radius(dist_matrix_test_sec, "close_sec_sch")
train_near_sec_sch = _count_within_radius(dist_matrix_train_sec, "close_sec_sch")

def _attach_auxiliary_features(listings, positional_features):
    """Return ``listings`` enriched with the subzone-level and distance features.

    The per-subzone school / MRT counts and the subzone population density are
    left-merged on the subzone. ``positional_features`` are then joined on the
    index, in the given order.
    NOTE(review): the index join assumes the merges keep one output row per
    listing in the original order - confirm the subzone keys are unique.
    Subzones without any school or station get a count of 0.
    """
    enriched = listings
    for subzone_counts in (pri_sch_cleaned, sec_sch_cleaned, mrt_station_cleaned):
        enriched = enriched.merge(subzone_counts, how='left', left_on="subzone", right_on="subzone")
    enriched = enriched.merge(df_subzone, how='left', left_on="subzone", right_on="name")
    for feature in positional_features:
        enriched = enriched.join(feature)
    return enriched.fillna({'pri_sch':0, 'sec_sch':0, 'mrt_station':0})


df_cleaned = _attach_auxiliary_features(df_cleaned, [
    closest_dist_to_mrt_train,
    closest_dist_to_pri_train,
    closest_dist_to_sec_train,
    train_near_pri_sch,
    train_near_sec_sch,
    closest_dist_to_shop_train,
    closest_dist_to_com_train,
])

df_test_cleaned = _attach_auxiliary_features(df_test_cleaned, [
    closest_dist_to_mrt_test,
    closest_dist_to_pri_test,
    closest_dist_to_sec_test,
    test_near_pri_sch,
    test_near_sec_sch,
    closest_dist_to_shop_test,
    closest_dist_to_com_test,
])

Drop Columns at the end

In [None]:
# Columns removed from both splits before saving.
columns_to_drop = [
    'title', 'address', 'property_name', 'property_details_url', 'listing_id',
    'elevation', 'total_num_units', 'floor_level', 'available_unit_types',
]
df_cleaned = df_cleaned.drop(columns=columns_to_drop)
df_test_cleaned = df_test_cleaned.drop(columns=columns_to_drop)

In [None]:
# Remaining null counts per column, then the shape, for train followed by test.
for frame in (df_cleaned, df_test_cleaned):
    print(frame.isnull().sum())
for frame in (df_cleaned, df_test_cleaned):
    print(frame.shape)

Saving the data

In [None]:
# Write the cleaned splits to disk without the index column (test first, then train).
outputs = (
    (df_test_cleaned, '../data/test_cleaned.csv'),
    (df_cleaned, '../data/train_cleaned.csv'),
)
for frame, path in outputs:
    frame.to_csv(path, index=False)

In [None]:
# Null count per column in the cleaned train set.
df_cleaned.isna().sum()

In [None]:
# A cell only displays its last expression, so the null counts are printed
# explicitly; otherwise they would be computed and silently discarded.
print(df_test_cleaned.isnull().sum())
df_test_cleaned.shape
