In [2]:
import pandas as pd
import requests
import time
import geopandas as gpd
from dateutil.relativedelta import relativedelta
import datetime as dt
import math

In [5]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 10)

In [6]:
# Read in new file
file_loc_raw = "../datasets/resale_hdb_price_raw_31oct.csv"
df_raw = pd.read_csv(file_loc_raw, parse_dates=["month", "lease_commence_date"])

df_raw["floor_area_sqft"] = df_raw["floor_area_sqm"] * 10.7639
df_raw["price_per_sqft"] = (
    df_raw["resale_price"] / df_raw["floor_area_sqft"]
)

# df_raw = df_raw.drop(["remaining_lease", "lease_commence_date"], axis=1)

In [7]:
df_raw.head(1)

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,floor_area_sqft,price_per_sqft
0,2017-01-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979-01-01,61 years 04 months,232000.0,473.6116,489.852867


In [8]:
df_raw.shape

(192863, 13)

### Begin the synchronisation process

#### 1. Open the masterlist of HDB addresses. This will be used to perform a left join to the new data

In [9]:
masterlist_file_loc = "../datasets/hdb_resale_flat_address_masterlist.csv"
# masterlist_file_loc = "./temp_masterlist.csv"
masterlist_df = pd.read_csv(
    masterlist_file_loc, index_col=0, dtype={"postal": "object"}, parse_dates=["lease_commence_date"]
)

masterlist_df.head()

Unnamed: 0,town,block,street_name,blk_no,road_name,building,postal,address,lease_commence_date,planning_area_ura,region_ura,x,y,latitude,longitude,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd,closest_pri_school,distance_to_pri_school_meters
0,ANG MO KIO,205,ANG MO KIO AVE 1,205,ANG MO KIO AVENUE 1,NIL,560205,205 ANG MO KIO AVENUE 1 SINGAPORE 560205,1977-01-01,ANG MO KIO,NORTH-EAST REGION,29142.244275,38774.891527,1.366941,103.843582,Ang Mo Kio,742.768808,MRT,Red,9199.172507,ANG MO KIO PRIMARY SCHOOL,512.545254
1,ANG MO KIO,207,ANG MO KIO AVE 1,207,ANG MO KIO AVENUE 1,ANG MO KIO 22,560207,207 ANG MO KIO AVENUE 1 ANG MO KIO 22 SINGAPOR...,1976-07-01,ANG MO KIO,NORTH-EAST REGION,29060.485578,38651.052977,1.365821,103.842848,Ang Mo Kio,874.305666,MRT,Red,9084.16982,ANG MO KIO PRIMARY SCHOOL,527.275749
2,ANG MO KIO,208,ANG MO KIO AVE 1,208,ANG MO KIO AVENUE 1,ANG MO KIO 22,560208,208 ANG MO KIO AVENUE 1 ANG MO KIO 22 SINGAPOR...,1976-07-01,ANG MO KIO,NORTH-EAST REGION,29045.715075,38609.483079,1.365445,103.842715,Ang Mo Kio,908.966103,MRT,Red,9044.41098,ANG MO KIO PRIMARY SCHOOL,549.157654
3,ANG MO KIO,215,ANG MO KIO AVE 1,215,ANG MO KIO AVENUE 1,ANG MO KIO 22,560215,215 ANG MO KIO AVENUE 1 ANG MO KIO 22 SINGAPOR...,1976-04-01,ANG MO KIO,NORTH-EAST REGION,28924.303291,38732.591142,1.366558,103.841624,Mayflower,781.530965,MRT,Brown,9180.500545,ANG MO KIO PRIMARY SCHOOL,377.594832
4,ANG MO KIO,216,ANG MO KIO AVE 1,216,ANG MO KIO AVENUE 1,ANG MO KIO 22,560216,216 ANG MO KIO AVENUE 1 ANG MO KIO 22 SINGAPOR...,1976-04-01,ANG MO KIO,NORTH-EAST REGION,28911.05224,38692.616791,1.366197,103.841505,Mayflower,800.632187,MRT,Brown,9142.4094,ANG MO KIO PRIMARY SCHOOL,403.611794


#### 1a: First, check if there are any new addresses that appear in the new dataframe

In [11]:
df_raw.merge(
    masterlist_df,
    how="left",
    on=["town", "block", "street_name"],
    suffixes=["", "_r"],
    indicator=True,
).query("_merge != 'both'")

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,floor_area_sqft,price_per_sqft,blk_no,road_name,building,postal,address,lease_commence_date_r,planning_area_ura,region_ura,x,y,latitude,longitude,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd,closest_pri_school,distance_to_pri_school_meters,_merge
192114,2024-10-01,SEMBAWANG,3 ROOM,131A,CANBERRA CRES,10 TO 12,68.0,Model A,2020-01-01,95 years,563000.0,731.9452,769.183267,,,,,,NaT,,,,,,,,,,,,,,left_only
192116,2024-10-01,SEMBAWANG,3 ROOM,131A,CANBERRA CRES,07 TO 09,68.0,Model A,2020-01-01,95 years,520000.0,731.9452,710.435699,,,,,,NaT,,,,,,,,,,,,,,left_only
192132,2024-10-01,SEMBAWANG,4 ROOM,131A,CANBERRA CRES,07 TO 09,92.0,Model A,2020-01-01,95 years,765000.0,990.2788,772.509722,,,,,,NaT,,,,,,,,,,,,,,left_only
192135,2024-10-01,SEMBAWANG,4 ROOM,132A,CANBERRA CRES,04 TO 06,92.0,Model A,2020-01-01,94 years 11 months,748000.0,990.2788,755.342839,,,,,,NaT,,,,,,,,,,,,,,left_only
192140,2024-10-01,SEMBAWANG,4 ROOM,101A,CANBERRA ST,01 TO 03,93.0,Model A,2020-01-01,95 years 04 months,580000.0,1001.0427,579.395864,,,,,,NaT,,,,,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192421,2024-10-01,TAMPINES,4 ROOM,610A,TAMPINES NTH DR 1,13 TO 15,93.0,Model A,2020-01-01,95 years 01 month,780000.0,1001.0427,779.187541,,,,,,NaT,,,,,,,,,,,,,,left_only
192422,2024-10-01,TAMPINES,4 ROOM,610C,TAMPINES NTH DR 1,10 TO 12,93.0,Model A,2020-01-01,95 years 01 month,777888.0,1001.0427,777.077741,,,,,,NaT,,,,,,,,,,,,,,left_only
192472,2024-10-01,TAMPINES,5 ROOM,612B,TAMPINES NTH DR 1,07 TO 09,113.0,Improved,2020-01-01,95 years 01 month,832000.0,1216.3207,684.030125,,,,,,NaT,,,,,,,,,,,,,,left_only
192473,2024-10-01,TAMPINES,5 ROOM,610A,TAMPINES NTH DR 1,07 TO 09,113.0,Improved,2020-01-01,95 years 01 month,930000.0,1216.3207,764.600981,,,,,,NaT,,,,,,,,,,,,,,left_only


#### 1b: If there are new addresses, use the following code to select the correct address, and insert ancillary information like floor area in square feet, price per square foot, and recalculating lease commence date

In [12]:
# First, check for any missing values in the masterlist
missing_val = masterlist_df[masterlist_df.isna().any(axis=1)]
missing_val

Unnamed: 0,town,block,street_name,blk_no,road_name,building,postal,address,lease_commence_date,planning_area_ura,region_ura,x,y,latitude,longitude,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd,closest_pri_school,distance_to_pri_school_meters


In [14]:
# Obtain the index of data points whose block + street name + postal + building information aren't present in the masterlist
for_editing_index = df_raw.merge(
    masterlist_df,
    how="left",
    on=["town", "block", "street_name"],
    suffixes=["", "_r"],
    indicator=True,
).query("_merge != 'both'").drop_duplicates(
    subset=["town", "block", "street_name"]
).index

for_editing = df_raw.loc[for_editing_index, :]
for_editing.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,floor_area_sqft,price_per_sqft
192114,2024-10-01,SEMBAWANG,3 ROOM,131A,CANBERRA CRES,10 TO 12,68.0,Model A,2020-01-01,95 years,563000.0,731.9452,769.183267
192135,2024-10-01,SEMBAWANG,4 ROOM,132A,CANBERRA CRES,04 TO 06,92.0,Model A,2020-01-01,94 years 11 months,748000.0,990.2788,755.342839
192140,2024-10-01,SEMBAWANG,4 ROOM,101A,CANBERRA ST,01 TO 03,93.0,Model A,2020-01-01,95 years 04 months,580000.0,1001.0427,579.395864
192181,2024-10-01,SENGKANG,2 ROOM,353A,ANCHORVALE LANE,13 TO 15,47.0,2-room,2021-01-01,95 years 05 months,375000.0,505.9033,741.248377
192281,2024-10-01,SENGKANG,4 ROOM,458A,SENGKANG WEST RD,01 TO 03,93.0,Model A,2020-01-01,95 years 01 month,630000.0,1001.0427,629.343783


In [15]:
def obtain_lease_yearmth(row):
    today = pd.to_datetime(dt.date.today().replace(day=1))
    lease_commence = today - pd.DateOffset(
        years=row["lease_year"] + 99, months=row["lease_month"]
    )
    return lease_commence


for_editing["lease_year"] = for_editing["remaining_lease"].str.slice(0, 2).astype("int")
for_editing["lease_month"] = pd.to_numeric(
    for_editing["remaining_lease"].str.slice(9, 11), errors="coerce"
).fillna(0)

for_editing["lease_commence_date"] = for_editing.apply(obtain_lease_yearmth, axis=1)
for_editing = for_editing.drop(
    ["remaining_lease", "lease_year", "lease_month"], axis=1
)

In [16]:
for_editing.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,floor_area_sqft,price_per_sqft
192114,2024-10-01,SEMBAWANG,3 ROOM,131A,CANBERRA CRES,10 TO 12,68.0,Model A,1830-10-01,563000.0,731.9452,769.183267
192135,2024-10-01,SEMBAWANG,4 ROOM,132A,CANBERRA CRES,04 TO 06,92.0,Model A,1830-11-01,748000.0,990.2788,755.342839
192140,2024-10-01,SEMBAWANG,4 ROOM,101A,CANBERRA ST,01 TO 03,93.0,Model A,1830-06-01,580000.0,1001.0427,579.395864
192181,2024-10-01,SENGKANG,2 ROOM,353A,ANCHORVALE LANE,13 TO 15,47.0,2-room,1830-05-01,375000.0,505.9033,741.248377
192281,2024-10-01,SENGKANG,4 ROOM,458A,SENGKANG WEST RD,01 TO 03,93.0,Model A,1830-09-01,630000.0,1001.0427,629.343783


In [17]:
if for_editing.shape[0] > 0:
    for i in for_editing.index:
        search_value = (
            for_editing.loc[i, "block"] + " " + for_editing.loc[i, "street_name"]
        )
        print(f"{i}: {search_value}")
        response = requests.get(
            f"https://www.onemap.gov.sg/api/common/elastic/search?searchVal={search_value}&returnGeom=Y&getAddrDetails=Y&pageNum=1"
        )
        while response.status_code != 200:
            time.sleep(1)
            print("not 200")
            response = requests.get(
                f"https://www.onemap.gov.sg/api/common/elastic/search?searchVal={search_value}&returnGeom=Y&getAddrDetails=Y&pageNum=1"
            )

        json_data = response.json()
        for j in json_data["results"]:
            print(j)
        time.sleep(0.5)
        user_input = input("Select the correct address: ")
        k = int(user_input) - 1
        print(k)

        if k == -2:
            for_editing.loc[i, "found"] = math.nan
            for_editing.loc[i, "search_val"] = math.nan
            for_editing.loc[i, "blk_no"] = math.nan
            for_editing.loc[i, "road_name"] = math.nan
            for_editing.loc[i, "building"] = math.nan
            for_editing.loc[i, "address"] = math.nan
            for_editing.loc[i, "postal"] = math.nan
            for_editing.loc[i, "x"] = math.nan
            for_editing.loc[i, "y"] = math.nan
            for_editing.loc[i, "latitude"] = math.nan
            for_editing.loc[i, "longitude"] = math.nan
        else:
            for_editing.loc[i, "found"] = 1
            for_editing.loc[i, "search_val"] = json_data["results"][k]["SEARCHVAL"]
            for_editing.loc[i, "blk_no"] = json_data["results"][k]["BLK_NO"]
            for_editing.loc[i, "road_name"] = json_data["results"][k]["ROAD_NAME"]
            for_editing.loc[i, "building"] = json_data["results"][k]["BUILDING"]
            for_editing.loc[i, "address"] = json_data["results"][k]["ADDRESS"]
            for_editing.loc[i, "postal"] = json_data["results"][k]["POSTAL"]
            for_editing.loc[i, "x"] = json_data["results"][k]["X"]
            for_editing.loc[i, "y"] = json_data["results"][k]["Y"]
            for_editing.loc[i, "latitude"] = json_data["results"][k]["LATITUDE"]
            for_editing.loc[i, "longitude"] = json_data["results"][k]["LONGITUDE"]

192114: 131A CANBERRA CRES
{'SEARCHVAL': 'EASTLINK I @ CANBERRA', 'BLK_NO': '131A', 'ROAD_NAME': 'CANBERRA CRESCENT', 'BUILDING': 'EASTLINK I @ CANBERRA', 'ADDRESS': '131A CANBERRA CRESCENT EASTLINK I @ CANBERRA SINGAPORE 751131', 'POSTAL': '751131', 'X': '27745.2948914105', 'Y': '47265.4007779453', 'LATITUDE': '1.44372597469524', 'LONGITUDE': '103.831029801721'}
0
192135: 132A CANBERRA CRES
{'SEARCHVAL': 'EASTLINK I @ CANBERRA', 'BLK_NO': '132A', 'ROAD_NAME': 'CANBERRA CRESCENT', 'BUILDING': 'EASTLINK I @ CANBERRA', 'ADDRESS': '132A CANBERRA CRESCENT EASTLINK I @ CANBERRA SINGAPORE 751132', 'POSTAL': '751132', 'X': '27774.7603498073', 'Y': '47235.3685212843', 'LATITUDE': '1.44345437411826', 'LONGITUDE': '103.831294578169'}
0
192140: 101A CANBERRA ST
{'SEARCHVAL': 'EASTCREEK @ CANBERRA', 'BLK_NO': '101A', 'ROAD_NAME': 'CANBERRA STREET', 'BUILDING': 'EASTCREEK @ CANBERRA', 'ADDRESS': '101A CANBERRA STREET EASTCREEK @ CANBERRA SINGAPORE 751101', 'POSTAL': '751101', 'X': '27681.2301008264

In [18]:
for_editing_gdf = gpd.GeoDataFrame(
    for_editing, geometry=gpd.points_from_xy(for_editing["x"], for_editing["y"])
)

for_editing_gdf

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,floor_area_sqft,price_per_sqft,found,search_val,blk_no,road_name,building,address,postal,x,y,latitude,longitude,geometry
192114,2024-10-01,SEMBAWANG,3 ROOM,131A,CANBERRA CRES,10 TO 12,68.0,Model A,1830-10-01,563000.0,731.9452,769.183267,1.0,EASTLINK I @ CANBERRA,131A,CANBERRA CRESCENT,EASTLINK I @ CANBERRA,131A CANBERRA CRESCENT EASTLINK I @ CANBERRA S...,751131,27745.2948914105,47265.4007779453,1.44372597469524,103.831029801721,POINT (27745.295 47265.401)
192135,2024-10-01,SEMBAWANG,4 ROOM,132A,CANBERRA CRES,04 TO 06,92.0,Model A,1830-11-01,748000.0,990.2788,755.342839,1.0,EASTLINK I @ CANBERRA,132A,CANBERRA CRESCENT,EASTLINK I @ CANBERRA,132A CANBERRA CRESCENT EASTLINK I @ CANBERRA S...,751132,27774.7603498073,47235.3685212843,1.44345437411826,103.831294578169,POINT (27774.760 47235.369)
192140,2024-10-01,SEMBAWANG,4 ROOM,101A,CANBERRA ST,01 TO 03,93.0,Model A,1830-06-01,580000.0,1001.0427,579.395864,1.0,EASTCREEK @ CANBERRA,101A,CANBERRA STREET,EASTCREEK @ CANBERRA,101A CANBERRA STREET EASTCREEK @ CANBERRA SING...,751101,27681.2301008264,48151.4330438618,1.45173892803325,103.83045410628,POINT (27681.230 48151.433)
192181,2024-10-01,SENGKANG,2 ROOM,353A,ANCHORVALE LANE,13 TO 15,47.0,2-room,1830-05-01,375000.0,505.9033,741.248377,1.0,ANCHORVALE PLAINS,353A,ANCHORVALE LANE,ANCHORVALE PLAINS,353A ANCHORVALE LANE ANCHORVALE PLAINS SINGAPO...,541353,33820.002827079,41745.6216366235,1.39380650147896,103.885615916234,POINT (33820.003 41745.622)
192281,2024-10-01,SENGKANG,4 ROOM,458A,SENGKANG WEST RD,01 TO 03,93.0,Model A,1830-09-01,630000.0,1001.0427,629.343783,1.0,FERNVALE WOODS,458A,SENGKANG WEST ROAD,FERNVALE WOODS,458A SENGKANG WEST ROAD FERNVALE WOODS SINGAPO...,791458,32034.5499790786,41652.915675642,1.39296840561775,103.869572197299,POINT (32034.550 41652.916)
192282,2024-10-01,SENGKANG,4 ROOM,458C,SENGKANG WEST RD,13 TO 15,93.0,Model A,1830-10-01,652000.0,1001.0427,651.320868,1.0,FERNVALE WOODS,458C,SENGKANG WEST ROAD,FERNVALE WOODS,458C SENGKANG WEST ROAD FERNVALE WOODS SINGAPO...,793458,32179.552054286,41640.6374075115,1.39285734486634,103.870875153546,POINT (32179.552 41640.637)
192420,2024-10-01,TAMPINES,4 ROOM,610C,TAMPINES NTH DR 1,07 TO 09,93.0,Model A,1830-09-01,758000.0,1001.0427,757.210457,1.0,TAMPINES GREENWEAVE,610C,TAMPINES NORTH DRIVE 1,TAMPINES GREENWEAVE,610C TAMPINES NORTH DRIVE 1 TAMPINES GREENWEAV...,523610,39378.0788980145,38496.1410125233,1.36441776002022,103.935558340158,POINT (39378.079 38496.141)
192421,2024-10-01,TAMPINES,4 ROOM,610A,TAMPINES NTH DR 1,13 TO 15,93.0,Model A,1830-09-01,780000.0,1001.0427,779.187541,1.0,TAMPINES GREENWEAVE,610A,TAMPINES NORTH DRIVE 1,TAMPINES GREENWEAVE,610A TAMPINES NORTH DRIVE 1 TAMPINES GREENWEAV...,521610,39529.1191938305,38505.0529965758,1.36449829827687,103.936915541696,POINT (39529.119 38505.053)
192472,2024-10-01,TAMPINES,5 ROOM,612B,TAMPINES NTH DR 1,07 TO 09,113.0,Improved,1830-09-01,832000.0,1216.3207,684.030125,1.0,TAMPINES GREENVIEW,612B,TAMPINES NORTH DRIVE 1,TAMPINES GREENVIEW,612B TAMPINES NORTH DRIVE 1 TAMPINES GREENVIEW...,522612,39403.8349242033,38600.3785002083,1.36536043524759,103.935789815182,POINT (39403.835 38600.379)
192565,2024-10-01,TOA PAYOH,5 ROOM,105B,BIDADARI PK DR,07 TO 09,114.0,Improved,1830-11-01,1220000.0,1227.0846,994.226478,1.0,ALKAFF VISTA,105B,BIDADARI PARK DRIVE,ALKAFF VISTA,105B BIDADARI PARK DRIVE ALKAFF VISTA SINGAPOR...,342105,32136.8359385133,35074.3319537621,1.33347403879152,103.870490405284,POINT (32136.836 35074.332)


#### 1c. Appending MRT information

In [19]:
mrt_file_loc = "../datasets/mrt_lrt_stations.csv"
mrt_df = pd.read_csv(mrt_file_loc, parse_dates=["opening"], index_col=0)
mrt_gdf = gpd.GeoDataFrame(mrt_df, geometry=gpd.points_from_xy(mrt_df["x"], mrt_df["y"]))

mrt_gdf.head(1)

Unnamed: 0,code,station_name,line,color,opening,type,blk_no,road_name,building,address,postal,x,y,latitude,longitude,planning_area_ura,region_ura,geometry
0,NS1,Jurong East,North-South Line,Red,1990-03-10,MRT,10,JURONG EAST STREET 12,JURONG EAST MRT STATION (EW24 / NS1),10 JURONG EAST STREET 12 JURONG EAST MRT STATI...,609690,17869.057052,35038.96887,1.333153,103.742286,JURONG EAST,WEST REGION,POINT (17869.057 35038.969)


In [20]:
def find_closest_station(row, mrt_gdf):
    """Remove comments for the following 3 lines if you want the closest MRT at the time of transaction"""
    # mrt_gdf["OPENING_DATE"] = mrt_gdf["OPENING"].dt.to_period("M").dt.to_timestamp()
    # mrt_stations_filtered = mrt_gdf[mrt_gdf["OPENING"] < row["month"]]
    # distances = mrt_stations_filtered.distance(row["geometry"])

    """Remove comments for this line if you want the closest MRT station today"""
    distances = mrt_gdf.distance(row["geometry"])

    closest_station_index = distances.idxmin()
    shortest_distance = distances.min()

    closest_station_name = mrt_gdf.loc[closest_station_index, "station_name"]
    closest_transport_type = mrt_gdf.loc[closest_station_index, "type"]
    closest_mrt_color = mrt_gdf.loc[closest_station_index, "color"]

    # distance to cbd
    raffles_place_index = mrt_gdf.query("station_name == 'Raffles Place'").index[0]
    distance_to_cbd = mrt_gdf.loc[raffles_place_index, "geometry"].distance(
        row["geometry"]
    )

    return pd.Series(
        [
            closest_station_name,
            shortest_distance,
            closest_transport_type,
            closest_mrt_color,
            distance_to_cbd,
        ],
        index=[
            "closest_mrt_station",
            "distance_to_mrt_meters",
            "transport_type",
            "line_color",
            "distance_to_cbd",
        ],
    )


if for_editing_gdf.shape[0] > 0:
    for_editing_gdf[
        [
            "closest_mrt_station",
            "distance_to_mrt_meters",
            "transport_type",
            "line_color",
            "distance_to_cbd",
        ]
    ] = for_editing_gdf.apply(find_closest_station, mrt_gdf=mrt_gdf, axis=1)

In [21]:
for_editing_gdf

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,floor_area_sqft,price_per_sqft,found,search_val,blk_no,road_name,building,address,postal,x,y,latitude,longitude,geometry,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd
192114,2024-10-01,SEMBAWANG,3 ROOM,131A,CANBERRA CRES,10 TO 12,68.0,Model A,1830-10-01,563000.0,731.9452,769.183267,1.0,EASTLINK I @ CANBERRA,131A,CANBERRA CRESCENT,EASTLINK I @ CANBERRA,131A CANBERRA CRESCENT EASTLINK I @ CANBERRA S...,751131,27745.2948914105,47265.4007779453,1.44372597469524,103.831029801721,POINT (27745.295 47265.401),Canberra,164.225182,MRT,Red,17793.678203
192135,2024-10-01,SEMBAWANG,4 ROOM,132A,CANBERRA CRES,04 TO 06,92.0,Model A,1830-11-01,748000.0,990.2788,755.342839,1.0,EASTLINK I @ CANBERRA,132A,CANBERRA CRESCENT,EASTLINK I @ CANBERRA,132A CANBERRA CRESCENT EASTLINK I @ CANBERRA S...,751132,27774.7603498073,47235.3685212843,1.44345437411826,103.831294578169,POINT (27774.760 47235.369),Canberra,182.020357,MRT,Red,17760.14488
192140,2024-10-01,SEMBAWANG,4 ROOM,101A,CANBERRA ST,01 TO 03,93.0,Model A,1830-06-01,580000.0,1001.0427,579.395864,1.0,EASTCREEK @ CANBERRA,101A,CANBERRA STREET,EASTCREEK @ CANBERRA,101A CANBERRA STREET EASTCREEK @ CANBERRA SING...,751101,27681.2301008264,48151.4330438618,1.45173892803325,103.83045410628,POINT (27681.230 48151.433),Canberra,961.476445,MRT,Red,18680.698943
192181,2024-10-01,SENGKANG,2 ROOM,353A,ANCHORVALE LANE,13 TO 15,47.0,2-room,1830-05-01,375000.0,505.9033,741.248377,1.0,ANCHORVALE PLAINS,353A,ANCHORVALE LANE,ANCHORVALE PLAINS,353A ANCHORVALE LANE ANCHORVALE PLAINS SINGAPO...,541353,33820.002827079,41745.6216366235,1.39380650147896,103.885615916234,POINT (33820.003 41745.622),Tongkang,493.658458,LRT,Grey,12709.636254
192281,2024-10-01,SENGKANG,4 ROOM,458A,SENGKANG WEST RD,01 TO 03,93.0,Model A,1830-09-01,630000.0,1001.0427,629.343783,1.0,FERNVALE WOODS,458A,SENGKANG WEST ROAD,FERNVALE WOODS,458A SENGKANG WEST ROAD FERNVALE WOODS SINGAPO...,791458,32034.5499790786,41652.915675642,1.39296840561775,103.869572197299,POINT (32034.550 41652.916),Fernvale,759.171851,LRT,Grey,12202.878124
192282,2024-10-01,SENGKANG,4 ROOM,458C,SENGKANG WEST RD,13 TO 15,93.0,Model A,1830-10-01,652000.0,1001.0427,651.320868,1.0,FERNVALE WOODS,458C,SENGKANG WEST ROAD,FERNVALE WOODS,458C SENGKANG WEST ROAD FERNVALE WOODS SINGAPO...,793458,32179.552054286,41640.6374075115,1.39285734486634,103.870875153546,POINT (32179.552 41640.637),Fernvale,614.141227,LRT,Grey,12215.577392
192420,2024-10-01,TAMPINES,4 ROOM,610C,TAMPINES NTH DR 1,07 TO 09,93.0,Model A,1830-09-01,758000.0,1001.0427,757.210457,1.0,TAMPINES GREENWEAVE,610C,TAMPINES NORTH DRIVE 1,TAMPINES GREENWEAVE,610C TAMPINES NORTH DRIVE 1 TAMPINES GREENWEAV...,523610,39378.0788980145,38496.1410125233,1.36441776002022,103.935558340158,POINT (39378.079 38496.141),Tampines,1318.248401,MRT,Blue,12900.28084
192421,2024-10-01,TAMPINES,4 ROOM,610A,TAMPINES NTH DR 1,13 TO 15,93.0,Model A,1830-09-01,780000.0,1001.0427,779.187541,1.0,TAMPINES GREENWEAVE,610A,TAMPINES NORTH DRIVE 1,TAMPINES GREENWEAVE,610A TAMPINES NORTH DRIVE 1 TAMPINES GREENWEAV...,521610,39529.1191938305,38505.0529965758,1.36449829827687,103.936915541696,POINT (39529.119 38505.053),Tampines,1236.290953,MRT,Blue,13016.356268
192472,2024-10-01,TAMPINES,5 ROOM,612B,TAMPINES NTH DR 1,07 TO 09,113.0,Improved,1830-09-01,832000.0,1216.3207,684.030125,1.0,TAMPINES GREENVIEW,612B,TAMPINES NORTH DRIVE 1,TAMPINES GREENVIEW,612B TAMPINES NORTH DRIVE 1 TAMPINES GREENVIEW...,522612,39403.8349242033,38600.3785002083,1.36536043524759,103.935789815182,POINT (39403.835 38600.379),Tampines,1385.71733,MRT,Blue,12990.836063
192565,2024-10-01,TOA PAYOH,5 ROOM,105B,BIDADARI PK DR,07 TO 09,114.0,Improved,1830-11-01,1220000.0,1227.0846,994.226478,1.0,ALKAFF VISTA,105B,BIDADARI PARK DRIVE,ALKAFF VISTA,105B BIDADARI PARK DRIVE ALKAFF VISTA SINGAPOR...,342105,32136.8359385133,35074.3319537621,1.33347403879152,103.870490405284,POINT (32136.836 35074.332),Potong Pasir,281.311189,MRT,Purple,5853.225667


#### 1d. Appending Closest School information

In [22]:
school_file_loc = "../datasets/schools_for_plotly.csv"
school_df = pd.read_csv(school_file_loc, index_col=0, dtype={"postal":"string"})
school_df["postal"] = school_df["postal"].astype("str").apply(lambda x: f"{x:0>6}")

# Convert the df into a gdf
school_gdf = gpd.GeoDataFrame(school_df, geometry=gpd.points_from_xy(school_df["x"], school_df["y"]))

In [23]:
def find_closest_school(row, school_gdf, level="PRIMARY"):
    """Remove comments for the following 3 lines if you want the closest MRT at the time of transaction"""
    # school_gdf["OPENING_DATE"] = school_gdf["OPENING"].dt.to_period("M").dt.to_timestamp()
    # mrt_stations_filtered = school_gdf[school_gdf["OPENING"] < row["month"]]
    # distances = mrt_stations_filtered.distance(row["geometry"])

    school_gdf_filtered = school_gdf.query("mainlevel_code == @level")
    """Remove comments for this line if you want the closest MRT station today"""
    distances = school_gdf_filtered.distance(row["geometry"])

    closest_school_index = distances.idxmin()
    shortest_distance = distances.min()

    closest_school = school_gdf_filtered.loc[closest_school_index, "school_name"]

    return pd.Series(
        [
            closest_school,
            shortest_distance,
        ],
        index=[
            "closest_pri_school",
            "distance_to_pri_school_meters",
        ],
    )

for_editing_gdf[["closest_pri_school", "distance_to_pri_school_meters"]] = (
    for_editing_gdf.apply(find_closest_school, school_gdf=school_gdf, axis=1)
)

In [24]:
for_editing_gdf

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,floor_area_sqft,price_per_sqft,found,search_val,blk_no,road_name,building,address,postal,x,y,latitude,longitude,geometry,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd,closest_pri_school,distance_to_pri_school_meters
192114,2024-10-01,SEMBAWANG,3 ROOM,131A,CANBERRA CRES,10 TO 12,68.0,Model A,1830-10-01,563000.0,731.9452,769.183267,1.0,EASTLINK I @ CANBERRA,131A,CANBERRA CRESCENT,EASTLINK I @ CANBERRA,131A CANBERRA CRESCENT EASTLINK I @ CANBERRA S...,751131,27745.2948914105,47265.4007779453,1.44372597469524,103.831029801721,POINT (27745.295 47265.401),Canberra,164.225182,MRT,Red,17793.678203,CHONGFU SCHOOL,1093.759996
192135,2024-10-01,SEMBAWANG,4 ROOM,132A,CANBERRA CRES,04 TO 06,92.0,Model A,1830-11-01,748000.0,990.2788,755.342839,1.0,EASTLINK I @ CANBERRA,132A,CANBERRA CRESCENT,EASTLINK I @ CANBERRA,132A CANBERRA CRESCENT EASTLINK I @ CANBERRA S...,751132,27774.7603498073,47235.3685212843,1.44345437411826,103.831294578169,POINT (27774.760 47235.369),Canberra,182.020357,MRT,Red,17760.14488,CHONGFU SCHOOL,1052.797207
192140,2024-10-01,SEMBAWANG,4 ROOM,101A,CANBERRA ST,01 TO 03,93.0,Model A,1830-06-01,580000.0,1001.0427,579.395864,1.0,EASTCREEK @ CANBERRA,101A,CANBERRA STREET,EASTCREEK @ CANBERRA,101A CANBERRA STREET EASTCREEK @ CANBERRA SING...,751101,27681.2301008264,48151.4330438618,1.45173892803325,103.83045410628,POINT (27681.230 48151.433),Canberra,961.476445,MRT,Red,18680.698943,WELLINGTON PRIMARY SCHOOL,907.229079
192181,2024-10-01,SENGKANG,2 ROOM,353A,ANCHORVALE LANE,13 TO 15,47.0,2-room,1830-05-01,375000.0,505.9033,741.248377,1.0,ANCHORVALE PLAINS,353A,ANCHORVALE LANE,ANCHORVALE PLAINS,353A ANCHORVALE LANE ANCHORVALE PLAINS SINGAPO...,541353,33820.002827079,41745.6216366235,1.39380650147896,103.885615916234,POINT (33820.003 41745.622),Tongkang,493.658458,LRT,Grey,12709.636254,ANCHOR GREEN PRIMARY SCHOOL,417.28651
192281,2024-10-01,SENGKANG,4 ROOM,458A,SENGKANG WEST RD,01 TO 03,93.0,Model A,1830-09-01,630000.0,1001.0427,629.343783,1.0,FERNVALE WOODS,458A,SENGKANG WEST ROAD,FERNVALE WOODS,458A SENGKANG WEST ROAD FERNVALE WOODS SINGAPO...,791458,32034.5499790786,41652.915675642,1.39296840561775,103.869572197299,POINT (32034.550 41652.916),Fernvale,759.171851,LRT,Grey,12202.878124,FERNVALE PRIMARY SCHOOL,617.278667
192282,2024-10-01,SENGKANG,4 ROOM,458C,SENGKANG WEST RD,13 TO 15,93.0,Model A,1830-10-01,652000.0,1001.0427,651.320868,1.0,FERNVALE WOODS,458C,SENGKANG WEST ROAD,FERNVALE WOODS,458C SENGKANG WEST ROAD FERNVALE WOODS SINGAPO...,793458,32179.552054286,41640.6374075115,1.39285734486634,103.870875153546,POINT (32179.552 41640.637),Fernvale,614.141227,LRT,Grey,12215.577392,SENGKANG GREEN PRIMARY SCHOOL,483.266179
192420,2024-10-01,TAMPINES,4 ROOM,610C,TAMPINES NTH DR 1,07 TO 09,93.0,Model A,1830-09-01,758000.0,1001.0427,757.210457,1.0,TAMPINES GREENWEAVE,610C,TAMPINES NORTH DRIVE 1,TAMPINES GREENWEAVE,610C TAMPINES NORTH DRIVE 1 TAMPINES GREENWEAV...,523610,39378.0788980145,38496.1410125233,1.36441776002022,103.935558340158,POINT (39378.079 38496.141),Tampines,1318.248401,MRT,Blue,12900.28084,ANGSANA PRIMARY SCHOOL,386.995502
192421,2024-10-01,TAMPINES,4 ROOM,610A,TAMPINES NTH DR 1,13 TO 15,93.0,Model A,1830-09-01,780000.0,1001.0427,779.187541,1.0,TAMPINES GREENWEAVE,610A,TAMPINES NORTH DRIVE 1,TAMPINES GREENWEAVE,610A TAMPINES NORTH DRIVE 1 TAMPINES GREENWEAV...,521610,39529.1191938305,38505.0529965758,1.36449829827687,103.936915541696,POINT (39529.119 38505.053),Tampines,1236.290953,MRT,Blue,13016.356268,ANGSANA PRIMARY SCHOOL,246.052131
192472,2024-10-01,TAMPINES,5 ROOM,612B,TAMPINES NTH DR 1,07 TO 09,113.0,Improved,1830-09-01,832000.0,1216.3207,684.030125,1.0,TAMPINES GREENVIEW,612B,TAMPINES NORTH DRIVE 1,TAMPINES GREENVIEW,612B TAMPINES NORTH DRIVE 1 TAMPINES GREENVIEW...,522612,39403.8349242033,38600.3785002083,1.36536043524759,103.935789815182,POINT (39403.835 38600.379),Tampines,1385.71733,MRT,Blue,12990.836063,ANGSANA PRIMARY SCHOOL,400.767462
192565,2024-10-01,TOA PAYOH,5 ROOM,105B,BIDADARI PK DR,07 TO 09,114.0,Improved,1830-11-01,1220000.0,1227.0846,994.226478,1.0,ALKAFF VISTA,105B,BIDADARI PARK DRIVE,ALKAFF VISTA,105B BIDADARI PARK DRIVE ALKAFF VISTA SINGAPOR...,342105,32136.8359385133,35074.3319537621,1.33347403879152,103.870490405284,POINT (32136.836 35074.332),Potong Pasir,281.311189,MRT,Purple,5853.225667,CEDAR PRIMARY SCHOOL,617.00209


#### 1e. Appending URA planning area and regions

In [25]:
geo_file_loc = "../datasets/sg_map/mp2014/MP14_PLNG_AREA_NO_SEA_PL.shp"
planning_areas_gdf = gpd.read_file(geo_file_loc)

planning_areas_gdf.head()

Unnamed: 0,OBJECTID,PLN_AREA_N,PLN_AREA_C,CA_IND,REGION_N,REGION_C,INC_CRC,FMEL_UPD_D,X_ADDR,Y_ADDR,SHAPE_Leng,SHAPE_Area,geometry
0,1,ANG MO KIO,AM,N,NORTH-EAST REGION,NER,E5CBDDE0C2113055,2016-05-11,28976.8763,40229.1238,17494.24019,13941380.0,"POLYGON ((30658.500 42047.527, 30679.195 42020..."
1,2,BEDOK,BD,N,EAST REGION,ER,1719251260799DF6,2016-05-11,38582.665,34032.0961,21872.798962,21733190.0,"POLYGON ((38974.269 36138.243, 39371.471 35747..."
2,3,BISHAN,BS,N,CENTRAL REGION,CR,BA616285F402846F,2016-05-11,28789.763,37450.8865,13517.121556,7618921.0,"POLYGON ((29772.191 38311.805, 29784.826 38304..."
3,4,BOON LAY,BL,N,WEST REGION,WR,A3DC87118B43CDED,2016-05-11,13410.3824,33008.9884,18528.467448,8279408.0,"POLYGON ((12861.383 32207.492, 12860.555 32208..."
4,5,BUKIT BATOK,BK,N,WEST REGION,WR,FB44C870B04B7F57,2016-05-11,19255.415,37527.6527,15234.223423,11133260.0,"POLYGON ((20294.455 39114.528, 20334.318 39054..."


In [26]:
for_editing_gdf = for_editing_gdf.sjoin(planning_areas_gdf[["PLN_AREA_N", "REGION_N", "geometry"]], how='left')

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: PROJCS["SVY21",GEOGCS["SVY21[WGS84]",DATUM["WGS_19 ...

  return geopandas.sjoin(left_df=self, right_df=df, *args, **kwargs)  # noqa: B026


In [27]:
for_editing_gdf = for_editing_gdf.rename(columns={"PLN_AREA_N":"planning_area_ura", "REGION_N":"region_ura"})

In [28]:
for_editing = for_editing_gdf[
    [
        "town",
        "block",
        "road_name",
        "blk_no",
        "street_name",
        "building",
        "postal",
        "address",
        "lease_commence_date",
        "planning_area_ura",
        "region_ura",
        "x",
        "y",
        "latitude",
        "longitude",
        "closest_mrt_station",
        "distance_to_mrt_meters",
        "transport_type",
        "line_color",
        "distance_to_cbd",
        "closest_pri_school",
        "distance_to_pri_school_meters",
    ]
]

In [29]:
for_editing

Unnamed: 0,town,block,road_name,blk_no,street_name,building,postal,address,lease_commence_date,planning_area_ura,region_ura,x,y,latitude,longitude,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd,closest_pri_school,distance_to_pri_school_meters
192114,SEMBAWANG,131A,CANBERRA CRESCENT,131A,CANBERRA CRES,EASTLINK I @ CANBERRA,751131,131A CANBERRA CRESCENT EASTLINK I @ CANBERRA S...,1830-10-01,SEMBAWANG,NORTH REGION,27745.2948914105,47265.4007779453,1.44372597469524,103.831029801721,Canberra,164.225182,MRT,Red,17793.678203,CHONGFU SCHOOL,1093.759996
192135,SEMBAWANG,132A,CANBERRA CRESCENT,132A,CANBERRA CRES,EASTLINK I @ CANBERRA,751132,132A CANBERRA CRESCENT EASTLINK I @ CANBERRA S...,1830-11-01,SEMBAWANG,NORTH REGION,27774.7603498073,47235.3685212843,1.44345437411826,103.831294578169,Canberra,182.020357,MRT,Red,17760.14488,CHONGFU SCHOOL,1052.797207
192140,SEMBAWANG,101A,CANBERRA STREET,101A,CANBERRA ST,EASTCREEK @ CANBERRA,751101,101A CANBERRA STREET EASTCREEK @ CANBERRA SING...,1830-06-01,SEMBAWANG,NORTH REGION,27681.2301008264,48151.4330438618,1.45173892803325,103.83045410628,Canberra,961.476445,MRT,Red,18680.698943,WELLINGTON PRIMARY SCHOOL,907.229079
192181,SENGKANG,353A,ANCHORVALE LANE,353A,ANCHORVALE LANE,ANCHORVALE PLAINS,541353,353A ANCHORVALE LANE ANCHORVALE PLAINS SINGAPO...,1830-05-01,SENGKANG,NORTH-EAST REGION,33820.002827079,41745.6216366235,1.39380650147896,103.885615916234,Tongkang,493.658458,LRT,Grey,12709.636254,ANCHOR GREEN PRIMARY SCHOOL,417.28651
192281,SENGKANG,458A,SENGKANG WEST ROAD,458A,SENGKANG WEST RD,FERNVALE WOODS,791458,458A SENGKANG WEST ROAD FERNVALE WOODS SINGAPO...,1830-09-01,SENGKANG,NORTH-EAST REGION,32034.5499790786,41652.915675642,1.39296840561775,103.869572197299,Fernvale,759.171851,LRT,Grey,12202.878124,FERNVALE PRIMARY SCHOOL,617.278667
192282,SENGKANG,458C,SENGKANG WEST ROAD,458C,SENGKANG WEST RD,FERNVALE WOODS,793458,458C SENGKANG WEST ROAD FERNVALE WOODS SINGAPO...,1830-10-01,SENGKANG,NORTH-EAST REGION,32179.552054286,41640.6374075115,1.39285734486634,103.870875153546,Fernvale,614.141227,LRT,Grey,12215.577392,SENGKANG GREEN PRIMARY SCHOOL,483.266179
192420,TAMPINES,610C,TAMPINES NORTH DRIVE 1,610C,TAMPINES NTH DR 1,TAMPINES GREENWEAVE,523610,610C TAMPINES NORTH DRIVE 1 TAMPINES GREENWEAV...,1830-09-01,TAMPINES,EAST REGION,39378.0788980145,38496.1410125233,1.36441776002022,103.935558340158,Tampines,1318.248401,MRT,Blue,12900.28084,ANGSANA PRIMARY SCHOOL,386.995502
192421,TAMPINES,610A,TAMPINES NORTH DRIVE 1,610A,TAMPINES NTH DR 1,TAMPINES GREENWEAVE,521610,610A TAMPINES NORTH DRIVE 1 TAMPINES GREENWEAV...,1830-09-01,TAMPINES,EAST REGION,39529.1191938305,38505.0529965758,1.36449829827687,103.936915541696,Tampines,1236.290953,MRT,Blue,13016.356268,ANGSANA PRIMARY SCHOOL,246.052131
192472,TAMPINES,612B,TAMPINES NORTH DRIVE 1,612B,TAMPINES NTH DR 1,TAMPINES GREENVIEW,522612,612B TAMPINES NORTH DRIVE 1 TAMPINES GREENVIEW...,1830-09-01,TAMPINES,EAST REGION,39403.8349242033,38600.3785002083,1.36536043524759,103.935789815182,Tampines,1385.71733,MRT,Blue,12990.836063,ANGSANA PRIMARY SCHOOL,400.767462
192565,TOA PAYOH,105B,BIDADARI PARK DRIVE,105B,BIDADARI PK DR,ALKAFF VISTA,342105,105B BIDADARI PARK DRIVE ALKAFF VISTA SINGAPOR...,1830-11-01,TOA PAYOH,CENTRAL REGION,32136.8359385133,35074.3319537621,1.33347403879152,103.870490405284,Potong Pasir,281.311189,MRT,Purple,5853.225667,CEDAR PRIMARY SCHOOL,617.00209


In [30]:
new_masterlist_df = pd.concat([masterlist_df, for_editing], axis=0)
new_masterlist_df = new_masterlist_df.sort_values(by=["town", "street_name", "block"]).reset_index(drop=True)

In [31]:
new_masterlist_df.shape

(9636, 22)

In [33]:
# new_masterlist_df.to_csv(masterlist_file_loc)

### 2. Append additional information to the raw dataset

#### 2a: Reopen Masterlist

In [34]:
masterlist_file_loc = "../datasets/hdb_resale_flat_address_masterlist.csv"
# masterlist_file_loc = "./temp_masterlist.csv"
masterlist_df = pd.read_csv(
    masterlist_file_loc,
    index_col=0,
    dtype={"postal": "object"},
    parse_dates=["lease_commence_date"],
)

masterlist_df.head(1)

Unnamed: 0,town,block,street_name,blk_no,road_name,building,postal,address,lease_commence_date,planning_area_ura,region_ura,x,y,latitude,longitude,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd,closest_pri_school,distance_to_pri_school_meters
0,ANG MO KIO,205,ANG MO KIO AVE 1,205,ANG MO KIO AVENUE 1,NIL,560205,205 ANG MO KIO AVENUE 1 SINGAPORE 560205,1977-01-01,ANG MO KIO,NORTH-EAST REGION,29142.244275,38774.891527,1.366941,103.843582,Ang Mo Kio,742.768808,MRT,Red,9199.172507,ANG MO KIO PRIMARY SCHOOL,512.545254


In [35]:
masterlist_df.shape

(9636, 22)

#### 2b: Perform a left merge between the new HDB dataset and the address masterlist

In [36]:
df_new = df_raw.merge(
    masterlist_df,
    how="left",
    on=["town", "block", "street_name"],
    suffixes=["", "_r"],
    indicator=True,
)

In [37]:
df_new.query("_merge != 'both'")

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,floor_area_sqft,price_per_sqft,blk_no,road_name,building,postal,address,lease_commence_date_r,planning_area_ura,region_ura,x,y,latitude,longitude,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd,closest_pri_school,distance_to_pri_school_meters,_merge


In [38]:
df_new = df_new.drop("_merge", axis=1)

In [39]:
file_destination = file_loc_raw.replace("raw", "coords_mrt")
file_destination
df_new.to_csv(file_destination)

#### The following is used to call the Onemap API and populate coordinate details (deprecated, use the above masterlist instead)

In [None]:
# # Obtain geospatial coordinates with the Onemap API

# def get_coordinates_for_each_row(row):
#     search_value = row["block"] + " " + row["street_name"]
#     # print(search_value)

#     response = requests.get(f"https://www.onemap.gov.sg/api/common/elastic/search?searchVal={search_value}&returnGeom=Y&getAddrDetails=Y&pageNum=1")
#     while response.status_code != 200:
#         time.sleep(1)
#         print("not 200")
#         response = requests.get(f"https://www.onemap.gov.sg/api/common/elastic/search?searchVal={search_value}&returnGeom=Y&getAddrDetails=Y&pageNum=1")

#     data = response.json()

#     row_data = data['results'][0]
#     searchval = row_data["SEARCHVAL"]
#     address = row_data["ADDRESS"]
#     postal = row_data["POSTAL"]
#     x = row_data["X"]
#     y = row_data["Y"]
#     latitude = row_data["LATITUDE"]
#     longitude = row_data["LONGITUDE"]

#     return pd.Series([address, postal, x, y, latitude, longitude], 
#                      index=["SEARCHVAL", "ADDRESS", "POSTAL", "X", "Y", "LATITUDE", "LONGITUDE"])

In [None]:
# Appending additional columns to the dataframe


# NOTE: SHOULDNT APPEND
# for_addition["year"] = for_addition["month"].dt.year
# for_addition["lease_years"] = for_addition["remaining_lease"].str.split(" ").apply(lambda x: int(x[0]))
# bins = pd.IntervalIndex.from_tuples(
#     [(40, 50), (50, 60), (60, 70), (70, 80), (80, 90), (90, 100)]
# )
# for_addition["lease_cat"] = pd.cut(for_addition["lease_years"], bins)
# for_addition = for_addition.drop("index", axis=1)

#### Checking datasets

In [14]:
file_loc_cleaned = "../datasets/resale_hdb_price_coords_mrt_31oct.csv"
df_for_kaggle = pd.read_csv(
    file_loc_cleaned,
    parse_dates=["month", "lease_commence_date"],
    index_col=0,
    dtype={"x": "float64", "y": "float64", "postal": "object"},
    low_memory=False,
)

In [15]:
df_for_kaggle[df_for_kaggle.isna().any(axis=1)]
# df_for_kaggle.shape

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,...,y,latitude,longitude,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd,closest_pri_school,distance_to_pri_school_meters


In [16]:
# df_for_kaggle = df_for_kaggle.drop([ "block", "street_name"], axis=1)
df_for_kaggle = df_for_kaggle.drop(["town", "block", "street_name"], axis=1)

In [17]:
today_date = dt.date.today().strftime("%Y-%d%b").lower()
kaggle_file_loc_updated_name = f"../datasets/resale_hdb_price_for_kaggle_{today_date}.csv"
df_for_kaggle.to_csv(kaggle_file_loc_updated_name)

In [18]:
df_for_kaggle

Unnamed: 0,month,flat_type,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,floor_area_sqft,price_per_sqft,...,y,latitude,longitude,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd,closest_pri_school,distance_to_pri_school_meters
0,2017-01-01,2 ROOM,10 TO 12,44.0,Improved,1979-01-01,61 years 04 months,232000.0,473.6116,489.852867,...,38229.067463,1.362005,103.853880,Ang Mo Kio,999.941618,MRT,Red,8615.656983,TOWNSVILLE PRIMARY SCHOOL,218.125254
1,2017-01-01,3 ROOM,01 TO 03,67.0,New Generation,1978-01-01,60 years 07 months,250000.0,721.1813,346.653470,...,39220.009892,1.370966,103.838202,Mayflower,189.980291,MRT,Brown,9715.131951,ANG MO KIO PRIMARY SCHOOL,241.572335
2,2017-01-01,3 ROOM,01 TO 03,67.0,New Generation,1980-01-01,62 years 05 months,262000.0,721.1813,363.292836,...,40297.283149,1.380709,103.835368,Lentor,532.154773,MRT,Brown,10828.819556,ANDERSON PRIMARY SCHOOL,777.155378
3,2017-01-01,3 ROOM,04 TO 06,68.0,New Generation,1980-01-01,62 years 01 month,265000.0,731.9452,362.048962,...,38693.098657,1.366201,103.857201,Ang Mo Kio,945.371842,MRT,Red,9097.929095,TECK GHEE PRIMARY SCHOOL,698.165530
4,2017-01-01,3 ROOM,01 TO 03,67.0,New Generation,1980-01-01,62 years 05 months,265000.0,721.1813,367.452678,...,40334.052030,1.381041,103.835132,Lentor,498.418205,MRT,Brown,10869.453109,ANDERSON PRIMARY SCHOOL,782.553222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192858,2024-10-01,EXECUTIVE,01 TO 03,145.0,Maisonette,1988-01-01,62 years 09 months,888888.0,1560.7655,569.520533,...,45399.561977,1.426852,103.845966,Yishun,1252.997547,MRT,Red,15793.799535,HUAMIN PRIMARY SCHOOL,192.526956
192859,2024-10-01,EXECUTIVE,01 TO 03,146.0,Maisonette,1988-01-01,62 years 10 months,848000.0,1571.5294,539.601741,...,45356.500261,1.426463,103.844909,Yishun,1150.375908,MRT,Red,15755.777678,HUAMIN PRIMARY SCHOOL,93.390877
192860,2024-10-01,EXECUTIVE,10 TO 12,152.0,Maisonette,1985-01-01,59 years 11 months,867000.0,1636.1128,529.914563,...,46623.232572,1.437918,103.836995,Yishun,962.960922,MRT,Red,17081.671936,CHONGFU SCHOOL,262.924687
192861,2024-10-01,EXECUTIVE,01 TO 03,169.0,Apartment,1992-01-01,66 years 09 months,860000.0,1819.0991,472.761489,...,46423.735262,1.436114,103.837605,Yishun,792.388123,MRT,Red,16876.728294,XISHAN PRIMARY SCHOOL,292.658611
