In [62]:
import pandas as pd
import requests
import time
import geopandas as gpd
from dateutil.relativedelta import relativedelta
import datetime as dt
import math

In [63]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 10)

In [64]:
# Read in new file
file_loc_raw = "../datasets/resale_hdb_price_raw_01sep.csv"
df_raw = pd.read_csv(file_loc_raw, parse_dates=["month", "lease_commence_date"])

df_raw["floor_area_sqft"] = df_raw["floor_area_sqm"] * 10.7639
df_raw["price_per_sqft"] = (
    df_raw["resale_price"] / df_raw["floor_area_sqft"]
)

# df_raw = df_raw.drop(["remaining_lease", "lease_commence_date"], axis=1)

In [65]:
df_raw.head(1)

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,floor_area_sqft,price_per_sqft
0,2017-01-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979-01-01,61 years 04 months,232000.0,473.6116,489.852867


In [66]:
df_raw.shape

(188515, 13)

### Begin the synchronisation process

#### 1. Open the masterlist of HDB addresses. This will be used to perform a left join to the new data

In [67]:
masterlist_file_loc = "../datasets/hdb_resale_flat_address_masterlist.csv"
# masterlist_file_loc = "./temp_masterlist.csv"
masterlist_df = pd.read_csv(
    masterlist_file_loc, index_col=0, dtype={"postal": "object"}, parse_dates=["lease_commence_date"]
)

masterlist_df.head()

Unnamed: 0,town,block,street_name,blk_no,road_name,building,postal,address,lease_commence_date,planning_area_ura,region_ura,x,y,latitude,longitude,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd,closest_pri_school,distance_to_pri_school_meters
0,ANG MO KIO,205,ANG MO KIO AVE 1,205,ANG MO KIO AVENUE 1,NIL,560205,205 ANG MO KIO AVENUE 1 SINGAPORE 560205,1977-01-01,ANG MO KIO,NORTH-EAST REGION,29142.244275,38774.891527,1.366941,103.843582,Ang Mo Kio,742.768808,MRT,Red,9199.172507,ANG MO KIO PRIMARY SCHOOL,512.545254
1,ANG MO KIO,207,ANG MO KIO AVE 1,207,ANG MO KIO AVENUE 1,ANG MO KIO 22,560207,207 ANG MO KIO AVENUE 1 ANG MO KIO 22 SINGAPOR...,1976-07-01,ANG MO KIO,NORTH-EAST REGION,29060.485578,38651.052977,1.365821,103.842848,Ang Mo Kio,874.305666,MRT,Red,9084.16982,ANG MO KIO PRIMARY SCHOOL,527.275749
2,ANG MO KIO,208,ANG MO KIO AVE 1,208,ANG MO KIO AVENUE 1,ANG MO KIO 22,560208,208 ANG MO KIO AVENUE 1 ANG MO KIO 22 SINGAPOR...,1976-07-01,ANG MO KIO,NORTH-EAST REGION,29045.715075,38609.483079,1.365445,103.842715,Ang Mo Kio,908.966103,MRT,Red,9044.41098,ANG MO KIO PRIMARY SCHOOL,549.157654
3,ANG MO KIO,215,ANG MO KIO AVE 1,215,ANG MO KIO AVENUE 1,ANG MO KIO 22,560215,215 ANG MO KIO AVENUE 1 ANG MO KIO 22 SINGAPOR...,1976-04-01,ANG MO KIO,NORTH-EAST REGION,28924.303291,38732.591142,1.366558,103.841624,Mayflower,781.530965,MRT,Brown,9180.500545,ANG MO KIO PRIMARY SCHOOL,377.594832
4,ANG MO KIO,216,ANG MO KIO AVE 1,216,ANG MO KIO AVENUE 1,ANG MO KIO 22,560216,216 ANG MO KIO AVENUE 1 ANG MO KIO 22 SINGAPOR...,1976-04-01,ANG MO KIO,NORTH-EAST REGION,28911.05224,38692.616791,1.366197,103.841505,Mayflower,800.632187,MRT,Brown,9142.4094,ANG MO KIO PRIMARY SCHOOL,403.611794


#### 1a: First, check if there are any new addresses that appear in the new dataframe

In [68]:
df_raw.merge(
    masterlist_df,
    how="left",
    on=["town", "block", "street_name"],
    suffixes=["", "_r"],
    indicator=True,
).query("_merge != 'both'")

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,floor_area_sqft,price_per_sqft,blk_no,road_name,building,postal,address,lease_commence_date_r,planning_area_ura,region_ura,x,y,latitude,longitude,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd,closest_pri_school,distance_to_pri_school_meters,_merge
188163,2024-08-01,TOA PAYOH,4 ROOM,106B,BIDADARI PK DR,10 TO 12,94.0,Model A,2020-01-01,95 years,1080000.0,1011.8066,1067.397663,,,,,,NaT,,,,,,,,,,,,,,left_only


#### 1b: If there are new addresses, use the following code to select the correct address, and insert ancillary information like floor area in square feet, price per square foot, and recalculating lease commence date

In [69]:
# First, check for any missing values in the masterlist
missing_val = masterlist_df[masterlist_df.isna().any(axis=1)]
missing_val

Unnamed: 0,town,block,street_name,blk_no,road_name,building,postal,address,lease_commence_date,planning_area_ura,region_ura,x,y,latitude,longitude,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd,closest_pri_school,distance_to_pri_school_meters


In [70]:
# Obtain the index of data points whose block + street name + postal + building information aren't present in the masterlist
for_editing_index = df_raw.merge(
    masterlist_df,
    how="left",
    on=["town", "block", "street_name"],
    suffixes=["", "_r"],
    indicator=True,
).query("_merge != 'both'").drop_duplicates(
    subset=["town", "block", "street_name"]
).index

for_editing = df_raw.loc[for_editing_index, :]
for_editing.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,floor_area_sqft,price_per_sqft
188163,2024-08-01,TOA PAYOH,4 ROOM,106B,BIDADARI PK DR,10 TO 12,94.0,Model A,2020-01-01,95 years,1080000.0,1011.8066,1067.397663


In [71]:
def obtain_lease_yearmth(row):
    today = pd.to_datetime(dt.date.today().replace(day=1))
    lease_commence = today - pd.DateOffset(
        years=row["lease_year"] + 99, months=row["lease_month"]
    )
    return lease_commence


for_editing["lease_year"] = for_editing["remaining_lease"].str.slice(0, 2).astype("int")
for_editing["lease_month"] = pd.to_numeric(
    for_editing["remaining_lease"].str.slice(9, 11), errors="coerce"
).fillna(0)

for_editing["lease_commence_date"] = for_editing.apply(obtain_lease_yearmth, axis=1)
for_editing = for_editing.drop(
    ["remaining_lease", "lease_year", "lease_month"], axis=1
)

In [72]:
for_editing.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,floor_area_sqft,price_per_sqft
188163,2024-08-01,TOA PAYOH,4 ROOM,106B,BIDADARI PK DR,10 TO 12,94.0,Model A,1830-09-01,1080000.0,1011.8066,1067.397663


In [74]:
if for_editing.shape[0] > 0:
    for i in for_editing.index:
        search_value = (
            for_editing.loc[i, "block"] + " " + for_editing.loc[i, "street_name"]
        )
        print(f"{i}: {search_value}")
        response = requests.get(
            f"https://www.onemap.gov.sg/api/common/elastic/search?searchVal={search_value}&returnGeom=Y&getAddrDetails=Y&pageNum=1"
        )
        while response.status_code != 200:
            time.sleep(1)
            print("not 200")
            response = requests.get(
                f"https://www.onemap.gov.sg/api/common/elastic/search?searchVal={search_value}&returnGeom=Y&getAddrDetails=Y&pageNum=1"
            )

        json_data = response.json()
        for j in json_data["results"]:
            print(j)
        time.sleep(0.5)
        user_input = input("Select the correct address: ")
        k = int(user_input) - 1
        print(k)

        if k == -2:
            for_editing.loc[i, "found"] = math.nan
            for_editing.loc[i, "search_val"] = math.nan
            for_editing.loc[i, "blk_no"] = math.nan
            for_editing.loc[i, "road_name"] = math.nan
            for_editing.loc[i, "building"] = math.nan
            for_editing.loc[i, "address"] = math.nan
            for_editing.loc[i, "postal"] = math.nan
            for_editing.loc[i, "x"] = math.nan
            for_editing.loc[i, "y"] = math.nan
            for_editing.loc[i, "latitude"] = math.nan
            for_editing.loc[i, "longitude"] = math.nan
        else:
            for_editing.loc[i, "found"] = 1
            for_editing.loc[i, "search_val"] = json_data["results"][k]["SEARCHVAL"]
            for_editing.loc[i, "blk_no"] = json_data["results"][k]["BLK_NO"]
            for_editing.loc[i, "road_name"] = json_data["results"][k]["ROAD_NAME"]
            for_editing.loc[i, "building"] = json_data["results"][k]["BUILDING"]
            for_editing.loc[i, "address"] = json_data["results"][k]["ADDRESS"]
            for_editing.loc[i, "postal"] = json_data["results"][k]["POSTAL"]
            for_editing.loc[i, "x"] = json_data["results"][k]["X"]
            for_editing.loc[i, "y"] = json_data["results"][k]["Y"]
            for_editing.loc[i, "latitude"] = json_data["results"][k]["LATITUDE"]
            for_editing.loc[i, "longitude"] = json_data["results"][k]["LONGITUDE"]

188163: 106B BIDADARI PK DR
{'SEARCHVAL': 'ALKAFF VISTA', 'BLK_NO': '106B', 'ROAD_NAME': 'BIDADARI PARK DRIVE', 'BUILDING': 'ALKAFF VISTA', 'ADDRESS': '106B BIDADARI PARK DRIVE ALKAFF VISTA SINGAPORE 342106', 'POSTAL': '342106', 'X': '32211.8843447613', 'Y': '35092.917645948', 'LATITUDE': '1.33364211083176', 'LONGITUDE': '103.871164760412'}
0


In [75]:
for_editing_gdf = gpd.GeoDataFrame(
    for_editing, geometry=gpd.points_from_xy(for_editing["x"], for_editing["y"])
)

for_editing_gdf

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,floor_area_sqft,price_per_sqft,found,search_val,blk_no,road_name,building,address,postal,x,y,latitude,longitude,geometry
188163,2024-08-01,TOA PAYOH,4 ROOM,106B,BIDADARI PK DR,10 TO 12,94.0,Model A,1830-09-01,1080000.0,1011.8066,1067.397663,1.0,ALKAFF VISTA,106B,BIDADARI PARK DRIVE,ALKAFF VISTA,106B BIDADARI PARK DRIVE ALKAFF VISTA SINGAPOR...,342106,32211.8843447613,35092.917645948,1.33364211083176,103.871164760412,POINT (32211.884 35092.918)


#### 1c. Appending MRT information

In [76]:
mrt_file_loc = "../datasets/mrt_lrt_stations.csv"
mrt_df = pd.read_csv(mrt_file_loc, parse_dates=["opening"], index_col=0)
mrt_gdf = gpd.GeoDataFrame(mrt_df, geometry=gpd.points_from_xy(mrt_df["x"], mrt_df["y"]))

mrt_gdf.head(1)

Unnamed: 0,code,station_name,line,color,opening,type,blk_no,road_name,building,address,postal,x,y,latitude,longitude,planning_area_ura,region_ura,geometry
0,NS1,Jurong East,North-South Line,Red,1990-03-10,MRT,10,JURONG EAST STREET 12,JURONG EAST MRT STATION (EW24 / NS1),10 JURONG EAST STREET 12 JURONG EAST MRT STATI...,609690,17869.057052,35038.96887,1.333153,103.742286,JURONG EAST,WEST REGION,POINT (17869.057 35038.969)


In [77]:
def find_closest_station(row, mrt_gdf):
    """Remove comments for the following 3 lines if you want the closest MRT at the time of transaction"""
    # mrt_gdf["OPENING_DATE"] = mrt_gdf["OPENING"].dt.to_period("M").dt.to_timestamp()
    # mrt_stations_filtered = mrt_gdf[mrt_gdf["OPENING"] < row["month"]]
    # distances = mrt_stations_filtered.distance(row["geometry"])

    """Remove comments for this line if you want the closest MRT station today"""
    distances = mrt_gdf.distance(row["geometry"])

    closest_station_index = distances.idxmin()
    shortest_distance = distances.min()

    closest_station_name = mrt_gdf.loc[closest_station_index, "station_name"]
    closest_transport_type = mrt_gdf.loc[closest_station_index, "type"]
    closest_mrt_color = mrt_gdf.loc[closest_station_index, "color"]

    # distance to cbd
    raffles_place_index = mrt_gdf.query("station_name == 'Raffles Place'").index[0]
    distance_to_cbd = mrt_gdf.loc[raffles_place_index, "geometry"].distance(
        row["geometry"]
    )

    return pd.Series(
        [
            closest_station_name,
            shortest_distance,
            closest_transport_type,
            closest_mrt_color,
            distance_to_cbd,
        ],
        index=[
            "closest_mrt_station",
            "distance_to_mrt_meters",
            "transport_type",
            "line_color",
            "distance_to_cbd",
        ],
    )


if for_editing_gdf.shape[0] > 0:
    for_editing_gdf[
        [
            "closest_mrt_station",
            "distance_to_mrt_meters",
            "transport_type",
            "line_color",
            "distance_to_cbd",
        ]
    ] = for_editing_gdf.apply(find_closest_station, mrt_gdf=mrt_gdf, axis=1)

In [78]:
for_editing_gdf

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,floor_area_sqft,price_per_sqft,found,search_val,blk_no,road_name,building,address,postal,x,y,latitude,longitude,geometry,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd
188163,2024-08-01,TOA PAYOH,4 ROOM,106B,BIDADARI PK DR,10 TO 12,94.0,Model A,1830-09-01,1080000.0,1011.8066,1067.397663,1.0,ALKAFF VISTA,106B,BIDADARI PARK DRIVE,ALKAFF VISTA,106B BIDADARI PARK DRIVE ALKAFF VISTA SINGAPOR...,342106,32211.8843447613,35092.917645948,1.33364211083176,103.871164760412,POINT (32211.884 35092.918),Potong Pasir,343.057807,MRT,Purple,5898.043365


#### 1d. Appending Closest School information

In [79]:
school_file_loc = "../datasets/schools_for_plotly.csv"
school_df = pd.read_csv(school_file_loc, index_col=0, dtype={"postal":"string"})
school_df["postal"] = school_df["postal"].astype("str").apply(lambda x: f"{x:0>6}")

# Convert the df into a gdf
school_gdf = gpd.GeoDataFrame(school_df, geometry=gpd.points_from_xy(school_df["x"], school_df["y"]))

In [80]:
def find_closest_school(row, school_gdf, level="PRIMARY"):
    """Remove comments for the following 3 lines if you want the closest MRT at the time of transaction"""
    # school_gdf["OPENING_DATE"] = school_gdf["OPENING"].dt.to_period("M").dt.to_timestamp()
    # mrt_stations_filtered = school_gdf[school_gdf["OPENING"] < row["month"]]
    # distances = mrt_stations_filtered.distance(row["geometry"])

    school_gdf_filtered = school_gdf.query("mainlevel_code == @level")
    """Remove comments for this line if you want the closest MRT station today"""
    distances = school_gdf_filtered.distance(row["geometry"])

    closest_school_index = distances.idxmin()
    shortest_distance = distances.min()

    closest_school = school_gdf_filtered.loc[closest_school_index, "school_name"]

    return pd.Series(
        [
            closest_school,
            shortest_distance,
        ],
        index=[
            "closest_pri_school",
            "distance_to_pri_school_meters",
        ],
    )

for_editing_gdf[["closest_pri_school", "distance_to_pri_school_meters"]] = (
    for_editing_gdf.apply(find_closest_school, school_gdf=school_gdf, axis=1)
)

In [81]:
for_editing_gdf

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,floor_area_sqft,price_per_sqft,found,search_val,blk_no,road_name,building,address,postal,x,y,latitude,longitude,geometry,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd,closest_pri_school,distance_to_pri_school_meters
188163,2024-08-01,TOA PAYOH,4 ROOM,106B,BIDADARI PK DR,10 TO 12,94.0,Model A,1830-09-01,1080000.0,1011.8066,1067.397663,1.0,ALKAFF VISTA,106B,BIDADARI PARK DRIVE,ALKAFF VISTA,106B BIDADARI PARK DRIVE ALKAFF VISTA SINGAPOR...,342106,32211.8843447613,35092.917645948,1.33364211083176,103.871164760412,POINT (32211.884 35092.918),Potong Pasir,343.057807,MRT,Purple,5898.043365,CEDAR PRIMARY SCHOOL,540.624881


#### 1e. Appending URA planning area and regions

In [82]:
geo_file_loc = "../datasets/sg_map/mp2014/MP14_PLNG_AREA_NO_SEA_PL.shp"
planning_areas_gdf = gpd.read_file(geo_file_loc)

planning_areas_gdf.head()

Unnamed: 0,OBJECTID,PLN_AREA_N,PLN_AREA_C,CA_IND,REGION_N,REGION_C,INC_CRC,FMEL_UPD_D,X_ADDR,Y_ADDR,SHAPE_Leng,SHAPE_Area,geometry
0,1,ANG MO KIO,AM,N,NORTH-EAST REGION,NER,E5CBDDE0C2113055,2016-05-11,28976.8763,40229.1238,17494.24019,13941380.0,"POLYGON ((30658.500 42047.527, 30679.195 42020..."
1,2,BEDOK,BD,N,EAST REGION,ER,1719251260799DF6,2016-05-11,38582.665,34032.0961,21872.798962,21733190.0,"POLYGON ((38974.269 36138.243, 39371.471 35747..."
2,3,BISHAN,BS,N,CENTRAL REGION,CR,BA616285F402846F,2016-05-11,28789.763,37450.8865,13517.121556,7618921.0,"POLYGON ((29772.191 38311.805, 29784.826 38304..."
3,4,BOON LAY,BL,N,WEST REGION,WR,A3DC87118B43CDED,2016-05-11,13410.3824,33008.9884,18528.467448,8279408.0,"POLYGON ((12861.383 32207.492, 12860.555 32208..."
4,5,BUKIT BATOK,BK,N,WEST REGION,WR,FB44C870B04B7F57,2016-05-11,19255.415,37527.6527,15234.223423,11133260.0,"POLYGON ((20294.455 39114.528, 20334.318 39054..."


In [83]:
for_editing_gdf = for_editing_gdf.sjoin(planning_areas_gdf[["PLN_AREA_N", "REGION_N", "geometry"]], how='left')

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: PROJCS["SVY21",GEOGCS["SVY21[WGS84]",DATUM["WGS_19 ...

  return geopandas.sjoin(left_df=self, right_df=df, *args, **kwargs)  # noqa: B026


In [84]:
for_editing_gdf = for_editing_gdf.rename(columns={"PLN_AREA_N":"planning_area_ura", "REGION_N":"region_ura"})

In [85]:
for_editing = for_editing_gdf[
    [
        "town",
        "block",
        "road_name",
        "blk_no",
        "street_name",
        "building",
        "postal",
        "address",
        "lease_commence_date",
        "planning_area_ura",
        "region_ura",
        "x",
        "y",
        "latitude",
        "longitude",
        "closest_mrt_station",
        "distance_to_mrt_meters",
        "transport_type",
        "line_color",
        "distance_to_cbd",
        "closest_pri_school",
        "distance_to_pri_school_meters",
    ]
]

In [86]:
for_editing

Unnamed: 0,town,block,road_name,blk_no,street_name,building,postal,address,lease_commence_date,planning_area_ura,region_ura,x,y,latitude,longitude,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd,closest_pri_school,distance_to_pri_school_meters
188163,TOA PAYOH,106B,BIDADARI PARK DRIVE,106B,BIDADARI PK DR,ALKAFF VISTA,342106,106B BIDADARI PARK DRIVE ALKAFF VISTA SINGAPOR...,1830-09-01,TOA PAYOH,CENTRAL REGION,32211.8843447613,35092.917645948,1.33364211083176,103.871164760412,Potong Pasir,343.057807,MRT,Purple,5898.043365,CEDAR PRIMARY SCHOOL,540.624881


In [87]:
new_masterlist_df = pd.concat([masterlist_df, for_editing], axis=0)
new_masterlist_df = new_masterlist_df.sort_values(by=["town", "street_name", "block"]).reset_index(drop=True)

In [88]:
new_masterlist_df.shape

(9618, 22)

In [90]:
# new_masterlist_df.to_csv(masterlist_file_loc)

### 2. Append additional information to the raw dataset

#### 2a: Reopen Masterlist

In [91]:
masterlist_file_loc = "../datasets/hdb_resale_flat_address_masterlist.csv"
# masterlist_file_loc = "./temp_masterlist.csv"
masterlist_df = pd.read_csv(
    masterlist_file_loc,
    index_col=0,
    dtype={"postal": "object"},
    parse_dates=["lease_commence_date"],
)

masterlist_df.head(1)

Unnamed: 0,town,block,street_name,blk_no,road_name,building,postal,address,lease_commence_date,planning_area_ura,region_ura,x,y,latitude,longitude,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd,closest_pri_school,distance_to_pri_school_meters
0,ANG MO KIO,205,ANG MO KIO AVE 1,205,ANG MO KIO AVENUE 1,NIL,560205,205 ANG MO KIO AVENUE 1 SINGAPORE 560205,1977-01-01,ANG MO KIO,NORTH-EAST REGION,29142.244275,38774.891527,1.366941,103.843582,Ang Mo Kio,742.768808,MRT,Red,9199.172507,ANG MO KIO PRIMARY SCHOOL,512.545254


In [92]:
masterlist_df.shape

(9618, 22)

#### 2b: Perform a left merge between the new HDB dataset and the address masterlist

In [93]:
df_new = df_raw.merge(
    masterlist_df,
    how="left",
    on=["town", "block", "street_name"],
    suffixes=["", "_r"],
    indicator=True,
)

In [94]:
df_new.query("_merge != 'both'")

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,floor_area_sqft,price_per_sqft,blk_no,road_name,building,postal,address,lease_commence_date_r,planning_area_ura,region_ura,x,y,latitude,longitude,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd,closest_pri_school,distance_to_pri_school_meters,_merge


In [95]:
df_new = df_new.drop("_merge", axis=1)

In [96]:
file_destination = file_loc_raw.replace("raw", "coords_mrt")
file_destination
df_new.to_csv(file_destination)

#### The following is used to call the Onemap API and populate coordinate details (deprecated, use the above masterlist instead)

In [None]:
# # Obtain geospatial coordinates with the Onemap API

# def get_coordinates_for_each_row(row):
#     search_value = row["block"] + " " + row["street_name"]
#     # print(search_value)

#     response = requests.get(f"https://www.onemap.gov.sg/api/common/elastic/search?searchVal={search_value}&returnGeom=Y&getAddrDetails=Y&pageNum=1")
#     while response.status_code != 200:
#         time.sleep(1)
#         print("not 200")
#         response = requests.get(f"https://www.onemap.gov.sg/api/common/elastic/search?searchVal={search_value}&returnGeom=Y&getAddrDetails=Y&pageNum=1")

#     data = response.json()

#     row_data = data['results'][0]
#     searchval = row_data["SEARCHVAL"]
#     address = row_data["ADDRESS"]
#     postal = row_data["POSTAL"]
#     x = row_data["X"]
#     y = row_data["Y"]
#     latitude = row_data["LATITUDE"]
#     longitude = row_data["LONGITUDE"]

#     return pd.Series([address, postal, x, y, latitude, longitude], 
#                      index=["SEARCHVAL", "ADDRESS", "POSTAL", "X", "Y", "LATITUDE", "LONGITUDE"])

In [None]:
# Appending additional columns to the dataframe


# NOTE: SHOULDNT APPEND
# for_addition["year"] = for_addition["month"].dt.year
# for_addition["lease_years"] = for_addition["remaining_lease"].str.split(" ").apply(lambda x: int(x[0]))
# bins = pd.IntervalIndex.from_tuples(
#     [(40, 50), (50, 60), (60, 70), (70, 80), (80, 90), (90, 100)]
# )
# for_addition["lease_cat"] = pd.cut(for_addition["lease_years"], bins)
# for_addition = for_addition.drop("index", axis=1)

#### Checking datasets

In [97]:
file_loc_raw = "../datasets/resale_hdb_price_coords_mrt_01sep.csv"
df_for_kaggle = pd.read_csv(
    file_loc_raw,
    parse_dates=["month", "lease_commence_date"],
    index_col=0,
    dtype={"x": "float64", "y": "float64", "postal": "object"},
    low_memory=False,
)

In [98]:
df_for_kaggle[df_for_kaggle.isna().any(axis=1)]
# df_for_kaggle.shape

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,floor_area_sqft,price_per_sqft,blk_no,road_name,building,postal,address,lease_commence_date_r,planning_area_ura,region_ura,x,y,latitude,longitude,closest_mrt_station,distance_to_mrt_meters,transport_type,line_color,distance_to_cbd,closest_pri_school,distance_to_pri_school_meters


In [99]:
df_for_kaggle = df_for_kaggle.drop(["town", "flat_type", "block", "street_name"], axis=1)

In [100]:
today_date = dt.date.today().strftime("%Y-%d%b").lower()
kaggle_file_loc_updated_name = f"../datasets/resale_hdb_price_for_kaggle_{today_date}.csv"
df_for_kaggle.to_csv(kaggle_file_loc_updated_name)