In [473]:
import pandas as pd
import requests
import time
import geopandas as gpd
from dateutil.relativedelta import relativedelta
import datetime as dt
import math
import shutil
import os

In [474]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 10)

In [517]:
####### TO BE CHANGED EVERY MONTH WHEN THIS SCRIPT NEEDS TO BE RUN ##########
HDB_RESALE_TRANSACTIONS_RAW = "../datasets/resale_hdb_price_raw_31jan25.csv" # change this for every sync
#############################################################################

# The rest should stay the same unless there are changes to the files
HDB_RESALE_TRANSACTIONS_CLEANED = "../datasets/hdb_resale_flat_transactions.csv"
HDB_MASTERFILE_SOURCE = "../datasets/hdb_resale_flat_address_masterlist.csv"
URA_PLANNING_AREA_SOURCE  = "../datasets/sg_map/mp2014/MP14_PLNG_AREA_NO_SEA_PL.shp"
MRT_SOURCE = "../datasets/mrt_lrt_stations_2025-01-14.csv"
SCHOOL_SOURCE = "../datasets/schools_for_plotly.csv"

In [478]:
# Read in new file
df_raw_orig = pd.read_csv(HDB_RESALE_TRANSACTIONS_RAW, parse_dates=["month", "lease_commence_date"])
df_raw = df_raw_orig.copy()

# converting floor area from per-square-meter to per-square-feet
df_raw["floor_area_sqft"] = df_raw["floor_area_sqm"] * 10.7639
df_raw["price_per_sqft"] = (df_raw["resale_price"] / df_raw["floor_area_sqft"])

# df_raw = df_raw.drop(["remaining_lease", "lease_commence_date"], axis=1)

In [479]:
df_raw.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,floor_area_sqft,price_per_sqft
0,2017-01-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979-01-01,61 years 04 months,232000.0,473.6116,489.852867
1,2017-01-01,ANG MO KIO,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978-01-01,60 years 07 months,250000.0,721.1813,346.65347
2,2017-01-01,ANG MO KIO,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980-01-01,62 years 05 months,262000.0,721.1813,363.292836
3,2017-01-01,ANG MO KIO,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1980-01-01,62 years 01 month,265000.0,731.9452,362.048962
4,2017-01-01,ANG MO KIO,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980-01-01,62 years 05 months,265000.0,721.1813,367.452678


Data should ideally be retrieved from data.sg at the end of each month, e.g. 31 Jan, 28/29 Feb etc.. If data is retrieved at dates other than the end-of-month, records that lack the entire month of data are discarded with the following line of code.

In [480]:
# df_raw = df_raw.query("month != '2025-01-01'")

In [481]:
df_raw.shape

(199327, 13)

### 1. Update Masterlist with new HDB addresses not yet found in masterlist

#### 1a. Open the masterlist of HDB addresses. This will be used to perform a left join to the new data

In [482]:
masterlist_df_orig = pd.read_csv(HDB_MASTERFILE_SOURCE, index_col=0, dtype={"postal": "object"}, parse_dates=["lease_commence_date", "date_added"])
masterlist_df = masterlist_df_orig.copy()

In [483]:
# masterlist_df.head()

#### 1b: First, check if there are any new addresses that appear in the new dataframe

In [484]:
df_raw.merge(
    masterlist_df,
    how="left",
    on=["town", "block", "street_name"],
    suffixes=["", "_r"],
    indicator=True,
).query("_merge != 'both'")

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,floor_area_sqft,price_per_sqft,blk_no,road_name,building,postal,address,lease_commence_date_r,planning_area_ura,region_ura,x,y,latitude,longitude,date_added,_merge


#### 1c: If there are new addresses, use the following code to select the correct address, and insert ancillary information like floor area in square feet, price per square foot, and recalculating lease commence date

In [485]:
# First, check for any missing values in the masterlist
missing_val = masterlist_df[masterlist_df.isna().any(axis=1)]
missing_val

Unnamed: 0,town,block,street_name,blk_no,road_name,building,postal,address,lease_commence_date,planning_area_ura,region_ura,x,y,latitude,longitude,date_added


In [486]:
# Obtain the index of data points whose block + street name + postal + building information aren't present in the masterlist
for_editing_index = df_raw.merge(
    masterlist_df,
    how="left",
    on=["town", "block", "street_name"],
    suffixes=["", "_r"],
    indicator=True,
).query("_merge != 'both'").drop_duplicates(
    subset=["town", "block", "street_name"]
).index

print("Records with address not yet in our masterlist: ")
print(for_editing_index)

for_editing = df_raw.loc[for_editing_index, :]
for_editing.head()

Records with address not yet in our masterlist: 
Index([], dtype='int64')


Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,floor_area_sqft,price_per_sqft


In [487]:
def obtain_lease_yearmth(row):
    """
    This function calculates a more accurate lease commence date using the 'month' and 'lease_commence_date' columns.
    """
    lease_year = int(row["remaining_lease"][0:2])
    lease_month_str = row["remaining_lease"][9:11]
    
    if lease_month_str == '':
        lease_month = 0
    else:
        lease_month = int(lease_month_str)
    
    lease_commence = row['month'] + pd.DateOffset(years=lease_year - 99, months=lease_month)
    
    return lease_commence

if for_editing.shape[0] > 0:
    for_editing["lease_commence_date"] = for_editing.apply(obtain_lease_yearmth, axis=1)
else:
    print("No new records for update")


No new records for update


In [488]:
if for_editing.shape[0] > 0:
    for i in for_editing.index:
        search_value = (
            for_editing.loc[i, "block"] + " " + for_editing.loc[i, "street_name"]
        )
        print(f"{i}: {search_value}")
        response = requests.get(
            f"https://www.onemap.gov.sg/api/common/elastic/search?searchVal={search_value}&returnGeom=Y&getAddrDetails=Y&pageNum=1"
        )
        while response.status_code != 200:
            time.sleep(1)
            print("not 200")
            response = requests.get(
                f"https://www.onemap.gov.sg/api/common/elastic/search?searchVal={search_value}&returnGeom=Y&getAddrDetails=Y&pageNum=1"
            )

        json_data = response.json()
        for j in json_data["results"]:
            print(j)
        time.sleep(0.5)
        user_input = input("Select the correct address: ")
        k = int(user_input) - 1
        print(k)

        if k == -2:
            for_editing.loc[i, "found"] = math.nan
            for_editing.loc[i, "search_val"] = math.nan
            for_editing.loc[i, "blk_no"] = math.nan
            for_editing.loc[i, "road_name"] = math.nan
            for_editing.loc[i, "building"] = math.nan
            for_editing.loc[i, "address"] = math.nan
            for_editing.loc[i, "postal"] = math.nan
            for_editing.loc[i, "x"] = math.nan
            for_editing.loc[i, "y"] = math.nan
            for_editing.loc[i, "latitude"] = math.nan
            for_editing.loc[i, "longitude"] = math.nan
        else:
            for_editing.loc[i, "found"] = 1
            for_editing.loc[i, "search_val"] = json_data["results"][k]["SEARCHVAL"]
            for_editing.loc[i, "blk_no"] = json_data["results"][k]["BLK_NO"]
            for_editing.loc[i, "road_name"] = json_data["results"][k]["ROAD_NAME"]
            for_editing.loc[i, "building"] = json_data["results"][k]["BUILDING"]
            for_editing.loc[i, "address"] = json_data["results"][k]["ADDRESS"]
            for_editing.loc[i, "postal"] = json_data["results"][k]["POSTAL"]
            for_editing.loc[i, "x"] = json_data["results"][k]["X"]
            for_editing.loc[i, "y"] = json_data["results"][k]["Y"]
            for_editing.loc[i, "latitude"] = json_data["results"][k]["LATITUDE"]
            for_editing.loc[i, "longitude"] = json_data["results"][k]["LONGITUDE"]

In [489]:
if for_editing.shape[0] > 0:
    for_editing_gdf = gpd.GeoDataFrame(for_editing, geometry=gpd.points_from_xy(for_editing["x"], for_editing["y"]))
else:
    print("No new records for update")

# for_editing_gdf

No new records for update


#### 1d. Append URAs planning area and regions

In [490]:
planning_areas_gdf = gpd.read_file(URA_PLANNING_AREA_SOURCE)

# planning_areas_gdf.head()

In [491]:
if for_editing.shape[0] > 0:
    for_editing_gdf = for_editing_gdf.sjoin(planning_areas_gdf[["PLN_AREA_N", "REGION_N", "geometry"]], how='left')
    for_editing_gdf = for_editing_gdf.rename(columns={"PLN_AREA_N":"planning_area_ura", "REGION_N":"region_ura"})
else:
    print("No new records for update")

No new records for update


#### 1e. Record insertion date

In [492]:
today_date = dt.date.today().replace(day=1)

if for_editing.shape[0] > 0:
    for_editing_gdf["date_added"] = today_date
else:
    print("No new records for update")

No new records for update


#### 1f. Reorder columns and save file

In [493]:
if for_editing.shape[0] > 0:
    # converting the gpd dataframe back to a pd dataframe. .geojson or .shp files are much harder to circulate compared to .csv files
    for_editing = for_editing_gdf[
        [
            "town",
            "block",
            "road_name",
            "blk_no",
            "street_name",
            "building",
            "postal",
            "address",
            "lease_commence_date",
            "planning_area_ura",
            "region_ura",
            "x",
            "y",
            "latitude",
            "longitude",
            "date_added"
        ]
    ]
else:
    print("No new records for update")

No new records for update


In [494]:
new_masterlist_df = pd.concat([masterlist_df, for_editing], axis=0)
new_masterlist_df = new_masterlist_df.sort_values(by=["town", "street_name", "block"]).reset_index(drop=True)

new_masterlist_df

Unnamed: 0,town,block,street_name,blk_no,road_name,building,postal,address,lease_commence_date,planning_area_ura,region_ura,x,y,latitude,longitude,date_added,month,flat_type,storey_range,floor_area_sqm,flat_model,remaining_lease,resale_price,floor_area_sqft,price_per_sqft
0,ANG MO KIO,205,ANG MO KIO AVE 1,205,ANG MO KIO AVENUE 1,NIL,560205,205 ANG MO KIO AVENUE 1 SINGAPORE 560205,1977-01-01,ANG MO KIO,NORTH-EAST REGION,29142.244275,38774.891527,1.366941,103.843582,2025-02-01,NaT,,,,,,,,
1,ANG MO KIO,207,ANG MO KIO AVE 1,207,ANG MO KIO AVENUE 1,ANG MO KIO 22,560207,207 ANG MO KIO AVENUE 1 ANG MO KIO 22 SINGAPOR...,1976-07-01,ANG MO KIO,NORTH-EAST REGION,29060.485578,38651.052977,1.365821,103.842848,2025-02-01,NaT,,,,,,,,
2,ANG MO KIO,208,ANG MO KIO AVE 1,208,ANG MO KIO AVENUE 1,ANG MO KIO 22,560208,208 ANG MO KIO AVENUE 1 ANG MO KIO 22 SINGAPOR...,1976-07-01,ANG MO KIO,NORTH-EAST REGION,29045.715075,38609.483079,1.365445,103.842715,2025-02-01,NaT,,,,,,,,
3,ANG MO KIO,215,ANG MO KIO AVE 1,215,ANG MO KIO AVENUE 1,ANG MO KIO 22,560215,215 ANG MO KIO AVENUE 1 ANG MO KIO 22 SINGAPOR...,1976-04-01,ANG MO KIO,NORTH-EAST REGION,28924.303291,38732.591142,1.366558,103.841624,2025-02-01,NaT,,,,,,,,
4,ANG MO KIO,216,ANG MO KIO AVE 1,216,ANG MO KIO AVENUE 1,ANG MO KIO 22,560216,216 ANG MO KIO AVENUE 1 ANG MO KIO 22 SINGAPOR...,1976-04-01,ANG MO KIO,NORTH-EAST REGION,28911.052240,38692.616791,1.366197,103.841505,2025-02-01,NaT,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9659,YISHUN,876,YISHUN ST 81,876,YISHUN STREET 81,NIL,760876,876 YISHUN STREET 81 SINGAPORE 760876,1987-12-01,YISHUN,NORTH REGION,28246.274546,44060.806242,1.414745,103.835532,2025-02-01,NaT,,,,,,,,
9660,YISHUN,877,YISHUN ST 81,877,YISHUN STREET 81,NIL,760877,877 YISHUN STREET 81 SINGAPORE 760877,1987-12-01,YISHUN,NORTH REGION,28237.634702,43967.636908,1.413902,103.835454,2025-02-01,NaT,,,,,,,,
9661,YISHUN,878,YISHUN ST 81,878,YISHUN STREET 81,NIL,760878,878 YISHUN STREET 81 SINGAPORE 760878,1988-01-01,YISHUN,NORTH REGION,28285.898091,43984.300584,1.414053,103.835888,2025-02-01,NaT,,,,,,,,
9662,YISHUN,879,YISHUN ST 81,879,YISHUN STREET 81,NIL,760879,879 YISHUN STREET 81 SINGAPORE 760879,1987-10-01,YISHUN,NORTH REGION,28311.512736,44027.290776,1.414442,103.836118,2025-02-01,NaT,,,,,,,,


In [495]:
def save_masterfile(df) -> None:
    today_date = dt.datetime.today().strftime("%Y-%m-%d")
    archive_folder_name = "../datasets/resale_hdb_archive/hdb_information_masterlist/"
    old_masterlist_filename = f"hdb_resale_flat_address_masterlist_{today_date}"

    copies = 0
    for i in os.listdir("../datasets/resale_hdb_archive/hdb_information_masterlist/"):
        if old_masterlist_filename in i:
            copies += 1

    if copies > 0:
        destination = f"{archive_folder_name}{old_masterlist_filename}_{copies+1}.csv"
    else:
        destination = f"{archive_folder_name}{old_masterlist_filename}.csv"

    shutil.move(HDB_MASTERFILE_SOURCE, destination)
    df.to_csv(HDB_MASTERFILE_SOURCE)

In [496]:
if for_editing.shape[0] > 0:
    save_masterfile(new_masterlist_df)
else:
    print("No new records for update")

No new records for update


### Begin combining closest MRT station and closest primary school

#### 2. Perform a left-join on the transactions dataset with masterlist

In [497]:
# reopen masterlist
masterlist_df_orig = pd.read_csv(
    HDB_MASTERFILE_SOURCE,
    index_col=0,
    dtype={"postal": "object"},
    parse_dates=["lease_commence_date", "date_added"],
)
masterlist_df = masterlist_df_orig.copy()

In [498]:
# Perform a df_raw <- leftjoin <- masterlist_df
df_new = df_raw.merge(
    masterlist_df,
    how="left",
    on=["town", "block", "street_name"],
    suffixes=["", "_r"],
    indicator=True,
)

# calculate remaining lease
df_new["remaining_lease_years"] = df_new["remaining_lease"].str.slice(0, 2).astype('int')
df_new["remaining_lease_months"] = pd.to_numeric(df_new["remaining_lease"].str.slice(9, 11)).fillna(0).astype('int')

# drop lease_commence_date in favour of the same column found in masterlist
df_new = df_new.drop(["lease_commence_date"], axis=1)
df_new = df_new.rename(columns={"lease_commence_date_r": "lease_commence_date"})

# Convert df into a geopandas dataframe
gdf_new = gpd.GeoDataFrame(df_new, geometry=gpd.points_from_xy(df_new["x"], df_new["y"]))

In [499]:
gdf_new.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,remaining_lease,resale_price,floor_area_sqft,price_per_sqft,blk_no,road_name,building,postal,address,lease_commence_date,planning_area_ura,region_ura,x,y,latitude,longitude,date_added,_merge,remaining_lease_years,remaining_lease_months,geometry
0,2017-01-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,61 years 04 months,232000.0,473.6116,489.852867,406,ANG MO KIO AVENUE 10,NIL,560406,406 ANG MO KIO AVENUE 10 SINGAPORE 560406,1979-05-01,ANG MO KIO,NORTH-EAST REGION,30288.234663,38229.067463,1.362005,103.85388,2025-02-01,both,61,4,POINT (30288.235 38229.067)
1,2017-01-01,ANG MO KIO,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,60 years 07 months,250000.0,721.1813,346.65347,108,ANG MO KIO AVENUE 4,KEBUN BARU HEIGHTS,560108,108 ANG MO KIO AVENUE 4 KEBUN BARU HEIGHTS SIN...,1978-08-01,ANG MO KIO,NORTH-EAST REGION,28543.458747,39220.009892,1.370966,103.838202,2025-02-01,both,60,7,POINT (28543.459 39220.010)
2,2017-01-01,ANG MO KIO,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,62 years 05 months,262000.0,721.1813,363.292836,602,ANG MO KIO AVENUE 5,YIO CHU KANG GREEN,560602,602 ANG MO KIO AVENUE 5 YIO CHU KANG GREEN SIN...,1980-06-01,ANG MO KIO,NORTH-EAST REGION,28228.099954,40297.283149,1.380709,103.835368,2025-02-01,both,62,5,POINT (28228.100 40297.283)
3,2017-01-01,ANG MO KIO,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,62 years 01 month,265000.0,731.9452,362.048962,465,ANG MO KIO AVENUE 10,TECK GHEE HORIZON,560465,465 ANG MO KIO AVENUE 10 TECK GHEE HORIZON SIN...,1980-02-01,ANG MO KIO,NORTH-EAST REGION,30657.824693,38693.098657,1.366201,103.857201,2025-02-01,both,62,1,POINT (30657.825 38693.099)
4,2017-01-01,ANG MO KIO,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,62 years 05 months,265000.0,721.1813,367.452678,601,ANG MO KIO AVENUE 5,YIO CHU KANG GREEN,560601,601 ANG MO KIO AVENUE 5 YIO CHU KANG GREEN SIN...,1980-06-01,ANG MO KIO,NORTH-EAST REGION,28201.782245,40334.05203,1.381041,103.835132,2025-02-01,both,62,5,POINT (28201.782 40334.052)


In [500]:
gdf_new.query("_merge != 'both'")

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,remaining_lease,resale_price,floor_area_sqft,price_per_sqft,blk_no,road_name,building,postal,address,lease_commence_date,planning_area_ura,region_ura,x,y,latitude,longitude,date_added,_merge,remaining_lease_years,remaining_lease_months,geometry


#### 2b. Appending information associated to the closest MRT and closest school for each transaction

In [501]:
mrt_df_orig = pd.read_csv(MRT_SOURCE, parse_dates=["opening"], index_col=0)
mrt_df = mrt_df_orig.copy()

mrt_gdf = gpd.GeoDataFrame(mrt_df, geometry=gpd.points_from_xy(mrt_df["x"], mrt_df["y"]))
# mrt_gdf.head()

In [502]:
school_df = pd.read_csv(SCHOOL_SOURCE, index_col=0, dtype={"postal": "string"})
school_df["postal"] = school_df["postal"].astype("str").apply(lambda x: f"{x:0>6}")

# Convert the df into a gdf
school_gdf = gpd.GeoDataFrame(
    school_df, geometry=gpd.points_from_xy(school_df["x"], school_df["y"])
)
# school_gdf.head()

In [503]:
def find_closest_station(row, mrt_gdf, on="trans_date"):
    assert on in ["trans_date", "today"]

    if on == "trans_date":
        mrt_gdf["opening_date"] = mrt_gdf["opening"].dt.to_period("M").dt.to_timestamp()
        mrt_stations_filtered = mrt_gdf[mrt_gdf["opening_date"] < row["month"]]
        distances = mrt_stations_filtered.distance(row["geometry"])
    else:
        distances = mrt_gdf.distance(row["geometry"])

    closest_station_index = distances.idxmin()
    shortest_distance = distances.min()

    closest_station_name = mrt_gdf.loc[closest_station_index, "station_name"]
    closest_transport_type = mrt_gdf.loc[closest_station_index, "type"]
    closest_mrt_color = mrt_gdf.loc[closest_station_index, "color"]

    # distance to cbd
    raffles_place_index = mrt_gdf.query("station_name == 'Raffles Place'").index[0]
    distance_to_cbd = mrt_gdf.loc[raffles_place_index, "geometry"].distance(row["geometry"])

    return pd.Series(
        [
            closest_station_name,
            shortest_distance,
            closest_transport_type,
            closest_mrt_color,
            distance_to_cbd,
        ],
        index=[
            "closest_mrt_station",
            "distance_to_mrt_meters",
            "transport_type",
            "line_color",
            "distance_to_cbd",
        ],
    )


def find_closest_school(row, school_gdf, level="PRIMARY"):
    assert level in [
        "PRIMARY",
        "SECONDARY",
        "JUNIOR COLLEGE",
        "MIXED LEVELS",
        "CENTRALISED INSTITUTE",
    ]

    school_gdf_filtered = school_gdf.query("mainlevel_code == @level")
    distances = school_gdf_filtered.distance(row["geometry"])

    closest_school_index = distances.idxmin()
    shortest_distance = distances.min()

    closest_school = school_gdf_filtered.loc[closest_school_index, "school_name"]

    return pd.Series(
        [
            closest_school,
            shortest_distance,
        ],
        index=[
            "closest_pri_school",
            "distance_to_pri_school_meters",
        ],
    )

In [504]:
gdf_new[
    [
        "closest_mrt_station",
        "distance_to_mrt_meters",
        "transport_type",
        "line_color",
        "distance_to_cbd",
    ]
] = gdf_new.apply(find_closest_station, mrt_gdf=mrt_gdf, axis=1)

gdf_new[
    [
        "closest_pri_school", 
        "distance_to_pri_school_meters"
    ]
] = gdf_new.apply(
        find_closest_school, 
        school_gdf=school_gdf, 
        axis=1
    )


In [505]:
gdf_new.shape

(199327, 36)

In [506]:
columns_to_drop = ["geometry", "address", "street_name", "block", "date_added", "_merge", "remaining_lease"]
gdf_new = gdf_new.drop(columns_to_drop, axis=1)

In [507]:
gdf_new = gdf_new[
    [
        "month",
        "town",
        "blk_no",
        "road_name",
        "building",
        "postal",
        "resale_price",
        "storey_range",
        "flat_type",
        "flat_model",
        "lease_commence_date",
        "remaining_lease_years",
        "remaining_lease_months",
        "floor_area_sqm",
        "floor_area_sqft",
        "price_per_sqft",
        "planning_area_ura",
        "region_ura",
        "x",
        "y",
        "latitude",
        "longitude",
        "closest_mrt_station",
        "distance_to_mrt_meters",
        "transport_type",
        "line_color",
        "distance_to_cbd",
        "closest_pri_school",
        "distance_to_pri_school_meters",
    ]
]

In [520]:
def save_resale_transactions(df) -> None:
    today_date = dt.datetime.today().strftime("%Y-%b")
    archive_folder_name = "../datasets/resale_hdb_archive/combined_dataset/for_kaggle/"
    old_resale_transaction_filename = f"hdb_resale_flat_transactions_{today_date}"

    copies = 0
    for i in os.listdir("../datasets/resale_hdb_archive/combined_dataset/for_kaggle"):
        if old_resale_transaction_filename in i:
            copies += 1

    if copies > 0:
        destination = f"{archive_folder_name}{old_resale_transaction_filename}_{copies+1}.csv"
    else:
        destination = f"{archive_folder_name}{old_resale_transaction_filename}.csv"

    shutil.move(HDB_RESALE_TRANSACTIONS_CLEANED, destination)
    df.to_csv(HDB_RESALE_TRANSACTIONS_CLEANED)

save_resale_transactions(gdf_new)

In [511]:
gdf_new.columns

Index(['month', 'town', 'blk_no', 'road_name', 'building', 'postal',
       'resale_price', 'storey_range', 'flat_type', 'flat_model',
       'lease_commence_date', 'remaining_lease_years',
       'remaining_lease_months', 'floor_area_sqm', 'floor_area_sqft',
       'price_per_sqft', 'planning_area_ura', 'region_ura', 'x', 'y',
       'latitude', 'longitude', 'closest_mrt_station',
       'distance_to_mrt_meters', 'transport_type', 'line_color',
       'distance_to_cbd', 'closest_pri_school',
       'distance_to_pri_school_meters'],
      dtype='object')

# Appendix: Repair Codes, Deprecated Codes, etc.

### Code for repairing masterfile (Single Use, Ran on 4 Feb 25, Deprecated)

A fix for the masterfile for 50 recent entries having lease commence dates in the 1800s.

In [221]:
# HDB_MASTERFILE_FOR_REPAIR = "../datasets/hdb_resale_flat_address_masterlist_original.csv"

# df_error =pd.read_csv(
#     HDB_MASTERFILE_FOR_REPAIR, 
#     index_col=0, 
#     dtype={"postal": "object"}, 
#     parse_dates=["lease_commence_date"]
# )

In [222]:
# df_for_repair = df_error[df_error["lease_commence_date"] < dt.datetime(year=1960, month=1, day=1)]
# df_error = df_error.drop(df_for_repair.index, axis=0)

# # df_error.to_csv("../datasets/hdb_resale_flat_address_masterlist_truncated_2025-02-04.csv")

In [223]:
# df_for_repair = df_for_repair.merge(
#     df_raw.drop_duplicates(subset=["town", "block", "street_name"]),
#     how="left",
#     on=["town", "block", "street_name"],
# )

In [224]:
# df_for_repair = df_for_repair.drop(
#     [
#         "lease_commence_date_x",
#         "flat_type",
#         "storey_range",
#         "floor_area_sqm",
#         "flat_model",
#         "resale_price",
#         "floor_area_sqft",
#         "price_per_sqft",
#     ], axis=1
# )

In [225]:
# df_for_repair["lease_remaining_year"] = df_for_repair["remaining_lease"].str.slice(0, 2).astype("int")
# df_for_repair["lease_remaining_month"] = pd.to_numeric(df_for_repair["remaining_lease"].str.slice(9, 11), errors="coerce").fillna(0)
# df_for_repair["lease_commence_yearmth"] = df_for_repair.apply(obtain_lease_yearmth, axis=1)

# df_for_repair = df_for_repair.rename(columns={"lease_commence_date_y":"lease_commence_date"})

In [226]:
# df_for_repair = df_for_repair.loc[:, df_error.columns]

In [227]:
# df_combined = pd.concat([df_for_repair, df_error], axis=0)
# df_combined = df_combined.sort_values(by=["town", "street_name", "block"]).reset_index(drop=True)

# df_combined

In [228]:
# df_combined.to_csv("../datasets/hdb_resale_flat_address_masterlist.csv")

### Removing temporal columns in the masterfile (Single Use, Ran on 4 Feb 25, Deprecated)

Summary:
- For removing columns containing temporal information. This include "closest mrt station", "distance_to_mrt_meters", "transport_type", "line_color", "distance_to_cbd", "closest_pri_school", and "distance_to_pri_school_meters". For adding a "date_added" column.

Why:
- The closest MRT station to a certain HDB block today might not be the same tomorrow because of continual upgrades to Singapores rail network. Similarly, the closest primary school
to a certain HDB block today might not be the same tomorrow because of the decreasing Singapore population.
- An additional column will also be added to track the month for which a certain entry is added.

Objective:
1. To remove columns that contain information with a temporal nature
2. To add a "date_added" column

In [None]:
# HDB_MASTERFILE_FOR_REPAIR = "../datasets/hdb_resale_flat_address_masterlist.csv"

# df_for_repair = pd.read_csv(
#     HDB_MASTERFILE_FOR_REPAIR, 
#     index_col=0, 
#     dtype={"postal": "object"}, 
#     parse_dates=["lease_commence_date"]
# )

# df_for_repair["date_added"] = dt.date.today().replace(day=1)

# df_for_repair = df_for_repair.loc[:, :"longitude"]
# df_for_repair.to_csv(HDB_MASTERFILE_FOR_REPAIR)