In [3]:
import pandas as pd

In [24]:
# File-name stems (no ".csv" extension) of the raw transaction exports.
# Each stem is interpolated into a read path by the loading cells below, so the
# strings must match the files on disk exactly.

# URA EC transaction exports, one per year (2020-2024); read from
# ../data/raw_data/csv/ura/.
ec_list = [
    "URA_ResidentialTransaction_EC2020_20240917220317",
    "URA_ResidentialTransaction_EC2021_20240917220358",
    "URA_ResidentialTransaction_EC2022_20240917220420",
    "URA_ResidentialTransaction_EC2023_20240917220459",
    "URA_ResidentialTransaction_EC2024_20240917220523",
]

# URA condo transaction exports, one per year (2020-2024); read from
# ../data/raw_data/csv/ura/.
# NOTE(review): 2020-2023 are spelled "Conda" while 2024 is "Condo" --
# presumably this mirrors the actual file names; confirm before "fixing" it.
condo_list = [
    "URA_ResidentialTransaction_Conda2020_20240917220234",
    "URA_ResidentialTransaction_Conda2021_20240917220149",
    "URA_ResidentialTransaction_Conda2022_20240917220116",
    "URA_ResidentialTransaction_Conda2023_20240917215948",
    "URA_ResidentialTransaction_Condo2024_20240917215852",
]

# HDB resale price exports; read from ../data/raw_data/csv/datagov/.
hdb_list = [
    "ResaleFlatPricesBasedonRegistrationDateFromJan2015toDec2016",
    "ResaleflatpricesbasedonregistrationdatefromJan2017onwards",
]

In [62]:
# Read every EC export listed in ec_list, stack the frames row-wise, and
# persist the combined table as the L1 parquet file.
ec_df = pd.concat(
    pd.read_csv(f"../data/raw_data/csv/ura/{ec}.csv") for ec in ec_list
)
ec_df.to_parquet(r"../data/L1/housing_ec_transaction.parquet")

In [61]:
# Read every condo export listed in condo_list and stack the frames row-wise.
condo_df = pd.concat(
    pd.read_csv(f"../data/raw_data/csv/ura/{condo}.csv") for condo in condo_list
)

# 'Area (SQM)' can contain thousands separators (e.g. "1,234"), which makes
# read_csv keep those cells as text while other cells/files parse as numbers.
# Cast to str BEFORE using the .str accessor: .str methods return NaN for
# non-string cells, so already-numeric areas would otherwise be silently wiped
# out by the errors='coerce' conversion below (and a fully numeric column would
# raise AttributeError on .str).
area_text = (
    condo_df['Area (SQM)']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.strip()
)
# Genuinely missing/unparseable values (e.g. "nan") become NaN.
condo_df['Area (SQM)'] = pd.to_numeric(area_text, errors='coerce')

condo_df.to_parquet(r"../data/L1/housing_condo_transaction.parquet")

In [75]:
# Read every HDB resale export listed in hdb_list and stack them row-wise.
hdb_df = pd.concat(
    pd.read_csv(f"../data/raw_data/csv/datagov/{hdb}.csv") for hdb in hdb_list
)

import re

def standardize_lease_duration(lease):
    """Convert a remaining-lease value to a whole number of months.

    Accepted inputs:
      * a number, or a string made only of digits -- interpreted as YEARS
        (e.g. ``70`` or ``"70"`` -> ``840``); fractional years are truncated;
      * text such as ``"61 years 04 months"``, ``"61 years"`` or ``"1 year"``.

    Args:
        lease: The raw lease value (number, string, ``None`` or NaN).

    Returns:
        int | None: Lease length in months, or ``None`` when the value is
        missing (``None``/NaN/infinite) or does not match a known format.
    """
    # Local imports keep this block self-contained; only `re` is imported
    # at cell level.
    import math
    import numbers

    if lease is None:
        return None

    # numbers.Real also covers numpy integer/float scalars, which are not
    # instances of the built-in int.
    if isinstance(lease, numbers.Real):
        if not math.isfinite(lease):
            return None  # NaN / inf: treat as missing instead of crashing
        return int(lease) * 12  # bare numbers are years

    text = str(lease).strip()
    if text.isdigit():
        return int(text) * 12  # bare digit strings are years

    # The months part is optional; when present it is added to the total so
    # "61 years 04 months" yields 736 rather than 732.
    match = re.match(r'(\d+)\s*years?(?:\s*(\d+)\s*months?)?', text)
    if match is None:
        return None

    years = int(match.group(1))
    months = int(match.group(2)) if match.group(2) else 0
    return years * 12 + months

# Derive the lease length in months from the raw 'remaining_lease' column,
# then discard the raw column before writing the L1 parquet file.
lease_in_months = hdb_df['remaining_lease'].apply(standardize_lease_duration)
hdb_df['remaining_lease_months'] = lease_in_months
hdb_df = hdb_df.drop(columns='remaining_lease')
hdb_df.to_parquet(r"../data/L1/housing_hdb_transaction.parquet")

# Combining to identify all unique condos and flats

In [45]:
# One de-duplicated (name, street) listing per source; the URA frames and the
# HDB frame use different column names, so stacking them leaves NaN in the
# columns a source does not have.
unique_locations = [
    condo_df[["Project Name", "Street Name"]].drop_duplicates(),
    ec_df[["Project Name", "Street Name"]].drop_duplicates(),
    hdb_df[["block", "street_name"]].drop_duplicates(),
]
housing_df = pd.concat(unique_locations, ignore_index=True)

NameAddress_list = ["Project Name", "Street Name", "block", "street_name"]

# Blank out the missing parts so every row can be joined as plain text.
housing_df[NameAddress_list] = housing_df[NameAddress_list].fillna("")

# Join the four parts with single spaces, then trim the padding left behind by
# the empty parts at either end.
housing_df["NameAddress"] = (
    housing_df[NameAddress_list].agg(" ".join, axis=1).str.strip()
)

In [95]:
# Preview the first ten combined name/address strings.
for preview_name in housing_df['NameAddress'].head(10):
    print(preview_name)

AFFINITY AT SERANGOON SERANGOON NORTH AVENUE 1
THE FLORENCE RESIDENCES HOUGANG AVENUE 2
THE GARDEN RESIDENCES SERANGOON NORTH VIEW
THE PROMENADE@PELIKAT JALAN PELIKAT
PRIMO RESIDENCES JALAN PELIKAT
SENGKANG GRAND RESIDENCES COMPASSVALE BOW
COMPASS HEIGHTS SENGKANG SQUARE
A TREASURE TROVE PUNGGOL WALK
BOTANIQUE AT BARTLEY UPPER PAYA LEBAR ROAD
REGENTVILLE HOUGANG STREET 92


# OneMap Setup

In [79]:
import requests
import os
import json

# Request a OneMap access token and build the headers reused by later calls.
url = "https://www.onemap.gov.sg/api/auth/post/getToken"

# Credentials are read from the environment so they never appear in the
# notebook; a missing variable raises KeyError here.
payload = {
    "email": os.environ['ONEMAP_EMAIL'],
    "password": os.environ['ONEMAP_EMAIL_PASSWORD'],
}

# The timeout stops the cell from hanging forever on a stalled connection.
response = requests.post(url, json=payload, timeout=30)
# Fail with the HTTP error (e.g. bad credentials) rather than an opaque
# KeyError on 'access_token' further down.
response.raise_for_status()

access_token = response.json()['access_token']
headers = {"Authorization": f"{access_token}"}

# Search for X, Y and other data on OneMap
- this will take a while
- failed requests are retried with exponential backoff, up to a fixed number of attempts per address

In [100]:
import requests
import json
import pandas as pd
import time
import random

# Query the OneMap search endpoint once per NameAddress string and collect one
# result frame per address in df_list.
df_list = []
max_retries = 3  # attempts per address before giving up
initial_backoff = 1  # seconds
max_backoff = 32  # seconds
request_timeout = 30  # seconds; keeps one stalled request from hanging the run

search_url = "https://www.onemap.gov.sg/api/common/elastic/search"

for search_string in housing_df['NameAddress']:
    retries = 0
    success = False
    backoff = initial_backoff

    while not success and retries < max_retries:
        try:
            # Pass the query through `params` so requests percent-encodes it;
            # characters such as '&', '#', '+' or '@' inside a name would
            # otherwise corrupt a hand-built query string.
            response = requests.get(
                search_url,
                params={
                    "searchVal": search_string,
                    "returnGeom": "Y",
                    "getAddrDetails": "Y",
                    "pageNum": 1,
                },
                headers=headers,
                timeout=request_timeout,
            )
            response.raise_for_status()  # Raise an exception for HTTP errors

            # The positional index of each hit is kept as 'search_result', and
            # the originating query is attached so rows can be joined back.
            _df = (
                pd.DataFrame(json.loads(response.text)['results'])
                .reset_index()
                .rename({'index': 'search_result'}, axis=1)
            )
            _df['NameAddress'] = search_string
            df_list.append(_df)

            success = True

        # ValueError covers malformed JSON; KeyError covers a payload without
        # 'results'. Both are treated like a failed request so one bad
        # response cannot abort the whole run.
        except (requests.RequestException, ValueError, KeyError):
            retries += 1
            if retries >= max_retries:
                break  # no attempts left: skip the pointless final sleep

            # Sleep for the current backoff (plus jitter), then double it for
            # the next round so the first wait really is initial_backoff.
            delay = backoff + random.uniform(0, 1)
            backoff = min(backoff * 2, max_backoff)
            print(f"Request failed for {search_string}. Retrying in {delay:.2f} seconds. (Retry {retries}/{max_retries})")
            time.sleep(delay)

    if not success:
        print(f"Failed to retrieve data for {search_string} after {max_retries} retries.")

In [None]:
# Stack the per-address result frames collected in df_list and persist them.
searched_parquet_path = r"../data/L1/housing_unique_searched.parquet"
df_housing_searched = pd.concat(df_list)
df_housing_searched.to_parquet(searched_parquet_path)