In [35]:
import pandas as pd
import numpy as np
import geopandas as gpd
from bs4 import BeautifulSoup
import h3
from shapely.geometry import Polygon

# OneMap

In [36]:
import requests
import os
import json

url = "https://www.onemap.gov.sg/api/auth/post/getToken"

payload = {
    "email": os.environ['ONEMAP_EMAIL'],
    "password": os.environ['ONEMAP_EMAIL_PASSWORD']
}

response = requests.request("POST", url, json=payload)
access_token = json.loads(response.text)['access_token']
headers = {"Authorization": f"{access_token}"}

# Data Input

In [37]:
# planning area
planning_area_gpd = gpd.read_file(
    "../data/raw_data/onemap_planning_area_polygon.shp")

# transactions
condo_df = pd.read_parquet('../data/L1/housing_condo_transaction.parquet')
ec_df = pd.read_parquet('../data/L1/housing_ec_transaction.parquet')
hdb_df = pd.read_parquet('../data/L1/housing_hdb_transaction.parquet')

# processsed data
unique_df = pd.read_parquet('../data/L2/housing_unique_searched.parquet')
amenity_df = pd.read_parquet('../data/L2/amenity.parqeut')

# filter for only first result from onemap then
unique_df = unique_df.loc[unique_df['search_result'] == 0]
unique_df = unique_df.rename({'LATITUDE': 'lat', 'LONGITUDE': 'lon'}, axis=1)
unique_df['lat'] = unique_df['lat'].astype(float)
unique_df['lon'] = unique_df['lon'].astype(float)

# unique name for ec, condo and hdb
condo_df['unique_name'] = condo_df['Project Name'] + \
    ' ' + condo_df['Street Name']
ec_df['unique_name'] = ec_df['Project Name'] + ' ' + ec_df['Street Name']
hdb_df['unique_name'] = hdb_df['block'] + ' ' + hdb_df['street_name']

In [59]:
# unique_df

In [38]:
# geojson
park_df = gpd.read_file("../data/L1/park.geojson")
park_connecter_df = gpd.read_file("../data/L1/park_connector.geojson")
waterbody_df = gpd.read_file("../data/L1/waterbody.geojson")

In [39]:
sqm_2_sqrt = 10.764

# Casting

In [40]:
# ameneties_h12grid = [h3.latlng_to_cell(x,y, 13)  for x,y in zip(amenity_df['lat'], amenity_df['lon'])]

In [41]:
def generate_h3_grid_cell(lat, lon, resolution=8):
    """Generate H3 grid cell from lat/lon coordinates."""
    return h3.latlng_to_cell(lat, lon, resolution)


def generate_grid_disk(cell, k=5):
    """Generate H3 grid disk from a cell."""
    return h3.grid_disk(cell, k)


def generate_polygon_from_cells(cells):
    """Generate Shapely Polygon from H3 cells."""
    return Polygon(h3.cells_to_geo(cells)['coordinates'][0])


def generate_polygons(unique_df):
    """
    Generate polygons from unique_df's lat/lon coordinates.

    Parameters:
    unique_df (pandas.DataFrame): DataFrame containing 'lat' and 'lon' columns.

    Returns:
    list[shapely.Polygon]: List of polygons.
    """
    return [
        generate_polygon_from_cells(generate_grid_disk(
            generate_h3_grid_cell(lat, lon), 3))
        for lat, lon in zip(unique_df['lat'], unique_df['lon'])
    ]

# Data Processing Prep

In [42]:
polygon_list = generate_polygons(unique_df)
unique_gdf = gpd.GeoDataFrame(unique_df, geometry=polygon_list)
unique_gdf = unique_gdf.drop('search_result', axis=1)
unique_gdf = unique_gdf.set_crs('EPSG:4326')

In [43]:
# checks

# len(unique_gdf['SEARCHVAL'].unique())

# unique_tmp = unique_gdf.copy()
# unique_tmp['SEARCHVAL'] = unique_tmp['SEARCHVAL'].str.lower()
# unique_tmp.columns = unique_tmp.columns.str.lower()
# mask = unique_tmp['searchval'].str.contains('prive', case=False, na=False)
# unique_tmp[mask]

In [44]:
amenity_gdf = gpd.GeoDataFrame(
    amenity_df,
    geometry=gpd.points_from_xy(amenity_df.lon, amenity_df.lat),
    crs="EPSG:4326"  # Coordinate Reference System
)

In [45]:
# Comment
# some properties contain a list of postcode instead of a single
# 828816, 828817 etc are all prive, punggol

## Properties vs Amenity

In [46]:
unique_gdf = unique_gdf.to_crs(crs=3857)
amenity_gdf = amenity_gdf.to_crs(crs=3857)

# create copy of geometry
amenity_gdf['amenity_centroid'] = amenity_gdf.geometry


# sjoin
unique_joined = (
    unique_gdf[['SEARCHVAL', 'POSTAL', 'geometry']].drop_duplicates()
    .sjoin(amenity_gdf[['type', 'name', 'geometry', 'amenity_centroid']].drop_duplicates())
    .drop('index_right', axis=1)
)

unique_gdf = unique_gdf.to_crs(crs='EPSG:4326')
amenity_gdf = amenity_gdf.to_crs(crs='EPSG:4326')

# unique_joined = unique_joined.to_crs(crs=3857)
unique_joined["polygon_centroid"] = unique_joined["geometry"].centroid
# unique_joined = unique_joined.to_crs('EPSG:4326')

# add distance calculation
unique_joined['distance'] = unique_joined['polygon_centroid'].distance(
    unique_joined['amenity_centroid'])

In [47]:
unique_joined['SEARCHVAL'] = unique_joined['SEARCHVAL'].str.lower()
unique_joined.columns = unique_joined.columns.str.lower()

In [48]:
# len(unique_joined['searchval'].unique())
# some properties are missing, or they dont have anything nearby....

## Properties vs Planning area

In [49]:
planning_area_gpd = planning_area_gpd.set_crs(crs='EPSG:4326')
planning_area_gpd = planning_area_gpd.rename(
    {'pln_area_n': 'planning_area'}, axis=1)

# centroid
unique_gdf = unique_gdf.to_crs(crs=3857)
unique_gdf["geometry"] = unique_gdf["geometry"].centroid
unique_gdf = unique_gdf.to_crs(crs='EPSG:4326')

unique_gdf = (
    unique_gdf
    .sjoin(planning_area_gpd, how='left', predicate='within')  # d
    .drop('index_right', axis=1)
)

# Post processing

In [50]:
from datetime import datetime

# Function to extract commencing year and calculate years left


def extract_lease_info(lease_info):
    # Extract the number of years and the commencing year
    if lease_info == 'freehold' or lease_info == 'Freehold':
        return None, 'freehold'
    else:
        years = int(lease_info.split(' ')[0])
        commencing_year = int(lease_info.split(' ')[-1])

    # Calculate the remaining years
    # current_year = datetime.now().year
    # years_left = years - (current_year - commencing_year)

    return commencing_year, 'leasehold'

In [51]:
def extract_two_digits(string):
    """Extracts the first two digits from a string of the format "a to b".

    Args:
        string: The input string.

    Returns:
        The first two digits extracted from the string.
    """

    # Split the string into two parts based on "to"
    digits_parts = string.split(" to ")

    return (digits_parts[0], digits_parts[1])

## Transactions

In [63]:
private_df = pd.concat([condo_df, ec_df])
private_df.columns = (private_df.columns
                      # Replace text in brackets with underscore
                      .str.replace(r'\((.*?)\)', r'_\1', regex=True)
                      # Remove the last bracket
                      .str.replace(r'\)$', '', regex=True)
                      # Remove special characters
                      .str.replace(r'[^a-zA-Z0-9_]', '', regex=True)
                      .str.replace(r'_$', '', regex=True)
                      .str.lower()
                      )


private_df = private_df.drop(
    ['nettprice', 'numberofunits', 'typeofarea', 'typeofsale'], axis=1)

private_df[['lease_start_yr', 'hold_type']] = private_df['tenure'].apply(
    lambda x: pd.Series(extract_lease_info(x)))


# Convert the 'Date' column to datetime
private_df['saledate'] = pd.to_datetime(
    private_df['saledate'], format='%b-%y').dt.date

# private_df['area_sqft'] = private_df['area_sqft'].astype(float)

numerical_cast_dict = {'transactedprice': int,
                       'unitprice_psf': int, 'unitprice_psm': int, 'area_sqft': float}
# 'lease_start_yr':int

for key, val in numerical_cast_dict.items():
    private_df[key] = private_df[key].str.replace(',', '').astype(val)

private_df['propertytype'] = private_df['propertytype'].replace(
    'Apartment', 'Condominium')
private_df['property_type'] = 'Private'

cat_cast_list = ['marketsegment', 'propertytype', 'hold_type']
for col in cat_cast_list:
    private_df[col] = private_df[col].astype('category')

# Replace '-' with a meaningful category, if needed
private_df['floorlevel'] = private_df['floorlevel'].replace('-', 'Unknown')
# Fitler for the 3 odd records
private_df = private_df.loc[private_df['floorlevel'] != 'Unknown']
# Convert the 'Range' column to categorical
private_df['floorlevel'] = private_df['floorlevel'].astype('category')

private_df = private_df.rename(
    {'unique_name': 'property_index', 'projectname': 'project_name',
     'saledate': 'transaction_date', 'lease_start_yr': 'lease_commence_date',
     'floorlevel': 'floor_level', 'streetname': 'street_name',
     'propertytype': 'property_sub_type', 'transactedprice': 'resale_price'}, axis=1)

str_cast_list = ['project_name', 'street_name', 'property_index']
for col in str_cast_list:
    private_df[col] = private_df[col].astype('string')

private_df['area_sqft'] = pd.to_numeric(
    private_df['area_sqft'], errors='coerce')
# Fill NaN values in 'area_sqm' with calculated values from 'area_sqft'
private_df['area_sqm'] = private_df.apply(
    lambda row: row['area_sqft'] / 10.7639 if pd.isna(row['area_sqm']) else row['area_sqm'], axis=1)

private_df['lease_commence_date'] = private_df['lease_commence_date'].astype(
    'Int64')

private_df[['floor_low', 'floor_high']] = [
    extract_two_digits(i) for i in private_df['floor_level']]
private_df['property_sub_type'] = private_df['property_sub_type'].str.lower()

private_df = private_df.drop(
    ['tenure', 'marketsegment', 'postaldistrict', 'unitprice_psm'], axis=1)

In [53]:
private_df[:5]

Unnamed: 0,project_name,resale_price,area_sqft,unitprice_psf,transaction_date,street_name,area_sqm,property_sub_type,floor_level,property_index,lease_commence_date,hold_type,property_type,floor_low,floor_high
0,AFFINITY AT SERANGOON,831000,473.62,1755,2020-12-01,SERANGOON NORTH AVENUE 1,44.00078,condominium,11 to 15,AFFINITY AT SERANGOON SERANGOON NORTH AVENUE 1,2018,leasehold,Private,11,15
1,THE FLORENCE RESIDENCES,982000,635.08,1546,2020-12-01,HOUGANG AVENUE 2,59.00092,condominium,01 to 05,THE FLORENCE RESIDENCES HOUGANG AVENUE 2,2018,leasehold,Private,1,5
2,THE FLORENCE RESIDENCES,1518000,979.52,1550,2020-12-01,HOUGANG AVENUE 2,91.000474,condominium,16 to 20,THE FLORENCE RESIDENCES HOUGANG AVENUE 2,2018,leasehold,Private,16,20
3,THE GARDEN RESIDENCES,1325400,871.88,1520,2020-12-01,SERANGOON NORTH VIEW,81.000381,condominium,01 to 05,THE GARDEN RESIDENCES SERANGOON NORTH VIEW,2017,leasehold,Private,1,5
4,THE FLORENCE RESIDENCES,1582000,1011.82,1564,2020-12-01,HOUGANG AVENUE 2,94.001245,condominium,06 to 10,THE FLORENCE RESIDENCES HOUGANG AVENUE 2,2018,leasehold,Private,6,10


In [54]:
hdb_df = hdb_df.rename({'month': 'transaction_date', 'unique_name': 'property_index',
                        'floor_area_sqm': 'area_sqm', 'storey_range': 'floor_level',
                        'flat_type': 'property_sub_type', 'block': 'project_name'}, axis=1)

hdb_df['floor_level'] = hdb_df['floor_level'].str.lower()
hdb_df['property_sub_type'] = hdb_df['property_sub_type'].str.lower()

numerical_cast_list = ['resale_price', 'area_sqm',
                       'remaining_lease_months', 'lease_commence_date']
for col in numerical_cast_list:
    hdb_df[col] = hdb_df[col].astype('int')
hdb_df['area_sqft'] = hdb_df['area_sqm']*sqm_2_sqrt
hdb_df['transaction_date'] = pd.to_datetime(
    hdb_df['transaction_date'], format="%Y-%m").dt.date

cat_cast_list = ['property_sub_type', 'flat_model', 'floor_level', 'town']
for col in cat_cast_list:
    hdb_df[col] = hdb_df[col].astype('category')

str_cast_list = ['street_name', 'property_index', 'project_name']
for col in str_cast_list:
    hdb_df[col] = hdb_df[col].astype('string')

hdb_df['property_type'] = 'HDB'
hdb_df['hold_type'] = 'leasehold'

hdb_df[['floor_low', 'floor_high']] = [
    extract_two_digits(i) for i in hdb_df['floor_level']]

hdb_df = hdb_df.drop(['remaining_lease_months', 'town', 'flat_model'], axis=1)

hdb_df['unitprice_psf'] = hdb_df['resale_price']/hdb_df['area_sqft']

In [62]:
transaction_sales = pd.concat([hdb_df, private_df])

## Properties

In [55]:
properties_df = unique_gdf.copy()
properties_df.columns = properties_df.columns.str.lower()
properties_df = properties_df.rename({'nameaddress': 'property_index'}, axis=1)

str_cast_list = ['property_index', 'blk_no',
                 'road_name', 'building', 'address']
for col in str_cast_list:
    properties_df[col] = properties_df[col].astype('string')

properties_df['postal'] = pd.to_numeric(
    properties_df['postal'], errors='coerce')

properties_df['postal'] = properties_df['postal'].astype('Int64')

properties_df = properties_df[[
    'property_index', 'blk_no', 'road_name', 'building', 'address', 'postal', 'planning_area', 'property_type']]

In [77]:
properties_df

Unnamed: 0,property_index,blk_no,road_name,building,address,postal,planning_area
0,AFFINITY AT SERANGOON SERANGOON NORTH AVENUE 1,26,SERANGOON NORTH AVENUE 1,AFFINITY AT SERANGOON,26 SERANGOON NORTH AVENUE 1 AFFINITY AT SERANG...,554340,SERANGOON
0,THE FLORENCE RESIDENCES HOUGANG AVENUE 2,83,HOUGANG AVENUE 2,THE FLORENCE RESIDENCES,83 HOUGANG AVENUE 2 THE FLORENCE RESIDENCES SI...,538860,HOUGANG
0,THE GARDEN RESIDENCES SERANGOON NORTH VIEW,5,SERANGOON NORTH VIEW,THE GARDEN RESIDENCES,5 SERANGOON NORTH VIEW THE GARDEN RESIDENCES S...,554345,SERANGOON
0,THE PROMENADE@PELIKAT JALAN PELIKAT,183,JALAN PELIKAT,THE PROMENADE@PELIKAT,183 JALAN PELIKAT THE PROMENADE@PELIKAT SINGAP...,537643,HOUGANG
0,PRIMO RESIDENCES JALAN PELIKAT,137,JALAN PELIKAT,NIL,137 JALAN PELIKAT SINGAPORE 537620,537620,HOUGANG
...,...,...,...,...,...,...,...
0,457B SENGKANG WEST RD,457B,SENGKANG WEST ROAD,FERNVALE WOODS,457B SENGKANG WEST ROAD FERNVALE WOODS SINGAPO...,792457,SENGKANG
0,106B BIDADARI PK DR,106B,BIDADARI PARK DRIVE,ALKAFF VISTA,106B BIDADARI PARK DRIVE ALKAFF VISTA SINGAPOR...,342106,TOA PAYOH
0,458B SENGKANG WEST RD,458B,SENGKANG WEST ROAD,FERNVALE WOODS,458B SENGKANG WEST ROAD FERNVALE WOODS SINGAPO...,792458,SENGKANG
0,105A BIDADARI PK DR,105A,BIDADARI PARK DRIVE,ALKAFF VISTA,105A BIDADARI PARK DRIVE ALKAFF VISTA SINGAPOR...,341105,TOA PAYOH


In [86]:
import random
facilities = ['bbq', 'gym', 'tennis court', 'sky terrace',
    'jacuzzi', 'swimming pool', 'yoga corner','pavilion','fitness corner']

private_properties = properties_df.loc[properties_df['property_type']=='private'][['property_index']]
private_properties['facilities'] = [random.sample(facilities, np.random.randint(5,7)) for i in  range(len(private_properties))]

private_facilities = private_properties.explode('facilities', ignore_index=True)

## Nearby

In [25]:
nearby_df = unique_joined.copy()
nearby_df = nearby_df.rename(
    {'searchval': 'property_index', 'distance': 'distance_m'}, axis=1)
nearby_df = nearby_df[['property_index', 'type', 'name', 'distance_m']]

# Sales/Listing Data
- creation from transacted data

In [26]:
frac = 0.8
replace = True
hdb_sales = (hdb_df.sort_values('transaction_date', ascending=False).groupby(
    'property_index').first().sample(frac=frac, replace=replace))

private_sales = (private_df.sort_values('transaction_date', ascending=False).groupby(
    'property_index').first().sample(frac=frac, replace=replace))

In [27]:
listing_sales = pd.concat([hdb_sales, private_sales])

# adding room no
listing_sales['room_no'] = [
    i[0] if 'room' in i else 0 for i in listing_sales['property_sub_type']]
listing_sales['room_no'] = [x if x != 0 else np.clip(int(
    y/np.random.randint(15, 25)/sqm_2_sqrt), a_min=1, a_max=6) for x, y in zip(listing_sales['room_no'], listing_sales['area_sqft'])]
listing_sales['room_no'] = listing_sales['room_no'].astype('int')
# adding bathroom
listing_sales['bathroom_no'] = [np.clip(int(
    x/np.random.randint(35, 45)/sqm_2_sqrt), a_min=1, a_max=4) for x in listing_sales['area_sqft']]
listing_sales['bathroom_no'] = listing_sales['bathroom_no'].astype('int')
# floor
listing_sales['floor'] = [int(np.random.randint(x, y)) for x, y in zip(
    listing_sales['floor_low'], listing_sales['floor_high'])]
listing_sales['floor'] = listing_sales['floor'].astype('int')

listing_sales = listing_sales.drop(
    ['floor_low', 'floor_high', 'floor_level'], axis=1)

In [28]:
# sg rooms are typically of 10-30sqm area

In [29]:
listing_sales  # ['floor'].value_counts()

Unnamed: 0_level_0,transaction_date,property_sub_type,project_name,street_name,area_sqm,lease_commence_date,resale_price,area_sqft,property_type,hold_type,unitprice_psf,room_no,bathroom_no,floor
property_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
327 TAH CHING RD,2024-04-01,4 room,327,TAH CHING RD,103.000000,1997,503000,1108.692,HDB,leasehold,453.687769,4,2,14
318A ANCHORVALE LINK,2024-09-01,5 room,318A,ANCHORVALE LINK,110.000000,2003,618000,1184.040,HDB,leasehold,521.941826,5,3,4
326 CLEMENTI AVE 5,2024-08-01,3 room,326,CLEMENTI AVE 5,67.000000,1978,430000,721.188,HDB,leasehold,596.238429,3,1,1
670 JLN DAMAI,2024-04-01,5 room,670,JLN DAMAI,125.000000,1996,800000,1345.500,HDB,leasehold,594.574508,5,3,8
339D KANG CHING RD,2024-06-01,3 room,339D,KANG CHING RD,67.000000,2012,470000,721.188,HDB,leasehold,651.702469,3,1,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BAYOU RESIDENCE UPPER PAYA LEBAR ROAD,2022-08-01,condominium,BAYOU RESIDENCE,UPPER PAYA LEBAR ROAD,73.000000,,1000000,785.770,Private,freehold,1273.000000,3,1,4
CENTRAL VIEW HOUGANG STREET 11,2024-07-01,condominium,CENTRAL VIEW,HOUGANG STREET 11,117.001273,1998,1550000,1259.390,Private,leasehold,1231.000000,6,3,8
MIDTOWN RESIDENCES UPPER SERANGOON ROAD,2024-07-01,condominium,MIDTOWN RESIDENCES,UPPER SERANGOON ROAD,60.000557,2013,1030000,645.840,Private,leasehold,1595.000000,3,1,6
3@PHILLIPS PHILLIPS AVENUE,2024-06-01,condominium,3@PHILLIPS,PHILLIPS AVENUE,168.001375,1886,2048000,1808.350,Private,leasehold,1133.000000,6,4,4


# Output

In [32]:
properties_df.to_parquet('../data/L3/propeties.parquet')
# added town to the property table
nearby_df.to_parquet('../data/L3/nearby_facilities.parquet')
# TODO maybe add walking est/time too
transaction_sales.to_parquet('../data/L3/transactions_sales.parquet')
private_facilities.to_parquet('../data/L3/private_facilities.parquet')

listing_sales.to_parquet('../data/L3/listing_sales.parquet')