# procesing all utilities with additional latlon and discard extra info

In [1]:
import pandas as pd
import numpy as np
# import polars as pl
import geopandas as gpd
from bs4 import BeautifulSoup

## OneMap

Create one map header

In [2]:
import requests
import os
import json

url = "https://www.onemap.gov.sg/api/auth/post/getToken"

payload = {
    "email": os.environ['ONEMAP_EMAIL'],
    "password": os.environ['ONEMAP_EMAIL_PASSWORD']
}

response = requests.request("POST", url, json=payload)
access_token = json.loads(response.text)['access_token']
headers = {"Authorization": f"{access_token}"}

# School

In [3]:
# config
school_query_onemap = False

In [4]:
import requests
import json
import pandas as pd
from time import sleep
from tqdm import tqdm
from random import uniform


def extract_df_data(school_df, search_cols, initial_backoff, max_retries, max_backoff, headers):
    """
    Extracts data from OneMap API for each search_cols in a pandas DataFrame.

    Args:
        school_df (pandas.DataFrame): DataFrame containing a column named 'postal_code'.
        initial_backoff (float): Initial delay for exponential backoff in seconds.
        max_retries (int): Maximum number of retries for failed requests.
        headers (dict): Dictionary containing headers for API requests.

    Returns:
        list: List of DataFrames containing extracted data for each successful request.
    """

    df_list = []
    for search_string in tqdm(school_df[search_cols], desc="Extracting Data"):
        retries = 0
        success = False
        backoff = initial_backoff

        while not success and retries < max_retries:
            try:
                url = f"https://www.onemap.gov.sg/api/common/elastic/search?searchVal={search_string}&returnGeom=Y&getAddrDetails=Y&pageNum=1"
                response = requests.get(url, headers=headers)
                # print(f"{search_string}")
                response.raise_for_status()  # Raise an exception for HTTP errors
                data = json.loads(response.text)
                _df = pd.DataFrame(data['results']).iloc[0:1]
                _df['NameAddress'] = search_string
                df_list.append(_df)
                success = True

            except:
                # print(f"No results found for postal code {search_string}. Retry.")
                retries += 1
                backoff = min(backoff * 2, max_backoff)  # Exponential backoff
                delay = backoff + uniform(0, 1)  # Add jitter to the delay
                print(
                    f"Request failed for {search_string}. Retrying in {delay:.2f} seconds. (Retry {retries}/{max_retries})")
                sleep(delay)

        if not success:
            print(
                f"Failed to retrieve data for {search_string} after {max_retries} retries.")

    return df_list

In [5]:
import requests
import json
import pandas as pd
from time import sleep
from random import uniform

# Define maximum retries and initial/maximum backoff times
max_retries = 3
initial_backoff = 1  # seconds
max_backoff = 32  # seconds

In [6]:
def get_school_data_from_onemap(school_df):
    """
    Retrieves school data from OneMap API for each postal code in the 'postal_code' column of the provided DataFrame.

    Args:
        school_df (pd.DataFrame): DataFrame containing a 'postal_code' column.

    Returns:
        pd.DataFrame: DataFrame containing school data, including 'name', 'lat', 'lon', etc.
    """
    df_list = extract_df_data(school_df,
                              search_cols='postal_code',
                              initial_backoff=initial_backoff,
                              max_retries=max_retries,
                              max_backoff=max_backoff,
                              headers=headers)
    if df_list:  # Check if any data was retrieved
        df_school = pd.concat(df_list).rename({
            'SEARCHVAL': 'name',
            'LATITUDE': 'lat',
            'LONGITUDE': 'lon',
            'POSTAL': 'postal',
            'ADDRESS': 'address'
        }, axis=1)
        df_school = df_school[['name', 'lat', 'lon', 'postal', 'address']]
        df_school['type'] = 'school'
        df_school['address'] = [i.lower() for i in df_school['address']]
        df_school['name'] = [i.lower() for i in df_school['name']]
    else:
        print("No school data retrieved from OneMap API.")

    return df_school

In [7]:
if school_query_onemap:
    school = pd.read_csv(
        "../data/raw_data/csv/datagov/Generalinformationofschools.csv")
    school_df = get_school_data_from_onemap(school)
    school_df.to_parquet('../data/L1/school_queried.parqeut')

else:
    # Read from parquet file if not querying OneMap API
    try:
        school_df = pd.read_parquet("../data/L1/school_queried.parqeut")
        print("School data read from parquet file.")
    except FileNotFoundError:
        print("School data parquet file not found. Please ensure it exists or query OneMap API.")

School data read from parquet file.


# Mall

In [8]:
# config
mall_query_onemap = False

In [9]:
if mall_query_onemap:
    mall = pd.read_parquet("../data/raw_data/datawiki_shopping_mall.parquet")
    df_list = extract_df_data(mall,
                              search_cols='shopping_mall',
                              initial_backoff=initial_backoff,
                              max_retries=max_retries,
                              max_backoff=max_backoff,
                              headers=headers)
    if df_list:  # Check if any data was retrieved
        df_mall = pd.concat(df_list).rename({
            'SEARCHVAL': 'name',
            'LATITUDE': 'lat',
            'LONGITUDE': 'lon',
            'POSTAL': 'postal',
            'ADDRESS': 'address'
        }, axis=1)
        df_mall = df_mall[['name', 'lat', 'lon', 'postal', 'address']]
        df_mall['type'] = 'mall'
        df_mall['address'] = [i.lower() for i in df_mall['address']]
        df_mall['name'] = [i.lower() for i in df_mall['name']]

        df_mall.to_parquet('../data/L1/mall_queried.parqeut')
else:
    df_mall = pd.read_parquet("../data/L1/mall_queried.parqeut")

# Kindergardens, gym, hawker

In [10]:
from bs4 import BeautifulSoup
import geopandas as gpd
import pandas as pd


def extract_html_name(html_str: str, name_search) -> str:
    """
    Extracts the name from an HTML string.

    Args:
        html_str (str): The HTML string to parse.

    Returns:
        str: The extracted name.
    """
    soup = BeautifulSoup(html_str, 'html.parser')
    name_cell = soup.find('th', string=name_search).find_next('td')
    name = name_cell.text.strip()
    return name


def parse_datagov_geojson(path: str, data_type: str, name_search: str = 'NAME') -> pd.DataFrame:
    """
    Parses a GeoJSON file and extracts relevant data.

    Args:
        path (str): The file path to the GeoJSON file.
        data_type (str): The type of data (e.g., "kindergardens").

    Returns:
        pd.DataFrame: A DataFrame with the extracted data.
    """
    df = gpd.read_file(path)
    df = df.to_crs('4326')
    df["lat"] = df['geometry'].y
    df["lon"] = df['geometry'].x
    df['type'] = data_type
    df['name'] = [extract_html_name(i, name_search) for i in df['Description']]
    return pd.DataFrame(df[['name', 'type', 'lat', 'lon']])

In [11]:
kindergarden_df = parse_datagov_geojson(
    "../data/raw_data/csv/datagov/Kindergartens.geojson", "kindergarden")

In [12]:
gym_df = parse_datagov_geojson(
    "../data/raw_data/csv/datagov/GymsSGGEOJSON.geojson", "gym")

In [13]:
hawker_df = parse_datagov_geojson(
    "../data/raw_data/csv/datagov/HawkerCentresGEOJSON.geojson", "hawker")

In [14]:
water_activities_df = parse_datagov_geojson(
    "../data/raw_data/csv/datagov/WaterActivitiesSG.geojson", "water_activities")

In [15]:
supermarket_df = parse_datagov_geojson(
    "../data/raw_data/csv/datagov/SupermarketsGEOJSON.geojson", "supermarket", "LIC_NAME")

In [16]:
preschool_df = parse_datagov_geojson(
    "../data/raw_data/csv/datagov/PreSchoolsLocation.geojson", "preschool", "CENTRE_NAME")

# park

In [17]:
df = gpd.read_file(
    "../data/raw_data/csv/datagov/NParksParksandNatureReserves.geojson")
df = df.set_crs(crs='epsg:4326')
df = df.to_crs(crs=3857)
df['lon'] = df.centroid.x
df['lat'] = df.centroid.y
df['type'] = 'park'
df['name'] = [extract_html_name(i, 'NAME') for i in df['Description']]
park_df = df[['name', 'type', 'lon', 'lat', 'geometry']]
park_df.to_file("../data/L1/park.geojson", driver='GeoJSON')

In [18]:
df = gpd.read_file(
    "../data/raw_data/csv/datagov/MasterPlan2019SDCPParkConnectorLinelayerGEOJSON.geojson")
park_connector_df = df.drop('Description', axis=1)
park_connector_df.to_file(
    "../data/L1/park_connector.geojson", driver='GeoJSON')

In [19]:
waterbody_df = gpd.read_file(
    "../data/raw_data/csv/datagov/MasterPlan2019SDCPWaterbodylayerKML.kml")
waterbody_df = waterbody_df.to_crs('3857')
waterbody_df['area_m'] = waterbody_df.geometry.area
waterbody_df = waterbody_df[waterbody_df['area_m'] >= 4000].reset_index()
waterbody_df = waterbody_df[['Name', 'geometry', 'area_m']]
waterbody_df.to_parquet('../data/L1/amenity.parqeut')
waterbody_df.to_file("../data/L1/waterbody.geojson", driver='GeoJSON')

# Concat

In [20]:
df_combined = pd.concat([
    school_df[['name', 'type', 'lat', 'lon']],
    df_mall[['name', 'type', 'lat', 'lon']],
    kindergarden_df,
    gym_df,
    hawker_df,
    kindergarden_df,
    water_activities_df,
    supermarket_df,
    preschool_df
])

In [21]:
df_combined['lat'] = df_combined['lat'].astype('float')
df_combined['lon'] = df_combined['lon'].astype('float')

In [22]:
df_combined['type'].value_counts()

type
preschool           2290
kindergarden         896
supermarket          526
school               336
mall                 169
gym                  159
hawker               125
water_activities      32
Name: count, dtype: int64

In [23]:
df_combined.to_parquet('../data/L1/amenity.parqeut')