# Processing all utilities with additional lat/lon and discarding extra info

In [169]:
import pandas as pd
import numpy as np
# import polars as pl
import geopandas as gpd
from bs4 import BeautifulSoup

## OneMap

In [170]:
import requests
import os
import json

# Request a OneMap API access token using the credentials stored in the
# ONEMAP_EMAIL / ONEMAP_EMAIL_PASSWORD environment variables.
url = "https://www.onemap.gov.sg/api/auth/post/getToken"

payload = {
    "email": os.environ['ONEMAP_EMAIL'],
    "password": os.environ['ONEMAP_EMAIL_PASSWORD'],
}

# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() surfaces rejected credentials as an HTTP error instead of
# an opaque KeyError on the missing 'access_token' field below.
response = requests.post(url, json=payload, timeout=30)
response.raise_for_status()
access_token = json.loads(response.text)['access_token']

# Header reused by every later OneMap request in this notebook.
headers = {"Authorization": access_token}

# School

In [171]:
# Load the school list. postal_code is read as text because it is used verbatim
# as a search string below; letting pandas infer an integer dtype would drop
# any leading zeros and change the query.
school = pd.read_csv(
    "../data/raw_data/csv/datagov/Generalinformationofschools.csv",
    dtype={'postal_code': str},
)

In [172]:
import requests
import json
import pandas as pd
import time
import random

# Geocode every school postal code through the OneMap search API, keeping only
# the first (top-ranked) result of each query. One single-row DataFrame per
# successful lookup is collected in df_list.
df_list = []
max_retries = 3
initial_backoff = 1  # seconds
max_backoff = 32  # seconds
request_timeout = 30  # seconds; prevents a stalled connection from hanging the loop
search_url = "https://www.onemap.gov.sg/api/common/elastic/search"

for search_string in school['postal_code']:
    retries = 0
    success = False
    backoff = initial_backoff

    while not success and retries < max_retries:
        try:
            # Passing the query via `params` lets requests URL-encode it.
            response = requests.get(
                search_url,
                params={
                    "searchVal": search_string,
                    "returnGeom": "Y",
                    "getAddrDetails": "Y",
                    "pageNum": 1,
                },
                headers=headers,
                timeout=request_timeout,
            )
            response.raise_for_status()  # Raise an exception for HTTP errors
        except requests.RequestException:
            retries += 1
            if retries >= max_retries:
                # Out of attempts: do not announce or sleep for a retry that
                # will never happen.
                break
            delay = backoff + random.uniform(0, 1)  # Add some jitter to the delay
            print(f"Request failed for {search_string}. Retrying in {delay:.2f} seconds. (Retry {retries}/{max_retries})")
            time.sleep(delay)
            # Double only after sleeping so the first wait really is initial_backoff.
            backoff = min(backoff * 2, max_backoff)  # Exponential backoff
            continue

        success = True
        results = json.loads(response.text).get('results') or []
        if not results:
            # Report a query with no match rather than appending an empty frame.
            print(f"No results returned for {search_string}.")
            continue

        # Build the frame from the first result only, so the column assignment
        # below acts on a fresh object rather than a slice of a larger frame.
        _df = pd.DataFrame(results[:1])
        _df['NameAddress'] = search_string
        df_list.append(_df)

    if not success:
        print(f"Failed to retrieve data for {search_string} after {max_retries} retries.")

In [173]:
# Combine the per-school geocoding results into one frame with the common
# amenity columns (name / lat / lon / postal / address) plus a 'type' tag.
# ignore_index=True gives rows a unique 0..n-1 index; every frame in df_list
# carries index label 0, so a plain concat would repeat that label on each row.
# .copy() makes the column subset an independent frame before it is modified.
df_school = (
    pd.concat(df_list, ignore_index=True)
    .rename(columns={'SEARCHVAL':'name','LATITUDE':'lat','LONGITUDE':'lon','POSTAL':'postal','ADDRESS':'address'})
    .loc[:, ['name','lat','lon','postal','address']]
    .copy()
)
df_school['type'] = 'school'
# Lower-case the free-text fields with the vectorised string accessor.
df_school['address'] = df_school['address'].str.lower()
df_school['name'] = df_school['name'].str.lower()

# Alias that follows the <amenity>_df naming used by the other datasets.
school_df = df_school

# Kindergartens, gym, hawker

In [174]:
from bs4 import BeautifulSoup
import geopandas as gpd
import pandas as pd

def extract_html_name(html_str: str, name_search) -> str:
    """
    Pull a single value out of an HTML attribute table.

    The HTML is searched for the first <th> whose text equals ``name_search``;
    the text of the <td> that follows it is returned with surrounding
    whitespace removed.

    Args:
        html_str (str): The HTML string to parse.
        name_search: Header text identifying the wanted field (e.g. "NAME").

    Returns:
        str: The stripped text of the matching value cell.

    Raises:
        AttributeError: If no matching <th> exists, or no <td> follows it,
            because the lookup yields None in those cases.
    """
    header_cell = BeautifulSoup(html_str, 'html.parser').find('th', string=name_search)
    value_cell = header_cell.find_next('td')
    return value_cell.text.strip()

def parse_datagov_geojson(path: str, data_type: str, name_search:str='NAME') -> pd.DataFrame:
    """
    Load a GeoJSON layer and reduce it to name / type / lat / lon.

    The layer is reprojected to EPSG:4326, the y/x coordinates of each
    geometry become ``lat``/``lon``, and the display name is extracted from
    the HTML held in each feature's ``Description`` column.

    Args:
        path (str): The file path to the GeoJSON file.
        data_type (str): Label stored in the ``type`` column (e.g. "gym").
        name_search (str): Header text of the Description table row that holds
            the name. Defaults to "NAME".

    Returns:
        pd.DataFrame: A plain DataFrame with columns name, type, lat, lon.
    """
    gdf = gpd.read_file(path).to_crs('4326')
    points = gdf['geometry']
    gdf["lat"] = points.y
    gdf["lon"] = points.x
    gdf['type'] = data_type
    gdf['name'] = [
        extract_html_name(description_html, name_search)
        for description_html in gdf['Description']
    ]
    wanted_columns = ['name', 'type', 'lat', 'lon']
    return pd.DataFrame(gdf[wanted_columns])



In [175]:
# Kindergarten points; names are read from the default 'NAME' description field.
kindergarden_df = parse_datagov_geojson(
    path="../data/raw_data/csv/datagov/Kindergartens.geojson",
    data_type="kindergarden",
)

In [176]:
# Gym points; names are read from the default 'NAME' description field.
gym_df = parse_datagov_geojson(
    path="../data/raw_data/csv/datagov/GymsSGGEOJSON.geojson",
    data_type="gym",
)

In [177]:
# Hawker centre points; names are read from the default 'NAME' description field.
hawker_df = parse_datagov_geojson(
    path="../data/raw_data/csv/datagov/HawkerCentresGEOJSON.geojson",
    data_type="hawker",
)

In [178]:
# Water-activity points; names are read from the default 'NAME' description field.
water_activities_df = parse_datagov_geojson(
    path="../data/raw_data/csv/datagov/WaterActivitiesSG.geojson",
    data_type="water_activities",
)

In [179]:
# Supermarket points; this layer's name is held in the 'LIC_NAME' field.
supermarket_df = parse_datagov_geojson(
    path="../data/raw_data/csv/datagov/SupermarketsGEOJSON.geojson",
    data_type="supermarket",
    name_search="LIC_NAME",
)

In [180]:
# Preschool points; this layer's name is held in the 'CENTRE_NAME' field.
preschool_df = parse_datagov_geojson(
    path="../data/raw_data/csv/datagov/PreSchoolsLocation.geojson",
    data_type="preschool",
    name_search="CENTRE_NAME",
)

# park

In [181]:
# Load the park / nature-reserve layer, tag it as 'park', and export it with a
# representative lon/lat point per feature.
df = gpd.read_file("../data/raw_data/csv/datagov/NParksParksandNatureReserves.geojson")
df = df.set_crs(crs='epsg:4326')
# Work in a projected CRS so the centroid is computed on planar coordinates.
df = df.to_crs(crs=3857)
# Convert the centroids back to EPSG:4326 before reading x/y: the values stored
# in 'lon'/'lat' must be degrees, not the Web Mercator metres that the
# EPSG:3857 centroids carry. The centroid is computed once and reused.
centroids = df.centroid.to_crs(crs=4326)
df['lon'] = centroids.x
df['lat'] = centroids.y
df['type']='park'
df['name'] = [extract_html_name(i, 'NAME') for i in df['Description']]
# NOTE(review): the exported geometry column stays in EPSG:3857, as before —
# confirm that downstream readers expect that CRS.
park_df = df[['name','type','lon','lat','geometry']]
park_df.to_file("../data/L1/park.geojson", driver='GeoJSON')

In [182]:
# Park connector layer: strip the HTML 'Description' column and re-export the
# remaining columns unchanged as GeoJSON.
df = gpd.read_file(
    "../data/raw_data/csv/datagov/MasterPlan2019SDCPParkConnectorLinelayerGEOJSON.geojson"
)
park_connector_df = df.drop(columns='Description')
park_connector_df.to_file(
    "../data/L1/park_connector.geojson",
    driver='GeoJSON',
)

In [183]:
# Load the waterbody layer, keep only features of at least min_waterbody_area
# (area measured in the projected CRS), and export the result.
min_waterbody_area = 4000  # in squared units of EPSG:3857 (metres)

waterbody_df = gpd.read_file("../data/raw_data/csv/datagov/MasterPlan2019SDCPWaterbodylayerKML.kml")
waterbody_df = waterbody_df.to_crs('3857')
waterbody_df['area_m'] = waterbody_df.geometry.area
# drop=True: the old index is not needed, so avoid creating a throwaway
# 'index' column that the column selection below would discard anyway.
waterbody_df = waterbody_df[waterbody_df['area_m']>=min_waterbody_area].reset_index(drop=True)
waterbody_df = waterbody_df[['Name','geometry','area_m']]
# Write to a waterbody-specific file. This previously targeted
# '../data/L1/amenity.parqeut', the path of the combined amenity table, which
# the final cell writes itself.
waterbody_df.to_parquet('../data/L1/waterbody.parquet')
waterbody_df.to_file("../data/L1/waterbody.geojson", driver='GeoJSON')

# Concat

In [184]:
# Stack every point-amenity table into one frame with columns
# name / type / lat / lon. Each source appears exactly once (kindergarden_df
# used to be listed twice, which duplicated its rows). ignore_index=True gives
# the result a unique 0..n-1 index instead of repeating each source's labels.
df_combined = pd.concat(
    [
        school_df[['name','type','lat','lon']],
        kindergarden_df,
        gym_df,
        hawker_df,
        water_activities_df,
        supermarket_df,
        preschool_df,
    ],
    ignore_index=True,
)

In [185]:
# Cast both coordinate columns to float in a single astype call.
df_combined = df_combined.astype({'lat': 'float', 'lon': 'float'})

In [186]:
# Persist the combined amenity table.
# NOTE(review): the extension is spelled '.parqeut'; kept as-is because other
# code may read this exact path — confirm before renaming.
df_combined.to_parquet(path='../data/L1/amenity.parqeut')