In [None]:
import os
import h3
import googlemaps
from keplergl import KeplerGl
from datetime import datetime
from pyspark.sql import SparkSession, DataFrame, Row
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.errors import AnalysisException
from dotenv import load_dotenv

# Load settings from ../.env-deploy into the process environment; presumably this
# file supplies the POSTGRES_* and GOOGLE_MAPS_API_KEY values read later in the
# notebook (verify). override=True lets values from the file replace variables
# that are already set in the environment.
load_dotenv("../.env-deploy", override=True)

In [None]:
# Root directory for local data (JDBC driver jars, Hive warehouse, H3 parquet).
# Set MLSGPT_DATA_HOME to run on another machine; when it is unset or empty the
# original developer path is used, so existing setups behave exactly as before.
data_home = (
    os.environ.get("MLSGPT_DATA_HOME")
    or "/Users/kwesi/Desktop/ai/gpts/mlsgpt/data"
)
jar_files = ["postgresql-42.7.3.jar", "mysql-connector-j-8.0.33.jar"]
# "spark.jars" expects a single comma-separated string of jar paths.
jar_opts = ",".join(f"{data_home}/jars/{jar}" for jar in jar_files)
warehouse = f"{data_home}/warehouse"

# Build (or reuse, via getOrCreate) the notebook-wide Spark session with Hive
# support, the JDBC driver jars on the classpath and UTC as session time zone.
spark: SparkSession = (
    SparkSession.builder
    .appName("MLSGPT")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.shuffle.service.enabled", "true")
    .config("spark.sql.warehouse.dir", warehouse)
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.jars", jar_opts)
    .enableHiveSupport()
    .getOrCreate()
)
# Only surface errors from Spark's own logging to keep cell output readable.
spark.sparkContext.setLogLevel("ERROR")

def read_table(url:str, props:dict, table_name: str, ) -> "DataFrame | None":
    """Read a table over JDBC into a Spark DataFrame.

    Args:
        url: JDBC connection URL.
        props: JDBC connection properties passed through to Spark
            (the notebook supplies user, password and driver).
        table_name: Name of the table to read.

    Returns:
        The table as a DataFrame, or None when Spark raises an
        AnalysisException for the read. Callers must check for None.
    """
    try:
        return spark.read.jdbc(url=url, table=table_name, properties=props)
    except AnalysisException as e:
        # Best-effort by design: report and return None instead of aborting.
        # The underlying error is included because an AnalysisException is not
        # necessarily a missing table, and the bare message hid the real cause.
        print(f"Table {table_name} not found: {e}")
        return None
    
# Postgres connection settings are read from the environment.
pg_host = os.getenv("POSTGRES_HOST")
pg_port = os.getenv("POSTGRES_PORT")
pg_db = os.getenv("POSTGRES_DB")
pg_user = os.getenv("POSTGRES_USER")
pg_pass = os.getenv("POSTGRES_PASSWORD")

# Fail fast with a clear message: an unset variable would otherwise be
# formatted into the URL as the literal text "None" (or passed to the JDBC
# properties as None) and only fail later with a confusing connection error.
_pg_settings = {
    "POSTGRES_HOST": pg_host,
    "POSTGRES_PORT": pg_port,
    "POSTGRES_DB": pg_db,
    "POSTGRES_USER": pg_user,
    "POSTGRES_PASSWORD": pg_pass,
}
_pg_missing = [name for name, value in _pg_settings.items() if value is None]
if _pg_missing:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_pg_missing)
    )

pg_driver = "org.postgresql.Driver"
pg_url = f"jdbc:postgresql://{pg_host}:{pg_port}/{pg_db}"
pg_props = {"user": pg_user, "password": pg_pass, "driver": pg_driver}

# Precomputed H3 data keyed by property_id. Latitude/Longitude are dropped here,
# presumably because rsbr.property already carries them and keeping both copies
# would make the joined column names ambiguous -- TODO confirm.
h3_df = (
    spark.read.format("parquet")
    .load(f"{data_home}/h3/property.parquet")
    .drop("Latitude", "Longitude")
)

# read_table returns None when the JDBC read fails; stop here with an explicit
# error instead of an AttributeError from calling .join on None.
property_raw_df = read_table(pg_url, pg_props, "rsbr.property")
if property_raw_df is None:
    raise RuntimeError(
        "Could not load table rsbr.property; see the message printed by read_table."
    )

# Left join keeps every property row even when it has no H3 record.
prop_df = property_raw_df.join(h3_df, "property_id", "left")


In [None]:
def h3_index(lat:float, lng:float, resolution:int) -> str:
    """Return the H3 cell index containing a point.

    Args:
        lat: Latitude of the point.
        lng: Longitude of the point.
        resolution: H3 resolution of the returned cell.

    Returns:
        The H3 index of the cell, as returned by the h3 library.
    """
    # h3-py 3.x names this function geo_to_h3; 4.x renamed it latlng_to_cell.
    # Prefer the 3.x name so behaviour is unchanged where it exists.
    to_cell = getattr(h3, "geo_to_h3", None) or h3.latlng_to_cell
    return to_cell(lat, lng, resolution)

def geocode(address:str) -> dict:
    """Geocode a street address with the Google Maps Geocoding API.

    A new client is created on every call using the GOOGLE_MAPS_API_KEY
    environment variable.

    Args:
        address: Free-form address to look up.

    Returns:
        The "location" entry of the first result's geometry; callers in this
        notebook read its "lat" and "lng" keys.

    Raises:
        KeyError: If GOOGLE_MAPS_API_KEY is not set.
        IndexError: If the API returns no results for the address. The type is
            the same as the unguarded ``[0]`` lookup raised before, now with a
            message naming the address.
    """
    gmaps = googlemaps.Client(key=os.environ["GOOGLE_MAPS_API_KEY"])
    # Named `results` rather than `geocode` to avoid shadowing this function.
    results = gmaps.geocode(address)
    if not results:
        raise IndexError(f"No geocoding results for address: {address!r}")
    return results[0]["geometry"]["location"]


In [None]:
def find_nearest(address:str, resolution:int=11, k:int=4) -> tuple:
    """Geocode an address and collect the H3 cells around it.

    Args:
        address: Free-form address to geocode.
        resolution: H3 resolution used for the lookup (default 11).
        k: Ring distance, in cells, of the neighbourhood around the address
            cell. Defaults to 4, the value that was previously hard-coded.

    Returns:
        A ``(column, index, neighbors)`` tuple where ``column`` is the name of
        the H3 column for this resolution (e.g. ``"H3IndexR11"``), ``index`` is
        the H3 cell of the address and ``neighbors`` is the collection of cells
        within ``k`` rings of it, as returned by the h3 library.
    """
    loc = geocode(address)
    index = h3_index(loc["lat"], loc["lng"], resolution=resolution)
    # h3-py 3.x names this function k_ring; 4.x renamed it grid_disk.
    # Prefer the 3.x name so behaviour is unchanged where it exists.
    ring = getattr(h3, "k_ring", None) or h3.grid_disk
    neighbors = ring(index, k)
    # Column names carry a zero-padded, two-digit resolution suffix.
    column = f"H3IndexR{str(resolution).zfill(2)}"
    return column, index, neighbors

def find_closest_properties(address:str, prop_df:DataFrame, resolution:int=11):
    """Select the properties whose H3 cell lies in the neighbourhood of an address.

    Args:
        address: Free-form address to search around.
        prop_df: Property DataFrame containing the H3 column for ``resolution``
            plus the identifying and location columns selected below.
        resolution: H3 resolution used for the lookup (default 11).

    Returns:
        A DataFrame with property_id, StreetAddress, City, Latitude and
        Longitude for the matching rows.
    """
    h3_column, _address_cell, nearby_cells = find_nearest(address, resolution)
    # Keep only rows whose cell at this resolution is one of the nearby cells.
    in_neighbourhood = F.col(h3_column).isin(nearby_cells)
    output_columns = ("property_id", "StreetAddress", "City", "Latitude", "Longitude")
    return prop_df.where(in_neighbourhood).select(*output_columns)

In [None]:
# Sample addresses used to spot-check the nearest-property lookup.
addresses = [
    "440 Bathurst St, Toronto, ON M5T 2S6, Canada",
    "21 Lippincott St, Toronto, ON M5T 2R5, Canada",
    "5 Delabo Dr, North York, ON M3C 1W4, Canada",
    "760 Sheppard Ave W, North York, ON M3H 5T6, Canada",
    "2401 Keele St, North York, ON M6L 2N9, Canada",
    "71 Mitchell Pl, Newmarket, ON L3Y 0C7, Canada",
]

# Show up to five nearby properties per address, without truncating cell text.
for address in addresses:
    (
        find_closest_properties(address, prop_df)
        .show(n=5, truncate=False)
    )