In [1]:
import os
import h3
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.errors import AnalysisException
from dotenv import load_dotenv

# Load settings from ../.env-deploy into the process environment.
# override=True lets values from the file replace variables that are already set.
load_dotenv("../.env-deploy", override=True)

True

In [2]:
# Root directory for local artefacts (JDBC driver jars and the Hive warehouse).
# It can be overridden with the MLSGPT_DATA_HOME environment variable so the
# notebook is not tied to one machine; the default keeps the original location.
data_home = os.getenv("MLSGPT_DATA_HOME", "/Users/kwesi/Desktop/ai/gpts/mlsgpt/data")
jar_files = ["postgresql-42.7.3.jar", "mysql-connector-j-8.0.33.jar"]
# spark.jars takes a single comma-separated string of jar paths.
jar_opts = ",".join(f"{data_home}/jars/{jar}" for jar in jar_files)
warehouse = f"{data_home}/warehouse"

# NOTE(review): dynamic allocation and the external shuffle service are
# cluster-oriented settings — confirm they are needed for the way this
# notebook is actually run.
spark: SparkSession = (
    SparkSession.builder
    .appName("MLSGPT")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.shuffle.service.enabled", "true")
    .config("spark.sql.warehouse.dir", warehouse)
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.jars", jar_opts)
    .enableHiveSupport()
    .getOrCreate()
)
# Only surface errors; Spark's default WARN output is noisy in a notebook.
spark.sparkContext.setLogLevel("ERROR")

24/06/06 20:29:58 WARN Utils: Your hostname, marley.local resolves to a loopback address: 127.0.0.1; using 10.0.0.135 instead (on interface en0)
24/06/06 20:29:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/06/06 20:29:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 55754)
Traceback (most recent call last):
  File "/Users/kwesi/.pyenv/versions/3.12.1/lib/python3.12/socketserver.py", line 318, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Users/kwesi/.pyenv/versions/3.12.1/lib/python3.12/socketserver.py", line 349, in process_request
    self.finish_request(request, client_address)
  File "/Users/kwesi/.pyenv/versions/3.12.1/lib/python3.12/socketserver.py", line 362, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Users/kwesi/.pyenv/versions/3.12.1/lib/python3.12/socketserver.py", line 761, in __init__
    self.handle()
  File "/Users/kwesi/Desktop/ai/gpts/mlsgpt/.venv/lib/python3.12/site-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/Users/kwesi/Desktop/ai/gpts/mlsgpt/.venv/lib/python3.12/site-packages/pyspark/accumulators.

In [3]:
def read_table(url:str, props:dict, table_name: str, ) -> DataFrame | None:
    """Load a table over JDBC into a Spark DataFrame.

    Args:
        url: JDBC connection URL.
        props: JDBC connection properties (for example user, password, driver).
        table_name: Name of the table to read, optionally schema-qualified.

    Returns:
        The table as a DataFrame, or ``None`` when Spark raises an
        ``AnalysisException`` while resolving the table. Callers must check
        for ``None`` before chaining DataFrame operations.

    NOTE(review): errors raised by the JDBC driver itself may surface as other
    exception types and are not caught here — confirm against the driver in use.
    """
    try:
        return spark.read.jdbc(url=url, table=table_name, properties=props)
    except AnalysisException as e:
        # Keep the best-effort contract (return None) but include the reason,
        # since AnalysisException covers more than a missing table.
        print(f"Table {table_name} not found: {e}")
        return None
    
# Postgres connection settings are read from the process environment.
pg_host = os.getenv("POSTGRES_HOST")
pg_port = os.getenv("POSTGRES_PORT")
pg_db = os.getenv("POSTGRES_DB")
pg_user = os.getenv("POSTGRES_USER")
pg_pass = os.getenv("POSTGRES_PASSWORD")

# os.getenv returns None for unset variables, which would otherwise be
# interpolated into the JDBC URL as the literal text "None" and only fail
# later, at connection time. Report the gap here, where it is easy to diagnose.
pg_missing_vars = [
    name
    for name, value in (
        ("POSTGRES_HOST", pg_host),
        ("POSTGRES_PORT", pg_port),
        ("POSTGRES_DB", pg_db),
        ("POSTGRES_USER", pg_user),
        ("POSTGRES_PASSWORD", pg_pass),
    )
    if value is None
]
if pg_missing_vars:
    print(f"Missing Postgres environment variables: {', '.join(pg_missing_vars)}")

pg_url = f"jdbc:postgresql://{pg_host}:{pg_port}/{pg_db}"
pg_props = {"user": pg_user, "password": pg_pass, "driver": "org.postgresql.Driver"}

In [4]:
@F.udf("string")
def h3_index(lat:float, lng:float, resolution:int) -> str:
    """Spark UDF returning the H3 cell index for a coordinate.

    Args:
        lat: Latitude passed to ``h3.geo_to_h3``.
        lng: Longitude passed to ``h3.geo_to_h3``.
        resolution: H3 resolution passed to ``h3.geo_to_h3``.

    Returns:
        The value returned by ``h3.geo_to_h3``, exposed to Spark as a string
        column. Returns ``None`` (a NULL cell) when any input is NULL.
    """
    # Spark hands SQL NULLs to Python UDFs as None; without this guard a single
    # row with a missing coordinate raises inside h3 and fails the whole task.
    if lat is None or lng is None or resolution is None:
        return None
    return h3.geo_to_h3(lat, lng, resolution)
# Columns kept from the source table. Latitude/Longitude are cast to double
# before being handed to the h3_index UDF.
columns = [
    "property_id", "ListingID", "City",
    F.col("Latitude").cast(T.DoubleType()),
    F.col("Longitude").cast(T.DoubleType())
]

# read_table returns None when the table cannot be resolved; fail here with a
# clear message instead of an AttributeError on the chained .select().
property_raw = read_table(pg_url, pg_props, "rsbr.property")
if property_raw is None:
    raise RuntimeError("Table rsbr.property could not be read; cannot build H3 indexes")

# Add one H3 index column per resolution, finest (15) to coarsest (0), named
# H3IndexR15 ... H3IndexR00. The cast must be applied by the select first so
# that the UDF sees the double-typed coordinates.
df = property_raw.select(*columns)
for h3_resolution in range(15, -1, -1):
    df = df.withColumn(
        f"H3IndexR{h3_resolution:02d}",
        h3_index(F.col("Latitude"), F.col("Longitude"), F.lit(h3_resolution)),
    )

In [5]:
# Total row count followed by the number of distinct values in City and in
# every other H3 resolution column from 15 down to 5. Each entry runs its own
# Spark job.
distinct_count_columns = (
    "City",
    "H3IndexR15",
    "H3IndexR13",
    "H3IndexR11",
    "H3IndexR09",
    "H3IndexR07",
    "H3IndexR05",
)
(
    ("Total", df.count()),
    *(
        (column_name, df.select(column_name).distinct().count())
        for column_name in distinct_count_columns
    ),
)

                                                                                

(('Total', 38881),
 ('City', 696),
 ('H3IndexR15', 31518),
 ('H3IndexR13', 30894),
 ('H3IndexR11', 28375),
 ('H3IndexR09', 18272),
 ('H3IndexR07', 4460),
 ('H3IndexR05', 548))

In [6]:
# Persist the identifiers together with every H3 index column as parquet,
# replacing any previous output at the same path.
id_columns = ["property_id", "ListingID"]
h3_columns = [
    "H3IndexR15", "H3IndexR14", "H3IndexR13", "H3IndexR12",
    "H3IndexR11", "H3IndexR10", "H3IndexR09", "H3IndexR08",
    "H3IndexR07", "H3IndexR06", "H3IndexR05", "H3IndexR04",
    "H3IndexR03", "H3IndexR02", "H3IndexR01", "H3IndexR00",
]
columns = id_columns + h3_columns

property_h3 = df.select(*columns)
parquet_writer = property_h3.write.format("parquet").mode("overwrite")
parquet_writer.save(f"{data_home}/h3/property.parquet")

                                                                                

24/06/07 02:04:35 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE