In [12]:
import os
import h3
from pyspark.sql import SparkSession, DataFrame, Row
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.errors import AnalysisException
from dotenv import load_dotenv

# Load settings from ../.env-deploy into the process environment.
# override=True makes values from the file replace variables that are already set.
# NOTE(review): presumably this file supplies the POSTGRES_* variables read further
# down in the notebook — confirm.
load_dotenv("../.env-deploy", override=True)

True

In [2]:
# Root directory holding the JDBC driver jars, the Spark warehouse and the outputs.
# Set the MLSGPT_DATA_HOME environment variable to run this notebook on another
# machine; when it is unset the original local path is used, so existing runs
# behave exactly as before.
data_home = os.getenv("MLSGPT_DATA_HOME", "/Users/kwesi/Desktop/ai/gpts/mlsgpt/data")

# JDBC driver jars expected under <data_home>/jars.
jar_files = ["postgresql-42.7.3.jar", "mysql-connector-j-8.0.33.jar"]

# Comma-separated list of jar paths, passed to Spark via "spark.jars".
jar_opts = ",".join(f"{data_home}/jars/{jar}" for jar in jar_files)

# Location passed to Spark as "spark.sql.warehouse.dir".
warehouse = f"{data_home}/warehouse"

# Session-level Spark settings, applied in this order to the builder below.
spark_settings = {
    "spark.dynamicAllocation.enabled": "true",
    "spark.shuffle.service.enabled": "true",
    "spark.sql.warehouse.dir": warehouse,
    "spark.sql.session.timeZone": "UTC",
    # Comma-separated JDBC driver jars built from jar_files.
    "spark.jars": jar_opts,
}

session_builder = SparkSession.builder.appName("MLSGPT")
for setting_key, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)

# Create (or reuse) the Hive-enabled session used by every later cell.
spark: SparkSession = session_builder.enableHiveSupport().getOrCreate()

# Only show errors from here on to keep cell output readable.
spark.sparkContext.setLogLevel("ERROR")

24/06/04 21:38:07 WARN Utils: Your hostname, marley.local resolves to a loopback address: 127.0.0.1; using 10.0.0.135 instead (on interface en0)
24/06/04 21:38:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/06/04 21:38:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
def read_table(url: str, props: dict, table_name: str) -> "DataFrame | None":
    """Read a table over JDBC into a Spark DataFrame.

    Args:
        url: JDBC connection URL.
        props: JDBC connection properties, forwarded to ``spark.read.jdbc``.
        table_name: Name of the table to read.

    Returns:
        The table as a DataFrame, or ``None`` when Spark raises an
        ``AnalysisException`` while reading it. Callers must check for
        ``None`` before chaining DataFrame operations.
    """
    try:
        return spark.read.jdbc(url=url, table=table_name, properties=props)
    except AnalysisException as exc:
        # AnalysisException is not limited to missing tables, so report the
        # actual cause instead of assuming "not found".
        print(f"Table {table_name} could not be read: {exc}")
        return None
    
# PostgreSQL connection settings, all taken from the environment.
_pg_host = os.getenv("POSTGRES_HOST")
_pg_port = os.getenv("POSTGRES_PORT")
_pg_db = os.getenv("POSTGRES_DB")

# JDBC URL in the form jdbc:postgresql://<host>:<port>/<database>.
pg_url = f"jdbc:postgresql://{_pg_host}:{_pg_port}/{_pg_db}"

# Connection properties handed to spark.read.jdbc.
pg_props = dict(
    user=os.getenv("POSTGRES_USER"),
    password=os.getenv("POSTGRES_PASSWORD"),
    driver="org.postgresql.Driver",
)

In [31]:
@F.udf("string")
def h3_index(lat: float, lon: float, resolution: int) -> str:
    """Spark UDF returning the H3 cell index for a latitude/longitude pair.

    Args:
        lat: Latitude, passed as the first argument to ``h3.geo_to_h3``.
        lon: Longitude, passed as the second argument to ``h3.geo_to_h3``.
        resolution: H3 resolution of the cell to return.

    Returns:
        The H3 index string, or ``None`` (SQL NULL) when any input is NULL.
    """
    # Spark hands SQL NULLs to Python UDFs as None; h3 cannot index a missing
    # coordinate and would raise, failing the whole job. Propagate NULL instead.
    if lat is None or lon is None or resolution is None:
        return None
    return h3.geo_to_h3(lat, lon, resolution)

# H3 resolutions to index, finest first; this order also fixes the column order
# (H3IndexR10, H3IndexR09, ..., H3IndexR05).
h3_resolutions = (10, 9, 8, 7, 6, 5)

property_raw = read_table(pg_url, pg_props, "rsbr.property")
if property_raw is None:
    # read_table returns None when the read fails; stop here with a clear
    # message instead of an AttributeError on None further down the chain.
    raise RuntimeError("Could not read table rsbr.property; cannot build H3 indexes")

# Keep the identifying columns and cast the coordinates to double for the UDF.
df = property_raw.select(
    "property_id",
    "ListingID",
    "City",
    F.col("Latitude").cast(T.DoubleType()),
    F.col("Longitude").cast(T.DoubleType()),
)

# Add one H3 index column per resolution, zero-padded to two digits in the name.
for resolution in h3_resolutions:
    df = df.withColumn(
        f"H3IndexR{resolution:02d}",
        h3_index(F.col("Latitude"), F.col("Longitude"), F.lit(resolution)),
    )

In [32]:
# Columns whose number of distinct values is reported after the row total.
distinct_count_columns = (
    "City",
    "H3IndexR10",
    "H3IndexR09",
    "H3IndexR08",
    "H3IndexR07",
    "H3IndexR06",
    "H3IndexR05",
)

# Tuple of (label, count) pairs: total rows first, then one distinct count per column.
(
    ("Total", df.count()),
    *((column_name, df.select(column_name).distinct().count()) for column_name in distinct_count_columns),
)

(('Total', 38881),
 ('City', 696),
 ('H3IndexR10', 24898),
 ('H3IndexR09', 18272),
 ('H3IndexR08', 9429),
 ('H3IndexR07', 4460),
 ('H3IndexR06', 1803),
 ('H3IndexR05', 548))

In [33]:
# Persist the indexed properties as parquet, replacing any previous output.
property_h3_path = f"{data_home}/h3/property.parquet"
parquet_writer = df.write.mode("overwrite")
parquet_writer.parquet(property_h3_path)

                                                                                

In [35]:
# Preview the first five rows, one field per line, without truncating values.
df.show(n=5, truncate=False, vertical=True)

-RECORD 0----------------------
 property_id | 6014467         
 ListingID   | 26638517        
 City        | toronto         
 Latitude    | 43.6740303      
 Longitude   | -79.3904724     
 H3IndexR10  | 8a2b9bc702d7fff 
 H3IndexR09  | 892b9bc702fffff 
 H3IndexR08  | 882b9bc703fffff 
 H3IndexR07  | 872b9bc70ffffff 
 H3IndexR06  | 862b9bc77ffffff 
 H3IndexR05  | 852b9bc7fffffff 
-RECORD 1----------------------
 property_id | 6014480         
 ListingID   | 26614087        
 City        | toronto         
 Latitude    | 43.7763214      
 Longitude   | -79.414032      
 H3IndexR10  | 8a2b9bc21a87fff 
 H3IndexR09  | 892b9bc21abffff 
 H3IndexR08  | 882b9bc21bfffff 
 H3IndexR07  | 872b9bc21ffffff 
 H3IndexR06  | 862b9bc27ffffff 
 H3IndexR05  | 852b9bc3fffffff 
-RECORD 2----------------------
 property_id | 6014481         
 ListingID   | 26458403        
 City        | toronto         
 Latitude    | 43.7593422      
 Longitude   | -79.3321152     
 H3IndexR10  | 8a2b9bd5132ffff 
 H3Index