In [1]:
import os
from typing import Optional

from dotenv import load_dotenv
from pyspark.errors import AnalysisException
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql import types as T

# SPARK_VERSION is set first, then the deploy env file is loaded. Because
# override=True is passed, values from ../.env-deploy take precedence over
# variables that already exist in the process environment.
os.environ.update({"SPARK_VERSION": "3.3"})
load_dotenv(dotenv_path="../.env-deploy", override=True)

True

In [2]:
import pydeequ

In [2]:
# Root directory holding the JDBC driver jars and the Spark SQL warehouse.
# It can be overridden through the MLSGPT_DATA_HOME environment variable so the
# notebook is not tied to a single machine; the default is the original path.
data_home = os.getenv("MLSGPT_DATA_HOME", "/Users/kwesi/Desktop/ai/gpts/mlsgpt/data")
jar_files = ["postgresql-42.7.3.jar", "mysql-connector-j-8.0.33.jar"]
# Comma-joined jar paths, handed to the spark.jars setting below.
jar_opts = ",".join(os.path.join(data_home, "jars", jar) for jar in jar_files)
warehouse = os.path.join(data_home, "warehouse")

# Build (or reuse) the session; getOrCreate() returns an existing session when
# one is already active in this kernel.
spark: SparkSession = (
    SparkSession.builder
    .appName("MLSGPT")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.shuffle.service.enabled", "true")
    .config("spark.sql.warehouse.dir", warehouse)
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.jars", jar_opts)
    .enableHiveSupport()
    .getOrCreate()
)
# Restrict Spark's own logging to ERROR so cell output stays readable.
spark.sparkContext.setLogLevel("ERROR")

24/06/07 09:34:11 WARN Utils: Your hostname, marley.local resolves to a loopback address: 127.0.0.1; using 10.0.0.135 instead (on interface en0)
24/06/07 09:34:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/06/07 09:34:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/07 09:34:12 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
def read_table(url: str, props: dict, table_name: str) -> Optional[DataFrame]:
    """Load one table over JDBC using the notebook-level ``spark`` session.

    Args:
        url: JDBC connection URL.
        props: JDBC connection properties, passed through unchanged to
            ``spark.read.jdbc``.
        table_name: Name of the table to read.

    Returns:
        The table as a DataFrame, or ``None`` when Spark raises an
        ``AnalysisException`` for the read. A message is printed in that case,
        so callers must be prepared for a ``None`` result.
    """
    try:
        return spark.read.jdbc(url=url, table=table_name, properties=props)
    except AnalysisException as e:
        # Print the exception text as well: an AnalysisException is not
        # necessarily a missing table, and the detail is needed to tell.
        # NOTE(review): driver-level failures may surface as a different
        # exception type and would not be caught here -- confirm.
        print(f"Table {table_name} not found: {e}")
        return None
    
# Connection settings are read from the environment. An unset variable is
# rendered as the literal text "None" inside the URL.
pg_url = (
    f"jdbc:postgresql://{os.getenv('POSTGRES_HOST')}"
    f":{os.getenv('POSTGRES_PORT')}/{os.getenv('POSTGRES_DB')}"
)
pg_props = {
    "user": os.getenv("POSTGRES_USER"),
    "password": os.getenv("POSTGRES_PASSWORD"),
    "driver": "org.postgresql.Driver",
}
# Full candidate list, kept under its own name so it is no longer silently
# overwritten; only the entries in `tables` are actually loaded.
all_tables = ["rsbr.boards", "rsbr.office", "rsbr.agent", "rsbr.property", "rsbr.property_rooms"]
tables = ["rsbr.property"]
# Key each result by the part after the schema prefix
# ("rsbr.property" -> "property"). A value is None when read_table failed.
dfs = {name.split(".")[1]: read_table(pg_url, pg_props, name) for name in tables}

In [None]:
# Column names of interest. Presumably columns of the property table -- TODO
# confirm; the list is not referenced by the other visible cells.
columns = [
    "Type", "PropertyType",
    "OwnershipType", "ConstructionArchitecturalStyle",
]

In [8]:
# Row count per distinct OwnershipType value in the property table.
ownership_type_counts = dfs["property"].groupBy("OwnershipType").count()
ownership_type_counts.show(truncate=False)

+----------------------+-----+
|OwnershipType         |count|
+----------------------+-----+
|Undivided Co-ownership|16   |
|Condominium/Strata    |9056 |
|NULL                  |1543 |
|Timeshare/Fractional  |3    |
|Freehold              |25725|
|Shares in Co-operative|42   |
|Life Lease            |15   |
|Cooperative           |20   |
|Condominium           |2228 |
|Leasehold Condo/Strata|37   |
|Leasehold             |186  |
|Other, See Remarks    |10   |
+----------------------+-----+



In [9]:
# Row count per distinct ConstructionStyleAttachment value in the property table.
style_attachment_counts = dfs["property"].groupBy("ConstructionStyleAttachment").count()
style_attachment_counts.show(truncate=False)

+---------------------------+-----+
|ConstructionStyleAttachment|count|
+---------------------------+-----+
|Up and down                |5    |
|Link                       |77   |
|NULL                       |11118|
|Semi-detached              |1360 |
|Detached                   |22126|
|Attached                   |4195 |
+---------------------------+-----+



In [5]:
# from pydeequ.analyzers import *

# analysisResult = AnalysisRunner(spark) \
#                     .onData(dfs["property"]) \
#                     .addAnalyzer(Size()) \
#                     .addAnalyzer(Completeness("b")) \
#                     .run()

# analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
# analysisResult_df.show()

In [6]:
# from pydeequ.profiles import *

# result = ColumnProfilerRunner(spark) \
#     .onData(dfs["property"]) \
#     .run()

# for col, profile in result.profiles.items():
#     print(profile)

In [7]:
# stats = dfs["property"].describe()

In [8]:
# stats.toPandas().transpose()

In [9]:
# dfs["property"].show()