In [1]:
import os
from typing import Optional

from dotenv import load_dotenv
from pyspark.errors import AnalysisException
from pyspark.sql import SparkSession, DataFrame, Row
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Pin SPARK_VERSION in the process environment before the .env file is read.
# NOTE(review): presumably consumed by a downstream library — confirm which one.
os.environ.update({"SPARK_VERSION": "3.3"})
# override=True lets values from the .env file replace variables that are
# already set in the environment. Kept as the last expression so the cell
# still displays load_dotenv's return value.
load_dotenv(dotenv_path="../.env-deploy", override=True)

True

In [2]:
# Root directory holding the JDBC driver jars and the SQL warehouse.
# Overridable through the MLSGPT_DATA_HOME environment variable so the notebook
# is not tied to one machine; the default keeps the original location.
data_home = os.getenv("MLSGPT_DATA_HOME", "/Users/kwesi/Desktop/ai/gpts/mlsgpt/data")
jar_files = ["postgresql-42.7.3.jar", "mysql-connector-j-8.0.33.jar"]
# spark.jars takes a comma-separated list of jar paths.
jar_opts = ",".join(f"{data_home}/jars/{jar}" for jar in jar_files)
warehouse = f"{data_home}/warehouse"

# Build (or reuse, via getOrCreate) the session used by every later cell.
spark: SparkSession = (
    SparkSession.builder
    .appName("MLSGPT")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.shuffle.service.enabled", "true")
    .config("spark.sql.warehouse.dir", warehouse)
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.jars", jar_opts)
    .enableHiveSupport()
    .getOrCreate()
)
# Only surface errors from Spark's own logging in the notebook output.
spark.sparkContext.setLogLevel("ERROR")

24/05/18 19:14:24 WARN Utils: Your hostname, marley.local resolves to a loopback address: 127.0.0.1; using 10.0.0.135 instead (on interface en0)
24/05/18 19:14:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/05/18 19:14:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
def read_table(url: str, props: dict, table_name: str, ) -> Optional[DataFrame]:
    """Read a table over JDBC using the notebook-level ``spark`` session.

    Args:
        url: JDBC connection URL.
        props: Connection properties forwarded to ``spark.read.jdbc``.
        table_name: Name of the table to read.

    Returns:
        The resulting DataFrame, or ``None`` when Spark raises an
        ``AnalysisException`` while setting up the read.
    """
    try:
        return spark.read.jdbc(url=url, table=table_name, properties=props)
    except AnalysisException as e:
        # Best-effort: report and return None so the notebook keeps running.
        # The underlying error is printed because AnalysisException is not
        # specific to a missing table.
        print(f"Table {table_name} could not be read: {e}")
        return None
    
# JDBC URL assembled from the host / port / database environment variables.
pg_url = (
    f"jdbc:postgresql://{os.getenv('POSTGRES_HOST')}"
    f":{os.getenv('POSTGRES_PORT')}"
    f"/{os.getenv('POSTGRES_DB')}"
)
# Connection properties handed to the JDBC reader; credentials come from the
# environment rather than being written into the notebook.
pg_props = dict(
    user=os.getenv("POSTGRES_USER"),
    password=os.getenv("POSTGRES_PASSWORD"),
    driver="org.postgresql.Driver",
)
# Table names in the rsbr schema; only rsbr.property is loaded in this cell.
tables = [
    "rsbr.boards",
    "rsbr.office",
    "rsbr.agent",
    "rsbr.property",
    "rsbr.property_rooms",
]
df = read_table(url=pg_url, props=pg_props, table_name="rsbr.property")

In [9]:
# List each distinct value of the ArchitecturalStyle column (nulls included).
(
    df.select(F.col("ArchitecturalStyle"))
    .distinct()
    .show()
)

+--------------------+
|  ArchitecturalStyle|
+--------------------+
|             2 Level|
| 2 Level, Multi-Unit|
|                Loft|
|         Mobile Home|
|               Other|
|Bungalow, Mobile ...|
|   Multi-Unit, Other|
|             Cottage|
| 3 Level, Multi-Unit|
|              Chalet|
|          Multi-Unit|
|     Log house/cabin|
|Bungalow, Multi-Unit|
|     Raised bungalow|
|            Bungalow|
|  Mobile Home, Other|
|        Contemporary|
|             3 Level|
|                NULL|
+--------------------+



In [None]:
# Per-column completeness summary.
# The first row of describe() is the "count" statistic: the number of non-null
# values for each column it summarises, keyed by column name, plus a "summary"
# key that labels the row.
total = df.count()
rows = [
    Row(
        column=name,
        count=int(non_null),
        # Share of rows where the column is null: 1 - non_null / total.
        # An empty table has no nulls to report, so fall back to 0.0 rather
        # than dividing by zero.
        null_ratio=round(1 - int(non_null) / total, 4) if total else 0.0,
    )
    for name, non_null in df.describe().collect()[0].asDict().items()
    if name != "summary"  # skip the row label, keep only real columns
]
df4 = spark.createDataFrame(rows)

In [None]:
# Collect the summary to the driver as a pandas frame and write it out as CSV,
# without the pandas index column.
df4.toPandas().to_csv(
    path_or_buf="../data/summary.csv",
    index=False,
)

In [None]:
# Preview the Board column (show() prints the first 20 rows by default).
df.select(F.col("Board")).show()

In [None]:
# Read the CSV of columns to retain; its first line is treated as the header.
df5 = (
    spark.read
    .option("header", True)
    .csv("../data/columns_to_keep.csv")
)

In [None]:
# Pull the values of the CSV's "column" field to the driver, then project the
# property table down to exactly those columns.
columns = [record["column"] for record in df5.collect()]
df6 = df.select(*columns)

In [None]:
# Print the schema of the column-reduced frame to check the projection.
df6.printSchema()