In [0]:
# 00_seed_providers_postgres.ipynb
# Goal: copy providers.csv into the local Postgres instance (table = providers).

import os
import socket, time

# Connection target for the local Postgres instance.
HOST = "localhost"
PORT = 5432

# Location of the providers CSV that gets copied into Postgres.
CSV_PATH = "dbfs:/FileStore/tables/providers.csv"

# Password comes from the environment; a missing POSTGRES_PW raises KeyError here.
PG_PW = os.environ["POSTGRES_PW"]

In [0]:
# 1. Wait (about 60s at most) for Postgres to accept connections
#
# The probe is a plain TCP connect to (HOST, PORT): success only shows that
# something is listening on that port, not that credentials or the database
# are valid. Those are exercised by the JDBC cells further down.
WAIT_TIMEOUT_S = 60     # overall budget for the server to come up
PROBE_INTERVAL_S = 2    # per-attempt connect timeout and pause between attempts

# Use the monotonic clock so a wall-clock adjustment (NTP sync, manual change)
# cannot shorten or extend the wait.
deadline = time.monotonic() + WAIT_TIMEOUT_S
last_error = None
while time.monotonic() < deadline:
    try:
        # The context manager closes the probe socket even if something
        # unexpected happens after the connect succeeds.
        with socket.create_connection((HOST, PORT), timeout=PROBE_INTERVAL_S):
            pass
    except OSError as exc:
        # Refused / unreachable / timed out: remember why, then retry.
        last_error = exc
        time.sleep(PROBE_INTERVAL_S)
    else:
        print("PostgreSQL is accepting connections.")
        break
else:
    # The loop ran out of time without a successful connect; chain the last
    # connection error so the traceback shows the underlying cause.
    raise TimeoutError(
        f"PostgreSQL did not start within {WAIT_TIMEOUT_S}s. Check init logs."
    ) from last_error

In [0]:
# 2. Load the CSV in Spark
# The first line of the file is treated as the header row (column names).
df = spark.read.csv(CSV_PATH, header=True)

In [0]:
# 3. Overwrite the Providers table in Postgres
# Collect the JDBC settings in one place, then hand them to the writer.
jdbc_write_options = {
    "url": f"jdbc:postgresql://{HOST}:{PORT}/postgres",
    "dbtable": "providers",
    "user": "postgres",
    "password": PG_PW,
    "driver": "org.postgresql.Driver",
}

providers_writer = df.write.format("jdbc").options(**jdbc_write_options)
providers_writer.mode("overwrite").save()

# The count is taken from the source DataFrame, not read back from Postgres.
seeded_rows = df.count()
print(f"Seeded {seeded_rows} provider rows into Postgres.")

In [0]:
# 4. Read back for sanity check
# Load the providers table over JDBC with the same connection settings used
# for the write, then render it in the notebook.
jdbc_read_options = {
    "url": f"jdbc:postgresql://{HOST}:{PORT}/postgres",
    "dbtable": "providers",
    "user": "postgres",
    "password": PG_PW,
    "driver": "org.postgresql.Driver",
}

providers_reader = spark.read.format("jdbc").options(**jdbc_read_options)
providers_check_df = providers_reader.load()

display(providers_check_df)