In [1]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

import ConnectionConfig as cc

# Project helper from ConnectionConfig; presumably prepares the local environment
# that Spark needs before a session is started — confirm in ConnectionConfig.
cc.setupEnvironment()

In [2]:
# Start the Spark session through the project helper. The arguments are
# presumably the application name and the local parallelism — verify against
# ConnectionConfig.startLocalCluster.
spark = cc.startLocalCluster("DIM_KLANT", 4)
# Last expression of the cell, so the notebook displays the returned session.
spark.getActiveSession()

In [3]:
# EXTRACT
# Read the two source tables over JDBC using the "velodb" connection profile and
# register each of them as a temp view so the transform step can query them by name.

cc.set_connectionProfile("velodb")


def _read_partitioned_by_userid(table_name):
    """Load `table_name` over JDBC as a DataFrame, read in 4 partitions on `userid`."""
    # NOTE(review): lowerBound/upperBound only determine the partition stride;
    # they do not filter rows. If the real userid range goes far beyond 20,
    # almost every row ends up in the last partition — confirm the id range.
    reader = (
        spark.read
        .format("jdbc")
        .option("driver", cc.get_Property("driver"))
        .option("url", cc.create_jdbc())
        .option("dbtable", table_name)
        .option("user", cc.get_Property("username"))
        .option("password", cc.get_Property("password"))
        .option("partitionColumn", "userid")
        .option("numPartitions", 4)
        .option("lowerBound", 0)
        .option("upperBound", 20)
    )
    return reader.load()


df_velo_users = _read_partitioned_by_userid("velo_users")
df_velo_users.createOrReplaceTempView("df_velo_users")

df_subscription = _read_partitioned_by_userid("subscriptions")
df_subscription.createOrReplaceTempView("df_subscription")

In [7]:
# TRANSFORM
# Build the two inputs of the customer dimension from the temp views
# df_velo_users and df_subscription.

# One row per user: the concatenated address plus the SCD bookkeeping columns.
# The address is computed in an inner query so that md5(address) in the outer
# query always hashes the concatenated value. Referencing the alias inside the
# same SELECT list only resolves on Spark versions that support lateral column
# aliases, and would silently pick a source column named `address` if one existed.
# NOTE(review): `||` yields NULL when any address part is NULL, so such users get
# a NULL address and a NULL md5_hash — confirm that this is acceptable.
address_df = spark.sql("""
    SELECT userid,
           address,
           monotonically_increasing_id() AS klant_SK,
           to_timestamp('1999-01-01','yyyy-MM-dd') AS scd_start,
           to_timestamp('2100-12-12','yyyy-MM-dd') AS scd_end,
           md5(address) AS md5_hash,
           True AS current
    FROM (
        SELECT userid,
               street || ' ' || number || ', ' || zipcode || ' ' || city || ' ' || country_code AS address
        FROM df_velo_users
    ) AS u
""")

# Most recent subscription per user. ROW_NUMBER keeps exactly one row per userid
# even when several subscriptions share the latest validfrom; the previous
# MAX(validfrom) self-join returned every tied row, which duplicated the user
# (and its klant_SK) after the join with address_df. Rows with a NULL userid or
# a NULL validfrom are excluded, as they never matched in the self-join either.
# NOTE(review): ties are broken by the highest subscriptiontypeid — confirm that
# this is the preferred rule.
subscription_df = spark.sql("""
    SELECT userid, subscriptiontypeid, validfrom
    FROM (
        SELECT userid,
               subscriptiontypeid,
               validfrom,
               ROW_NUMBER() OVER (
                   PARTITION BY userid
                   ORDER BY validfrom DESC, subscriptiontypeid DESC
               ) AS rn
        FROM df_subscription
        WHERE userid IS NOT NULL AND validfrom IS NOT NULL
    ) AS ranked
    WHERE rn = 1
""")

In [8]:
# TRANSFORM
# Attach each user's subscription to the address row. The join is an inner join,
# so users without a matching subscription row do not appear in the dimension.
joined_df = address_df.join(subscription_df, on="userid", how="inner")

# Final column layout of the dimension, in output order.
dim_klant_columns = [
    "userid",
    "address",
    "subscriptiontypeid",
    "klant_SK",
    "scd_start",
    "scd_end",
    "md5_hash",
    "current",
]
dim_klant = joined_df.select(*dim_klant_columns)

In [None]:
# LOAD
# Persist the dimension as the Delta table "DIM_KLANT"; overwrite mode replaces
# whatever the table held before.
(
    dim_klant.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("DIM_KLANT")
)

In [10]:
# Stop the Spark session that was started earlier in this notebook.
spark.stop()