In [1]:
import ConnectionConfig as cc
from pyspark.sql.functions import arrays_overlap, split, when, col
from delta import DeltaTable

# Project-local environment bootstrap from ConnectionConfig; runs before the
# Spark session is started in the next cell.
# NOTE(review): presumably sets the environment variables Spark needs —
# confirm in ConnectionConfig.
cc.setupEnvironment()

In [2]:
# Start the Spark session through the project helper. The arguments are passed
# straight to cc.startLocalCluster (presumably the application name and a
# core/partition count — TODO confirm in ConnectionConfig).
spark = cc.startLocalCluster("FACT_RIT", 4)
# Last expression of the cell, so the notebook displays whatever session this
# call returns.
spark.getActiveSession()

In [3]:
# EXTRACT
# Pull the operational tables out of the "velodb" source database over JDBC
# and register them as temp views so the Spark SQL transform can query them.
cc.set_connectionProfile("velodb")


def _read_jdbc_table(table, partition_column, upper_bound, num_partitions=100, lower_bound=0):
    """Read one source table over JDBC as a range-partitioned DataFrame.

    Driver, URL and credentials are taken from the connection profile that is
    currently selected on ``cc``.

    Args:
        table: Name of the source table, passed as the JDBC ``dbtable`` option.
        partition_column: Column Spark uses to split the read into ranges.
        upper_bound: Upper end of the range used to compute the partition
            stride. Together with ``lower_bound`` it only decides how the
            ranges are cut; rows outside the bounds are still read (they end
            up in the first/last partition), so a stale bound skews the
            partitions rather than dropping data.
        num_partitions: Maximum number of partitions, which is also the
            maximum number of concurrent JDBC connections opened.
        lower_bound: Lower end of the stride range.

    Returns:
        A lazily evaluated DataFrame over ``table``.
    """
    return (
        spark.read
        .format("jdbc")
        .option("driver", cc.get_Property("driver"))
        .option("url", cc.create_jdbc())
        .option("dbtable", table)
        .option("user", cc.get_Property("username"))
        .option("password", cc.get_Property("password"))
        .option("partitionColumn", partition_column)
        .option("numPartitions", num_partitions)
        .option("lowerBound", lower_bound)
        .option("upperBound", upper_bound)
        .load()
    )


# The upper bounds are hard-coded key values.
# NOTE(review): presumably the max ids at the time of writing — refresh them
# if the source tables have grown a lot, to keep the partitions balanced.
rides_src_df = _read_jdbc_table("rides", "rideid", 4138255)
rides_src_df.createOrReplaceTempView("rides")

subscriptions_src_df = _read_jdbc_table("subscriptions", "subscriptionid", 78248)
subscriptions_src_df.createOrReplaceTempView("subscriptions")

In [4]:
# EXTRACT
# Read the dimension tables from the local Delta warehouse and expose each one
# to Spark SQL right after it is loaded.
dim_date = spark.read.load("spark-warehouse/dimdate", format="delta")
dim_date.createOrReplaceTempView("dimdate")

dim_klant = spark.read.load("spark-warehouse/dim_klant", format="delta")
dim_klant.createOrReplaceTempView("dimklant")

dim_slot = spark.read.load("spark-warehouse/dim_slot", format="delta")
dim_slot.createOrReplaceTempView("dimslot")

dim_weather = spark.read.load("spark-warehouse/dim_weather", format="delta")
dim_weather.createOrReplaceTempView("dimweather")

# Second view name over the JDBC rides DataFrame (it is also registered as
# "rides" in the previous cell).
rides_src_df.createOrReplaceTempView("rides_source")

In [7]:
# TRANSFORM
# Build the ride fact table by enriching every ride, step by step, with the
# keys of the surrounding dimensions:
#   1. rides_with_date          - date key, matched on the calendar date of starttime
#   2. rides_with_slots         - start/end slot keys plus the start slot's zipcode
#   3. rides_with_subscriptions - the user behind the ride's subscription
#   4. rides_with_klant         - customer version valid at starttime
#   5. rides_with_weather       - weather descriptor for start zipcode + start hour
#
# Steps 1 and 2 are inner joins, so rides without a matching date or without a
# matching start AND end slot are dropped. Steps 3-5 are left joins and leave
# NULLs; only the weather descriptor gets a fallback label.
#
# NOTE(review): klant_id_SK and the slot *_SK columns are filled from the
# natural keys (userid / lockid). If the dimensions carry real surrogate keys,
# select those instead — confirm against the dimension schemas.
# NOTE(review): the customer validity window is (scd_start, scd_end], i.e. a
# ride starting exactly at scd_start does not match that version — confirm
# this agrees with how dim_klant closes its intervals.

fact_rit_df = spark.sql("""
WITH rides_with_date AS (
    SELECT r.*, d.date_SK AS date_SK
    FROM rides r
    JOIN dimdate d
    ON cast(r.starttime AS date) = cast(d.CalendarDate AS date)
),
rides_with_slots AS (
    SELECT r.*,
           s_start.lockid  AS start_slot_id_SK,
           s_start.zipcode AS start_zipcode,
           s_end.lockid    AS end_slot_id_SK
    FROM rides_with_date r
    JOIN dimslot s_start
    ON r.startlockid = s_start.lockid
    JOIN dimslot s_end
    ON r.endlockid = s_end.lockid
),
rides_with_subscriptions AS (
    SELECT r.*, s.userid
    FROM rides_with_slots r
    LEFT JOIN subscriptions s
    ON r.subscriptionid = s.subscriptionid
),
rides_with_klant AS (
    SELECT r.*, k.userid AS klant_id_SK
    FROM rides_with_subscriptions r
    LEFT JOIN dimklant k
    ON  r.userid = k.userid
    AND r.starttime > k.scd_start
    AND r.starttime <= k.scd_end
),
rides_with_weather AS (
    SELECT r.*, w.weather_descriptor AS weather_descriptor_SK
    FROM rides_with_klant r
    LEFT JOIN dimweather w
    ON  r.start_zipcode = w.zipcode
    AND date_trunc('hour', r.starttime) = date_trunc('hour', to_timestamp(w.dt))
)
SELECT rideid, date_SK, start_slot_id_SK, end_slot_id_SK, klant_id_SK, COALESCE(weather_descriptor_SK, 'weertype onbekend') AS weather_descriptor_SK
FROM rides_with_weather
"""
)

In [9]:
# INSPECT
# Sanity check on the transform result: print up to 50 fact rows whose weather
# descriptor is one of the three values listed below.
fact_rit_df.filter(
    col("weather_descriptor_SK").isin(["aangenaam", "onaangenaam", "neutraal"])
).show(50)

+------+-------+----------------+--------------+-----------+---------------------+
|rideid|date_SK|start_slot_id_SK|end_slot_id_SK|klant_id_SK|weather_descriptor_SK|
+------+-------+----------------+--------------+-----------+---------------------+
|113383|    298|            3386|          4786|      50717|             neutraal|
|113387|    298|            3514|          3158|      39872|             neutraal|
|113388|    298|            3422|          2827|       2249|             neutraal|
|113391|    298|            4065|           798|      28986|             neutraal|
|113413|    298|            3443|          2795|      23637|             neutraal|
|113465|    298|            3421|          1634|      24863|             neutraal|
|113510|    298|            3562|          3562|       NULL|             neutraal|
|113550|    298|            3401|          6289|      46643|             neutraal|
|113568|    298|            3419|          5663|      25509|             neutraal|
|113

In [11]:
# LOAD
# Persist the fact table as a Delta table in the local warehouse, replacing
# whatever an earlier run wrote to the same path.
fact_rit_df.write.save(
    "spark-warehouse/fact_rit",
    format="delta",
    mode="overwrite",
)

In [12]:
# Shut down the Spark session now that the fact table has been written.
spark.stop()