In [1]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
import ConnectionConfig as cc
from pyspark.sql.functions import when


# Prepare the runtime environment through the project's ConnectionConfig helper
# before the Spark session is started in the next cell.
# NOTE(review): what setupEnvironment() actually configures is not visible here — confirm.
cc.setupEnvironment()

In [2]:
# Start the Spark session through the ConnectionConfig helper.
# NOTE(review): "DIM_SLOT" looks like the application name and 4 like a
# partition/core count — confirm against cc.startLocalCluster.
spark = cc.startLocalCluster("DIM_SLOT", 4)
# Last expression of the cell: its value is rendered as the cell output.
spark.getActiveSession()

In [3]:
# EXTRACT

cc.set_connectionProfile("velodb")


def read_jdbc_table(table, partition_column, lower_bound=0, upper_bound=20, num_partitions=4):
    """Read one table from the active JDBC connection profile as a partitioned DataFrame.

    Args:
        table: Name of the source table, passed as the JDBC ``dbtable`` option.
        partition_column: Numeric column Spark uses to split the read into
            parallel range queries.
        lower_bound: Lower end of the range used to compute the partition stride.
        upper_bound: Upper end of the range used to compute the partition stride.
        num_partitions: Number of parallel range queries / resulting partitions.

    Returns:
        The DataFrame produced by the JDBC reader for ``table``.

    Note:
        ``lower_bound`` and ``upper_bound`` do not filter rows; they only decide
        how the key range is sliced. Rows outside the range all land in the
        first or last partition.
        NOTE(review): the defaults (0..20) are kept from the original notebook,
        but the data contains keys above 20 (e.g. lockid 36), so the last
        partition is likely oversized — confirm the real key range and widen
        the bounds if the tables are large.
    """
    return (
        spark.read
        .format("jdbc")
        .option("driver", cc.get_Property("driver"))
        .option("url", cc.create_jdbc())
        .option("dbtable", table)
        .option("user", cc.get_Property("username"))
        .option("password", cc.get_Property("password"))
        .option("partitionColumn", partition_column)
        .option("numPartitions", num_partitions)
        .option("lowerBound", lower_bound)
        .option("upperBound", upper_bound)
        .load()
    )


# Register both source tables as temp views so the transform step can use SQL.
df_locks = read_jdbc_table("locks", "lockid")
df_locks.createOrReplaceTempView("locks")

df_stations = read_jdbc_table("stations", "stationid")
df_stations.createOrReplaceTempView("stations")

In [4]:
# TRANSFORM

# Build the slot dimension in two parts:
#   1. every lock joined to its station on stationid;
#   2. one extra placeholder row with stationlocknr 'geen slot' (Dutch for
#      "no lock") and NULL in every other column.
# NOTE(review): the placeholder presumably lets facts without a lock point at a
# dimension row — confirm against the fact-table load.
dim_slot_query = """
select lockid, stationlocknr, l.stationid, objectid, stationnr, type, street, number, zipcode, district, gpscoord from locks l inner join stations s on l.stationid = s.stationid
union all

select null as lockid, 'geen slot' as stationlocknr, null as stationid, null as objectid, null as stationnr, null as type, null as street, null as number, null as zipcode, null as district, null as gpscoord

"""

dim_slot = spark.sql(dim_slot_query)

In [5]:
# Quick sanity check: print the first two rows of the dimension.
dim_slot.show(n=2)

+------+-------------+---------+--------+---------+-----------+----------+------+-------+---------+----------------+
|lockid|stationlocknr|stationid|objectid|stationnr|       type|    street|number|zipcode| district|        gpscoord|
+------+-------------+---------+--------+---------+-----------+----------+------+-------+---------+----------------+
|    36|           18|        2|   33203|      019|ENKELZIJDIG|ONTBREKEND|    12|   2000|ANTWERPEN|(51.219,4.40405)|
|    35|           17|        2|   33203|      019|ENKELZIJDIG|ONTBREKEND|    12|   2000|ANTWERPEN|(51.219,4.40405)|
+------+-------------+---------+--------+---------+-----------+----------+------+-------+---------+----------------+
only showing top 2 rows



In [6]:
# Verify the placeholder row: it is the row whose lockid is NULL.
dim_slot.filter(dim_slot["lockid"].isNull()).show()

+------+-------------+---------+--------+---------+----+------+------+-------+--------+--------+
|lockid|stationlocknr|stationid|objectid|stationnr|type|street|number|zipcode|district|gpscoord|
+------+-------------+---------+--------+---------+----+------+------+-------+--------+--------+
|  NULL|    geen slot|     NULL|    NULL|     NULL|NULL|  NULL|  NULL|   NULL|    NULL|    NULL|
+------+-------------+---------+--------+---------+----+------+------+-------+--------+--------+



In [7]:
# LOAD

# Collapse the result to a single partition, then replace the contents of the
# Delta table "dim_slot" with it.
(
    dim_slot
    .coalesce(1)
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("dim_slot")
)

In [8]:
# Shut down the Spark session now that the dimension table has been written.
spark.stop()