In [1]:
import ConnectionConfig as cc
from pyspark.sql.functions import arrays_overlap, split, when, col, expr
from pyspark.sql.functions import date_format
import math
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from delta import DeltaTable

cc.setupEnvironment()

In [2]:
spark = cc.startLocalCluster("analyse", 4)
spark.getActiveSession()

In [15]:
cc.set_connectionProfile("velodb")

rides_src_df = spark.read \
    .format("jdbc") \
    .option("driver", cc.get_Property("driver")) \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "rides") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .option("partitionColumn", "rideid") \
    .option("numPartitions", 100) \
    .option("lowerBound", 0) \
    .option("upperBound", 4138255) \
    .load()

rides_src_df.createOrReplaceTempView("rides")

vehicles = spark.read \
    .format("jdbc") \
    .option("driver", cc.get_Property("driver")) \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "vehicles") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .option("partitionColumn", "vehicleid") \
    .option("numPartitions", 100) \
    .option("lowerBound", 0) \
    .option("upperBound", 4138255) \
    .load()

vehicles.createOrReplaceTempView("vehicles")

bikelots = spark.read \
    .format("jdbc") \
    .option("driver", cc.get_Property("driver")) \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "bikelots") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .option("partitionColumn", "bikelotid") \
    .option("numPartitions", 100) \
    .option("lowerBound", 0) \
    .option("upperBound", 4138255) \
    .load()

bikelots.createOrReplaceTempView("bikelots")

biketypes = spark.read \
    .format("jdbc") \
    .option("driver", cc.get_Property("driver")) \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "bike_types") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .option("partitionColumn", "biketypeid") \
    .option("numPartitions", 100) \
    .option("lowerBound", 0) \
    .option("upperBound", 4138255) \
    .load()

biketypes.createOrReplaceTempView("biketypes")

In [3]:
factRide = spark.read.format("delta").load("./spark-warehouse/fact_rit")

#distance = spark.read.format("delta").load("./spark-warehouse/distance")
dim_weather = spark.read.format("delta").load("spark-warehouse/dim_weather")
dim_klant = spark.read.format("delta").load("spark-warehouse/dim_klant")
dim_slot = spark.read.format("delta").load("spark-warehouse/dim_slot")
dim_date = spark.read.format("delta").load("spark-warehouse/dimdate")

factRide.createOrReplaceTempView("viewRide")
dim_weather.createOrReplaceTempView("viewWeather")
dim_klant.createOrReplaceTempView("viewKlant")
dim_slot.createOrReplaceTempView("viewSlot")
dim_date.createOrReplaceTempView("viewDate")

In [16]:
bikelots.printSchema()

root
 |-- bikelotid: integer (nullable = true)
 |-- deliverydate: date (nullable = true)
 |-- biketypeid: integer (nullable = true)



In [36]:
main = spark.sql("""
select r.rideid, r_src.starttime, r_src.endtime, v_src.*,
s_start.lockid as startlockid, s_start.stationlocknr as startstationlocknr, s_start.stationid as startstationid, s_start.objectid as startobjectid,
s_start.type as startstationtype, s_start.number as startnumber, s_start.zipcode as startzipcode, s_start.gpscoord as startgpscoord,
s_end.lockid as endlockid, s_end.stationlocknr as endstationlocknr, s_end.stationid as endstationid, s_end.objectid as endobjectid,
s_end.type as endstationtype, s_end.number as endnumber, s_end.zipcode as endzipcode, s_end.gpscoord as endgpscoord,
d.CalendarDate, k.* from viewRide r
inner join rides r_src on r.rideid = r_src.rideid
inner join vehicles v_src on r_src.vehicleid = v_src.vehicleid
inner join bikelots b_src on v_src.bikelotid = b_src.bikelotid
inner join biketypes bt_src on b_src.biketypeid = bt_src.biketypeid
inner join viewSlot s_start on r.start_slot_id = s_start.lockid
inner join viewSlot s_end on r.end_slot_id = s_end.lockid
inner join viewDate d on r.date_SK = d.date_SK
inner join viewKlant k on r.klant_id = k.userid
where r.date_SK in (2601, 3093)
""")



In [25]:
spark.sql("""
select count(*) as count, date_SK from viewRide
group by date_SK order by count desc
""").show()

+-----+-------+
|count|date_SK|
+-----+-------+
| 7891|   2601|
| 7891|   3093|
| 7407|   2477|
| 7084|   3316|
| 7058|   3176|
| 7030|   2407|
| 6871|   3134|
| 6761|   2399|
| 6688|   2883|
| 6505|   2932|
| 6366|   2615|
| 6332|   2392|
| 6330|   3497|
| 6221|   2302|
| 6179|   2700|
| 6139|   3476|
| 6139|   3261|
| 6126|   2694|
| 6053|   3056|
| 6022|   2811|
+-----+-------+
only showing top 20 rows



In [37]:
main.show(2)

+-------+-------------------+-------------------+---------+------------+---------+-------------------+------+-----------------+-----------+------------------+--------------+-------------+----------------+-----------+------------+-----------------+---------+----------------+------------+-----------+--------------+---------+----------+-----------------+------------+------+--------------------+------------------+-----------+-------------------+-------------------+--------------------+-------+
| rideid|          starttime|            endtime|vehicleid|serialnumber|bikelotid|  lastmaintenanceon|lockid|         position|startlockid|startstationlocknr|startstationid|startobjectid|startstationtype|startnumber|startzipcode|    startgpscoord|endlockid|endstationlocknr|endstationid|endobjectid|endstationtype|endnumber|endzipcode|      endgpscoord|CalendarDate|userid|             address|subscriptiontypeid|   klant_SK|          scd_start|            scd_end|            md5_hash|current|
+-------+-

In [31]:
main.count()

15927

In [38]:
main.write.json("spark-warehouse/jsonFilesSmallV3")