In [1]:
import ConnectionConfig as cc
from pyspark.sql.functions import arrays_overlap, split, when, col
from delta import DeltaTable

# Run the project's environment setup before any Spark session is started.
# NOTE(review): what setupEnvironment() actually configures lives in the
# ConnectionConfig module and is not visible here — confirm there.
cc.setupEnvironment()

In [2]:
# Start the Spark session through the ConnectionConfig helper.
# NOTE(review): "mongo_iets" is presumably the application name and 4 the number
# of local cores/partitions — confirm against cc.startLocalCluster.
spark = cc.startLocalCluster("mongo_iets", 4)
# Last expression of the cell: shows the currently active session as a sanity check.
spark.getActiveSession()

In [3]:
# Select the connection profile whose driver/url/credentials are used for the JDBC reads below.
cc.set_connectionProfile("velodb")


def read_jdbc_table(table, partition_column, num_partitions):
    """Load a table from the active JDBC connection profile as a partitioned DataFrame.

    The partition bounds are looked up from the database (MIN/MAX of
    ``partition_column``) instead of being hard-coded. Spark uses the bounds
    only to compute the partition stride, never to filter rows, so a stale
    upper bound does not lose data but pushes every row above it into the
    last partition. Deriving the bounds keeps the partitions evenly sized.

    Args:
        table: Name of the source table to read.
        partition_column: Numeric column used to split the read into ranges.
        num_partitions: Maximum number of partitions (and therefore of JDBC
            queries) used for the read.

    Returns:
        A DataFrame containing every row of ``table``.
    """

    def _reader():
        # Connection settings shared by the bounds lookup and the actual read.
        return (
            spark.read.format("jdbc")
            .option("driver", cc.get_Property("driver"))
            .option("url", cc.create_jdbc())
            .option("user", cc.get_Property("username"))
            .option("password", cc.get_Property("password"))
        )

    # table / partition_column are notebook constants, not user input, so
    # interpolating them into the lookup query is safe here.
    bounds = (
        _reader()
        .option(
            "query",
            f"SELECT MIN({partition_column}) AS lo, MAX({partition_column}) AS hi FROM {table}",
        )
        .load()
        .first()
    )

    # An empty table yields NULL bounds; fall back to a plain, unpartitioned read.
    if bounds is None or bounds[0] is None or bounds[1] is None:
        return _reader().option("dbtable", table).load()

    return (
        _reader()
        .option("dbtable", table)
        .option("partitionColumn", partition_column)
        .option("numPartitions", num_partitions)
        .option("lowerBound", int(bounds[0]))
        .option("upperBound", int(bounds[1]))
        .load()
    )


# Rides is the large table (millions of rows): keep the original 100-way split.
rides_src_df = read_jdbc_table("rides", "rideid", 100)
rides_src_df.createOrReplaceTempView("rides")

# Vehicles is small by comparison (ids seen in the outputs are in the low
# thousands), so a handful of partitions avoids issuing ~100 near-empty queries.
vehicles_src_df = read_jdbc_table("vehicles", "vehicleid", 4)
vehicles_src_df.createOrReplaceTempView("vehicles")



In [4]:
# Sanity check: query the "vehicles" temp view through Spark SQL and print its
# first rows (show() prints at most 20 rows by default).
spark.sql("SELECT * FROM vehicles").show()

+---------+------------+---------+-------------------+------+-----------------+
|vehicleid|serialnumber|bikelotid|  lastmaintenanceon|lockid|         position|
+---------+------------+---------+-------------------+------+-----------------+
|        1|        1000|        1|2020-01-19 02:14:57|  NULL|(51.1968,4.40579)|
|        2|        2000|        1|2020-03-08 01:49:24|  NULL|(51.2177,4.42075)|
|        3|        3000|        1|2020-06-01 12:37:26|  1568|(51.1926,4.42151)|
|        4|        4000|        1|2020-02-27 03:13:56|  NULL|(51.2311,4.41267)|
|        5|        5000|        1|2021-03-21 03:38:31|  NULL|(51.2177,4.42075)|
|        6|        6000|        1|2020-06-16 21:44:19|  NULL|(51.2195,4.41169)|
|        7|        7000|        1|2019-10-01 10:29:21|  1556| (51.2273,4.4307)|
|        8|        8000|        1|2019-12-06 17:09:49|  NULL|(51.2047,4.39625)|
|        9|        9000|        1|2020-01-01 10:06:51|  NULL|(51.2058,4.41837)|
|       10|       10000|        1|2019-1

In [5]:
# Print the column names and types Spark derived from the JDBC "rides" table.
rides_src_df.printSchema()

root
 |-- rideid: integer (nullable = true)
 |-- startpoint: string (nullable = true)
 |-- endpoint: string (nullable = true)
 |-- starttime: timestamp (nullable = true)
 |-- endtime: timestamp (nullable = true)
 |-- vehicleid: integer (nullable = true)
 |-- subscriptionid: integer (nullable = true)
 |-- startlockid: integer (nullable = true)
 |-- endlockid: integer (nullable = true)



In [6]:
# Print the column names and types Spark derived from the JDBC "vehicles" table.
vehicles_src_df.printSchema()

root
 |-- vehicleid: integer (nullable = true)
 |-- serialnumber: string (nullable = true)
 |-- bikelotid: integer (nullable = true)
 |-- lastmaintenanceon: timestamp (nullable = true)
 |-- lockid: integer (nullable = true)
 |-- position: string (nullable = true)



In [7]:
# Preview the vehicles DataFrame directly; it is the same source table that the
# "vehicles" temp view exposes to Spark SQL.
vehicles_src_df.show()

+---------+------------+---------+-------------------+------+-----------------+
|vehicleid|serialnumber|bikelotid|  lastmaintenanceon|lockid|         position|
+---------+------------+---------+-------------------+------+-----------------+
|        1|        1000|        1|2020-01-19 02:14:57|  NULL|(51.1968,4.40579)|
|        2|        2000|        1|2020-03-08 01:49:24|  NULL|(51.2177,4.42075)|
|        3|        3000|        1|2020-06-01 12:37:26|  1568|(51.1926,4.42151)|
|        4|        4000|        1|2020-02-27 03:13:56|  NULL|(51.2311,4.41267)|
|        5|        5000|        1|2021-03-21 03:38:31|  NULL|(51.2177,4.42075)|
|        6|        6000|        1|2020-06-16 21:44:19|  NULL|(51.2195,4.41169)|
|        7|        7000|        1|2019-10-01 10:29:21|  1556| (51.2273,4.4307)|
|        8|        8000|        1|2019-12-06 17:09:49|  NULL|(51.2047,4.39625)|
|        9|        9000|        1|2020-01-01 10:06:51|  NULL|(51.2058,4.41837)|
|       10|       10000|        1|2019-1

In [17]:
# Use Spark SQL to create a nested structure: one row per ride, with the matching
# vehicle's id and serial number packed into a struct column named "vehicles".
# The inner JOIN keeps only rides whose vehicleid has a match in the vehicles view.
nested_df = spark.sql("""
SELECT r.rideid, r.starttime, r.endtime, r.startpoint, r.endpoint, r.vehicleid,
    struct(v.vehicleid, v.serialnumber) as vehicles
FROM rides r
JOIN vehicles v ON r.vehicleid = v.vehicleid
""")

# truncate=False prints full column values instead of cutting them off.
nested_df.show(truncate=False)

+------+-------------------+-------------------+-----------------+-----------------+---------+---------------+
|rideid|starttime          |endtime            |startpoint       |endpoint         |vehicleid|vehicles       |
+------+-------------------+-------------------+-----------------+-----------------+---------+---------------+
|19    |2019-09-22 08:50:08|2019-09-22 09:09:02|(51.1888,4.45039)|(51.2221,4.40467)|5536     |{5536, 5536000}|
|20    |2019-09-22 08:29:42|2019-09-22 08:31:40|(51.2159,4.41073)|(51.2191,4.41596)|6336     |{6336, 6336000}|
|29    |2019-09-22 08:25:19|2019-09-22 08:28:33|(51.2011,4.439)  |(51.2069,4.43013)|1925     |{1925, 1925000}|
|30    |2019-09-22 08:03:47|2019-09-22 08:05:09|(51.2214,4.41756)|(51.2168,4.41572)|5840     |{5840, 5840000}|
|32    |2019-09-22 08:12:54|2019-09-22 08:18:24|(51.2155,4.42551)|(51.228,4.41899) |3572     |{3572, 3572000}|
|38    |2019-09-22 08:07:25|2019-09-22 08:15:33|(51.1925,4.42158)|(51.2183,4.41409)|4032     |{4032, 4032000}|
|

In [18]:
# Count the joined rows. nested_df is not cached, so this action re-executes
# the JDBC reads and the join.
nested_df.count()

4138241

In [19]:
# Write the nested rides/vehicles rows out as JSON.
# mode("overwrite") replaces whatever already exists at the target path.
nested_df.write.format("json").mode("overwrite").save("spark-warehouse/rides_vehicles")