In [1]:
import ConnectionConfig as cc
from pyspark.sql.functions import arrays_overlap, split, when, col, expr
from pyspark.sql.functions import date_format
import math
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from delta import DeltaTable
# Project helper from ConnectionConfig; presumably prepares the local environment
# that Spark needs before a session is started — TODO confirm against ConnectionConfig.
cc.setupEnvironment()

In [2]:
# Start the Spark session through the project helper. The arguments look like an
# application name and a parallelism/partition setting — TODO confirm in ConnectionConfig.
spark = cc.startLocalCluster("startLocalQuery", 4)
# Last expression of the cell: echoes the active session as a quick sanity check.
spark.getActiveSession()

# Meetwaarden

In [11]:
# EXTRACT

# Select the connection profile that the cc.get_Property / cc.create_jdbc calls below read from.
cc.set_connectionProfile("velodb")

# Read the source "rides" table over JDBC. The read is split into 100 parallel
# range queries on rideid; lowerBound/upperBound only set the partition stride.
# NOTE(review): 4138255 is presumably the highest rideid at the time of writing — confirm.
rides_src_df = (
    spark.read
    .format("jdbc")
    .options(
        driver=cc.get_Property("driver"),
        url=cc.create_jdbc(),
        dbtable="rides",
        user=cc.get_Property("username"),
        password=cc.get_Property("password"),
        partitionColumn="rideid",
        numPartitions=100,
        lowerBound=0,
        upperBound=4138255,
    )
    .load()
)

# Make the source rows addressable from Spark SQL as "rides".
rides_src_df.createOrReplaceTempView("rides")

In [12]:
# EXTRACT
# Load the existing fact table (Delta format) and expose it to Spark SQL as "factRide".
# Note: this is the same path that the LOAD step at the end of the notebook overwrites.
factRide = spark.read.load("./spark-warehouse/fact_rit", format="delta")
factRide.createOrReplaceTempView("factRide")

## Duurtijd

In [13]:
# TRANSFORM

# Difference between a ride's start time and end time, shown as an interval.
# The regexp keeps only the text between the single quotes of the interval's
# string form (e.g. "0 00:14:53").
interval_preview_sql = (
    "select regexp_replace(duurtijd, '.*\\'(.*)\\'.*', '$1') as duurtijd "
    "from (select (endtime - starttime) as duurtijd from rides)"
)
spark.sql(interval_preview_sql).show(2)

+----------+
|  duurtijd|
+----------+
|0 00:14:53|
|0 00:02:04|
+----------+
only showing top 2 rows



In [14]:
# TRANSFORM

# Difference between a ride's start time and end time, in whole seconds.
# The inner join keeps only source rides that already have a row in the fact
# table; rideid is taken from factRide.
df_duurtijd = spark.sql("""
SELECT 
    factRide.rideid,
    starttime, 
    endtime, 
    (unix_timestamp(endtime) - unix_timestamp(starttime)) as duurtijd_in_seconden
FROM rides
inner join factRide on rides.rideid = factRide.rideid
""")

In [15]:
# Preview two rows to sanity-check the computed duration against start/end time.
df_duurtijd.show(2)

+------+-------------------+-------------------+--------------------+
|rideid|          starttime|            endtime|duurtijd_in_seconden|
+------+-------------------+-------------------+--------------------+
|    15|2019-09-22 08:46:43|2019-09-22 09:01:36|                 893|
|    17|2019-09-22 08:27:38|2019-09-22 08:30:25|                 167|
+------+-------------------+-------------------+--------------------+
only showing top 2 rows



In [16]:
# TRANSFORM

# Attach the duration measure to every fact row (left join keeps all fact rows).
#
# The LOAD step overwrites the same fact_rit table that factRide was read from,
# so on a re-run factRide already carries the measure columns written last time.
# Drop those stale copies first; DataFrame.drop() is a no-op for columns that do
# not exist, so the very first run is unaffected. Without this the joins produce
# duplicate column names and the final Delta write fails.
factRide_with_duurtijd = (
    factRide
    .drop("duurtijd_in_seconden", "distance_km")
    .join(df_duurtijd.select("rideid", "duurtijd_in_seconden"), on="rideid", how="left")
)

In [17]:
# Preview two fact rows with the newly attached duurtijd_in_seconden column.
factRide_with_duurtijd.show(2)

+-------+-------+----------------+--------------+-----------+---------------------+--------------------+
| rideid|date_SK|start_slot_id_SK|end_slot_id_SK|klant_id_SK|weather_descriptor_SK|duurtijd_in_seconden|
+-------+-------+----------------+--------------+-----------+---------------------+--------------------+
|2069132|    985|            1099|          2754|      32419|    weertype onbekend|                 100|
|2069137|    985|            1854|          1600|      35053|    weertype onbekend|                 214|
+-------+-------+----------------+--------------+-----------+---------------------+--------------------+
only showing top 2 rows



## Afstand

In [18]:
def haversine_km(lat1, lon1, lat2, lon2):
    """Return the great-circle (haversine) distance in kilometres between two points.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.
            Each value may be a number or a numeric string; it is converted
            with ``float()``.

    Returns:
        The distance in km as a float, or ``None`` if any coordinate is ``None``
        (so a SQL NULL coordinate yields a NULL distance instead of failing the
        whole Spark job).

    Raises:
        ValueError: If a coordinate is a string that cannot be parsed as a float.
    """
    # Spark hands SQL NULL to a Python UDF as None; propagate it rather than
    # crashing in float(None).
    if lat1 is None or lon1 is None or lat2 is None or lon2 is None:
        return None

    lat1, lon1, lat2, lon2 = float(lat1), float(lon1), float(lat2), float(lon2)
    if lat1 == lat2 and lon1 == lon2:
        return 0.0

    R = 6371  # Earth radius in km used by this formula
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dlon / 2) ** 2
    )
    # Rounding can push `a` marginally outside [0, 1] for near-antipodal points,
    # which would make sqrt(1 - a) raise a math domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

# Wrap the Python function as a Spark UDF that returns a double ...
haversine_udf = udf(haversine_km, DoubleType())
# ... and register it under the name "haversine_km" so it can be called from Spark SQL.
spark.udf.register("haversine_km", haversine_udf)

<pyspark.sql.udf.UserDefinedFunction at 0x27c0c787290>

In [19]:
# TRANSFORM
# Add a string version of starttime formatted as yyyy-MM-dd (day precision).
rides_src_df = rides_src_df.withColumn("starttime_normal", date_format("starttime", "yyyy-MM-dd"))
# Register the extended frame under its own view name for the query below.
rides_src_df.createOrReplaceTempView("rides_src_df")
# Keep only the ride key, the two raw point columns and the formatted start date.
coord = spark.sql("SELECT rideid, startpoint, endpoint, starttime_normal FROM rides_src_df") 

In [20]:
# TRANSFORM
# Split each point string on the comma: element 0 is used as latitude, element 1 as longitude.
# NOTE(review): the trimming below implies the raw format is "(lat,lon)" — confirm against the source table.
start_parts = split(col("startpoint"), ",")
end_parts = split(col("endpoint"), ",")

coord_split = (
    coord
    .withColumn("startpoint_lat", start_parts[0])
    .withColumn("startpoint_lon", start_parts[1])
    .withColumn("endpoint_lat", end_parts[0])
    .withColumn("endpoint_lon", end_parts[1])
)

# Latitude pieces lose their first character, longitude pieces their last character.
trim_expressions = {
    "startpoint_lat": "substring(startpoint_lat, 2, length(startpoint_lat)-1)",
    "startpoint_lon": "substring(startpoint_lon, 1, length(startpoint_lon)-1)",
    "endpoint_lat": "substring(endpoint_lat, 2, length(endpoint_lat)-1)",
    "endpoint_lon": "substring(endpoint_lon, 1, length(endpoint_lon)-1)",
}

coord_cleaned = coord_split
for column_name, trim_sql in trim_expressions.items():
    coord_cleaned = coord_cleaned.withColumn(column_name, expr(trim_sql))

# The raw point strings are no longer needed once the four parts exist.
coord_cleaned = coord_cleaned.drop("startpoint", "endpoint")

In [21]:
# TRANSFORM
# Expose the cleaned coordinates to Spark SQL.
coord_cleaned.createOrReplaceTempView("coord_cleaned")

# Compute the start-to-end distance per ride with the haversine_km UDF registered above.
# The coordinate columns are still strings here; haversine_km converts them with float().
result_df = spark.sql("""
SELECT
    rideid,
    startpoint_lat,
    startpoint_lon,
    endpoint_lat,
    endpoint_lon,
    haversine_km(startpoint_lat, startpoint_lon, endpoint_lat, endpoint_lon) as distance_km,
    starttime_normal
FROM coord_cleaned
""")

# Preview two rows to check the parsed coordinates and the resulting distance.
result_df.show(2)

+------+--------------+--------------+------------+------------+------------------+----------------+
|rideid|startpoint_lat|startpoint_lon|endpoint_lat|endpoint_lon|       distance_km|starttime_normal|
+------+--------------+--------------+------------+------------+------------------+----------------+
|    15|       51.2083|       4.44595|     51.1938|     4.40228| 3.443440890340808|      2019-09-22|
|    16|       51.2174|       4.41597|     51.2188|     4.40935|0.4866395640789212|      2019-09-22|
+------+--------------+--------------+------------+------------+------------------+----------------+
only showing top 2 rows



In [22]:
# TRANSFORM

# Strip the helper columns; only rideid and distance_km remain for the join onto the fact table.
helper_columns = (
    "startpoint_lat",
    "startpoint_lon",
    "endpoint_lat",
    "endpoint_lon",
    "starttime_normal",
)
result_df = result_df.drop(*helper_columns)

In [23]:
# Preview two rows of the reduced frame.
result_df.show(2)

+------+------------------+
|rideid|       distance_km|
+------+------------------+
|    15| 3.443440890340808|
|    16|0.4866395640789212|
+------+------------------+
only showing top 2 rows



In [24]:
# TRANSFORM

# Attach the distance measure to every fact row (left join keeps all fact rows).
#
# The fact table read at the start is the same one the LOAD step overwrites, so
# on a re-run it may already contain a distance_km column. Drop that stale copy
# first; DataFrame.drop() is a no-op when the column is absent. Otherwise the
# join yields two distance_km columns and the Delta write fails.
factRide_with_duurtijd_distance = (
    factRide_with_duurtijd
    .drop("distance_km")
    .join(result_df, on="rideid", how="left")
)

In [25]:
# Preview two fact rows with both new measure columns attached.
factRide_with_duurtijd_distance.show(2)

+-------+-------+----------------+--------------+-----------+---------------------+--------------------+-------------------+
| rideid|date_SK|start_slot_id_SK|end_slot_id_SK|klant_id_SK|weather_descriptor_SK|duurtijd_in_seconden|        distance_km|
+-------+-------+----------------+--------------+-----------+---------------------+--------------------+-------------------+
|2069132|    985|            1099|          2754|      32419|    weertype onbekend|                 100|0.47459539392385197|
|2069137|    985|            1854|          1600|      35053|    weertype onbekend|                 214| 1.2740061881205194|
+-------+-------+----------------+--------------+-----------+---------------------+--------------------+-------------------+
only showing top 2 rows



In [26]:
# LOAD

# Overwrite the stored fact table with the enriched frame. mergeSchema is enabled
# so that the added measure columns are accepted into the table schema.
(
    factRide_with_duurtijd_distance.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save("./spark-warehouse/fact_rit")
)

In [27]:
# Shut down the Spark session now that the fact table has been written.
spark.stop()