In [1]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
from pyspark.sql.functions import *
from pyspark.sql import Row
from pyspark.sql.types import *
import ConnectionConfig as cc
# Project-local setup hook from ConnectionConfig, run before any Spark session is created.
# NOTE(review): presumably prepares environment variables/paths for Spark — confirm in ConnectionConfig.
cc.setupEnvironment()

In [2]:
# Obtain the Spark session for this notebook through the project helper.
# NOTE(review): the arguments look like an application name and a core/partition
# count — confirm against ConnectionConfig.startLocalCluster.
spark = cc.startLocalCluster("DIM_LOCK",4)
# Last expression of the cell, so its return value is rendered as the cell output.
spark.getActiveSession()

In [3]:
def _read_jdbc_table(table_name):
    """Load one table of the active connection profile through the JDBC reader.

    The URL and credentials are fetched from ConnectionConfig on every call, so
    the helper always reflects the profile selected by the most recent
    ``cc.set_connectionProfile`` call.

    Args:
        table_name: Name of the source table, passed as the JDBC ``dbtable`` option.

    Returns:
        The DataFrame produced by ``spark.read.format("jdbc")...load()``.
    """
    return (
        spark.read.format("jdbc")
        .options(
            driver="org.postgresql.Driver",
            url=cc.create_jdbc(),
            dbtable=table_name,
            user=cc.get_Property("username"),
            password=cc.get_Property("password"),
        )
        .load()
    )


# Select the source database profile; the URL is printed so the target is visible in the output.
cc.set_connectionProfile("VeloBike")
print(cc.create_jdbc())

locks_df = _read_jdbc_table("locks")

# Rename the stations key so it does not clash with locks.stationid once the
# two frames are joined.
stations_df = _read_jdbc_table("stations").withColumnRenamed("stationid", "station_id")



jdbc:postgresql://localhost:5433/velodb


In [4]:

# Attach station attributes to every lock. A left join keeps locks whose
# stationid has no matching station row (their station columns become null).
lock_station_match = locks_df["stationid"] == stations_df["station_id"]
joined_df = locks_df.join(stations_df, on=lock_station_match, how="left")

# Lock-level grain: one row per lock rather than one row per station, so the
# dimension can be joined with rides directly on lockid.
lock_dim_columns = [
    "lockid",
    "stationid",
    "stationlocknr",
    "street",
    "number",
    "zipcode",
    "district",
    "gpscoord",
]
lock_dim_df = joined_df.select(*lock_dim_columns)



# # Group by stationid to have one row per station
# lock_dim_df = joined_df.groupBy("station_id").agg(
#     first("lockid").alias("stationnr"),
#     first("street").alias("street"),
#     first("number").alias("number"),
#     first("zipcode").alias("zipcode"),
#     first("district").alias("district"),
#     first("gpscoord").alias("gpscoord")
# )
# lock_dim_df.show()


In [5]:

# Schema of the placeholder ("no lock") row. The column names must be the same
# as those of lock_dim_df, because the two frames are combined by name below.
# NOTE(review): the types are hand-written; confirm they match lock_dim_df.printSchema().
schema = StructType([
    StructField("lockid", IntegerType(), True),
    StructField("stationid", IntegerType(), True),
    StructField("stationlocknr", IntegerType(), True),
    StructField("street", StringType(), True),
    StructField("number", StringType(), True),
    StructField("zipcode", StringType(), True),
    StructField("district", StringType(), True),
    StructField("gpscoord", StringType(), True)
])

# Placeholder member with key 0 and no descriptive attributes.
# NOTE(review): presumably referenced by fact rows that have no lock — confirm with the fact load.
null_lock_row = Row(lockid=0, stationid=0, stationlocknr=None, street=None, number=None, zipcode=None, district=None, gpscoord=None)

null_lock_df = spark.createDataFrame([null_lock_row], schema)

# Combine by column name instead of by position: a plain union() would silently
# put values in the wrong columns if the select list or the schema above were
# ever reordered, whereas unionByName() keeps them aligned (and fails loudly if
# a column is missing).
final_lock_dim_df = lock_dim_df.unionByName(null_lock_df)

final_lock_dim_df.show()

+------+---------+-------------+-----------+------+-------+---------+-----------------+
|lockid|stationid|stationlocknr|     street|number|zipcode| district|         gpscoord|
+------+---------+-------------+-----------+------+-------+---------+-----------------+
|    19|        2|            1| ONTBREKEND|    12|   2000|ANTWERPEN| (51.219,4.40405)|
|    20|        2|            2| ONTBREKEND|    12|   2000|ANTWERPEN| (51.219,4.40405)|
|    21|        2|            3| ONTBREKEND|    12|   2000|ANTWERPEN| (51.219,4.40405)|
|     1|        1|            1|Meir (2000)|    84|   2000|ANTWERPEN|(51.2182,4.41241)|
|     2|        1|            2|Meir (2000)|    84|   2000|ANTWERPEN|(51.2182,4.41241)|
|     3|        1|            3|Meir (2000)|    84|   2000|ANTWERPEN|(51.2182,4.41241)|
|     4|        1|            4|Meir (2000)|    84|   2000|ANTWERPEN|(51.2182,4.41241)|
|     5|        1|            5|Meir (2000)|    84|   2000|ANTWERPEN|(51.2182,4.41241)|
|     6|        1|            6|

In [6]:
# Sanity check: number of rows in the finished dimension (joined lock rows plus the one placeholder row).
final_lock_dim_df.count()

7543

In [7]:
# Persist the dimension as a Delta table, replacing any previous contents.
delta_writer = final_lock_dim_df.write.format("delta").mode("overwrite")
delta_writer.saveAsTable("dimLock")

# Also keep a plain Parquet copy; the data is repartitioned to a single
# partition before it is written.
single_partition_df = final_lock_dim_df.repartition(1)
parquet_writer = single_partition_df.write.format("parquet").mode("overwrite")
parquet_writer.saveAsTable("dimLock_pq")


In [8]:
# Shut down the Spark session; this is the last cell of the notebook.
spark.stop()