In [0]:
from pyspark.sql.functions import coalesce, col, concat_ws, md5
from pyspark.sql.types import BooleanType

In [0]:
# Load the three input tables: the bronze station table and the two silver
# tables (dim_borough, lkp_station) it is joined against further down.
stations_df = spark.read.table("mta_bronze.station")
dim_boroughs_df = spark.read.table("mta_silver.dim_borough")
lkp_stations_df = spark.read.table("mta_silver.lkp_station")

In [0]:
# Enrich the bronze stations and derive the key / typed flag columns.
#
# Both lookups are LEFT joins, so a station with no match in dim_borough or
# lkp_station is still kept. For such rows the raw bronze value is retained
# (coalesce) rather than being overwritten with NULL; overwriting
# unconditionally would silently blank the borough / stop name of every
# station missing from the lookup tables.
# NOTE(review): the fallback keeps the raw bronze value, which may be a
# code/abbreviation rather than a conformed name -- confirm this is preferred
# over NULL for unmatched rows.
stations_trans_df = (
    stations_df
    .join(dim_boroughs_df, stations_df.stn_borough == dim_boroughs_df.brh_nk, "left")
    .join(lkp_stations_df, stations_df.stn_stop_name == lkp_stations_df.lks_abbr, "left")
    # Prefer the looked-up value, fall back to the original when unmatched.
    .withColumn("stn_borough", coalesce(col("brh_name"), col("stn_borough")))
    .withColumn("stn_stop_name", coalesce(col("lks_name"), col("stn_stop_name")))
    # Surrogate key: MD5 of "<stn_gtfs_stop_id>_<stn_id>".
    .withColumn("stn_sk", md5(concat_ws("_", col("stn_gtfs_stop_id"), col("stn_id"))))
    # Cast the ADA flag columns to booleans.
    .withColumn("stn_ada", col("stn_ada").cast(BooleanType()))
    .withColumn("stn_ada_northbound", col("stn_ada_northbound").cast(BooleanType()))
    .withColumn("stn_ada_southbound", col("stn_ada_southbound").cast(BooleanType()))
)

In [0]:
# Columns written to the dimension, in output order. stn_id is renamed to
# stn_nk; every other column keeps its name. Join helper columns (brh_*,
# lks_*) are dropped by not being listed here.
dim_station_columns = [
    "stn_sk",
    col("stn_id").alias("stn_nk"),
    "stn_gtfs_stop_id",
    "stn_division",
    "stn_stop_name",
    "stn_borough",
    "stn_structure",
    "stn_gtfs_latitude",
    "stn_gtfs_longitude",
    "stn_north_direction_lbl",
    "stn_south_direction_lbl",
    "stn_ada",
    "stn_ada_northbound",
    "stn_ada_southbound",
    "stn_ada_notes",
    "stn_ingestion_date",
    "stn_source",
]

stations_final_df = stations_trans_df.select(*dim_station_columns)

In [0]:
# Persist the projection as the Delta table mta_silver.dim_station: existing
# contents are overwritten, the data is partitioned by stn_borough, and the
# writer is configured with mergeSchema=true.
(
    stations_final_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .partitionBy("stn_borough")
    .saveAsTable("mta_silver.dim_station")
)

In [0]:
%sql
-- Preview the table written by the previous cell.
SELECT
  *
FROM
  mta_silver.dim_station;

In [0]:
# End the notebook run, passing the string "Success" to dbutils.notebook.exit
# as the exit value.
dbutils.notebook.exit("Success")