In [0]:
from pyspark.sql.functions import coalesce, col, concat_ws, md5
from pyspark.sql.types import BooleanType

In [0]:
# Load the three inputs used to build the station dimension.

# Stations from the bronze layer.
stations_df = spark.read.table("mta_bronze.station")

# Borough dimension, joined to stations on brh_nk in the transform cell.
dim_boroughs_df = spark.read.table("mta_silver.dim_borough")

# Station lookup, joined to stations on lks_abbr in the transform cell.
lkp_stations_df = spark.read.table("mta_silver.lkp_station")

In [0]:
# Build the silver-layer station rows from the bronze stations:
#   * replace the borough key and the abbreviated stop name with the values
#     from their lookup tables,
#   * derive the surrogate key stn_sk,
#   * cast the three ADA flags to booleans.
# Both joins are LEFT joins, so every bronze station row is kept even when a
# lookup has no matching entry.
# Assumes the lookup tables expose only brh_* / lks_* columns, so the stn_*
# references below are unambiguous — TODO confirm against the table schemas.
stations_trans_df = (
    stations_df
    .join(dim_boroughs_df, stations_df.stn_borough == dim_boroughs_df.brh_nk, "left")
    .join(lkp_stations_df, stations_df.stn_stop_name == lkp_stations_df.lks_abbr, "left")
    # After a left join, brh_name / lks_name are NULL for stations without a
    # lookup match. Fall back to the original bronze value instead of
    # overwriting it with NULL, so no station loses its borough or stop name.
    .withColumn("stn_borough", coalesce(col("brh_name"), col("stn_borough")))
    .withColumn("stn_stop_name", coalesce(col("lks_name"), col("stn_stop_name")))
    # Surrogate key: MD5 of "<stn_gtfs_stop_id>_<stn_id>".
    # NOTE(review): concat_ws skips NULL inputs, so a NULL id would still
    # yield a (shorter) key rather than NULL — confirm both ids are populated.
    .withColumn("stn_sk", md5(concat_ws("_", col("stn_gtfs_stop_id"), col("stn_id"))))
    # NOTE(review): the result of these casts depends on the bronze column
    # types (string vs. numeric) — confirm the source encoding of the flags.
    .withColumn("stn_ada", col("stn_ada").cast(BooleanType()))
    .withColumn("stn_ada_northbound", col("stn_ada_northbound").cast(BooleanType()))
    .withColumn("stn_ada_southbound", col("stn_ada_southbound").cast(BooleanType()))
)

In [0]:
# Project the transformed stations onto the final column set, in output order.
# Helper columns brought in by the joins are dropped simply by not being
# selected.
stations_final_df = stations_trans_df.select([
    # Keys: surrogate key first, then the source id exposed as the natural key.
    col("stn_sk"),
    col("stn_id").alias("stn_nk"),
    col("stn_gtfs_stop_id"),
    # Descriptive attributes.
    col("stn_division"),
    col("stn_stop_name"),
    col("stn_borough"),
    col("stn_structure"),
    col("stn_gtfs_latitude"),
    col("stn_gtfs_longitude"),
    col("stn_north_direction_lbl"),
    col("stn_south_direction_lbl"),
    # ADA flags (cast to boolean in the transform cell) and notes.
    col("stn_ada"),
    col("stn_ada_northbound"),
    col("stn_ada_southbound"),
    col("stn_ada_notes"),
    # Ingestion metadata.
    col("stn_ingestion_date"),
    col("stn_source"),
])

In [0]:
# Persist the station dimension as a Delta table, replacing its current
# contents on every run and partitioning the data by borough.
# NOTE(review): mergeSchema is expected to cover additive schema changes only;
# an incompatible schema or partitioning change would presumably need
# overwriteSchema instead — confirm against the Delta Lake documentation.
(
    stations_final_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .partitionBy("stn_borough")
    .saveAsTable("mta_silver.dim_station")
)

In [0]:
%sql
-- Sanity check: preview ten rows of the freshly written station dimension.
SELECT *
FROM mta_silver.dim_station
LIMIT 10;

stn_sk,stn_nk,stn_gtfs_stop_id,stn_division,stn_stop_name,stn_borough,stn_structure,stn_gtfs_latitude,stn_gtfs_longitude,stn_north_direction_lbl,stn_south_direction_lbl,stn_ada,stn_ada_northbound,stn_ada_southbound,stn_ada_notes,stn_ingestion_date,stn_source
c64eb990beffbc3efa5eba4734cbef5f,83,J15,BMT,Woodhaven Boulevard,Queens,Elevated,40.693879,-73.851576,Jamaica,Manhattan,True,True,True,,2025-03-09T07:18:46.888Z,data.gov
021e260a93b31dfe25fdf0e6e32d259c,461,718,IRT,Queensboro Plaza,Queens,Elevated,40.750582,-73.940202,Outbound,Manhattan,True,True,True,,2025-03-09T07:18:46.888Z,data.gov
d9f3db64cbb1b34ac8e18d1199619a70,200,H12,IND,Beach 90th Street,Queens,Viaduct,40.588034,-73.813641,Inbound,Rockaway Park,False,False,False,,2025-03-09T07:18:46.888Z,data.gov
1db2272d25df1ee771b91f578e6badc9,263,G10,IND,63rd Drive-Rego Park,Queens,Subway,40.729846,-73.861604,Outbound,Manhattan,False,False,False,,2025-03-09T07:18:46.888Z,data.gov
0a2836833e09a76e3d42a82c4ffd7714,456,712,IRT,61st Street-Woodside,Queens,Elevated,40.74563,-73.902984,Outbound,Manhattan,True,True,True,,2025-03-09T07:18:46.888Z,data.gov
522c173bf3669a2745f9cfa06856ebb9,195,A65,IND,Ozone Park-Lefferts Boulevard,Queens,Elevated,40.685951,-73.825798,Manhattan,Last Stop,True,True,True,,2025-03-09T07:18:46.888Z,data.gov
91eb445190fdea70eea263b2453805a5,464,721,IRT,Vernon Boulevard-Jackson Avenue,Queens,Subway,40.742626,-73.953581,Outbound,Manhattan,False,False,False,,2025-03-09T07:18:46.888Z,data.gov
4ef9db701e4ae5f0dc165eeaff066a0b,450,706,IRT,103rd Street-Corona Plaza,Queens,Elevated,40.749865,-73.8627,Outbound,Manhattan,False,False,False,,2025-03-09T07:18:46.888Z,data.gov
2533b5d6708a9cfdc4626ae1ba9fb11d,2,R03,BMT,Astoria Boulevard,Queens,Elevated,40.770258,-73.917843,Astoria,Manhattan,True,True,True,,2025-03-09T07:18:46.888Z,data.gov
41972977409498617f22923d6bdd5a46,267,G14,IND,Jackson Heights-Roosevelt Avenue,Queens,Subway,40.746644,-73.891338,Outbound,Manhattan,True,True,True,,2025-03-09T07:18:46.888Z,data.gov


In [0]:
# End the notebook run here and hand the string "Success" back as its exit
# value (presumably read by an orchestrating notebook or job — verify caller).
dbutils.notebook.exit("Success")