In [0]:
%run "../utils/custom_functions"

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, BooleanType
from pyspark.sql.functions import *

In [0]:
# Read the raw entrances/exits JSON export. The "multiline" option is enabled
# so that JSON records spanning several lines are parsed.
entrance_exit_raw_df = (
    spark.read
    .option("multiline", "true")
    .json("/mnt/mtasubwaydl/raw/mta_entrances_and_exits.json")
)

In [0]:
# Produce one output row per element of the `data` array; each element is
# exposed in a single column named `row`.
entrance_exit_expl_df = entrance_exit_raw_df.select(explode(col("data")).alias("row"))

In [0]:
# Pull the column names out of the document's own metadata
# (`meta.view.columns.name`); they are used to label the positional values
# of each exploded `row`.
column_names_row = entrance_exit_raw_df.selectExpr("meta.view.columns.name").first()

# `first()` returns None for an empty DataFrame; fail with a clear message
# instead of an opaque "'NoneType' object is not subscriptable".
if column_names_row is None:
    raise ValueError(
        "No rows were read from the raw entrances/exits JSON; "
        "cannot extract column names from meta.view.columns.name"
    )

# Materialise the names as a plain Python list.
column_names = list(column_names_row[0])

In [0]:
# Flatten the `row` array into named columns: the value at position i of `row`
# becomes a column aliased with the i-th entry of `column_names`.
entrance_exit_flat_df = entrance_exit_expl_df.select(
    [
        col("row")[position].alias(field_name)
        for position, field_name in enumerate(column_names)
    ]
)

In [0]:
# Remove the JSON metadata columns so the schema lines up with the majority of
# the data, which is sourced from CSV.
entrance_exit_dropped_df = entrance_exit_flat_df.drop(
    "sid",
    "id",
    "position",
    "created_at",
    "created_meta",
    "updated_at",
    "updated_meta",
    "meta",
)

In [0]:
# Rename every source column to a snake_case name with the `eex_` prefix, then
# cast the station id to an integer and the entrance coordinates to doubles.
# All other columns keep the type they were read with.
entrance_exit_renamed_casted_df = (
    entrance_exit_dropped_df
    # --- renames ---
    .withColumnRenamed("Division", "eex_division")
    .withColumnRenamed("Line", "eex_line")
    .withColumnRenamed("Borough", "eex_borough")
    .withColumnRenamed("Stop Name", "eex_stop_name")
    .withColumnRenamed("Complex ID", "eex_complex_id")
    .withColumnRenamed("Constituent Station Name", "eex_constituent_station_name")
    .withColumnRenamed("Station ID", "eex_station_id")
    .withColumnRenamed("GTFS Stop ID", "eex_gtfs_stop_id")
    .withColumnRenamed("Daytime Routes", "eex_daytime_routes")
    .withColumnRenamed("Entrance Type", "eex_entrance_type")
    .withColumnRenamed("Entry Allowed", "eex_entry_allowed")
    .withColumnRenamed("Exit Allowed", "eex_exit_allowed")
    .withColumnRenamed("Entrance Latitude", "eex_entrance_latitude")
    .withColumnRenamed("Entrance Longitude", "eex_entrance_longitude")
    .withColumnRenamed("entrance_georeference", "eex_entrance_georeference")
    # --- casts ---
    .withColumn("eex_station_id", col("eex_station_id").cast("int"))
    .withColumn("eex_entrance_latitude", col("eex_entrance_latitude").cast("double"))
    .withColumn("eex_entrance_longitude", col("eex_entrance_longitude").cast("double"))
)

In [0]:
# Pass the frame through `add_ingestion_date` (defined outside this cell,
# presumably by the `%run` of custom_functions -- verify) with the "eex" alias,
# then tag every row with a constant `eex_source` of "data.gov".
entrance_exit_final_df = (
    add_ingestion_date(entrance_exit_renamed_casted_df, alias="eex")
    .withColumn("eex_source", lit("data.gov"))
)

In [0]:
# Persist the final frame as a Delta dataset partitioned by borough,
# overwriting whatever is already stored at the target path.
(
    entrance_exit_final_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("eex_borough")
    .save("/mnt/mtasubwaydl/bronze/entrance_exit")
)

# Register the Delta location in the catalog only when the table is not there
# yet, so the cell can be re-run.
if not spark.catalog.tableExists("mta_bronze.entrance_exit"):
    spark.catalog.createTable(
        tableName="mta_bronze.entrance_exit",
        source="delta",
        path="/mnt/mtasubwaydl/bronze/entrance_exit",
    )

In [0]:
%sql
-- Preview up to 10 rows of the registered bronze table.
SELECT * FROM mta_bronze.entrance_exit LIMIT 10;

eex_division,eex_line,eex_borough,eex_stop_name,eex_complex_id,eex_constituent_station_name,eex_station_id,eex_gtfs_stop_id,eex_daytime_routes,eex_entrance_type,eex_entry_allowed,eex_exit_allowed,eex_entrance_latitude,eex_entrance_longitude,eex_entrance_georeference,eex_ingestion_date,eex_source
IRT,Eastern Pky,B,Borough Hall/Court St,620,Borough Hall,415,423,2 3 4 5 R,Stair,YES,YES,40.6925551,-73.9901084,POINT (-73.9901084 40.6925551),2025-03-09T07:17:34.45Z,data.gov
IRT,Eastern Pky,B,Borough Hall/Court St,620,Borough Hall,415,423,2 3 4 5 R,Easement - Street,YES,YES,40.692299,-73.990301,POINT (-73.990301 40.692299),2025-03-09T07:17:34.45Z,data.gov
IRT,Eastern Pky,B,Borough Hall/Court St,620,Borough Hall,415,423,2 3 4 5 R,Stair,YES,YES,40.692601,-73.9905449,POINT (-73.9905449 40.692601),2025-03-09T07:17:34.45Z,data.gov
IRT,Eastern Pky,B,Borough Hall/Court St,620,Borough Hall,415,423,2 3 4 5 R,Easement - Street,YES,YES,40.69234,-73.99056,POINT (-73.99056 40.69234),2025-03-09T07:17:34.45Z,data.gov
IRT,Eastern Pky,B,Borough Hall/Court St,620,Borough Hall,415,423,2 3 4 5 R,Stair,YES,YES,40.6925156,-73.9912429,POINT (-73.9912429 40.6925156),2025-03-09T07:17:34.45Z,data.gov
IRT,Nostrand,B,Flatbush Av-Brooklyn College,359,Flatbush Av-Brooklyn College,359,247,2 5,Stair,YES,YES,40.6327597,-73.9473784,POINT (-73.9473784 40.6327597),2025-03-09T07:17:34.45Z,data.gov
IRT,Nostrand,B,Flatbush Av-Brooklyn College,359,Flatbush Av-Brooklyn College,359,247,2 5,Stair,YES,YES,40.632359,-73.947389,POINT (-73.947389 40.632359),2025-03-09T07:17:34.45Z,data.gov
IRT,Nostrand,B,Flatbush Av-Brooklyn College,359,Flatbush Av-Brooklyn College,359,247,2 5,Stair,YES,YES,40.6323382,-73.9474844,POINT (-73.9474844 40.6323382),2025-03-09T07:17:34.45Z,data.gov
IRT,Nostrand,B,Flatbush Av-Brooklyn College,359,Flatbush Av-Brooklyn College,359,247,2 5,Elevator,YES,YES,40.632463,-73.947518,POINT (-73.947518 40.632463),2025-03-09T07:17:34.45Z,data.gov
IRT,Nostrand,B,Flatbush Av-Brooklyn College,359,Flatbush Av-Brooklyn College,359,247,2 5,Stair,YES,YES,40.632869,-73.9475376,POINT (-73.9475376 40.632869),2025-03-09T07:17:34.45Z,data.gov


In [0]:
# End the notebook run, passing "Success" as its exit value.
dbutils.notebook.exit("Success")