In [0]:
%run "../utils/custom_functions"

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, BooleanType
from pyspark.sql.functions import lit

In [0]:
# Explicit schema for the MTA subway stations CSV, declared as
# (column name, Spark type, nullable) rows in the order the fields are defined.
# Only "ADA Notes" is declared nullable.
# NOTE(review): Spark may relax nullable=False to nullable=True for schemas
# applied to file-based sources — confirm before relying on these flags as
# a not-null check.
station_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in [
        ("GTFS Stop ID", StringType(), False),
        ("Station ID", IntegerType(), False),
        ("Complex ID", IntegerType(), False),
        ("Division", StringType(), False),
        ("Line", StringType(), False),
        ("Stop Name", StringType(), False),
        ("Borough", StringType(), False),
        ("CBD", BooleanType(), False),
        ("Daytime Routes", StringType(), False),
        ("Structure", StringType(), False),
        ("GTFS Latitude", DoubleType(), False),
        ("GTFS Longitude", DoubleType(), False),
        ("North Direction Label", StringType(), False),
        ("South Direction Label", StringType(), False),
        ("ADA", IntegerType(), False),
        ("ADA Northbound", IntegerType(), False),
        ("ADA Southbound", IntegerType(), False),
        ("ADA Notes", StringType(), True),
        ("Georeference", StringType(), False),
    ]
])

In [0]:
# Load the raw stations CSV with the explicit schema; header=True tells the
# reader that the first line of the file holds column names, not data.
station_df = spark.read.csv(
    "/mnt/mtasubwaydl/raw/mta_subway_stations.csv",
    schema=station_schema,
    header=True,
)

In [0]:
# Rename each source column to its snake_case, "stn_"-prefixed equivalent.
# The mapping is applied one column at a time, starting from station_df, so
# re-running this cell always yields the same result.
station_renamed_df = station_df
for source_name, target_name in {
    "GTFS Stop ID": "stn_gtfs_stop_id",
    "Station ID": "stn_id",
    "Complex ID": "stn_complex_id",
    "Division": "stn_division",
    "Line": "stn_line",
    "Stop Name": "stn_stop_name",
    "Borough": "stn_borough",
    "CBD": "stn_cbd",
    "Daytime Routes": "stn_daytime_routes",
    "Structure": "stn_structure",
    "GTFS Latitude": "stn_gtfs_latitude",
    "GTFS Longitude": "stn_gtfs_longitude",
    "North Direction Label": "stn_north_direction_lbl",
    "South Direction Label": "stn_south_direction_lbl",
    "ADA": "stn_ada",
    "ADA Northbound": "stn_ada_northbound",
    "ADA Southbound": "stn_ada_southbound",
    "ADA Notes": "stn_ada_notes",
    "Georeference": "stn_georeference",
}.items():
    station_renamed_df = station_renamed_df.withColumnRenamed(source_name, target_name)

In [0]:
# Pass the renamed frame through the shared add_ingestion_date helper
# (presumably adds the "stn_ingestion_date" column — confirm in
# ../utils/custom_functions), then tag every row with a constant source label.
station_final_df = (
    add_ingestion_date(station_renamed_df, alias="stn")
    .withColumn("stn_source", lit("data.gov"))
)

In [0]:
# Write the station data to the bronze location in Delta format, replacing
# whatever is already stored there and partitioning the output by borough.
station_final_df.write.save(
    "/mnt/mtasubwaydl/bronze/station",
    format="delta",
    mode="overwrite",
    partitionBy="stn_borough",
)

# Register the Delta location in the catalog only when the table is not
# already known; an existing registration is left untouched.
if not spark.catalog.tableExists("mta_bronze.station"):
    spark.catalog.createTable(
        tableName="mta_bronze.station",
        path="/mnt/mtasubwaydl/bronze/station",
        source="delta",
    )

In [0]:
%sql
-- Preview up to 20 rows of the registered bronze station table.
SELECT *
FROM mta_bronze.station
LIMIT 20;

stn_gtfs_stop_id,stn_id,stn_complex_id,stn_division,stn_line,stn_stop_name,stn_borough,stn_cbd,stn_daytime_routes,stn_structure,stn_gtfs_latitude,stn_gtfs_longitude,stn_north_direction_lbl,stn_south_direction_lbl,stn_ada,stn_ada_northbound,stn_ada_southbound,stn_ada_notes,stn_georeference,stn_ingestion_date,stn_source
J15,83,83,BMT,Jamaica,Woodhaven Blvd,Q,False,J Z,Elevated,40.693879,-73.851576,Jamaica,Manhattan,1,1,1,,POINT (-73.851576 40.693879),2025-03-09T06:42:58.45Z,data.gov
718,461,461,IRT,Flushing,Queensboro Plaza,Q,False,7,Elevated,40.750582,-73.940202,Outbound,Manhattan,1,1,1,,POINT (-73.940202 40.750582),2025-03-09T06:42:58.45Z,data.gov
H12,200,200,IND,Rockaway,Beach 90 St,Q,False,A S,Viaduct,40.588034,-73.813641,Inbound,Rockaway Park,0,0,0,,POINT (-73.813641 40.588034),2025-03-09T06:42:58.45Z,data.gov
G10,263,263,IND,Queens Blvd,63 Dr-Rego Park,Q,False,M R,Subway,40.729846,-73.861604,Outbound,Manhattan,0,0,0,,POINT (-73.861604 40.729846),2025-03-09T06:42:58.45Z,data.gov
712,456,456,IRT,Flushing,61 St-Woodside,Q,False,7,Elevated,40.74563,-73.902984,Outbound,Manhattan,1,1,1,,POINT (-73.902984 40.74563),2025-03-09T06:42:58.45Z,data.gov
A65,195,195,IND,Liberty Av,Ozone Park-Lefferts Blvd,Q,False,A,Elevated,40.685951,-73.825798,Manhattan,Last Stop,1,1,1,,POINT (-73.825798 40.685951),2025-03-09T06:42:58.45Z,data.gov
721,464,464,IRT,Flushing,Vernon Blvd-Jackson Av,Q,False,7,Subway,40.742626,-73.953581,Outbound,Manhattan,0,0,0,,POINT (-73.953581 40.742626),2025-03-09T06:42:58.45Z,data.gov
706,450,450,IRT,Flushing,103 St-Corona Plaza,Q,False,7,Elevated,40.749865,-73.8627,Outbound,Manhattan,0,0,0,,POINT (-73.8627 40.749865),2025-03-09T06:42:58.45Z,data.gov
R03,2,2,BMT,Astoria,Astoria Blvd,Q,False,N W,Elevated,40.770258,-73.917843,Astoria,Manhattan,1,1,1,,POINT (-73.917843 40.770258),2025-03-09T06:42:58.45Z,data.gov
G14,267,616,IND,Queens Blvd,Jackson Hts-Roosevelt Av,Q,False,E F M R,Subway,40.746644,-73.891338,Outbound,Manhattan,1,1,1,,POINT (-73.891338 40.746644),2025-03-09T06:42:58.45Z,data.gov


In [0]:
# End the notebook run, reporting the string "Success" as its exit value
# (presumably read by an orchestrating notebook or job — confirm with the caller).
dbutils.notebook.exit("Success")