In [0]:
%run "../utils/custom_functions"

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, BooleanType
from pyspark.sql.functions import lit

In [0]:
# Explicit schema for the MTA subway stations CSV, declared as
# (source column name, Spark type, nullable) in file column order.
# Only "ADA Notes" is declared nullable.
# NOTE(review): Spark may relax nullable=False to nullable for file-based
# reads, so these flags are probably not enforced on load — confirm.
_station_column_specs = [
    ("GTFS Stop ID", StringType(), False),
    ("Station ID", IntegerType(), False),
    ("Complex ID", IntegerType(), False),
    ("Division", StringType(), False),
    ("Line", StringType(), False),
    ("Stop Name", StringType(), False),
    ("Borough", StringType(), False),
    ("CBD", BooleanType(), False),
    ("Daytime Routes", StringType(), False),
    ("Structure", StringType(), False),
    ("GTFS Latitude", DoubleType(), False),
    ("GTFS Longitude", DoubleType(), False),
    ("North Direction Label", StringType(), False),
    ("South Direction Label", StringType(), False),
    ("ADA", IntegerType(), False),
    ("ADA Northbound", IntegerType(), False),
    ("ADA Southbound", IntegerType(), False),
    ("ADA Notes", StringType(), True),
    ("Georeference", StringType(), False),
]

station_schema = StructType(fields=[
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in _station_column_specs
])

In [0]:
# Load the raw stations CSV from the mounted data lake. The first line of the
# file is treated as a header, and the explicit station_schema is applied
# instead of letting Spark infer column types.
station_df = spark.read.csv(
    "/mnt/mtasubwaydl/raw/mta_subway_stations.csv",
    schema=station_schema,
    header=True,
)

In [0]:
# Source CSV header -> bronze column name. Every target is snake_case and
# carries the "stn_" prefix.
station_column_renames = {
    "GTFS Stop ID": "stn_gtfs_stop_id",
    "Station ID": "stn_id",
    "Complex ID": "stn_complex_id",
    "Division": "stn_division",
    "Line": "stn_line",
    "Stop Name": "stn_stop_name",
    "Borough": "stn_borough",
    "CBD": "stn_cbd",
    "Daytime Routes": "stn_daytime_routes",
    "Structure": "stn_structure",
    "GTFS Latitude": "stn_gtfs_latitude",
    "GTFS Longitude": "stn_gtfs_longitude",
    "North Direction Label": "stn_north_direction_lbl",
    "South Direction Label": "stn_south_direction_lbl",
    "ADA": "stn_ada",
    "ADA Northbound": "stn_ada_northbound",
    "ADA Southbound": "stn_ada_southbound",
    "ADA Notes": "stn_ada_notes",
    "Georeference": "stn_georeference",
}

# Start from the raw frame each time so re-running this cell is idempotent,
# then apply the renames one by one in mapping order.
station_renamed_df = station_df
for source_name, target_name in station_column_renames.items():
    station_renamed_df = station_renamed_df.withColumnRenamed(source_name, target_name)

In [0]:
# add_ingestion_date is not defined in this notebook; presumably it comes from
# the utilities notebook pulled in by %run at the top — confirm. The table
# preview further down shows a stn_ingestion_date column after this step.
station_ingested_df = add_ingestion_date(station_renamed_df, alias="stn")

# Tag every row with a constant marker identifying the data's origin.
station_final_df = station_ingested_df.withColumn("stn_source", lit("data.gov"))

In [0]:
# Storage location and metastore name of the bronze station table.
station_delta_path = "/mnt/mtasubwaydl/bronze/station"
station_table_name = "mta_bronze.station"

# Replace the Delta data at the bronze path with this run's output,
# partitioned by borough.
(
    station_final_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("stn_borough")
    .save(station_delta_path)
)

# Register the table over the written path only when it is not already known
# to the catalog; an existing registration is left untouched.
table_registered = spark.catalog.tableExists(station_table_name)
if not table_registered:
    spark.catalog.createTable(
        tableName=station_table_name,
        path=station_delta_path,
        source="delta",
    )

In [0]:
%sql
-- Sanity check: preview ten rows of the bronze station table.
-- There is no ORDER BY, so which ten rows come back is not guaranteed.
SELECT *
FROM mta_bronze.station
LIMIT 10;

stn_gtfs_stop_id,stn_id,stn_complex_id,stn_division,stn_line,stn_stop_name,stn_borough,stn_cbd,stn_daytime_routes,stn_structure,stn_gtfs_latitude,stn_gtfs_longitude,stn_north_direction_lbl,stn_south_direction_lbl,stn_ada,stn_ada_northbound,stn_ada_southbound,stn_ada_notes,stn_georeference,stn_ingestion_date,stn_source
S18,514,514,SIR,Staten Island,Eltingville,SI,False,SIR,Embankment,40.544601,-74.16457,Ferry,South Shore,0,0,0,,POINT (-74.16457 40.544601),2025-03-09T07:18:46.888Z,data.gov
S28,504,504,SIR,Staten Island,Clifton,SI,False,SIR,Elevated,40.621319,-74.071402,Ferry,South Shore,0,0,0,,POINT (-74.071402 40.621319),2025-03-09T07:18:46.888Z,data.gov
S13,519,519,SIR,Staten Island,Richmond Valley,SI,False,SIR,Open Cut,40.519631,-74.229141,Ferry,Tottenville,0,0,0,,POINT (-74.229141 40.519631),2025-03-09T07:18:46.888Z,data.gov
S24,508,508,SIR,Staten Island,Jefferson Av,SI,False,SIR,Embankment,40.583591,-74.103338,Ferry,South Shore,0,0,0,,POINT (-74.103338 40.583591),2025-03-09T07:18:46.888Z,data.gov
S31,501,501,SIR,Staten Island,St George,SI,False,SIR,Open Cut,40.643748,-74.073643,Last Stop,South Shore,1,1,1,,POINT (-74.073643 40.643748),2025-03-09T07:18:46.888Z,data.gov
S14,518,518,SIR,Staten Island,Pleasant Plains,SI,False,SIR,Embankment,40.52241,-74.217847,Ferry,South Shore,0,0,0,,POINT (-74.217847 40.52241),2025-03-09T07:18:46.888Z,data.gov
S19,513,513,SIR,Staten Island,Great Kills,SI,False,SIR,Open Cut,40.551231,-74.151399,Ferry,South Shore,1,1,1,,POINT (-74.151399 40.551231),2025-03-09T07:18:46.888Z,data.gov
S23,509,509,SIR,Staten Island,Grant City,SI,False,SIR,Open Cut,40.578965,-74.109704,Ferry,South Shore,0,0,0,,POINT (-74.109704 40.578965),2025-03-09T07:18:46.888Z,data.gov
S09,522,522,SIR,Staten Island,Tottenville,SI,False,SIR,At Grade,40.512764,-74.251961,Ferry,Last Stop,1,1,1,,POINT (-74.251961 40.512764),2025-03-09T07:18:46.888Z,data.gov
S27,505,505,SIR,Staten Island,Grasmere,SI,False,SIR,Open Cut,40.603117,-74.084087,Ferry,South Shore,0,0,0,,POINT (-74.084087 40.603117),2025-03-09T07:18:46.888Z,data.gov


In [0]:
# End the notebook run and hand this status string back to whatever invoked it.
exit_status = "Success"
dbutils.notebook.exit(exit_status)