In [0]:
%run "../utils/custom_functions"

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, BooleanType
from pyspark.sql.functions import lit

In [0]:
# Explicit schema for the stations CSV, declared as (column name, Spark type,
# nullable) triples. Only "ADA Notes" is declared nullable.
# NOTE(review): the schema is supplied to the CSV reader instead of being
# inferred, so the column order here presumably has to match the file — confirm.
station_schema = StructType(fields=[
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        # Identifiers
        ("GTFS Stop ID", StringType(), False),
        ("Station ID", IntegerType(), False),
        ("Complex ID", IntegerType(), False),
        # Descriptive attributes
        ("Division", StringType(), False),
        ("Line", StringType(), False),
        ("Stop Name", StringType(), False),
        ("Borough", StringType(), False),
        ("CBD", BooleanType(), False),
        ("Daytime Routes", StringType(), False),
        ("Structure", StringType(), False),
        # Location
        ("GTFS Latitude", DoubleType(), False),
        ("GTFS Longitude", DoubleType(), False),
        ("North Direction Label", StringType(), False),
        ("South Direction Label", StringType(), False),
        # ADA columns
        ("ADA", IntegerType(), False),
        ("ADA Northbound", IntegerType(), False),
        ("ADA Southbound", IntegerType(), False),
        ("ADA Notes", StringType(), True),
        ("Georeference", StringType(), False),
    )
])

In [0]:
# Load the raw stations CSV from the mounted data lake. The first line of the
# file is treated as a header and the explicit station_schema is applied
# rather than letting Spark infer column types.
station_df = spark.read.csv(
    "/mnt/mtasubwaydl/raw/mta_subway_stations.csv",
    schema=station_schema,
    header=True,
)

In [0]:
# Source column name -> bronze column name. Every target is snake_case and
# carries the "stn_" prefix.
station_column_renames = {
    "GTFS Stop ID": "stn_gtfs_stop_id",
    "Station ID": "stn_id",
    "Complex ID": "stn_complex_id",
    "Division": "stn_division",
    "Line": "stn_line",
    "Stop Name": "stn_stop_name",
    "Borough": "stn_borough",
    "CBD": "stn_cbd",
    "Daytime Routes": "stn_daytime_routes",
    "Structure": "stn_structure",
    "GTFS Latitude": "stn_gtfs_latitude",
    "GTFS Longitude": "stn_gtfs_longitude",
    "North Direction Label": "stn_north_direction_lbl",
    "South Direction Label": "stn_south_direction_lbl",
    "ADA": "stn_ada",
    "ADA Northbound": "stn_ada_northbound",
    "ADA Southbound": "stn_ada_southbound",
    "ADA Notes": "stn_ada_notes",
    "Georeference": "stn_georeference",
}

# Apply the renames one at a time, in the order listed above.
station_renamed_df = station_df
for source_column, bronze_column in station_column_renames.items():
    station_renamed_df = station_renamed_df.withColumnRenamed(source_column, bronze_column)

In [0]:
# add_ingestion_date comes from the ../utils/custom_functions notebook pulled
# in via %run. NOTE(review): presumably it appends an ingestion-date column
# named using the "stn" alias — confirm against the helper's definition.
station_with_ingestion_df = add_ingestion_date(station_renamed_df, alias="stn")

# Tag every row with a constant source label.
station_final_df = station_with_ingestion_df.withColumn("stn_source", lit("data.gov"))

In [0]:
# Location and catalog name of the bronze station table.
bronze_station_path = "/mnt/mtasubwaydl/bronze/station"
bronze_station_table = "mta_bronze.station"

# Replace whatever is currently stored at the bronze path with this run's
# data, written as Delta and partitioned by borough.
(
    station_final_df.write
    .mode("overwrite")
    .format("delta")
    .partitionBy("stn_borough")
    .save(bronze_station_path)
)

# Register the Delta location in the catalog only if the table is not
# already there; later runs just overwrite the files at the same path.
station_table_registered = spark.catalog.tableExists(bronze_station_table)
if not station_table_registered:
    spark.catalog.createTable(
        tableName=bronze_station_table,
        path=bronze_station_path,
        source="delta",
    )

In [0]:
%sql
-- Sanity check: preview up to 20 rows of the registered bronze station table.
SELECT *
FROM mta_bronze.station
LIMIT 20;

In [0]:
# End the notebook run, passing "Success" as the exit value.
# NOTE(review): presumably a parent/orchestrator notebook checks this string
# to decide whether ingestion succeeded — confirm against the caller.
dbutils.notebook.exit("Success")