In [0]:
%run "../utils/custom_functions"

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, BooleanType
from pyspark.sql.functions import lit, explode, col, year

In [0]:
# Load the raw major-incidents XML export. Each <row> element becomes one
# record, and XML attributes are exposed without any name prefix.
major_incident_df = (
    spark.read
    .options(rowTag="row", attributePrefix="")
    .xml("/mnt/mtasubwaydl/raw/mta_major_incidents.xml")
)

In [0]:
# Unnest the `row` column: explode() yields one output record per element and
# the result is written back under the same column name.
# NOTE(review): assumes `row` was read as an array of structs - confirm against the inferred schema.
major_incident_expl_df = major_incident_df.withColumn(
    "row",
    explode("row"),
)

In [0]:
# Nested source field -> flat, `inc_`-prefixed output column.
# Dict insertion order determines the column order of the result.
incident_column_aliases = {
    "row.month": "inc_month",
    "row.division": "inc_division",
    "row.line": "inc_line",
    "row.day_type": "inc_day_type",
    "row.category": "inc_category",
    "row.count": "inc_count",
}

# Pull each nested field out of the `row` struct and rename it in one pass.
major_incident_flat_renamed_df = major_incident_expl_df.select(
    [
        col(source_field).alias(flat_name)
        for source_field, flat_name in incident_column_aliases.items()
    ]
)

In [0]:
# add_ingestion_date comes from ../utils/custom_functions (pulled in via %run).
# NOTE(review): presumably appends an `inc_ingestion_date` column (that name
# appears in the table preview further down) - confirm in the helper notebook.
stamped_incident_df = add_ingestion_date(major_incident_flat_renamed_df, alias="inc")

# Tag every record with its data source and cast the two numeric fields to integers.
integer_type = IntegerType()
major_incident_final_df = (
    stamped_incident_df
    .withColumn("inc_source", lit("data.gov"))
    .withColumn("inc_day_type", col("inc_day_type").cast(integer_type))
    .withColumn("inc_count", col("inc_count").cast(integer_type))
)

In [0]:
# Storage location and catalog name of the bronze table; each is used twice below.
bronze_path = "/mnt/mtasubwaydl/bronze/major_incident"
bronze_table = "mta_bronze.major_incident"

# Write the final frame as Delta in overwrite mode, partitioned by inc_month.
(
    major_incident_final_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("inc_month")
    .save(bronze_path)
)

# Register the table over that path only when the catalog does not know it yet.
table_is_registered = spark.catalog.tableExists(bronze_table)
if not table_is_registered:
    spark.catalog.createTable(
        tableName=bronze_table,
        path=bronze_path,
        source="delta",
    )

In [0]:
%sql
-- Sanity check: preview up to ten rows of the bronze table registered above.
SELECT * FROM mta_bronze.major_incident LIMIT 10;

inc_month,inc_division,inc_line,inc_day_type,inc_category,inc_count,inc_ingestion_date,inc_source
2015-08-01T00:00:00Z,B DIVISION,S Rock,2,Other,1,2025-03-09T07:18:20.256Z,data.gov
2015-08-01T00:00:00Z,B DIVISION,R,2,Signals,1,2025-03-09T07:18:20.256Z,data.gov
2015-08-01T00:00:00Z,B DIVISION,R,1,Subway Car,1,2025-03-09T07:18:20.256Z,data.gov
2015-08-01T00:00:00Z,B DIVISION,R,1,Signals,2,2025-03-09T07:18:20.256Z,data.gov
2015-08-01T00:00:00Z,B DIVISION,R,1,Other,1,2025-03-09T07:18:20.256Z,data.gov
2015-08-01T00:00:00Z,B DIVISION,Q,1,Signals,1,2025-03-09T07:18:20.256Z,data.gov
2015-08-01T00:00:00Z,B DIVISION,N,1,Track,1,2025-03-09T07:18:20.256Z,data.gov
2015-08-01T00:00:00Z,B DIVISION,N,1,Stations and Structure,1,2025-03-09T07:18:20.256Z,data.gov
2015-08-01T00:00:00Z,B DIVISION,N,1,Signals,1,2025-03-09T07:18:20.256Z,data.gov
2015-08-01T00:00:00Z,B DIVISION,N,1,Other,1,2025-03-09T07:18:20.256Z,data.gov


In [0]:
# End the notebook run here and hand the string "Success" back as its exit
# value (e.g. to a parent notebook or job that invoked this one).
dbutils.notebook.exit("Success")