In [0]:
%run "../utils/custom_functions"

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, BooleanType
from pyspark.sql.functions import lit, explode, col, year

In [0]:
# Load the raw MTA major-incidents XML export from the data lake.
# rowTag="row" selects <row> elements as records; attributePrefix="" keeps
# XML attribute names unprefixed (see the Spark XML data source options).
xml_reader = (
    spark.read
    .option("rowTag", "row")
    .option("attributePrefix", "")
)
major_incident_df = xml_reader.xml("/mnt/mtasubwaydl/raw/mta_major_incidents.xml")

In [0]:
# Replace the nested "row" column with one output record per contained element.
# NOTE(review): presumably "row" is an array of structs (the next cell reads
# fields such as row.month) -- confirm against the parsed schema.
row_elements = explode(col("row"))
major_incident_expl_df = major_incident_df.withColumn("row", row_elements)

In [0]:
# Flatten the nested "row" struct: each source field becomes a top-level
# column carrying the "inc_" prefix. Dict insertion order fixes column order.
column_aliases = {
    "row.month": "inc_month",
    "row.division": "inc_division",
    "row.line": "inc_line",
    "row.day_type": "inc_day_type",
    "row.category": "inc_category",
    "row.count": "inc_count",
}

major_incident_flat_renamed_df = major_incident_expl_df.select(
    [col(source_field).alias(target_name) for source_field, target_name in column_aliases.items()]
)

In [0]:
# add_ingestion_date comes from ../utils/custom_functions (loaded via %run).
# NOTE(review): presumably it appends an ingestion-date column named with the
# given alias prefix -- confirm in the helper notebook.
with_ingestion_df = add_ingestion_date(major_incident_flat_renamed_df, alias="inc")

# Tag the data source and cast the numeric fields to integers.
major_incident_final_df = (
    with_ingestion_df
    .withColumn("inc_source", lit("data.gov"))
    .withColumn("inc_day_type", col("inc_day_type").cast(IntegerType()))
    .withColumn("inc_count", col("inc_count").cast(IntegerType()))
)

In [0]:
# Storage location and catalog name of the bronze table.
bronze_path = "/mnt/mtasubwaydl/bronze/major_incident"
bronze_table = "mta_bronze.major_incident"

# Overwrite the Delta data at the bronze path, partitioned by incident month.
(
    major_incident_final_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("inc_month")
    .save(bronze_path)
)

# Register the table over that path only if the catalog does not know it yet.
if not spark.catalog.tableExists(bronze_table):
    spark.catalog.createTable(tableName=bronze_table, path=bronze_path, source="delta")

In [0]:
%sql
-- Sanity check: preview up to 20 rows of the bronze table.
SELECT *
FROM mta_bronze.major_incident
LIMIT 20;

In [0]:
# End the notebook run and hand this status string to dbutils.notebook.exit.
exit_status = "Success"
dbutils.notebook.exit(exit_status)