In [0]:
%run "../utils/custom_functions"

In [0]:
from pyspark.sql.types import DateType
from pyspark.sql.functions import md5, concat_ws, col, initcap, when

In [0]:
# Incident records to be transformed, read from the bronze schema.
major_incident_df = spark.read.table("mta_bronze.major_incident")

# Silver-schema dimension tables; the transform cell joins the incidents to
# these to pick up the line and incident-category columns.
dim_lines_df = spark.read.table("mta_silver.dim_line")
dim_inc_category_df = spark.read.table("mta_silver.dim_inc_category")

In [0]:
# Columns whose values are joined with "_" and MD5-hashed to build inc_sk.
incident_key_columns = ["inc_month", "inc_division", "inc_line", "inc_day_type", "inc_category"]

# Both joins are inner joins: an incident row is kept only when its line and
# its category each have a matching row in the respective dimension table.
incidents_with_dims_df = (
    major_incident_df
    .join(
        dim_lines_df,
        on=major_incident_df["inc_line"] == dim_lines_df["lin_nk"],
        how="inner",
    )
    .join(
        dim_inc_category_df,
        on=major_incident_df["inc_category"] == dim_inc_category_df["ict_category"],
        how="inner",
    )
)

major_incidents_transform_df = (
    incidents_with_dims_df
    # The key is derived first, so it hashes the column values as read from
    # the source, before any of the reformatting steps below touch them.
    .withColumn("inc_sk", md5(concat_ws("_", *incident_key_columns)))
    .withColumn("inc_division", initcap(col("inc_division")))
    # inc_month is re-expressed as a DATE column named dte_sk, then removed.
    .withColumn("dte_sk", col("inc_month").cast(DateType()))
    .drop("inc_month")
    # Day type 1 is labelled "Weekday"; every other value becomes "Weekend".
    .withColumn(
        "inc_day_type",
        when(col("inc_day_type") == 1, "Weekday").otherwise("Weekend"),
    )
    # Keep a single row per surrogate key.
    .dropDuplicates(["inc_sk"])
)

In [0]:
# Columns of the fact table, in output order: the incident surrogate key, the
# keys taken from the line / category dimensions and the date column, the
# incident count, and the ingestion-date and source columns.
fact_columns = [
    "inc_sk",
    "lin_sk",
    "ict_sk",
    "dte_sk",
    "inc_count",
    "inc_ingestion_date",
    "inc_source",
]

major_incident_final_df = major_incidents_transform_df.select(*fact_columns)

In [0]:
# Target of this notebook: the major-incident fact table in the silver schema.
target_db = "mta_silver"
target_table = "fct_major_incident"
target_full_name = f"{target_db}.{target_table}"

# Use the public Catalog API for the existence check instead of reaching into
# the private py4j handle (spark._jsparkSession), which is not a supported
# interface and is not available on Spark Connect sessions.
# NOTE: passing a "db.table" qualified name here requires Spark 3.4 or later.
if not spark.catalog.tableExists(target_full_name):
    # First run: create the Delta table, partitioned by the date key.
    major_incident_final_df.write \
        .mode("overwrite") \
        .format("delta") \
        .partitionBy("dte_sk") \
        .option("mergeSchema", "true") \
        .saveAsTable(target_full_name)
else:
    # Table already exists: hand the frame to merge_delta_data (brought in by
    # the %run of custom_functions) with inc_sk as the match key.
    merge_delta_data(
        major_incident_final_df,
        db_name=target_db,
        table_name=target_table,
        merge_condition="tgt.inc_sk = src.inc_sk",
        partition_column="dte_sk",
    )

In [0]:
%sql
-- Spot check: show up to 10 rows of the fact table written by the preceding cell.
SELECT * FROM mta_silver.fct_major_incident LIMIT 10;

inc_sk,lin_sk,ict_sk,dte_sk,inc_count,inc_ingestion_date,inc_source
00bad539aec1c157891ef4805b273acb,c81e728d9d4c2f636f067f89cc14862c,cadd2b2ad06d8a0caee658e3c05e615a,2015-04-01,1,2025-03-09T07:18:20.256Z,data.gov
09ff33c97fb597aa7f42775d187f7bb7,743bda3a6098df4383907d7c772fca9c,92ea731d3af6677905303c88689f5d55,2015-04-01,1,2025-03-09T07:18:20.256Z,data.gov
1d21c39cbeca2b6798a0294fea2492d0,eccbc87e4b5ce2fe28308fd9f2a7baf3,92ea731d3af6677905303c88689f5d55,2015-04-01,1,2025-03-09T07:18:20.256Z,data.gov
1e696fe50f75feb5dba9d7893228bd9b,1679091c5a880faf6fb5e6087eb1b2dc,97622cf2e8771871841450151d8f6c3b,2015-04-01,1,2025-03-09T07:18:20.256Z,data.gov
29a4d7a753b4fc1b4464612709c26d2e,f09564c9ca56850d4cd6b3319e541aee,cadd2b2ad06d8a0caee658e3c05e615a,2015-04-01,1,2025-03-09T07:18:20.256Z,data.gov
3c184868736471f186fa05cae2f32101,1679091c5a880faf6fb5e6087eb1b2dc,5be0ee9a2a4d1ffddc897625771606ab,2015-04-01,1,2025-03-09T07:18:20.256Z,data.gov
3ca21362601b5b6ff936710bb6801678,d20caec3b48a1eef164cb4ca81ba2587,92ea731d3af6677905303c88689f5d55,2015-04-01,1,2025-03-09T07:18:20.256Z,data.gov
419f8558da87355b90b7ff8c3ec87383,3a3ea00cfc35332cedf6e5e9a32e94da,5be0ee9a2a4d1ffddc897625771606ab,2015-04-01,1,2025-03-09T07:18:20.256Z,data.gov
463402e39fba1013cc84837b214e0c09,e1e1d3d40573127e9ee0480caf1283d6,92ea731d3af6677905303c88689f5d55,2015-04-01,1,2025-03-09T07:18:20.256Z,data.gov
487eec14d084afc1f169c06da53cab27,1679091c5a880faf6fb5e6087eb1b2dc,92ea731d3af6677905303c88689f5d55,2015-04-01,1,2025-03-09T07:18:20.256Z,data.gov


In [0]:
# End the notebook run, returning the string "Success" as its exit value.
dbutils.notebook.exit("Success")