In [0]:
%run "../utils/custom_functions"

In [0]:
from pyspark.sql.types import DateType
from pyspark.sql.functions import md5, concat_ws, col, initcap, when

In [0]:
# Bronze-layer source rows for major incidents.
major_incident_df = spark.read.table("mta_bronze.major_incident")

# Silver-layer dimension tables that the incidents are joined against
# in the transform step.
dim_lines_df = spark.read.table("mta_silver.dim_line")
dim_inc_category_df = spark.read.table("mta_silver.dim_inc_category")

In [0]:
# Enrich the bronze incidents with both dimensions, derive the surrogate and
# date keys, normalise two descriptive columns, and keep one row per key.
major_incidents_transform_df = (
    major_incident_df
    # Inner joins: incidents without a matching line or category are dropped.
    .join(
        dim_lines_df,
        on=major_incident_df.inc_line == dim_lines_df.lin_nk,
        how="inner",
    )
    .join(
        dim_inc_category_df,
        on=major_incident_df.inc_category == dim_inc_category_df.ict_category,
        how="inner",
    )
    # Surrogate key: MD5 over the "_"-joined natural-key columns. It is built
    # from the raw values, i.e. before inc_division / inc_day_type are
    # rewritten below.
    # NOTE(review): concat_ws skips NULL inputs - confirm the key columns are
    # never NULL, otherwise distinct rows could hash to the same key.
    .withColumn(
        "inc_sk",
        md5(
            concat_ws(
                "_",
                "inc_month",
                "inc_division",
                "inc_line",
                "inc_day_type",
                "inc_category",
            )
        ),
    )
    .withColumn("inc_division", initcap(col("inc_division")))
    # Date key is inc_month cast to a date; the source column is then removed.
    .withColumn("dte_sk", col("inc_month").cast(DateType()))
    .drop("inc_month")
    # 1 -> "Weekday"; every other value (including NULL) -> "Weekend".
    .withColumn(
        "inc_day_type",
        when(col("inc_day_type") == 1, "Weekday").otherwise("Weekend"),
    )
    # Keep a single (arbitrary) row per surrogate key.
    .dropDuplicates(["inc_sk"])
)

In [0]:
# Final projection: only the columns written to the fact table, in order.
major_incident_final_df = major_incidents_transform_df.select(
    [
        # keys
        "inc_sk",
        "lin_sk",
        "ict_sk",
        "dte_sk",
        # measure
        "inc_count",
        # audit columns
        "inc_ingestion_date",
        "inc_source",
    ]
)

In [0]:
# Create the fact table on the first run; merge into it on every later run.
#
# The existence check goes through the public Catalog API instead of the
# private py4j handle (`spark._jsparkSession`), which is an internal detail
# and is not exposed on Spark Connect based sessions.
# NOTE: passing a qualified "db.table" name requires PySpark >= 3.4.
if not spark.catalog.tableExists("mta_silver.fct_major_incident"):
    # First load: materialise the Delta table, partitioned by the date key.
    major_incident_final_df.write \
        .mode("overwrite") \
        .format("delta") \
        .partitionBy("dte_sk") \
        .option("mergeSchema", "true") \
        .saveAsTable("mta_silver.fct_major_incident")
else:
    # Later loads: hand the frame to merge_delta_data, matching target and
    # source rows on the surrogate key.
    # NOTE(review): merge_delta_data is not defined in this notebook -
    # presumably it comes from the %run of ../utils/custom_functions and
    # performs a Delta MERGE (upsert); verify against that notebook.
    merge_delta_data(
        major_incident_final_df,
        db_name="mta_silver",
        table_name="fct_major_incident",
        merge_condition="tgt.inc_sk = src.inc_sk",
        partition_column="dte_sk",
    )

In [0]:
%sql
-- Sanity check: preview ten rows of the fact table written above.
SELECT *
FROM mta_silver.fct_major_incident
LIMIT 10;

inc_sk,lin_sk,ict_sk,dte_sk,inc_count,inc_ingestion_date,inc_source
00baede46423b1801a9e694370383634,7fc56270e7a70fa81a5935b72eacbe29,5be0ee9a2a4d1ffddc897625771606ab,2015-01-01,1,2025-03-09T07:18:20.256Z,data.gov
06b1afa40ad40e60de6fee56650ffb11,7fc56270e7a70fa81a5935b72eacbe29,92ea731d3af6677905303c88689f5d55,2015-01-01,2,2025-03-09T07:18:20.256Z,data.gov
0ca62e1f8cd26c5450c5498be0ca2cdb,800618943025315f869e4e1f09471012,cadd2b2ad06d8a0caee658e3c05e615a,2015-01-01,3,2025-03-09T07:18:20.256Z,data.gov
0ec3a6451dc12188e5dc5b1887878681,8d9c307cb7f3c4a32822a51922d1ceaa,cadd2b2ad06d8a0caee658e3c05e615a,2015-01-01,1,2025-03-09T07:18:20.256Z,data.gov
10ff7adc1f6b9f1ec80a943896fb495c,e4da3b7fbbce2345d7772b0674a318d5,97622cf2e8771871841450151d8f6c3b,2015-01-01,1,2025-03-09T07:18:20.256Z,data.gov
196cdb1392dece8594fff0452ee32820,1679091c5a880faf6fb5e6087eb1b2dc,5be0ee9a2a4d1ffddc897625771606ab,2015-01-01,4,2025-03-09T07:18:20.256Z,data.gov
1974487506a44045ab8b64c482c22926,e4da3b7fbbce2345d7772b0674a318d5,6311ae17c1ee52b36e68aaf4ad066387,2015-01-01,1,2025-03-09T07:18:20.256Z,data.gov
1ddf6920a583ca91726bc24200a9caea,69691c7bdcc3ce6d5d8a1361f22d04ac,6311ae17c1ee52b36e68aaf4ad066387,2015-01-01,2,2025-03-09T07:18:20.256Z,data.gov
1df39c0d75906410f8485cb47c371f27,1679091c5a880faf6fb5e6087eb1b2dc,5be0ee9a2a4d1ffddc897625771606ab,2015-01-01,1,2025-03-09T07:18:20.256Z,data.gov
31262797830bf4a595d4f545dc39e8f2,c4ca4238a0b923820dcc509a6f75849b,92ea731d3af6677905303c88689f5d55,2015-01-01,1,2025-03-09T07:18:20.256Z,data.gov


In [0]:
# End the notebook run and hand the string "Success" back as its exit value.
dbutils.notebook.exit("Success")