In [0]:
from pyspark.sql.types import DateType
from pyspark.sql.functions import md5, concat_ws, col, initcap, when
from pyspark.sql.functions import coalesce, lit

In [0]:
# Inputs for this notebook: the bronze major-incident table and the two silver
# dimension tables it is joined against further down.
major_incident_df = spark.read.table("mta_bronze.major_incident")
dim_lines_df = spark.read.table("mta_silver.dim_line")
dim_inc_category_df = spark.read.table("mta_silver.dim_inc_category")

In [0]:
# Columns whose combined values are hashed into the inc_sk surrogate key.
inc_key_columns = ["inc_month", "inc_division", "inc_line", "inc_day_type", "inc_category"]

# concat_ws silently drops NULL inputs, so two rows whose NULLs sit in different
# key columns would build the same string and share one inc_sk. Replacing NULL
# with an empty string keeps every value in its own position. Rows without
# NULLs hash as before, since concat_ws applies the same string cast implicitly.
inc_key_parts = [coalesce(col(name).cast("string"), lit("")) for name in inc_key_columns]

major_incidents_transform_df = (
    major_incident_df
    # Inner joins: incidents with no matching line or category are dropped.
    .join(dim_lines_df, major_incident_df.inc_line == dim_lines_df.lin_nk, "inner")
    .join(dim_inc_category_df, major_incident_df.inc_category == dim_inc_category_df.ict_category, "inner")
    # Hash the raw values first, before inc_division / inc_day_type are rewritten
    # and inc_month is dropped.
    .withColumn("inc_sk", md5(concat_ws("_", *inc_key_parts)))
    .withColumn("inc_division", initcap(col("inc_division")))
    # dte_sk is inc_month cast to a date; the original column is then removed.
    .withColumn("dte_sk", col("inc_month").cast(DateType()))
    .drop("inc_month")
    # Every value other than 1 (NULL included) is labelled "Weekend".
    .withColumn("inc_day_type", when(col("inc_day_type") == 1, "Weekday").otherwise("Weekend"))
)

In [0]:
# Columns kept for the fact table, in output order.
fact_columns = [
    "inc_sk",
    "lin_sk",
    "ict_sk",
    "dte_sk",
    "inc_count",
    "inc_ingestion_date",
    "inc_source",
]

major_incident_final_df = major_incidents_transform_df.select(*fact_columns)

In [0]:
# Overwrite the silver fact table with this run's result, stored as Delta and
# partitioned by dte_sk; mergeSchema is enabled on the write.
(
    major_incident_final_df.write
    .mode("overwrite")
    .format("delta")
    .partitionBy("dte_sk")
    .option("mergeSchema", "true")
    .saveAsTable("mta_silver.fct_major_incident")
)

In [0]:
%sql
-- Preview up to 20 rows of the fact table written above.
SELECT *
FROM mta_silver.fct_major_incident
LIMIT 20;

In [0]:
# End the notebook run here and hand "Success" back as its exit value.
dbutils.notebook.exit("Success")