In [0]:
from pyspark.sql.functions import sum, desc, round, avg, col
from pyspark.sql import Window

In [0]:
# Source tables from the silver layer, loaded as DataFrames.
# `inc_count` and `dte_sk` are read from the incident table further down;
# the month columns (`dte_month`, `dte_month_name`) are expected on the date table.
major_incident_df = spark.table("mta_silver.fct_major_incident")
date_df = spark.table("mta_silver.dim_date")

In [0]:
# Window whose frame is every row of the DataFrame: no partition columns,
# no sort keys, and row bounds that are unbounded in both directions.
# Used below to attach whole-table aggregates (average, total) to each row.
window_over_all = (
    Window
    .orderBy()
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

# Step 1: total incident count per month of the year.
# Incidents are matched to the date table on `dte_sk`, reduced to the three
# columns that matter, summed per (dte_month, dte_month_name) and sorted so
# the month with the highest total comes first.
inc_by_mth_of_year_t1_df = (
    major_incident_df
    .join(
        date_df,
        on=major_incident_df["dte_sk"] == date_df["dte_sk"],
        how="inner",
    )
    .select("dte_month", "dte_month_name", "inc_count")
    .groupBy("dte_month", "dte_month_name")
    .agg(sum("inc_count").alias("inc_count"))
    .orderBy(col("inc_count").desc())
)

# Step 2: enrich every monthly row with whole-table statistics.
# The columns are added one at a time because the last one reads
# `sum_incidents`, which must already exist when it is evaluated.
inc_by_mth_of_year_t2_df = (
    inc_by_mth_of_year_t1_df
    # Mean of the monthly totals across all rows, rounded to 2 decimals.
    .withColumn(
        "avg_incidents_per_mth_of_year",
        round(avg(col("inc_count")).over(window_over_all), 2),
    )
    # Grand total of incidents across all rows, rounded to 2 decimals.
    .withColumn(
        "sum_incidents",
        round(sum(col("inc_count")).over(window_over_all), 2),
    )
    # This month's share of the grand total, as a percentage (0-100),
    # rounded to 2 decimals.
    .withColumn(
        "inc_fraction_by_mth_of_year",
        round((col("inc_count") / col("sum_incidents")) * 100, 2),
    )
)

In [0]:
# Step 3: project the report columns under their `imy_`-prefixed output names.
# Each pair is (column in the step-2 frame, column name in the report); the
# order of the pairs is the column order of the result. Note that the
# "fraction" column is published as "percent" because it holds a 0-100 value.
inc_by_mth_of_year_final_df = inc_by_mth_of_year_t2_df.select(
    *[
        col(source_name).alias(report_name)
        for source_name, report_name in (
            ("dte_month", "imy_month"),
            ("dte_month_name", "imy_month_name"),
            ("inc_count", "imy_inc_count"),
            ("avg_incidents_per_mth_of_year", "imy_avg_incidents_per_mth_of_year"),
            ("sum_incidents", "imy_sum_incidents"),
            ("inc_fraction_by_mth_of_year", "imy_inc_percent_by_mth_of_year"),
        )
    ]
)

In [0]:
# Persist the report as a Delta table, replacing whatever data the table
# currently holds. The writer is also given mergeSchema="true".
# NOTE(review): presumably this is to tolerate added columns between runs -
# confirm against the Delta Lake schema-evolution documentation.
(
    inc_by_mth_of_year_final_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("mta_gold.rpt_incidents_by_month_of_year")
)

In [0]:
%sql
-- Show every column and row of the gold report table for a visual check.
SELECT
  *
FROM
  mta_gold.rpt_incidents_by_month_of_year;

In [0]:
# End the notebook run here and hand the string "Success" back as its exit value.
# NOTE(review): presumably a calling job/notebook inspects this value - confirm.
dbutils.notebook.exit("Success")