Read data from the source (silver) table



In [0]:
# Load the full silver-layer situation-report table; the dimension cells below derive from `df`.
silver_query = "select * from databricks_cata.silver.acute_discharge_situation"
df = spark.sql(silver_query)

In [0]:
from pyspark.sql.functions import col, count

# Exact-duplicate check: find rows that are identical across every column.
all_cols = df.columns

# One output row per distinct full-row value, keeping only values seen more than once.
row_multiplicity = count("*").alias("cnt")
dups = df.groupBy(all_cols).agg(row_multiplicity).where(col("cnt") > 1)

# Anything printed here occurs `cnt` times in the source table.
dups.show(truncate=False)


In [0]:
# Number of distinct row values that occur more than once
# (one per duplicated value, not the number of surplus rows).
dups.count()

In [0]:
from pyspark.sql import functions as F

# Build the calendar dimension: one row per day between the earliest and the
# latest parseable `period` value in the silver data.

# 1. Compute the min/max dates and bring them to the driver as Python values
min_max = (
    df
    .withColumn("period_date", F.to_date("period", "yyyy-MM-dd"))
    .agg(
        F.min("period_date").alias("start_date"),
        F.max("period_date").alias("end_date")
    )
    .collect()[0]
)
start = min_max["start_date"]
end   = min_max["end_date"]

# An empty source (or no parseable `period`) yields None for both bounds.
# Fail loudly here instead of producing an empty calendar that a later cell
# would write over the gold dim_date table.
if start is None or end is None:
    raise ValueError(
        "No parseable 'period' values found in the source; cannot build the date dimension."
    )

# 2. Generate one seed row, then sequence & explode.
#    The bounds are passed as typed date literals rather than being formatted
#    into a SQL string.
calendar_df = (
    spark.range(1)   # single row with id = 0
         .select(
             F.explode(
                 F.sequence(F.lit(start), F.lit(end), F.expr("interval 1 day"))
             ).alias("date_key")
         )
         .withColumn("year",        F.year("date_key"))
         .withColumn("month",       F.month("date_key"))
         .withColumn("day",         F.dayofmonth("date_key"))
         # Spark's dayofweek() numbers days 1 = Sunday … 7 = Saturday
         .withColumn("day_of_week", F.dayofweek("date_key"))
         .withColumn("is_weekend",  F.expr("day_of_week IN (1,7)"))
)

display(calendar_df)  # you should now see one row per date


In [0]:
# Persist the calendar as the gold date dimension, replacing any previous contents.
(
    calendar_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("databricks_cata.gold.dim_date")
)

In [0]:
%sql
-- Inspect the gold date dimension
SELECT *
FROM databricks_cata.gold.dim_date

In [0]:
# Region dimension: distinct code/name pairs taken from the Region-level rows.
region_rows = df.where("level = 'Region'")
dim_region = region_rows.selectExpr(
    "org_code as region_code",
    "org_name as region_name",
).dropDuplicates()

# Replace the gold table with the freshly derived dimension.
(
    dim_region.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("databricks_cata.gold.dim_region")
)


In [0]:
%sql
-- Inspect the gold region dimension
SELECT *
FROM databricks_cata.gold.dim_region

In [0]:
# ICB dimension: distinct ICB code/name pairs plus the `region` value carried on ICB-level rows.
icb_rows = df.where("level = 'ICB'")
dim_icb = icb_rows.selectExpr(
    "org_code as icb_code",
    "org_name as icb_name",
    "region as region_code",
).dropDuplicates()

# Replace the gold table with the freshly derived dimension.
(
    dim_icb.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("databricks_cata.gold.dim_icb")
)


In [0]:
%sql
-- Inspect the gold ICB dimension
SELECT *
FROM databricks_cata.gold.dim_icb

In [0]:
# Metric dimension: every distinct (metric_group, metric, metric_type) combination in the source.
metric_attributes = ["metric_group", "metric", "metric_type"]
dim_metric = df.select(*metric_attributes).dropDuplicates()

# Replace the gold table with the freshly derived dimension.
(
    dim_metric.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("databricks_cata.gold.dim_metric")
)


In [0]:
%sql
-- Inspect the gold metric dimension
SELECT *
FROM databricks_cata.gold.dim_metric

In [0]:
# Silver data with `period` parsed into a proper date column.
period_as_date = F.to_date("period", "yyyy-MM-dd")
df_silver = df.withColumn("period_date", period_as_date)

In [0]:
# Preview the silver data together with the parsed `period_date` column.
display(df_silver)

In [0]:
from pyspark.sql.functions import to_date

# Daily fact table: ICB-level rows only, with `period` parsed to `date_key`,
# stored as Delta and partitioned by the source's year/month columns.
icb_level_rows = (
    spark.table("databricks_cata.silver.acute_discharge_situation")
    .where("level = 'ICB'")
)

fact_daily = icb_level_rows.selectExpr(
    "to_date(period,'yyyy-MM-dd') AS date_key",
    "org_code",
    "metric_group",
    "value",
    "year",
    "month",
)

(
    fact_daily.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("year", "month")
    .saveAsTable("databricks_cata.gold.fact_daily_sitrep")
)


In [0]:
%sql
-- Inspect the daily ICB-level fact table
SELECT *
FROM databricks_cata.gold.fact_daily_sitrep

In [0]:
from pyspark.sql.functions import expr, col

# Wide fact: one row per day × ICB with one summed-value column per metric_group.
daily_fact = spark.table("databricks_cata.gold.fact_daily_sitrep")
pivoted = (
    daily_fact
    .groupBy("date_key", "year", "month", "org_code")
    .pivot("metric_group")
    .agg(expr("sum(value)"))
)

# Pivot columns take their names from metric_group values; swap spaces for
# underscores in every column name in a single pass.
fact_wide = pivoted.toDF(*[name.replace(" ", "_") for name in pivoted.columns])

(
    fact_wide.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("databricks_cata.gold.vw_fact_wide")
)

In [0]:
%sql
-- Inspect the pivoted (wide) daily fact table
SELECT *
FROM databricks_cata.gold.vw_fact_wide

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Monthly delay rate per ICB, derived from the wide daily fact table.

# 1a) Load the wide fact by its fully-qualified name and set year/month from
#     date_key. The table is written to databricks_cata.gold, so an unqualified
#     lookup would only resolve when the session's current catalog/schema
#     happens to match.
fact_monthly = (
    spark.table("databricks_cata.gold.vw_fact_wide")
      .withColumn("year",   F.year("date_key"))
      .withColumn("month",  F.month("date_key"))
)

# 1b) Aggregate to month × ICB
monthly_agg = (
    fact_monthly
      .groupBy("org_code","year","month")
      .agg(
        F.sum(F.col("NCTR")).alias("total_delays"),            # NCTR = count of patients delayed
        F.sum(F.col("Discharges")).alias("total_discharges")   # Discharges = count of discharges
      )
      .withColumn(
        "delay_rate",
        # Delays per 100 discharges; left NULL when total_discharges is NULL or
        # not positive, so the rate never divides by zero.
        F.when(
          F.col("total_discharges")>0,
          F.col("total_delays")/F.col("total_discharges")*100
        ).otherwise(None)
      )
)

monthly_agg.write.mode("overwrite").format("delta").saveAsTable("databricks_cata.gold.vw_monthly_rates")

In [0]:
%sql
-- Monthly delay rates in chronological order
SELECT *
FROM databricks_cata.gold.vw_monthly_rates
ORDER BY year, month

In [0]:
# Benchmark each ICB's monthly delay rate against all ICBs in the same month.
monthly_rates = spark.table("databricks_cata.gold.vw_monthly_rates")

# 2a) Peer averages & stddev by month
peer_stats = (
    monthly_rates
      .groupBy("year","month")
      .agg(
        F.mean("delay_rate").alias("peer_avg_rate"),
        F.stddev("delay_rate").alias("peer_stddev_rate")
      )
)

# 2b) Join back & compute Z-score.
#     The z-score is left NULL when the peer stddev is NULL or zero (a single
#     peer, or all peers identical). Dividing by zero would raise an error
#     when ANSI SQL mode is enabled.
peer_benchmarked = (
    monthly_rates
      .join(peer_stats, ["year","month"])
      .withColumn(
        "z_score",
        F.when(
          F.col("peer_stddev_rate") > 0,
          (F.col("delay_rate") - F.col("peer_avg_rate")) / F.col("peer_stddev_rate")
        )
      )
)

peer_benchmarked.write.mode("overwrite").format("delta").saveAsTable("databricks_cata.gold.vw_peer_benchmarked")

In [0]:
%sql
-- Peer-benchmarked monthly rates in chronological order
SELECT *
FROM databricks_cata.gold.vw_peer_benchmarked
ORDER BY year, month

In [0]:
# Peer comparison for barrier A8 (labelled "transport delays" in this cell).

# 5a) Monthly sum of A8 per ICB
barrier_monthly = (
    fact_monthly
      .groupBy("org_code","year","month")
      .agg(F.sum("A8").alias("transport_delays"))
)

# Peer mean / stddev of the monthly totals across all ICBs in the same month
transport_peer_stats = (
    barrier_monthly
      .groupBy("year","month")
      .agg(
        F.mean("transport_delays").alias("peer_avg"),
        F.stddev("transport_delays").alias("peer_std")
      )
)

# 5b) Join back and z-score.
#     org_code is kept in the output so each z-score can be attributed to its
#     ICB. The z-score is NULL when peer_std is NULL or zero, which avoids a
#     divide-by-zero error under ANSI SQL mode.
barrier_peer = (
    barrier_monthly
      .join(transport_peer_stats, ["year","month"])
      .withColumn(
        "z_score_transport",
        F.when(
          F.col("peer_std") > 0,
          (F.col("transport_delays") - F.col("peer_avg")) / F.col("peer_std")
        )
      )
      .select("org_code","year","month","transport_delays","peer_avg","z_score_transport")
)
display(barrier_peer)


In [0]:
from pyspark.sql import functions as F

# Barrier-level peer benchmarking: monthly totals per ICB for barriers A1–A8,
# z-scored against all ICBs reporting the same barrier in the same month.
barrier_cols = ["A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8"]

# 1) unpivot A1–A8 into (barrier, delay_count); the stack() arguments are
#    generated from the list so the count and the columns cannot drift apart
stack_args = ", ".join(f"'{b}', `{b}`" for b in barrier_cols)
barrier_unpivot = (
  spark.table("databricks_cata.gold.vw_fact_wide")
    .selectExpr(
      "date_key", "org_code as icb_code",
      "year", "month",
      f"stack({len(barrier_cols)}, {stack_args}) AS (barrier, delay_count)"
    )
    .filter("delay_count IS NOT NULL")
)

# 2) aggregate to month×ICB×barrier
barrier_monthly = (
  barrier_unpivot
    .groupBy("icb_code","year","month","barrier")
    .agg(F.sum("delay_count").alias("total_barrier"))
)

# 3) compute peer avg & stddev per barrier×month
peer_barrier_stats = (
  barrier_monthly
    .groupBy("barrier","year","month")
    .agg(
      F.mean("total_barrier").alias("peer_avg"),
      F.stddev("total_barrier").alias("peer_std")
    )
)

# 4) join back & z-score; NULL when peer_std is NULL or zero, which avoids a
#    divide-by-zero error under ANSI SQL mode
barrier_zscores = (
  barrier_monthly
    .join(peer_barrier_stats, ["barrier","year","month"])
    .withColumn(
      "z_score_barrier",
      F.when(
        F.col("peer_std") > 0,
        (F.col("total_barrier") - F.col("peer_avg")) / F.col("peer_std")
      )
    )
)

barrier_zscores.createOrReplaceTempView("vw_barrier_zscores")


In [0]:
%sql
-- Barrier z-scores from the session temp view, in a stable order
SELECT *
FROM vw_barrier_zscores
ORDER BY
  year,
  month,
  barrier,
  icb_code


In [0]:
# Persist the barrier z-scores to the gold layer, replacing any previous contents.
(
    barrier_zscores.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("databricks_cata.gold.vw_barrier_zscores")
)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Top reasons per ICB and month, ranked by monthly total.

# 1) Wide-fact columns that take part in the ranking
metrics = ["NCTR"] + [f"A{i}" for i in range(1, 9)] + ["Additional_bed_days_lost"]

# 2) Monthly totals per ICB, one column per metric ...
monthly_totals = (
    spark.table("databricks_cata.gold.vw_fact_wide")
    .groupBy("org_code", "year", "month")
    .agg(*(F.sum(F.col(name)).alias(name) for name in metrics))
)

#    ... then unpivot to one row per (ICB, month, reason) and drop null counts
stack_pairs = ", ".join(f"'{name}', `{name}`" for name in metrics)
reason_monthly = monthly_totals.selectExpr(
    "org_code", "year", "month",
    f"stack({len(metrics)}, {stack_pairs}) AS (reason, count)",
).where("count IS NOT NULL")

# 3) Rank reasons inside each ICB × month, largest count first.
#    rank() gives tied counts the same rank, so a group can keep more than three rows.
window_spec = (
    Window
    .partitionBy("org_code", "year", "month")
    .orderBy(F.col("count").desc())
)

ranked_reasons = reason_monthly.withColumn("rank", F.rank().over(window_spec))
top_reasons = ranked_reasons.where("rank <= 3")

(
    top_reasons.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("databricks_cata.gold.vw_top_reasons")
)


In [0]:
%sql
-- Inspect the top-reasons table
SELECT *
FROM databricks_cata.gold.vw_top_reasons

In [0]:
%sql
-- Top reasons per ICB and month, best-ranked first.
-- Reads the table the notebook actually writes (databricks_cata.gold.vw_top_reasons);
-- no object named vw_top3_reasons is created anywhere in this notebook.
SELECT org_code, year, month, reason, count
FROM databricks_cata.gold.vw_top_reasons
ORDER BY org_code, year, month, rank;


In [0]:
%sql
-- Re-check the metric dimension
SELECT *
FROM databricks_cata.gold.dim_metric

In [0]:
%sql
-- Re-check the persisted barrier z-scores
SELECT *
FROM databricks_cata.gold.vw_barrier_zscores

In [0]:
%sql
-- Re-check the daily fact table
SELECT *
FROM databricks_cata.gold.fact_daily_sitrep

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Barrier z-scores with zero-filled gaps: missing A1–A8 values count as 0, so
# every ICB × month × barrier combination present in the wide fact takes part
# in the peer comparison.

# 0) Start from your wide fact
fact = spark.table("databricks_cata.gold.vw_fact_wide")

barrier_cols = ["A1","A2","A3","A4","A5","A6","A7","A8"]

# 1) Fill NULLs → 0 for A1–A8
for b in barrier_cols:
    fact = fact.withColumn(b, F.coalesce(F.col(b), F.lit(0)))

# 2) Unpivot *including* zeros (no filter); still one row per day × ICB × barrier
barrier_unpivot = fact.selectExpr(
    "org_code as icb_code", "year", "month",
    """
    stack(
      8,
      'A1', A1,
      'A2', A2,
      'A3', A3,
      'A4', A4,
      'A5', A5,
      'A6', A6,
      'A7', A7,
      'A8', A8
    ) AS (barrier, delay_count)
    """
)

# 3) Aggregate to month × ICB × barrier, so the peer statistics compare ICBs'
#    monthly totals rather than individual daily rows and the result has a
#    single row per ICB, month and barrier
barrier_monthly = (
    barrier_unpivot
      .groupBy("icb_code","year","month","barrier")
      .agg(F.sum("delay_count").alias("total_barrier"))
)

# 4) Peer avg & stddev per barrier × month
peer_barrier_stats = (
    barrier_monthly
      .groupBy("barrier","year","month")
      .agg(
        F.mean("total_barrier").alias("peer_avg"),
        F.stddev("total_barrier").alias("peer_std")
      )
)

# 5) Join back & z-score; NULL when peer_std is NULL or zero, which avoids a
#    divide-by-zero error under ANSI SQL mode
barrier_zscores = (
    barrier_monthly
      .join(peer_barrier_stats, ["barrier","year","month"])
      .withColumn("z_score_barrier",
          F.when(
            F.col("peer_std") > 0,
            (F.col("total_barrier") - F.col("peer_avg")) / F.col("peer_std")
          )
      )
)

barrier_zscores.createOrReplaceTempView("vw_barrier_zscores")


In [0]:
%sql
-- Inspect the session temp view of barrier z-scores
SELECT *
FROM vw_barrier_zscores

In [0]:
# Overwrite the gold barrier z-score table with the current `barrier_zscores` DataFrame.
(
    barrier_zscores.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("databricks_cata.gold.vw_barrier_zscores")
)

In [0]:
%sql
-- Number of daily fact rows whose metric_group is exactly 'CTR'
SELECT COUNT(*) AS ctr_count
FROM databricks_cata.gold.fact_daily_sitrep
WHERE metric_group = 'CTR'