In [0]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *
from pyspark.sql import functions as F


## Data reading — load the raw daily discharge data from the Bronze layer


In [0]:
# Load the daily-discharge Parquet data from the Bronze container.
bronze_path = "abfss://bronze@cajainterview.dfs.core.windows.net/dailydischarge"
df = (
    spark.read
    .format("parquet")
    .load(bronze_path)
)

In [0]:
# Preview the DataFrame as loaded, before any transformation.
display(df)

In [0]:
# Inspect the column names and types of the loaded data.
df.printSchema()

In [0]:
# Remove the two columns that are not carried forward to the Silver layer.
columns_to_drop = ["Include?", "_rescued_data"]
df = df.drop(*columns_to_drop)

In [0]:
# Confirm that "Include?" and "_rescued_data" are no longer present.
display(df)

In [0]:
from pyspark.sql.functions import to_date, col, trim

# Strip surrounding whitespace from Period, then parse it with the
# day/month/year pattern so the column becomes a date.
period_text = trim(col("Period"))
df = df.withColumn("Period", to_date(period_text, "dd/MM/yyyy"))

In [0]:
# Check that Period now shows as a date.
display(df)

In [0]:
# Derive calendar year and month columns from the parsed Period date.
df = (
    df
    .withColumn("year", F.year(F.col("Period")))
    .withColumn("month", F.month(F.col("Period")))
)

In [0]:
# Verify the newly derived year and month columns.
display(df)

In [0]:
# Target snake_case column names, listed in the same order as the columns
# currently held by `df` (the rename that uses this list is positional).
renamed_cols = [
    "period",
    "level",
    "region",
    "icb",
    "org_code",
    "org_name",
    "metric",
    "metric_type",
    "metric_group",
    "value",
    "year",
    "month",
]

In [0]:
# Rename every column in one step. toDF assigns the new names purely by
# position, so renamed_cols must list exactly one name per existing column,
# in the current column order.
# NOTE(review): if the source column order ever changes, this will silently
# mislabel columns — consider verifying df.columns before renaming.
df = df.toDF(*renamed_cols)

In [0]:
# Verify the renamed columns.
display(df)

In [0]:
# Turn "value" into an integer column: the "-" placeholder becomes null,
# every other entry is cast to int.
is_dash_placeholder = F.col("value") == "-"
value_as_int = F.col("value").cast("int")
df = df.withColumn(
    "value",
    F.when(is_dash_placeholder, F.lit(None)).otherwise(value_as_int),
)

In [0]:
# Check the converted value column.
display(df)

In [0]:
# Confirm the schema after the rename and the value cast.
df.printSchema()

In [0]:
from pyspark.sql.functions import col, sum

# Build one aggregate per column that totals its null rows.
# Note: `sum` here is Spark's aggregate function, not the Python builtin.
null_count_exprs = []
for column_name in df.columns:
    null_flag = col(column_name).isNull().cast("int")
    null_count_exprs.append(sum(null_flag).alias(column_name))

null_counts = df.select(null_count_exprs)
display(null_counts)

Why we back-filled “Metric Type” and “Metric Group”:

Consistency across all months

The first few CSVs didn’t include those two columns, so without back-filling, some rows would look “incomplete.”

By filling them in our Silver layer, every row now has the same set of fields, making our downstream dashboards and reports reliable.

Aligning with NHS definitions

The NHS specification (Annex F) tells us exactly which questions are “daily” vs “weekly,” and gives each a short code (e.g. NCTR for “no longer meet criteria”).

We used those exact definitions so our data matches NHS intent, not arbitrary guesses.

Simpler reporting and comparisons

With a full “Metric Type” (Daily/Weekly) and “Metric Group” code on every row, analysts and visualization tools can easily group, filter and compare metrics without writing special case logic.

Auditability and traceability

We keep the original full question text alongside our back-filled codes, so anyone can trace “NCTR” back to “Number of patients who no longer meet the criteria to reside.”

If the NHS changes their template again, we can adjust our back-fill rules and still see exactly what the original text said.



In [0]:
# Back-fill metric_type: rows without a metric_type are labelled
# "Daily metric" (the original note states that all current daily questions,
# Q1-Q9, are daily metrics); rows that already have a value keep it.
df = df.withColumn(
    "metric_type",
    F.coalesce(F.col("metric_type"), F.lit("Daily metric")),
)

In [0]:
from pyspark.sql.functions import coalesce, col, lit, trim, when

# Lookup table: exact metric text -> metric_group code used for back-filling.
# Matching is an exact, case-sensitive string comparison, so each spelling
# variant of a question that appears in the data needs its own entry.
# Keeping the mapping as data (instead of one hand-written branch per metric)
# means a new or corrected mapping is a one-line change and a metric can no
# longer be listed twice by accident.
METRIC_GROUP_BY_METRIC = {
    # Headline counts
    "Number of patients who meet the criteria to reside": "CTR",
    "Number of patients who no longer meet the criteria to reside": "NCTR",
    "Number of patients discharged": "Discharges",
    "Number of patients remaining in hospital who no longer meet the criteria to reside": "NCTR not discharged",
    # Additional bed days
    "Number of additional bed days, patients with length of stay of 7+ days": "Additional bed days lost",
    "Number of additional bed days, patients with length of stay of 14+ days": "Additional bed days lost",
    "Number of additional bed days, patients with length of stay of 21+ days": "Additional bed days lost",
    # Discharge timing / personal health budget
    "Of the total number of patients discharged between 00:00 and 23:59, the number who used a personal health budget": "PHB",
    "Of the total number of patients who do not meet the criteria to reside that day, the number discharged by 17:00 hours": "DIS004a",
    "Of the total number of patients who do not meet the criteria to reside that day, the number discharged between 17:01 and 23:59 hours": "DIS004b",
    "Of the patients discharged by 17:00 hours to the following locations: Extra Care schemes, Local authority owned residential homes, Other funded care home beds": "DIS005a",
    "Of the patients discharged between 17:01 and 23:59 hours to the following locations: Extra Care schemes, Local authority owned residential homes, Other funded care home beds": "DIS005b",
    # Discharge destination pathways
    "Pathway 0: Discharge to a domestic home, hotel, or other temporary accommodation without the need for new/increased care or support from health and social care": "Discharge destination",
    "Pathway 1: Discharge to a domestic home, hotel, or other temporary accommodation, or hospice at home with rehabilitation, reablement and recovery": "Discharge destination",
    "Pathway 2: Short-term bed/hospice for rehabilitation, reablement and recovery / end of life care": "Discharge destination",
    "Pathway 3: Discharge to a care home as a new admission": "Discharge destination",
    # "Awaiting ..." reasons
    "Awaiting a medical decision/intervention including writing the discharge summary": "A1",
    "Awaiting community equipment and adaptations to housing": "A2",
    "Awaiting confirmation from community hub/single point of access that referral received and actioned": "A3",
    "Awaiting Diagnostic test": "A4",
    "Awaiting medicines to take home": "A5",
    "Awaiting referral to community single point of access": "A6",
    "Awaiting therapy decision to discharge": "A7",
    "Awaiting transport": "A8",
    # Capacity-related delay reasons
    "Capacity – Bed-based rehabilitation, reablement or recovery services not yet available": "Delay reason",
    "Capacity – End of life care inc Fast-Track CHC not yet available (Pathway 1 or 3)": "Delay reason",
    "Capacity – Equipment and associated training not yet delivered (Pathway 1-3)": "Delay reason",
    "Capacity – Home-based rehabilitation, reablement or recovery services not yet available (Pathway 1)": "Delay reason",
    "Capacity – Housing adaptations not yet completed (Pathway 1 or 3)": "Delay reason",
    "Capacity – Housing provision not yet available (Pathway 0 or 1)": "Delay reason",
    "Capacity – Mental health admitted patient care not yet available (Pathway 2)": "Delay reason",
    "Capacity – Other home-based community health services not yet available (Pathway 1)": "Delay reason",
    "Capacity – Other home-based social care service not yet available (Pathway 1)": "Delay reason",
    "Capacity – Residential/nursing home care not yet available (Pathway 3)": "Delay reason",
    # Other delay reasons
    "Care transfer hub process – Awaiting confirmation of funding eligibility": "C",
    "Repatriation/Transfer to another acute trust for specialist treatment or ongoing treatment": "F",
    "No Plan": "Z",
    "Declared as not meeting the criteria to reside at morning board … later in day meets the criteria to reside so discharge stopped": "NCTR",
    "Homeless/no right of recourse to public funds/no place to discharge to": "B1",
    "Individual/family not in agreement with discharge plans": "B2",
    "Safeguarding concern preventing discharge or Court of protection": "B3",
    "Remains in hospital to avoid spread of (non-Covid 19) infectious…": "B4",
    # Short-form pathway labels
    "P0 - Domestic home without reablement support": "P0",
    "P0 - Other without reablement support": "P0",
    "P1 - Domestic home with reablement support": "P1",
    "P1 - Other with reablement support": "P1",
    "P1 - Hotel with reablement support": "P1",
    # Pathway-specific waits
    "Pathway 1: awaiting availability of resource for assessment and start of care at home": "C",
    "Pathway 2: awaiting availability of rehabilitation bed in community hospital or other bedded setting": "D",
    "Pathway 3: awaiting availability of a bed in a residential or nursing home that is likely to be a permanent placement": "E",
    # Alternative spellings / truncations of questions already listed
    "Remains in hospital to avoid spread of (non-Covid 19)": "B4",
    "Declared as not meeting the criteria to reside at morning board round and then later in day meets the criteria to reside so discharge stopped": "NCTR",
    "Remains in hospital to avoid spread of (non-Covid 19) infectious disease and because there is no other suitable location to discharge to": "B4",
}

# 1. Trim stray whitespace so the exact-match comparison below can succeed.
df = df.withColumn("metric", trim(col("metric")))

# 2. Build a single CASE WHEN expression from the lookup table. It has no
#    otherwise(), so a metric that is not in the table evaluates to null.
metric_lookup = None
for metric_text, group_code in METRIC_GROUP_BY_METRIC.items():
    is_this_metric = col("metric") == metric_text
    metric_lookup = (
        when(is_this_metric, lit(group_code))
        if metric_lookup is None
        else metric_lookup.when(is_this_metric, lit(group_code))
    )

# 3. Keep any metric_group that is already present; only where it is null,
#    fall back to the looked-up code (which stays null for unknown metrics).
df = df.withColumn("metric_group", coalesce(col("metric_group"), metric_lookup))

In [0]:
# Recompute the per-column null totals so the effect of the back-fill
# steps can be checked.
null_counts = df.select(
    *[F.sum(F.col(name).isNull().cast("int")).alias(name) for name in df.columns]
)
display(null_counts)

In [0]:
# Visual check of the data after the metric_type / metric_group back-fill.
display(df)

In the NHS template, `[z]` is used as a placeholder whenever a column doesn’t apply.

For the ‘England’ row there is no specific region or ICB, so both are marked as `[z]`.

For a ‘Region’ row there is no single ICB, so that cell is `[z]`.

We keep these placeholders in our data so it is clear which rows are aggregates (all of England, or an entire region) and which are rows for a specific ICB or provider.


In [0]:
# Row count before de-duplication, shown as the cell output.
row_count = df.count()
row_count

In [0]:
# Remove rows that are exact duplicates across every column
# (dropDuplicates with no subset compares all columns).
df = df.dropDuplicates()


In [0]:
# Row count after de-duplication.
df.count()

In [0]:
# Persist the cleaned data to the Silver layer in Delta format.
# This notebook reloads the complete Bronze dataset on every run, so the
# write replaces the previous Silver contents rather than appending to them:
# with "append", each re-run would store another full copy of every row and
# undo the de-duplication performed earlier in the notebook.
silver_path = "abfss://silver@cajainterview.dfs.core.windows.net/dailydischarge"
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save(silver_path)
)

In [0]:
%sql
-- Register the Delta files at the Silver path as a catalog table
-- (does nothing if the table already exists).
CREATE TABLE IF NOT EXISTS databricks_cata.silver.acute_discharge_situation
USING DELTA
LOCATION 'abfss://silver@cajainterview.dfs.core.windows.net/dailydischarge'

In [0]:
%sql
-- Read the registered Silver table back to check the written data.
SELECT *
FROM databricks_cata.silver.acute_discharge_situation