In [None]:
# gold_patient_lifecycle.ipynb
#  GOAL 1: gold_patient_lifecycle — time between visits, patient lifetime span,
#          new/returning classification, age‑band utilization
# SOURCE: kardia_silver.silver_encounters_enriched
# TARGET: TABLE - gold_patient_lifecycle
# TRIGGER: Full snapshot overwrite each run — fast and simple for small datasets.
# NOTE:
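#   - lifetime_days = whole days between first and last visit
#   - classification = 'new' if only one visit, else 'returning'
#   - age_band buckets based on current_date() – birth_year
#     (planned: the CREATE TABLE statement in this notebook does not emit an age_band column yet)
#   - avg_days_between_visits = lifetime_days ÷ (visit_count – 1); NULL when visit_count = 1
#   - recency_days = days between current_date() and the last visit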
#   - Current dataset has 2,565 distinct patient_id.

# Optional library bootstrap for ephemeral jobs clusters
# NOTE(review): presumably this makes the `kflow` package importable, which is why it
# has to run before the import below — confirm against the bootstrap notebook.
%run ../../99_utilities/bootstrap_kflow

from kflow.auth_adls import ensure_adls_oauth

# Configure Spark with ADLS OAuth credentials and return base ABFS path.
# The returned value is interpolated into the Gold database LOCATION further down,
# so this cell must have run before the database is created.
abfss_base = ensure_adls_oauth()

In [0]:
# Create the Gold database if it is missing, rooted under the ADLS base path
gold_db_ddl = f"""
  CREATE DATABASE IF NOT EXISTS kardia_gold
  LOCATION '{abfss_base}/kardia/gold'
  """
spark.sql(gold_db_ddl)

# Switch the session's current database to kardia_gold for the cells that follow
spark.sql("USE kardia_gold")

In [0]:
%sql
-- Patient lifecycle snapshot: one row per patient_id, rebuilt in full on every run.
CREATE OR REPLACE TABLE kardia_gold.gold_patient_lifecycle
USING DELTA AS

WITH visit_span AS (
  -- Collapse encounters to one row per patient: first visit, last visit,
  -- the number of days between them, and the number of encounter rows.
  SELECT
    patient_id,
    MIN(start_ts)                          AS first_visit_ts,
    MAX(start_ts)                          AS last_visit_ts,
    DATEDIFF(MAX(start_ts), MIN(start_ts)) AS lifetime_days,
    COUNT(*)                               AS visit_count
  FROM kardia_silver.silver_encounters_enriched
  GROUP BY patient_id
)

SELECT
  patient_id,
  first_visit_ts,
  last_visit_ts,
  lifetime_days,
  visit_count,
  -- Exactly one encounter => 'new'; any other count => 'returning'.
  IF(visit_count = 1, 'new', 'returning') AS classification,
  -- Average gap between visits, rounded to 2 decimals. Left NULL for
  -- single-visit patients, which also avoids dividing by zero.
  CASE
    WHEN visit_count > 1
      THEN ROUND(lifetime_days / CAST(visit_count - 1 AS DOUBLE), 2)
  END AS avg_days_between_visits,
  -- Days from the most recent visit to the date the notebook runs.
  DATEDIFF(CURRENT_DATE(), CAST(last_visit_ts AS DATE)) AS recency_days
FROM visit_span;

In [0]:
%sql
-- Preview: top 10 patients by visit_count.
-- The table name is fully qualified so this cell does not depend on the session's
-- current database having been set by an earlier cell. patient_id (one row per
-- patient in this table) breaks ties so the preview is identical from run to run.
SELECT *
FROM kardia_gold.gold_patient_lifecycle
ORDER BY visit_count DESC, patient_id
LIMIT 10;