In [0]:
%sql
/* gold_patient_lifecycle.ipynb
 GOAL 1: gold_patient_lifecycle — time between visits, patient lifetime span, new/returning classification, and tenure / visit‑frequency / recency bands (age‑band utilization is not built in this notebook)

 SOURCE: kardia_silver.silver_encounters_enriched
 OUTPUT: TABLE: gold_patient_lifecycle

 TRIGGER: Full snapshot overwrite each run — fast and simple for small datasets.
          In production, switch to foreachBatch + MERGE to only update changed patients.

 NOTE:
  - lifetime_days = days between first and last visit  
  - classification = 'new' if only one visit, else 'returning'  
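  - age_band is NOT produced by this notebook: birth_year is intentionally not used; the query emits tenure_band, visit_frequency_band and recency_band instead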
  - avg_days_between_visits = lifetime_days ÷ (visit_count–1) 
  - Current dataset has 2,565 distinct patient_id. */

-- Ensure Gold DB exists
-- IF NOT EXISTS keeps this cell safe to re-run: the statement is a no-op when
-- kardia_gold is already registered.
-- NOTE(review): the gold path lives in a storage container named "raw" —
-- confirm this is the intended container for gold-layer data.
CREATE DATABASE IF NOT EXISTS kardia_gold LOCATION 'abfss://raw@kardiaadlsdemo.dfs.core.windows.net/kardia/gold';

-- Re-apply the same path so a database that already existed (and therefore
-- skipped the CREATE above) is also pointed at this location.
-- NOTE(review): per the Spark SQL docs, SET LOCATION presumably only changes the
-- default directory for tables created afterwards and does not move existing
-- data — confirm on the target runtime.
ALTER DATABASE kardia_gold SET LOCATION 'abfss://raw@kardiaadlsdemo.dfs.core.windows.net/kardia/gold';

-- Make kardia_gold the current database so unqualified table names in later
-- cells resolve against it.
USE kardia_gold;

In [0]:
%sql
-- Build the patient lifecycle snapshot (one row per patient_id).
-- The table is fully replaced on every run. recency_* columns are relative to
-- CURRENT_DATE() at run time, so they are only accurate as of the last refresh.
--
-- Output columns (order is part of the table contract):
--   patient_id, first_visit_ts, last_visit_ts, lifetime_days, visit_count,
--   classification, avg_days_between_visits, tenure_band,
--   visit_frequency_band, recency_days, recency_band
CREATE OR REPLACE TABLE kardia_gold.gold_patient_lifecycle AS
WITH base AS (
  -- One row per encounter; only the patient key and encounter start are needed.
  SELECT patient_id, start_ts
  FROM   kardia_silver.silver_encounters_enriched
  -- birth_year intentionally not used
),
agg AS (
  -- Per-patient visit statistics.
  SELECT
    patient_id,
    MIN(start_ts) AS first_visit_ts,
    MAX(start_ts) AS last_visit_ts,
    -- DATEDIFF(end, start): whole days between last and first visit.
    DATEDIFF(MAX(start_ts), MIN(start_ts)) AS lifetime_days,
    -- COUNT(*) counts every encounter row, including rows whose start_ts is NULL.
    COUNT(*) AS visit_count,
    CASE WHEN COUNT(*) = 1 THEN 'new' ELSE 'returning' END AS classification,
    -- Average gap = lifetime span / number of gaps; left NULL for single-visit
    -- patients, which also avoids dividing by zero.
    CASE WHEN COUNT(*) > 1
      THEN ROUND(DATEDIFF(MAX(start_ts), MIN(start_ts)) / (COUNT(*) - 1), 2)
    END AS avg_days_between_visits
  FROM base
  GROUP BY patient_id
),
with_recency AS (
  -- Compute recency once so the band logic below cannot drift from the value
  -- stored in recency_days.
  SELECT
    patient_id,
    first_visit_ts,
    last_visit_ts,
    lifetime_days,
    visit_count,
    classification,
    avg_days_between_visits,
    DATEDIFF(CURRENT_DATE(), DATE(last_visit_ts)) AS recency_days
  FROM agg
)
SELECT
  patient_id,
  first_visit_ts,
  last_visit_ts,
  lifetime_days,
  visit_count,
  classification,
  avg_days_between_visits,
  CASE
    -- Without this branch a NULL lifetime (patient with no non-NULL start_ts)
    -- would fail every comparison and be mislabelled as '5y+'.
    WHEN lifetime_days IS NULL THEN NULL
    WHEN lifetime_days < 30  THEN '0–29d'
    WHEN lifetime_days < 90  THEN '30–89d'
    WHEN lifetime_days < 365 THEN '90–364d'
    WHEN lifetime_days < 1095 THEN '1–2y'   -- 365–1094 days
    WHEN lifetime_days < 1825 THEN '3–4y'   -- 1095–1824 days
    ELSE '5y+'
  END AS tenure_band,
  CASE
    WHEN visit_count = 1 THEN '1'
    WHEN visit_count BETWEEN 2 AND 3 THEN '2–3'
    WHEN visit_count BETWEEN 4 AND 6 THEN '4–6'
    WHEN visit_count BETWEEN 7 AND 10 THEN '7–10'
    ELSE '11+'
  END AS visit_frequency_band,
  recency_days,
  CASE
    -- Same guard as tenure_band: unknown recency must not land in '>365d'.
    WHEN recency_days IS NULL THEN NULL
    WHEN recency_days <= 30  THEN '≤30d'
    WHEN recency_days <= 90  THEN '31–90d'
    WHEN recency_days <= 365 THEN '91–365d'
    ELSE '>365d'
  END AS recency_band
FROM with_recency;

In [0]:
%sql
-- 3. Preview: Top 10 patients by visit_count
-- The table name is schema-qualified so this cell does not depend on an earlier
-- USE statement having run in the same session.
-- patient_id is a tie-breaker so the same 10 rows come back on every run when
-- several patients share a visit_count.
-- SELECT * is kept deliberately: this is an ad-hoc preview, not a downstream contract.
SELECT *
FROM kardia_gold.gold_patient_lifecycle
ORDER BY visit_count DESC, patient_id
LIMIT 10;