In [0]:
%sql
/* gold_patient_lifecycle.ipynb
 GOAL 1: gold_patient_lifecycle — time between visits, patient lifetime span, new/returning classification, age‑band utilization

 SOURCE: kardia_silver.silver_encounters_enriched
 OUTPUT: TABLE: gold_patient_lifecycle

 TRIGGER: Full snapshot overwrite each run — fast and simple for small datasets.
          In production, switch to foreachBatch + MERGE to only update changed patients.

 NOTE:
  - lifetime_days = days between first and last visit  
  - classification = 'new' if only one visit, else 'returning'  
  - age_band buckets based on YEAR(current_date()) – birth_year (calendar-year difference)  
  - avg_days_between_visits = lifetime_days ÷ (visit_count–1); NULL when visit_count = 1 
  - Current dataset has 2,565 distinct patient_id. */

-- Create the Gold schema on first run, then make it the default
-- so later cells can reference its tables without a qualifier.
CREATE SCHEMA IF NOT EXISTS kardia_gold;
USE kardia_gold;

In [0]:
%sql
-- Rebuilds gold_patient_lifecycle from scratch: one row per patient_id with
-- first/last visit, lifetime span, visit count, new/returning flag, age band,
-- and the average gap between visits.
CREATE OR REPLACE TABLE gold_patient_lifecycle AS
WITH encounters AS (
  -- Encounters without a birth_year are dropped here, so they count toward
  -- neither the visit metrics nor the age band.
  SELECT
    patient_id,
    start_ts,
    birth_year
  FROM kardia_silver.silver_encounters_enriched
  WHERE birth_year IS NOT NULL
),
per_patient AS (
  -- Collapse to one row per patient; every aggregate is computed once and
  -- reused by name in the final projection.
  SELECT
    patient_id,
    MIN(start_ts)                          AS first_visit_ts,
    MAX(start_ts)                          AS last_visit_ts,
    COUNT(*)                               AS visit_count,
    -- Calendar-year difference, using the largest birth_year seen for the patient.
    YEAR(CURRENT_DATE()) - MAX(birth_year) AS age_years
  FROM encounters
  GROUP BY patient_id
)
SELECT
  patient_id,
  first_visit_ts,
  last_visit_ts,
  DATEDIFF(last_visit_ts, first_visit_ts) AS lifetime_days,
  visit_count,
  CASE
    WHEN visit_count = 1 THEN 'new'
    ELSE 'returning'
  END AS classification,
  CASE
    WHEN age_years < 20              THEN '<20'
    WHEN age_years BETWEEN 20 AND 39 THEN '20-39'
    WHEN age_years BETWEEN 40 AND 59 THEN '40-59'
    ELSE '60+'
  END AS age_band,
  -- No ELSE branch: single-visit patients have no gap to average, so this is NULL.
  CASE
    WHEN visit_count > 1
      THEN ROUND(DATEDIFF(last_visit_ts, first_visit_ts) / (visit_count - 1), 2)
  END AS avg_days_between_visits
FROM per_patient;

In [0]:
%sql
-- Preview: top 10 patients by visit_count.
-- patient_id is a secondary sort key so the rows returned are the same on
-- every run when several patients tie on visit_count.
SELECT
  patient_id,
  first_visit_ts,
  last_visit_ts,
  lifetime_days,
  visit_count,
  classification,
  age_band,
  avg_days_between_visits
FROM gold_patient_lifecycle
ORDER BY
  visit_count DESC,
  patient_id
LIMIT 10;