# Healthcare Sample Data Generator

This notebook generates synthetic healthcare data for clean room demonstrations and testing. It creates a star schema with one fact table and four dimension tables, ensuring complete referential integrity before persisting to Unity Catalog.

## Data Model

**Fact Table:**
* `visits` - Patient visit records linking to all dimension tables

**Dimension Tables:**
* `patients` - Patient demographic information
* `doctors` - Healthcare provider information with specialties
* `hospitals` - Hospital location data
* `diagnoses` - Medical diagnosis codes (ICD-10 format)

## Notebook Flow

1. **Configure Catalog and Schema** - Widget inputs for the target catalog and schema. Both widgets start empty and must be filled in before running (for example, catalog `mkgs` and schema `clean_room_sample_data`)
2. **Set Default Namespace** - Execute `USE CATALOG` and `USE SCHEMA` statements
3. **Create Schema** - Ensure the target schema exists
4. **Generate Sample Data** - Programmatically create randomized healthcare data:
	* Patients: 900-1100 (random)
	* Visits: 1400-1600 (random)
	* Doctors: 15-30 (random)
	* Hospitals: 3 (fixed)
	* Diagnoses: 15 (all available)
5. **Verify Referential Integrity** - Run comprehensive checks:
	* Primary key uniqueness
	* Null value detection
	* Foreign key validation
	* **Only save tables if all checks pass**

## Key Features

* **Randomized data generation** - Entity counts are drawn at random within fixed ranges; the fixed seed (`42`) makes every run reproduce the same counts and the same rows
* **Referential integrity enforcement** - Tables only saved if validation succeeds
* **Overwrite mode** - Tables are created or replaced without manual drops
* **ICD-10 diagnosis codes** - Realistic medical coding standards

In [0]:
# Create the two text widgets that select where the sample tables are written.
# Both default to an empty string, so the values must be supplied by the user.
dbutils.widgets.text("catalog_use", "", "Catalog")
dbutils.widgets.text("schema_use", "", "Schema")

# Strip stray whitespace so a pasted value does not end up inside the SQL
# statements that later interpolate these names.
catalog = dbutils.widgets.get("catalog_use").strip()
schema = dbutils.widgets.get("schema_use").strip()

print(f"""
catalog: {catalog}
schema: {schema}
""")

# Fail here with a clear message instead of letting a later `USE CATALOG`
# statement fail on an empty identifier.
if not catalog or not schema:
	raise ValueError(
		"Set both the 'Catalog' and 'Schema' widgets before running the rest of the notebook"
	)

In [0]:
# Make the chosen catalog the session default so unqualified table names used
# by the later cells resolve inside it.
spark.sql(f"USE CATALOG {catalog}")

# The notebook overview lists schema creation as a step, and USE SCHEMA fails
# on a schema that does not exist yet, so create it (a no-op when it is
# already there) before switching to it.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema}")
spark.sql(f"USE SCHEMA {schema}")

print(f"Using catalog: {catalog}, schema: {schema}")

In [0]:
%sql
-- Show which catalog and schema the session currently defaults to.
select current_catalog(), current_schema();
    
-- List the tables in the current schema.
SHOW TABLES;

In [0]:
from pyspark.sql import Row
import random
from datetime import datetime, timedelta, date
from pyspark.sql.functions import to_date

# A fixed seed makes every run draw the same counts (and, later, the same
# random foreign keys), so the generated data set is reproducible.
random.seed(42)

# Draw the variable entity counts in a fixed order - patients, visits, doctors -
# because each draw advances the shared random generator.
num_patients, num_visits, num_doctors = (
	random.randint(lower, upper)
	for lower, upper in ((900, 1100), (1400, 1600), (15, 30))
)
# Hospitals and diagnoses are not randomized.
num_hospitals = 3
num_diagnoses = 15  # Use all 15 diagnoses

print("Generating data with:")
for entity_label, entity_count in (
	("Patients", num_patients)
	, ("Visits", num_visits)
	, ("Doctors", num_doctors)
	, ("Hospitals", num_hospitals)
	, ("Diagnoses", num_diagnoses)
):
	print(f"  {entity_label}: {entity_count}")
print()

# Dimension table: hospitals (fixed at 3)
# Each tuple is (hospital_id, name, city) and becomes one Row.
hospitals = [
	Row(hospital_id=facility_id, name=facility_name, city=facility_city)
	for facility_id, facility_name, facility_city in (
		(301, "General Hospital", "Springfield")
		, (302, "City Medical Center", "Rivertown")
		, (303, "Children's Hospital", "Lakeside")
	)
]
df_hospitals = spark.createDataFrame(hospitals)

# Dimension table: doctors (random between 15-30)
# Names and specialties are assigned by position, wrapping around the end of
# each list; ids start at 201 and increase by one per doctor.
specialties = [
	"Cardiology", "Neurology", "Pediatrics", "Orthopedics", "Dermatology"
	, "Oncology", "Psychiatry", "Endocrinology", "Gastroenterology", "Pulmonology"
	, "Rheumatology", "Urology", "Nephrology", "Ophthalmology", "Otolaryngology"
	, "Radiology", "Anesthesiology", "Emergency Medicine", "Family Medicine", "Internal Medicine"
	, "Obstetrics", "Gynecology", "Pathology", "Surgery", "Hematology"
	, "Infectious Disease", "Allergy", "Sports Medicine", "Geriatrics", "Palliative Care"
]
doctor_names = [
	"Dr. Adams", "Dr. Baker", "Dr. Clark", "Dr. Davis", "Dr. Evans"
	, "Dr. Foster", "Dr. Green", "Dr. Harris", "Dr. Irwin", "Dr. Johnson"
	, "Dr. Kelly", "Dr. Lopez", "Dr. Murphy", "Dr. Nelson", "Dr. Owens"
	, "Dr. Parker", "Dr. Quinn", "Dr. Reed", "Dr. Stone", "Dr. Turner"
	, "Dr. Allen", "Dr. Bell", "Dr. Cooper", "Dr. Dixon", "Dr. Ellis"
	, "Dr. Fisher", "Dr. Gray", "Dr. Hughes", "Dr. Ingram", "Dr. Jenkins"
]

doctors = [
	Row(
		doctor_id=201 + position
		, name=doctor_names[position % len(doctor_names)]
		, specialty=specialties[position % len(specialties)]
	)
	for position in range(num_doctors)
]

df_doctors = spark.createDataFrame(doctors)

# Dimension table: diagnoses (all 15)
# Each tuple is (diagnosis_id, ICD-10 code, description) and becomes one Row.
all_diagnoses = [
	Row(diagnosis_id=dx_id, code=dx_code, description=dx_text)
	for dx_id, dx_code, dx_text in (
		(401, "I10", "Hypertension")
		, (402, "E11", "Type 2 Diabetes")
		, (403, "J45", "Asthma")
		, (404, "F32", "Depression")
		, (405, "M79", "Fibromyalgia")
		, (406, "K21", "GERD")
		, (407, "I25", "Coronary Artery Disease")
		, (408, "J44", "COPD")
		, (409, "N18", "Chronic Kidney Disease")
		, (410, "E78", "Hyperlipidemia")
		, (411, "M81", "Osteoporosis")
		, (412, "G43", "Migraine")
		, (413, "K58", "Irritable Bowel Syndrome")
		, (414, "L40", "Psoriasis")
		, (415, "F41", "Anxiety Disorder")
	)
]

# Keep the first num_diagnoses entries (all 15 with the current setting).
diagnoses = all_diagnoses[:num_diagnoses]
df_diagnoses = spark.createDataFrame(diagnoses)

# Generate patients programmatically (random between 900-1100)
first_names = [
	"Alice", "Bob", "Carol", "David", "Emma", "Frank", "Grace", "Henry", "Iris", "Jack"
	, "Karen", "Leo", "Maria", "Nathan", "Olivia", "Paul", "Quinn", "Ryan", "Sarah", "Tom"
	, "Uma", "Victor", "Wendy", "Xavier", "Yara", "Zack", "Amy", "Brian", "Chloe", "Daniel"
	, "Emily", "Felix", "Gina", "Hugo", "Ivy", "James", "Kate", "Liam", "Mia", "Noah"
	, "Ava", "Ben", "Cara", "Dean", "Ella", "Finn", "Gia", "Hank", "Isla", "Jake"
]
last_names = [
	"Smith", "Jones", "Lee", "Kim", "Wilson", "Miller", "Taylor", "Brown", "Chen", "Davis"
	, "White", "Martinez", "Garcia", "Rodriguez", "Anderson", "Thomas", "Jackson", "Moore", "Martin", "Thompson"
	, "Patel", "Harris", "Clark", "Lewis", "Walker", "Hall", "Young", "King", "Wright", "Scott"
	, "Green", "Baker", "Adams", "Nelson", "Carter", "Mitchell", "Perez", "Roberts", "Turner", "Phillips"
]
genders = ["M", "F"]


def _build_patient(index):
	"""Return the patient Row for zero-based position ``index``.

	Every field is derived from the index rather than drawn at random: the
	first name cycles through ``first_names``, the last name advances once per
	full pass over the first names, and the date of birth cycles through the
	years 1950-2010, the twelve months and days 1-28 (so the date is always
	valid).
	"""
	surname_pos, given_pos = divmod(index, len(first_names))
	full_name = f"{first_names[given_pos]} {last_names[surname_pos % len(last_names)]}"
	born = date(1950 + index % 61, 1 + index % 12, 1 + index % 28)
	return Row(patient_id=101 + index, name=full_name, dob=born, gender=genders[index % 2])


# Patient ids start at 101 and increase by one per patient.
patients = [_build_patient(index) for index in range(num_patients)]

df_patients = spark.createDataFrame(patients)

# Generate visits programmatically with valid foreign keys (random between 1400-1600)
# The candidate id pools mirror the ids assigned to the dimension rows, so every
# randomly chosen key refers to an existing dimension record.
patient_ids = [101 + offset for offset in range(num_patients)]
doctor_ids = [201 + offset for offset in range(num_doctors)]
hospital_ids = [301, 302, 303]
diagnosis_ids = [diagnosis.diagnosis_id for diagnosis in diagnoses]

start_date = date(2025, 1, 1)
# Keyword arguments are evaluated left to right, so each visit draws its
# patient, doctor, hospital and diagnosis from the random generator in that order.
patient_visits = [
	Row(
		visit_id=sequence + 1
		, patient_id=random.choice(patient_ids)
		, doctor_id=random.choice(doctor_ids)
		, hospital_id=random.choice(hospital_ids)
		, diagnosis_id=random.choice(diagnosis_ids)
		# Visit dates cycle through 365 consecutive days starting at start_date
		, visit_date=start_date + timedelta(days=sequence % 365)
	)
	for sequence in range(num_visits)
]

df_patient_visits = spark.createDataFrame(patient_visits)

# Report the row count of every generated DataFrame.
print("\nGenerated:")
for frame, noun in (
	(df_patients, "patients")
	, (df_patient_visits, "visits")
	, (df_doctors, "doctors")
	, (df_hospitals, "hospitals")
	, (df_diagnoses, "diagnoses")
):
	print(f"  {frame.count()} {noun}")

# Preview the first ten rows of the fact table, then of the patients dimension.
for preview in (df_patient_visits, df_patients):
	display(preview.limit(10))

In [0]:
print("=== REFERENTIAL INTEGRITY CHECKS ===\n")

# (table name, DataFrame, primary-key column) for the four dimensions. Each key
# column name is also the name of the matching foreign-key column in visits.
dimension_tables = [
	("patients", df_patients, "patient_id")
	, ("doctors", df_doctors, "doctor_id")
	, ("hospitals", df_hospitals, "hospital_id")
	, ("diagnoses", df_diagnoses, "diagnosis_id")
]
fact_table = ("visits", df_patient_visits, "visit_id")

# Every count below is a Spark action, so each one is run exactly once and
# folded into this running total instead of being recomputed for the summary.
total_issues = 0

# Check 1: Verify primary key uniqueness in every table
print("1. PRIMARY KEY UNIQUENESS:")
for table, df, key in dimension_tables + [fact_table]:
	row_count = df.count()
	unique_keys = df.select(key).distinct().count()
	print(f"   {table.capitalize()}: {row_count} total, {unique_keys} unique {key}s")
	total_issues += row_count - unique_keys

# Check 2: Verify no nulls in the foreign key columns of visits
print("\n2. NULL CHECKS IN KEY COLUMNS:")
for _table, _dim_df, key in dimension_tables:
	null_keys = df_patient_visits.filter(f"{key} IS NULL").count()
	print(f"   Null {key}s in visits: {null_keys}")
	total_issues += null_keys

# Check 3: Verify every foreign key in visits exists in its dimension table.
# A left anti join keeps only the visit keys with no match in the dimension.
print("\n3. FOREIGN KEY INTEGRITY:")
for _table, dim_df, key in dimension_tables:
	orphan_rows = (
		df_patient_visits.select(key).distinct()
		.join(dim_df.select(key), key, "left_anti")
		.collect()
	)
	orphan_keys = [row[key] for row in orphan_rows]
	print(f"   Invalid {key}s in visits: {len(orphan_keys)}")
	if orphan_keys:
		print(f"   Invalid {key}s: {orphan_keys}")
	total_issues += len(orphan_keys)

# Summary
print("\n=== SUMMARY ===")
if total_issues > 0:
	print(f"✗ Found {total_issues} integrity issue(s)")
	print("✗ TABLES NOT SAVED - Fix integrity issues before saving")
	# Stop the run here: the cells that follow alter the saved tables and would
	# otherwise operate on missing tables or on tables left by an earlier run.
	raise RuntimeError(
		f"Referential integrity validation failed with {total_issues} issue(s); tables were not saved"
	)

print("✓ All referential integrity checks PASSED")
print("✓ No duplicate primary keys")
print("✓ No null foreign keys")
print("✓ All foreign keys reference valid dimension records")

# Only reached when every integrity check passed.
print("\n=== SAVING TABLES ===")
for table, df, _key in [fact_table] + dimension_tables:
	df.write.mode("overwrite").saveAsTable(table)
	print(f"✓ Saved table: {table}")

print("\n=== APPLYING DELTA TABLE PROPERTIES ===")

# Liquid clustering with explicitly chosen columns. At most four clustering
# columns are allowed, so the fact table is clustered on the visit date plus
# three of its four join keys; each dimension is clustered on its primary key.
clustering_columns = {
	"visits": ["visit_date", "patient_id", "doctor_id", "hospital_id"]
	, "patients": ["patient_id"]
	, "doctors": ["doctor_id"]
	, "hospitals": ["hospital_id"]
	, "diagnoses": ["diagnosis_id"]
}
for table, columns in clustering_columns.items():
	spark.sql(f"ALTER TABLE {table} CLUSTER BY ({', '.join(columns)})")
	print(f"✓ Applied liquid clustering to {table} table")

# Change Data Feed, auto compaction, optimized writes, deletion vectors and the
# target file size are set together, one ALTER statement per table.
table_properties = ", ".join([
	"delta.enableChangeDataFeed = true"
	, "delta.autoOptimize.autoCompact = true"
	, "delta.autoOptimize.optimizeWrite = true"
	, "delta.enableDeletionVectors = true"
	, "delta.targetFileSize = '128MB'"
])
for table in clustering_columns:
	spark.sql(f"ALTER TABLE {table} SET TBLPROPERTIES ({table_properties})")
print("✓ Enabled Change Data Feed on all tables")
print("✓ Enabled auto compaction and optimized writes on all tables")
print("✓ Enabled deletion vectors on all tables")
print("✓ Set target file size to 128MB for optimal performance")

print("\n✓ All tables created/replaced successfully with Delta optimizations")

In [0]:
print("=== ADDING TABLE AND COLUMN COMMENTS ===")

# One entry per table, in the order the comments are applied:
# (table name, table comment, [(column name, column comment), ...]).
table_docs = [
	(
		"patients"
		, "Patient demographic information including name, date of birth, and gender"
		, [
			("patient_id", "Unique identifier for each patient")
			, ("name", "Full name of the patient")
			, ("dob", "Date of birth in YYYY-MM-DD format")
			, ("gender", "Gender (M/F)")
		]
	)
	, (
		"doctors"
		, "Healthcare provider information with medical specialties"
		, [
			("doctor_id", "Unique identifier for each doctor")
			, ("name", "Full name of the doctor")
			, ("specialty", "Medical specialty or area of practice")
		]
	)
	, (
		"hospitals"
		, "Hospital location and facility information"
		, [
			("hospital_id", "Unique identifier for each hospital")
			, ("name", "Name of the hospital facility")
			, ("city", "City where the hospital is located")
		]
	)
	, (
		"diagnoses"
		, "Medical diagnosis codes following ICD-10 standards"
		, [
			("diagnosis_id", "Unique identifier for each diagnosis")
			, ("code", "ICD-10 diagnosis code")
			, ("description", "Human-readable description of the diagnosis")
		]
	)
	, (
		"visits"
		, "Patient visit records linking patients, doctors, hospitals, and diagnoses"
		, [
			("visit_id", "Unique identifier for each visit")
			, ("patient_id", "Foreign key reference to patients table")
			, ("doctor_id", "Foreign key reference to doctors table")
			, ("hospital_id", "Foreign key reference to hospitals table")
			, ("diagnosis_id", "Foreign key reference to diagnoses table")
			, ("visit_date", "Date of the visit in YYYY-MM-DD format")
		]
	)
]

for table_name, table_comment, column_docs in table_docs:
	# The table-level comment is stored through the 'comment' table property.
	spark.sql(f"\n\tALTER TABLE {table_name}\n\tSET TBLPROPERTIES ('comment' = '{table_comment}')\n")
	for column_name, column_comment in column_docs:
		spark.sql(f"ALTER TABLE {table_name} ALTER COLUMN {column_name} COMMENT '{column_comment}'")
	print(f"✓ Added comments to {table_name} table")

print("\n✓ All table and column comments added successfully")

In [0]:
print("=== SETTING PRIMARY KEY COLUMNS TO NOT NULL ===")

# (table, primary-key column) in the order the statements are issued.
primary_keys = [
	("patients", "patient_id")
	, ("doctors", "doctor_id")
	, ("hospitals", "hospital_id")
	, ("diagnoses", "diagnosis_id")
	, ("visits", "visit_id")
]

# Mark every primary-key column NOT NULL before any key constraint is added.
for pk_table, pk_column in primary_keys:
	spark.sql(f"ALTER TABLE {pk_table} ALTER COLUMN {pk_column} SET NOT NULL")
	print(f"✓ Set {pk_table}.{pk_column} to NOT NULL")

print("\n=== ADDING PRIMARY AND FOREIGN KEY CONSTRAINTS ===")

# Drop any existing constraint of the same name first so the cell can be
# re-run, then add the primary key with the RELY option.
for pk_table, pk_column in primary_keys:
	spark.sql(f"ALTER TABLE {pk_table} DROP CONSTRAINT IF EXISTS {pk_table}_pk CASCADE")
	spark.sql(f"ALTER TABLE {pk_table} ADD CONSTRAINT {pk_table}_pk PRIMARY KEY ({pk_column}) RELY")
	print(f"✓ Added primary key constraint to {pk_table} table")

# Foreign keys from visits to each dimension, referenced through the full
# three-level name. Each pair is (stem, dimension table): the stem yields both
# the key column "<stem>_id" and the constraint name "visits_<stem>_fk".
for stem, dim_table in (
	("patient", "patients")
	, ("doctor", "doctors")
	, ("hospital", "hospitals")
	, ("diagnosis", "diagnoses")
):
	fk_column = f"{stem}_id"
	fk_name = f"visits_{stem}_fk"
	spark.sql(f"ALTER TABLE visits DROP CONSTRAINT IF EXISTS {fk_name} CASCADE")
	spark.sql(
		f"\n\tALTER TABLE visits\n\tADD CONSTRAINT {fk_name}\n"
		f"\tFOREIGN KEY ({fk_column}) REFERENCES {catalog}.{schema}.{dim_table}({fk_column}) RELY\n"
	)
	print(f"✓ Added foreign key constraint: visits.{fk_column} -> {catalog}.{schema}.{dim_table}.{fk_column}")

print("\n✓ All primary and foreign key constraints added successfully with RELY option")