In [None]:
# =============================================================================
# Functional Testing Notebook
# =============================================================================
# This notebook demonstrates:
# - Custom library usage (Faker, html2text)
# - Job parameter configuration
# - Spark configuration settings
# - Data generation and validation

from pyspark.sql import SparkSession
import argparse

# Reuse the currently active Spark session when there is one; otherwise
# build (or fetch) one through the session builder.
spark = SparkSession.getActiveSession()
if not spark:
    spark = SparkSession.builder.getOrCreate()

# Title banner: the heading framed by two 70-character rules.
for banner_line in (
    "=" * 70,
    "FUNCTIONAL TESTING - Library & Configuration Demonstration",
    "=" * 70,
):
    print(banner_line)


In [None]:
# =============================================================================
# Cell 2: Import Job Parameters and Environment Variables
# =============================================================================

import os

# Resolve catalog_name. The Databricks widget (job parameter) is tried first;
# if anything on that path raises, --catalog_name is read from the command line.
try:
    from pyspark.dbutils import DBUtils

    dbutils = DBUtils(spark)
    catalog_name = dbutils.widgets.get("catalog_name")
except Exception:
    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument("--catalog_name", type=str, required=True)
    # parse_known_args leaves unrecognised arguments alone instead of failing on them.
    known_args, _unrecognised = cli_parser.parse_known_args()
    catalog_name = known_args.catalog_name

# database_name and location are read from environment variables, each with
# a built-in default for when the variable is absent.
database_name = os.getenv("DATABASE_NAME", "synthea")
location = os.getenv("LOCATION", "/Volumes/maggiedatabricksterraform_dbw/synthea/landing")

# Echo the resolved job configuration, one value per line.
print(
    "\nJob Configuration:",
    f"  Catalog Name (parameter):     {catalog_name}",
    f"  Database Name (env var):      {database_name}",
    f"  Location (env var):           {location}",
    "=" * 70,
    sep="\n",
)


In [None]:
# =============================================================================
# Cell 3: Display Spark Configuration (Set at Cluster Level)
# =============================================================================

print("\nSpark Configuration (configured at cluster level):")

# (display label, Spark conf key) pairs for the settings to report.
configs_to_display = [
    ("Delta Preview Enabled", "spark.databricks.delta.preview.enabled"),
    ("Initial Catalog", "spark.databricks.sql.initial.catalog.name"),
    ("Data Lineage Enabled", "spark.databricks.dataLineage.enabled"),
    ("Network Timeout", "spark.network.timeout"),
    ("Legacy Time Parser Policy", "spark.sql.legacy.timeParserPolicy"),
    ("Store Assignment Policy", "spark.sql.storeAssignmentPolicy"),
    ("Parquet Int96 Rebase (Read)", "spark.sql.parquet.int96RebaseModeInRead"),
    ("Parquet Int96 Rebase (Write)", "spark.sql.parquet.int96RebaseModeInWrite"),
    ("Off-Heap Memory", "spark.memory.offHeap.enabled"),
]


def _conf_value_or_placeholder(conf_key):
    """Return the session's value for conf_key, or "(not set)" if the lookup raises."""
    try:
        return spark.conf.get(conf_key)
    except Exception:
        return "(not set)"


# Each label is left-aligned and dot-padded to 40 characters before its value.
for label, config_key in configs_to_display:
    print(f"  {label:.<40} {_conf_value_or_placeholder(config_key)}")

print("\n✓ All Spark configurations are applied at cluster startup")
print("=" * 70)


In [None]:
# =============================================================================
# Cell 4: Demonstrate Faker Library - Generate Sample Data
# =============================================================================

print("\nDemonstrating Faker Library from hls_external_libs wheel...")

try:
    from faker import Faker

    # Create the generator, then seed Faker so repeated runs give the same values.
    fake = Faker()
    Faker.seed(42)

    print("✓ Faker library imported successfully")

    # Ten synthetic patient records. The field order inside the dict is also
    # the order in which Faker is called, so it is kept fixed.
    blood_types = ('A+', 'A-', 'B+', 'B-', 'O+', 'O-', 'AB+', 'AB-')
    sample_data = [
        {
            'patient_id': fake.uuid4(),
            'first_name': fake.first_name(),
            'last_name': fake.last_name(),
            'email': fake.email(),
            'phone': fake.phone_number(),
            # Faker addresses are multi-line; flatten them to a single line.
            'address': fake.address().replace('\n', ', '),
            'date_of_birth': fake.date_of_birth(minimum_age=18, maximum_age=90),
            'ssn': fake.ssn(),
            'blood_type': fake.random_element(elements=blood_types),
        }
        for _ in range(10)
    ]

    # Wrap each record in a Row and hand the list to Spark.
    from pyspark.sql import Row

    patient_rows = [Row(**record) for record in sample_data]
    df_patients = spark.createDataFrame(patient_rows)

    record_count = df_patients.count()
    print(f"\n✓ Generated {record_count} sample patient records")
    print("\nSample Generated Data:")
    df_patients.show(5, truncate=False)

    # Show the schema Spark inferred for the generated records.
    print("\nDataFrame Schema:")
    df_patients.printSchema()

except ImportError as import_error:
    # A failed import is reported and the notebook carries on.
    print(f"✗ Faker not available: {import_error}")
    print("  Note: Faker is available in serverless environments with hls_external_libs wheel")

print("=" * 70)


In [None]:
# =============================================================================
# Cell 5: Demonstrate html2text Library - Parse HTML Content
# =============================================================================

def preview_text(text, limit):
    """Return ``text`` shortened to at most ``limit`` characters for display.

    The "..." marker is appended only when characters were actually dropped,
    so a value that fits within ``limit`` is not shown as if it had been cut.
    """
    if len(text) <= limit:
        return text
    return text[:limit] + "..."


print("\nDemonstrating html2text Library from requirements.txt...")

try:
    import html2text

    print("✓ html2text library imported successfully")

    # Sample HTML content (simulating healthcare documentation)
    sample_html = """
    <html>
    <head><title>Patient Care Summary</title></head>
    <body>
        <h1>Patient Care Summary Report</h1>
        <h2>Patient Information</h2>
        <p><strong>Name:</strong> John Doe</p>
        <p><strong>MRN:</strong> 123456789</p>
        <p><strong>Date:</strong> 2024-01-15</p>
        
        <h2>Diagnosis</h2>
        <ul>
            <li>Hypertension (I10)</li>
            <li>Type 2 Diabetes (E11.9)</li>
            <li>Hyperlipidemia (E78.5)</li>
        </ul>
        
        <h2>Medications</h2>
        <table>
            <tr><th>Drug</th><th>Dosage</th><th>Frequency</th></tr>
            <tr><td>Lisinopril</td><td>10mg</td><td>Once daily</td></tr>
            <tr><td>Metformin</td><td>500mg</td><td>Twice daily</td></tr>
            <tr><td>Atorvastatin</td><td>20mg</td><td>Once daily</td></tr>
        </table>
        
        <h2>Notes</h2>
        <p>Patient reports improved <em>blood glucose control</em> and stable blood pressure readings.</p>
        <p><a href="https://example.com/followup">Schedule follow-up appointment</a></p>
    </body>
    </html>
    """

    # Configure the converter: keep links and emphasis, drop images.
    h = html2text.HTML2Text()
    h.ignore_links = False
    h.ignore_images = True
    h.ignore_emphasis = False

    # Convert the full document to markdown/text
    markdown_text = h.handle(sample_html)

    print("\n--- Original HTML Content ---")
    print(preview_text(sample_html, 200))

    print("\n--- Converted to Markdown/Text ---")
    print(markdown_text)

    # (html, label) pairs converted one at a time with the same converter
    html_snippets = [
        ("<p><strong>Alert:</strong> Lab results available</p>", "Alert"),
        ("<h3>Vital Signs</h3><p>BP: 120/80, HR: 72, Temp: 98.6°F</p>", "Vitals"),
        ("<ul><li>Chest X-Ray: Normal</li><li>CBC: Within normal limits</li></ul>", "Lab Results")
    ]

    print("\n--- Batch Processing HTML Snippets ---")
    parsed_data = []
    for html_snippet, label in html_snippets:
        text = h.handle(html_snippet).strip()
        parsed_data.append({'label': label, 'html': html_snippet, 'text': text})
        print(f"\n{label}:")
        # Previews are capped at 50 characters; the ellipsis appears only
        # when the value was really shortened.
        print(f"  HTML: {preview_text(html_snippet, 50)}")
        print(f"  Text: {preview_text(text, 50)}")

    # Convert the label/html/text records to a DataFrame
    df_parsed = spark.createDataFrame(parsed_data)
    print("\n✓ Created DataFrame from parsed HTML content")
    df_parsed.show(truncate=False)

except ImportError as e:
    # A failed import is reported and the notebook carries on.
    print(f"✗ html2text not available: {e}")
    print("  Note: html2text is available in serverless environments via requirements.txt")

print("="*70)


In [None]:
# =============================================================================
# Cell 6: Summary and Completion
# =============================================================================

# The Faker and html2text cells catch ImportError and keep going, so their
# success cannot be assumed here. Each of them binds its DataFrame
# (df_patients / df_parsed) only after its import and demonstration went
# through, so on a fresh "Run All" the presence of that name tells us whether
# the library was really demonstrated.
faker_demonstrated = "df_patients" in globals()
html2text_demonstrated = "df_parsed" in globals()
missing_libraries = [
    library
    for library, demonstrated in (
        ("Faker", faker_demonstrated),
        ("html2text", html2text_demonstrated),
    )
    if not demonstrated
]

print("\n" + "="*70)
print("FUNCTIONAL TESTING SUMMARY")
print("="*70)
print("\n✓ Job Configuration:")
print(f"    - Catalog (parameter): {catalog_name}")
print(f"    - Database (env var): {database_name}")
print(f"    - Location (env var): {location}")
print("\n✓ Spark Configurations:")
print("    - 13 custom configurations set at cluster level")
print("    - Delta preview enabled")
print("    - Data lineage enabled")
print("    - Legacy compatibility modes configured")

# Report each library according to what actually happened in its cell.
if missing_libraries:
    print("\n✗ Custom Libraries Demonstrated:")
else:
    print("\n✓ Custom Libraries Demonstrated:")

if faker_demonstrated:
    print("    - Faker: Installed via init script from custom wheel")
else:
    print("    - Faker: NOT demonstrated (import failed in the Faker cell)")

if html2text_demonstrated:
    print("    - html2text: Available from requirements.txt")
else:
    print("    - html2text: NOT demonstrated (import failed in the html2text cell)")

# Only claim success when every library demonstration ran. This stays a
# printed status rather than an exception, matching the best-effort handling
# in the library cells.
if missing_libraries:
    print(f"\n✗ Functional testing incomplete - libraries not demonstrated: {', '.join(missing_libraries)}")
else:
    print("\n✓ Functional testing completed successfully!")
print("="*70)
