In [0]:
# =============================================================================
# Functional Testing Notebook
# =============================================================================
# This notebook demonstrates:
# - Custom library usage (html2text)
# - Job parameters that replace the default catalog config and cluster environment variables
# - Spark configuration settings

from pyspark.sql import SparkSession
import argparse
from pyspark.dbutils import DBUtils

from pyspark.sql import Row
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType, TimestampType, StructType, StructField
from datetime import datetime
import tempfile
import os


# Get Spark session
spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()

# Initialize test results tracker
test_results = {
    "TEST 1": {"name": "Catalog config (job parameter workaround)", "status": "PENDING"},
    "TEST 2": {"name": "Type conversion (ANSI mode workaround)", "status": "PENDING"},
    "TEST 3": {"name": "Time parser (SUPPORTED)", "status": "PENDING"},
    "TEST 4": {"name": "Parquet timestamps (code refactoring required)", "status": "PENDING"},
    "TEST 5": {"name": "UDF limits (managed by Databricks)", "status": "PENDING"},
    "TEST 6": {"name": "Execution timeout (workaround)", "status": "PENDING"},
    "TEST 7": {"name": "Default configurations", "status": "PENDING"},
    "LIBRARY": {"name": "html2text library", "status": "PENDING"}
}

dbutils = DBUtils(spark)

# Get parameters from job configuration (base_parameters in job task definition)
catalog_name = dbutils.widgets.get("catalog_name")
database_name = dbutils.widgets.get("database_name")
location = dbutils.widgets.get("location")

print("\nJob Configuration (from job task parameters):")
print(f"  Catalog Name: {catalog_name}")
print(f"  Database Name: {database_name}")
print(f"  Location: {location}")

FUNCTIONAL TESTING - Library & Configuration Demonstration


In [0]:
# =============================================================================
# TEST 1 - Catalog Configuration (Job Parameter Workaround)
# =============================================================================
# In serverless, the catalog_name job parameter plus a USE CATALOG statement is
# used instead of spark.databricks.sql.initial.catalog.name.
# Reads:  spark, catalog_name, test_results.
# Writes: test_results["TEST 1"] ("status", plus "error" on failure).

print("\n" + "="*70)
print("TEST 1: Catalog Configuration")
print("="*70)
print("\n‚ö†Ô∏è  Classic Config: spark.databricks.sql.initial.catalog.name")
print("‚úì Serverless Workaround: Use catalog_name from job parameter + USE CATALOG\n")

print(f"Catalog from job parameter: {catalog_name}")
current_catalog = spark.catalog.currentCatalog()
print(f"Current active catalog: {current_catalog}")

try:
    # catalog_name is free text coming from a job parameter. Delimit it with
    # backticks (doubling any embedded backtick) so names containing hyphens or
    # other special characters work and the value cannot alter the statement.
    quoted_catalog = "`" + catalog_name.replace("`", "``") + "`"

    # Run inside the try so that a missing or inaccessible catalog is recorded
    # as a TEST 1 failure instead of aborting the notebook.
    spark.sql(f"USE CATALOG {quoted_catalog}")
    print(f"\n‚úì Set default catalog to: {catalog_name}")
    print("="*70)

    # Verify we can query without a catalog prefix and list the schemas/databases.
    test_query = "SHOW DATABASES"
    result = spark.sql(test_query)
    print(f"‚úì Can query without catalog prefix in catalog: {catalog_name}")
    print(f"\nSchemas/Databases in catalog '{catalog_name}':")
    result.show(truncate=False)
    test_results["TEST 1"]["status"] = "PASSED"
except Exception as e:
    # Keep the stored/printed message short; the text is truncated to 100 characters.
    print(f"‚ö†Ô∏è Query error: {str(e)[:100]}")
    test_results["TEST 1"]["status"] = "FAILED"
    test_results["TEST 1"]["error"] = str(e)[:100]



TEST 1: spark.databricks.sql.initial.catalog.name (synthea)
Current catalog: maggiedatabricksterraform_dbw
✓ Can query without catalog prefix, initial catalog is: maggiedatabricksterraform_dbw


In [0]:
# =============================================================================
# TEST 2 - Type Conversion (ANSI Mode Workaround)
# =============================================================================
# Disables ANSI mode for the session, casts a mix of numeric and non-numeric
# strings to INT, and checks that the invalid strings come back as NULL.
# Reads:  spark, test_results.
# Writes: session config spark.sql.ansi.enabled (left disabled afterwards),
#         test_results["TEST 2"] ("status", plus "error" on failure).

print("\n" + "="*70)
print("TEST 2: Lenient Type Conversion")
print("="*70)
print("\n‚ö†Ô∏è  Classic Config: spark.sql.storeAssignmentPolicy = LEGACY")
print("‚úì Serverless Workaround: spark.sql.ansi.enabled = False\n")

# WORKAROUND: Use spark.sql.ansi.enabled instead of spark.sql.storeAssignmentPolicy
# Setting to false provides lenient type conversions (invalid values become NULL)
# WARNING: This has broader implications than just storeAssignmentPolicy:
#     - Division by zero returns NULL instead of error
#     - Arithmetic overflow wraps around or returns NULL
#     - More permissive type coercion and function behavior

spark.conf.set("spark.sql.ansi.enabled", False)

# Verify the config is set
ansi_enabled = spark.conf.get("spark.sql.ansi.enabled")
print(f"Current spark.sql.ansi.enabled: {ansi_enabled}")
print("(False provides lenient type conversions like LEGACY policy)\n")

try:
    # Create DataFrame with string values
    df_test = spark.createDataFrame([
        ("123",),
        ("456",),
        ("not_a_number",),
        ("789",),
        ("invalid",)
    ], ["value_str"])

    # Apply CAST - with ANSI disabled, invalid values convert to NULL without errors
    # This provides the same behavior as storeAssignmentPolicy LEGACY
    df_result = df_test.selectExpr(
        "value_str as original_string",
        "CAST(value_str AS INT) as casted_to_int"
    )

    print("‚úì Type coercion results with ANSI disabled:")
    df_result.show(truncate=False)

    # Actually check the outcome before reporting it: without this the test was
    # marked PASSED whenever the query merely ran, whatever values it produced.
    expected_casts = {
        "123": 123,
        "456": 456,
        "not_a_number": None,
        "789": 789,
        "invalid": None,
    }
    actual_casts = {
        row["original_string"]: row["casted_to_int"] for row in df_result.collect()
    }
    if actual_casts != expected_casts:
        # Caught by the handler below, which records the test as FAILED.
        raise AssertionError(f"Unexpected cast results: {actual_casts}")

    print("\nObservation:")
    print("  - Valid strings ('123', '456', '789') converted to integers")
    print("  - Invalid strings ('not_a_number', 'invalid') converted to NULL")
    print("  - No errors thrown - spark.sql.ansi.enabled=False provides LEGACY-like behavior")

    print("\n‚ö†Ô∏è  IMPORTANT: spark.sql.ansi.enabled=False has BROADER implications:")
    print("  - Division by zero: Returns NULL instead of error")
    print("  - Arithmetic overflow: Wraps around or returns NULL instead of error")
    print("  - Type coercion: More permissive implicit conversions")
    print("  - Function behavior: More lenient error handling (e.g., to_date with invalid dates)")
    print("  - SQL parsing: Reserved keywords can be used as identifiers")
    print("\n  This is NOT a perfect 1:1 replacement for storeAssignmentPolicy LEGACY.")
    print("  Review your code for operations that may be affected by these changes.")

    test_results["TEST 2"]["status"] = "PASSED"

except Exception as e:
    # The error text is truncated to 200 characters for both the log and the tracker.
    print(f"‚ö†Ô∏è Error with type conversion: {str(e)[:200]}")
    print("  Note: If this fails, spark.sql.ansi.enabled may not be set correctly")
    test_results["TEST 2"]["status"] = "FAILED"
    test_results["TEST 2"]["error"] = str(e)[:200]



TEST 2: spark.sql.storeAssignmentPolicy (LEGACY)
Testing implicit type conversions with LEGACY policy...

✓ LEGACY policy type coercion results:
+---------------+-------------+
|original_string|casted_to_int|
+---------------+-------------+
|123            |123          |
|456            |456          |
|not_a_number   |NULL         |
|789            |789          |
|invalid        |NULL         |
+---------------+-------------+


Observation:
  - Valid strings ('123', '456', '789') converted to integers
  - Invalid strings ('not_a_number', 'invalid') converted to NULL
  - No errors thrown - this is the LEGACY behavior


In [0]:
# =============================================================================
# TEST 3 - Legacy Time Parser Policy (SUPPORTED)
# =============================================================================
# Sets spark.sql.legacy.timeParserPolicy to LEGACY, then runs to_date and
# to_timestamp over a handful of differently formatted date strings and shows
# the parsed columns. Outcome is recorded in test_results["TEST 3"].

banner = "=" * 70
for heading_line in (
    "\n" + banner,
    "TEST 3: Legacy Time Parser Policy",
    banner,
    "\n‚úÖ SUPPORTED in Serverless: spark.sql.legacy.timeParserPolicy = LEGACY",
    "Testing lenient date/time parsing with LEGACY policy...\n",
):
    print(heading_line)

# Apply the policy and read it back so the log shows the effective value.
policy_key = "spark.sql.legacy.timeParserPolicy"
spark.conf.set(policy_key, "LEGACY")
current_policy = spark.conf.get(policy_key)
print(f"Current timeParserPolicy: {current_policy}\n")

try:
    # One single-column row per sample string.
    sample_date_strings = [
        "2024-01-15",           # Standard ISO format
        "2024-01-15 14:30:00",  # With time
        "15-01-2024",           # Day-first format (may not parse)
        "01/15/2024",           # Slash separator (may not parse)
        "2024-1-5",             # Single digit month/day
        "invalid_date",         # Invalid format
    ]
    df_dates = spark.createDataFrame(
        [(text,) for text in sample_date_strings],
        ["date_string"],
    )

    # Parse each string both as a date and as a timestamp.
    parse_expressions = (
        "date_string",
        "to_date(date_string) as parsed_date",
        "to_timestamp(date_string) as parsed_timestamp",
    )
    df_parsed = df_dates.selectExpr(*parse_expressions)

    print("‚úì LEGACY time parser results:")
    df_parsed.show(truncate=False)

    for observation_line in (
        "\nObservation:",
        "  - LEGACY policy attempts to parse various date formats",
        "  - Invalid or unsupported formats result in NULL rather than throwing errors",
        "  - More lenient than CORRECTED or EXCEPTION policies",
        "  - Handles edge cases with single-digit months/days gracefully",
    ):
        print(observation_line)
    test_results["TEST 3"]["status"] = "PASSED"

except Exception as parse_error:
    # Same 200-character excerpt goes to the log and to the tracker.
    error_excerpt = str(parse_error)[:200]
    print(f"‚ö†Ô∏è Error with LEGACY time parser: {error_excerpt}")
    print("  Note: If this fails, the spark.sql.legacy.timeParserPolicy may not be set correctly")
    test_results["TEST 3"].update(status="FAILED", error=error_excerpt)



TEST 3: spark.sql.legacy.timeParserPolicy (LEGACY)
✓ LEGACY time parser results:
  to_date('2024-01-15') = 2024-01-15
  to_timestamp('2024-01-15 14:30:00') = 2024-01-15 14:30:00


In [0]:
# =============================================================================
# TEST 4 - Parquet Timestamp Handling (Code Refactoring Required)
# =============================================================================
# Writes a two-row timestamp DataFrame to Parquet under `location`, reads it
# back, and checks that the rows survived the round trip unchanged.
# Reads:  spark, location, test_results.
# Writes: Parquet files under {location}/functional_tests/parquet_legacy
#         (overwritten on each run, not removed afterwards),
#         test_results["TEST 4"] ("status", plus "error" on failure).

print("\n" + "="*70)
print("TEST 4: Parquet Timestamp Handling")
print("="*70)
print("\n‚ö†Ô∏è  Classic Configs: spark.sql.parquet.int96RebaseModeInRead/Write = LEGACY")
print("‚ö†Ô∏è  Classic Configs: spark.sql.parquet.datetimeRebaseModeInRead/Write = LEGACY")
print("‚úì Serverless Workaround: Code refactoring required\n")

print("Note: Parquet rebase mode configs are NOT SUPPORTED in serverless.")
print("For legacy Parquet files with timestamps, consider:")
print("  1. Rewriting Parquet files with modern timestamp encoding")
print("  2. Using explicit timestamp conversion functions in code")
print("  3. Handling edge case dates (pre-1900, post-2262) in application logic\n")

# Plain Parquet write/read round trip; no rebase-mode configs are set here.
temp_dir = f"{location}/functional_tests/parquet_legacy"
try:
    # Two sample rows (2020 and 1950); neither is a pre-1900 / post-2262 edge case.
    df_timestamps = spark.sql("""
        SELECT 
            1 as id,
            timestamp('2020-01-01 00:00:00') as event_time
        UNION ALL
        SELECT 
            2 as id,
            timestamp('1950-06-15 12:30:00') as event_time
    """)

    print("Test data created (modern timestamps only):")
    df_timestamps.show(truncate=False)

    # Write to Parquet (serverless uses modern timestamp encoding by default)
    df_timestamps.write.mode("overwrite").parquet(temp_dir)
    print(f"‚úì Written Parquet to: {temp_dir}")

    # Read back
    df_read = spark.read.parquet(temp_dir)
    count = df_read.count()
    print(f"‚úì Read Parquet: {count} rows")

    print("\nVerifying data integrity:")
    df_read.show(truncate=False)

    # Compare the rows that were written with the rows that came back, ordered by
    # id so that file/partition ordering cannot cause a false mismatch. Previously
    # the data was only displayed, so a corrupted round trip still "passed".
    written_rows = sorted(df_timestamps.collect(), key=lambda row: row["id"])
    read_rows = sorted(df_read.collect(), key=lambda row: row["id"])
    if read_rows != written_rows:
        # Caught by the handler below, which records the test as FAILED.
        raise AssertionError(
            f"Parquet round trip changed the data: wrote {written_rows}, read {read_rows}"
        )

    print("\nObservation:")
    print("  - Serverless uses modern Parquet timestamp encoding by default")
    print("  - For legacy Parquet files with INT96 timestamps, code refactoring is needed")
    print("  - Avoid dates before 1900 or after 2262 in new data")
    test_results["TEST 4"]["status"] = "PASSED"

except Exception as e:
    # The error text is truncated to 200 characters for both the log and the tracker.
    print(f"‚ö†Ô∏è Parquet test error: {str(e)[:200]}")
    print("  Note: This test demonstrates modern Parquet handling in serverless")
    test_results["TEST 4"]["status"] = "FAILED"
    test_results["TEST 4"]["error"] = str(e)[:200]



TEST 4: Parquet INT96 & DateTime Rebase Modes (LEGACY)
✓ Written Parquet with LEGACY rebasing
✓ Read Parquet with LEGACY rebasing: 2 rows
  int96RebaseModeInRead: LEGACY
  int96RebaseModeInWrite: LEGACY
  datetimeRebaseModeInRead: LEGACY
  datetimeRebaseModeInWrite: LEGACY


In [0]:
# =============================================================================
# TEST 5 - UDF Limits in Serverless
# =============================================================================
# Builds 26 Python UDFs, applies all of them in a single select(), materializes
# the result and checks every UDF output.
# Reads:  spark, udf, StringType, test_results.
# Writes: test_results["TEST 5"] ("status", plus "error" on failure).

print("\n" + "="*70)
print("TEST 5: UDF Behavior (Managed by Databricks)")
print("="*70)
print("\n‚ö†Ô∏è  Classic Config: spark.databricks.safespark.externalUDF.plan.limit = 25")
print("‚úì Serverless: UDF limits managed by Databricks (not user-configurable)\n")

print("Note on UDFs in Serverless:")
print("  - In classic compute, spark.databricks.safespark.externalUDF.plan.limit")
print("    restricts the number of external UDFs in a single query plan")
print("  - In serverless compute, UDF behavior is managed by Databricks at the platform level")
print("  - There is NO user-configurable UDF limit setting in serverless")
print("  - This test verifies UDFs work, but will not test any limits")
print("  - Best practice: Minimize UDF usage and use built-in Spark functions instead")

print("\nVerifying UDF functionality in serverless...\n")

try:
    # Create 26 UDFs (exceeds classic limit of 25, but serverless has no user-configurable limit)
    udfs = []
    for i in range(26):
        # i=i binds the current index as a default argument; a plain closure would
        # make every lambda see the final value of i.
        udf_func = udf(lambda x, i=i: f"udf_{i}_{x}" if x else None, StringType())
        udfs.append(udf_func)

    print(f"‚úì Created {len(udfs)} UDFs (exceeds classic limit of 25)")

    # Create test DataFrame
    df_test = spark.sql("SELECT 'test' as value")

    # Apply UDFs in a single select() call
    print(f"\nApplying all {len(udfs)} UDFs in a single select() statement...")

    # Build select expression with all UDFs at once
    select_exprs = ["value"] + [
        udf_func("value").alias(f"udf_{index}") for index, udf_func in enumerate(udfs)
    ]
    df_result = df_test.select(*select_exprs)

    # Materialize the rows instead of calling count(): a count needs no column
    # values, so the optimizer is free to prune the UDF projections and the UDFs
    # would never run. collect() forces every UDF column to be evaluated.
    result_rows = df_result.collect()
    result_count = len(result_rows)

    # Check each UDF produced its own output for the single input value 'test'.
    expected_outputs = {f"udf_{index}": f"udf_{index}_test" for index in range(len(udfs))}
    actual_outputs = (
        {column: result_rows[0][column] for column in expected_outputs}
        if result_rows
        else {}
    )
    if actual_outputs != expected_outputs:
        # Caught by the handler below, which records the test as FAILED.
        raise AssertionError(f"Unexpected UDF outputs: {actual_outputs}")

    print(f"‚úì Query executed successfully with {len(udfs)} UDFs: {result_count} rows")

    print("\nObservation:")
    print("  - Serverless successfully handles 26 UDFs (exceeds classic limit of 25)")
    print("  - No user-configurable UDF limit in serverless")
    print("  - UDF behavior is managed by Databricks platform")
    print("  - Best practice: Minimize UDF usage and prefer built-in Spark functions")
    test_results["TEST 5"]["status"] = "PASSED"

except Exception as e:
    # The error text is truncated to 300 characters for both the log and the tracker.
    print(f"‚ö†Ô∏è UDF test error: {str(e)[:300]}")
    print("  Note: UDF limits are managed internally by Databricks in serverless")
    test_results["TEST 5"]["status"] = "FAILED"
    test_results["TEST 5"]["error"] = str(e)[:300]



TEST 5: spark.databricks.safespark.externalUDF.plan.limit (25)
✓ SafeSpark UDF limit: 25
  This limit restricts the number of external UDFs in a query plan

Testing with 26 UDFs (exceeds limit of 25)...
✓ Created 26 UDFs
✓ Query executed with 26 UDFs: 1 rows
  (SafeSpark may have applied optimizations or restrictions)


In [0]:
# =============================================================================
# TEST 6 - Execution Timeout (Serverless)
# =============================================================================
# Sets spark.databricks.execution.timeout to 800s and reads it back.
# Reads:  spark, test_results.
# Writes: session config spark.databricks.execution.timeout,
#         test_results["TEST 6"] ("status", plus "error" on failure).

print("\n" + "="*70)
print("TEST 6: Execution Timeout (Workaround)")
print("="*70)
print("\n‚ö†Ô∏è  Classic Config: spark.network.timeout = 800")
print("‚úì Serverless Workaround: spark.databricks.execution.timeout = 800s\n")

print("\nNote on timeout configurations:")
print("  - spark.network.timeout is NOT SUPPORTED in serverless")
print("  - Use spark.databricks.execution.timeout instead")
print("  - Controls the maximum execution time for queries/operations")
print("  - Helps prevent long-running or hung operations")

# Guarded like the other tests: if the config is rejected, the failure is
# recorded in test_results instead of aborting the notebook with TEST 6 still
# PENDING. Previously the status was set to PASSED unconditionally.
try:
    # WORKAROUND: Execution timeout for serverless (replaces spark.network.timeout)
    spark.conf.set("spark.databricks.execution.timeout", "800s")
    print("‚úì Set spark.databricks.execution.timeout = 800s (replaces network.timeout)")

    # Verify the config is set
    timeout_value = spark.conf.get("spark.databricks.execution.timeout")
    print(f"‚úì Execution timeout: {timeout_value}")

    test_results["TEST 6"]["status"] = "PASSED"

except Exception as e:
    # The error text is truncated to 200 characters for both the log and the tracker.
    print(f"Execution timeout error: {str(e)[:200]}")
    test_results["TEST 6"]["status"] = "FAILED"
    test_results["TEST 6"]["error"] = str(e)[:200]



TEST 6: spark.network.timeout (800)
Network timeout: 800 seconds
✓ Timeout set to prevent network failures on long operations


In [0]:
# =============================================================================
# TEST 7 - Verify All Configurations
# =============================================================================
# Prints the value of three Spark configs, falling back to a fixed "default"
# text for any config that cannot be read, then marks TEST 7 as PASSED.
# Reads:  spark, test_results.
# Writes: test_results["TEST 7"]["status"].

print("\n" + "="*70)
print("TEST 7: Default Configurations (Enabled by Default)")
print("="*70)
print("\n‚úÖ The following are enabled by default in serverless:\n")

# (config key, text appended after a successfully read value, text shown when the read fails)
default_config_checks = (
    ("spark.databricks.delta.preview.enabled", "", "true (default)"),
    ("spark.databricks.dataLineage.enabled", " (in Unity Catalog)", "true (default in Unity Catalog)"),
    ("spark.memory.offHeap.enabled", "", "false (default)"),
)

# Check default configurations
for config_key, value_suffix, fallback_text in default_config_checks:
    try:
        displayed_value = f"{spark.conf.get(config_key)}{value_suffix}"
    except Exception:
        # Deliberate best effort: an unreadable config is reported with its
        # fallback text. `except Exception` (rather than a bare `except:`) keeps
        # KeyboardInterrupt / SystemExit from being swallowed.
        displayed_value = fallback_text
    print(f"  - {config_key}: {displayed_value}")

print("\n‚ö†Ô∏è  NOT SUPPORTED in serverless (managed by Databricks):")
print("  - spark.driver.extraJavaOptions (JVM options managed by platform)")

print("\n" + "="*70)
print("‚úì ALL FUNCTIONAL TESTS COMPLETED FOR SERVERLESS")
print("="*70)
test_results["TEST 7"]["status"] = "PASSED"



TEST 7: Other Configurations
spark.databricks.delta.preview.enabled: true
spark.databricks.dataLineage.enabled: true
spark.memory.offHeap.enabled: false
spark.driver.extraJavaOptions: -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED --add-opens=java.management/sun.management=ALL-UNNAM

In [0]:
# =============================================================================
# Demonstrate html2text Library - Parse HTML Content
# =============================================================================
# Imports html2text, converts one small HTML snippet to plain text with links
# ignored, and records the outcome in test_results["LIBRARY"]. The import lives
# inside the try so that a missing library is reported as a failed check.

print("\nDemonstrating html2text Library from init script/wheel file...")

try:
    import html2text

    print("‚úì html2text library imported successfully")

    # Minimal conversion: one paragraph containing a hyperlink.
    sample_html = "<p>Hello, <a href='https://www.google.com/earth/'>world</a>!</p>"
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    plain_text = converter.handle(sample_html)

    print("\nConverted HTML to plain text:")
    print(plain_text)
    test_results["LIBRARY"]["status"] = "PASSED"

except ImportError as import_error:
    print(f"‚úó html2text not available: {import_error}")
    print("  Note: html2text is installed via init script on classic compute")
    test_results["LIBRARY"].update(status="FAILED", error=str(import_error))

print("="*70)



Demonstrating html2text Library from init script...
✓ html2text library imported successfully

Converted HTML to plain text:
Hello, world!




In [0]:
# =============================================================================
# Summary and Completion
# =============================================================================
# Prints the job parameters, the configuration overview, one line per tracked
# test (with its error text when it failed), and the pass/fail totals.

divider = "=" * 70
print("\n" + divider)
print("FUNCTIONAL TESTING SUMMARY - SERVERLESS COMPUTE")
print(divider)

print("\nüìã Job Configuration (from job task parameters):")
print("    - Catalog: {}".format(catalog_name))
print("    - Database: {}".format(database_name))
print("    - Location: {}".format(location))

# Static overview of what the notebook applied / could not apply.
for overview_line in (
    "\n‚öôÔ∏è  Serverless Spark Configurations Applied:",
    "    ‚úÖ SUPPORTED:",
    "       - spark.sql.legacy.timeParserPolicy = LEGACY",
    "    ‚úì WORKAROUNDS:",
    "       - spark.sql.ansi.enabled = False (replaces storeAssignmentPolicy)",
    "       - spark.databricks.execution.timeout = 800s (replaces network.timeout)",
    "       - catalog_name from job parameter (replaces initial.catalog.name)",
    "    ‚ö†Ô∏è  NOT SUPPORTED (require code refactoring):",
    "       - Parquet rebase modes (int96/datetime)",
    "       - SafeSpark UDF limits",
    "       - JVM options (extraJavaOptions)",
):
    print(overview_line)

print("\nüß™ Test Results:")

# Symbol per final status; anything else (e.g. still PENDING) gets the pause symbol.
status_symbols = {"PASSED": "‚úÖ", "FAILED": "‚ùå"}
passed_count = failed_count = 0

for test_key in ("TEST 1", "TEST 2", "TEST 3", "TEST 4", "TEST 5", "TEST 6", "TEST 7", "LIBRARY"):
    test = test_results[test_key]
    status, name = test["status"], test["name"]

    # Booleans add as 0/1, so each counter only moves for its own status.
    passed_count += status == "PASSED"
    failed_count += status == "FAILED"
    symbol = status_symbols.get(status, "‚è∏Ô∏è")

    print(f"    {symbol} {test_key}: {name} - {status}")
    if status == "FAILED" and "error" in test:
        print(f"        Error: {test['error']}")

print(f"\nüìä Summary: {passed_count} passed, {failed_count} failed out of {len(test_results)} tests")

closing_message = (
    "\n‚úÖ All functional tests completed successfully for serverless compute!"
    if failed_count == 0
    else f"\n‚ö†Ô∏è  {failed_count} test(s) failed. Please review the errors above."
)
print(closing_message)

print(divider)



FUNCTIONAL TESTING SUMMARY

✓ Job Configuration:
    - Catalog (Spark config): maggiedatabricksterraform_dbw
    - Database (env var): prod
    - Location (env var): /Volumes/maggiedatabricksterraform_dbw/synthea/functional_testing

✓ Spark Configuration Tests Completed:
    - TEST 1: spark.databricks.sql.initial.catalog.name - Verified default catalog
    - TEST 2: spark.sql.storeAssignmentPolicy (LEGACY) - Tested type coercion
    - TEST 3: spark.sql.legacy.timeParserPolicy (LEGACY) - Tested date parsing
    - TEST 4: Parquet INT96 & DateTime rebasing - Tested read/write with legacy timestamps
    - TEST 5: spark.databricks.safespark.externalUDF.plan.limit - Tested multiple UDFs
    - TEST 6: spark.network.timeout - Verified timeout configuration
    - TEST 7: Other configs - Delta preview, data lineage, memory, JVM options

✓ Custom Library Tests:
    - html2text: Tested HTML to plain text conversion

✓ All functional tests completed successfully!
