# Neo4j JDBC Unity Catalog Connection - Support Ticket

## Issue Summary

**Problem**: Unity Catalog JDBC connection to Neo4j fails with `Connection was closed before the operation completed` error, despite:
- Network connectivity working (TCP test passes)
- Neo4j Python driver working
- Neo4j Spark Connector working

**Error Location**: `com.databricks.safespark.jdbc.grpc_client.JdbcConnectClient.awaitWhileConnected`

This notebook provides a systematic test progression to isolate the failure point.

---

## Configuration

**Prerequisites**: Run `setup.sh` to configure Databricks secrets before running this notebook.

The setup script reads credentials from `.env` and stores them in the `neo4j-uc-creds` secret scope:
- `host` - Neo4j host
- `user` - Neo4j username
- `password` - Neo4j password
- `connection_name` - Unity Catalog connection name
- `jdbc_jar_path` - Path to Neo4j JDBC full bundle JAR in UC Volume
- `cleaner_jar_path` - Path to Neo4j JDBC Spark cleaner JAR in UC Volume
- `database` - Neo4j database (optional, defaults to "neo4j")

In [None]:
# =============================================================================
# CONFIGURATION - Loaded from Databricks Secrets
# =============================================================================
# Secrets are configured using setup.sh which creates scope "neo4j-uc-creds"
# with secrets: host, user, password, connection_name, jdbc_jar_path, cleaner_jar_path, database
#
# Every later cell in this notebook reads the constants defined here.

SCOPE_NAME = "neo4j-uc-creds"

# Aura Connection Details (from secrets)
NEO4J_HOST = dbutils.secrets.get(SCOPE_NAME, "host")
NEO4J_USER = dbutils.secrets.get(SCOPE_NAME, "user")
NEO4J_PASSWORD = dbutils.secrets.get(SCOPE_NAME, "password")

# Database defaults to "neo4j" if not set.
# Catch Exception (not a bare except) so that KeyboardInterrupt / SystemExit
# still propagate while a failed secret lookup falls back to the default.
try:
    NEO4J_DATABASE = dbutils.secrets.get(SCOPE_NAME, "database")
except Exception:
    NEO4J_DATABASE = "neo4j"

# Unity Catalog Resources (from secrets)
JDBC_JAR_PATH = dbutils.secrets.get(SCOPE_NAME, "jdbc_jar_path")
CLEANER_JAR_PATH = dbutils.secrets.get(SCOPE_NAME, "cleaner_jar_path")
UC_CONNECTION_NAME = dbutils.secrets.get(SCOPE_NAME, "connection_name")

# Combined java_dependencies for CREATE CONNECTION (a JSON-style list of both JAR paths)
JAVA_DEPENDENCIES = f'["{JDBC_JAR_PATH}", "{CLEANER_JAR_PATH}"]'

# Derived URLs (no need to edit)
NEO4J_BOLT_URI = f"neo4j+s://{NEO4J_HOST}"
NEO4J_JDBC_URL = f"jdbc:neo4j+s://{NEO4J_HOST}:7687/{NEO4J_DATABASE}"
NEO4J_JDBC_URL_SQL = f"{NEO4J_JDBC_URL}?enableSQLTranslation=true"

# Echo the non-sensitive settings; the password is deliberately not printed.
print("Configuration loaded from Databricks Secrets:")
print(f"  Secret Scope: {SCOPE_NAME}")
print(f"  Neo4j Host: {NEO4J_HOST}")
print(f"  Bolt URI: {NEO4J_BOLT_URI}")
print(f"  JDBC URL: {NEO4J_JDBC_URL}")
print(f"  Connection Name: {UC_CONNECTION_NAME}")
print(f"  JDBC JAR Path: {JDBC_JAR_PATH}")
print(f"  Cleaner JAR Path: {CLEANER_JAR_PATH}")
print(f"  Java Dependencies: {JAVA_DEPENDENCIES}")

---

## Section 1: Environment Information

Capture cluster and runtime details for support context.

In [None]:
# Collect environment information (Spark, runtime, Python, driver package, JARs)
import sys

print("=" * 60)
print("ENVIRONMENT INFORMATION")
print("=" * 60)

# Spark version
print(f"\nSpark Version: {spark.version}")

# Databricks Runtime
# Catch Exception (not a bare except) so interrupts are not swallowed.
try:
    dbr_version = spark.conf.get("spark.databricks.clusterUsageTags.sparkVersion")
    print(f"Databricks Runtime: {dbr_version}")
except Exception:
    print("Databricks Runtime: Unable to determine")

# Python version
print(f"Python Version: {sys.version}")

# Check neo4j package
try:
    import neo4j
    print(f"Neo4j Python Driver: {neo4j.__version__}")
except ImportError:
    print("Neo4j Python Driver: NOT INSTALLED")

# Check JAR files exist
print(f"\nJDBC JAR Path: {JDBC_JAR_PATH}")
print(f"Cleaner JAR Path: {CLEANER_JAR_PATH}")

# Each JAR is looked up in its OWN parent directory. Listing only the JDBC
# JAR's directory would wrongly report the cleaner JAR as missing whenever the
# two JARs are stored in different directories.
for jar_label, jar_path in (("JDBC JAR", JDBC_JAR_PATH), ("Cleaner JAR", CLEANER_JAR_PATH)):
    try:
        jar_dir, jar_file = jar_path.rsplit('/', 1)
        jar_found = any(entry.name == jar_file for entry in dbutils.fs.ls(jar_dir))
        print(f"{jar_label} File Exists: {jar_found}")
    except Exception as e:
        # Best-effort check: report the problem for this JAR and carry on.
        print(f"{jar_label} File Check Error: {e}")

---

## Section 2: Network Connectivity Test (TCP Layer)

**Expected Result**: PASS - Proves network path is open.

In [None]:
# TCP connectivity test using netcat
import time

print("=" * 60)
print("TEST: Network Connectivity (TCP)")
print("=" * 60)
print(f"\nTarget: {NEO4J_HOST}:7687 (Bolt protocol port)")
print("Testing: Can Databricks reach Neo4j at the network level?")

# Register a temporary SQL function implemented in Python. It runs
# `nc -zv host port` with a 10 second timeout and returns one pipe-delimited
# string: "STATUS|elapsed_ms|message|raw nc output".
spark.sql("""
CREATE OR REPLACE TEMPORARY FUNCTION connectionTest(host STRING, port STRING)
RETURNS STRING
LANGUAGE PYTHON AS $$
import subprocess
import time
try:
    start = time.time()
    command = ['nc', '-zv', host, str(port)]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=10)
    elapsed = (time.time() - start) * 1000  # ms
    output = result.stdout.decode() + result.stderr.decode()
    if result.returncode == 0:
        status = "SUCCESS"
        message = f"TCP connection established in {elapsed:.1f}ms"
    else:
        status = "FAILURE"
        message = f"Cannot reach {host}:{port} - check firewall rules"
    return f"{status}|{elapsed:.1f}|{message}|{output.strip()}"
except Exception as e:
    return f"FAILURE|0|Error: {str(e)}|"
$$
""")

# Time the full round trip through Spark (function call + collect), which is
# reported separately from the TCP latency measured inside the function.
start_time = time.time()
result = spark.sql(f"SELECT connectionTest('{NEO4J_HOST}', '7687') AS result").collect()[0]['result']
total_time = (time.time() - start_time) * 1000

# Split into at most 4 fields so that any '|' characters inside the raw nc
# output stay in `details` instead of truncating it.
parts = result.split('|', 3)
status = parts[0]
latency = parts[1]
message = parts[2]
details = parts[3] if len(parts) > 3 else ""

if status == "SUCCESS":
    print("\n" + "=" * 60)
    print(">>> CONNECTIVITY VERIFIED <<<")
    print("=" * 60)
    print(f"\n[PASS] {message}")
    print(f"\nConnection Details:")
    print(f"  - Host: {NEO4J_HOST}")
    print(f"  - Port: 7687 (Bolt)")
    print(f"  - TCP Latency: {latency}ms")
    print(f"  - Total Test Time: {total_time:.1f}ms")
    if details:
        print(f"  - Raw Output: {details}")
    print("\n" + "-" * 60)
    print("RESULT: Network path to Neo4j is OPEN")
    print("        Firewall rules allow Bolt protocol traffic")
    print("-" * 60)
    print("\nStatus: PASS")
else:
    print(f"\n[FAIL] {message}")
    print(f"Details: {details}")
    print("\nStatus: FAIL")

---

## Section 3: Neo4j Python Driver Test

**Expected Result**: PASS - Proves credentials work and Neo4j is accessible.

In [None]:
# Test Neo4j Python driver connectivity
import time

print("=" * 60)
print("TEST: Neo4j Python Driver")
print("=" * 60)
print(f"\nTarget: {NEO4J_BOLT_URI}")
print("Testing: Can we authenticate and execute queries via Bolt protocol?")

from neo4j import GraphDatabase

try:
    start_time = time.time()

    # Use the driver as a context manager so it is closed even when
    # verification or a query raises (previously it leaked on failure).
    with GraphDatabase.driver(NEO4J_BOLT_URI, auth=(NEO4J_USER, NEO4J_PASSWORD)) as driver:
        # Verify connectivity
        driver.verify_connectivity()
        connect_time = (time.time() - start_time) * 1000

        print("\n" + "=" * 60)
        print(">>> AUTHENTICATION SUCCESSFUL <<<")
        print("=" * 60)
        print(f"\n[PASS] Driver connected and authenticated in {connect_time:.1f}ms")

        # Open the session against the configured database so the test
        # exercises the same database that is reported below and that the
        # JDBC URL targets.
        with driver.session(database=NEO4J_DATABASE) as session:
            # Test simple query
            query_start = time.time()
            test_record = session.run("RETURN 1 AS test").single()
            query_time = (time.time() - query_start) * 1000
            print(f"[PASS] Query executed: RETURN 1 = {test_record['test']} ({query_time:.1f}ms)")

            # Get Neo4j version
            components = session.run("CALL dbms.components() YIELD name, versions RETURN name, versions")
            neo4j_info = [f"{row['name']} {row['versions']}" for row in components]

        total_time = (time.time() - start_time) * 1000

    print(f"\nConnection Details:")
    print(f"  - URI: {NEO4J_BOLT_URI}")
    print(f"  - User: {NEO4J_USER}")
    print(f"  - Database: {NEO4J_DATABASE}")
    print(f"  - Neo4j Server: {', '.join(neo4j_info)}")
    print(f"  - Connection Time: {connect_time:.1f}ms")
    print(f"  - Total Test Time: {total_time:.1f}ms")

    print("\n" + "-" * 60)
    print("RESULT: Neo4j Python Driver connection WORKING")
    print("        Credentials valid, Bolt protocol functional")
    print("-" * 60)
    print("\nStatus: PASS")

except Exception as e:
    # Diagnostic notebook: report the failure and let later cells still run.
    print(f"\n[FAIL] Connection failed: {e}")
    print("\nStatus: FAIL")

---

## Section 4: Neo4j Spark Connector (Working Baseline)

**Expected Result**: PASS - This is our working baseline that proves Spark can communicate with Neo4j.

In [None]:
# Baseline: run a trivial Cypher query through the Neo4j Spark Connector.
banner = "=" * 60
print(banner)
print("TEST: Neo4j Spark Connector (org.neo4j.spark.DataSource)")
print(banner)

try:
    # Build the reader with basic (username/password) authentication.
    connector_reader = (
        spark.read.format("org.neo4j.spark.DataSource")
        .option("url", NEO4J_BOLT_URI)
        .option("authentication.type", "basic")
        .option("authentication.basic.username", NEO4J_USER)
        .option("authentication.basic.password", NEO4J_PASSWORD)
        .option("query", "RETURN 'Spark Connector Works!' AS message, 1 AS value")
    )
    df = connector_reader.load()
except Exception as err:
    print(f"\n[FAIL] Spark Connector failed: {err}")
    print("\nStatus: FAIL")
else:
    # The except clause above still covers this success path: a failure in
    # show() is reported the same way via the nested handler below.
    try:
        print("\n[PASS] Spark Connector query executed successfully:")
        df.show(truncate=False)
        print("\nStatus: PASS")
    except Exception as err:
        print(f"\n[FAIL] Spark Connector failed: {err}")
        print("\nStatus: FAIL")

---

## Section 5: Direct JDBC Tests (Bypassing Unity Catalog)

These tests use the Neo4j JDBC driver directly with Spark, **without** Unity Catalog's SafeSpark wrapper.

**Note**: Requires the JDBC JAR to be installed as a cluster library (not just in a UC Volume).

**Limitation Discovered**: Spark's JDBC data source wraps any query passed via the `query` option in a subquery for schema inference:
```sql
SELECT * FROM (your_query) SPARK_GEN_SUBQ_N WHERE 1=0
```
This breaks native Cypher even with the `FORCE_CYPHER` hint: the hint ends up inside the subquery, while the outer wrapper is still SQL.

**Schema Inference Issue**: When using `dbtable` option, Spark's schema inference returns `NullType()` for all columns from Neo4j JDBC. This causes `No column has been read prior to this call` error when reading data. **Fix**: Use `customSchema` option to explicitly specify column types.

**Workarounds**:
1. Use `dbtable` option with `customSchema` (required to avoid NullType inference)
2. Use `query` option with `customSchema` for SQL queries
3. Use Neo4j Spark Connector instead of JDBC (Section 4 - works without customSchema)

In [None]:
# Direct JDBC read of a Neo4j label through the `dbtable` option
# (the label is read as a table, so no subquery wrapper is involved).
banner = "=" * 60
print(banner)
print("TEST: Direct JDBC - dbtable option (reads label as table)")
print(banner)
print(f"URL: {NEO4J_JDBC_URL_SQL}")

# Label to read. 'Aircraft' is only an example: replace it with a label that
# exists in your Neo4j database.
TEST_LABEL = "Aircraft"  # Change this to a label in your database

# customSchema is REQUIRED with dbtable on Neo4j JDBC. Without it Spark's
# schema inference yields NullType() for every column and reading data fails
# with "No column has been read prior to this call".
# Column names/types must match the actual node properties; names containing
# special characters (such as $) need backticks.
AIRCRAFT_SCHEMA = "`v$id` STRING, aircraft_id STRING, tail_number STRING, icao24 STRING, model STRING, operator STRING, manufacturer STRING"

try:
    label_reader = (
        spark.read.format("jdbc")
        .option("url", NEO4J_JDBC_URL_SQL)
        .option("driver", "org.neo4j.jdbc.Neo4jDriver")
        .option("user", NEO4J_USER)
        .option("password", NEO4J_PASSWORD)
        .option("dbtable", TEST_LABEL)
        .option("customSchema", AIRCRAFT_SCHEMA)
    )
    df = label_reader.load()

    print(f"\n[PASS] Direct JDBC dbtable '{TEST_LABEL}' read successfully:")
    print(f"Schema: {df.schema}")
    df.show(5, truncate=False)
    print("\nStatus: PASS")

except Exception as err:
    for line in (
        f"\n[FAIL] Direct JDBC dbtable failed: {err}",
        "\nStatus: FAIL",
        "\nNote: Ensure the label exists in Neo4j and JAR is installed as cluster library.",
        "Also verify customSchema column names match your Neo4j node properties.",
    ):
        print(line)

In [None]:
# Direct JDBC with SQL translation: the SQL text is converted to Cypher by
# the Neo4j JDBC driver (enabled via the URL's enableSQLTranslation flag).
banner = "=" * 60
print(banner)
print("TEST: Direct JDBC - SQL Translation")
print(banner)
print(f"URL: {NEO4J_JDBC_URL_SQL}")

try:
    # customSchema supplies the column type up front, bypassing Spark's
    # own schema inference.
    translation_reader = (
        spark.read.format("jdbc")
        .option("url", NEO4J_JDBC_URL_SQL)
        .option("driver", "org.neo4j.jdbc.Neo4jDriver")
        .option("user", NEO4J_USER)
        .option("password", NEO4J_PASSWORD)
        .option("query", "SELECT 1 AS value")
        .option("customSchema", "value INT")
    )
    df = translation_reader.load()

    print("\n[PASS] Direct JDBC (SQL translation) query executed:")
    df.show(truncate=False)
    print("\nStatus: PASS")

except Exception as err:
    print(f"\n[FAIL] Direct JDBC with SQL translation failed: {err}")
    print("\nStatus: FAIL")

In [None]:
# Direct JDBC aggregate query (COUNT) through SQL translation.
banner = "=" * 60
print(banner)
print("TEST: Direct JDBC - SQL Aggregate (COUNT)")
print(banner)
print(f"URL: {NEO4J_JDBC_URL_SQL}")

# Translation performed by the driver for this query:
#   SQL:    SELECT COUNT(*) AS flight_count FROM Flight
#   Cypher: MATCH (n:Flight) RETURN count(n) AS flight_count
try:
    count_reader = (
        spark.read.format("jdbc")
        .option("url", NEO4J_JDBC_URL_SQL)
        .option("driver", "org.neo4j.jdbc.Neo4jDriver")
        .option("user", NEO4J_USER)
        .option("password", NEO4J_PASSWORD)
        .option("query", "SELECT COUNT(*) AS flight_count FROM Flight")
        .option("customSchema", "flight_count LONG")
    )
    df = count_reader.load()

    print("\n[PASS] Direct JDBC SQL aggregate query executed:")
    df.show(truncate=False)
    print("\nStatus: PASS")

except Exception as err:
    print(f"\n[FAIL] Direct JDBC aggregate query failed: {err}")
    print("\nStatus: FAIL")
    print("\nNote: Ensure 'Flight' label exists in your Neo4j database, or change to a label that exists.")

In [None]:
# Direct JDBC JOIN translation: NATURAL JOINs become a Cypher relationship pattern.
banner = "=" * 60
print(banner)
print("TEST: Direct JDBC - SQL JOIN Translation")
print(banner)
print(f"URL: {NEO4J_JDBC_URL_SQL}")

# How the Neo4j JDBC driver maps this query:
#   SQL:    SELECT COUNT(*) FROM Flight f NATURAL JOIN DEPARTS_FROM r NATURAL JOIN Airport a
#   Cypher: MATCH (f:Flight)-[:DEPARTS_FROM]->(a:Airport) RETURN count(*) AS cnt
#
# Reference: https://neo4j.com/docs/jdbc-manual/current/sql2cypher/
try:
    join_reader = (
        spark.read.format("jdbc")
        .option("url", NEO4J_JDBC_URL_SQL)
        .option("driver", "org.neo4j.jdbc.Neo4jDriver")
        .option("user", NEO4J_USER)
        .option("password", NEO4J_PASSWORD)
        .option("query", """SELECT COUNT(*) AS cnt
                           FROM Flight f
                           NATURAL JOIN DEPARTS_FROM r
                           NATURAL JOIN Airport a""")
        .option("customSchema", "cnt LONG")
    )
    df = join_reader.load()

    print("\n[PASS] Direct JDBC SQL JOIN translation executed:")
    print("SQL JOINs translated to Cypher relationship pattern!")
    df.show(truncate=False)
    print("\nStatus: PASS")

except Exception as err:
    for line in (
        f"\n[FAIL] Direct JDBC JOIN translation failed: {err}",
        "\nStatus: FAIL",
        "\nNote: Requires Flight-[:DEPARTS_FROM]->Airport pattern in Neo4j.",
        "Adjust labels/relationship types to match your graph model.",
    ):
        print(line)

---

## Section 6: Unity Catalog JDBC Connection

This section creates and tests the Unity Catalog JDBC connection, which uses the SafeSpark wrapper.

In [None]:
# Create Unity Catalog JDBC Connection
print("=" * 60)
print("SETUP: Create Unity Catalog JDBC Connection")
print("=" * 60)


def _sql_literal(value):
    """Escape a value for use inside a single-quoted SQL string literal.

    Backslashes and single quotes are backslash-escaped so that credentials
    or URLs containing those characters cannot terminate the literal early
    or change the statement.
    NOTE(review): assumes the default SQL parser setting where backslash is
    the escape character in string literals - confirm if
    spark.sql.parser.escapedStringLiterals has been changed.
    """
    return str(value).replace("\\", "\\\\").replace("'", "\\'")


# Drop existing connection
spark.sql(f"DROP CONNECTION IF EXISTS {UC_CONNECTION_NAME}")
print(f"Dropped existing connection (if any): {UC_CONNECTION_NAME}")

# Create connection with explicit driver class
# NOTE: customSchema must be in externalOptionsAllowList to bypass Spark schema inference
# NOTE: java_dependencies includes both the full bundle and spark cleaner JARs
# NOTE: every interpolated value goes through _sql_literal; create_sql contains
#       the password and must not be printed.
create_sql = f"""
CREATE CONNECTION {UC_CONNECTION_NAME} TYPE JDBC
ENVIRONMENT (
  java_dependencies '{_sql_literal(JAVA_DEPENDENCIES)}'
)
OPTIONS (
  url '{_sql_literal(NEO4J_JDBC_URL_SQL)}',
  user '{_sql_literal(NEO4J_USER)}',
  password '{_sql_literal(NEO4J_PASSWORD)}',
  driver 'org.neo4j.jdbc.Neo4jDriver',
  externalOptionsAllowList 'dbtable,query,partitionColumn,lowerBound,upperBound,numPartitions,fetchSize,customSchema'
)
"""

print(f"\n[INFO] java_dependencies: {JAVA_DEPENDENCIES}")

try:
    spark.sql(create_sql)
    print(f"\n[PASS] Connection created: {UC_CONNECTION_NAME}")
except Exception as e:
    # Report and continue so the remaining diagnostic cells can still run.
    print(f"\n[FAIL] Failed to create connection: {e}")

In [None]:
# Show the stored configuration of the Unity Catalog connection.
banner = "=" * 60
print(banner)
print("VERIFY: Connection Configuration")
print(banner)

try:
    describe_sql = f"DESCRIBE CONNECTION {UC_CONNECTION_NAME}"
    df = spark.sql(describe_sql)
    print("\nConnection details:")
    df.show(truncate=False)
except Exception as err:
    print(f"\n[FAIL] Cannot describe connection: {err}")

---

## Section 7: Unity Catalog JDBC Tests

These tests use the Unity Catalog connection through the SafeSpark JDBC wrapper.

In [None]:
# Query through the Unity Catalog connection using the Spark DataFrame reader.
banner = "=" * 60
print(banner)
print("TEST: Unity Catalog - Spark DataFrame API")
print(banner)

try:
    uc_reader = (
        spark.read.format("jdbc")
        .option("databricks.connection", UC_CONNECTION_NAME)
        .option("query", "SELECT 1 AS test")
    )
    df = uc_reader.load()

    print("\n[PASS] Unity Catalog Spark DataFrame API:")
    df.show()
    print("\nStatus: PASS")

except Exception as err:
    print("\n[FAIL] Unity Catalog Spark DataFrame API failed:")
    print(f"\nError: {err}")
    print("\nStatus: FAIL")

In [None]:
# Query through the Unity Catalog connection with native Cypher (FORCE_CYPHER hint).
banner = "=" * 60
print(banner)
print("TEST: Unity Catalog - Native Cypher (FORCE_CYPHER)")
print(banner)

# Background: for schema inference Spark wraps the `query` option like so:
#   SELECT * FROM (your_query) SPARK_GEN_SUBQ_N WHERE 1=0
# That wrapper breaks native Cypher, so customSchema is supplied to bypass
# schema inference.

try:
    cypher_reader = (
        spark.read.format("jdbc")
        .option("databricks.connection", UC_CONNECTION_NAME)
        .option("query", "/*+ NEO4J FORCE_CYPHER */ RETURN 1 AS test")
        .option("customSchema", "test INT")
    )
    df = cypher_reader.load()

    print("\n[PASS] Unity Catalog with FORCE_CYPHER:")
    df.show()
    print("\nStatus: PASS")

except Exception as err:
    print("\n[FAIL] Unity Catalog with FORCE_CYPHER failed:")
    print(f"\nError: {err}")
    print("\nStatus: FAIL")

In [None]:
# Query through the Unity Catalog connection using the remote_query() SQL function.
banner = "=" * 60
print(banner)
print("TEST: Unity Catalog - remote_query() Function")
print(banner)

try:
    # Statement text is built first, then executed, so both steps stay
    # inside the same error handler.
    remote_sql = f"""
        SELECT * FROM remote_query(
            '{UC_CONNECTION_NAME}',
            query => 'SELECT 1 AS test'
        )
    """
    df = spark.sql(remote_sql)

    print("\n[PASS] Unity Catalog remote_query():")
    df.show()
    print("\nStatus: PASS")

except Exception as err:
    print("\n[FAIL] Unity Catalog remote_query() failed:")
    print(f"\nError: {err}")
    print("\nStatus: FAIL")

In [None]:
# SQL aggregate through the Unity Catalog connection, with an explicit customSchema.
banner = "=" * 60
print(banner)
print("TEST: Unity Catalog - SQL Aggregate with Custom Schema")
print(banner)

# Why customSchema is used with Neo4j JDBC
# ----------------------------------------
# For schema inference Spark wraps the query in a subquery:
#   SELECT * FROM (your_query) SPARK_GEN_SUBQ WHERE 1=0
# Neo4j JDBC reports NullType() for every column during that step, which
# leads to "No column has been read" errors once data is read.
#
# Attempted workaround: declare the column types via customSchema.
#   - column names MUST equal the query's result aliases exactly
#   - types are Spark SQL types: STRING, LONG, INT, DOUBLE, BOOLEAN, DECIMAL(p,s), ...
#   - a partial schema is allowed; unlisted columns fall back to inference
#
# NOTE: per the ticket author, this workaround also failed to work.
#
# Reference: https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html

# Schema describing the single aggregate column returned by the query
FLIGHT_COUNT_SCHEMA = "flight_count LONG"

try:
    aggregate_reader = (
        spark.read.format("jdbc")
        .option("databricks.connection", UC_CONNECTION_NAME)
        .option("query", "SELECT COUNT(*) AS flight_count FROM Flight")
        .option("customSchema", FLIGHT_COUNT_SCHEMA)
    )
    df = aggregate_reader.load()

    print("\n[PASS] Unity Catalog SQL Aggregate with customSchema:")
    print(f"Schema applied: {FLIGHT_COUNT_SCHEMA}")
    print(f"DataFrame schema: {df.schema}")
    df.show(truncate=False)
    print("\nStatus: PASS")

except Exception as err:
    for line in (
        "\n[FAIL] Unity Catalog SQL aggregate query failed:",
        f"\nError: {err}",
        "\nStatus: FAIL",
        "\nNote: Ensure 'Flight' label exists in your Neo4j database.",
        "Adjust the label name to match your graph model if needed.",
    ):
        print(line)