In [0]:
!pip install neo4j tqdm

In [0]:
%restart_python

In [0]:
# Databricks to Neo4j Data Loading Notebook

# This notebook demonstrates how to extract data from Databricks tables (without Spark Connectors)
# and load it into a Neo4j graph database.
# This version is optimized for running directly within a Databricks workspace.

# --- 1. Prerequisites and Library Installation ---
# Ensure you have the necessary libraries installed.
# If running in a Databricks notebook, you might need to install these using pip.
# %pip install neo4j pandas tqdm

import pandas as pd
from neo4j import GraphDatabase
import logging
from tqdm.notebook import tqdm # For progress bars in notebooks

# SparkSession is implicitly available in Databricks notebooks as 'spark'
# from pyspark.sql import SparkSession
# spark = SparkSession.builder.appName("Neo4jDataLoad").getOrCreate()

# Configure logging for better visibility
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- 2. Configuration ---
# IMPORTANT: Replace these placeholders with your actual secret scope name.
# For production environments, consider using Databricks Secrets or environment variables
# to manage sensitive credentials securely.

# Neo4j Secret Scope Name
SECRET_SCOPE_NAME = "my-neo4j-scope" # IMPORTANT: Replace with your Databricks secret scope name

# Neo4j Connection Details (will be pulled from Databricks Secrets)
# These variables will be populated using dbutils.secrets.get()
NEO4J_URI = None
NEO4J_USERNAME = None
NEO4J_PASSWORD = None
NEO4J_DATABASE = None

# Batch size for Neo4j transactions (number of items per UNWIND statement)
BATCH_SIZE = 5000

# --- 3. Databricks Data Extraction Function (using Spark directly) ---

def load_data_from_databricks(table_name):
    """
    Loads data from a specified Databricks table into a Pandas DataFrame
    using the available SparkSession.
    Args:
        table_name (str): The name of the table to load.
    Returns:
        pd.DataFrame: A DataFrame containing the table data, or None if an error occurs.
    """
    logging.info(f"Loading data from Databricks table: {table_name} using Spark.")
    try:
        # Access the global SparkSession directly available in Databricks notebooks
        df_spark = spark.table(table_name)
        df_pandas = df_spark.toPandas()
        logging.info(f"Successfully loaded {len(df_pandas)} rows from {table_name}.")
        return df_pandas
    except Exception as e:
        logging.error(f"Error loading data from table {table_name} using Spark: {e}")
        return None

# --- 4. Neo4j Connection and Utility Functions ---

def get_neo4j_driver():
    """Establishes and returns a Neo4j GraphDatabase driver, pulling credentials from Databricks secrets."""
    global NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD, NEO4J_DATABASE
    try:
        # Retrieve Neo4j credentials from Databricks secrets
        # dbutils is a global object available in Databricks notebooks
        NEO4J_URI = dbutils.secrets.get(scope=SECRET_SCOPE_NAME, key="neo4j-uri")
        NEO4J_USERNAME = dbutils.secrets.get(scope=SECRET_SCOPE_NAME, key="neo4j-username")
        NEO4J_PASSWORD = dbutils.secrets.get(scope=SECRET_SCOPE_NAME, key="neo4j-password")
        NEO4J_DATABASE = dbutils.secrets.get(scope=SECRET_SCOPE_NAME, key="neo4j-database")

        # For Neo4j Aura, the database name is typically part of the URI or defaults to 'neo4j'.
        # The `database` parameter in GraphDatabase.driver might not be strictly necessary if it's 'neo4j'
        # and included in the URI, but it's good practice to pass it if available.
        driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD), database=NEO4J_DATABASE)
        driver.verify_connectivity()
        logging.info("Successfully connected to Neo4j using secrets.")
        return driver
    except Exception as e:
        logging.error(f"Error connecting to Neo4j. Ensure secrets are configured correctly: {e}")
        raise

def execute_cypher_query(driver, query, parameters=None):
    """
    Executes a Cypher query in a Neo4j session.
    Args:
        driver: The Neo4j GraphDatabase driver.
        query (str): The Cypher query string.
        parameters (dict, optional): Parameters for the Cypher query. Defaults to None.
    Returns:
        neo4j.Result: The result of the query execution.
    """
    with driver.session() as session:
        try:
            result = session.run(query, parameters)
            return result
        except Exception as e:
            logging.error(f"Error executing Cypher query: {query}\nParameters: {parameters}\nError: {e}")
            raise

def clear_neo4j_database(driver):
    """
    Clears all nodes and relationships from the Neo4j database.
    Use with extreme caution, especially in production environments!
    """
    logging.warning("Clearing existing data in Neo4j. This cannot be undone!")
    query = "MATCH (n) DETACH DELETE n"
    execute_cypher_query(driver, query)
    logging.info("Neo4j database cleared.")

def prepare_properties(row):
    """
    Prepares a dictionary of properties for a Neo4j node/relationship,
    handling NaN values by converting them to None.
    """
    props = row.where(pd.notna(row), None).to_dict()
    # Convert any datetime objects to string for consistent Neo4j handling
    for key, value in props.items():
        if isinstance(value, pd.Timestamp):
            props[key] = str(value.date())
    return props

# --- 5. Node and Relationship Creation Functions ---

def create_beneficiary_nodes(driver, df_beneficiary):
    """Creates Beneficiary nodes in Neo4j."""
    logging.info("Creating Beneficiary nodes...")
    query = """
    UNWIND $data AS row
    MERGE (b:Beneficiary {BeneID: row.BeneID})
    SET b += row
    """
    data = [prepare_properties(row) for _, row in df_beneficiary.iterrows()]
    
    for i in tqdm(range(0, len(data), BATCH_SIZE), desc="Beneficiary Nodes"):
        batch = data[i:i + BATCH_SIZE]
        execute_cypher_query(driver, query, parameters={"data": batch})
    logging.info(f"Created/Updated {len(df_beneficiary)} Beneficiary nodes.")

def create_provider_nodes(driver, df_provider):
    """Creates Provider nodes in Neo4j."""
    logging.info("Creating Provider nodes...")
    query = """
    UNWIND $data AS row
    MERGE (p:Provider {ProviderID: row.ProviderID})
    SET p += row
    """
   
    data = [prepare_properties(row) for _, row in df_provider.iterrows()]

    for i in tqdm(range(0, len(data), BATCH_SIZE), desc="Provider Nodes"):
        batch = data[i:i + BATCH_SIZE]
        execute_cypher_query(driver, query, parameters={"data": batch})
    logging.info(f"Created/Updated {len(df_provider)} Provider nodes.")

def create_physician_nodes(driver, df_claims):
    """
    Extracts unique physician IDs from claims data and creates Physician nodes.
    """
    logging.info("Creating Physician nodes from claim data...")
    
    # Concatenate physician columns and filter out NaN values.
    # Assuming 'NA' string values are handled at the source.
    all_physicians = pd.concat([
        df_claims['AttendingPhysician'].dropna(),
        df_claims['OperatingPhysician'].dropna(),
        df_claims['OtherPhysician'].dropna()
    ]).unique()

    if len(all_physicians) == 0:
        logging.info("No valid physician IDs found in claim data.")
        return

    query = """
    UNWIND $physician_ids AS physician_id
    MERGE (ph:Physician {Physician: physician_id})
    """
    for i in tqdm(range(0, len(all_physicians), BATCH_SIZE), desc="Physician Nodes"):
        batch = all_physicians[i:i + BATCH_SIZE].tolist()
        execute_cypher_query(driver, query, parameters={"physician_ids": batch})
    logging.info(f"Created/Updated {len(all_physicians)} Physician nodes.")

def create_diagnosis_nodes(driver, df_claims):
    """
    Extracts unique diagnosis codes from claims data and creates Diagnosis nodes.
    """
    logging.info("Creating Diagnosis nodes from claim data...")
    diagnosis_cols = [f'ClmDiagnosisCode_{i}' for i in range(1, 11)]
    
    all_diagnosis_codes = pd.Series(dtype='object')
    for col in diagnosis_cols:
        if col in df_claims.columns:
            all_diagnosis_codes = pd.concat([all_diagnosis_codes, df_claims[col].dropna()])
            
    all_diagnosis_codes = all_diagnosis_codes.astype(str).str.strip().unique()
    all_diagnosis_codes = all_diagnosis_codes[all_diagnosis_codes != ''] 

    if len(all_diagnosis_codes) == 0:
        logging.info("No valid diagnosis codes found in claim data.")
        return

    query = """
    UNWIND $diagnosis_codes AS diagnosis_code
    MERGE (d:Diagnosis {DiagnosisCode: diagnosis_code})
    """
    for i in tqdm(range(0, len(all_diagnosis_codes), BATCH_SIZE), desc="Diagnosis Nodes"):
        batch = all_diagnosis_codes[i:i + BATCH_SIZE].tolist()
        execute_cypher_query(driver, query, parameters={"diagnosis_codes": batch})
    logging.info(f"Created/Updated {len(all_diagnosis_codes)} Diagnosis nodes.")

def create_procedure_nodes(driver, df_claims):
    """
    Extracts unique procedure codes from claims data and creates Procedure nodes.
    """
    logging.info("Creating Procedure nodes from claim data...")
    procedure_cols = [f'ClmProcedureCode_{i}' for i in range(1, 7)]
    
    all_procedure_codes = pd.Series(dtype='object')
    for col in procedure_cols:
        if col in df_claims.columns:
            # Convert to string before dropping NA to handle potential numeric/mixed types
            all_procedure_codes = pd.concat([all_procedure_codes, df_claims[col].astype(str).dropna()])
            
    all_procedure_codes = all_procedure_codes.astype(str).str.strip().unique()
    all_procedure_codes = all_procedure_codes[all_procedure_codes != ''] 

    if len(all_procedure_codes) == 0:
        logging.info("No valid procedure codes found in claim data.")
        return

    query = """
    UNWIND $procedure_codes AS procedure_code
    MERGE (proc:Procedure {ProcedureCode: procedure_code})
    """
    for i in tqdm(range(0, len(all_procedure_codes), BATCH_SIZE), desc="Procedure Nodes"):
        batch = all_procedure_codes[i:i + BATCH_SIZE].tolist()
        execute_cypher_query(driver, query, parameters={"procedure_codes": batch})
    logging.info(f"Created/Updated {len(all_procedure_codes)} Procedure nodes.")

def create_diagnosis_group_nodes(driver, df_claims):
    """
    Extracts unique DiagnosisGroupCode from claims data and creates DiagnosisGroup nodes.
    """
    logging.info("Creating DiagnosisGroup nodes from claim data...")
    
    all_diagnosis_group_codes = df_claims['DiagnosisGroupCode'].dropna().astype(str).str.strip().unique()
    all_diagnosis_group_codes = all_diagnosis_group_codes[all_diagnosis_group_codes != '']

    if len(all_diagnosis_group_codes) == 0:
        logging.info("No valid diagnosis group codes found in claim data.")
        return

    query = """
    UNWIND $group_codes AS group_code
    MERGE (dg:DiagnosisGroup {DiagnosisGroupCode: group_code})
    """
    for i in tqdm(range(0, len(all_diagnosis_group_codes), BATCH_SIZE), desc="DiagnosisGroup Nodes"):
        batch = all_diagnosis_group_codes[i:i + BATCH_SIZE].tolist()
        execute_cypher_query(driver, query, parameters={"group_codes": batch})
    logging.info(f"Created/Updated {len(all_diagnosis_group_codes)} DiagnosisGroup nodes.")

def create_diagnosis_group_hierarchy(driver, df_claims):
    """
    Creates IS_PART_OF_GROUP relationships between Diagnosis and DiagnosisGroup nodes.
    Links ClmAdmitDiagnosisCode to its DiagnosisGroupCode.
    """
    logging.info("Creating Diagnosis -> DiagnosisGroup hierarchy relationships...")
    
    # Filter claims to only include those with valid ClmAdmitDiagnosisCode and DiagnosisGroupCode
    df_filtered = df_claims[
        df_claims['ClmAdmitDiagnosisCode'].notna() & 
        (df_claims['ClmAdmitDiagnosisCode'] != '') &
        df_claims['DiagnosisGroupCode'].notna() &
        (df_claims['DiagnosisGroupCode'] != '')
    ].copy() # Use .copy() to avoid SettingWithCopyWarning

    # Ensure these columns are string type for consistent merging
    df_filtered['ClmAdmitDiagnosisCode'] = df_filtered['ClmAdmitDiagnosisCode'].astype(str).str.strip()
    df_filtered['DiagnosisGroupCode'] = df_filtered['DiagnosisGroupCode'].astype(str).str.strip()

    # Get unique pairs to avoid redundant MERGE operations
    unique_diagnosis_group_pairs = df_filtered[['ClmAdmitDiagnosisCode', 'DiagnosisGroupCode']].drop_duplicates().to_dict(orient='records')

    if not unique_diagnosis_group_pairs:
        logging.info("No valid diagnosis-group pairs found for hierarchy creation.")
        return

    query = """
    UNWIND $data AS pair
    MATCH (d:Diagnosis {DiagnosisCode: pair.ClmAdmitDiagnosisCode})
    MATCH (dg:DiagnosisGroup {DiagnosisGroupCode: pair.DiagnosisGroupCode})
    MERGE (d)-[:IS_PART_OF_GROUP]->(dg)
    """
    for i in tqdm(range(0, len(unique_diagnosis_group_pairs), BATCH_SIZE), desc="DiagnosisGroup Hierarchy"):
        batch = unique_diagnosis_group_pairs[i:i + BATCH_SIZE]
        execute_cypher_query(driver, query, parameters={"data": batch})
    logging.info(f"Created/Updated {len(unique_diagnosis_group_pairs)} Diagnosis -> DiagnosisGroup relationships.")


def create_claim_nodes_and_relationships(driver, df_claims, claim_type):
    """
    Creates Claim nodes (Inpatient_Claim or Outpatient_Claim) and their relationships
    to Beneficiary, Provider, Physician, Diagnosis, and Procedure nodes.
    Args:
        driver: The Neo4j GraphDatabase driver.
        df_claims (pd.DataFrame): DataFrame containing claim data.
        claim_type (str): 'Inpatient_Claim' or 'Outpatient_Claim'.
    """
    logging.info(f"Creating {claim_type} nodes and relationships...")
    
    # Ensure date columns are handled as strings for Neo4j properties
    date_cols = ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt']
    for col in date_cols:
        if col in df_claims.columns:
            df_claims[col] = df_claims[col].dt.strftime('%Y-%m-%d').fillna('')

    data = [prepare_properties(row) for _, row in df_claims.iterrows()]

    # Cypher query for creating Claim nodes and relationships
    query = f"""
    UNWIND $data AS row
    MERGE (c:{claim_type} {{ClaimID: row.ClaimID}})
    SET c += row
    
    // Relationship to Beneficiary
    MERGE (b:Beneficiary {{BeneID: row.BeneID}})
    MERGE (c)-[:CLAIM_BENEFICIARY]->(b)
    
    // Relationship to Provider
    MERGE (p:Provider {{ProviderID: row.ProviderID}})
    MERGE (c)-[:HAS_{claim_type.upper()}]->(p)
    
    // Relationship to Attending Physician
    WITH c, row
    WHERE row.AttendingPhysician IS NOT NULL AND row.AttendingPhysician <> ''
    MERGE (att_ph:Physician {{Physician: row.AttendingPhysician}})
    MERGE (c)-[:ATTENDED_BY]->(att_ph)
    
    // Relationship to Operating Physician
    WITH c, row
    WHERE row.OperatingPhysician IS NOT NULL AND row.OperatingPhysician <> ''
    MERGE (op_ph:Physician {{Physician: row.OperatingPhysician}})
    MERGE (c)-[:OPERATED_BY]->(op_ph)

    // Relationship to Other Physician
    WITH c, row
    WHERE row.OtherPhysician IS NOT NULL AND row.OtherPhysician <> ''
    MERGE (other_ph:Physician {{Physician: row.OtherPhysician}})
    MERGE (c)-[:ATTENDED_BY]->(other_ph)

    // Relationships to Diagnosis Nodes (ClmDiagnosisCode_1 to ClmDiagnosisCode_10)
    WITH c, row
    UNWIND range(1, 10) AS i
    WITH c, row, 'ClmDiagnosisCode_' + toString(i) AS diag_col_name
    WHERE row[diag_col_name] IS NOT NULL AND row[diag_col_name] <> ''
    MERGE (diag:Diagnosis {{DiagnosisCode: row[diag_col_name]}})
    MERGE (c)-[:HAS_DIAGNOSIS]->(diag)

    // Relationships to Procedure Nodes (ClmProcedureCode_1 to ClmProcedureCode_6)
    WITH c, row
    UNWIND range(1, 6) AS i
    WITH c, row, 'ClmProcedureCode_' + toString(i) AS proc_col_name
    WHERE row[proc_col_name] IS NOT NULL AND row[proc_col_name] <> ''
    MERGE (proc:Procedure {{ProcedureCode: toString(row[proc_col_name])}}) 
    MERGE (c)-[:HAS_PROCEDURE]->(proc)
    """

    for i in tqdm(range(0, len(data), BATCH_SIZE), desc=f"{claim_type} Nodes & Relationships"):
        batch = data[i:i + BATCH_SIZE]
        execute_cypher_query(driver, query, parameters={"data": batch})
    logging.info(f"Created/Updated {len(df_claims)} {claim_type} nodes and their relationships.")

# --- 6. Main Execution Flow ---

def main():
    """Main function to orchestrate the data loading process."""
    neo4j_driver = None
    try:
        # 1. Load Data from Databricks Tables using Spark directly
        df_beneficiary = load_data_from_databricks("Beneficiary")
        df_provider = load_data_from_databricks("Provider")
        df_inpatient_claim = load_data_from_databricks("Inpatient_Claim")
        df_outpatient_claim = load_data_from_databricks("Outpatient_Claim")

        # Concatenate claims for physician, diagnosis, procedure, and diagnosis group node creation
        df_all_claims = pd.concat([df_inpatient_claim, df_outpatient_claim], ignore_index=True)

        # Basic check if data was loaded and apply filtering for primary keys
        # Assuming 'NA' string values are handled at the source,
        # so dropping NaN values should be sufficient for primary keys.
        if df_beneficiary is not None:
            initial_rows = len(df_beneficiary)
            df_beneficiary.dropna(subset=['BeneID'], inplace=True)
            if len(df_beneficiary) < initial_rows:
                logging.warning(f"Filtered out {initial_rows - len(df_beneficiary)} rows from Beneficiary due to missing BeneID.")
        else:
            logging.error("Beneficiary DataFrame failed to load.")
            return

        if df_provider is not None:
            initial_rows = len(df_provider)
            # Now using 'ProviderID' as the column name directly from the source table
            df_provider.dropna(subset=['ProviderID'], inplace=True)
            if len(df_provider) < initial_rows:
                logging.warning(f"Filtered out {initial_rows - len(df_provider)} rows from Provider due to missing Provider ID.")
        else:
            logging.error("Provider DataFrame failed to load.")
            return

        if df_inpatient_claim is not None:
            initial_rows = len(df_inpatient_claim)
            df_inpatient_claim.dropna(subset=['ClaimID'], inplace=True)
            if len(df_inpatient_claim) < initial_rows:
                logging.warning(f"Filtered out {initial_rows - len(df_inpatient_claim)} rows from Inpatient_Claim due to missing ClaimID.")
        else:
            logging.error("Inpatient_Claim DataFrame failed to load.")
            return

        if df_outpatient_claim is not None:
            initial_rows = len(df_outpatient_claim)
            df_outpatient_claim.dropna(subset=['ClaimID'], inplace=True)
            if len(df_outpatient_claim) < initial_rows:
                logging.warning(f"Filtered out {initial_rows - len(df_outpatient_claim)} rows from Outpatient_Claim due to missing ClaimID.")
        else:
            logging.error("Outpatient_Claim DataFrame failed to load.")
            return

        # Convert date columns to datetime objects for consistent handling before passing to prepare_properties
        # This helps ensure dates are formatted correctly when converted to strings for Neo4j.
        for df in [df_inpatient_claim, df_outpatient_claim]:
            for col in ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt']:
                if col in df.columns:
                    df[col] = pd.to_datetime(df[col], errors='coerce')

        # 2. Connect to Neo4j (credentials now pulled from secrets)
        neo4j_driver = get_neo4j_driver()

        # Optional: Clear existing data in Neo4j before import
        # Uncomment the line below if you want to clear the database on each run.
        # clear_neo4j_database(neo4j_driver)

        # 3. Create Nodes and Relationships in Neo4j
        create_beneficiary_nodes(neo4j_driver, df_beneficiary)
        create_provider_nodes(neo4j_driver, df_provider)
        
        # Create Physician, Diagnosis, Procedure, and DiagnosisGroup nodes from combined claims data
        create_physician_nodes(neo4j_driver, df_all_claims)
        create_diagnosis_nodes(neo4j_driver, df_all_claims)
        create_procedure_nodes(neo4j_driver, df_all_claims)
        create_diagnosis_group_nodes(neo4j_driver, df_all_claims)

        # Create Claim nodes and their respective relationships
        create_claim_nodes_and_relationships(neo4j_driver, df_inpatient_claim, "Inpatient_Claim")
        create_claim_nodes_and_relationships(neo4j_driver, df_outpatient_claim, "Outpatient_Claim")

        # Create hierarchy relationship for DiagnosisGroup
        create_diagnosis_group_hierarchy(neo4j_driver, df_all_claims)

        logging.info("Data loading to Neo4j completed successfully!")

    except Exception as e:
        logging.critical(f"An unhandled error occurred during the process: {e}")
    finally:
        # Close connections
        if neo4j_driver:
            neo4j_driver.close()
            logging.info("Neo4j driver closed.")

# --- Execute the main function ---
if __name__ == "__main__":
    main()

# --- 7. Verification (Optional) ---
# After running the notebook, you can connect to your Neo4j Browser (usually at http://localhost:7474)
# and run some Cypher queries to verify the data.

# Example Verification Queries:
# - Count all nodes: MATCH (n) RETURN count(n)
# - Count nodes by label: MATCH (n:Beneficiary) RETURN count(n)
# - Find a beneficiary and their claims:
#   MATCH (b:Beneficiary {BeneID: 'BENEID_EXAMPLE'})-[:CLAIM_BENEFICIARY]-(c)
#   RETURN b, c
# - Find a provider and their claims:
#   MATCH (p:Provider {ProviderID: 'PROVIDERID_EXAMPLE'})-[:HAS_INPATIENT_CLAIM|HAS_OUTPATIENT_CLAIM]-(c)
#   RETURN p, c
# - Find a physician and claims they attended:
#   MATCH (ph:Physician {Physician: 'PHYSICIAN_EXAMPLE'})<-[:ATTENDED_BY|OPERATED_BY]-(c)
#   RETURN ph, c
# - Find a diagnosis and its group:
#   MATCH (d:Diagnosis {DiagnosisCode: 'DIAGNOSIS_CODE_EXAMPLE'})-[:IS_PART_OF_GROUP]->(dg:DiagnosisGroup)
#   RETURN d, dg
# - Find a claim and its diagnoses:
#   MATCH (c:Inpatient_Claim {ClaimID: 'CLAIMID_EXAMPLE'})-[:HAS_DIAGNOSIS]->(d:Diagnosis)
#   RETURN c, d
# - Find a claim and its procedures:
#   MATCH (c:Outpatient_Claim {ClaimID: 'CLAIMID_EXAMPLE'})-[:HAS_PROCEDURE]->(p:Procedure)
#   RETURN c, p


2025-07-20 12:27:59,274 - INFO - Loading data from Databricks table: Beneficiary using Spark.
2025-07-20 12:28:00,411 - INFO - Successfully loaded 63968 rows from Beneficiary.
2025-07-20 12:28:00,412 - INFO - Loading data from Databricks table: Provider using Spark.
2025-07-20 12:28:00,735 - INFO - Successfully loaded 1353 rows from Provider.
2025-07-20 12:28:00,735 - INFO - Loading data from Databricks table: Inpatient_Claim using Spark.
2025-07-20 12:28:01,138 - INFO - Successfully loaded 9551 rows from Inpatient_Claim.
2025-07-20 12:28:01,140 - INFO - Loading data from Databricks table: Outpatient_Claim using Spark.
2025-07-20 12:28:02,138 - INFO - Successfully loaded 125841 rows from Outpatient_Claim.
2025-07-20 12:28:03,396 - INFO - Successfully connected to Neo4j using secrets.
2025-07-20 12:28:03,396 - INFO - Creating Beneficiary nodes...


Beneficiary Nodes:   0%|          | 0/13 [00:00<?, ?it/s]

2025-07-20 12:28:40,745 - INFO - Created/Updated 63968 Beneficiary nodes.
2025-07-20 12:28:40,745 - INFO - Creating Provider nodes...


Provider Nodes:   0%|          | 0/1 [00:00<?, ?it/s]

2025-07-20 12:28:41,596 - INFO - Created/Updated 1353 Provider nodes.
2025-07-20 12:28:41,597 - INFO - Creating Physician nodes from claim data...


Physician Nodes:   0%|          | 0/6 [00:00<?, ?it/s]

2025-07-20 12:30:12,009 - INFO - Created/Updated 26591 Physician nodes.
2025-07-20 12:30:12,012 - INFO - Creating Diagnosis nodes from claim data...


Diagnosis Nodes:   0%|          | 0/2 [00:00<?, ?it/s]

2025-07-20 12:30:21,218 - INFO - Created/Updated 9057 Diagnosis nodes.
2025-07-20 12:30:21,219 - INFO - Creating Procedure nodes from claim data...


Procedure Nodes:   0%|          | 0/1 [00:00<?, ?it/s]

2025-07-20 12:30:22,045 - INFO - Created/Updated 809 Procedure nodes.
2025-07-20 12:30:22,046 - INFO - Creating DiagnosisGroup nodes from claim data...


DiagnosisGroup Nodes:   0%|          | 0/1 [00:00<?, ?it/s]

2025-07-20 12:30:22,226 - INFO - Created/Updated 712 DiagnosisGroup nodes.
2025-07-20 12:30:22,227 - INFO - Creating Inpatient_Claim nodes and relationships...


Inpatient_Claim Nodes & Relationships:   0%|          | 0/2 [00:00<?, ?it/s]

2025-07-20 12:33:14,328 - INFO - Created/Updated 9551 Inpatient_Claim nodes and their relationships.
2025-07-20 12:33:14,332 - INFO - Creating Outpatient_Claim nodes and relationships...


Outpatient_Claim Nodes & Relationships:   0%|          | 0/26 [00:00<?, ?it/s]

2025-07-20 13:48:26,437 - INFO - Created/Updated 125841 Outpatient_Claim nodes and their relationships.
2025-07-20 13:48:26,438 - INFO - Creating Diagnosis -> DiagnosisGroup hierarchy relationships...


DiagnosisGroup Hierarchy:   0%|          | 0/2 [00:00<?, ?it/s]

2025-07-20 13:48:43,747 - INFO - Created/Updated 6728 Diagnosis -> DiagnosisGroup relationships.
2025-07-20 13:48:43,747 - INFO - Data loading to Neo4j completed successfully!
2025-07-20 13:48:43,751 - INFO - Neo4j driver closed.
