# ICIJ analysis: load Neo4j

## Set up

Load the Python dependencies.

In [1]:
import os
import pathlib
import typing

from graphdatascience import GraphDataScience
from icecream import ic
from tqdm import tqdm
import dotenv
import neo4j
import pandas as pd
import watermark

%load_ext watermark

In [2]:
# Record the run environment for reproducibility: timestamp, Python and
# system details first, then the versions of the imported packages.
%watermark
%watermark --iversions

Last updated: 2024-07-08T11:45:48.137332-07:00

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.26.0

Compiler    : Clang 13.0.0 (clang-1300.0.29.30)
OS          : Darwin
Release     : 23.5.0
Machine     : arm64
Processor   : arm
CPU cores   : 14
Architecture: 64bit

watermark: 2.4.3
neo4j    : 5.22.0
pandas   : 2.2.2



Establish a GDS connection to Neo4j.

In [3]:
# Read the connection settings from a `.env` file, if one can be found.
dotenv.load_dotenv(dotenv.find_dotenv())

# `os.environ.get()` returns `None` for an unset variable. Check the
# required settings up front, so that a missing one is reported by name
# instead of being passed through to the driver as `None`.
required_env: typing.List[str] = [ "NEO4J_BOLT", "NEO4J_USER", "NEO4J_PASS" ]
missing_env: typing.List[str] = [
    name
    for name in required_env
    if os.environ.get(name) is None
]

if missing_env:
    raise RuntimeError(
        f"Neo4j settings missing from the environment or .env file: {', '.join(missing_env)}"
    )

bolt_uri: str = os.environ["NEO4J_BOLT"]
username: str = os.environ["NEO4J_USER"]
password: str = os.environ["NEO4J_PASS"]

# The database name stays optional: when `NEO4J_DBMS` is unset, `None` is
# passed through, exactly as before.
database: typing.Optional[str] = os.environ.get("NEO4J_DBMS")

gds:GraphDataScience = GraphDataScience(
    bolt_uri,
    auth = ( username, password, ),
    database = database,
    aura_ds = False,
)

UnableToConnectError: {code: Neo.ClientError.Security.Unauthorized} {message: The client is unauthorized due to authentication failure.}

## Schema definitions

### `Entity` nodes

In [None]:
# Rebuild the `entity_node_key` constraint from scratch: drop any existing
# definition, then declare `node_id` as the node key for `Entity` nodes.
drop_constraint_query: str = """
DROP CONSTRAINT `entity_node_key` IF EXISTS
"""

create_constraint_query: str = """
CREATE CONSTRAINT `entity_node_key` IF NOT EXISTS
  FOR (ent:Entity)
  REQUIRE ent.node_id IS NODE KEY
"""

gds.run_cypher(drop_constraint_query)
gds.run_cypher(create_constraint_query)

Load the Senzing entities.

In [None]:
# One row per resolved entity: its uid, name, and `has_ref` flag.
# NOTE(review): `entities` is not defined in any cell visible above this
# one; it presumably maps an id to an object with `entity_uid`, `name`,
# `has_ref` and `records` attributes -- confirm where it is loaded.
df_ent: pd.DataFrame = pd.DataFrame([
    {
        "uid": entity.entity_uid,
        "name": entity.name,
        "has_ref": entity.has_ref,
    }
    for entity in entities.values()
])

# MERGE on the identifying property only, then SET the descriptive ones.
# Putting `name` and `has_ref` inside the MERGE pattern would make the
# statement fail for a null value, and would create a second node for the
# same `uid` whenever one of those properties changed between runs.
unwind_query: str = """
UNWIND $rows AS row
CALL {
  WITH row
  MERGE (ent:SzEntity {uid: row.uid})
  SET ent.name = row.name, ent.has_ref = row.has_ref
} IN TRANSACTIONS OF 10000 ROWS
    """

gds.run_cypher(
    unwind_query,
    {"rows": df_ent.to_dict(orient = "records")},
)

In [None]:
# Fetch every node that carries a `node_id`, returning the id, the first
# of the node's labels, and the node's name.
rec_query: str = """
MATCH (rec)
WHERE rec.node_id IS NOT NULL
RETURN rec.node_id AS node_id, head(labels(rec)) AS label, rec.name AS name
  """

df_rec: pd.DataFrame = gds.run_cypher(rec_query)

df_rec

In [None]:
# Lookup table from a record's `node_id` (converted to a string) to its label.
ent_kind: dict = dict(zip(df_rec.node_id.astype(str), df_rec.label))

In [None]:
# Flatten the entity -> records mapping into one row per (entity, record)
# pair, keeping only the records whose id appears in `ent_kind`.
load_rows: list = []

for entity in entities.values():
    for record_uid, match_key in entity.records.items():
        if record_uid not in ent_kind:
            continue

        load_rows.append({
            "entity_uid": entity.entity_uid,
            "record_uid": record_uid,
            "label": ent_kind[record_uid],
            "match_key": match_key,
        })

df_load: pd.DataFrame = pd.DataFrame(load_rows)

len(df_load)

In [None]:
# Display the entity-to-record rows that will be linked in the next step.
df_load

Connect the Senzing entities with records already loaded in Neo4j.

In [None]:
# Link each Senzing entity to the records it resolves.
#
# Values are passed as query parameters instead of being formatted into the
# Cypher text, so a quote or backslash in `match_key` cannot break (or
# alter) the query. A node label cannot be supplied as a parameter here, so
# rows are grouped by label and the backtick-escaped label is the only
# thing formatted into the query text -- one query text per label, not one
# per row.
RESOLVES_BATCH_SIZE: int = 10000

# `groupby("label")` raises on a frame with no columns, which is what an
# empty `df_load` is; the original per-row loop simply did nothing then.
label_groups: list = list(df_load.groupby("label")) if len(df_load) > 0 else []

with tqdm(total = len(df_load), desc = "load rows") as progress:
    for label, df_label in label_groups:
        safe_label: str = str(label).replace("`", "``")

        # `toInteger()` keeps the comparison numeric, matching the unquoted
        # literal that was previously formatted into the query.
        # NOTE(review): assumes `node_id` is stored as an integer -- confirm.
        resolve_query: str = f"""
UNWIND $rows AS row
MATCH
  (ent:SzEntity {{uid: row.entity_uid}}),
  (rec:`{safe_label}` {{node_id: toInteger(row.record_uid)}})
MERGE (ent)-[rel:RESOLVES {{match_key: row.match_key}}]->(rec)
"""

        # `match_key` is stringified, as the previous f-string did implicitly.
        rows: list = (
            df_label
            .assign(match_key = df_label.match_key.astype(str))
            [[ "entity_uid", "record_uid", "match_key" ]]
            .to_dict(orient = "records")
        )

        # Send bounded batches so no single transaction holds every row.
        for start in range(0, len(rows), RESOLVES_BATCH_SIZE):
            batch: list = rows[start : start + RESOLVES_BATCH_SIZE]
            gds.run_cypher(resolve_query, {"rows": batch})
            progress.update(len(batch))

Verify the results by listing the `RESOLVES` relationships that were created.

In [None]:
# List every RESOLVES relationship: the source node's labels and name,
# plus the `node_id` of the record it points to.
test_query: str = """
MATCH (ent)-[rel:RESOLVES]->(rec)
RETURN labels(ent), ent.name, rec.node_id
  """

df_test: pd.DataFrame = gds.run_cypher(test_query)

df_test