# ICIJ analysis: load KùzuDB

## Set up

Load the Python dependencies.

In [1]:
import pathlib
import typing

from icecream import ic
import kuzu
import pandas as pd
import watermark

%load_ext watermark

In [2]:
# Record when this notebook was run and on what interpreter/OS, then list the
# versions of the imported packages.
%watermark
%watermark --iversions

Last updated: 2024-07-08T18:49:46.091796-07:00

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.26.0

Compiler    : Clang 13.0.0 (clang-1300.0.29.30)
OS          : Darwin
Release     : 23.5.0
Machine     : arm64
Processor   : arm
CPU cores   : 14
Architecture: 64bit

watermark: 2.4.3
kuzu     : 0.4.3.dev49
pandas   : 2.2.2



Create a KùzuDB database and establish a connection.

In [3]:
# Delete any existing ./demo directory before the database is opened at that path.
!rm -rf ./demo

In [4]:
# Create the database at ./demo and open a connection to it.
# `conn` is a Connection, not a Database, so it is annotated accordingly.
db: kuzu.Database = kuzu.Database("./demo")
conn: kuzu.Connection = kuzu.Connection(db)

In [5]:
# Relative directory holding the intermediate CSV files read by this notebook.
TEMP_DIR: pathlib.Path = pathlib.Path() / "temp"

## Schema definitions

### `Entity` nodes

After a first pass through this analysis, we returned to this point and redefined the `Entity` node structure as a superset of the fields defined across all of the Entity-like nodes in ICIJ.
See <https://docs.google.com/spreadsheets/d/1eSelhXhix_DtTZuzR2vfl_UdQlEwbwql6NZxqrtROxk/edit?usp=sharing>.

In [6]:
# Schema for the unified `Entity` node table: every property is a STRING
# except the BOOLEAN `vague` flag, and `node_id` is the primary key.
entity_schema: str = """
  CREATE NODE TABLE Entity (
    node_id STRING,
    role STRING,
    name STRING,
    original_name STRING,
    former_name STRING,
    jurisdiction STRING,
    jurisdiction_description STRING,
    company_type STRING,
    address STRING,
    internal_id STRING,
    incorporation_date STRING,
    inactivation_date STRING,
    struck_off_date STRING,
    dorm_date STRING,
    status STRING,
    service_provider STRING,
    ibcRUC STRING,
    country_codes STRING,
    countries STRING,
    sourceID STRING,
    valid_until STRING,
    note STRING,
    vague BOOLEAN,
    PRIMARY KEY (node_id)
  )
"""

conn.execute(entity_schema);

Temporary workaround prior to KùzuDB release 0.5.0:

  * concatenate the four CSV files into one, keeping only the header row from the first file

In [7]:
# Start the merged file from part 1 (which keeps its header row), then append
# parts 2-4 from their second line onward (`tail -n +2`) to skip their headers.
!cp ./temp/entity.1.csv ./temp/entity.all.csv
!tail -n +2 ./temp/entity.2.csv >> ./temp/entity.all.csv
!tail -n +2 ./temp/entity.3.csv >> ./temp/entity.all.csv
!tail -n +2 ./temp/entity.4.csv >> ./temp/entity.all.csv

In [8]:
# Line counts of the individual entity CSV parts (each count includes that file's header line).
!wc -l ./temp/entity.?.csv

  814617 ./temp/entity.1.csv
  771369 ./temp/entity.2.csv
   25636 ./temp/entity.3.csv
    2990 ./temp/entity.4.csv
 1614612 total


In [9]:
# Line count of the merged file, for comparison with the per-part counts.
!wc -l ./temp/entity.all.csv

 1614609 ./temp/entity.all.csv


In [10]:
# Bulk-load the merged entity CSV into the `Entity` node table.
entity_copy: str = """
    COPY Entity FROM "./temp/entity.all.csv" (header=true, escape='"', parallel=False)
"""

conn.execute(entity_copy);

In [11]:
# Spot-check the load by fetching a single `Entity` node.
sample_entity_query: str = """
  MATCH (n:Entity)
  RETURN *
  LIMIT 1;
"""
results = conn.execute(sample_entity_query)

# Drain the result set, printing each fetched row with `ic`.
while results.has_next():
    ic(results.get_next())

ic| results.get_next(): [{'_id': {'offset': 786432, 'table': 0},
                          '_label': 'Entity',
                          'address': None,
                          'company_type': None,
                          'countries': None,
                          'country_codes': None,
                          'dorm_date': None,
                          'former_name': None,
                          'ibcRUC': None,
                          'inactivation_date': None,
                          'incorporation_date': None,
                          'internal_id': None,
                          'jurisdiction': None,
                          'jurisdiction_description': None,
                          'name': 'SPRINGER KEVIN DOUGLAS',
                          'node_id': '110100020',
                          'note': None,
                          'original_name': None,
                          'role': 'Officer',
                          'service_provider': None,
            

How many `Entity` nodes have been loaded?

In [12]:
# Count every node stored under the `Entity` label.
entity_count_query: str = """
  MATCH (n:Entity)
  RETURN COUNT(*)
"""
results = conn.execute(entity_count_query)

# The count is the first column of each returned row.
while results.has_next():
    row = results.get_next()
    ic(row[0])

ic| row[0]: 1614277


Compare with the `node_id` values in the merged CSV file.

In [13]:
# Collect the `node_id` of every `Entity` node held in the database.
entity_ids_query: str = """
  MATCH (n:Entity)
  RETURN n.node_id
"""
results = conn.execute(entity_ids_query)

nodes_kuzu: typing.Set[ str ] = set()

# Each row carries one value, the node_id; store it as a string.
while results.has_next():
    nodes_kuzu.add(str(results.get_next()[0]))

len(nodes_kuzu)

1614277

In [14]:
# Collect the `node_id` values found in the merged CSV file.
data_file: pathlib.Path = TEMP_DIR / "entity.all.csv"

# Read every column as raw text and keep empty fields as "".  The previous
# `.astype(str).fillna("")` ran in the wrong order: NaN had already become
# the string "nan" by the time `fillna` ran, and type inference could rewrite
# identifiers (e.g. drop leading zeros) before the string conversion.
df: pd.DataFrame = pd.read_csv(
    data_file,
    header = 0,
    dtype = str,
    keep_default_na = False,
)

node_ids: typing.List[ str ] = df["node_id"].tolist()
ic(len(node_ids))

# A set removes duplicates, so equal lengths mean every node_id in the file is unique.
nodes_file: typing.Set[ str ] = set(node_ids)
ic(len(nodes_file))

ic| len(node_ids): 1614277
ic| len(nodes_file): 1614277


1614277

How many `node_id` values from the CSV file are missing from the database?

In [15]:
# Count the IDs that are in the file but not in the database.  Subtracting the
# two set sizes would report 0 whenever the sets merely have equal sizes, even
# if their members differ.
len(nodes_file - nodes_kuzu)

0

In [16]:
# IDs present in the CSV file but absent from the database.
nodes_missing: typing.Set[ str ] = nodes_file.difference(nodes_kuzu)
nodes_missing

set()

### `Address` nodes

In [17]:
# Schema for the `Address` node table: all properties are STRING values and
# `node_id` is the primary key.
address_schema: str = """
  CREATE NODE TABLE Address (
    node_id STRING,
    address STRING,
    name STRING,
    countries STRING,
    country_codes STRING,
    sourceID STRING,
    valid_until STRING,
    note STRING,
    PRIMARY KEY (node_id)
  )
"""

conn.execute(address_schema);

In [18]:
# Bulk-load the address CSV into the `Address` node table.
address_copy: str = """
    COPY Address FROM "./temp/addr.csv" (header=true, escape='"', parallel=False)
"""

conn.execute(address_copy);

In [19]:
# Spot-check the load by fetching a single `Address` node.
sample_address_query: str = """
  MATCH (n:Address)
  RETURN *
  LIMIT 1;
"""
results = conn.execute(sample_address_query)

# Drain the result set, printing each fetched row with `ic`.
while results.has_next():
    ic(results.get_next())

ic| results.get_next(): [{'_id': {'offset': 262144, 'table': 1},
                          '_label': 'Address',
                          'address': 'Shangrila Mactan Resort; Lapu Lapu City; Philippines',
                          'countries': 'Philippines',
                          'country_codes': 'PHL',
                          'name': None,
                          'node_id': '14077819',
                          'note': None,
                          'sourceID': 'Panama Papers',
                          'valid_until': 'The Panama Papers  data is current through 2015'}]


### `RegisteredAddress` relations

In [20]:
# Many-to-many relationship table from `Entity` to `Address`, carrying five
# STRING properties.
registered_address_schema: str = """
  CREATE REL TABLE RegisteredAddress (FROM Entity TO Address,
    link STRING,
    status STRING,
    start_date STRING,
    end_date STRING,
    sourceID STRING,
    MANY_MANY
  )
"""

conn.execute(registered_address_schema);

In [21]:
# Bulk-load the `RegisteredAddress` relationships from CSV.
registered_address_copy: str = """
        COPY RegisteredAddress FROM "./temp/rel_regaddr.csv" (header=true, escape='"', parallel=False)
    """

try:
    conn.execute(registered_address_copy)
except Exception as ex:
    # Show the failure rather than aborting the notebook run.
    ic(ex)
    # Take the last space-separated token of the error text and trim any
    # trailing `.`, `'` or `)` characters.
    # NOTE(review): assumes the message ends with the offending node id -- confirm.
    error_text = getattr(ex, "message", repr(ex))
    node_id = str(error_text.rsplit(" ", 1)[-1].strip(".')"))
    ic(node_id, node_id in nodes_missing)

### `OfficerOf` relations

In [22]:
# Many-to-many relationship table from `Entity` to `Entity`, carrying five
# STRING properties.
officer_of_schema: str = """
  CREATE REL TABLE OfficerOf (FROM Entity TO Entity,
    link STRING,
    status STRING,
    start_date STRING,
    end_date STRING,
    sourceID STRING,
    MANY_MANY
  )
"""

conn.execute(officer_of_schema);

In [23]:
# Bulk-load the `OfficerOf` relationships from CSV.
officer_of_copy: str = """
        COPY OfficerOf FROM "./temp/rel_officer.csv" (header=true, escape='"', parallel=False)
    """

try:
    conn.execute(officer_of_copy)
except Exception as ex:
    # Show the failure rather than aborting the notebook run.
    ic(ex)
    # Take the last space-separated token of the error text and trim any
    # trailing `.`, `'` or `)` characters.
    # NOTE(review): assumes the message ends with the offending node id -- confirm.
    error_text = getattr(ex, "message", repr(ex))
    node_id = str(error_text.rsplit(" ", 1)[-1].strip(".')"))
    ic(node_id, node_id in nodes_missing)

### `IntermediaryOf` relations

In [24]:
# Many-to-many relationship table from `Entity` to `Entity`, carrying five
# STRING properties.
intermediary_of_schema: str = """
  CREATE REL TABLE IntermediaryOf (FROM Entity TO Entity,
    link STRING,
    status STRING,
    start_date STRING,
    end_date STRING,
    sourceID STRING,
    MANY_MANY
  )
"""

conn.execute(intermediary_of_schema);

In [25]:
# Bulk-load the `IntermediaryOf` relationships from CSV.
intermediary_of_copy: str = """
        COPY IntermediaryOf FROM "./temp/rel_intermed.csv" (header=true, escape='"', parallel=False)
    """

try:
    conn.execute(intermediary_of_copy)
except Exception as ex:
    # Show the failure rather than aborting the notebook run.
    ic(ex)
    # Take the last space-separated token of the error text and trim any
    # trailing `.`, `'` or `)` characters.
    # NOTE(review): assumes the message ends with the offending node id -- confirm.
    error_text = getattr(ex, "message", repr(ex))
    node_id = str(error_text.rsplit(" ", 1)[-1].strip(".')"))
    ic(node_id, node_id in nodes_missing)

### `ConnectedTo` relations

In [26]:
# Many-to-many relationship table from `Entity` to `Entity`, carrying five
# STRING properties.
connected_to_schema: str = """
  CREATE REL TABLE ConnectedTo (FROM Entity TO Entity,
    link STRING,
    status STRING,
    start_date STRING,
    end_date STRING,
    sourceID STRING,
    MANY_MANY
  )
"""

conn.execute(connected_to_schema);

In [27]:
# Bulk-load the `ConnectedTo` relationships from CSV.
# The target table was previously `OfficerOf` (a copy-paste slip), which left
# `ConnectedTo` empty and put these rows into the wrong relationship table.
try:
    conn.execute("""
        COPY ConnectedTo FROM "./temp/rel_connect.csv" (header=true, escape='"', parallel=False)
    """);
except Exception as ex:
    # Show the failure rather than aborting the notebook run.
    ic(ex)
    # Take the last space-separated token of the error text and trim any
    # trailing `.`, `'` or `)` characters.
    # NOTE(review): assumes the message ends with the offending node id -- confirm.
    node_id = str(getattr(ex, "message", repr(ex)).split(" ")[-1].strip(".')"))
    ic(node_id, node_id in nodes_missing)

### `Underlying` relations

In [28]:
# Many-to-many relationship table from `Entity` to `Entity`, carrying five
# STRING properties.
underlying_schema: str = """
  CREATE REL TABLE Underlying (FROM Entity TO Entity,
    link STRING,
    status STRING,
    start_date STRING,
    end_date STRING,
    sourceID STRING,
    MANY_MANY
  )
"""

conn.execute(underlying_schema);

In [29]:
# Bulk-load the `Underlying` relationships from CSV.
underlying_copy: str = """
        COPY Underlying FROM "./temp/rel_underly.csv" (header=true, escape='"', parallel=False)
    """

try:
    conn.execute(underlying_copy)
except Exception as ex:
    # Show the failure rather than aborting the notebook run.
    ic(ex)
    # Take the last space-separated token of the error text and trim any
    # trailing `.`, `'` or `)` characters.
    # NOTE(review): assumes the message ends with the offending node id -- confirm.
    error_text = getattr(ex, "message", repr(ex))
    node_id = str(error_text.rsplit(" ", 1)[-1].strip(".')"))
    ic(node_id, node_id in nodes_missing)

### `AliasOfficer` relations

In [30]:
# Many-to-many relationship table from `Entity` to `Entity`, carrying five
# STRING properties.
alias_officer_schema: str = """
  CREATE REL TABLE AliasOfficer (FROM Entity TO Entity,
    link STRING,
    status STRING,
    start_date STRING,
    end_date STRING,
    sourceID STRING,
    MANY_MANY
  )
"""

conn.execute(alias_officer_schema);

In [31]:
# Bulk-load the `AliasOfficer` relationships from CSV.
alias_officer_copy: str = """
        COPY AliasOfficer FROM "./temp/rel_same_officer.csv" (header=true, escape='"', parallel=False)
    """

try:
    conn.execute(alias_officer_copy)
except Exception as ex:
    # Show the failure rather than aborting the notebook run.
    ic(ex)
    # Take the last space-separated token of the error text and trim any
    # trailing `.`, `'` or `)` characters.
    # NOTE(review): assumes the message ends with the offending node id -- confirm.
    error_text = getattr(ex, "message", repr(ex))
    node_id = str(error_text.rsplit(" ", 1)[-1].strip(".')"))
    ic(node_id, node_id in nodes_missing)

### `AliasAddress` relations

In [32]:
# Many-to-many relationship table from `Address` to `Address`, carrying five
# STRING properties.
alias_address_schema: str = """
  CREATE REL TABLE AliasAddress (FROM Address TO Address,
    link STRING,
    status STRING,
    start_date STRING,
    end_date STRING,
    sourceID STRING,
    MANY_MANY
  )
"""

conn.execute(alias_address_schema);

In [33]:
# Bulk-load the `AliasAddress` relationships from CSV.
alias_address_copy: str = """
        COPY AliasAddress FROM "./temp/rel_same_address.csv" (header=true, escape='"', parallel=False)
    """

try:
    conn.execute(alias_address_copy)
except Exception as ex:
    # Show the failure rather than aborting the notebook run.
    ic(ex)
    # Take the last space-separated token of the error text and trim any
    # trailing `.`, `'` or `)` characters.
    # NOTE(review): assumes the message ends with the offending node id -- confirm.
    # NOTE(review): `nodes_missing` was built from the Entity CSV only, while
    # this table links Address nodes -- confirm the membership check is meaningful.
    error_text = getattr(ex, "message", repr(ex))
    node_id = str(error_text.rsplit(" ", 1)[-1].strip(".')"))
    ic(node_id, node_id in nodes_missing)