# ICIJ analysis: load KùzuDB

## Set up

Load the Python dependencies.

In [1]:
import pathlib
import typing

from icecream import ic
import kuzu
import pandas as pd
import watermark

%load_ext watermark

In [2]:
%watermark
%watermark --iversions

Last updated: 2024-07-08T09:43:42.522841-07:00

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.26.0

Compiler    : Clang 13.0.0 (clang-1300.0.29.30)
OS          : Darwin
Release     : 23.5.0
Machine     : arm64
Processor   : arm
CPU cores   : 14
Architecture: 64bit

pandas   : 2.2.2
kuzu     : 0.4.2
watermark: 2.4.3



Create a KùzuDB database and establish a connection.

In [3]:
!rm -rf ./demo

In [4]:
db: kuzu.database.Database = kuzu.Database("./demo")
conn: kuzu.database.Database = kuzu.Connection(db)

In [5]:
TEMP_DIR: pathlib.Path = pathlib.Path("temp")

## Schema definitions

### `Entity` nodes

After iterating the first time through this analysis, we return to this point and redefine an `Entity` node structure which is a superset of the fields defined among all of the Entity-ish nodes in ICIJ.
See <https://docs.google.com/spreadsheets/d/1eSelhXhix_DtTZuzR2vfl_UdQlEwbwql6NZxqrtROxk/edit?usp=sharing>

In [6]:
conn.execute("""
  CREATE NODE TABLE Entity (
    node_id STRING,
    role STRING,
    name STRING,
    original_name STRING,
    former_name STRING,
    jurisdiction STRING,
    jurisdiction_description STRING,
    company_type STRING,
    address STRING,
    internal_id STRING,
    incorporation_date STRING,
    inactivation_date STRING,
    struck_off_date STRING,
    dorm_date STRING,
    status STRING,
    service_provider STRING,
    ibcRUC STRING,
    country_codes STRING,
    countries STRING,
    sourceID STRING,
    valid_until STRING,
    note STRING,
    vague BOOLEAN,
    PRIMARY KEY (node_id)
  )
""");

In [7]:
conn.execute("""
    COPY Entity FROM "./temp/entity.1.csv" (header=true, escape='"', parallel=False)
""");

In [8]:
conn.execute("""
    COPY Entity FROM "./temp/entity.2.csv" (header=true, escape='"', parallel=False)
""");

In [9]:
conn.execute("""
    COPY Entity FROM "./temp/entity.3.csv" (header=true, escape='"', parallel=False)
""");

In [10]:
conn.execute("""
    COPY Entity FROM "./temp/entity.4.csv" (header=true, escape='"', parallel=False)
""");

In [11]:
results = conn.execute("""
  MATCH (n:Entity)
  RETURN *
  LIMIT 1;
""")

while results.has_next():
    ic(results.get_next())

ic| results.get_next(): [{'_id': {'offset': 16384, 'table': 0},
                          '_label': 'Entity',
                          'address': 'Portcullis TrustNet Chambers P.O. Box 3444 Road Town, Tortola '
                                     'BRITISH VIRGIN ISLANDS',
                          'company_type': 'Business Company Limited by Shares',
                          'countries': 'British Virgin Islands',
                          'country_codes': 'VGB',
                          'dorm_date': None,
                          'former_name': None,
                          'ibcRUC': '1524901',
                          'inactivation_date': None,
                          'incorporation_date': '18-MAR-2009',
                          'internal_id': None,
                          'jurisdiction': 'BVI',
                          'jurisdiction_description': 'British Virgin Islands',
                          'name': 'N.P. Rugs Industries Limited',
                          'node_i

How many `Entity` nodes have been loaded?

In [12]:
results = conn.execute("""
  MATCH (n:Entity)
  RETURN COUNT(*)
""")

while results.has_next():
    row = results.get_next()
    ic(row[0])

ic| row[0]: 1613491


Compare with the `node_id` values in the CSV files

In [13]:
!wc -l temp/entity.*.csv

  814617 temp/entity.1.csv
  771369 temp/entity.2.csv
   25636 temp/entity.3.csv
    2990 temp/entity.4.csv
 1614612 total


`(1614612-4)-1613491 = 1117` missing records?

In [14]:
nodes_kuzu: typing.Set[ str ] = set()

results = conn.execute("""
  MATCH (n:Entity)
  RETURN n.node_id
""")

while results.has_next():
    row = results.get_next()
    nodes_kuzu.add(str(row[0]))

len(nodes_kuzu)

1613491

In [15]:
nodes_file: typing.Set[ str ] = set()

files: typing.List[ str ] = [
    "entity.1.csv",
    "entity.2.csv",
    "entity.3.csv",
    "entity.4.csv",
]

for file in files:
    data_file: pathlib.Path = TEMP_DIR / file

    df: pd.DataFrame = pd.read_csv(
        data_file,
        header = 0,
        low_memory = False,
    ).astype(str).fillna("")

    nodes_file = nodes_file.union(set(df.node_id.values))

len(nodes_file)

1614277

How many missing `node_id` values?

In [16]:
len(nodes_file) - len(nodes_kuzu)

786

In [17]:
nodes_missing = nodes_file - nodes_kuzu
nodes_missing

{'10002052',
 '10004114',
 '10006180',
 '10008247',
 '10010339',
 '10012409',
 '10014467',
 '10016554',
 '10018596',
 '10020664',
 '10022729',
 '10024787',
 '10026845',
 '10028897',
 '100301499',
 '100304018',
 '100307887',
 '10030949',
 '100313694',
 '100318478',
 '100322246',
 '100325611',
 '100329133',
 '10033011',
 '100332718',
 '100336773',
 '100340520',
 '10035063',
 '10037127',
 '10039177',
 '10041241',
 '10043289',
 '10045417',
 '10047431',
 '10049507',
 '10051580',
 '10053657',
 '10055705',
 '10061494',
 '100629143',
 '10063402',
 '10065286',
 '10067218',
 '10071120',
 '10071989',
 '10073099',
 '10075025',
 '10076993',
 '10078965',
 '10080919',
 '10080931',
 '10082935',
 '10084945',
 '10086903',
 '10088865',
 '10090823',
 '10092829',
 '10094801',
 '10096775',
 '10098819',
 '10100877',
 '10102904',
 '10104906',
 '10106948',
 '10108967',
 '10111009',
 '10113030',
 '10115046',
 '10117094',
 '10119171',
 '10121249',
 '10123372',
 '10125481',
 '10127637',
 '10129513',
 '10129752',


### `Address` nodes

In [18]:
conn.execute("""
  CREATE NODE TABLE Address (
    node_id STRING,
    address STRING,
    name STRING,
    countries STRING,
    country_codes STRING,
    sourceID STRING,
    valid_until STRING,
    note STRING,
    PRIMARY KEY (node_id)
  )
""");

In [19]:
conn.execute("""
    COPY Address FROM "./temp/addr.csv" (header=true, escape='"', parallel=False)
""");

In [20]:
results = conn.execute("""
  MATCH (n:Address)
  RETURN *
  LIMIT 1;
""")

while results.has_next():
    ic(results.get_next())

ic| results.get_next(): [{'_id': {'offset': 8192, 'table': 1},
                          '_label': 'Address',
                          'address': '5A,Evagora Palikaridi, Engomi,P.C.2430, Nicosia ,Cyprus.',
                          'countries': 'Cyprus',
                          'country_codes': 'CYP',
                          'name': None,
                          'node_id': '235652',
                          'note': None,
                          'sourceID': 'Offshore Leaks',
                          'valid_until': 'The Offshore Leaks data is current through 2010'}]


### `OfficerOf` relations

In [21]:
conn.execute("""
  CREATE REL TABLE OfficerOf (FROM Entity TO Entity,
    link STRING,
    status STRING,
    start_date STRING,
    end_date STRING,
    sourceID STRING,
    MANY_MANY
  )
""");

In [22]:
try:
    conn.execute("""
        COPY OfficerOf FROM "./temp/rel_officer.csv" (header=true, escape='"', parallel=False)
    """);
except Exception as ex:
    ic(ex)
    node_id = str(getattr(ex, "message", repr(ex)).split(" ")[-1].strip(".')"))
    ic(node_id, node_id in nodes_missing)

ic| ex: RuntimeError('Runtime exception: Unable to find primary key value 12004238.')
ic| node_id: '12004238', node_id in nodes_missing: False


### `RegisteredAddress` relations

In [23]:
conn.execute("""
  CREATE REL TABLE RegisteredAddress (FROM Entity TO Address,
    link STRING,
    status STRING,
    start_date STRING,
    end_date STRING,
    sourceID STRING,
    MANY_MANY
  )
""");

In [24]:
try:
    conn.execute("""
        COPY OfficerOf FROM "./temp/rel_regaddr.csv" (header=true, escape='"', parallel=False)
    """);
except Exception as ex:
    ic(ex)
    node_id = str(getattr(ex, "message", repr(ex)).split(" ")[-1].strip(".')"))
    ic(node_id, node_id in nodes_missing)

ic| ex: RuntimeError('Runtime exception: Unable to find primary key value 10012730.')
ic| node_id: '10012730', node_id in nodes_missing: False
