# ICIJ analysis: load Neo4j

## Set up

Load the Python dependencies.

In [1]:
import math
import os
import pathlib
import typing

from graphdatascience import GraphDataScience
from icecream import ic
from tqdm import tqdm
import dotenv
import neo4j
import numpy as np
import pandas as pd
import watermark

%load_ext watermark

In [2]:
%watermark
%watermark --iversions

Last updated: 2024-07-11T12:19:28.622339-07:00

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.26.0

Compiler    : Clang 13.0.0 (clang-1300.0.29.30)
OS          : Darwin
Release     : 23.5.0
Machine     : arm64
Processor   : arm
CPU cores   : 14
Architecture: 64bit

neo4j    : 5.22.0
watermark: 2.4.3
numpy    : 1.26.4
pandas   : 2.2.2



Establish a GDS connection to Neo4j.

In [3]:
# Pull connection settings from a `.env` file (when one is found) into the
# process environment, then read them back.
dotenv.load_dotenv(dotenv.find_dotenv())

# `os.environ.get()` yields `None` for an unset variable, hence `Optional`.
bolt_uri: typing.Optional[str] = os.environ.get("NEO4J_BOLT")
database: typing.Optional[str] = os.environ.get("NEO4J_DBMS")
username: typing.Optional[str] = os.environ.get("NEO4J_USER")
password: typing.Optional[str] = os.environ.get("NEO4J_PASS")

# Fail here with a readable message instead of handing `None` (or an empty
# string) to the client as the endpoint.
if not bolt_uri:
    raise RuntimeError(
        "NEO4J_BOLT is not set; define it in the environment or in a .env file"
    )

# Shared GDS client used by every later cell to run Cypher.
gds: GraphDataScience = GraphDataScience(
    bolt_uri,
    auth = ( username, password, ),
    database = database,
    aura_ds = False,
)



Define a function that splits a pandas DataFrame into chunks and loads each chunk into Neo4j as one mini-batch.

In [4]:
# Default upper bound on the number of rows sent to Neo4j per query call.
MAX_ROWS: int = 25000

def load_neo4j_df (
    df: pd.DataFrame,
    query: str,
    max_rows: int = MAX_ROWS,
    ) -> None:
    """
    Run `query` once per chunk of `df`, passing the chunk as the `$rows`
    parameter (a list of one dict per row, keyed by column name).

    Chunks are contiguous row slices of at most `max_rows` rows, taken in
    order. An empty frame results in no queries being run.

    Args:
        df: the rows to load.
        query: Cypher text that reads its input from the `$rows` parameter.
        max_rows: maximum number of rows per query call; must be positive.

    Raises:
        ValueError: if `max_rows` is less than 1.
    """
    if max_rows < 1:
        raise ValueError(f"max_rows must be positive, got {max_rows}")

    # Slice by position rather than calling `np.array_split()` on the frame:
    # that call raises ValueError when asked for zero sections (empty input)
    # and emits a deprecation warning for DataFrame arguments.
    chunk_starts: range = range(0, len(df), max_rows)

    for start in tqdm(chunk_starts, desc = "chunks"):
        df_chunk: pd.DataFrame = df.iloc[start : start + max_rows]

        gds.run_cypher(
            query,
            {"rows": df_chunk.to_dict(orient = "records")},
        )

## Schema definitions

In [5]:
# Directory (relative to the working directory) holding the CSV files that
# the load cells read.
TEMP_DIR: pathlib.Path = pathlib.Path("temp")

In [6]:
# Schema statements in the order they are executed: drop any existing
# constraint and index for each label, then create the node-key constraint
# and the `node_id` index for `Entity` and for `Location`.
schema_statements: typing.List[str] = [
    """
DROP CONSTRAINT `entity_node_key` IF EXISTS
""",
    """
DROP INDEX `entity_node_id` IF EXISTS
""",
    """
DROP CONSTRAINT `location_node_key` IF EXISTS
""",
    """
DROP INDEX `location_node_id` IF EXISTS
""",
    """
CREATE CONSTRAINT `entity_node_key` IF NOT EXISTS
  FOR (e:Entity)
  REQUIRE e.node_id IS NODE KEY
""",
    """
CREATE INDEX entity_node_id IF NOT EXISTS
  FOR (e:Entity) ON (e.node_id)
""",
    """
CREATE CONSTRAINT `location_node_key` IF NOT EXISTS
  FOR (l:Location)
  REQUIRE l.node_id IS NODE KEY
""",
    """
CREATE INDEX location_node_id IF NOT EXISTS
  FOR (l:Location) ON (l.node_id)
""",
]

# One call per statement, mirroring the original sequence of calls.
for statement in schema_statements:
    gds.run_cypher(statement)

## Load data

### `Entity` nodes

In [7]:
# CSV file of entity nodes inside TEMP_DIR.
data_file: pathlib.Path = TEMP_DIR.joinpath("entity.all.csv")

# The first line of the file supplies the column names.
df: pd.DataFrame = pd.read_csv(data_file, header = 0, low_memory = False)

# Preview the first three rows.
df.head(n = 3)

Unnamed: 0,node_id,role,name,original_name,former_name,jurisdiction,jurisdiction_description,company_type,address,internal_id,...,dorm_date,status,service_provider,ibcRUC,country_codes,countries,sourceID,valid_until,note,vague
0,10000001,Entity,"TIANSHENG INDUSTRY AND TRADING CO., LTD.","TIANSHENG INDUSTRY AND TRADING CO., LTD.",,SAM,Samoa,,ORION HOUSE SERVICES (HK) LIMITED ROOM 1401; 1...,1001256.0,...,,Defaulted,Mossack Fonseca,25221,HKG,Hong Kong,Panama Papers,The Panama Papers data is current through 2015,,False
1,10000002,Entity,"NINGBO SUNRISE ENTERPRISES UNITED CO., LTD.","NINGBO SUNRISE ENTERPRISES UNITED CO., LTD.",,SAM,Samoa,,ORION HOUSE SERVICES (HK) LIMITED ROOM 1401; 1...,1001263.0,...,,Defaulted,Mossack Fonseca,25249,HKG,Hong Kong,Panama Papers,The Panama Papers data is current through 2015,,False
2,10000003,Entity,"HOTFOCUS CO., LTD.","HOTFOCUS CO., LTD.",,SAM,Samoa,,ORION HOUSE SERVICES (HK) LIMITED ROOM 1401; 1...,1000896.0,...,,Defaulted,Mossack Fonseca,24138,HKG,Hong Kong,Panama Papers,The Panama Papers data is current through 2015,,False


In [8]:
# Column names of the frame just loaded, as a plain list.
df.columns.tolist()

['node_id',
 'role',
 'name',
 'original_name',
 'former_name',
 'jurisdiction',
 'jurisdiction_description',
 'company_type',
 'address',
 'internal_id',
 'incorporation_date',
 'inactivation_date',
 'struck_off_date',
 'dorm_date',
 'status',
 'service_provider',
 'ibcRUC',
 'country_codes',
 'countries',
 'sourceID',
 'valid_until',
 'note',
 'vague']

In [9]:
# Upsert one `Entity` node per input row: MERGE on `node_id`, then copy the
# remaining columns onto the node with `SET e += {...}`.
# The subquery commits in batches of 5000 rows, nested inside each chunk that
# `load_neo4j_df` sends as `$rows`.
# NOTE(review): unlike the relation frames, this frame is not passed through
# `fillna("")`, so missing CSV values reach Neo4j as NaN - confirm intended.
unwind_query: str = """
UNWIND $rows AS row
CALL {
  WITH row
  MERGE (e:Entity {node_id: row.node_id})
  SET e += {
    role: row.role,
    name: row.name,
    original_name: row.original_name,
    former_name: row.former_name,
    jurisdiction: row.jurisdiction,
    jurisdiction_description: row.jurisdiction_description,
    company_type: row.company_type,
    address: row.address,
    internal_id: row.internal_id,
    incorporation_date: row.incorporation_date,
    inactivation_date: row.inactivation_date,
    struck_off_date: row.struck_off_date,
    dorm_date: row.dorm_date,
    status: row.status,
    service_provider: row.service_provider,
    ibcRUC: row.ibcRUC,
    country_codes: row.country_codes,
    countries: row.countries,
    sourceID: row.sourceID,
    valid_until: row.valid_until,
    note: row.note,
    vague: row.vague
  }
} IN TRANSACTIONS OF 5000 ROWS
    """

# Send the entity frame to Neo4j chunk by chunk.
load_neo4j_df(df, unwind_query)

  return bound(*args, **kwds)
chunks: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 65/65 [01:51<00:00,  1.72s/it]


In [10]:
# Read back three properties from every `Entity` node to confirm the load.
# NOTE(review): there is no LIMIT, so the whole node set is fetched into
# memory just to be displayed.
entity_check_query: str = """
MATCH (e:Entity)
RETURN e.node_id, e.role, e.name
  """

df_test: pd.DataFrame = gds.run_cypher(entity_check_query)

df_test

Unnamed: 0,e.node_id,e.role,e.name
0,1,Officer,Peter Sabourin
1,2,Officer,Irene Knowles
2,3,Officer,Mr. Hua guoqiang
3,4,Officer,Chen Lifen
4,5,Officer,Mr. Wan Jianping
...,...,...,...
1614272,240558066,Other,Donard Trading Limited
1614273,240558067,Other,Ballford Holdings Limited
1614274,240558068,Other,Lansdale Corporate Limited
1614275,240558069,Other,Kapecod Enterprises Ltd


### `Location` nodes

In [11]:
# CSV file of location nodes inside TEMP_DIR.
data_file: pathlib.Path = TEMP_DIR.joinpath("location.csv")

# The first line of the file supplies the column names.
df: pd.DataFrame = pd.read_csv(data_file, header = 0, low_memory = False)

# Preview the first three rows.
df.head(n = 3)

Unnamed: 0,node_id,address,name,countries,country_codes,sourceID,valid_until,note
0,24000001,"ANNEX FREDERICK & SHIRLEY STS, P.O. BOX N-4805...",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through earl...,
1,24000002,"SUITE E-2,UNION COURT BUILDING, P.O. BOX N-818...",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through earl...,
2,24000003,"LYFORD CAY HOUSE, LYFORD CAY, P.O. BOX N-7785,...",,Bahamas,BHS,Bahamas Leaks,The Bahamas Leaks data is current through earl...,


In [12]:
# Column names of the frame just loaded, as a plain list.
df.columns.tolist()

['node_id',
 'address',
 'name',
 'countries',
 'country_codes',
 'sourceID',
 'valid_until',
 'note']

In [13]:
# Upsert one `Location` node per input row: MERGE on `node_id`, then copy the
# remaining columns onto the node with `SET l += {...}`.
# The subquery commits in batches of 5000 rows, nested inside each chunk that
# `load_neo4j_df` sends as `$rows`.
# NOTE(review): this frame is not passed through `fillna("")`, so missing CSV
# values reach Neo4j as NaN - confirm intended.
unwind_query: str = """
UNWIND $rows AS row
CALL {
  WITH row
  MERGE (l:Location {node_id: row.node_id})
  SET l += {
    address: row.address,
    name: row.name,
    countries: row.countries,
    country_codes: row.country_codes,
    sourceID: row.sourceID,
    valid_until: row.valid_until,
    note: row.note
  }
} IN TRANSACTIONS OF 5000 ROWS
    """

# Send the location frame to Neo4j chunk by chunk.
load_neo4j_df(df, unwind_query)

  return bound(*args, **kwds)
chunks: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:11<00:00,  1.51it/s]


In [14]:
# Read back three properties from every `Location` node to confirm the load.
# NOTE(review): there is no LIMIT, so the whole node set is fetched into
# memory just to be displayed.
location_check_query: str = """
MATCH (l:Location)
RETURN l.node_id, l.address, l.countries
  """

df_test: pd.DataFrame = gds.run_cypher(location_check_query)

df_test

Unnamed: 0,l.node_id,l.address,l.countries
0,67268,"#2236 Albert Hoy Street, Belize City, Belize.",Belize
1,67276,"1, rue Ferdid Dormal B-4280 Hannut Belgique",Belgium
2,67277,"11 Coomber Road, The Peak, Hong Kong",Hong Kong
3,67278,"11A, Branksome 3 Tregunter Path Mid-Levels Hon...",Hong Kong
4,67282,14 Lysander Road West Mailing Kent ME19 4TT,United Kingdom
...,...,...,...
402241,240492573,"115 GRIVA DIGENI AVENUE, 5TH FLOOR, LIMA, 3101...",Cyprus
402242,240492574,"115 GRIVA DIGENI AVENUE, 5TH FLOOR, LIMASSOL, ...",
402243,240492575,"115 GRIVA DIGENI AVENUE, 5TH FLOOR, 3101, CVPRUS",
402244,240492576,"PINELOPIS 9, LAMACA, 6057, TORTOLA, CYPRUS",Cyprus


### `RegisteredAddress` relations

In [15]:
# CSV file of registered-address relations inside TEMP_DIR.
data_file: pathlib.Path = TEMP_DIR.joinpath("rel_regaddr.csv")

# The first line of the file supplies the column names.
df: pd.DataFrame = pd.read_csv(data_file, header = 0, low_memory = False)

# Replace missing values with empty strings.
df = df.fillna(value = "")

# Preview the first three rows.
df.head(n = 3)

Unnamed: 0,node_id_start,node_id_end,link,status,start_date,end_date,sourceID
0,10000035,14095990,registered address,,,,Panama Papers
1,10000044,14091035,registered address,,,,Panama Papers
2,10000055,14095990,registered address,,,,Panama Papers


In [16]:
# Column names of the frame just loaded, as a plain list.
df.columns.tolist()

['node_id_start',
 'node_id_end',
 'link',
 'status',
 'start_date',
 'end_date',
 'sourceID']

In [17]:
# For each row, look up the `Entity` (start) and `Location` (end) nodes by
# `node_id` and MERGE a `RegisteredAddress` relationship between them.
# All five row values are part of the MERGE pattern, so an existing
# relationship is reused only when every property matches.
# A row whose endpoints are not both matched creates nothing.
unwind_query: str = """
UNWIND $rows AS row
CALL {
  WITH row
  MATCH
    (ent:Entity {node_id: row.node_id_start}),
    (loc:Location {node_id: row.node_id_end})
  MERGE (ent)-[:RegisteredAddress {link: row.link, status: row.status, start_date: row.start_date, end_date: row.end_date, sourceID: row.sourceID}]->(loc)
} IN TRANSACTIONS OF 5000 ROWS
    """

# Send the relation frame to Neo4j chunk by chunk.
load_neo4j_df(df, unwind_query)

  return bound(*args, **kwds)
chunks: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 33/33 [00:20<00:00,  1.61it/s]


### `OfficerOf` relations

In [18]:
# CSV file of officer relations inside TEMP_DIR.
data_file: pathlib.Path = TEMP_DIR.joinpath("rel_officer.csv")

# The first line of the file supplies the column names.
df: pd.DataFrame = pd.read_csv(data_file, header = 0, low_memory = False)

# Replace missing values with empty strings.
df = df.fillna(value = "")

# Preview the first three rows.
df.head(n = 3)

Unnamed: 0,node_id_start,node_id_end,link,status,start_date,end_date,sourceID
0,12000001,10073324,shareholder of,,19-NOV-1999,04-JUL-2000,Panama Papers
1,12000002,10148386,shareholder of,,30-MAR-2012,06-JUL-2012,Panama Papers
2,12000003,10024966,shareholder of,,14-JAN-2010,,Panama Papers


In [19]:
# Column names of the frame just loaded, as a plain list.
df.columns.tolist()

['node_id_start',
 'node_id_end',
 'link',
 'status',
 'start_date',
 'end_date',
 'sourceID']

In [20]:
# For each row, look up both `Entity` endpoints by `node_id` and MERGE an
# `OfficerOf` relationship from start to end.
# All five row values are part of the MERGE pattern, so an existing
# relationship is reused only when every property matches.
# A row whose endpoints are not both matched creates nothing.
unwind_query: str = """
UNWIND $rows AS row
CALL {
  WITH row
  MATCH
    (src:Entity {node_id: row.node_id_start}),
    (dst:Entity {node_id: row.node_id_end})
  MERGE (src)-[:OfficerOf {link: row.link, status: row.status, start_date: row.start_date, end_date: row.end_date, sourceID: row.sourceID}]->(dst)
} IN TRANSACTIONS OF 5000 ROWS
    """

# Send the relation frame to Neo4j chunk by chunk.
load_neo4j_df(df, unwind_query)

  return bound(*args, **kwds)
chunks: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 66/66 [00:42<00:00,  1.55it/s]


### `IntermediaryOf` relations

In [21]:
# CSV file of intermediary relations inside TEMP_DIR.
data_file: pathlib.Path = TEMP_DIR.joinpath("rel_intermed.csv")

# The first line of the file supplies the column names.
df: pd.DataFrame = pd.read_csv(data_file, header = 0, low_memory = False)

# Replace missing values with empty strings.
df = df.fillna(value = "")

# Preview the first three rows.
df.head(n = 3)

Unnamed: 0,node_id_start,node_id_end,link,status,start_date,end_date,sourceID
0,11000001,10208879,intermediary of,,,,Panama Papers
1,11000001,10198662,intermediary of,,,,Panama Papers
2,11000001,10159927,intermediary of,,,,Panama Papers


In [22]:
# Column names of the frame just loaded, as a plain list.
df.columns.tolist()

['node_id_start',
 'node_id_end',
 'link',
 'status',
 'start_date',
 'end_date',
 'sourceID']

In [23]:
# For each row, look up both `Entity` endpoints by `node_id` and MERGE an
# `IntermediaryOf` relationship from start to end.
# All five row values are part of the MERGE pattern, so an existing
# relationship is reused only when every property matches.
# A row whose endpoints are not both matched creates nothing.
unwind_query: str = """
UNWIND $rows AS row
CALL {
  WITH row
  MATCH
    (src:Entity {node_id: row.node_id_start}),
    (dst:Entity {node_id: row.node_id_end})
  MERGE (src)-[:IntermediaryOf {link: row.link, status: row.status, start_date: row.start_date, end_date: row.end_date, sourceID: row.sourceID}]->(dst)
} IN TRANSACTIONS OF 5000 ROWS
    """

# Send the relation frame to Neo4j chunk by chunk.
load_neo4j_df(df, unwind_query)

  return bound(*args, **kwds)
chunks: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:15<00:00,  1.58it/s]


### `ConnectedTo` relations

In [24]:
# CSV file of "connected to" relations inside TEMP_DIR.
data_file: pathlib.Path = TEMP_DIR.joinpath("rel_connect.csv")

# The first line of the file supplies the column names.
df: pd.DataFrame = pd.read_csv(data_file, header = 0, low_memory = False)

# Replace missing values with empty strings.
df = df.fillna(value = "")

# Preview the first three rows.
df.head(n = 3)

Unnamed: 0,node_id_start,node_id_end,link,status,start_date,end_date,sourceID
0,85004929,85008101,connected to,,,,Paradise Papers - Aruba corporate registry
1,85004929,85021444,connected to,,,,Paradise Papers - Aruba corporate registry
2,85008443,85011025,connected to,,,,Paradise Papers - Aruba corporate registry


In [25]:
# Column names of the frame just loaded, as a plain list.
df.columns.tolist()

['node_id_start',
 'node_id_end',
 'link',
 'status',
 'start_date',
 'end_date',
 'sourceID']

In [26]:
# For each row, look up both `Entity` endpoints by `node_id` and MERGE a
# `ConnectedTo` relationship from start to end.
# All five row values are part of the MERGE pattern, so an existing
# relationship is reused only when every property matches.
# A row whose endpoints are not both matched creates nothing.
unwind_query: str = """
UNWIND $rows AS row
CALL {
  WITH row
  MATCH
    (src:Entity {node_id: row.node_id_start}),
    (dst:Entity {node_id: row.node_id_end})
  MERGE (src)-[:ConnectedTo {link: row.link, status: row.status, start_date: row.start_date, end_date: row.end_date, sourceID: row.sourceID}]->(dst)
} IN TRANSACTIONS OF 5000 ROWS
    """

# Send the relation frame to Neo4j chunk by chunk.
load_neo4j_df(df, unwind_query)

  return bound(*args, **kwds)
chunks: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.76it/s]


### `Underlying` relations

In [27]:
# CSV file of "underlying" relations inside TEMP_DIR.
data_file: pathlib.Path = TEMP_DIR.joinpath("rel_underly.csv")

# The first line of the file supplies the column names.
df: pd.DataFrame = pd.read_csv(data_file, header = 0, low_memory = False)

# Replace missing values with empty strings.
df = df.fillna(value = "")

# Preview the first three rows.
df.head(n = 3)

Unnamed: 0,node_id_start,node_id_end,link,status,start_date,end_date,sourceID
0,51240,110010,Nominee Shareholder of,,,,Offshore Leaks
1,51364,122604,Nominee Shareholder of,,,,Offshore Leaks
2,51425,85812,Nominee Shareholder of,,,,Offshore Leaks


In [28]:
# Column names of the frame just loaded, as a plain list.
df.columns.tolist()

['node_id_start',
 'node_id_end',
 'link',
 'status',
 'start_date',
 'end_date',
 'sourceID']

In [29]:
# For each row, look up both `Entity` endpoints by `node_id` and MERGE an
# `Underlying` relationship from start to end.
# All five row values are part of the MERGE pattern, so an existing
# relationship is reused only when every property matches.
# A row whose endpoints are not both matched creates nothing.
unwind_query: str = """
UNWIND $rows AS row
CALL {
  WITH row
  MATCH
    (src:Entity {node_id: row.node_id_start}),
    (dst:Entity {node_id: row.node_id_end})
  MERGE (src)-[:Underlying {link: row.link, status: row.status, start_date: row.start_date, end_date: row.end_date, sourceID: row.sourceID}]->(dst)
} IN TRANSACTIONS OF 5000 ROWS
    """

# Send the relation frame to Neo4j chunk by chunk.
load_neo4j_df(df, unwind_query)

  return bound(*args, **kwds)
chunks: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.63it/s]


### `AliasOfficer` relations

In [30]:
# CSV file of "same id as" officer relations inside TEMP_DIR.
data_file: pathlib.Path = TEMP_DIR.joinpath("rel_same_officer.csv")

# The first line of the file supplies the column names.
df: pd.DataFrame = pd.read_csv(data_file, header = 0, low_memory = False)

# Replace missing values with empty strings.
df = df.fillna(value = "")

# Preview the first three rows.
df.head(n = 3)

Unnamed: 0,node_id_start,node_id_end,link,status,start_date,end_date,sourceID
0,59178341,59190179,same id as,,,,Paradise Papers - Malta corporate registry
1,59181407,59108285,same id as,,,,Paradise Papers - Malta corporate registry
2,56031433,56031434,same id as,,,,Paradise Papers - Malta corporate registry


In [31]:
# Column names of the frame just loaded, as a plain list.
df.columns.tolist()

['node_id_start',
 'node_id_end',
 'link',
 'status',
 'start_date',
 'end_date',
 'sourceID']

In [32]:
# For each row, look up both `Entity` endpoints by `node_id` and MERGE an
# `AliasOfficer` relationship from start to end.
# All five row values are part of the MERGE pattern, so an existing
# relationship is reused only when every property matches.
# A row whose endpoints are not both matched creates nothing.
unwind_query: str = """
UNWIND $rows AS row
CALL {
  WITH row
  MATCH
    (src:Entity {node_id: row.node_id_start}),
    (dst:Entity {node_id: row.node_id_end})
  MERGE (src)-[:AliasOfficer {link: row.link, status: row.status, start_date: row.start_date, end_date: row.end_date, sourceID: row.sourceID}]->(dst)
} IN TRANSACTIONS OF 5000 ROWS
    """

# Send the relation frame to Neo4j chunk by chunk.
load_neo4j_df(df, unwind_query)

  return bound(*args, **kwds)
chunks: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.23it/s]


In [33]:
### `AliasLocation` relations

In [34]:
# CSV file of "same address as" relations inside TEMP_DIR.
data_file: pathlib.Path = TEMP_DIR.joinpath("rel_same_loc.csv")

# The first line of the file supplies the column names.
df: pd.DataFrame = pd.read_csv(data_file, header = 0, low_memory = False)

# Replace missing values with empty strings.
df = df.fillna(value = "")

# Preview the first three rows.
df.head(n = 3)

Unnamed: 0,node_id_start,node_id_end,link,status,start_date,end_date,sourceID
0,24000030,14035591,same address as,,,,Bahamas Leaks
1,24000086,14077570,same address as,,,,Bahamas Leaks
2,24000090,14077931,same address as,,,,Bahamas Leaks


In [35]:
# Column names of the frame just loaded, as a plain list.
df.columns.tolist()

['node_id_start',
 'node_id_end',
 'link',
 'status',
 'start_date',
 'end_date',
 'sourceID']

In [36]:
# For each row, look up both endpoints by `node_id` and MERGE an
# `AliasLocation` relationship from start to end.
# Both endpoints carry the `Location` label: these rows are "same address as"
# links between address nodes, which this notebook loads as `Location`, not
# `Entity`. Matching on `Entity` would not find them, so no relationship
# would be created.
# All five row values are part of the MERGE pattern, so an existing
# relationship is reused only when every property matches.
# A row whose endpoints are not both matched creates nothing.
unwind_query: str = """
UNWIND $rows AS row
CALL {
  WITH row
  MATCH
    (src:Location {node_id: row.node_id_start}),
    (dst:Location {node_id: row.node_id_end})
  MERGE (src)-[:AliasLocation {link: row.link, status: row.status, start_date: row.start_date, end_date: row.end_date, sourceID: row.sourceID}]->(dst)
} IN TRANSACTIONS OF 5000 ROWS
    """

# Send the relation frame to Neo4j chunk by chunk.
load_neo4j_df(df, unwind_query)

  return bound(*args, **kwds)
chunks: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 35.75it/s]
