# Experiments on ICIJ Paradise Papers (Neo4j sandbox)

## Setting up

In [1]:
%%capture
# Install the client libraries this notebook uses; %%capture silences pip's output.
%pip install graphdatascience pandas ipython

In [1]:
import os
from getpass import getpass

import pandas as pd
from neo4j import GraphDatabase
from graphdatascience import GraphDataScience

In [2]:
# Neo4j Sandbox connection details.
# The values are read from the environment so that credentials are not committed
# with the notebook; the password is prompted for when NEO4J_PASSWORD is unset.
DB_URL = os.environ.get('NEO4J_URI', 'bolt://18.205.6.212:7687')
DB_ULR = DB_URL  # backward-compatible alias for the original (misspelled) name
DB_USER = os.environ.get('NEO4J_USER', 'neo4j')
DB_PASS = os.environ.get('NEO4J_PASSWORD') or getpass('Neo4j password: ')

# The constructor accepts a connection URI directly; `from_neo4j_driver` is meant
# for an already-created neo4j Driver object, not for a URL string.
gds = GraphDataScience(DB_URL, auth=(DB_USER, DB_PASS))

In [14]:
# Some of the transactions below run for a long time, so raise the
# server-side transaction timeout before starting.
timeout_statement = 'call dbms.setConfigValue("dbms.transaction.timeout","20m")'
gds.run_cypher(timeout_statement)

## A subset of the "Power players in the data"
https://offshoreleaks.icij.org/power-players

In [3]:
# Names of the "power players" to look up in the Officer full-text index.
# Each entry is passed as search text to the lookup query further down.
peps = [
    "JEAN CHRÉTIEN",
    "MUKHTAR ABLYAZOV",
    "BEIBUT ATAMKULOV",
    "HENRIQUE DE CAMPOS MEIRELLES",
    "WESLEY K. CLARK",
    "SHAUKAT AZIZ",
    "ALEJANDRO GERTZ MANERO",
    "ANTANAS GUOGA",
    "HAKAINDE SAMMY HICHILEMA",
    "RAVINDRA KISHORE (RK) SINHA",
    "VALERIY VOSHCHEVSKY",
    "SALLY KOSGEI",
    "REX TILLERSON",
    "PAUL MARTIN",
    "ELLEN JOHNSON SIRLEAF",
    "BRIAN MULRONEY",
    "MUDHAR GHASSAN SHAWKAT",
    "SAUAT MUKHAMETBAYEVICH MYNBAYEV",
    "JAMES MEYER SASSOON",
    "PRABOWO SUBIANTO",
    "SAM KAHAMBA KUTESA",
    "WILBUR LOUIS ROSS, JR.",
    "PRINCE KHALED BIN SULTAN BIN ABDULAZIZ",
    "CARLOS QUINTANILLA SCHMIDT",
    "ALFRED GUSENBAUER",
    "YUKIO HATOYAMA",
    "ANTON PRIGODSKY",
    "PENNY PRITZKER",
    "JOHN MAHAMA",
    "BINALI YILDIRIM",
    "SUHARTO",
    "BASHAR AL-ASSAD",
    "BLAIRO BORGES MAGGI",
    "JAYANT SINHA",
    "JUAN MANUEL SANTOS",
]

In [4]:
# Create the full-text index on Officer.name that the power-player lookup queries.
# `if not exists` keeps this cell re-runnable: the procedure-based creation used
# before fails on a second run because the index is already there.
gds.run_cypher('''
    create fulltext index ftx_offshore if not exists
    for (o:Officer) on each [o.name]
''')


In [5]:
# Confirm the full-text index is online and fully populated before querying it.
index_state_query = '''call db.indexDetails('ftx_offshore') yield state, populationPercent'''
gds.run_cypher(index_state_query).head()

Unnamed: 0,state,populationPercent
0,ONLINE,100.0


In [6]:
# Flag politically exposed persons (PEP).
# For each name, take the single best full-text hit on Officer.name and add the
# :PEP label to that node.
# NOTE(review): no minimum score is applied, so a weak best hit is still labelled;
# inspect the returned scores before relying on the label.

# Backslash-escape Lucene query-syntax characters so that a name such as
# 'RAVINDRA KISHORE (RK) SINHA' is searched as plain text instead of being parsed
# as query operators (an unbalanced bracket or quote would make the query fail).
lucene_escapes = str.maketrans({ch: '\\' + ch for ch in '\\+-!():^[]"{}~*?|&/'})
pep_lookups = [{'pep': name, 'query': name.translate(lucene_escapes)} for name in peps]

test = gds.run_cypher('''
    unwind $lookups as lookup
    call db.index.fulltext.queryNodes('ftx_offshore', 'name: ' + lookup.query, {limit: 1}) yield node, score
    with node, score, lookup.pep as pep
    set node:PEP
    return pep, node.name as name, node.type as type, labels(node) as labels, score, id(node) as id
''', params={'lookups': pep_lookups})
test.head(40)

Unnamed: 0,pep,name,type,labels,score,id
0,JEAN CHRÉTIEN,Joseph Jacques Jean Chrétien,,"[Officer, PEP]",6.888455,87589
1,MUKHTAR ABLYAZOV,Ablyazov - Mukhtar,,"[Officer, PEP]",11.695206,88025
2,BEIBUT ATAMKULOV,BEIBUT ATAMKULOV,,"[Officer, PEP]",11.4198,87507
3,HENRIQUE DE CAMPOS MEIRELLES,Meirelles - Henrique de Campos,,"[Officer, PEP]",13.590715,158044
4,WESLEY K. CLARK,Wesley Clark,,"[Officer, PEP]",8.055619,87592
5,SHAUKAT AZIZ,Shaukat Aziz,,"[Officer, PEP]",10.095312,87594
6,ALEJANDRO GERTZ MANERO,Gertz - Alejandro M,,"[Officer, PEP]",8.707655,102886
7,ANTANAS GUOGA,ANTANAS GUOGA,,"[Officer, PEP]",11.144423,87519
8,HAKAINDE SAMMY HICHILEMA,Hichilema - Hakainde Sammy,,"[Officer, PEP]",14.305145,105417
9,RAVINDRA KISHORE (RK) SINHA,RAVINDRA KISHORE SINHA,,"[Officer, PEP]",13.723818,87537


## Who is who

In [7]:
# Preview Officers that look like companies rather than people: their name contains
# one of the corporate markers below. `contains` is case-sensitive, which is why
# both 'Limited' and 'LIMITED' (and 'Ltd.' / 'LTD.') are listed.
company_markers = [
    'Ltd.', 'LTD.',
    'Limited', 'LIMITED',
    'Trust',
    'Investments',
    'Investors',
    'L.L.C.',
    'CORPORATE',
    'MANAGERS',
    'REGISTER',
]

# The marker list is passed as a parameter, which replaces the long OR chain
# (where 'Investments' was listed twice) with a single any() predicate.
companies = gds.run_cypher('''
    match (p:Officer)-[:OFFICER_OF|REGISTERED_ADDRESS]->(o)
    where (o:Address or o:Entity)
      and any(marker in $markers where p.name contains marker)
    return distinct p.name limit 50
''', params={'markers': company_markers})
companies.head(50)

Unnamed: 0,p.name
0,1022385 Alberta Ltd. as Trustee of the Drinkwa...
1,1022387 Alberta Ltd. as Trustee of the Whitewo...
2,1035533 Ontario Ltd.
3,1105328 Alberta Ltd.
4,"1245180 Alberta Ltd., as trustee of the Sapunj..."
5,2035910 Ontario Ltd.
6,385069 Alberta Ltd.
7,3VS1 Asia Growth Fund Ltd.
8,4 M Associates Ltd.
9,561309 Alberta Ltd.


In [8]:
# Label Officers as Companies
gds.run_cypher(''' 
    match path=(p:Officer)-[:OFFICER_OF|REGISTERED_ADDRESS]->(o)
    where (o:Address or o:Entity) 
    and ( p.name contains "Ltd." OR
          p.name contains "Limited" OR
          p.name contains "Trust" OR
          p.name contains "Investments" OR
          p.name contains "Investments" OR
          p.name contains "L.L.C." OR
          p.name contains "LIMITED" OR
          p.name contains "Investors" OR
          p.name contains "CORPORATE" OR
          p.name contains "LTD." OR
          p.name contains "MANAGERS" OR 
          p.name contains "REGISTER" 
        )
    with distinct p
    set p:Company remove p:Person
''')

In [9]:
# Every Officer that was not labelled Company is labelled Person.
label_persons_query = ''' 
    match (p:Officer)
    where not p:Company 
    set p:Person
'''
gds.run_cypher(label_persons_query)

In [10]:
# Sanity check: count Officers per label combination.
label_counts_query = ''' 
    match (p:Officer)
    return labels(p), count(*)
'''
gds.run_cypher(label_counts_query).head(10)


Unnamed: 0,labels(p),count(*)
0,"[Officer, Person]",66006
1,"[Officer, Company]",10972
2,"[Officer, PEP, Person]",34


## Playground

In [11]:
# Node similarity expects a directed graph, so list the relationship types that
# leave Person nodes together with how many of each exist.
outgoing_rels_query = ''' 
    match p=(:Person)-[r]->()
    return type(r), count(*)
'''
gds.run_cypher(outgoing_rels_query).head(10)

# Note: Explore the "CONNECTED_TO" relationships!

Unnamed: 0,type(r),count(*)
0,REGISTERED_ADDRESS,75894
1,OFFICER_OF,190149
2,CONNECTED_TO,693
3,SAME_ID_AS,2
4,SAME_NAME_AS,64


In [12]:
# Cleanup: remove the relationships this notebook creates further down
# (STAKEHOLDER / LIVES_AT in the prep step, AFFILIATED from node similarity)
# so that the cells below can be re-run from a clean state.
derived_rel_types = ('AFFILIATED', 'STAKEHOLDER', 'LIVES_AT')

# Relationship types cannot be passed as Cypher parameters, so the type is
# formatted into one shared statement instead of copy-pasting it per type.
# Deletion is batched (5000 rows per transaction) to keep each transaction small.
for rel_type in derived_rel_types:
    gds.run_cypher(f'''
        match (:Person)-[a:{rel_type}]->()
        call {{
            with a
            delete a
        }} in transactions of 5000 rows
    ''')

In [16]:
# Prep: give Persons their own relationship types (STAKEHOLDER, LIVES_AT) so that
# the similarity computation can be restricted to Persons and is not computed
# between Companies.
stakeholder_query = ''' 
    match (p:Person)-[:OFFICER_OF]->(c)
    call {
        with p,c
        merge (p)-[:STAKEHOLDER]->(c)
    } in transactions of 5000 rows
'''
lives_at_query = ''' 
    match (p:Person)-[:REGISTERED_ADDRESS]->(c)
    call {
        with p,c
        merge (p)-[:LIVES_AT]->(c)
    } in transactions of 5000 rows
'''
# Run both statements in the original order.
for prep_query in (stakeholder_query, lives_at_query):
    gds.run_cypher(prep_query)

In [17]:
# Project the in-memory graph used for node similarity: Persons (plus the node
# types they point at) connected by the Person-only relationship types.
GRAPH_NAME = 'test'

# Projecting under a name that already exists in the graph catalogue fails, so
# drop a leftover projection from an earlier run first (keeps the cell re-runnable).
if gds.graph.exists(GRAPH_NAME)['exists']:
    gds.graph.get(GRAPH_NAME).drop()

G, project_stats = gds.graph.project(
    GRAPH_NAME,
    node_spec=['Person', 'Company', 'Entity', 'Intermediary'],
    relationship_spec=['STAKEHOLDER', 'LIVES_AT']
)

Loading:   0%|          | 0/100 [00:00<?, ?%/s]

In [18]:
# Display the statistics returned by gds.graph.project (second element of its result).
project_stats

nodeProjection            {'Entity': {'label': 'Entity', 'properties': {...
relationshipProjection    {'STAKEHOLDER': {'orientation': 'NATURAL', 'ag...
graphName                                                              test
nodeCount                                                            104000
relationshipCount                                                    138894
projectMillis                                                         12571
Name: 0, dtype: object

In [19]:
# How to estimate the resources an algorithm needs in a given mode
# (stats, write, mutate). Doing this by hand is optional: the gds client
# performs the same check behind the scenes.
estimate = gds.nodeSimilarity.stats.estimate(G)
sysinfo = gds.debug.sysInfo()

# Pick the total heap size out of the key/value table and compare it with the
# lower bound of the estimate.
heap_rows = sysinfo[sysinfo["key"] == "heapTotalInBytes"]
heap_total_bytes = heap_rows["value"].values[0]
assert heap_total_bytes > estimate.bytesMin

In [20]:
# Run node similarity in stats mode with the chosen cut-offs.
stats_config = {
    'similarityCutoff': 0.6,
    'degreeCutoff': 2,
    'topK': 3,
}
ns_stats = gds.nodeSimilarity.stats(G, **stats_config)

In [21]:
# Display the similarity score distribution from the stats run.
ns_stats.similarityDistribution

{'p1': 0.5999984741210938,
 'max': 1.0000038146972656,
 'p5': 0.6666641235351562,
 'p90': 1.0000038146972656,
 'p50': 1.0000038146972656,
 'p95': 1.0000038146972656,
 'p10': 0.6666641235351562,
 'p75': 1.0000038146972656,
 'p99': 1.0000038146972656,
 'p25': 0.7777748107910156,
 'p100': 1.0000038146972656,
 'min': 0.5999984741210938,
 'mean': 0.9086447123901552,
 'stdDev': 0.14122972770088346}

In [22]:
# Run node similarity in write mode: same cut-offs as the stats run, with the
# results written as AFFILIATED relationships carrying a `score` property.
write_config = {
    'similarityCutoff': 0.6,
    'degreeCutoff': 2,
    'topK': 3,
    'writeRelationshipType': 'AFFILIATED',
    'writeProperty': 'score',
}
gds.nodeSimilarity.write(G, **write_config)

NodeSimilarity:   0%|          | 0/100 [00:00<?, ?%/s]

preProcessingMillis                                                       0
computeMillis                                                         50522
writeMillis                                                            5970
postProcessingMillis                                                     -1
nodesCompared                                                         15326
relationshipsWritten                                                  25699
similarityDistribution    {'p1': 0.5999984741210938, 'max': 1.0000038146...
configuration             {'topK': 3, 'writeConcurrency': 4, 'similarity...
Name: 0, dtype: object

In [23]:
# Clean up the projection.
projection_name = 'test'
# Re-fetch the graph object by name in case the G variable was lost.
G = gds.graph.get(projection_name)
# Remove it from the graph catalogue to release its memory allocation.
G.drop()

graphName                                                         test
database                                                         neo4j
memoryUsage                                                           
sizeInBytes                                                         -1
nodeCount                                                       104000
relationshipCount                                               138894
configuration        {'relationshipProjection': {'STAKEHOLDER': {'o...
density                                                       0.000013
creationTime                       2022-11-10T08:20:29.501283000+00:00
modificationTime                   2022-11-10T08:20:35.370876000+00:00
schema               {'graphProperties': {}, 'relationships': {'STA...
Name: 0, dtype: object

In [27]:
# Show a sample of the AFFILIATED relationships that were written, with their scores.
affiliated_sample_query = ''' 
    MATCH (p:Person)-[a:AFFILIATED]->(o)
    return p.name as pep, o.name as affilated_with, a.score as score limit 10
'''
gds.run_cypher(affiliated_sample_query).head(10)


Unnamed: 0,pep,affilated_with,score
0,Pliska - Bernard F.,NIKE Elevate C.V.,0.666667
1,Ponthaud - Benito de,Murray - Cynthia,1.0
2,Ponthaud - Benito de,Looz - Thierry de,1.0
3,Ponthaud - Benito de,Vassallo - Francis J.,1.0
4,POON - Jing,KWAN - Phileas Po Lam,0.75
5,POON - Jing,LIM - Yin Cheng,0.75
6,POON - Jing,LUN - Pui Kan,0.75
7,Popiel - Brian,PricewaterhouseCoopers LLP - San Francisco,0.625
8,Popiel - Brian,Rothman - Matt,0.714286
9,Porrini - Paul T.,Ezrati - Lester D.,0.611111


In [28]:
# Delete any symmetric relationships
gds.run_cypher(''' 
    match (p:Person)-[a:AFFILIATED]->(o)
    where exists { (p)<-[:AFFILIATED]-(o) }
    delete a
    return count(*) as numberOfRelsDeleted
''').head(1)

Unnamed: 0,numberOfRelsDeleted
0,0


## Exercise

Explore the `(:Person)-[:AFFILIATED]-(:Person)` network
- Communities?
- Central, important or influential Persons?

## Exercise

Merge nodes if they have the same id
```
MATCH p=()-[r:SAME_ID_AS]->() RETURN p LIMIT 25
```
Use APOC (e.g. `apoc.refactor.mergeNodes`) to merge the nodes and keep their relationships
