# Experiments on ICIJ Paradise Papers (Neo4j sandbox)

## Setting up

In [None]:
!%pip install graphdatascience pandas

In [33]:
import pandas as pd
from graphdatascience import GraphDataScience

In [34]:
# Neo4j Sandbox Connection details
DB_ULR = 'bolt://3.237.238.94:7687'
DB_USER = 'neo4j'
DB_PASS = 'knives-swords-cloths'
gds = GraphDataScience(DB_ULR, auth=(DB_USER, DB_PASS))

## A subset of the "Power players in the data"
https://offshoreleaks.icij.org/power-players

In [35]:
peps = [
    'JEAN CHRÉTIEN',
    'MUKHTAR ABLYAZOV',
    'BEIBUT ATAMKULOV',
    'HENRIQUE DE CAMPOS MEIRELLES',
    'WESLEY K. CLARK',
    'SHAUKAT AZIZ',
    'ALEJANDRO GERTZ MANERO',
    'ANTANAS GUOGA',
    'HAKAINDE SAMMY HICHILEMA',
    'RAVINDRA KISHORE (RK) SINHA',
    'VALERIY VOSHCHEVSKY',
    'SALLY KOSGEI',
    'REX TILLERSON',
    'PAUL MARTIN',
    'ELLEN JOHNSON SIRLEAF',
    'BRIAN MULRONEY',
    'MUDHAR GHASSAN SHAWKAT',
    'SAUAT MUKHAMETBAYEVICH MYNBAYEV',
    'JAMES MEYER SASSOON',
    'PRABOWO SUBIANTO',
    'SAM KAHAMBA KUTESA',
    'WILBUR LOUIS ROSS, JR.',
    'PRINCE KHALED BIN SULTAN BIN ABDULAZIZ',   
    'CARLOS QUINTANILLA SCHMIDT',
    'ALFRED GUSENBAUER',
    'YUKIO HATOYAMA',
    'ANTON PRIGODSKY',
    'PENNY PRITZKER',
    'JOHN MAHAMA',
    'BINALI YILDIRIM',
    'SUHARTO',
    'BASHAR AL-ASSAD',
    'BLAIRO BORGES MAGGI',
    'JAYANT SINHA',
    'JUAN MANUEL SANTOS'
]

In [36]:
# Create fulltext index
gds.run_cypher('''
    call db.index.fulltext.createNodeIndex('ftx_offshore',
        ['Officer'],
        ['name']
    )
''')


ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `db.index.fulltext.createNodeIndex`: Caused by: org.neo4j.kernel.api.exceptions.schema.EquivalentSchemaRuleAlreadyExistsException: An equivalent index already exists, 'Index( id=3, name='ftx_offshore', type='GENERAL FULLTEXT', schema=(:Officer {name}), indexProvider='fulltext-1.0' )'.}

In [37]:
# Check index fully polulated
gds.run_cypher('''call db.indexDetails('ftx_offshore') yield state, populationPercent''').head()

Unnamed: 0,state,populationPercent
0,ONLINE,100.0


In [52]:
# Flag politically exposed persons (PEP)
test = gds.run_cypher(''' 
    unwind $peps as pep
    call db.index.fulltext.queryNodes('ftx_offshore', 'name: ' + pep + '',{limit:1}) yield node,score
    with node, score, pep
    set node:PEP
    return pep, node.name as name, node.type as type, labels(node) as labels, score, id(node) as id    
''', params = { 'peps': peps})
test.head(40)

Unnamed: 0,pep,name,type,labels,score,id
0,JEAN CHRÉTIEN,Joseph Jacques Jean Chrétien,,"[Officer, PEP, Person]",6.888455,87589
1,MUKHTAR ABLYAZOV,Ablyazov - Mukhtar,,"[Officer, PEP, Person]",11.695206,88025
2,BEIBUT ATAMKULOV,BEIBUT ATAMKULOV,,"[Officer, PEP, Person]",11.4198,87507
3,HENRIQUE DE CAMPOS MEIRELLES,Meirelles - Henrique de Campos,,"[Officer, PEP, Person]",13.590715,158044
4,WESLEY K. CLARK,Wesley Clark,,"[Officer, PEP, Person]",8.055619,87592
5,SHAUKAT AZIZ,Shaukat Aziz,,"[Officer, PEP, Person]",10.095312,87594
6,ALEJANDRO GERTZ MANERO,Gertz - Alejandro M,,"[Officer, PEP, Person]",8.707655,102886
7,ANTANAS GUOGA,ANTANAS GUOGA,,"[Officer, PEP, Person]",11.144423,87519
8,HAKAINDE SAMMY HICHILEMA,Hichilema - Hakainde Sammy,,"[Officer, PEP, Person]",14.305145,105417
9,RAVINDRA KISHORE (RK) SINHA,RAVINDRA KISHORE SINHA,,"[Officer, PEP, Person]",13.723818,87537


## Who is who

In [40]:
persons = gds.run_cypher(''' 
    MATCH path=(p:Officer)-[:OFFICER_OF|REGISTERED_ADDRESS]->(o)
    where (o:Address or o:Entity) 
    and not p.name contains "Ltd."
    and not p.name contains "Limited"
    and not p.name contains "Trust"
    and not p.name contains "Investments"
    return distinct p.name limit 50
''')
persons.head(50)


Unnamed: 0,p.name
0,Lee Mo Lin - Noel Patrick L.C.K - Mauritius
1,Pleshko - Daniel
2,Plested - John
3,Plewright - Peter John
4,Plexman - Eric Anthony
5,Plianthos - Nicolas Lawrence
6,Plimpton - Tara
7,Pliska - Bernard F.
8,Plotkin - Roger B.
9,Plott - Jeffrey


In [42]:
companies = gds.run_cypher(''' 
    MATCH path=(p:Officer)-[:OFFICER_OF|REGISTERED_ADDRESS]->(o)
    where (o:Address or o:Entity) 
    and ( p.name contains "Ltd." OR
          p.name contains "Limited" OR
          p.name contains "Trust" OR
          p.name contains "Investments"
        )
    return distinct p.name limit 50999
''')
companies.head(50)

Unnamed: 0,p.name
0,1022385 Alberta Ltd. as Trustee of the Drinkwa...
1,1022387 Alberta Ltd. as Trustee of the Whitewo...
2,1035533 Ontario Ltd.
3,1105328 Alberta Ltd.
4,"1245180 Alberta Ltd., as trustee of the Sapunj..."
5,2035910 Ontario Ltd.
6,385069 Alberta Ltd.
7,3VS1 Asia Growth Fund Ltd.
8,4 M Associates Ltd.
9,561309 Alberta Ltd.


In [47]:
# Label Officers as Companies
gds.run_cypher(''' 
    MATCH path=(p:Officer)-[:OFFICER_OF|REGISTERED_ADDRESS]->(o)
    where (o:Address or o:Entity) 
    and ( p.name contains "Ltd." OR
          p.name contains "Limited" OR
          p.name contains "Trust" OR
          p.name contains "Investments"
        )
    with distinct p
    set p:Company
''')

In [49]:
# Label Officers that ar not labeled Company with Person
gds.run_cypher(''' 
    MATCH (p:Officer)
    where not p:Company 
    set p:Person
''')

In [53]:
# Check what we have
gds.run_cypher(''' 
    MATCH (p:Officer)
    return labels(p), count(*)
''').head(10)


Unnamed: 0,labels(p),count(*)
0,"[Officer, Person]",67000
1,"[Officer, Company]",9978
2,"[Officer, PEP, Person]",34


## Playground

In [56]:
# Check what outgoing rels we have for Persons
# Why? Node similarity expects a directed graph
gds.run_cypher(''' 
    MATCH p=(:Person)-[r]->()
    return type(r), count(*)
''').head(10)

# Note: Explore the "CONNECTED_TO" relationships!

Unnamed: 0,type(r),count(*)
0,REGISTERED_ADDRESS,77155
1,OFFICER_OF,193474
2,CONNECTED_TO,709
3,SAME_ID_AS,2
4,SAME_NAME_AS,89


In [59]:
G, project_stats = gds.graph.project(
    'test', 
    node_spec=['Person','Company','Entity','Intermediary'], 
    relationship_spec=['REGISTERED_ADDRESS','OFFICER_OF']
)

Loading:   0%|          | 0/100 [00:00<?, ?%/s]

In [60]:
project_stats

nodeProjection            {'Entity': {'label': 'Entity', 'properties': {...
relationshipProjection    {'OFFICER_OF': {'orientation': 'NATURAL', 'agg...
graphName                                                              test
nodeCount                                                            104000
relationshipCount                                                    217388
projectMillis                                                          1075
Name: 0, dtype: object

In [64]:
estimate = gds.nodeSimilarity.stats.estimate(G)

In [69]:
sysinfo = gds.debug.sysInfo()

In [83]:
assert sysinfo[ sysinfo["key"] == "heapTotalInBytes" ]["value"].values[0] > estimate.bytesMin

In [85]:
gds.run_cypher('call dbms.setConfigValue("dbms.transaction.timeout","10m")')

In [86]:
ns_stats = gds.nodeSimilarity.stats(G)

NodeSimilarity:   0%|          | 0/100 [00:00<?, ?%/s]

In [58]:
# Clean up
G = gds.graph.get('test') # Get our graph projection in case we accidently lost G
G.drop() # Drop it from the graph catalogue to release memory allocation

graphName                                                         test
database                                                         neo4j
memoryUsage                                                           
sizeInBytes                                                         -1
nodeCount                                                       104000
relationshipCount                                               228130
configuration        {'relationshipProjection': {'__ALL__': {'orien...
density                                                       0.000021
creationTime                       2022-10-31T09:48:28.016403000+00:00
modificationTime                   2022-10-31T09:48:29.064127000+00:00
schema               {'graphProperties': {}, 'relationships': {'__A...
Name: 0, dtype: object

## Excerciese

Merge nodes if they have the sae ID
```
MATCH p=()-[r:SAME_ID_AS]->() RETURN p LIMIT 25
```
Use apoc to merge nodes and keep relationships
