# Experiments on ICIJ Paradise Papers (Neo4j sandbox)

## Setting up

In [1]:
%%capture
# Install the notebook's dependencies into the kernel's environment; %%capture hides pip's output.
# NOTE(review): versions are unpinned, and the `neo4j` driver imported in the next cell is not
# listed here — presumably it is pulled in as a dependency of graphdatascience; confirm.
%pip install graphdatascience pandas ipython

In [2]:
import os

import pandas as pd
from graphdatascience import GraphDataScience
from neo4j import GraphDatabase

In [3]:
# Neo4j Sandbox connection details.
# Each value can be overridden through an environment variable so that real
# credentials never have to be edited into the notebook; the fallbacks are the
# values of the sandbox instance this notebook was originally run against.
DB_URL = os.environ.get('NEO4J_URI', 'bolt://3.237.238.94:7687')
DB_USER = os.environ.get('NEO4J_USER', 'neo4j')
DB_PASS = os.environ.get('NEO4J_PASSWORD', 'knives-swords-cloths')
DB_ULR = DB_URL  # backward-compatible alias for the original (misspelled) name

# Pass the URI to the constructor: `from_neo4j_driver` is meant for an
# already-created neo4j Driver object, not for a connection string.
gds = GraphDataScience(DB_URL, auth=(DB_USER, DB_PASS))

## A subset of the "Power players in the data"
https://offshoreleaks.icij.org/power-players

In [None]:
# Names of politically exposed persons to look up, taken from the ICIJ
# "Power players" page linked above. Kept upper-case, one name per line.
peps = [
    "JEAN CHRÉTIEN",
    "MUKHTAR ABLYAZOV",
    "BEIBUT ATAMKULOV",
    "HENRIQUE DE CAMPOS MEIRELLES",
    "WESLEY K. CLARK",
    "SHAUKAT AZIZ",
    "ALEJANDRO GERTZ MANERO",
    "ANTANAS GUOGA",
    "HAKAINDE SAMMY HICHILEMA",
    "RAVINDRA KISHORE (RK) SINHA",
    "VALERIY VOSHCHEVSKY",
    "SALLY KOSGEI",
    "REX TILLERSON",
    "PAUL MARTIN",
    "ELLEN JOHNSON SIRLEAF",
    "BRIAN MULRONEY",
    "MUDHAR GHASSAN SHAWKAT",
    "SAUAT MUKHAMETBAYEVICH MYNBAYEV",
    "JAMES MEYER SASSOON",
    "PRABOWO SUBIANTO",
    "SAM KAHAMBA KUTESA",
    "WILBUR LOUIS ROSS, JR.",
    "PRINCE KHALED BIN SULTAN BIN ABDULAZIZ",
    "CARLOS QUINTANILLA SCHMIDT",
    "ALFRED GUSENBAUER",
    "YUKIO HATOYAMA",
    "ANTON PRIGODSKY",
    "PENNY PRITZKER",
    "JOHN MAHAMA",
    "BINALI YILDIRIM",
    "SUHARTO",
    "BASHAR AL-ASSAD",
    "BLAIRO BORGES MAGGI",
    "JAYANT SINHA",
    "JUAN MANUEL SANTOS",
]

In [None]:
# Create the full-text index on Officer.name that the PEP lookup queries.
# `IF NOT EXISTS` keeps this cell re-runnable (the procedure-based form fails
# when the index is already there), and the CREATE FULLTEXT INDEX command
# replaces the deprecated db.index.fulltext.createNodeIndex procedure.
gds.run_cypher('''
    CREATE FULLTEXT INDEX ftx_offshore IF NOT EXISTS
    FOR (n:Officer) ON EACH [n.name]
''')


In [None]:
# Check that the index is fully populated.
# Index population is asynchronous, so first block until the index is online;
# otherwise, under "Run All", a later full-text query could hit a half-built index.
gds.run_cypher('''call db.awaitIndex('ftx_offshore')''')
gds.run_cypher('''call db.indexDetails('ftx_offshore') yield state, populationPercent''').head()

In [None]:
# Flag politically exposed persons (PEP)
# For every name in `peps`, take the single best-scoring hit (limit 1) from the
# 'ftx_offshore' full-text index and add the :PEP label to that node.
# NOTE(review): each name is concatenated into the query string as-is, so only its
# first word is prefixed with `name:` and characters such as the parentheses in
# 'RAVINDRA KISHORE (RK) SINHA' are presumably read as Lucene query syntax, not as
# literal text; the top hit is labelled whatever its score — review the table below.
test = gds.run_cypher(''' 
    unwind $peps as pep
    call db.index.fulltext.queryNodes('ftx_offshore', 'name: ' + pep + '',{limit:1}) yield node,score
    with node, score, pep
    set node:PEP
    return pep, node.name as name, node.type as type, labels(node) as labels, score, id(node) as id    
''', params = { 'peps': peps})
# At most one row per input name: the requested name next to the node that was labelled.
test.head(40)

## Who is who

In [None]:
# Preview which Officer nodes look like companies rather than natural persons:
# officers linked to an Address or Entity whose name contains one of the
# (case-sensitive) markers below. Passing the markers as a parameter removes the
# duplicated "Investments" test and the unused `path` binding of the old query.
# NOTE(review): the labelling cell further down uses a longer marker list than
# this preview — confirm whether the two are meant to agree.
companies = gds.run_cypher('''
    MATCH (p:Officer)-[:OFFICER_OF|REGISTERED_ADDRESS]->(o)
    where (o:Address or o:Entity)
    and any(marker in $markers where p.name contains marker)
    return distinct p.name limit 50999
''', params={'markers': [
    'Ltd.', 'Limited', 'Trust', 'Investments', 'L.L.C.', 'LIMITED', 'Investors',
]})
companies.head(50)

In [41]:
# Label Officers as Companies
# An Officer linked to an Address or Entity whose name contains one of the
# (case-sensitive) markers below gets the :Company label and loses :Person.
# Passing the markers as a parameter removes the duplicated "Investments" test
# and the unused `path` binding of the old query; the matched set is unchanged.
gds.run_cypher('''
    MATCH (p:Officer)-[:OFFICER_OF|REGISTERED_ADDRESS]->(o)
    where (o:Address or o:Entity)
    and any(marker in $markers where p.name contains marker)
    with distinct p
    set p:Company remove p:Person
''', params={'markers': [
    'Ltd.', 'Limited', 'Trust', 'Investments', 'L.L.C.', 'LIMITED',
    'Investors', 'CORPORATE', 'LTD.', 'MANAGERS', 'REGISTER',
]})

In [42]:
# Label every Officer that is not labelled Company as Person
gds.run_cypher(''' 
    MATCH (p:Officer)
    where not p:Company 
    set p:Person
''')

In [None]:
# Check what we have: number of Officer nodes per combination of labels
gds.run_cypher(''' 
    MATCH (p:Officer)
    return labels(p), count(*)
''').head(10)


## Playground

In [None]:
# Check what outgoing rels we have for Persons
# Why? Node similarity expects a directed graph
# The query counts the relationships that start at a Person node, grouped by relationship type.
gds.run_cypher(''' 
    MATCH p=(:Person)-[r]->()
    return type(r), count(*)
''').head(10)

# Note: Explore the "CONNECTED_TO" relationships!

In [43]:
# Cleanup: remove the relationships this notebook derives further down
# (STAKEHOLDER, LIVES_AT and the AFFILIATED similarity links), so that the
# preparation and write steps can be re-run from a clean state.
# Deletion runs in batches of 5000 rows per transaction.
# The relationship type is spliced into the query text because Cypher cannot
# take a relationship type as a parameter; the values are the fixed names below.
for rel_type in ('AFFILIATED', 'STAKEHOLDER', 'LIVES_AT'):
    gds.run_cypher(f'''
        MATCH ()-[a:{rel_type}]->()
        call {{
            with a
            delete a
        }} in transactions of 5000 rows
    ''')

In [44]:
# Prep: to focus on persons, give them relationship types of their own:
# every Person-[:OFFICER_OF]->x is mirrored as Person-[:STAKEHOLDER]->x and
# every Person-[:REGISTERED_ADDRESS]->x as Person-[:LIVES_AT]->x
# (i.e. avoid similarity being computed between Companies).
# `merge` creates a relationship only if it is missing; work is committed in batches of 5000 rows.
gds.run_cypher(''' 
    match (p:Person)-[:OFFICER_OF]->(c)
    call {
        with p,c
        merge (p)-[:STAKEHOLDER]->(c)
    } in transactions of 5000 rows
''')
gds.run_cypher(''' 
    match (p:Person)-[:REGISTERED_ADDRESS]->(c)
    call {
        with p,c
        merge (p)-[:LIVES_AT]->(c)
    } in transactions of 5000 rows
''')

In [45]:
# Project the Person-centred relationships into the in-memory graph 'test'.
# A projection left over from an earlier run is dropped first: projecting under a
# name that already exists in the graph catalogue raises an error, which would
# make this cell fail when the notebook is re-run.
if gds.graph.exists('test')['exists']:
    gds.graph.get('test').drop()

G, project_stats = gds.graph.project(
    'test',
    node_spec=['Person', 'Company', 'Entity', 'Intermediary'],
    relationship_spec=['STAKEHOLDER', 'LIVES_AT']
)

Loading:   0%|          | 0/100 [00:00<?, ?%/s]

In [46]:
# Show the summary returned by the projection call (node/relationship counts, timing)
project_stats

nodeProjection            {'Entity': {'label': 'Entity', 'properties': {...
relationshipProjection    {'STAKEHOLDER': {'orientation': 'NATURAL', 'ag...
graphName                                                              test
nodeCount                                                            104000
relationshipCount                                                    140835
projectMillis                                                           816
Name: 0, dtype: object

In [7]:
# Sanity check before running node similarity: the total heap reported by the
# server must exceed the algorithm's minimum estimated memory requirement.
estimate = gds.nodeSimilarity.stats.estimate(G)
sysinfo = gds.debug.sysInfo()
is_heap_row = sysinfo["key"] == "heapTotalInBytes"
heap_total_bytes = sysinfo.loc[is_heap_row, "value"].values[0]
assert heap_total_bytes > estimate.bytesMin

In [51]:
# Set the server's transaction timeout to 20 minutes
# (presumably so the long-running similarity calls are not cut short — confirm).
gds.run_cypher('call dbms.setConfigValue("dbms.transaction.timeout","20m")')

In [34]:
# Run node similarity in stats mode on the projection to inspect the
# similarity distribution before anything is written to the database.
similarity_params = {'similarityCutoff': 0.6, 'degreeCutoff': 2, 'topK': 3}
ns_stats = gds.nodeSimilarity.stats(G, **similarity_params)

NodeSimilarity:   0%|          | 0/100 [00:00<?, ?%/s]

In [21]:
# Percentiles, min/max, mean and standard deviation of the similarity scores
ns_stats.similarityDistribution

{'p1': 0.5999984741210938,
 'max': 1.0000038146972656,
 'p5': 0.6666641235351562,
 'p90': 1.0000038146972656,
 'p50': 1.0000038146972656,
 'p95': 1.0000038146972656,
 'p10': 0.6666641235351562,
 'p75': 1.0000038146972656,
 'p99': 1.0000038146972656,
 'p25': 0.7777748107910156,
 'p100': 1.0000038146972656,
 'min': 0.5999984741210938,
 'mean': 0.9081031645172395,
 'stdDev': 0.1414988467536248}

In [47]:
# Persist the similarity result as AFFILIATED relationships carrying the
# similarity in their `score` property; cutoffs are the same as in the stats run.
write_config = dict(
    writeRelationshipType='AFFILIATED',
    writeProperty='score',
    similarityCutoff=0.6,
    degreeCutoff=2,
    topK=3,
)
gds.nodeSimilarity.write(G, **write_config)

NodeSimilarity:   0%|          | 0/100 [00:00<?, ?%/s]

preProcessingMillis                                                       0
computeMillis                                                         48907
writeMillis                                                            2419
postProcessingMillis                                                     -1
nodesCompared                                                         15380
relationshipsWritten                                                  25760
similarityDistribution    {'p1': 0.5999984741210938, 'max': 1.0000038146...
configuration             {'topK': 3, 'writeConcurrency': 4, 'similarity...
Name: 0, dtype: object

In [48]:
# Clean up
G = gds.graph.get('test') # Get our graph projection in case we accidentally lost G
G.drop() # Drop it from the graph catalogue to release its memory allocation

graphName                                                         test
database                                                         neo4j
memoryUsage                                                           
sizeInBytes                                                         -1
nodeCount                                                       104000
relationshipCount                                               140835
configuration        {'relationshipProjection': {'STAKEHOLDER': {'o...
density                                                       0.000013
creationTime                       2022-11-01T12:00:34.389781000+00:00
modificationTime                   2022-11-01T12:00:35.246972000+00:00
schema               {'graphProperties': {}, 'relationships': {'STA...
Name: 0, dtype: object

In [50]:
# Show a sample of ten AFFILIATED links that start at a Person node, with their similarity score.
# NOTE(review): `p` is matched as :Person, not :PEP, so the `pep` column alias is misleading,
# and `affilated_with` looks like a misspelling of "affiliated_with" — both are left unchanged
# here because they are output column names.
gds.run_cypher(''' 
    MATCH (p:Person)-[a:AFFILIATED]->(o)
    return p.name as pep, o.name as affilated_with, a.score as score limit 10
''').head(10)


Unnamed: 0,pep,affilated_with,score
0,Pliska - Bernard F.,NIKE Elevate C.V.,0.666667
1,Pliska - Bernard F.,Carter - James Cecil,0.666667
2,Plotkin - Roger B.,Weiss - John A.,1.0
3,Plotkin - Roger B.,May - William H.,1.0
4,Plotkin - Roger B.,Widergren - James F.,1.0
5,Pocius - Victor F,Kane - Thomas P,0.666667
6,Pocius - Victor F,Hedley - Nicholas,1.0
7,Pogue - Charles M,Imle - John F.,0.73913
8,Pogue - Charles M,Laughbaum - Graydon H.,0.690476
9,Pogue - Charles M,Fawthrop - Andrew L,0.77907


## Exercise

Merge nodes if they have the same id
```
MATCH p=()-[r:SAME_ID_AS]->() RETURN p LIMIT 25
```
Use APOC to merge the nodes while keeping their relationships.
