# Experiments with ICIJ Paradise Papers (Neo4j sandbox)

## Setting up

In [None]:
%%capture
%pip install graphdatascience pandas ipython

In [None]:
import os

import pandas as pd
from graphdatascience import GraphDataScience
from neo4j import GraphDatabase

In [None]:
# Neo4j Sandbox connection details.
# Each value can be overridden through an environment variable, so the
# notebook can target another instance without editing (or committing)
# credentials; the literals are only the defaults for the original sandbox.
DB_URL = os.environ.get('NEO4J_URI', 'bolt://18.205.6.212:7687')
DB_USER = os.environ.get('NEO4J_USERNAME', 'neo4j')
DB_PASS = os.environ.get('NEO4J_PASSWORD', 'passage-originator-merchant')
DB_ULR = DB_URL  # backward-compatible alias for the original (misspelled) name

# `from_neo4j_driver` is documented for an already constructed neo4j Driver;
# for a connection URI the plain constructor is the documented entry point.
gds = GraphDataScience(DB_URL, auth=(DB_USER, DB_PASS))

In [None]:
# Some of the transactions below run for a long time, so raise the
# transaction timeout to 20 minutes before starting.
gds.run_cypher(
    'call dbms.setConfigValue("dbms.transaction.timeout","20m")'
)

## A subset of the "Power players in the data"
https://offshoreleaks.icij.org/power-players

In [None]:
# Names of a subset of the ICIJ "power players"; each one is looked up in the
# Officer full-text index further down and the best hit is labelled :PEP.
peps = [
    'JEAN CHRÉTIEN',
    'MUKHTAR ABLYAZOV',
    'BEIBUT ATAMKULOV',
    'HENRIQUE DE CAMPOS MEIRELLES',
    'WESLEY K. CLARK',
    'SHAUKAT AZIZ',
    'ALEJANDRO GERTZ MANERO',
    'ANTANAS GUOGA',
    'HAKAINDE SAMMY HICHILEMA',
    'RAVINDRA KISHORE (RK) SINHA',
    'VALERIY VOSHCHEVSKY',
    'SALLY KOSGEI',
    'REX TILLERSON',
    'PAUL MARTIN',
    'ELLEN JOHNSON SIRLEAF',
    'BRIAN MULRONEY',
    'MUDHAR GHASSAN SHAWKAT',
    'SAUAT MUKHAMETBAYEVICH MYNBAYEV',
    'JAMES MEYER SASSOON',
    'PRABOWO SUBIANTO',
    'SAM KAHAMBA KUTESA',
    'WILBUR LOUIS ROSS, JR.',
    'PRINCE KHALED BIN SULTAN BIN ABDULAZIZ',
    'CARLOS QUINTANILLA SCHMIDT',
    'ALFRED GUSENBAUER',
    'YUKIO HATOYAMA',
    'ANTON PRIGODSKY',
    'PENNY PRITZKER',
    'JOHN MAHAMA',
    'BINALI YILDIRIM',
    'SUHARTO',
    'BASHAR AL-ASSAD',
    'BLAIRO BORGES MAGGI',
    'JAYANT SINHA',
    'JUAN MANUEL SANTOS',
]

In [None]:
# Create the full-text index on Officer.name that the PEP lookup queries.
# `if not exists` makes the cell safe to re-run: the index is stored in the
# database, so it outlives the kernel, and the procedure-based creation used
# before raises once an index with that name is already there.
gds.run_cypher('''
    create fulltext index ftx_offshore if not exists
    for (n:Officer) on each [n.name]
''')


In [None]:
# Wait for the index to come online, then check that it is fully populated.
# Population happens in the background, so without the wait a "Run All" can
# reach the lookup cell while the index is still being built.
gds.run_cypher('''call db.awaitIndex('ftx_offshore')''')
gds.run_cypher('''call db.indexDetails('ftx_offshore') yield state, populationPercent''').head()

In [None]:
# Flag politically exposed persons (PEP): for every name in `peps`, take at
# most one hit from the Officer full-text index and add the :PEP label to it.
flag_peps_query = ''' 
    unwind $peps as pep
    call db.index.fulltext.queryNodes('ftx_offshore', 'name: ' + pep + '',{limit:1}) yield node,score
    with node, score, pep
    set node:PEP
    return pep, node.name as name, node.type as type, labels(node) as labels, score, id(node) as id    
'''
test = gds.run_cypher(flag_peps_query, params=dict(peps=peps))
test.head(40)

## Who is who

In [None]:
# Preview: Officer nodes whose name looks like a company rather than a person.
# The markers are the same ones used by the labelling query in the next cell.
# `contains` is case-sensitive, which is why some markers appear in two
# spellings. The duplicated "Investments" test and the unused path binding
# of the earlier version are gone; the result set is unchanged.
companies = gds.run_cypher('''
    match (p:Officer)-[:OFFICER_OF|REGISTERED_ADDRESS]->(o)
    where (o:Address or o:Entity)
      and any(marker in [
            "Ltd.", "Limited", "Trust", "Investments", "L.L.C.", "LIMITED",
            "Investors", "CORPORATE", "LTD.", "MANAGERS", "REGISTER"
          ] where p.name contains marker)
    return distinct p.name limit 50
''')
companies.head(50)

In [None]:
# Label Officers as Companies when their name contains a company marker.
# The markers are the same ones used by the preview query in the previous
# cell. `contains` is case-sensitive, which is why some markers appear in two
# spellings. The duplicated "Investments" test and the unused path binding
# of the earlier version are gone; the matched nodes are unchanged.
gds.run_cypher('''
    match (p:Officer)-[:OFFICER_OF|REGISTERED_ADDRESS]->(o)
    where (o:Address or o:Entity)
      and any(marker in [
            "Ltd.", "Limited", "Trust", "Investments", "L.L.C.", "LIMITED",
            "Investors", "CORPORATE", "LTD.", "MANAGERS", "REGISTER"
          ] where p.name contains marker)
    with distinct p
    set p:Company remove p:Person
''')

In [None]:
# Every Officer that does not carry the Company label gets the Person label.
label_persons_query = ''' 
    match (p:Officer)
    where not p:Company 
    set p:Person
'''
gds.run_cypher(label_persons_query)

In [None]:
# Sanity check: count the Officer nodes per label combination.
label_counts_query = ''' 
    match (p:Officer)
    return labels(p), count(*)
'''
gds.run_cypher(label_counts_query).head(10)


## Playground

In [None]:
# Node similarity expects a directed graph, so list the relationship types
# that leave Person nodes together with how often each one occurs.
outgoing_rels_query = ''' 
    match p=(:Person)-[r]->()
    return type(r), count(*)
'''
gds.run_cypher(outgoing_rels_query).head(10)

# Note: Explore the "CONNECTED_TO" relationships!

In [None]:
# Cleanup: remove the relationships this notebook creates further down
# (STAKEHOLDER / LIVES_AT in the prep cell, AFFILIATED by the node-similarity
# write), so a re-run starts from a clean slate.
# Relationship types cannot be passed as Cypher parameters, so the fixed
# names are interpolated into the query text; deletion is batched to keep
# each transaction small.
for rel_type in ('AFFILIATED', 'STAKEHOLDER', 'LIVES_AT'):
    gds.run_cypher(f'''
        match (:Person)-[a:{rel_type}]->()
        call {{
            with a
            delete a
        }} in transactions of 5000 rows
    ''')

In [None]:
# Prep: to focus on persons, mirror their OFFICER_OF relationships as
# STAKEHOLDER and their REGISTERED_ADDRESS relationships as LIVES_AT
# (i.e. avoid similarity being computed between Companies).
stakeholder_query = ''' 
    match (p:Person)-[:OFFICER_OF]->(c)
    call {
        with p,c
        merge (p)-[:STAKEHOLDER]->(c)
    } in transactions of 5000 rows
'''
lives_at_query = ''' 
    match (p:Person)-[:REGISTERED_ADDRESS]->(c)
    call {
        with p,c
        merge (p)-[:LIVES_AT]->(c)
    } in transactions of 5000 rows
'''
gds.run_cypher(stakeholder_query)
gds.run_cypher(lives_at_query)

In [None]:
# The projection is stored in the database's graph catalogue and therefore
# outlives the kernel; drop a leftover 'test' graph first so that this cell
# can be re-run without a "graph already exists" error.
if gds.graph.exists('test')['exists']:
    gds.graph.get('test').drop()

# NOTE(review): 'Address' is not part of node_spec, so LIVES_AT relationships
# that end on Address nodes presumably do not make it into the projection —
# confirm against the relationship count in project_stats.
G, project_stats = gds.graph.project(
    'test',
    node_spec=['Person', 'Company', 'Entity', 'Intermediary'],
    relationship_spec=['STAKEHOLDER', 'LIVES_AT']
)

In [None]:
# Display the statistics returned by the projection call.
project_stats

In [None]:
# Demonstrates how to estimate the resources an algorithm needs in a given
# mode (stats, write, mutate). This step is optional: the gds client does it
# behind the scenes anyway.
estimate = gds.nodeSimilarity.stats.estimate(G)
sysinfo = gds.debug.sysInfo()
heap_rows = sysinfo[sysinfo["key"] == "heapTotalInBytes"]
assert heap_rows["value"].values[0] > estimate.bytesMin

In [None]:
# Run node similarity in stats mode on the projection; nothing is written.
ns_stats = gds.nodeSimilarity.stats(
    G,
    topK=3,
    degreeCutoff=2,
    similarityCutoff=0.6,
)

In [None]:
# Display the similarity distribution reported by the stats run.
ns_stats.similarityDistribution

In [None]:
# Same settings as the stats run, but in write mode: results are stored as
# AFFILIATED relationships carrying the similarity in their `score` property.
gds.nodeSimilarity.write(
    G,
    topK=3,
    degreeCutoff=2,
    similarityCutoff=0.6,
    writeProperty='score',
    writeRelationshipType='AFFILIATED',
)

In [None]:
# Clean up: fetch the projection by name again (in case G was accidentally
# lost) and drop it from the graph catalogue to release its memory.
G = gds.graph.get('test')
G.drop()

In [None]:
# Peek at a few of the AFFILIATED relationships written by node similarity.
affiliated_sample_query = ''' 
    MATCH (p:Person)-[a:AFFILIATED]->(o)
    return p.name as pep, o.name as affilated_with, a.score as score limit 10
'''
gds.run_cypher(affiliated_sample_query).head(10)


In [None]:
# Delete one direction of every symmetric AFFILIATED pair.
# Without the id comparison both p->o and o->p satisfy the pattern, so both
# relationships were deleted and the mutual affiliation disappeared entirely;
# comparing ids removes exactly one of the two and keeps the other.
gds.run_cypher('''
    match (p:Person)-[a:AFFILIATED]->(o)
    where id(p) < id(o)
      and exists { (p)<-[:AFFILIATED]-(o) }
    delete a
    return count(*) as numberOfRelsDeleted
''').head(1)

## Exercise

Explore the `(:Person)-[:AFFILIATED]-(:Person)` network
- Communities?
- Central, important or influential Persons?

## Exercise

Merge nodes if they have the same id
```
MATCH p=()-[r:SAME_ID_AS]->() RETURN p LIMIT 25
```
Use APOC to merge the nodes while keeping their relationships
