In [1]:
# Install Neo4j GDS Python Client - Install Output Suppressed 
import sys
!{sys.executable} -m pip install graphdatascience



In [2]:
# Neo4j GDS Python Client
from graphdatascience import GraphDataScience

In [3]:
# Instantiate your GDS session.
# Connection settings are taken from the environment so that no credentials are
# stored in (or leaked through) the notebook:
#   NEO4J_URI       e.g. "bolt://localhost:7687" for a local standalone instance,
#                   or "neo4j://<FQDN or IP Address>:7687" for a remote cluster
#   NEO4J_USER      defaults to "neo4j"
#   NEO4J_PASSWORD  asked for interactively when the variable is not set
# For a local standalone instance running without auth use instead:
# gds = GraphDataScience("bolt://localhost:7687", auth=None)
# -- The default URI below points at a demo instance that might have been nuked by the
# time you work on this notebook. Roll out a new Neo4j GDS instance to work this
# notebook to its end!
import os
from getpass import getpass

NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://3.210.198.179:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
# `or` (rather than a .get default) so an empty variable also triggers the prompt.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass(f"Password for {NEO4J_USER}@{NEO4J_URI}: ")

gds = GraphDataScience(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [4]:
# Ask the server which GDS library version it runs and report it.
server_version = gds.version()
print(f"Neo4j GDS Version: {server_version}")

Neo4j GDS Version: 2.3.1


In [5]:
# Report whether the connected GDS installation carries a licence.
if gds.is_licensed():
    licenseState = "is"
else:
    licenseState = "is not"
print(f"Neo4j GDS {licenseState} licensed")

Neo4j GDS is not licensed


In [6]:
# Optional - Set database if you're not using the default DB. 
# Not applicable for AuraDS, Neo4j Desktop and Neo4j Sandbox.
# gds.set_database("my-db")

In [7]:
# Switch read by the clean-up and loading cells below.
# True  -> wipe the database and re-import everything from the CSV files.
# False -> keep the existing graph and only reset properties computed by this notebook.
RELOAD_DATA = False

In [9]:
# Bring the database into a known state so the notebook can be re-run top to bottom.
# RELOAD_DATA = True  -> delete every node (with its relationships) in batches of 10;
#                        the loading cells below then rebuild the graph.
# RELOAD_DATA = False -> keep the loaded data and remove only what the analysis cells
#                        further down write back: the fraud_group / fraud_group_2 /
#                        score / suspect properties and the TRANSACTED_WITH relationships.
#                        Leaving `suspect` or TRANSACTED_WITH behind would let results of
#                        an earlier run leak into the second projection.
if RELOAD_DATA:
    gds.run_cypher(
        """
        MATCH (n) CALL {
          WITH n
          DETACH DELETE n
        } IN TRANSACTIONS OF 10 ROWS;
        """
    )
else:
    # run_cypher executes one statement per call, hence two separate calls.
    gds.run_cypher(
        """
        MATCH (:Client)-[r:TRANSACTED_WITH]->(:Client) DELETE r;
        """
    )
    gds.run_cypher(
        """
        MATCH (c:Client)
        SET c.fraud_group = null, c.fraud_group_2 = null, c.score = null, c.suspect = null;
        """
    )

In [10]:
if RELOAD_DATA:
    # Peek at the first five lines of the clients CSV before importing it.
    nodeListCSV = gds.run_cypher(
    """
    LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/neo4j-field/graph-summit-apac-2023/main/data/clients.csv" AS row
    RETURN row limit 5
    """
    )
    # The object returned is a pandas DataFrame, so we can explore it with standard pandas methods.
    # The output may not look like a regular table: the query only reads the CSV file and
    # returns each line as one `row` value - nothing is written to the database here.
    # Jupyter auto-displays only the last top-level expression of a cell, so a bare
    # expression inside this `if` body would be silently dropped; display it explicitly.
    display(nodeListCSV.head(5))

In [11]:
if RELOAD_DATA:
    # Create the uniqueness constraints first, so that the MERGE statements of the
    # loading cells match on a constrained key for every label.
    # `IF NOT EXISTS` makes the cell safe to re-run.
    CONSTRAINTS = [
      "CREATE CONSTRAINT ClientConstraint IF NOT EXISTS FOR (p:Client) REQUIRE p.id IS UNIQUE;",
      "CREATE CONSTRAINT EmailConstraint IF NOT EXISTS FOR (p:Email) REQUIRE p.email IS UNIQUE;",
      "CREATE CONSTRAINT PhoneConstraint IF NOT EXISTS FOR (p:Phone) REQUIRE p.phoneNumber IS UNIQUE;",
      "CREATE CONSTRAINT SSNConstraint IF NOT EXISTS FOR (p:SSN) REQUIRE p.ssn IS UNIQUE;",
      "CREATE CONSTRAINT MerchantConstraint IF NOT EXISTS FOR (p:Merchant) REQUIRE p.id IS UNIQUE;",
      "CREATE CONSTRAINT BankConstraint IF NOT EXISTS FOR (p:Bank) REQUIRE p.id IS UNIQUE;",
      "CREATE CONSTRAINT TransactionConstraint IF NOT EXISTS FOR (p:Transaction) REQUIRE p.globalStep IS UNIQUE;",
      # Was declared on :Transaction (a copy of the line above), which left :Debit
      # as the only transaction sub-label without a constraint of its own.
      "CREATE CONSTRAINT DebitConstraint IF NOT EXISTS FOR (p:Debit) REQUIRE p.globalStep IS UNIQUE;",
      "CREATE CONSTRAINT CashInConstraint IF NOT EXISTS FOR (p:CashIn) REQUIRE p.globalStep IS UNIQUE;",
      "CREATE CONSTRAINT CashOutConstraint IF NOT EXISTS FOR (p:CashOut) REQUIRE p.globalStep IS UNIQUE;",
      "CREATE CONSTRAINT TransferConstraint IF NOT EXISTS FOR (p:Transfer) REQUIRE p.globalStep IS UNIQUE;",
      "CREATE CONSTRAINT PaymentConstraint IF NOT EXISTS FOR (p:Payment) REQUIRE p.globalStep IS UNIQUE;"
    ]
    # One statement per run_cypher call.
    for constraint_statement in CONSTRAINTS:
        gds.run_cypher(constraint_statement)

In [12]:
if RELOAD_DATA:
    # Import clients together with their Phone, SSN and Email identifier nodes.
    # Every node and relationship is MERGEd, so identifiers shared by several
    # clients end up as a single node connected to each of them.
    load_clients_query = """
        LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/neo4j-field/graph-summit-apac-2023/main/data/clients.csv" AS row
        WITH row
        MERGE (c:Client { id: row.ID })
        SET c.name = row.NAME, c.isFraud = toBoolean(row.ISFRAUD)
        MERGE (p:Phone { phoneNumber: row.PHONENUMBER })
        MERGE (c)-[:HAS_PHONE]->(p)
        MERGE (s:SSN { ssn: row.SSN })
        MERGE (c)-[:HAS_SSN]->(s)
        MERGE (e:Email { email: row.EMAIL })
        MERGE (c)-[:HAS_EMAIL]->(e);
    """
    gds.run_cypher(load_clients_query)

In [13]:
if RELOAD_DATA:
    # Import merchants: one :Merchant node per id, carrying its name and high-risk flag.
    load_merchants_query = """
        LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/neo4j-field/graph-summit-apac-2023/main/data/merchants.csv" AS row
        WITH row
        MERGE (m:Merchant { id: row.ID })
        SET m.name = row.NAME, m.highRisk = toBoolean(row.HIGHRISK);
    """
    gds.run_cypher(load_merchants_query)

In [14]:
if RELOAD_DATA:
    # Import debit transactions: (Client)-[:PERFORMED]->(Transaction:Debit)-[:TO]->(Bank).
    # Unlike the other transaction loaders this one runs as a single transaction.
    load_debit_query = """
        LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/neo4j-field/graph-summit-apac-2023/main/data/debit.csv" AS row
        WITH row
        MERGE (b:Bank { id: row.IDDEST })
        SET b.name = row.NAMEDEST
        MERGE (c:Client { id: row.IDORIG })
        MERGE (t:Transaction:Debit { globalStep: row.GLOBALSTEP })
        SET t.amount = toFloat(row.AMOUNT), t.isFraud = toBoolean(row.ISFRAUD)
        MERGE (t)-[:TO]->(b)
        MERGE (c)-[:PERFORMED]->(t);
    """
    gds.run_cypher(load_debit_query)

In [15]:
if RELOAD_DATA:
    # Import cash-in transactions: (Client)-[:PERFORMED]->(Transaction:CashIn)-[:TO]->(Merchant).
    # The subquery commits every 10 CSV rows.
    load_cashin_query = """
        LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/neo4j-field/graph-summit-apac-2023/main/data/cashin.csv" AS row
        CALL {
            WITH row
            MERGE (m:Merchant { id: row.IDDEST })
            SET m.name = row.NAMEDEST
            MERGE (c:Client { id: row.IDORIG })
            MERGE (t:Transaction:CashIn { globalStep: row.GLOBALSTEP })
            SET t.amount = toFloat(row.AMOUNT), t.isFraud = toBoolean(row.ISFRAUD)
            MERGE (t)-[:TO]->(m)
            MERGE (c)-[:PERFORMED]->(t)
        } IN TRANSACTIONS OF 10 ROWS;
    """
    gds.run_cypher(load_cashin_query)

In [16]:
if RELOAD_DATA:
    # Import cash-out transactions: (Client)-[:PERFORMED]->(Transaction:CashOut)-[:TO]->(Merchant).
    # The originating client's name is (re)set from the CSV; commits every 10 rows.
    load_cashout_query = """
        LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/neo4j-field/graph-summit-apac-2023/main/data/cashout.csv" AS row
        CALL {
            WITH row
            MERGE (m:Merchant { id: row.IDDEST })
            SET m.name = row.NAMEDEST
            MERGE (c:Client { id: row.IDORIG })
            SET c.name = row.NAMEORIG
            MERGE (t:Transaction:CashOut { globalStep: row.GLOBALSTEP })
            SET t.amount = toFloat(row.AMOUNT), t.isFraud = toBoolean(row.ISFRAUD)
            MERGE (t)-[:TO]->(m)
            MERGE (c)-[:PERFORMED]->(t)
        } IN TRANSACTIONS OF 10 ROWS;
    """
    gds.run_cypher(load_cashout_query)

In [17]:
if RELOAD_DATA:
    # Import payments: (Client)-[:PERFORMED]->(Transaction:Payment)-[:TO]->(Merchant).
    # The FOREACH over a one- or zero-element list is a conditional: it adds the
    # :Mule label to the originating client only when TYPEORIG is 'MULE'.
    load_payment_query = """
        LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/neo4j-field/graph-summit-apac-2023/main/data/payment.csv" AS row
        CALL {
            WITH row
            MERGE (m:Merchant { id: row.IDDEST })
            SET m.name = row.NAMEDEST
            MERGE (c:Client { id: row.IDORIG })
            SET c.name = row.NAMEORIG
            MERGE (t:Transaction:Payment { globalStep: row.GLOBALSTEP })
            SET t.amount = toFloat(row.AMOUNT), t.isFraud = toBoolean(row.ISFRAUD)
            MERGE (t)-[:TO]->(m)
            MERGE (c)-[:PERFORMED]->(t)
            FOREACH (ignoreMe in CASE WHEN row.TYPEORIG = 'MULE' THEN [1] ELSE [] END | SET c :Mule)
        } IN TRANSACTIONS OF 10 ROWS;
    """
    gds.run_cypher(load_payment_query)

In [18]:
if RELOAD_DATA:
    # Import transfers: (Client)-[:PERFORMED]->(Transaction:Transfer)-[:TO]->(Client).
    # Both ends are clients; either one is labelled :Mule when its TYPE column says 'MULE'
    # (FOREACH over a one- or zero-element list acts as the conditional).
    load_transfer_query = """
    LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/neo4j-field/graph-summit-apac-2023/main/data/transfer.csv" AS row
    CALL {
        WITH row
        MERGE (cd:Client { id: row.IDDEST })
        SET cd.name = row.NAMEDEST
        MERGE (co:Client { id: row.IDORIG })
        SET co.name = row.NAMEORIG
        MERGE (t:Transaction:Transfer { globalStep: row.GLOBALSTEP })
        SET t.amount = toFloat(row.AMOUNT), t.isFraud = toBoolean(row.ISFRAUD)
        MERGE (t)-[:TO]->(cd)
        MERGE (co)-[:PERFORMED]->(t)
        FOREACH (ignoreMe in CASE WHEN row.TYPEDEST = 'MULE' THEN [1] ELSE [] END | SET cd :Mule)
        FOREACH (ignoreMe in CASE WHEN row.TYPEORIG = 'MULE' THEN [1] ELSE [] END | SET co :Mule)
    } IN TRANSACTIONS OF 10 ROWS;
    """
    gds.run_cypher(load_transfer_query)

In [19]:
if RELOAD_DATA:
    # Update data model: for every client, link the first and last transaction
    # (FIRST_TX / LAST_TX) and chain consecutive transactions with NEXT.
    # Requires the APOC plugin (apoc.coll.pairsMin) on the server.
    #
    # The loaders store globalStep exactly as read from the CSV, i.e. as a string,
    # so ordering on the raw property would be lexicographic ("10" before "9").
    # toInteger() yields the intended numeric order and is a no-op for values that
    # are already integers.
    gds.run_cypher(
    """
    MATCH (c:Client) with c.id as clientId
    CALL {
        WITH clientId
        MATCH (c:Client {id: clientId})-[:PERFORMED]->(tx:Transaction)
        WITH c, tx ORDER BY toInteger(tx.globalStep)
        WITH c, collect(tx) AS txs
        WITH c, txs, head(txs) AS _start, last(txs) AS _last

        MERGE (c)-[:FIRST_TX]->(_start)
        MERGE (c)-[:LAST_TX]->(_last)
        WITH c, apoc.coll.pairsMin(txs) AS pairs

        UNWIND pairs AS pair
          WITH pair[0] AS a, pair[1] AS b
          MERGE (a)-[n:NEXT]->(b)
    } IN TRANSACTIONS OF 10 ROWS;
    """
    )

In [20]:
# Name of the in-memory GDS graph used for the first WCC run.
graphName = 'wccGroups'
# Passed to WCC as minComponentSize and to the follow-up queries as the $gs threshold.
fraudGroupMinSize = 5

In [21]:
# Drop a projection left over from an earlier run so the name can be reused.
existing_graph = gds.graph.exists(graphName)
if existing_graph["exists"]:
    stale_graph = gds.graph.get(graphName)
    gds.graph.drop(stale_graph)

In [22]:
# Native projection of clients and their identifiers (SSN, Email, Phone) with the
# relationships that attach each identifier to a client.
identity_labels = ['Client', 'SSN', 'Email', 'Phone']
identity_relationships = ['HAS_SSN', 'HAS_EMAIL', 'HAS_PHONE']
projection, projectionPandas = gds.graph.project(
    graphName,
    identity_labels,
    identity_relationships,
)

In [23]:
# Print the projection statistics (node/relationship counts, timing) that
# gds.graph.project returned alongside the graph object.
print(projectionPandas)

nodeProjection            {'Email': {'label': 'Email', 'properties': {}}...
relationshipProjection    {'HAS_SSN': {'orientation': 'NATURAL', 'indexI...
graphName                                                         wccGroups
nodeCount                                                              8047
relationshipCount                                                      7749
projectMillis                                                           696
Name: 0, dtype: object


In [24]:
# Stream the weakly connected components and list the ten largest by node count.
result = gds.wcc.stream(projection)
component_sizes = result.groupby(['componentId']).count()
component_sizes.sort_values('nodeId', ascending=False).head(10)

Unnamed: 0_level_0,nodeId
componentId,Unnamed: 1_level_1
5649,25
5207,25
5231,23
5191,23
5221,23
6164,22
5218,21
5698,21
5631,21
5626,21


In [25]:
# Write the component id back to the database as `fraud_group`, restricted by
# the minComponentSize setting taken from fraudGroupMinSize.
result = gds.wcc.write(
    projection,
    writeProperty='fraud_group',
    minComponentSize=fraudGroupMinSize,
)
result

writeMillis                                                            143
nodePropertiesWritten                                                 2631
componentCount                                                        1527
componentDistribution    {'p99': 20, 'min': 4, 'max': 25, 'mean': 5.269...
postProcessingMillis                                                     8
preProcessingMillis                                                      0
computeMillis                                                            9
configuration            {'jobId': '28d79a8b-7f89-4087-84eb-526a582b299...
Name: 0, dtype: object

In [26]:
# Index the freshly written fraud_group property; later queries match clients on it.
create_fraud_group_index = "CREATE INDEX ClientFraudIndex IF NOT EXISTS FOR (c:Client) on c.fraud_group;"
gds.run_cypher(create_fraud_group_index)

In [27]:
# Histogram of fraud groups: how many groups exist for each client count per group.
group_size_histogram_query = """
  MATCH (c:Client) WHERE c.fraud_group IS NOT NULL
  WITH c.fraud_group AS groupId, collect(c.id) AS members
  WITH groupId, size(members) AS groupSize
  WITH collect(groupId) AS groupsOfSize, groupSize
  RETURN groupSize, size(groupsOfSize) AS numOfGroups
  ORDER BY groupSize DESC;
"""
result = gds.run_cypher(group_size_histogram_query)
result.head(10)

Unnamed: 0,groupSize,numOfGroups
0,16,2
1,14,3
2,13,1
3,12,7
4,11,3
5,10,19
6,9,14
7,8,20
8,7,22
9,6,30


In [28]:
# For fraud groups with more than $gs clients, count by label the transactions
# that connect a group member to a client outside every fraud group.
txn_type_query = """
  MATCH (c:Client) WHERE c.fraud_group IS NOT NULL
  WITH c.fraud_group AS groupId, collect(c.id) AS members
  WITH groupId, size(members) AS groupSize WHERE groupSize > $gs
  MATCH (:Client {fraud_group:groupId})-[]-(txn:Transaction)-[]-(c:Client)      // Build our network as before
  WHERE c.fraud_group IS NULL
  UNWIND labels(txn) AS txnType                                                 // Since our PaySim demo stacks labels, let's look at our txn reference
  RETURN distinct(txnType), count(txnType);
"""
txn_type_params = {'gs': fraudGroupMinSize}
result = gds.run_cypher(txn_type_query, params=txn_type_params)
result

Unnamed: 0,txnType,count(txnType)
0,Transaction,3535
1,Transfer,3535


In [29]:
# Flag group members and their one-transaction-away counterparts as suspects and
# connect each pair directly with a TRANSACTED_WITH relationship.
mark_suspects_query = """
  MATCH (c:Client) WHERE c.fraud_group IS NOT NULL
  WITH c.fraud_group AS groupId, collect(c.id) AS members
  WITH groupId, size(members) AS groupSize WHERE groupSize > $gs
  MATCH (c1:Client {fraud_group:groupId})-[]-(t:Transaction)-[]-(c2:Client)     // Expand our search to Clients one Transaction away
  WHERE c2.fraud_group IS NULL
  SET c1.suspect = true, c2.suspect = true                                      // Set these Clients as suspects for easier recall
  MERGE (c1)-[r:TRANSACTED_WITH]->(c2)                                          // Merge a relationship directly between Clients and copy some of the Transaction properties over in case we need them.
  ON CREATE SET r += t
  RETURN count(r);
"""
mark_suspects_params = {'gs': fraudGroupMinSize}
result = gds.run_cypher(mark_suspects_query, params=mark_suspects_params)
result

Unnamed: 0,count(r)
0,3535


In [30]:
# Name of the in-memory GDS graph used for the second WCC run (suspect clients).
graphName2 = 'wccGroups2'

In [31]:
# Drop a projection left over from an earlier run so the name can be reused.
existing_graph2 = gds.graph.exists(graphName2)
if existing_graph2["exists"]:
    stale_graph2 = gds.graph.get(graphName2)
    gds.graph.drop(stale_graph2)

In [32]:
# Cypher projection: suspect clients as nodes, their TRANSACTED_WITH links as relationships.
suspect_node_query = 'MATCH (c:Client {suspect:true}) RETURN id(c) AS id'
suspect_relationship_query = 'MATCH (c1:Client {suspect:true})-[r:TRANSACTED_WITH]->(c2:Client) RETURN id(c1) AS source, id(c2) as target'
projection2, projectionPandas2 = gds.graph.project.cypher(
    graphName2,
    suspect_node_query,
    suspect_relationship_query,
)

In [33]:
# Print the statistics returned by the Cypher projection of the suspect graph.
print(projectionPandas2)

nodeQuery            MATCH (c:Client {suspect:true}) RETURN id(c) A...
relationshipQuery    MATCH (c1:Client {suspect:true})-[r:TRANSACTED...
graphName                                                   wccGroups2
nodeCount                                                         1278
relationshipCount                                                 1165
projectMillis                                                       24
Name: 0, dtype: object


In [34]:
# Second WCC pass over the suspect graph; component ids are written as `fraud_group_2`.
result = gds.wcc.write(
    projection2,
    writeProperty='fraud_group_2',
)

In [35]:
# Index the freshly written fraud_group_2 property; later queries match clients on it.
create_fraud_group_2_index = "CREATE INDEX ClientFraud2Index IF NOT EXISTS FOR (c:Client) on c.fraud_group_2;"
gds.run_cypher(create_fraud_group_2_index)

In [36]:
# Size of every second-level fraud group, largest first; show the top five.
second_group_sizes_query = """
MATCH (c:Client) WHERE c.fraud_group_2 IS NOT NULL
WITH c.fraud_group_2 AS secondGroupId, collect(c.id) AS members
RETURN secondGroupId, size(members) AS groupSize
ORDER BY groupSize DESC;
"""
result = gds.run_cypher(second_group_sizes_query)
result.head(5)

Unnamed: 0,secondGroupId,groupSize
0,56,29
1,68,28
2,2,26
3,38,23
4,14,23


In [37]:
# Name of the in-memory GDS graph used for the betweenness centrality run.
graphName3 = 'betweenness'

In [38]:
# Drop a projection left over from an earlier run so the name can be reused.
existing_graph3 = gds.graph.exists(graphName3)
if existing_graph3["exists"]:
    stale_graph3 = gds.graph.get(graphName3)
    gds.graph.drop(stale_graph3)

In [39]:
# Cypher projection of the single largest second-level fraud group.
# Both queries start by selecting that group's id; only the final MATCH/RETURN differs,
# so the shared part is written once and prepended to each tail.
largest_group_prefix = 'MATCH (c:Client) WHERE c.fraud_group_2 IS NOT NULL WITH c.fraud_group_2 AS secondGroupId, collect(c.id) AS members WITH secondGroupId, size(members) AS groupSize ORDER BY groupSize DESC LIMIT 1 WITH secondGroupId '
largest_group_node_query = largest_group_prefix + 'MATCH (c:Client {fraud_group_2:secondGroupId})-[r:TRANSACTED_WITH]-(c2:Client) RETURN id(c) AS id'
largest_group_relationship_query = largest_group_prefix + 'MATCH (c1:Client {fraud_group_2:secondGroupId})-[:TRANSACTED_WITH]-(c2:Client) RETURN id(c1) AS source, id(c2) AS target'
projection3, projectionPandas3 = gds.graph.project.cypher(
    graphName3,
    largest_group_node_query,
    largest_group_relationship_query,
)

In [40]:
# Print the statistics returned by the Cypher projection of the betweenness graph.
print(projectionPandas3)

nodeQuery            MATCH (c:Client) WHERE c.fraud_group_2 IS NOT ...
relationshipQuery    MATCH (c:Client) WHERE c.fraud_group_2 IS NOT ...
graphName                                                  betweenness
nodeCount                                                           29
relationshipCount                                                   56
projectMillis                                                       20
Name: 0, dtype: object


In [41]:
# Compute betweenness centrality on the projected group and store it as `score`.
result = gds.betweenness.write(
    projection3,
    writeProperty='score',
)
result

nodePropertiesWritten                                                    29
writeMillis                                                               5
centralityDistribution    {'p99': 520.003662109375, 'min': 0.0, 'max': 5...
postProcessingMillis                                                     19
preProcessingMillis                                                       0
computeMillis                                                             2
configuration             {'jobId': 'b7d24474-8d48-4576-ae14-909b1b1fd8d...
Name: 0, dtype: object

In [42]:
# Rank the clients of the projected group by betweenness per TRANSACTED_WITH link:
# the raw score is divided by the number of such relationships the client has.
# The graph name is passed as a parameter (from graphName3) instead of being repeated
# as a literal, so renaming the projection cannot leave this query pointing at a
# graph that does not exist.
gds.run_cypher("""
  CALL gds.betweenness.stream($graphName) YIELD nodeId, score
  WITH gds.util.asNode(nodeId) AS c, score WHERE score > 0                       // Keep only nodes with a non-zero score
  MATCH (c)-[r:TRANSACTED_WITH]-(:Client)                                        // Retrieve the relationships
  WITH c.name AS name, score, collect(r) AS txns                                 // Collect and count the number of relationships
  WITH name, score AS original, score/size(txns) AS newScore
  RETURN name, newScore, original ORDER BY newScore DESC;
""", params={'graphName': graphName3})

Unnamed: 0,name,newScore,original
0,Austin Cotton,173.333333,520.0
1,Kevin Bridges,171.0,342.0
2,Tyler Turner,171.0,342.0
3,Scarlett Stephenson,147.0,294.0
4,John Cochran,46.75,374.0
5,Dominic Hatfield,42.0,294.0
6,Brooklyn Riddle,41.777778,376.0
7,Peyton Keller,27.0,54.0
