In [20]:

DB_ULR = "neo4j://localhost:7687"
DB_USER = "neo4j"
DB_PASS = "test1234"
DB_NAME = "e2edemo"

# Create driver
Also set the DB_NAME

Hint: Use conda if you get  No module named 'pyarrow._flight' on Apple Silicon

In [21]:
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from neo4j import GraphDatabase # Python database driver
import pyarrow
import pyarrow.flight
from graphdatascience import GraphDataScience # Python GDS client

# And some cermony to create the driver and gds objects
driver = GraphDatabase.driver(DB_ULR, auth=(DB_USER, DB_PASS))
gds = GraphDataScience(DB_ULR, auth=(DB_USER, DB_PASS))
gds.set_database(DB_NAME)

In [22]:
# Prepare databases needed
# Our target database sould be removed
with driver.session(database = "system") as session:
    result = session.write_transaction( lambda tx: 
        tx.run(
            "DROP DATABASE {dbname} IF EXISTS".format(dbname = DB_NAME)
        ).data()
    )

In [23]:
# We need one database during the import (just so we can reference the projection in the graph catalogue)
with driver.session(database = "system") as session:
    result = session.write_transaction( lambda tx: 
        tx.run(
            "CREATE DATABASE neo4j IF NOT EXISTS"
        ).data()
    )

In [24]:
# Check if apache arrow is enabled (if not, add gds.arrow.enabled=true to neo4j.conf)
gds.set_database("neo4j")
arrow_running = gds.run_cypher("call gds.debug.arrow() yield running return running")
assert arrow_running["running"][0] == True, "Arrow not running"


#  Load data

In [25]:
# Read csv file
# Source https://www.kaggle.com/retailrocket/ecommerce-dataset?select=events.csv
csv = pd.read_csv('/Users/haklof/datasets/events.csv')
print(csv)

             timestamp  visitorid event  itemid  transactionid
0        1433221332117     257597  view  355908            NaN
1        1433224214164     992329  view  248676            NaN
2        1433221999827     111016  view  318965            NaN
3        1433221955914     483717  view  253185            NaN
4        1433221337106     951259  view  367447            NaN
...                ...        ...   ...     ...            ...
2756096  1438398785939     591435  view  261427            NaN
2756097  1438399813142     762376  view  115946            NaN
2756098  1438397820527    1251746  view   78144            NaN
2756099  1438398530703    1184451  view  283392            NaN
2756100  1438400163914     199536  view  152913            NaN

[2756101 rows x 5 columns]


# Construct graph using Apache Arrow

In [26]:
# Select all unique visitors
visitors = csv['visitorid'].drop_duplicates().dropna()
print(visitors)

0           257597
1           992329
2           111016
3           483717
4           951259
            ...   
2756083    1392454
2756093     226214
2756096     591435
2756097     762376
2756099    1184451
Name: visitorid, Length: 1407580, dtype: int64


In [27]:
# Select all unique Items
items = csv['itemid'].drop_duplicates().dropna()
print(items)

0          355908
1          248676
2          318965
3          253185
4          367447
            ...  
2756002     19206
2756039    172413
2756042       613
2756060     52086
2756062    177353
Name: itemid, Length: 235061, dtype: int64


In [28]:
# Select all VIEWD relationships
viewed = csv[csv['event'] == 'view'][['visitorid','itemid', 'timestamp']]
print(viewed)

         visitorid  itemid      timestamp
0           257597  355908  1433221332117
1           992329  248676  1433224214164
2           111016  318965  1433221999827
3           483717  253185  1433221955914
4           951259  367447  1433221337106
...            ...     ...            ...
2756096     591435  261427  1438398785939
2756097     762376  115946  1438399813142
2756098    1251746   78144  1438397820527
2756099    1184451  283392  1438398530703
2756100     199536  152913  1438400163914

[2664312 rows x 3 columns]


In [29]:
# Select all ADDED relationships
added = csv[csv['event'] == 'addtocart'][['visitorid','itemid', 'timestamp']]
print(added)

         visitorid  itemid      timestamp
17          287857    5206  1433223236124
19          158090   10572  1433221078505
63         1193904  255275  1433223543021
112         599528  356475  1433221941632
179         105775  312728  1433220880956
...            ...     ...            ...
2755956     831605   57810  1438400400805
2756056      10670  419736  1438398156086
2756074     144106  141241  1438400994744
2756078     804736  447661  1438399807937
2756090     804736  346534  1438399811281

[69332 rows x 3 columns]


In [30]:
# Select all BOUGHT relationships
bought = csv[csv['event'] == 'transaction'][['visitorid','itemid', 'timestamp', 'transactionid']]
bought.transactionid = bought.transactionid.astype(int)
print(bought)

         visitorid  itemid      timestamp  transactionid
130         599528  356475  1433222276276           4000
304         121688   15335  1433193500981          11117
418         552148   81345  1433193915008           5444
814         102019  150318  1433176736375          13556
843         189384  310791  1433174518180           7244
...            ...     ...            ...            ...
2755294    1050575   31640  1438377176570           8354
2755349     861299  456602  1438379878779           3643
2755508     855941  235771  1438357730123           4385
2755603     548772   29167  1438355560300          13872
2755607    1051054  312728  1438358989163          17579

[22457 rows x 4 columns]


In [31]:
nodes_id = pd.DataFrame(
     {
        "id": visitors.tolist() + items.tolist(),
        "labels": ['Visitor'] * visitors.shape[0] + ['Item'] * items.shape[0]
     }
).reset_index()
nodes_id.head()

Unnamed: 0,index,id,labels
0,0,257597,Visitor
1,1,992329,Visitor
2,2,111016,Visitor
3,3,483717,Visitor
4,4,951259,Visitor


In [32]:
# Nodes DataFrame
nodes = pd.DataFrame(
    {
        "nodeId": nodes_id["index"].tolist(),
        "labels": nodes_id["labels"].tolist(),
        "id": nodes_id["id"].tolist()
    }
)
nodes.head()

Unnamed: 0,nodeId,labels,id
0,0,Visitor,257597
1,1,Visitor,992329
2,2,Visitor,111016
3,3,Visitor,483717
4,4,Visitor,951259


In [33]:
# Relationships DataFrame
relationships = pd.DataFrame(
    {
        "sourceNodeId": 
            viewed.merge(nodes_id[nodes_id['labels'] == "Visitor" ], left_on="visitorid", right_on="id")["index"].to_list()
             + added.merge(nodes_id[nodes_id['labels'] == "Visitor" ], left_on="visitorid", right_on="id")["index"].to_list()
             + bought.merge(nodes_id[nodes_id['labels'] == "Visitor" ], left_on="visitorid", right_on="id")["index"].to_list(),
        "targetNodeId": 
            viewed.merge(nodes_id[nodes_id['labels'] == "Item" ], left_on="itemid", right_on="id")["index"].to_list()
             + added.merge(nodes_id[nodes_id['labels'] == "Item" ], left_on="itemid", right_on="id")["index"].to_list()
             + bought.merge(nodes_id[nodes_id['labels'] == "Item" ], left_on="itemid", right_on="id")["index"].to_list(), 
        "relationshipType": ['VIEWED']*viewed.shape[0] + ['ADDED']*added.shape[0] + ['BOUGHT']*bought.shape[0],
        "timestamp": viewed['timestamp'].to_list() + added['timestamp'].to_list() + bought['timestamp'].to_list(),
        "transactionid": [0]*viewed.shape[0] + [0]*added.shape[0] + bought['transactionid'].to_list()
    }
)

In [34]:
# Construct the graph
gds.set_database("neo4j")
G = gds.alpha.graph.construct(
    'items_raw',
    nodes,
    relationships
)

In [35]:
# Persist graph into target database
gds.set_database("neo4j")
gds.graph.export(G, dbName=DB_NAME)

dbName                         e2edemo
graphName                    items_raw
nodeCount                      1642641
relationshipCount              2756101
relationshipTypeCount                3
nodePropertyCount              3285282
relationshipPropertyCount      5512202
writeMillis                      11143
Name: 0, dtype: object

In [36]:
# Create entry for the target database in the system database
with driver.session(database = "system") as session:
    result = session.write_transaction( lambda tx: 
        tx.run(
            "CREATE DATABASE {dbname}".format(dbname = DB_NAME)
        ).data()
    )

In [37]:
gds.set_database("neo4j")
gds.graph.drop(G)

graphName                                                    items_raw
database                                                         neo4j
memoryUsage                                                           
sizeInBytes                                                         -1
nodeCount                                                      1642641
relationshipCount                                              2756101
configuration                                                       {}
density                                                       0.000001
creationTime                       2022-09-13T10:25:40.820057000+02:00
modificationTime                   2022-09-13T10:25:40.819903000+02:00
schema               {'graphProperties': {}, 'relationships': {'VIE...
Name: 0, dtype: object

In [38]:
# Create indexes and constraints
with driver.session(database = DB_NAME) as session:
    session.write_transaction( lambda tx: 
        tx.run(
            "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Visitor) REQUIRE (n.id) IS NODE KEY"
        ).consume()
    )
    session.write_transaction( lambda tx: 
        tx.run(
            "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Item) REQUIRE (n.id) IS NODE KEY"
        ).consume()     
    )

# Some basic queries

In [39]:
# Top sellers
with driver.session(database = DB_NAME) as session:
    result = session.read_transaction( lambda tx: 
        tx.run(
            """
            MATCH (i:Item)<-[:BOUGHT]-()
            RETURN  i.id as itemid, 
                    count(*) as times_bought,
                    size( (i:Item)<-[:ADDED]-() ) as times_added,
                    size( (i:Item)<-[:VIEWED]-() ) as times_viewed
            ORDER BY times_bought desc limit $limit
            """,
            limit = 10
        ).data()
    )
    df = pd.DataFrame(result)
    print(df)

   itemid  times_bought  times_added  times_viewed
0  461686           133          306          2539
1  119736            97           44           752
2  213834            92           17           293
3    7943            46           97          1346
4  312728            46          162           947
5  445351            45           89           939
6   48030            41           95           986
7  248455            38           52           575
8  420960            38           60           796
9   17478            37           72           631


In [40]:
# Collaberative filtering
# Suggest what other Items Visitors buy for a Visitor veiwing an Item (where the Visitor has not added/bought the Item already)
with driver.session(database = DB_NAME) as session:
    result = session.read_transaction( lambda tx: 
        tx.run(
            """
            MATCH (i:Item{id: $itemid})<-[:VIEWED]-(visitor)-[bought:BOUGHT]->(otherItem)
            WHERE i <> otherItem
            AND NOT (:Visitor{id: $visitorid})-[:BOUGHT|ADDED]->(otherItem)
            RETURN  otherItem.id as product_bought, 
                    count(distinct bought) as times_bought
            ORDER BY times_bought desc limit $limit
            """,
            limit = 10, itemid = 461686, visitorid=201110
        ).data()
    )
    df = pd.DataFrame(result)
    print(df)

   product_bought  times_bought
0          259227             2
1          289915             2
2          133907             2
3          360487             1
4          167126             1
5          200854             1
6          393419             1
7          370390             1
8          429907             1
9          222294             1


In [45]:
# Collaberative filtering (alternative)
# Suggest based on what other items that were checked out in same transaction
with driver.session(database = DB_NAME) as session:
    result = session.read_transaction( lambda tx: 
        tx.run(
            """
            MATCH (i:Item{id: $itemid})<-[b1:BOUGHT]-(visitor)-[b2:BOUGHT]->(otherItem)
            WHERE i <> otherItem
            //AND b1.transactionid = b2.transactionid
            AND NOT (:Visitor{id: $visitorid})-[:BOUGHT|ADDED]->(otherItem)
            RETURN  otherItem.id as product_bought, 
                    count(distinct b1.transactionid) as times_bought_together
            ORDER BY times_bought_together desc limit $limit
            """,
            limit = 10, itemid = 461686, visitorid=201110
        ).data()
    )
    df = pd.DataFrame(result)
    print(df)

   product_bought  times_bought_together
0          179494                      3
1          245449                      3


# Questions so far?

# Graph data science

In [46]:
# Do we have any "abnormal visitors"
with driver.session(database = DB_NAME) as session:
    result = session.read_transaction( lambda tx: 
        tx.run(
            """
            MATCH (v:Visitor)-[:VIEWED|ADDED|BOUGHT]->()
            WITH v, 
                 count(*) as number_of_events
            RETURN  max(number_of_events) as `max`,
                    percentileCont(number_of_events, 0.999) as `p0.999`,
                    percentileCont(number_of_events, 0.99) as `p0.99`,
                    percentileCont(number_of_events, 0.9) as `p0.90`,
                    percentileCont(number_of_events, 0.75) as `p0.75`,
                    min(number_of_events) as `min`
            """,
            limit = 10
        ).data()
    )
    df = pd.DataFrame(result)
    print(df)

    max  p0.999  p0.99  p0.90  p0.75  min
0  7757    47.0   13.0    3.0      2    1


In [47]:
# Re-label abnormal visitors
with driver.session(database = DB_NAME) as session:
    result = session.write_transaction( lambda tx: 
        tx.run(
            """
            MATCH (v:Visitor)-[:VIEWED|ADDED|BOUGHT]->()
            WITH v, 
                 count(*) as number_of_events
            WHERE number_of_events > 50
            SET v:AbnormalVisitor
            REMOVE v:Visitor
            RETURN count(*) as number_of_abnormal_visitors
            """,
            limit = 10
        ).data()
    )
    df = pd.DataFrame(result)
    print(df)

   number_of_abnormal_visitors
0                         1225


# What Items are important?

In [48]:
gds.set_database(DB_NAME)
G2, project_stats = gds.graph.project.cypher(
    'items',
    """
    MATCH (i:Item) return id(i) as id
    """,
    """
        MATCH (i1:Item)<-[r1:VIEWED]-(v:Visitor)-[r2:VIEWED|ADDED|BOUGHT]->(i2:Item)
        WHERE r1.timestamp<r2.timestamp
        WITH i1, i2, r1, case type(r2) when "BOUGHT" then 1.0 when "ADDED" then 0.7 else 0.2 end as weight
        RETURN id(i1) as target, id(i2) as source, weight
     """,
    readConcurrency=16
)

Loading:   0%|          | 0/100 [00:00<?, ?%/s]

ClientError: Failed to invoke procedure `gds.beta.listProgress`: Caused by: java.lang.IllegalArgumentException: count is negative: -2

In [49]:
gds.set_database(DB_NAME)
G2 = gds.graph.get('items')

In [50]:
pagerankRes = gds.pageRank.write(G2, maxIterations=200, relationshipWeightProperty='weight', concurrency=16, writeProperty='pagerank')

PageRank:   0%|          | 0/100 [00:00<?, ?%/s]

In [51]:
print(pagerankRes)

writeMillis                                                             416
nodePropertiesWritten                                                235061
ranIterations                                                           114
didConverge                                                            True
centralityDistribution    {'p99': 3.1381826400756836, 'min': 0.149999618...
postProcessingMillis                                                    106
preProcessingMillis                                                       0
computeMillis                                                          1285
configuration             {'maxIterations': 200, 'writeConcurrency': 16,...
Name: 0, dtype: object


In [52]:
G2.drop()

graphName                                                        items
database                                                       e2edemo
memoryUsage                                                           
sizeInBytes                                                         -1
nodeCount                                                       235061
relationshipCount                                              4986625
configuration        {'relationshipQuery': 'MATCH (i1:Item)<-[r1:VI...
density                                                        0.00009
creationTime                       2022-09-13T10:34:44.654322000+02:00
modificationTime                   2022-09-13T10:34:48.918867000+02:00
schema               {'graphProperties': {}, 'relationships': {'__A...
Name: 0, dtype: object

In [53]:
# Can we suggest an Item with high probability of conversion (central to conversion)
with driver.session(database = DB_NAME) as session:
    result = session.read_transaction( lambda tx: 
        tx.run(
            """
            MATCH (i:Item{id: $itemid})<-[:VIEWED]-(visitor)-[bought:BOUGHT]->(otherItem)
            WHERE i <> otherItem
            AND NOT (:Visitor{id: $visitorid})-[:BOUGHT|ADDED]->(otherItem)
            RETURN  otherItem.id as product_bought,
                    avg(otherItem.pagerank) as rank,
                    count(distinct bought) as times_bought
            ORDER BY rank desc limit $limit
            """,
            limit = 10, itemid = 461686, visitorid=684514
        ).data()
    )
    df = pd.DataFrame(result)
    print(df)

   product_bought      rank  times_bought
0          222294  2.308461             1
1          133907  1.455093             2
2          360487  1.414051             1
3          444931  0.554927             1
4          393419  0.408643             1
5          167126  0.288781             1
6          259227  0.282716             2
7          200854  0.246120             1
8          429907  0.225957             1
9          289915  0.212452             2


In [None]:
# For presentation, create aggregated viewed, added and bougtht relationships?
