In [1]:

import os

# Connection settings for the demo database.
# Each value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook; the fallbacks are
# the original local-demo values.
# NOTE: the name DB_ULR (sic) is kept as-is because later cells reference it.
DB_ULR = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
DB_USER = os.environ.get("NEO4J_USERNAME", "neo4j")
DB_PASS = os.environ.get("NEO4J_PASSWORD", "test1234")
DB_NAME = os.environ.get("NEO4J_DATABASE", "e2edemo")

# Create driver
Create the Neo4j driver and the GDS client, and set DB_NAME as the database used by the GDS client.

In [2]:
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from neo4j import GraphDatabase # Python database driver
from graphdatascience import GraphDataScience # Python GDS client

# And some ceremony to create the two client objects used by the rest of the notebook:
#   driver - Neo4j Python driver, used by the cells that run plain Cypher in sessions
#   gds    - Graph Data Science client, used by the graph projection / PageRank cells
driver = GraphDatabase.driver(DB_ULR, auth=(DB_USER, DB_PASS))
gds = GraphDataScience(DB_ULR, auth=(DB_USER, DB_PASS))
# Point the GDS client at DB_NAME (driver sessions pass the database name explicitly instead)
gds.set_database(DB_NAME)

# Create database and schema

In [5]:
# Create the database if it does not exist yet.
# An existing database is left untouched (it is NOT replaced).
# Administration commands are run against the "system" database.
with driver.session(database = "system") as session:
    result = session.write_transaction( lambda tx: 
        tx.run(
            # Documented Cypher form: CREATE DATABASE <name> IF NOT EXISTS
            # (the IF NOT EXISTS clause follows the database name).
            "CREATE DATABASE {dbname} IF NOT EXISTS".format(dbname = DB_NAME)
        ).data()
    )
    # The command returns no rows, so this prints an empty DataFrame
    df = pd.DataFrame(result)
    print(df)

Empty DataFrame
Columns: []
Index: []


In [6]:
# Schema setup: a node key constraint on `id` for Visitor nodes and for Item nodes.
# IF NOT EXISTS makes the statements safe to run again.
with driver.session(database=DB_NAME) as session:
    for statement in (
        "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Visitor) REQUIRE (n.id) IS NODE KEY",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Item) REQUIRE (n.id) IS NODE KEY",
    ):
        # One write transaction per statement; consume() discards the result stream.
        # The default argument pins the current statement to the transaction function.
        session.write_transaction(lambda tx, cypher=statement: tx.run(cypher).consume())

#  Load data

In [7]:
# Read the events csv file
# Source https://www.kaggle.com/retailrocket/ecommerce-dataset?select=events.csv
# The file location can be overridden with the EVENTS_CSV environment variable;
# the fallback is the original hard-coded local path.
csv = pd.read_csv(os.environ.get('EVENTS_CSV', '/Users/haklof/datasets/events.csv'))
print(csv)

             timestamp  visitorid event  itemid  transactionid
0        1433221332117     257597  view  355908            NaN
1        1433224214164     992329  view  248676            NaN
2        1433221999827     111016  view  318965            NaN
3        1433221955914     483717  view  253185            NaN
4        1433221337106     951259  view  367447            NaN
...                ...        ...   ...     ...            ...
2756096  1438398785939     591435  view  261427            NaN
2756097  1438399813142     762376  view  115946            NaN
2756098  1438397820527    1251746  view   78144            NaN
2756099  1438398530703    1184451  view  283392            NaN
2756100  1438400163914     199536  view  152913            NaN

[2756101 rows x 5 columns]


In [8]:
# Distinct visitor ids from the events table (duplicates removed, then missing values dropped)
visitors = (
    csv['visitorid']
    .drop_duplicates()
    .dropna()
)
print(visitors)

0           257597
1           992329
2           111016
3           483717
4           951259
            ...   
2756083    1392454
2756093     226214
2756096     591435
2756097     762376
2756099    1184451
Name: visitorid, Length: 1407580, dtype: int64


In [9]:
# Load the Visitor nodes in 10 batches: one session and one write transaction per batch.
# MERGE keeps the load repeatable: a visitor id that already exists is not created again.
for visitor_batch in np.array_split(visitors, 10):
    with driver.session(database=DB_NAME) as session:
        result = session.write_transaction(
            lambda tx: tx.run(
                """
                UNWIND $visitors as visitorId
                MERGE (:Visitor{id: visitorId})
                RETURN count(*) as nodesCreated
                """,
                visitors=visitor_batch.to_list(),
            ).data()
        )
    # count(*) counts the rows processed in this batch
    df = pd.DataFrame(result)
    print(df)

   nodesCreated
0        140758
   nodesCreated
0        140758
   nodesCreated
0        140758
   nodesCreated
0        140758
   nodesCreated
0        140758
   nodesCreated
0        140758
   nodesCreated
0        140758
   nodesCreated
0        140758
   nodesCreated
0        140758
   nodesCreated
0        140758


In [10]:
# Distinct item ids from the events table (duplicates removed, then missing values dropped)
items = (
    csv['itemid']
    .drop_duplicates()
    .dropna()
)
print(items)

0          355908
1          248676
2          318965
3          253185
4          367447
            ...  
2756002     19206
2756039    172413
2756042       613
2756060     52086
2756062    177353
Name: itemid, Length: 235061, dtype: int64


In [11]:
# Load all Item nodes in a single write transaction.
# MERGE keeps the load repeatable: an item id that already exists is not created again.
with driver.session(database=DB_NAME) as session:
    result = session.write_transaction(
        lambda tx: tx.run(
            """
            UNWIND $items as itemId
            MERGE (:Item{id: itemId})
            RETURN count(*) as nodesCreated
            """,
            items=items.to_list(),
        ).data()
    )
# count(*) counts the rows processed
df = pd.DataFrame(result)
print(df)

   nodesCreated
0        235061


In [12]:
# Select all VIEWED relationships: the 'view' events, reduced to visitor, item and timestamp
viewed = csv.loc[csv['event'] == 'view', ['visitorid', 'itemid', 'timestamp']]
print(viewed)

         visitorid  itemid      timestamp
0           257597  355908  1433221332117
1           992329  248676  1433224214164
2           111016  318965  1433221999827
3           483717  253185  1433221955914
4           951259  367447  1433221337106
...            ...     ...            ...
2756096     591435  261427  1438398785939
2756097     762376  115946  1438399813142
2756098    1251746   78144  1438397820527
2756099    1184451  283392  1438398530703
2756100     199536  152913  1438400163914

[2664312 rows x 3 columns]


In [13]:
# Load the VIEWED relationships in 20 batches: one session and one write transaction per batch.
# Each record is matched to its existing Item and Visitor nodes by id; the relationship is
# MERGEd on (visitor, item, timestamp).
for view_batch in np.array_split(viewed, 20):
    with driver.session(database=DB_NAME) as session:
        result = session.write_transaction(
            lambda tx: tx.run(
                """
                UNWIND $data as rel
                MATCH (i:Item{id: rel.itemid}), (v:Visitor{id: rel.visitorid})
                MERGE (v)-[:VIEWED{timestamp: rel.timestamp}]->(i)
                RETURN count(*) as relsCreated
                """,
                data=view_batch.to_dict('records'),
            ).data()
        )
    # count(*) counts the rows processed in this batch
    df = pd.DataFrame(result)
    print(df)

   relsCreated
0       133216
   relsCreated
0       133216
   relsCreated
0       133216
   relsCreated
0       133216
   relsCreated
0       133216
   relsCreated
0       133216
   relsCreated
0       133216
   relsCreated
0       133216
   relsCreated
0       133216
   relsCreated
0       133216
   relsCreated
0       133216
   relsCreated
0       133216
   relsCreated
0       133215
   relsCreated
0       133215
   relsCreated
0       133215
   relsCreated
0       133215
   relsCreated
0       133215
   relsCreated
0       133215
   relsCreated
0       133215
   relsCreated
0       133215


In [14]:
# Select all ADDED relationships: the 'addtocart' events, reduced to visitor, item and timestamp
added = csv.loc[csv['event'] == 'addtocart', ['visitorid', 'itemid', 'timestamp']]
print(added)

         visitorid  itemid      timestamp
17          287857    5206  1433223236124
19          158090   10572  1433221078505
63         1193904  255275  1433223543021
112         599528  356475  1433221941632
179         105775  312728  1433220880956
...            ...     ...            ...
2755956     831605   57810  1438400400805
2756056      10670  419736  1438398156086
2756074     144106  141241  1438400994744
2756078     804736  447661  1438399807937
2756090     804736  346534  1438399811281

[69332 rows x 3 columns]


In [15]:
# Load all ADDED relationships in a single write transaction.
# Each record is matched to its existing Item and Visitor nodes by id.
# CREATE is used here, so running this cell again adds the relationships a second time.
with driver.session(database=DB_NAME) as session:
    result = session.write_transaction(
        lambda tx: tx.run(
            """
            UNWIND $data as rel
            MATCH (i:Item{id: rel.itemid}), (v:Visitor{id: rel.visitorid})
            CREATE (v)-[:ADDED{timestamp: rel.timestamp}]->(i)
            RETURN count(*) as relsCreated
            """,
            data=added.to_dict('records'),
        ).data()
    )
df = pd.DataFrame(result)
print(df)

   relsCreated
0        69332


In [16]:
# Select all BOUGHT relationships: the 'transaction' events, reduced to visitor, item,
# timestamp and transactionid.
# transactionid is NaN on the non-transaction rows shown earlier, so it is cast to int
# for the selected rows. The selection and the cast are done in one .loc/.astype chain so
# no column is assigned on a slice of `csv` (which triggers SettingWithCopyWarning).
bought = (
    csv.loc[csv['event'] == 'transaction', ['visitorid', 'itemid', 'timestamp', 'transactionid']]
    .astype({'transactionid': int})
)
print(bought)

         visitorid  itemid      timestamp  transactionid
130         599528  356475  1433222276276           4000
304         121688   15335  1433193500981          11117
418         552148   81345  1433193915008           5444
814         102019  150318  1433176736375          13556
843         189384  310791  1433174518180           7244
...            ...     ...            ...            ...
2755294    1050575   31640  1438377176570           8354
2755349     861299  456602  1438379878779           3643
2755508     855941  235771  1438357730123           4385
2755603     548772   29167  1438355560300          13872
2755607    1051054  312728  1438358989163          17579

[22457 rows x 4 columns]


In [17]:
# Load all BOUGHT relationships (with timestamp and transactionid) in a single write transaction.
# Each record is matched to its existing Item and Visitor nodes by id.
# CREATE is used here, so running this cell again adds the relationships a second time.
with driver.session(database=DB_NAME) as session:
    result = session.write_transaction(
        lambda tx: tx.run(
            """
            UNWIND $data as rel
            MATCH (i:Item{id: rel.itemid}), (v:Visitor{id: rel.visitorid})
            CREATE (v)-[:BOUGHT{timestamp: rel.timestamp, transactionid: rel.transactionid}]->(i)
            RETURN count(*) as relsCreated
            """,
            data=bought.to_dict('records'),
        ).data()
    )
df = pd.DataFrame(result)
print(df)

   relsCreated
0        22457


# Some basic queries

In [4]:
# Top sellers: the 10 items with the most BOUGHT relationships,
# shown together with how often each was added to a cart and viewed.
with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(
        lambda tx: tx.run(
            """
            MATCH (i:Item)<-[:BOUGHT]-()
            RETURN  i.id as itemid, 
                    count(*) as times_bought,
                    size( (i:Item)<-[:ADDED]-() ) as times_added,
                    size( (i:Item)<-[:VIEWED]-() ) as times_viewed
            ORDER BY times_bought desc limit $limit
            """,
            limit=10,
        ).data()
    )
df = pd.DataFrame(result)
print(df)

   itemid  times_bought  times_added  times_viewed
0  461686           133          306          2538
1  119736            97           44           752
2  213834            92           17           293
3    7943            46           97          1346
4  312728            46          162           947
5  445351            45           89           939
6   48030            41           95           986
7  248455            38           52           575
8  420960            38           60           795
9   17478            37           72           631


In [6]:
# Collaborative filtering
# For a Visitor viewing an Item, suggest what other visitors who viewed that Item went on to buy,
# skipping items this Visitor has already added or bought. Ranked by number of distinct purchases.
with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(
        lambda tx: tx.run(
            """
            MATCH (i:Item{id: $itemid})<-[:VIEWED]-(visitor)-[bought:BOUGHT]->(otherItem)
            WHERE i <> otherItem
            AND NOT (:Visitor{id: $visitorid})-[:BOUGHT|ADDED]->(otherItem)
            RETURN  otherItem.id as product_bought, 
                    count(distinct bought) as times_bought
            ORDER BY times_bought desc limit $limit
            """,
            limit=10,
            itemid=461686,
            visitorid=684514,
        ).data()
    )
df = pd.DataFrame(result)
print(df)

   product_bought  times_bought
0          119736            59
1          213834            18
2          171878            13
3          248455            12
4           10572            11
5          218794            10
6          369158            10
7             546             9
8           32581             8
9          320130             8


In [29]:
# Collaborative filtering (alternative)
# Suggest items that were checked out in the same transaction as the given Item,
# skipping items this Visitor has already added or bought.
# Ranked by the number of distinct transactions in which both items appear.
with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(
        lambda tx: tx.run(
            """
            MATCH (i:Item{id: $itemid})<-[b1:BOUGHT]-(visitor)-[b2:BOUGHT]->(otherItem)
            WHERE i <> otherItem
            AND b1.transactionid = b2.transactionid
            AND NOT (:Visitor{id: $visitorid})-[:BOUGHT|ADDED]->(otherItem)
            RETURN  otherItem.id as product_bought, 
                    count(distinct b1.transactionid) as times_bought_together
            ORDER BY times_bought_together desc limit $limit
            """,
            limit=10,
            itemid=461686,
            visitorid=684514,
        ).data()
    )
df = pd.DataFrame(result)
print(df)

   product_bought  times_bought_together
0          171878                      9
1          218794                      8
2           32581                      8
3           10572                      8
4          124081                      4
5          447067                      3
6           75392                      3
7          108924                      3
8           40630                      2
9          192043                      2


# Questions so far?

# Graph data science

In [30]:
# Do we have any "abnormal visitors"?
# Distribution (min, selected percentiles, max) of the number of
# VIEWED/ADDED/BOUGHT events per Visitor.
# The query takes no parameters, so none are passed.
with driver.session(database = DB_NAME) as session:
    result = session.read_transaction( lambda tx: 
        tx.run(
            """
            MATCH (v:Visitor)-[:VIEWED|ADDED|BOUGHT]->()
            WITH v, 
                 count(*) as number_of_events
            RETURN  max(number_of_events) as `max`,
                    percentileCont(number_of_events, 0.999) as `p0.999`,
                    percentileCont(number_of_events, 0.99) as `p0.99`,
                    percentileCont(number_of_events, 0.9) as `p0.90`,
                    percentileCont(number_of_events, 0.75) as `p0.75`,
                    min(number_of_events) as `min`
            """
        ).data()
    )
    df = pd.DataFrame(result)
    print(df)

    max  p0.999  p0.99  p0.90  p0.75  min
0  7757    47.0   13.0    3.0    2.0    1


In [31]:
# Re-label abnormal visitors
# Visitors with more than `threshold` VIEWED/ADDED/BOUGHT events get the label
# AbnormalVisitor and lose the label Visitor, so later patterns that match
# (:Visitor) no longer include them.
# The cut-off is passed as a query parameter instead of being hard-coded in the Cypher text.
with driver.session(database = DB_NAME) as session:
    result = session.write_transaction( lambda tx: 
        tx.run(
            """
            MATCH (v:Visitor)-[:VIEWED|ADDED|BOUGHT]->()
            WITH v, 
                 count(*) as number_of_events
            WHERE number_of_events > $threshold
            SET v:AbnormalVisitor
            REMOVE v:Visitor
            RETURN count(*) as number_of_abnormal_visitors
            """,
            threshold = 50
        ).data()
    )
    df = pd.DataFrame(result)
    print(df)

   number_of_abnormal_visitors
0                         1225


# How to do it from the neo4j browser / cypher shell
```cypher
// 1. Project an in-memory graph named 'items'.
//    Nodes: every Item. Relationships: for each visitor, an item they VIEWED (i1) paired with
//    an item they interacted with at a later timestamp (i2), emitted with i2 as source and
//    i1 as target, weighted by the later event's type (BOUGHT 1.0, ADDED 0.7, otherwise 0.2).
call gds.graph.create.cypher(
    'items',
    'MATCH (i:Item) return id(i) as id',
    'MATCH (i1:Item)<-[r1:VIEWED]-(v:Visitor)-[r2:VIEWED|ADDED|BOUGHT]->(i2:Item)
     WHERE r1.timestamp<r2.timestamp
     WITH i1, i2, r1, case type(r2) when "BOUGHT" then 1.0 when "ADDED" then 0.7 else 0.2 end as weight
     RETURN id(i1) as target, id(i2) as source, weight',
     {readConcurrency:16}
)

// 2. Run weighted PageRank in stats mode
call gds.pageRank.stats('items', {maxIterations:200, relationshipWeightProperty:'weight', concurrency:16})

// 3. Run weighted PageRank in write mode, with writeProperty 'pagerank'
call gds.pageRank.write('items', {maxIterations:200, relationshipWeightProperty:'weight', concurrency:16, writeProperty:'pagerank'})

// 4. Drop the 'items' projection
call gds.graph.drop('items')

```

# How to do it with the GraphDataScience python wrapper

In [32]:
# Project an in-memory graph named 'items' from two Cypher queries and keep a handle to it in G.
G = gds.graph.create.cypher(
    'items',
    # Node query: every Item node, identified by its internal id.
    """
    MATCH (i:Item) return id(i) as id
    """,
    # Relationship query: for each visitor, pair an item they VIEWED (i1) with an item they
    # interacted with at a later timestamp (i2). The edge is emitted with i2 as source and
    # i1 as target, weighted by the later event's type: BOUGHT 1.0, ADDED 0.7, otherwise 0.2.
    # Only nodes still labelled Visitor are matched.
    """
        MATCH (i1:Item)<-[r1:VIEWED]-(v:Visitor)-[r2:VIEWED|ADDED|BOUGHT]->(i2:Item)
        WHERE r1.timestamp<r2.timestamp
        WITH i1, i2, r1, case type(r2) when "BOUGHT" then 1.0 when "ADDED" then 0.7 else 0.2 end as weight
        RETURN id(i1) as target, id(i2) as source, weight
     """,
    readConcurrency=16
)

In [33]:
# Size of the projected graph, with thousands separators
print(f"Nodes: {G.node_count():,} Rels: {G.relationship_count():,} ")

Nodes: 235,061 Rels: 5,018,193 


In [34]:
# Weighted PageRank over the projected graph, run in write mode with writeProperty `pagerank`
pagerankRes = gds.pageRank.write(
    G,
    maxIterations=200,
    relationshipWeightProperty='weight',
    concurrency=16,
    writeProperty='pagerank',
)

In [35]:
# Show the result returned by the PageRank write call
print(pagerankRes)

{'writeMillis': 392, 'nodePropertiesWritten': 235061, 'ranIterations': 121, 'didConverge': True, 'centralityDistribution': {'p99': 5.043791770935059, 'min': 0.14999961853027344, 'max': 110.65331935882568, 'mean': 0.5944112732294387, 'p90': 1.292281150817871, 'p50': 0.21144485473632812, 'p999': 13.032469749450684, 'p95': 2.079451560974121, 'p75': 0.6121091842651367}, 'postProcessingMillis': 83, 'preProcessingMillis': 0, 'computeMillis': 10164, 'configuration': {'maxIterations': 200, 'writeConcurrency': 16, 'relationshipWeightProperty': 'weight', 'concurrency': 16, 'sourceNodes': [], 'writeProperty': 'pagerank', 'scaler': 'NONE', 'nodeLabels': ['*'], 'sudo': False, 'dampingFactor': 0.85, 'relationshipTypes': ['*'], 'tolerance': 1e-07, 'username': None}}


In [36]:
# Drop the projected graph referenced by G now that the PageRank call has run
G.drop()

In [37]:
# Can we suggest an Item with high probability of conversion (central to conversion)?
# Same candidates as the collaborative-filtering query (items bought by visitors who viewed
# the given Item, minus what this Visitor already added or bought), but ordered by the
# item's `pagerank` property instead of by purchase count.
with driver.session(database=DB_NAME) as session:
    result = session.read_transaction(
        lambda tx: tx.run(
            """
            MATCH (i:Item{id: $itemid})<-[:VIEWED]-(visitor)-[bought:BOUGHT]->(otherItem)
            WHERE i <> otherItem
            AND NOT (:Visitor{id: $visitorid})-[:BOUGHT|ADDED]->(otherItem)
            RETURN  otherItem.id as product_bought,
                    otherItem.pagerank as rank,
                    count(distinct bought) as times_bought
            ORDER BY rank desc limit $limit
            """,
            limit=10,
            itemid=461686,
            visitorid=684514,
        ).data()
    )
df = pd.DataFrame(result)
print(df)

   product_bought       rank  times_bought
0          257040  66.998162             4
1          309778  65.395879             2
2            9877  45.231441             8
3          320130  41.140090             8
4          445351  38.658325             6
5          409804  38.333878             4
6           29196  37.918723             2
7           37029  36.671724             7
8          369447  35.673712             1
9          234255  35.013012             6


# Evaluate results
Bloom, search for 