In [None]:

import os

# Connection settings. Each value can be overridden through an environment
# variable so that credentials do not have to be edited into the notebook;
# the fallbacks are the original local-development values.
DB_URL = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
DB_ULR = DB_URL  # original (misspelled) name, kept because later cells reference it
DB_USER = os.environ.get("NEO4J_USERNAME", "neo4j")
DB_PASS = os.environ.get("NEO4J_PASSWORD", "test1234")
DB_NAME = os.environ.get("NEO4J_DATABASE", "events")

# Create driver
Also sets the database name (`DB_NAME`) on the GDS client.

In [None]:
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from neo4j import GraphDatabase # Python database driver
from graphdatascience import GraphDataScience # Python GDS client

# Build the two client objects the rest of the notebook uses:
# the database driver and the GDS client that wraps it
credentials = (DB_USER, DB_PASS)
driver = GraphDatabase.driver(DB_ULR, auth=credentials)
gds = GraphDataScience.from_neo4j_driver(driver=driver)
# Make the GDS client operate on DB_NAME
gds.set_database(DB_NAME)

# Create database and schema

In [None]:
# Create the database if it does not exist yet (an existing database is left as-is)
with driver.session(database = "system") as session:
    # CREATE DATABASE is asynchronous by default, so without WAIT the next cell
    # can open a session on DB_NAME before the database is online. WAIT blocks
    # until the database is available; the statement is run as an auto-commit
    # query because a WAIT command is meant to run in a transaction of its own.
    # (An alternative store format tried earlier was 'freki'.)
    result = session.run(
        "create database {dbname} if not exists options {{ storeFormat: 'aligned'}} wait".format(dbname = DB_NAME)
    ).data()
    df = pd.DataFrame(result)
    print(df)

In [None]:
# Schema setup: one node-key constraint on `id` for each node label
constraint_statements = (
    "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Visitor) REQUIRE (n.id) IS NODE KEY",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Item) REQUIRE (n.id) IS NODE KEY",
)
with driver.session(database = DB_NAME) as session:
    for statement in constraint_statements:
        # Each statement runs in its own write transaction; the query text is
        # bound as a default argument so the function carries its own statement
        session.execute_write(lambda tx, query=statement: tx.run(query).consume())

#  Load data

In [None]:
# Read csv file
# Source https://www.kaggle.com/retailrocket/ecommerce-dataset?select=events.csv
# The file location can be overridden with the EVENTS_CSV environment variable;
# the fallback is the original author's local path. `os` is imported in the
# imports cell at the top of the notebook.
EVENTS_CSV = os.environ.get('EVENTS_CSV', '/Users/haklof/datasets/events.csv')
csv = pd.read_csv(EVENTS_CSV)
print(csv)

In [None]:
# Distinct visitor ids from the event log, missing values excluded
visitors = csv.loc[:, 'visitorid'].dropna().drop_duplicates()
print(visitors)

In [None]:
# Create Visitor nodes
# The ids are written in 10 batches, one write transaction per batch.
# Batches are built by splitting a range of row positions and slicing with
# .iloc: calling np.array_split directly on a pandas object relies on the
# deprecated Series/DataFrame.swapaxes. The batch boundaries are the same as
# np.array_split(visitors, 10) would produce.
with driver.session(database = DB_NAME) as session:
    for positions in np.array_split(np.arange(len(visitors)), 10):
        # Convert to a plain Python list once, before entering the transaction function
        batch = visitors.iloc[positions].to_list()
        result = session.execute_write( lambda tx, batch=batch:
            tx.run(
                """
                UNWIND $visitors as visitorId
                MERGE (v:Visitor{id: visitorId})
                RETURN count(*) as nodesCreated
                """,
                visitors = batch
            ).data()
        )
        # Note: count(*) counts processed rows; MERGE also matches existing nodes
        df = pd.DataFrame(result)
        print(df)

In [None]:
# Distinct item ids from the event log, missing values excluded
items = csv.loc[:, 'itemid'].dropna().drop_duplicates()
print(items)

In [None]:
# Create Item nodes
def _merge_items(tx, item_ids):
    """MERGE one Item node per id in `item_ids`; return the result rows as dicts."""
    return tx.run(
            """
            UNWIND $items as itemId
            MERGE (:Item{id: itemId})
            RETURN count(*) as nodesCreated
            """,
            items = item_ids
        ).data()

# All item ids are sent in a single write transaction
with driver.session(database = DB_NAME) as session:
    result = session.execute_write(_merge_items, items.to_list())
    df = pd.DataFrame(result)
    print(df)

In [None]:
# Rows of the event log whose event is 'view', reduced to the columns
# needed for the VIEWED relationships
viewed = csv.loc[csv['event'] == 'view', ['visitorid', 'itemid', 'timestamp']]
print(viewed)

In [None]:
# Create VIEWED relationships
# The rows are written in 20 batches, one write transaction per batch.
# Batches are built by splitting a range of row positions and slicing with
# .iloc: calling np.array_split directly on a DataFrame relies on the
# deprecated DataFrame.swapaxes (FutureWarning on recent pandas). The batch
# boundaries are the same as np.array_split(viewed, 20) would produce.
with driver.session(database = DB_NAME) as session:
    for positions in np.array_split(np.arange(len(viewed)), 20):
        # One dict per row, with keys visitorid / itemid / timestamp
        batch = viewed.iloc[positions].to_dict('records')
        # NOTE(review): MERGE (the ADDED/BOUGHT cells use CREATE) collapses rows
        # with identical visitor, item and timestamp into one relationship;
        # confirm this de-duplication is intended.
        result = session.execute_write( lambda tx, batch=batch:
            tx.run(
                """
                UNWIND $data as rel
                MATCH (i:Item{id: rel.itemid}), (v:Visitor{id: rel.visitorid})
                MERGE (v)-[:VIEWED{timestamp:  datetime({epochMillis: rel.timestamp})}]->(i)
                RETURN count(*) as relsCreated
                """,
                data = batch
            ).data()
        )
        df = pd.DataFrame(result)
        print(df)

In [None]:
# Rows of the event log whose event is 'addtocart', reduced to the columns
# needed for the ADDED relationships
added = csv.loc[csv['event'] == 'addtocart', ['visitorid', 'itemid', 'timestamp']]
print(added)

In [None]:
# Create ADDED relationships
def _create_added(tx, rows):
    """CREATE one ADDED relationship per row (visitorid, itemid, timestamp); return the result rows."""
    outcome = tx.run(
            """
            UNWIND $data as rel
            MATCH (i:Item{id: rel.itemid}), (v:Visitor{id: rel.visitorid})
            CREATE (v)-[:ADDED{timestamp:  datetime({epochMillis: rel.timestamp})}]->(i)
            RETURN count(*) as relsCreated
            """,
            data = rows
    )
    return outcome.data()

# All add-to-cart rows are sent in a single write transaction
with driver.session(database = DB_NAME) as session:
    result = session.execute_write(_create_added, added.to_dict('records'))
    df = pd.DataFrame(result)
    print(df)

In [None]:
# Select all BOUGHT relationships
# The frame is built in one expression: .loc selects rows and columns in a
# single step and .astype returns a new frame, so nothing is assigned into a
# filtered slice of `csv` (the original attribute assignment on such a slice
# can raise pandas' SettingWithCopyWarning).
bought = (
    csv.loc[csv['event'] == 'transaction', ['visitorid', 'itemid', 'timestamp', 'transactionid']]
       .astype({'transactionid': int})
)
print(bought)

In [None]:
# Create BOUGHT relationships
def _create_bought(tx, purchases):
    """CREATE one BOUGHT relationship (with timestamp and transactionid) per purchase row."""
    return tx.run(
            """
            UNWIND $data as rel
            MATCH (i:Item{id: rel.itemid}), (v:Visitor{id: rel.visitorid})
            CREATE (v)-[:BOUGHT{timestamp: datetime({epochMillis: rel.timestamp}), transactionid: rel.transactionid}]->(i)
            RETURN count(*) as relsCreated
            """,
            data = purchases
        ).data()

# All transaction rows are sent in a single write transaction
with driver.session(database = DB_NAME) as session:
    result = session.execute_write(_create_bought, bought.to_dict('records'))
    df = pd.DataFrame(result)
    print(df)

# Some basic queries

In [None]:
# Top sellers
# For the $limit most-bought Items, also report how often each was added to a
# cart and viewed. The per-item counts use size() over a pattern comprehension:
# size() applied directly to a pattern expression, as originally written, is
# rejected by Neo4j 5, while the comprehension form works on 4.x and 5.x.
with driver.session(database = DB_NAME) as session:
    result = session.execute_read( lambda tx: 
        tx.run(
            """
            MATCH (i:Item)<-[:BOUGHT]-()
            RETURN  i.id as itemid, 
                    count(*) as times_bought,
                    size( [(i)<-[:ADDED]-() | 1] ) as times_added,
                    size( [(i)<-[:VIEWED]-() | 1] ) as times_viewed
            ORDER BY times_bought desc limit $limit
            """,
            limit = 10
        ).data()
    )
    df = pd.DataFrame(result)
    print(df)

In [None]:
# Collaborative filtering
# For a Visitor viewing an Item: suggest other Items bought by Visitors who
# viewed that Item, leaving out Items this Visitor has already added or bought
def _bought_by_viewers(tx, **params):
    """Run the recommendation query with `params` and return the rows as a DataFrame."""
    return tx.run(
            """
            MATCH (i:Item{id: $itemid})<-[:VIEWED]-(visitor)-[bought:BOUGHT]->(otherItem)
            WHERE i <> otherItem
            AND NOT (:Visitor{id: $visitorid})-[:BOUGHT|ADDED]->(otherItem)
            RETURN  otherItem.id as product_bought, 
                    count(distinct bought) as times_bought
            ORDER BY times_bought desc limit $limit
            """,
            **params
        ).to_df()

result = None
with driver.session(database = DB_NAME) as session:
    result = session.execute_read(_bought_by_viewers, limit=10, itemid=461686, visitorid=201110)
result.head()

In [None]:
# Collaborative filtering (alternative)
# Suggest Items that were bought in the same transaction as the given Item,
# leaving out Items the given Visitor has already added or bought
def _bought_together(tx, **params):
    """Run the same-transaction recommendation query with `params`; return the rows as dicts."""
    return tx.run(
            """
            MATCH (i:Item{id: $itemid})<-[b1:BOUGHT]-(visitor)-[b2:BOUGHT]->(otherItem)
            WHERE i <> otherItem
            AND b1.transactionid = b2.transactionid
            AND NOT (:Visitor{id: $visitorid})-[:BOUGHT|ADDED]->(otherItem)
            RETURN  otherItem.id as product_bought, 
                    count(distinct b1.transactionid) as times_bought_together
            ORDER BY times_bought_together desc limit $limit
            """,
            **params
        ).data()

with driver.session(database = DB_NAME) as session:
    result = session.execute_read(_bought_together, limit=10, itemid=461686, visitorid=201110)
    df = pd.DataFrame(result)
    print(df)

# Questions so far?

# Graph data science

In [None]:
# Do we have any "abnormal visitors"
# Counts the VIEWED/ADDED/BOUGHT events per Visitor and summarises the
# distribution of those counts (min, max and upper percentiles).
# The query takes no parameters; the unused `limit` argument that was passed
# originally has been removed.
with driver.session(database = DB_NAME) as session:
    result = session.execute_read( lambda tx: 
        tx.run(
            """
            MATCH (v:Visitor)-[:VIEWED|ADDED|BOUGHT]->()
            WITH v, 
                 count(*) as number_of_events
            RETURN  max(number_of_events) as `max`,
                    percentileCont(number_of_events, 0.999) as `p0.999`,
                    percentileCont(number_of_events, 0.99) as `p0.99`,
                    percentileCont(number_of_events, 0.9) as `p0.90`,
                    percentileCont(number_of_events, 0.75) as `p0.75`,
                    min(number_of_events) as `min`
            """
        ).data()
    )
    df = pd.DataFrame(result)
    print(df)

In [None]:
# Re-label abnormal visitors
# Visitors with more than ABNORMAL_EVENT_THRESHOLD events get the
# AbnormalVisitor label and lose the Visitor label, so later queries that
# match on :Visitor no longer see them.
# The threshold is passed as a query parameter instead of being hard-coded in
# the Cypher text; the unused `limit` argument has been removed.
ABNORMAL_EVENT_THRESHOLD = 50
with driver.session(database = DB_NAME) as session:
    result = session.execute_write( lambda tx: 
        tx.run(
            """
            MATCH (v:Visitor)-[:VIEWED|ADDED|BOUGHT]->()
            WITH v, 
                 count(*) as number_of_events
            WHERE number_of_events > $threshold
            SET v:AbnormalVisitor
            REMOVE v:Visitor
            RETURN count(*) as number_of_abnormal_visitors
            """,
            threshold = ABNORMAL_EVENT_THRESHOLD
        ).data()
    )
    df = pd.DataFrame(result)
    print(df)

# How to do it from the neo4j browser / cypher shell
```cypher
call gds.graph.project.cypher(
    'items',
    'MATCH (i:Item) return id(i) as id',
    'MATCH (i1:Item)<-[r1:VIEWED]-(v:Visitor)-[r2:VIEWED|ADDED|BOUGHT]->(i2:Item)
     WHERE r1.timestamp<r2.timestamp
     WITH i1, i2, r1, case type(r2) when "BOUGHT" then 1.0 when "ADDED" then 0.7 else 0.2 end as weight
     RETURN id(i1) as target, id(i2) as source, weight',
     {readConcurrency:16}
)

call gds.pageRank.stats('items', {maxIterations:200, relationshipWeightProperty:'weight', concurrency:16})

call gds.pageRank.write('items', {maxIterations:200, relationshipWeightProperty:'weight', concurrency:16, writeProperty:'pagerank'})

call gds.graph.drop('items')

```

# How to do it with the GraphDataScience python wrapper

In [None]:
# Project an in-memory graph named 'items' from two Cypher queries.
# Nodes: every Item.
item_nodes_query = """
    MATCH (i:Item) return id(i) as id
    """
# Relationships: for a Visitor who viewed i1 and later (by timestamp) viewed,
# added or bought i2, emit a link with source i2 and target i1, weighted
# 1.0 for BOUGHT, 0.7 for ADDED and 0.2 otherwise.
item_links_query = """
        MATCH (i1:Item)<-[r1:VIEWED]-(v:Visitor)-[r2:VIEWED|ADDED|BOUGHT]->(i2:Item)
        WHERE r1.timestamp<r2.timestamp
        WITH i1, i2, r1, case type(r2) when "BOUGHT" then 1.0 when "ADDED" then 0.7 else 0.2 end as weight
        RETURN id(i1) as target, id(i2) as source, weight
     """
G, project_stats = gds.graph.project.cypher(
    'items', item_nodes_query, item_links_query, readConcurrency=16
)
project_stats

In [None]:
# Run weighted PageRank on the 'items' projection and write each score to the
# `pagerank` node property
pagerank_options = dict(
    maxIterations=200,
    relationshipWeightProperty='weight',
    concurrency=16,
    writeProperty='pagerank',
)
G = gds.graph.get('items')
pagerankRes = gds.pageRank.write(G, **pagerank_options)
print(pagerankRes)

In [None]:
# Look up the projected graph named 'items' and drop it
G = gds.graph.get('items')
G.drop()

In [None]:
# Can we suggest an Item with high probability of conversion (central to conversion)
# Same candidate set as the collaborative-filtering query (Items bought by
# Visitors who viewed the given Item, minus what the given Visitor already
# added or bought), but ordered by the stored `pagerank` property
def _rank_candidates(tx, **params):
    """Run the pagerank-ordered recommendation query with `params`; return the rows as dicts."""
    return tx.run(
            """
            MATCH (i:Item{id: $itemid})<-[:VIEWED]-(visitor)-[bought:BOUGHT]->(otherItem)
            WHERE i <> otherItem
            AND NOT (:Visitor{id: $visitorid})-[:BOUGHT|ADDED]->(otherItem)
            RETURN  otherItem.id as product_bought,
                    otherItem.pagerank as rank,
                    count(distinct bought) as times_bought
            ORDER BY rank desc limit $limit
            """,
            **params
        ).data()

with driver.session(database = DB_NAME) as session:
    result = session.execute_read(_rank_candidates, limit=10, itemid=461686, visitorid=684514)
df = pd.DataFrame(result)
print(df)