In [None]:

# Connection settings. Each value can be overridden through an environment
# variable so real credentials do not have to be stored in the notebook; the
# fallbacks are the values this notebook was originally written with.
import os

DB_ULR = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
DB_URL = DB_ULR  # correctly spelled alias; DB_ULR is kept because later cells reference it
DB_USER = os.environ.get("NEO4J_USER", "neo4j")
DB_PASS = os.environ.get("NEO4J_PASSWORD", "test1234")
DB_NAME = os.environ.get("NEO4J_DATABASE", "arrow")  # target database that the notebook drops and re-creates

# Create driver
The GDS client is also pointed at the database named by DB_NAME.

Hint: if you get `No module named 'pyarrow._flight'` on Apple Silicon, install the packages with conda.

In [None]:
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from neo4j import GraphDatabase # Python database driver
import pyarrow
import pyarrow.flight
from graphdatascience import GraphDataScience # Python GDS client

# Some ceremony: both clients connect to the same server with the same login.
# `driver` is the plain database driver, `gds` is the GDS client.
_auth = (DB_USER, DB_PASS)
driver = GraphDatabase.driver(DB_ULR, auth=_auth)
gds = GraphDataScience(DB_ULR, auth=_auth)
gds.set_database(DB_NAME)

In [None]:
# Prepare the databases we need.
# The target database should be removed first (if it exists).
def _drop_target_database(tx):
    """Run DROP DATABASE ... IF EXISTS for DB_NAME and return the result rows."""
    return tx.run(f"DROP DATABASE {DB_NAME} IF EXISTS").data()


with driver.session(database="system") as session:
    result = session.execute_write(_drop_target_database)

In [None]:
# One database has to exist during the import, just so the projection can be
# referenced in the graph catalogue; make sure "neo4j" is there.
def _ensure_default_database(tx):
    """Run CREATE DATABASE neo4j IF NOT EXISTS and return the result rows."""
    return tx.run("CREATE DATABASE neo4j IF NOT EXISTS").data()


with driver.session(database="system") as session:
    result = session.execute_write(_ensure_default_database)

In [None]:
# Check if Apache Arrow is enabled (if not, add gds.arrow.enabled=true to neo4j.conf)
gds.set_database("neo4j")
arrow_running = gds.run_cypher("call gds.debug.arrow() yield running return running")
# An empty result now fails with the same message instead of a KeyError, and
# the flag is read by position and tested for truthiness rather than `== True`.
assert not arrow_running.empty and bool(arrow_running["running"].iloc[0]), "Arrow not running"


# Load data

In [None]:
# Read csv file
# Source https://www.kaggle.com/retailrocket/ecommerce-dataset?select=events.csv
# The location can be overridden with the EVENTS_CSV environment variable so
# the notebook is not tied to one machine; the default is the original path.
EVENTS_CSV = os.environ.get("EVENTS_CSV", '/Users/haklof/datasets/events.csv')
csv = pd.read_csv(EVENTS_CSV)
print(csv)

# Construct graph using Apache Arrow

In [None]:
# Distinct visitor ids, with missing values removed
visitors = csv['visitorid'].dropna().drop_duplicates()
print(visitors)

In [None]:
# Distinct item ids, with missing values removed
items = csv['itemid'].dropna().drop_duplicates()
print(items)

In [None]:
# Rows behind the VIEWED relationships: 'view' events only
viewed = csv.loc[csv['event'] == 'view', ['visitorid', 'itemid', 'timestamp']]
print(viewed)

In [None]:
# Rows behind the ADDED relationships: 'addtocart' events only
added = csv.loc[csv['event'] == 'addtocart', ['visitorid', 'itemid', 'timestamp']]
print(added)

In [None]:
# Select all BOUGHT relationships ('transaction' events).
# .assign returns a new frame, so the integer cast is never written into a
# filtered slice of `csv` (assigning to a column of such a slice can raise
# SettingWithCopyWarning and makes re-running the cell order-sensitive).
bought = (
    csv.loc[csv['event'] == 'transaction', ['visitorid', 'itemid', 'timestamp', 'transactionid']]
    .assign(transactionid=lambda frame: frame['transactionid'].astype(int))
)
print(bought)

In [None]:
# Stack visitor ids and item ids into one table, tagging each row with its
# label; reset_index() exposes the running row number as an "index" column.
nodes_id = pd.DataFrame(
    {
        "id": [*visitors.tolist(), *items.tolist()],
        "labels": ['Visitor'] * len(visitors) + ['Item'] * len(items),
    }
).reset_index()
nodes_id.head()

In [None]:
# Nodes DataFrame: the "index" column of nodes_id becomes nodeId, followed by
# the labels and id columns.
nodes = (
    nodes_id
    .rename(columns={"index": "nodeId"})
    .loc[:, ["nodeId", "labels", "id"]]
)
nodes.head()

In [None]:
# Relationships DataFrame
# Every event row becomes one relationship. The three event tables are stacked
# once and the node ids are looked up per row, so all columns of a row come
# from the same event. (Previously the columns were zipped together from six
# independent merges, which only lines up if every merge keeps all rows in
# the original order.)
_visitor_node_ids = nodes_id.loc[nodes_id["labels"] == "Visitor"].set_index("id")["index"]
_item_node_ids = nodes_id.loc[nodes_id["labels"] == "Item"].set_index("id")["index"]

# viewed/added carry no transaction id; 0 is used as the placeholder
_events = pd.concat(
    [
        viewed.assign(relationshipType="VIEWED", transactionid=0),
        added.assign(relationshipType="ADDED", transactionid=0),
        bought.assign(relationshipType="BOUGHT"),
    ],
    ignore_index=True,
)

_source_ids = _events["visitorid"].map(_visitor_node_ids)
_target_ids = _events["itemid"].map(_item_node_ids)
# Nodes and events are derived from the same csv, so a failed lookup means
# the inputs are out of sync; fail loudly instead of exporting broken ids.
assert _source_ids.notna().all() and _target_ids.notna().all(), \
    "event references a visitor or item that is missing from nodes_id"

relationships = pd.DataFrame(
    {
        "sourceNodeId": _source_ids.astype("int64"),
        "targetNodeId": _target_ids.astype("int64"),
        "relationshipType": _events["relationshipType"],
        "timestamp": _events["timestamp"],
        "transactionid": _events["transactionid"],
    }
)

In [None]:
# Preview the first rows of the relationships table
relationships.head()

In [None]:
# Build the graph 'items_raw' from the nodes and relationships DataFrames,
# with the GDS client pointed at the "neo4j" database.
gds.set_database("neo4j")
G = gds.graph.construct('items_raw', nodes, relationships)

In [None]:
# Persist graph into target database
gds.set_database("neo4j")
# Fetch the 'items_raw' graph by name and rebind G to it
G = gds.graph.get('items_raw')
# Export the graph to the database named by DB_NAME
gds.graph.export(G, dbName=DB_NAME)

In [None]:
# Create entry for the target database in the system database.
# execute_write replaces the deprecated write_transaction and matches the
# other system-database cells of this notebook.
with driver.session(database="system") as session:
    result = session.execute_write(
        lambda tx: tx.run(
            "CREATE DATABASE {dbname}".format(dbname=DB_NAME)
        ).data()
    )

In [None]:
# Drop the graph G via the GDS client, with the client pointed at "neo4j"
gds.set_database("neo4j")
gds.graph.drop(G)

In [None]:
# Create indexes and constraints
# One node-key constraint on `id` per label. execute_write replaces the
# deprecated write_transaction; each statement runs in its own transaction.
CONSTRAINT_STATEMENTS = (
    "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Visitor) REQUIRE (n.id) IS NODE KEY",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Item) REQUIRE (n.id) IS NODE KEY",
)
with driver.session(database=DB_NAME) as session:
    for statement in CONSTRAINT_STATEMENTS:
        # The callback is invoked before the loop advances, so closing over
        # `statement` is safe here.
        session.execute_write(lambda tx: tx.run(statement).consume())

# Some basic queries

In [None]:
# Top sellers
# For every Item with at least one BOUGHT relationship: how often it was
# bought, added and viewed; the $limit most-bought items are returned.
# execute_read replaces the deprecated read_transaction.
with driver.session(database=DB_NAME) as session:
    result = session.execute_read(
        lambda tx: tx.run(
            """
            MATCH (i:Item)<-[:BOUGHT]-()
            RETURN  i.id as itemid, 
                    count(*) as times_bought,
                    count{ (i:Item)<-[:ADDED]-() } as times_added,
                    count{ (i:Item)<-[:VIEWED]-() } as times_viewed
            ORDER BY times_bought desc limit $limit
            """,
            limit=10,
        ).data()
    )
    df = pd.DataFrame(result)
    print(df)

In [None]:
# Collaborative filtering
# Suggest what other Items Visitors buy for a Visitor viewing an Item
# (where the Visitor has not added/bought the suggested Item already).
# execute_read replaces the deprecated read_transaction.
with driver.session(database=DB_NAME) as session:
    result = session.execute_read(
        lambda tx: tx.run(
            """
            MATCH (i:Item{id: $itemid})<-[:VIEWED]-(visitor)-[bought:BOUGHT]->(otherItem)
            WHERE i <> otherItem
            AND NOT (:Visitor{id: $visitorid})-[:BOUGHT|ADDED]->(otherItem)
            RETURN  otherItem.id as product_bought, 
                    count(distinct bought) as times_bought
            ORDER BY times_bought desc limit $limit
            """,
            limit=10, itemid=461686, visitorid=201110,
        ).data()
    )
    df = pd.DataFrame(result)
    print(df)

In [None]:
# Collaborative filtering (alternative)
# Suggest based on what other items were checked out in the same transaction.
# NOTE(review): the same-transaction condition inside the query is commented
# out, so this currently pairs any two Items bought by the same visitor and
# counts distinct transaction ids of the first purchase — confirm whether the
# filter should be re-enabled.
# execute_read replaces the deprecated read_transaction.
with driver.session(database=DB_NAME) as session:
    result = session.execute_read(
        lambda tx: tx.run(
            """
            MATCH (i:Item{id: $itemid})<-[b1:BOUGHT]-(visitor)-[b2:BOUGHT]->(otherItem)
            WHERE i <> otherItem
            //AND b1.transactionid = b2.transactionid
            AND NOT (:Visitor{id: $visitorid})-[:BOUGHT|ADDED]->(otherItem)
            RETURN  otherItem.id as product_bought, 
                    count(distinct b1.transactionid) as times_bought_together
            ORDER BY times_bought_together desc limit $limit
            """,
            limit=10, itemid=461686, visitorid=201110,
        ).data()
    )
    df = pd.DataFrame(result)
    print(df)

# Questions so far?

# Graph data science

In [None]:
# Do we have any "abnormal visitors"?
# Distribution (min, selected percentiles, max) of the number of
# VIEWED/ADDED/BOUGHT events per Visitor.
# execute_read replaces the deprecated read_transaction; the unused `limit`
# parameter was dropped because the query has no $limit placeholder.
with driver.session(database=DB_NAME) as session:
    result = session.execute_read(
        lambda tx: tx.run(
            """
            MATCH (v:Visitor)-[:VIEWED|ADDED|BOUGHT]->()
            WITH v, 
                 count(*) as number_of_events
            RETURN  max(number_of_events) as `max`,
                    percentileCont(number_of_events, 0.999) as `p0.999`,
                    percentileCont(number_of_events, 0.99) as `p0.99`,
                    percentileCont(number_of_events, 0.9) as `p0.90`,
                    percentileCont(number_of_events, 0.75) as `p0.75`,
                    min(number_of_events) as `min`
            """
        ).data()
    )
    df = pd.DataFrame(result)
    print(df)

In [None]:
# Re-label abnormal visitors
# Visitors with more events than the threshold lose the Visitor label and get
# AbnormalVisitor instead; the query reports how many were re-labelled.
# The threshold (previously the literal 50 inside the query) is now a named
# query parameter, the unused `limit` parameter was dropped, and execute_write
# replaces the deprecated write_transaction.
ABNORMAL_EVENT_THRESHOLD = 50
with driver.session(database=DB_NAME) as session:
    result = session.execute_write(
        lambda tx: tx.run(
            """
            MATCH (v:Visitor)-[:VIEWED|ADDED|BOUGHT]->()
            WITH v, 
                 count(*) as number_of_events
            WHERE number_of_events > $threshold
            SET v:AbnormalVisitor
            REMOVE v:Visitor
            RETURN count(*) as number_of_abnormal_visitors
            """,
            threshold=ABNORMAL_EVENT_THRESHOLD,
        ).data()
    )
    df = pd.DataFrame(result)
    print(df)

# What Items are important?

In [None]:
# Cypher projection 'items' over Item nodes, created in the DB_NAME database.
gds.set_database(DB_NAME)

# Node query: the internal id of every Item.
_item_node_query = """
    MATCH (i:Item) return id(i) as id
    """

# Relationship query: for each Visitor, pair an Item i1 that was VIEWED with an
# Item i2 that has a later-timestamped VIEWED/ADDED/BOUGHT event. The link runs
# from i2 (source) to i1 (target) and is weighted by the type of the later
# event: BOUGHT 1.0, ADDED 0.7, anything else 0.2.
_item_link_query = """
        MATCH (i1:Item)<-[r1:VIEWED]-(v:Visitor)-[r2:VIEWED|ADDED|BOUGHT]->(i2:Item)
        WHERE r1.timestamp<r2.timestamp
        WITH i1, i2, r1, case type(r2) when "BOUGHT" then 1.0 when "ADDED" then 0.7 else 0.2 end as weight
        RETURN id(i1) as target, id(i2) as source, weight
     """

G2, project_stats = gds.graph.project.cypher(
    'items',
    _item_node_query,
    _item_link_query,
    readConcurrency=16
)

In [None]:
# Fetch the 'items' graph by name and rebind G2 to it
gds.set_database(DB_NAME)
G2 = gds.graph.get('items')

In [None]:
# Run PageRank on G2 in write mode, using 'weight' as the relationship weight
# and 'pagerank' as the write property
pagerankRes = gds.pageRank.write(G2, maxIterations=200, relationshipWeightProperty='weight', concurrency=16, writeProperty='pagerank')

In [None]:
# Show what the PageRank call returned
print(pagerankRes)

In [None]:
# Drop the 'items' graph now that PageRank has been run on it
G2.drop()

In [None]:
# Can we suggest an Item with high probability of conversion (central to conversion)?
# Same candidate set as the collaborative-filtering query (Items bought by
# visitors who viewed $itemid and that $visitorid has not added/bought),
# but ordered by the stored pagerank property instead of purchase count.
# execute_read replaces the deprecated read_transaction.
with driver.session(database=DB_NAME) as session:
    result = session.execute_read(
        lambda tx: tx.run(
            """
            MATCH (i:Item{id: $itemid})<-[:VIEWED]-(visitor)-[bought:BOUGHT]->(otherItem)
            WHERE i <> otherItem
            AND NOT (:Visitor{id: $visitorid})-[:BOUGHT|ADDED]->(otherItem)
            RETURN  otherItem.id as product_bought,
                    avg(otherItem.pagerank) as rank,
                    count(distinct bought) as times_bought
            ORDER BY rank desc limit $limit
            """,
            limit=10, itemid=461686, visitorid=684514,
        ).data()
    )
    df = pd.DataFrame(result)
    print(df)