In [285]:
!pip install graphdatascience



In [286]:
from graphdatascience import GraphDataScience
import pandas as pd
import configparser
from datetime import datetime, timedelta

In [287]:
# Connection settings are read from a local INI file rather than hardcoded here.
config = configparser.RawConfigParser()
# RawConfigParser.read() skips files it cannot open and returns the list of
# files it actually parsed, so a missing file would otherwise only surface as
# an opaque KeyError on the lookups below.
if not config.read('aura.ini'):
    raise FileNotFoundError(
        "Could not read 'aura.ini'; it must contain a [NEO4J] section "
        "with HOST, USERNAME and PASSWORD."
    )
HOST = config['NEO4J']['HOST']
USERNAME = config['NEO4J']['USERNAME']
PASSWORD = config['NEO4J']['PASSWORD']

# Shared client used by every cell below; aura_ds=True flags the target as an AuraDS instance.
gds = GraphDataScience(HOST, auth=(USERNAME, PASSWORD), aura_ds=True)

In [288]:
def clear_graph_by_name(g_name):
    """Drop the projected graph named ``g_name`` from the GDS catalog, if it exists.

    Does nothing when no graph with that name is present.
    """
    if not gds.graph.exists(g_name).exists:
        return
    gds.graph.drop(gds.graph.get(g_name))

def clear_all_graphs():
    """Drop every projected graph currently listed in the GDS graph catalog."""
    # Take the list of names up front, then drop the graphs one by one.
    catalog_names = gds.graph.list().graphName.tolist()
    for catalog_name in catalog_names:
        gds.graph.drop(gds.graph.get(catalog_name))

In [289]:
# Create a relationship index on PURCHASED.transactionDate (skipped when an
# index named purchase_date already exists). Presumably this supports the
# transactionDate range filters used by the later queries.
gds.run_cypher('CREATE INDEX purchase_date IF NOT EXISTS FOR ()-[r:PURCHASED]-() ON (r.transactionDate)')

In [290]:
# Most recent transactionDate on any Customer PURCHASED relationship.
max_purchase_date = gds.run_cypher(
    'MATCH(:Customer)-[r:PURCHASED]->() RETURN max(r.transactionDate) AS maxDate'
)['maxDate'][0]
# Midnight of that day minus seven days; purchases on or after this date are
# treated as "recent", earlier ones as "historical".
cutoff_date = datetime(
    max_purchase_date.year,
    max_purchase_date.month,
    max_purchase_date.day,
) - timedelta(days=7)

In [291]:
# For every Customer PURCHASED relationship dated on or after cutoff_date,
# create a parallel RECENTLY_PURCHASED relationship carrying the same
# transactionDate and transactionId. Writes are committed in batches of
# 100000 rows. The cutoff is passed as a 'YYYY-MM-DD' string (first 10
# characters of str(cutoff_date)).
# NOTE: the query uses CREATE, so running this cell again adds another copy
# of each relationship.
gds.run_cypher('''
    MATCH(c:Customer)-[r:PURCHASED]->(a) 
    WHERE r.transactionDate >= date($cutoffDate)
    WITH c, a, r
    CALL {
    WITH c, a, r
        CREATE(c)-[h:RECENTLY_PURCHASED {transactionDate:r.transactionDate, transactionId:r.transactionId}]->(a) 
    }  IN TRANSACTIONS OF 100000 ROWS
    RETURN count(*)
''', params={'cutoffDate':str(cutoff_date)[:10]})

Unnamed: 0,count(*)
0,266364


In [292]:
%%time
# For every Customer PURCHASED relationship dated strictly before cutoff_date,
# create a parallel HISTORICALLY_PURCHASED relationship carrying the same
# transactionDate and transactionId. Writes are committed in batches of
# 100000 rows.
# NOTE: the query uses CREATE, so running this cell again adds another copy
# of each relationship.
gds.run_cypher('''
    MATCH(c:Customer)-[r:PURCHASED]->(a) 
    WHERE r.transactionDate < date($cutoffDate)
    WITH c, a, r
    CALL {
        WITH c, a, r
        CREATE(c)-[h:HISTORICALLY_PURCHASED {transactionDate:r.transactionDate, transactionId:r.transactionId}]->(a)
    }  IN TRANSACTIONS OF 100000 ROWS
    RETURN count(*)
''', params={'cutoffDate':str(cutoff_date)[:10]})

CPU times: user 47.6 ms, sys: 19.4 ms, total: 67 ms
Wall time: 19min 15s


Unnamed: 0,count(*)
0,31521960


In [293]:
# Project customers, articles and products into an in-memory graph named
# 'proj', loading both relationship types as undirected.
g, _ = gds.graph.project(
    'proj',
    ['Customer', 'Article', 'Product'],
    {
        'HISTORICALLY_PURCHASED': {'orientation': 'UNDIRECTED'},
        'IS_PRODUCT': {'orientation': 'UNDIRECTED'},
    },
)

ERROR:neo4j:Failed to write data to connection IPv4Address(('280dbb0d.databases.neo4j.io', 7687)) (IPv4Address(('35.240.86.240', 7687)))
ERROR:neo4j:Failed to write data to connection IPv4Address(('280dbb0d.databases.neo4j.io', 7687)) (IPv4Address(('35.240.86.240', 7687)))
ERROR:neo4j:Failed to write data to connection IPv4Address(('280dbb0d.databases.neo4j.io', 7687)) (IPv4Address(('35.240.86.240', 7687)))
ERROR:neo4j:Failed to write data to connection IPv4Address(('280dbb0d.databases.neo4j.io', 7687)) (IPv4Address(('35.240.86.240', 7687)))
ERROR:neo4j:Failed to write data to connection IPv4Address(('280dbb0d.databases.neo4j.io', 7687)) (IPv4Address(('35.240.86.240', 7687)))
ERROR:neo4j:Failed to write data to connection IPv4Address(('280dbb0d.databases.neo4j.io', 7687)) (IPv4Address(('35.240.86.240', 7687)))


In [294]:
# Run Weakly Connected Components on the projection and write the result to
# the database as the node property 'histComponent'.
gds.wcc.write(g, writeProperty='histComponent')

writeMillis                                                           2890
nodePropertiesWritten                                              1524746
componentCount                                                       16572
componentDistribution    {'p99': 3, 'min': 1, 'max': 1507255, 'mean': 9...
postProcessingMillis                                                   184
preProcessingMillis                                                      0
computeMillis                                                          323
configuration            {'writeConcurrency': 4, 'seedProperty': None, ...
Name: 0, dtype: object

In [295]:
# Drop the projection; the name 'proj' is reused for a second projection later.
g.drop()

In [296]:
# Find the histComponent value shared by the most nodes and add the
# HistEstCustomer label to every Customer in that component.
# Nodes without a histComponent property are filtered out before counting:
# otherwise they are grouped under a null key, and if that group were the
# largest, `n.histComponent = maxComponent` could never match (comparing
# with null is never true in Cypher) and no node would be labelled.
gds.run_cypher('''
    MATCH(n)
    WHERE n.histComponent IS NOT NULL
    WITH n.histComponent AS maxComponent, count(n) AS cnt ORDER BY cnt DESC LIMIT 1
    MATCH(n:Customer) WHERE n.histComponent = maxComponent
    SET n:HistEstCustomer
    RETURN count(n)
''')

Unnamed: 0,count(n)
0,1356117


In [297]:
# Find the histComponent value shared by the most nodes and add the
# HistEstArticle label to every Article in that component.
# Nodes without a histComponent property are filtered out before counting:
# otherwise they are grouped under a null key, and if that group were the
# largest, `n.histComponent = maxComponent` could never match (comparing
# with null is never true in Cypher) and no node would be labelled.
gds.run_cypher('''
    MATCH(n)
    WHERE n.histComponent IS NOT NULL
    WITH n.histComponent AS maxComponent, count(n) AS cnt ORDER BY cnt DESC LIMIT 1
    MATCH(n:Article) WHERE n.histComponent = maxComponent
    SET n:HistEstArticle
    RETURN count(n)
''')

Unnamed: 0,count(n)
0,104632


In [298]:
# Find the histComponent value shared by the most nodes and add the
# HistEstProduct label to every Product in that component.
# Nodes without a histComponent property are filtered out before counting:
# otherwise they are grouped under a null key, and if that group were the
# largest, `n.histComponent = maxComponent` could never match (comparing
# with null is never true in Cypher) and no node would be labelled.
gds.run_cypher('''
    MATCH(n)
    WHERE n.histComponent IS NOT NULL
    WITH n.histComponent AS maxComponent, count(n) AS cnt ORDER BY cnt DESC LIMIT 1
    MATCH(n:Product) WHERE n.histComponent = maxComponent
    SET n:HistEstProduct
    RETURN count(n)
''')

Unnamed: 0,count(n)
0,46500


In [352]:
%%time
# Re-project using only the nodes labelled as part of the largest component.
g, _ = gds.graph.project(
    'proj',
    ['HistEstCustomer', 'HistEstArticle', 'HistEstProduct'],
    {
        'HISTORICALLY_PURCHASED': {'orientation': 'UNDIRECTED'},
        'IS_PRODUCT': {'orientation': 'UNDIRECTED'},
    },
    readConcurrency=20,
)

# FastRP embeddings are stored on the projection under 'embedding', then
# written to the database for HistEstArticle nodes only.
gds.fastRP.mutate(
    g,
    mutateProperty='embedding',
    embeddingDimension=256,
    randomSeed=7474,
    concurrency=20,
)
gds.graph.writeNodeProperties(g, ['embedding'], ['HistEstArticle'])

# kNN over article embeddings; pairs scoring at least the cutoff are written
# as HIST_CUSTOMERS_ALSO_PURCHASED relationships with a 'score' property.
knn_stats = gds.knn.write(
    g,
    nodeProperties=['embedding'],
    nodeLabels=['HistEstArticle'],
    writeRelationshipType='HIST_CUSTOMERS_ALSO_PURCHASED',
    writeProperty='score',
    similarityCutoff=0.82,
    sampleRate=1.0,
    maxIterations=1000,
    concurrency=20,
)

CPU times: user 42.3 ms, sys: 469 µs, total: 42.8 ms
Wall time: 1min 35s


In [353]:
# Drop the projection now that the kNN relationships have been written.
g.drop()

In [354]:
%%time
# Build candidate recommendations:
#   1. take HistEstCustomer nodes with at least one RECENTLY_PURCHASED relationship;
#   2. take their HISTORICALLY_PURCHASED articles dated after cutoff_date minus 42 days;
#   3. follow HIST_CUSTOMERS_ALSO_PURCHASED from those articles to candidate articles;
#   4. sum the relationship scores per (customerId, articleId).
# Rows are ordered by customerId, then by aggScore descending, so each
# customer's candidates arrive ranked best-first.
pred_df = gds.run_cypher('''
    MATCH(c:HistEstCustomer)-[:RECENTLY_PURCHASED]->()
    WITH DISTINCT c
    MATCH(c)-[r:HISTORICALLY_PURCHASED]->(a0) WHERE r.transactionDate > date($cutOffDate)
    WITH c, a0, r
    MATCH(a0)-[s:HIST_CUSTOMERS_ALSO_PURCHASED]->(a)
    RETURN c.customerId AS customerId, a.articleId AS articleId, sum(s.score) AS aggScore, max(r.transactionDate)
    ORDER BY customerId, aggScore DESC
''', params = {'cutOffDate':str(cutoff_date - timedelta(days=42))[:10]})

CPU times: user 3.65 s, sys: 24.2 ms, total: 3.67 s
Wall time: 24.6 s


In [355]:
# Display the candidate recommendations returned by the query.
pred_df

Unnamed: 0,customerId,articleId,aggScore,max(r.transactionDate)
0,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,0621381020,0.827440,2020-09-12
1,000fb6e772c5d0023892065e659963da90b1866035558e...,0864297004,0.829778,2020-09-10
2,001324f693acaea0dea35333ba00ccddd0162d8bc81e76...,0786187003,1.729961,2020-08-24
3,001b09b8679f1efaefebab00c6d7fdeb036e2ca015605f...,0848438002,0.835271,2020-09-03
4,001b09b8679f1efaefebab00c6d7fdeb036e2ca015605f...,0766495003,0.834960,2020-09-03
...,...,...,...,...
65837,fffae8eb3a282d8c43c77dd2ca0621703b71e90904dfde...,0709301001,0.820328,2020-08-12
65838,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,0908492004,0.836459,2020-08-08
65839,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,0752945002,0.826642,2020-08-08
65840,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,0746344002,0.823165,2020-08-08


In [315]:
# Fetch the (customerId, articleId) pair for every RECENTLY_PURCHASED
# relationship leaving a HistEstCustomer. There is no DISTINCT, so there is
# one row per relationship and repeat purchases appear more than once.
obs_df = gds.run_cypher('''
    MATCH(c:HistEstCustomer)-[r:RECENTLY_PURCHASED]->(a)
    WITH c.customerId AS customerId, a.articleId AS articleId
    RETURN customerId, articleId
''')

In [356]:
# One row per customer, with the observed article ids gathered into a list.
obs_eval_df = (
    obs_df
    .groupby('customerId')['articleId']
    .agg(lambda purchases: purchases.tolist())
    .reset_index(name='observedPurchases')
)

In [357]:
# One row per customer, with the predicted article ids gathered into a list
# in the order they appear in pred_df.
pred_eval_df = (
    pred_df
    .groupby('customerId')['articleId']
    .agg(lambda candidates: candidates.tolist())
    .reset_index(name='predictedPurchases')
)

In [358]:
# Pair each customer's observed purchases with their predictions.
# NOTE(review): the inner join keeps only customers present in both frames,
# so customers with recent purchases but no predictions are left out of the
# evaluation rather than scored as 0 - confirm this is intended.
eval_df = pd.merge(obs_eval_df, pred_eval_df, how='inner', on='customerId')
eval_df

Unnamed: 0,customerId,observedPurchases,predictedPurchases
0,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,[0827487003],[0621381020]
1,000fb6e772c5d0023892065e659963da90b1866035558e...,"[0889669006, 0913272003, 0786022008, 091327200...",[0864297004]
2,001324f693acaea0dea35333ba00ccddd0162d8bc81e76...,"[0868874006, 0707269004, 0707269004]",[0786187003]
3,001b09b8679f1efaefebab00c6d7fdeb036e2ca015605f...,"[0909357001, 0762063001]","[0848438002, 0766495003, 0783738001, 077754100..."
4,001c1f8d70782f450524d3b3f404474dbd4a7d0d2ad78a...,[0898886002],"[0798524001, 0717773005, 0712409005]"
...,...,...,...
13169,fff052a0464cf292d6b451da9dba86a299c52698a3b8c3...,"[0783346023, 0783346023, 0783346001, 0783346001]",[0946748004]
13170,fff2282977442e327b45d8c89afde25617d00124d0f999...,"[0891322004, 0759054001, 0697564010, 089132200...",[0809238001]
13171,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,[0833459002],"[0809238005, 0809238001, 0739569001, 079734300..."
13172,fffae8eb3a282d8c43c77dd2ca0621703b71e90904dfde...,"[0865624003, 0826150005, 0396135007, 081747200...","[0797892005, 0797892004, 0855148001, 090656900..."


In [359]:
from collections import Counter

def average_precision(true_list, predicted_list, at_k):
    """Compute a multiplicity-weighted average precision at ``at_k``.

    Each prediction within the first ``at_k`` positions that appears in
    ``true_list`` contributes ``count * hits_so_far / rank``, where ``count``
    is how many times the item occurs in ``true_list`` and ``hits_so_far`` is
    the number of distinct true items among the predictions up to that rank.
    The sum is divided by ``min(number of distinct true items, at_k)``.

    NOTE(review): because hits are weighted by their multiplicity in
    ``true_list``, the result can exceed 1.0 when an item was observed more
    than once. That weighting is kept unchanged here - confirm it is intended.

    Args:
        true_list: Observed item ids; repeats are allowed.
        predicted_list: Ranked predictions, best first. Any value that is not
            a ``list`` scores 0.0.
        at_k: Number of leading predictions to evaluate.

    Returns:
        The score as a float; 0.0 when there is nothing to score (non-list
        predictions, empty ``true_list``, or ``at_k`` below 1).
    """
    if not isinstance(predicted_list, list):
        return 0.0

    true_counts = Counter(true_list)
    denominator = min(len(true_counts), at_k)
    # Without this guard an empty true_list (or at_k == 0) divides by zero.
    if denominator <= 0:
        return 0.0

    weighted_precision = 0
    matched = set()  # distinct true items seen among the predictions so far
    for rank, candidate in enumerate(predicted_list[:at_k], start=1):
        # Dict membership replaces the original linear scan of true_list.
        if candidate in true_counts:
            matched.add(candidate)
            weighted_precision += true_counts[candidate] * len(matched) / rank
    return weighted_precision / denominator

In [360]:
# Score each customer's ranked predictions against their observed purchases,
# looking at the top 12 predictions.
eval_df['averagePrecisions'] = eval_df.apply(
    lambda rec: average_precision(rec['observedPurchases'], rec['predictedPurchases'], 12),
    axis=1,
)
eval_df

Unnamed: 0,customerId,observedPurchases,predictedPurchases,averagePrecisions
0,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,[0827487003],[0621381020],0.0
1,000fb6e772c5d0023892065e659963da90b1866035558e...,"[0889669006, 0913272003, 0786022008, 091327200...",[0864297004],0.0
2,001324f693acaea0dea35333ba00ccddd0162d8bc81e76...,"[0868874006, 0707269004, 0707269004]",[0786187003],0.0
3,001b09b8679f1efaefebab00c6d7fdeb036e2ca015605f...,"[0909357001, 0762063001]","[0848438002, 0766495003, 0783738001, 077754100...",0.0
4,001c1f8d70782f450524d3b3f404474dbd4a7d0d2ad78a...,[0898886002],"[0798524001, 0717773005, 0712409005]",0.0
...,...,...,...,...
13169,fff052a0464cf292d6b451da9dba86a299c52698a3b8c3...,"[0783346023, 0783346023, 0783346001, 0783346001]",[0946748004],0.0
13170,fff2282977442e327b45d8c89afde25617d00124d0f999...,"[0891322004, 0759054001, 0697564010, 089132200...",[0809238001],0.0
13171,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,[0833459002],"[0809238005, 0809238001, 0739569001, 079734300...",0.0
13172,fffae8eb3a282d8c43c77dd2ca0621703b71e90904dfde...,"[0865624003, 0826150005, 0396135007, 081747200...","[0797892005, 0797892004, 0855148001, 090656900...",0.0


In [361]:
# Mean of the per-customer average precisions over all rows of eval_df.
eval_df['averagePrecisions'].sum() / len(eval_df)

0.005806638831563467

In [362]:
pd.set_option("display.max_rows", 4000)
# Number of customers whose average precision is above zero.
len(eval_df.loc[eval_df['averagePrecisions'] > 0.0])

265