# https://www.kernix.com/blog/an-efficient-recommender-system-based-on-graph-database_p9

In [6]:
import pandas as pd
import numpy as np
import matplotlib as plt
from tqdm import tqdm
from py2neo import Graph
import pickle
from os import path
import time
%matplotlib inline

In [8]:
# Restore the set of repos already known from earlier runs, or start empty
# when no snapshot has been written yet.
process_directory = '../data/processed'
repos_pickle = process_directory + '/' + 'repos.pkl'
if not path.exists(repos_pickle):
    repo_set = set()
else:
    with open(repos_pickle, 'rb') as fh:
        repo_set = pickle.load(fh)

In [9]:
# Number of entries in the repo set restored from repos.pkl (0 when no snapshot exists).
len(repo_set)

754488

In [10]:
# Restore the set of users already known from earlier runs, or start empty
# when no snapshot has been written yet.
process_directory = '../data/processed'
if path.exists(process_directory + '/' + 'users.pkl'):
    # NOTE: pickle.load can execute arbitrary code; only load snapshots
    # written by this project.
    with open(process_directory + '/' + 'users.pkl', 'rb') as f:
        user_set = pickle.load(f)
else:
    # Initialise user_set here (the previous version reset repo_set instead,
    # leaving user_set undefined and discarding the loaded repos).
    user_set = set()

In [11]:
# Number of entries in the user set restored from users.pkl.
len(user_set)

589035

In [9]:
# Check whether any known repo entry has 843222 as its first element
# (843222 is the id labelled "sklearn" in the repo-similarity cell below).
[r for r in repo_set if r[0] == 843222]

[]

In [38]:
def create_graph_db(uri='http://neo4j:neo4j@localhost:11003/db/data/', auth=("neo4j", "saveme")):
    """Connect to the Neo4j server and create the indexes used by the import.

    Parameters
    ----------
    uri : str, optional
        Address handed to ``Graph``. Defaults to the local server this
        notebook was written against.
    auth : tuple of (str, str), optional
        ``(user, password)`` pair handed to ``Graph``.
        NOTE(review): the default keeps the original behaviour but embeds a
        password in source; prefer passing credentials read from the
        environment.

    Returns
    -------
    Graph
        The connected graph handle.
    """
    # Still need to figure out if it's possible to launch neo4j from python.
    graph = Graph(uri, auth=auth)
    # These are plain indexes (not uniqueness constraints) on the id
    # properties that the edge import uses to MATCH User and Repo nodes.
    for statement in ('CREATE INDEX ON :User(user_id)',
                      'CREATE INDEX ON :Repo(repo_id)'):
        graph.run(statement)
    return graph

def store_in_graph_db(date, graph_db, process_directory):
    """Load one day's (user, event, repo) triplets into Neo4j as relationships.

    Reads ``<process_directory>/<date>/triplets.csv`` and, for every row,
    MERGEs a relationship typed after the row's ``event_type`` (carrying a
    ``date`` property) from the matching ``User`` node to the matching
    ``Repo`` node.

    The statement only MATCHes its endpoints, so the ``User`` and ``Repo``
    nodes must already exist in the database; node creation is not performed
    here, and rows whose endpoints are missing create nothing.

    Parameters
    ----------
    date : str
        Name of the day's sub-directory (e.g. ``'2019-10-02'``); also stored
        on each relationship as its ``date`` property.
    graph_db : Graph
        Graph handle to write to.
    process_directory : str
        Directory that contains the per-day sub-directories.
    """
    # path.join copes with a trailing slash on process_directory.
    triplets = pd.read_csv(path.join(process_directory, date, 'triplets.csv'))

    # The relationship type is spliced into the statement text (between
    # backticks) because it is not passed as a query parameter; the ids and
    # the date are bound as parameters A, B and C.
    edge_statement1 = ("MATCH (u:`User`{user_id:{A}}) "
                       "MATCH (r:`Repo`{repo_id:{B}}) MERGE (u)-[e:`")
    edge_statement2 = "`{date:{C}}]->(r) RETURN e"

    # Use the handle passed in by the caller rather than a global `graph`.
    tx = graph_db.begin(autocommit=False)
    for i, row in tqdm(triplets.iterrows(), total=len(triplets)):
        edge_statement = edge_statement1 + row['event_type'] + edge_statement2
        tx.run(edge_statement, {"A": row['user_id'], "B": row['repo_id'], "C": date})
        # Send the queued statements to the server every 200 rows.
        if i % 200 == 0:
            tx.process()
    tx.commit()
    


In [58]:
# Flip to True on a first run to connect AND create the User/Repo indexes;
# otherwise just connect to the existing database.
create_graph = False
graph = (
    create_graph_db()
    if create_graph
    else Graph('http://neo4j:neo4j@localhost:11003/db/data/', auth=("neo4j", "saveme"))
)

# Days to import; add '2019-10-01' to the tuple to load that day as well.
for date in ('2019-10-02',):
    print(date)
    store_in_graph_db(date, graph, '../data/processed/')

2019-10-02


100%|██████████| 1968591/1968591 [55:26<00:00, 591.78it/s]  


In [40]:
# Load the 2019-10-01 triplets (one row per user/event/repo) for ad-hoc inspection.
triplets = pd.read_csv('../data/processed/2019-10-01/triplets.csv')

In [5]:
# (rows, columns) of the loaded triplets frame.
triplets.shape

(1984662, 5)

In [6]:
# Preview the first rows to see the available columns.
triplets.head()

Unnamed: 0,user_id,user_name,event_type,repo_id,repo_name
0,56006075,abbi2019-1776875,PushEvent,211975219,abbi2019-1776875/abbi2019-1776875.github.io
1,2333883,jancajthaml,CreateEvent,125103867,jancajthaml-openbank/health-check
2,32782481,vascor1,PushEvent,106878356,vascor1/vascor1.github.io
3,29764541,jeronimo-schreyer,PullRequestEvent,15634981,godotengine/godot
4,48139682,sjcondon,PushEvent,211766257,sjcondon/school-domain-online-web-pt-090819


In [7]:
# Every event recorded for user_id 100 (the user queried in the similarity cell below).
triplets[triplets.user_id == 100]

Unnamed: 0,user_id,user_name,event_type,repo_id,repo_name
532381,100,kmarsh,WatchEvent,186024298,nushell/nushell


In [8]:
# Every event recorded on repo_id 6761380.
triplets[triplets.repo_id == 6761380]

Unnamed: 0,user_id,user_name,event_type,repo_id,repo_name
270931,7001637,tidus2102,WatchEvent,6761380,vitalets/x-editable


In [61]:
# Rows whose repo_name contains 'pytorch' anywhere, so forks and unrelated
# repos with the word in their name match too; used to find pytorch's repo_id.
triplets[triplets['repo_name'].str.contains('pytorch')]

Unnamed: 0,user_id,user_name,event_type,repo_id,repo_name
1767,21957446,pytorchbot,PushEvent,65600975,pytorch/pytorch
2599,7608630,xush6528,PushEvent,65600975,pytorch/pytorch
2684,7608630,xush6528,PushEvent,65600975,pytorch/pytorch
2758,7608630,xush6528,PushEvent,65600975,pytorch/pytorch
3645,20346844,ceyzaguirre4,PushEvent,211686877,ceyzaguirre4/mac-network-pytorch
...,...,...,...,...,...
1983475,39814207,pull[bot],PullRequestEvent,119666381,Pandinosaurus/pytorch
1983520,23072278,daejungkim,WatchEvent,173498120,Daniil-Osokin/lightweight-human-pose-estimatio...
1983577,9958665,pritamdamania87,PushEvent,65600975,pytorch/pytorch
1983628,9958665,pritamdamania87,PushEvent,65600975,pytorch/pytorch


In [76]:
# Rows whose repo_name contains 'mermaid' anywhere; used to find the mermaid repo_id.
triplets[triplets['repo_name'].str.contains('mermaid')]

Unnamed: 0,user_id,user_name,event_type,repo_id,repo_name
69035,5697227,ayush987goyal,WatchEvent,26066727,knsv/mermaid
82852,462244,rhz,IssueCommentEvent,26066727,knsv/mermaid
113320,24523235,anlei-fu,WatchEvent,26066727,knsv/mermaid
144415,5067153,saanobhaai,CreateEvent,190568677,data-mermaid/mermaid-api
144752,5067153,saanobhaai,PullRequestEvent,190568677,data-mermaid/mermaid-api
...,...,...,...,...,...
1882152,6288799,tienhoah,PushEvent,188250918,data-mermaid/mermaid-dash
1886107,6288799,tienhoah,PushEvent,188250918,data-mermaid/mermaid-dash
1887554,1131070,juancri,IssuesEvent,185192018,tomoyukim/vscode-mermaid-editor
1892624,1131070,juancri,IssuesEvent,185192018,tomoyukim/vscode-mermaid-editor


In [72]:
# Number of WatchEvent rows per user_id, to spot users with several watched repos.
triplets[triplets.event_type == 'WatchEvent'].groupby(['user_id'])['user_id'].count()

user_id
100         1
118         1
137         1
160         5
185         1
           ..
10367163    1
10367166    1
10367239    1
10367292    1
10367421    1
Name: user_id, Length: 12548, dtype: int64

In [68]:
# Every event recorded for user_id 7576559.
triplets[triplets.user_id == 7576559]

Unnamed: 0,user_id,user_name,event_type,repo_id,repo_name
50853,7576559,wakaryry,WatchEvent,4920442,congmo/jQuery-Tags-Input
51437,7576559,wakaryry,WatchEvent,11534941,TimSchlechter/bootstrap-tagsinput
51675,7576559,wakaryry,WatchEvent,1606665,xoxco/jQuery-Tags-Input
51781,7576559,wakaryry,WatchEvent,11686981,sliptree/bootstrap-tokenfield
59662,7576559,wakaryry,WatchEvent,2715671,fancyapps/fancyBox
...,...,...,...,...,...
73165,7576559,wakaryry,WatchEvent,1211978,metafizzy/isotope
73172,7576559,wakaryry,WatchEvent,1308789,javve/list.js
73180,7576559,wakaryry,WatchEvent,9775802,patrickkunka/mixitup
73181,7576559,wakaryry,WatchEvent,3598373,square/crossfilter


In [55]:
# Load the 2019-10-01 per-user table (user_id, user_name and one numeric
# column per event type -- presumably event counts; confirm against the producer).
users = pd.read_csv('../data/processed/2019-10-01/users.csv')

In [56]:
# Number of rows in the per-user table.
len(users)

367698

In [10]:
# Snapshot the (user_id, user_name) pairs of the loaded users frame and
# persist them as the users.pkl file read at the top of the notebook.
user_pairs = users[['user_id', 'user_name']].values
user_set = set(map(tuple, user_pairs))
with open('../data/processed/users.pkl', 'wb') as out:
    pickle.dump(user_set, out)

In [6]:
# Preview the first rows of the per-user table.
users.head()

Unnamed: 0,user_id,user_name,CheckRunEvent,CheckSuiteEvent,CommitCommentEvent,ContentReferenceEvent,CreateEvent,DeleteEvent,DeployKeyEvent,DeploymentEvent,...,RepositoryDispatchEvent,RepositoryEvent,RepositoryImportEvent,RepositoryVulnerabilityAlertEvent,SecurityAdvisoryEvent,StarEvent,StatusEvent,TeamEvent,TeamAddEvent,WatchEvent
0,7.0,evanphx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,17.0,vanpelt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,26.0,topfunky,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,29.0,lukas,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,45.0,mojodna,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# Rows for user_id 201798: the output shows the same id under two different
# user_name values, so user_id alone does not identify a unique row.
users[users.user_id == 201798]

Unnamed: 0,user_id,user_name,CheckRunEvent,CheckSuiteEvent,CommitCommentEvent,ContentReferenceEvent,CreateEvent,DeleteEvent,DeployKeyEvent,DeploymentEvent,...,RepositoryDispatchEvent,RepositoryEvent,RepositoryImportEvent,RepositoryVulnerabilityAlertEvent,SecurityAdvisoryEvent,StarEvent,StatusEvent,TeamEvent,TeamAddEvent,WatchEvent
9809,201798.0,Christian-Kr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9810,201798.0,ckrippendorf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Load the 2019-10-02 per-repo table (repo_id, repo_name and one numeric
# column per event type -- presumably event counts; confirm against the producer).
repos = pd.read_csv('../data/processed/2019-10-02/repos.csv')

In [33]:
# Preview the first rows of the per-repo table.
repos.head()

Unnamed: 0,repo_id,repo_name,CheckRunEvent,CheckSuiteEvent,CommitCommentEvent,ContentReferenceEvent,CreateEvent,DeleteEvent,DeployKeyEvent,DeploymentEvent,...,RepositoryDispatchEvent,RepositoryEvent,RepositoryImportEvent,RepositoryVulnerabilityAlertEvent,SecurityAdvisoryEvent,StarEvent,StatusEvent,TeamEvent,TeamAddEvent,WatchEvent
0,363,collectiveidea/audited,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,426,haml/haml,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,507,sferik/twitter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
3,682,tobi/delayed_job,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,912,collectiveidea/awesome_nested_set,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# #### Create the user, repo nodes and edges between them
# # "MERGE" request : creates a new node if it does not exist already

# tx = graph.begin(autocommit=False)

# user_statement = "MERGE (a:`User`{user_id:{A}, user_name:{B}}) RETURN a"
# for i, row in tqdm(users.iterrows(), total = len(users)):
#     tx.run(user_statement, {"A": row['user_id'], "B": row['user_name']})
# tx.commit()


In [11]:
# tx = graph.begin(autocommit=False)

# repo_statement = "MERGE (a:`Repo`{repo_id:{A}, repo_name:{B}}) RETURN a"
# for i, row in tqdm(repos.iterrows(), total = len(repos)):
#     tx.run(repo_statement, {"A": row['repo_id'], "B": row['repo_name']})
# tx.commit()


100%|██████████| 66475/66475 [45:21<00:00, 24.43it/s]  


In [17]:
# tx = graph.begin(autocommit=False)

# edge_statement1 = ("MATCH (u:`User`{user_id:{A}}) "
#                      "MATCH (r:`Repo`{repo_id:{B}}) MERGE (u)-[e:`")
# edge_statement2 =  "`]->(r) RETURN e"
# for i, row in tqdm(triplets.iterrows(), total = len(triplets)) :
#     edge_statement = edge_statement1 + row['event_type'] + edge_statement2
#     tx.run(edge_statement, {"A": row['user_id'], "B": row['repo_id']})
#     if i % 200 == 0:
#         tx.process()
# tx.commit()

100%|██████████| 218939/218939 [06:59<00:00, 522.12it/s]


In [16]:
# graph.run('CREATE INDEX ON :User(user_id)')
# graph.run('CREATE INDEX ON :Repo(repo_id)')


<py2neo.database.Cursor at 0x132402d10>

In [96]:
user_id = 100
threshold = 0.5

# The similarity between two users u1 and u2 is the number of WatchEvent
# (star) edges u2 has on repos starred by u1, divided by the number of repos
# starred by u1.

query = (
  ### Similarity normalization: count the repos starred by u1 as count1 ###
  'MATCH (u1:`User` {user_id:{user_id}})-[:`WatchEvent`]->(r1:`Repo`) '
  'WITH count(r1) as count1 '
  ### Retrieve all users u2 who share at least one starred repo with u1 ###
  'MATCH (u1:`User` {user_id:{user_id}})-[:`WatchEvent`]->(r1:`Repo`) '
  'MATCH (r1)<-[w:`WatchEvent`]-(u2:`User`) '
  'WHERE NOT u2=u1 '
  # Compute similarity
  'WITH u2, count1, tofloat(count(w)) / count1 as sim '
  # The threshold filter is currently disabled, so every overlapping user is
  # returned; the `threshold` parameter is still sent but not referenced.
#   'WHERE sim>{threshold} '
  # Return the similar users ordered by similarity
  'RETURN DISTINCT u2.user_name, sim ORDER BY sim DESC ')

# Read-only query: run it through graph.run so no explicit transaction is
# left open (the earlier begin() was never committed or rolled back).
result = graph.run(query, {'user_id': user_id, 'threshold': threshold}).data()
result

[{'u2.user_name': 'comxd', 'sim': 1.0},
 {'u2.user_name': 'altmer', 'sim': 1.0},
 {'u2.user_name': 'rajeshmeniya', 'sim': 1.0}]

In [84]:
start = time.time()  # start timestamp for the commented-out timing at the end of this cell
# Target repo; alternatives are kept commented out for quick switching.
# #sklearn
# repo_id = 843222
# #pytorch
repo_id = 65600975
#mermaid-js
# repo_id = 26066727
threshold = 1

# Find similar repos: a repo r2 is similar to the target r1 when users who
# starred (WatchEvent) r1 also starred r2.

query = (
  ### Count the users who starred r1 as count1 (only needed by the normalized variant) ###
  'MATCH (u1:`User`)-[:`WatchEvent`]->(r1:`Repo` {repo_id: {repo_id}}) '
  'WITH count(u1) as count1 '
  ### Retrieve every other repo r2 starred by the users who starred r1 ###
  'MATCH (u1:`User`)-[:`WatchEvent`]->(r1:`Repo` {repo_id: {repo_id}}) '
  'MATCH (r2:`Repo`)<-[w:`WatchEvent`]-(u1) '
  'WHERE NOT r1=r2 '
  # Normalized variant (share of r1's stargazers), currently disabled:
#   'WITH r2, count1, tofloat(count(w)) / count1 as sim '
  # Active variant: raw number of shared WatchEvent edges
  'WITH r2, count1, tofloat(count(w)) as sim '
  # Keep repos r2 with at least `threshold` shared edges
  'WHERE sim>={threshold} '
  # Return the similar repos ordered by that count
  'RETURN DISTINCT r2.repo_name, sim ORDER BY sim DESC ')

# Read-only query: run it through graph.run so no explicit transaction is
# left open (the earlier begin() was never committed or rolled back).
result = graph.run(query, {'repo_id': repo_id, 'threshold': threshold}).data()
# NOTE: the label below is hard-coded; update it when switching repo_id.
print("Similar libraries to pytorch")
result[:10]

# end = time.time()
# end-start

Similar libraries to pytorch


[{'r2.repo_name': 'tensorflow/tensorflow', 'sim': 9.0},
 {'r2.repo_name': 'keras-team/keras', 'sim': 4.0},
 {'r2.repo_name': 'opencv/opencv', 'sim': 4.0},
 {'r2.repo_name': 'pandas-dev/pandas', 'sim': 3.0},
 {'r2.repo_name': 'apache/incubator-mxnet', 'sim': 3.0},
 {'r2.repo_name': 'laravel/laravel', 'sim': 2.0},
 {'r2.repo_name': 'vuejs/vue', 'sim': 1.0},
 {'r2.repo_name': 'galihx11/dotfiles', 'sim': 1.0},
 {'r2.repo_name': 'netblue30/firetools', 'sim': 1.0},
 {'r2.repo_name': 'tensorflow/io', 'sim': 1.0}]

In [None]:
threshold = 0.5

# Recommendation strategy:
#   * the similarity between two users u1 and u2 is the share of u1's starred
#     (WatchEvent) repos that u2 has starred too;
#   * the score of a repo r is the share of users similar to u1 who starred r.
# `user_id` is the value assigned in the user-similarity cell earlier in the notebook.

query = (
  ### Similarity normalization: count the repos starred by u1 as count1 ###
  'MATCH (u1:`User` {user_id:{user_id}})-[:`WatchEvent`]->(r1:`Repo`) '
  'WITH count(r1) as count1 '
  ### Score normalization: count the users considered similar to u1 as count2 ###
  # Retrieve all users u2 who share at least one starred repo with u1
  'MATCH (u1:`User` {user_id:{user_id}})-[:`WatchEvent`]->(r1:`Repo`) '
  'MATCH (r1)<-[w:`WatchEvent`]-(u2:`User`) '
  'WHERE NOT u2=u1 '
  # Compute similarity
  'WITH u2, count1, tofloat(count(w))/count1 as sim '
  # Keep users u2 whose similarity with u1 is above the threshold
  'WHERE sim>{threshold} '
  # Count the similar users
  'WITH count(u2) as count2, count1 '
  ### Recommendation ###
  # Retrieve the similar users again, this time keeping u1 and u2 bound
  'MATCH (u1:`User` {user_id:{user_id}})-[:`WatchEvent`]->(r1:`Repo`) '
  'MATCH (r1)<-[w:`WatchEvent`]-(u2:`User`) '
  'WHERE NOT u2=u1 '
  # count1 and count2 are carried explicitly so they stay in scope below
  'WITH u1, u2, count1, count2, tofloat(count(w))/count1 as sim '
  'WHERE sim>{threshold} '
  # Retrieve repos r starred by at least one similar user, but not by u1
  'MATCH (r:`Repo`)<-[w:`WatchEvent`]-(u2) '
  'WHERE NOT (r)<-[:`WatchEvent`]-(u1) '
  # Compute the score and return the suggestions ordered by score
  'WITH r, count2, tofloat(count(w)) as votes '
  'RETURN r.repo_name, votes/count2 as score ORDER BY score DESC ')

# Same call style as the other query cells (graph.run + .data()); the old
# graph.cypher.begin()/tx.append API is not what the rest of the notebook uses.
result = graph.run(query, {'user_id': user_id, 'threshold': threshold}).data()