In [101]:
import logging
import os
import random
import sys
import time
import traceback
from collections import deque

import neo4j
from neo4j import GraphDatabase
import babelnet as bn
from babelnet import BabelSynsetID, Language
from babelnet.data.relation import BabelPointer
from zerorpc import TimeoutExpired, LostRemote

sys.path.append(os.path.pardir)  # must run before the project-local `utils` import below
from utils import utils

In [18]:
# Neo4j connection settings. Each value can be overridden with an environment
# variable (NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD) so that real credentials do not
# have to live in the notebook; the literals are only fallbacks for the local
# development database.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
AUTH = (
    os.environ.get("NEO4J_USER", "giovanni"),
    os.environ.get("NEO4J_PASSWORD", "BabeldistGraph"),
)

In [19]:
def run_no_exception(session: neo4j.Session, query: str):
    """Run ``query`` on ``session`` as a best-effort statement.

    Any ``Exception`` raised by ``session.run`` is suppressed so that the caller
    keeps going (the export cell uses this for its ``CREATE CONSTRAINT`` statement,
    which presumably fails when the constraint already exists). The suppressed
    error is reported as a log warning instead of being discarded silently.

    Args:
        session: Object exposing ``run(query)``; an open Neo4j session.
        query: Cypher text to execute.

    Returns:
        None. The result of ``session.run`` is not returned.
    """
    try:
        session.run(query)
    except Exception as e:
        # Still best-effort: never re-raise, but leave a trace of what went wrong.
        logging.getLogger(__name__).warning(
            'Ignoring error while running %r: %s', query, e)

In [127]:
# Cypher statements used throughout the notebook. `$synsetID_1` / `$synsetID_2`
# are query parameters supplied at execution time.

# Upsert two Synset nodes and, when their IDs differ, an IS_A relationship pointing
# from the hyponym ($synsetID_2) to the synset ($synsetID_1). Both nodes are MERGEd
# before the WHERE filter, so a self-pair still creates the node but no self-loop.
merge_graph_query = """
MERGE (s:Synset {synsetID: $synsetID_1})
MERGE (hyponym:Synset {synsetID: $synsetID_2})
WITH s, hyponym
WHERE s.synsetID <> hyponym.synsetID
MERGE (s)<-[:IS_A]-(hyponym) """

# Total number of nodes labelled Synset.
count_nodes_query = """
MATCH (s:Synset)
RETURN count(s) """

# Total number of IS_A relationships.
count_edges_query = """
MATCH ()-[r:IS_A]->()
RETURN count(r) """

# Shortest path between two synsets following IS_A in either direction,
# limited to 10 hops.
shortestPath_query = """
MATCH (s1:Synset {synsetID: $synsetID_1})
MATCH (s2:Synset {synsetID: $synsetID_2})
MATCH p = shortestPath((s1)-[:IS_A*..10]-(s2))
RETURN p """

# Every path in which both synsets reach the same node by following outgoing IS_A
# relationships (at most 5 hops on each side). One record is returned per matching
# path, together with the node where the two chains meet.
first_common_node_query = """
MATCH (s1:Synset {synsetID: $synsetID_1})
MATCH (s2:Synset {synsetID: $synsetID_2})
MATCH p = (s1)-[:IS_A*..5]->(common_node:Synset)<-[:IS_A*..5]-(s2) 
RETURN p, common_node """

In [93]:
# EXPORTING BABELNET TO NEO4J - ONLY SYNSET IDs, NO LEMMA OR OTHER PROPERTIES
#
# Breadth-first crawl over BabelNet hyponym edges starting at `start_synset_id`.
# Each (synset, hyponym) pair is merged into Neo4j with `merge_graph_query`, and a
# short summary of the run is written to a numbered log file at the end.

fname = utils.get_current_logfile_number('exporting_neo4j', extension='.log')

max_visits, n = 500, 0
commit_every = 1000    # number of visits between intermediate commits
start_synset_id = 'bn:00062164n'
start_id = BabelSynsetID(start_synset_id)

# `visited` holds every synset ID that has ever been enqueued (not only the ones
# already expanded), so a set lookup is enough to decide whether to enqueue an ID.
# Seeding it with the start ID prevents the root from being queued a second time.
visited = {start_id}
q = deque([start_id])    # FIFO queue; popleft() is O(1), unlike list.pop(0)

# creating driver isn't lightweight, but for this case is ok...
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session() as session:
        run_no_exception(session, 'CREATE CONSTRAINT FOR (s:Synset) REQUIRE s.synsetID IS UNIQUE')
        tx = session.begin_transaction()

        # Baseline counts, used to report how much this run added.
        start_n = tx.run(count_nodes_query).values()[0][0]
        start_r = tx.run(count_edges_query).values()[0][0]

        with open(fname, 'w') as logfname:
            start_t = time.time()
            while q and n < max_visits:
                pb = utils.get_progress_bar(int((n / max_visits) * 100))
                print(pb, end='\r')
                n += 1

                try:
                    synset = q.popleft().to_synset()
                    hyponym_edges = synset.outgoing_edges(BabelPointer.ANY_HYPONYM)
                except (TimeoutExpired, LostRemote):
                    # The BabelNet RPC call failed: report it and skip this synset.
                    traceback.print_exc()
                    hyponym_edges = []

                for edge in hyponym_edges:
                    try:
                        tx.run(merge_graph_query, {
                            'synsetID_1': str(synset.id),
                            'synsetID_2': str(edge.id_target)})
                    except Exception:
                        # Report and carry on with the next edge.
                        # NOTE(review): a failed statement may leave `tx` unusable for
                        # the rest of the batch - confirm against the driver's behaviour.
                        traceback.print_exc()

                    if edge.id_target not in visited:
                        visited.add(edge.id_target)
                        q.append(edge.id_target)

                if n % commit_every == 0:
                    tx.commit()
                    tx = session.begin_transaction()

            end_n = tx.run(count_nodes_query).values()[0][0]
            end_r = tx.run(count_edges_query).values()[0][0]
            tx.commit()
            summary = f'Added {end_n - start_n} nodes, added {end_r - start_r} edges.'
            print(summary)

            logfname.write(f'start_node={start_synset_id}\n')
            logfname.write(f'max_visits={max_visits}\n')
            if not q: logfname.write('Queue empy\n')
            if n == max_visits: logfname.write('Reached max visits\n')
            # Terminate the summary line so it does not run into `total_time`.
            logfname.write(summary + '\n')
            end_t = time.time()
            # Named `minutes`/`seconds` so the builtin `min` is not rebound for later cells.
            minutes, seconds = divmod(end_t - start_t, 60)
            logfname.write(f'total_time,{int(minutes)}m,{int(seconds)}s')

0 

Added 5092 nodes, added 5251 edges.


In [96]:
# Number of synset IDs collected by the export cell. IDs are added to `visited`
# when they are enqueued, so this can exceed `max_visits`.
len(visited)

16083

In [111]:
# Materialise the collected IDs as a list so they can be indexed for random sampling.
visited = [*visited]

In [None]:
# Draw two *distinct* synset IDs. Two independent `random.choice` calls could return
# the same ID twice, which gives a degenerate pair for the path queries below.
id1, id2 = random.sample(visited, 2)

In [125]:
# Show the two sampled synsets next to their BabelNet IDs.
id1.to_synset(), id1.id, id2.to_synset(), id2.id

(234820238__WIKI:EN:San_Vittore,_Calcio,
 'bn:17745081n',
 77765359__WIKI:EN:Metsä_Board,
 'bn:03835703n')

In [126]:
# Resolve both IDs to synsets and display their main glosses.
s1 = id1.to_synset()
s2 = id2.to_synset()
(s1.main_gloss(), s2.main_gloss())

(San Vittore is a neoclassical-style Roman Catholic parish church in the town of Calcio, province of Bergamo, region of Lombardy, Italy.,
 Metsä Board Oyj, previously known as M-real Corporation, is a leading European producer of premium fresh fibre paperboards including folding boxboards, food service boards and white kraftliners.)

In [122]:
# SHORTEST PATH
# Ask Neo4j for the shortest IS_A path between the two sampled synsets.
shortest_path_params = {
    'synsetID_1': id1.id,
    'synsetID_2': id2.id,
}

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    # With Result.data as transformer the call yields a list of plain dicts
    # (one per record) rather than records, summary, keys.
    result = driver.execute_query(
        shortestPath_query,
        shortest_path_params,
        database_='neo4j',
        result_transformer_=neo4j.Result.data,
    )
result

[{'p': [{'synsetID': 'bn:17745081n'},
   'IS_A',
   {'synsetID': 'bn:00051760n'},
   'IS_A',
   {'synsetID': 'bn:00058442n'},
   'IS_A',
   {'synsetID': 'bn:00062164n'},
   'IS_A',
   {'synsetID': 'bn:00053867n'},
   'IS_A',
   {'synsetID': 'bn:00060464n'},
   'IS_A',
   {'synsetID': 'bn:03835703n'}]}]

In [123]:
# Unwrap the path stored under 'p' in the first record. The guard keeps the cell
# safe to re-run (once unwrapped, `result[0]` is a node dict without a 'p' key) and
# avoids an IndexError when the query returned no record.
if isinstance(result, list) and result and isinstance(result[0], dict) and 'p' in result[0]:
    result = result[0]['p']

In [124]:
# Print the English main-sense lemma of every node on the path. Entries that are
# not node dicts (the relationship names) are skipped.
for step in result:
    if type(step) is not dict or 'synsetID' not in step:
        continue
    synset_on_path = BabelSynsetID(step['synsetID']).to_synset()
    print(synset_on_path.main_sense(language=Language.EN).full_lemma)

San_Vittore,_Calcio
location
object
physical_entity
matter
paper
Metsä_Board


In [138]:
# FIRST COMMON NODE BETWEEN TWO SYNSETS
# Collect every path on which both synsets reach a shared node via outgoing IS_A hops.
common_node_params = {
    'synsetID_1': id1.id,
    'synsetID_2': id2.id,
}

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    # With Result.data as transformer the call yields a list of plain dicts
    # (one per record) rather than records, summary, keys.
    result = driver.execute_query(
        first_common_node_query,
        common_node_params,
        database_='neo4j',
        result_transformer_=neo4j.Result.data,
    )

In [139]:
# Number of records returned, i.e. candidate paths through a common node.
len(result)

2

In [140]:
# Number of IS_A hops of every candidate path, used to choose the shortest path.
# Each `p` alternates node, relationship name, node, ... so a path with k hops has
# 2k + 1 entries. Works for any number of records, not just two.
tuple((len(record['p']) - 1) // 2 for record in result)

(6, 8)

In [None]:
# Keep the record whose path `p` has the fewest entries, i.e. the shortest candidate,
# instead of assuming it is the first one. `sorted` is stable, so on ties the
# earliest record is kept.
result = sorted(result, key=lambda record: len(record['p']))[0]

In [145]:
# Inspect the selected record: the full path `p` and the `common_node` on it.
result

{'p': [{'synsetID': 'bn:17745081n'},
  'IS_A',
  {'synsetID': 'bn:00051760n'},
  'IS_A',
  {'synsetID': 'bn:00058442n'},
  'IS_A',
  {'synsetID': 'bn:00062164n'},
  'IS_A',
  {'synsetID': 'bn:00053867n'},
  'IS_A',
  {'synsetID': 'bn:00060464n'},
  'IS_A',
  {'synsetID': 'bn:03835703n'}],
 'common_node': {'synsetID': 'bn:00062164n'}}

In [149]:
# Reduce the path to the ordered list of synset IDs (relationship names, which are
# not dicts, are dropped) and keep the common node record alongside it.
node_entries = [entry for entry in result['p'] if type(entry) is dict]
path = [node['synsetID'] for node in node_entries]
common_node = result['common_node']
(path, common_node)

(['bn:17745081n',
  'bn:00051760n',
  'bn:00058442n',
  'bn:00062164n',
  'bn:00053867n',
  'bn:00060464n',
  'bn:03835703n'],
 {'synsetID': 'bn:00062164n'})

In [154]:
# Split the path at the common node into two chains, each running from one of the
# sampled synsets up to (and including) the common node.
split_at = path.index(common_node['synsetID'])
from_s1 = path[:split_at + 1]
from_s2 = path[split_at:][::-1]    # reversed so it starts at the second synset
(from_s1, from_s2)

(['bn:17745081n', 'bn:00051760n', 'bn:00058442n', 'bn:00062164n'],
 ['bn:03835703n', 'bn:00060464n', 'bn:00053867n', 'bn:00062164n'])