In [17]:
import os
import sys
import time
from collections import deque

# add the parent directory to the import path before importing `utils`
sys.path.append(os.path.pardir)

import babelnet as bn
import neo4j
from babelnet import BabelSynsetID, Language
from babelnet.data.relation import BabelPointer
from neo4j import GraphDatabase
from zerorpc import TimeoutExpired, LostRemote

from utils import utils

In [18]:
# Neo4j connection settings.
# Each value can be overridden through an environment variable so that
# credentials do not have to be edited into (or committed with) the notebook.
# The fallbacks are the values this notebook has always used, so behaviour is
# unchanged when no variable is set.
# NOTE(review): consider dropping the password fallback once NEO4J_PASSWORD is
# configured everywhere this notebook runs.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
AUTH = (
    os.environ.get("NEO4J_USER", "giovanni"),
    os.environ.get("NEO4J_PASSWORD", "BabeldistGraph"),
)

In [19]:
def run_no_exception(session: neo4j.Session, query: str):
    """Run ``query`` on ``session`` as a best-effort operation.

    Any exception raised while running the query is caught so the caller can
    carry on, but it is reported on stderr instead of being silently discarded.

    Args:
        session: Neo4j session the query is run on.
        query: Cypher statement to execute.

    Returns:
        None. Failures of the query are never propagated to the caller.
    """
    try:
        session.run(query)
    except Exception as e:
        # Deliberately broad: this helper exists to tolerate failures.
        # Still surface what went wrong so real problems are not hidden.
        print(f'Ignored error while running {query!r}: {e!r}', file=sys.stderr)

In [41]:
# Cypher statements used throughout the notebook.

# Creates (if missing) two Synset nodes keyed by synsetID and an IS_A
# relationship pointing from the hyponym to the synset `s`.
# Parameters: $synsetID_1 (the synset), $synsetID_2 (its hyponym).
merge_graph_query = """
MERGE (s:Synset {synsetID: $synsetID_1})
MERGE (hyponym:Synset {synsetID: $synsetID_2})
MERGE (s)<-[:IS_A]-(hyponym) """

# Number of Synset nodes currently in the graph.
count_nodes_query = """
MATCH (s:Synset)
RETURN count(s) """

# Number of IS_A relationships currently in the graph.
count_edges_query = """
MATCH ()-[r:IS_A]->()
RETURN count(r) """

# Shortest path between two synsets following IS_A relationships in either
# direction, limited to 10 hops.
# Parameters: $synsetID_1 and $synsetID_2 (the two endpoints).
shortestPath_query = """
MATCH (s1:Synset {synsetID: $synsetID_1})
MATCH (s2:Synset {synsetID: $synsetID_2})
MATCH p = shortestPath((s1)-[:IS_A*..10]-(s2))
RETURN p """

In [36]:
# EXPORTING BABELNET TO NEO4J - ONLY SYNSET IDs, NO LEMMA OR OTHER PROPERTIES
#
# Breadth-first walk over the BabelNet hyponym edges, starting from
# `start_synset_id`. For every (synset, hyponym) pair the merge query is run,
# so the graph ends up with (hyponym)-[:IS_A]->(synset) relationships.
# At most `max_visits` synsets are expanded; a short summary is written to a
# log file at the end.

fname = utils.get_current_logfile_number('exporting_neo4j', extension='.log')

max_visits, n = 100, 0
commit_every = 1000  # commit the open transaction every `commit_every` visits
start_synset_id = 'bn:00062164n' # physical entity
start_id = BabelSynsetID(start_synset_id)

# `visited` holds every ID that has ever been enqueued (including the start
# node), so a single set lookup is enough to avoid enqueueing an ID twice.
visited = {start_id}
q = deque([start_id])

# creating a driver isn't lightweight, but for this one-off export it is ok...
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session() as session:
        run_no_exception(session, 'CREATE CONSTRAINT FOR (s:Synset) REQUIRE s.synsetID IS UNIQUE')
        tx = session.begin_transaction()

        # Baseline counts, used to report how much this run added.
        start_n = tx.run(count_nodes_query).values()[0][0]
        start_r = tx.run(count_edges_query).values()[0][0]

        with open(fname, 'w') as logfname:
            start_t = time.time()
            while q and n < max_visits:
                n += 1
                pb = utils.get_progress_bar(int((n / max_visits) * 100))
                print(pb, end='\r')

                try:
                    synset = q.popleft().to_synset()
                    hyponym_edges = synset.outgoing_edges(BabelPointer.ANY_HYPONYM)
                except (TimeoutExpired, LostRemote) as e:
                    # Report the failure and skip this synset instead of aborting.
                    print(f'BabelNet lookup failed at visit {n}: {e!r}', file=sys.stderr)
                    hyponym_edges = []

                for edge in hyponym_edges:
                    try:
                        tx.run(merge_graph_query, {
                            'synsetID_1': str(synset.id),
                            'synsetID_2': str(edge.id_target) })
                    except Exception as e:
                        print(f'Merge failed for {synset.id} <- {edge.id_target}: {e!r}',
                              file=sys.stderr)

                    if edge.id_target not in visited:
                        visited.add(edge.id_target)
                        q.append(edge.id_target)

                # Commit in batches so a long run does not keep one huge transaction open.
                if n % commit_every == 0:
                    tx.commit()
                    tx = session.begin_transaction()

            end_n = tx.run(count_nodes_query).values()[0][0]
            end_r = tx.run(count_edges_query).values()[0][0]
            tx.commit()

            summary = f'Added {end_n - start_n} nodes, added {end_r - start_r} edges.'
            print(summary)

            logfname.write(f'start_node={start_synset_id}\n')
            logfname.write(f'max_visits={max_visits}\n')
            if not q: logfname.write('Queue empty\n')
            if n == max_visits: logfname.write('Reached max visits\n')
            logfname.write(summary + '\n')
            # Separate names so the builtin `min` is not shadowed in the kernel.
            elapsed_min, elapsed_sec = divmod(time.time() - start_t, 60)
            logfname.write(f'total_time,{int(elapsed_min)}m,{int(elapsed_sec)}s\n')

Added 10994 nodes, added 11046 edges.


In [37]:
# Display the synset IDs collected in `visited` by the export loop.
visited

{bn:15088619n,
 bn:01354293n,
 bn:13860447n,
 bn:15365424n,
 bn:16065594n,
 bn:15361808n,
 bn:02063665n,
 bn:16181452n,
 bn:08860223n,
 bn:15146556n,
 bn:16084811n,
 bn:02448737n,
 bn:23882717n,
 bn:00067319n,
 bn:16123255n,
 bn:08960646n,
 bn:25610039n,
 bn:16221175n,
 bn:09266541n,
 bn:16399429n,
 bn:14770561n,
 bn:00022875n,
 bn:15479027n,
 bn:16174695n,
 bn:16391749n,
 bn:11805040n,
 bn:00005109n,
 bn:14946113n,
 bn:02735455n,
 bn:00034104n,
 bn:13512269n,
 bn:18643830n,
 bn:00030144n,
 bn:00046620n,
 bn:16924506n,
 bn:16117649n,
 bn:04709834n,
 bn:01353081n,
 bn:16584098n,
 bn:17760443n,
 bn:15366166n,
 bn:14957061n,
 bn:15365754n,
 bn:16178104n,
 bn:16350542n,
 bn:00021396n,
 bn:09345749n,
 bn:14334673n,
 bn:16597964n,
 bn:02491227n,
 bn:01359208n,
 bn:18098794n,
 bn:00032322n,
 bn:23110305n,
 bn:00072147n,
 bn:16091891n,
 bn:01294332n,
 bn:00003342n,
 bn:16063007n,
 bn:16072448n,
 bn:16187547n,
 bn:01885981n,
 bn:00060819n,
 bn:16402931n,
 bn:16223141n,
 bn:01354242n,
 bn:176693

In [81]:
# TESTING SHORTEST PATH
import pandas as pd

# The two synsets whose connecting path is looked up.
shortest_path_params = {
    'synsetID_1': 'bn:00067319n',
    'synsetID_2': 'bn:14770561n',
}

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    # neo4j.Result.data is used as transformer, so `result` is the
    # transformed data rather than a (records, summary, keys) triple.
    result = driver.execute_query(
        shortestPath_query,
        shortest_path_params,
        database_='neo4j',
        result_transformer_=neo4j.Result.data,
    )
result

[{'p': [{'synsetID': 'bn:00067319n'},
   'IS_A',
   {'synsetID': 'bn:00053867n'},
   'IS_A',
   {'synsetID': 'bn:00062164n'},
   'IS_A',
   {'synsetID': 'bn:00058442n'},
   'IS_A',
   {'synsetID': 'bn:00035942n'},
   'IS_A',
   {'synsetID': 'bn:14770561n'}]}]

In [82]:
# Unwrap the path stored under 'p' in the first returned row.
# The guard makes this cell safe to run more than once: after unwrapping, the
# first item no longer has a 'p' key, so the cell becomes a no-op. An empty
# result (no rows returned) is left untouched instead of raising IndexError.
if result and isinstance(result[0], dict) and 'p' in result[0]:
    result = result[0]['p']

In [83]:
# Keep only the node entries of the path (dicts carrying a 'synsetID');
# the relationship names in between are plain strings and are skipped.
path_synset_ids = [
    step['synsetID']
    for step in result
    if type(step) is dict and 'synsetID' in step
]

# Print the main English lemma of each synset along the path.
for synset_id in path_synset_ids:
    main_sense = BabelSynsetID(synset_id).to_synset().main_sense(language=Language.EN)
    print(main_sense.full_lemma)

residue
matter
physical_entity
object
geological_formation
Williamsville_Formation


In [61]:
# Scratch inspection of the `main_sense` attribute of `bn.BabelSynset`.
# NOTE(review): the execution count is out of order and the recorded output
# does not look like it belongs to this expression — likely a leftover cell;
# confirm and remove if no longer needed.
bn.BabelSynset.main_sense

ResolvedIPv4Address(('127.0.0.1', 7687))