In [141]:
import os
import sys
sys.path.append(os.path.pardir)
import numpy as np

import neo4j
from neo4j import GraphDatabase

from babelnet import BabelSynsetID, Language
from babelnet.data.relation import BabelPointer

from zerorpc import TimeoutExpired, LostRemote

In [142]:
# Connection settings for the Neo4j instance. Each value can be overridden through an
# environment variable (NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD) so that credentials do
# not have to live in the notebook.
# NOTE(review): the literal fallbacks keep the notebook runnable exactly as before, but
# the password below should be removed from source (and rotated) once the environment
# variables are in place.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
AUTH = (
    os.environ.get("NEO4J_USER", "giovanni"),
    os.environ.get("NEO4J_PASSWORD", "BabeldistGraph"),
)

In [143]:
# Returns one Synset's synsetID chosen at random: every matched node is paired with a
# rand() value, rows are ordered by that value and only the first row is returned.
random_node_query = " MATCH (a:Synset) RETURN a.synsetID, rand() as r ORDER BY r LIMIT 1"

# Count all Synset nodes / all directed IS_A relationships.
count_nodes_query = " MATCH (s:Synset) RETURN count(s) "
count_edges_query = " MATCH ()-[r:IS_A]->() RETURN count(r) "

# Shortest path between the synsets bound to $synsetID_1 and $synsetID_2, following
# IS_A relationships in either direction, limited to 12 hops.
shortestPath_query = """
MATCH (s1:Synset {synsetID: $synsetID_1})
MATCH (s2:Synset {synsetID: $synsetID_2})
MATCH p = shortestPath((s1)-[:IS_A*..12]-(s2))
RETURN p """

# Paths that follow outgoing IS_A relationships from each of the two synsets (at most
# 5 hops per side) and meet at a shared Synset node; each row carries the path and the
# meeting node. One row is returned per matching path, so several rows are possible.
# NOTE(review): `*..5` has no explicit lower bound; presumably this means at least one
# hop per side, so a synset that is itself an ancestor of the other would not be
# reported as the meeting node - confirm against the Cypher manual.
first_common_node_query = """
MATCH (s1:Synset {synsetID: $synsetID_1})
MATCH (s2:Synset {synsetID: $synsetID_2})
MATCH p = (s1)-[:IS_A*..5]->(common_node:Synset)<-[:IS_A*..5]-(s2) 
RETURN p, common_node """

# Longest path (1 to 12 IS_A hops) that ends at the synset bound to $root_node and
# starts from a Synset with no incoming IS_A relationship.
max_length_taxonomy_query = """ 
MATCH p=(root:Synset {synsetID: $root_node})<-[:IS_A*1..12]-(child:Synset) 
WHERE NOT (child)<-[:IS_A]-()
RETURN p ORDER BY length(p) DESC LIMIT 1 """

In [144]:
# Synset ID used as the taxonomy root by the root-distance and max-length helpers.
root_node_id_str = 'bn:00062164n' # physical entity

In [145]:
# Single driver instance passed to every query helper in this notebook.
driver = GraphDatabase.driver(URI, auth=AUTH)

In [146]:
# Draw two synsets at random (one query per draw), then keep only their IDs.
_random_rows = [
    driver.execute_query(random_node_query, result_transformer_=neo4j.Result.data)
    for _ in range(2)
]
id1_str, id2_str = (rows[0]['a.synsetID'] for rows in _random_rows)
id1_str, id2_str

('bn:16072839n', 'bn:03321682n')

In [147]:
# Wrap both sampled IDs first, then resolve each one to its synset.
id1 = BabelSynsetID(id1_str)
id2 = BabelSynsetID(id2_str)
s1 = id1.to_synset()
s2 = id2.to_synset()
s1, s2

(113975308__WIKI:EN:Crow_Mountain_Formation, 50313521__WIKI:EN:Treadmilling)

In [148]:
def get_shortest_path(driver, s1_id: str, s2_id: str):
    """Run ``shortestPath_query`` for a pair of synset IDs.

    Args:
        driver: Object exposing ``execute_query`` (the notebook passes its neo4j driver).
        s1_id: Value bound to the ``$synsetID_1`` query parameter.
        s2_id: Value bound to the ``$synsetID_2`` query parameter.

    Returns:
        Whatever ``execute_query`` produces when run against the ``'neo4j'`` database
        with ``neo4j.Result.data`` as the result transformer.
    """
    query_params = dict(synsetID_1=s1_id, synsetID_2=s2_id)
    return driver.execute_query(
        shortestPath_query,
        query_params,
        result_transformer_=neo4j.Result.data,
        database_='neo4j',
    )

In [149]:
def get_shortest_path_length(driver, s1_id: str, s2_id: str):
    """Count the ``'IS_A'`` entries in the first path from ``get_shortest_path``.

    Args:
        driver: Forwarded unchanged to ``get_shortest_path``.
        s1_id: ID of the first synset.
        s2_id: ID of the second synset.

    Returns:
        Number of ``'IS_A'`` items in the ``'p'`` value of the first returned row.

    Raises:
        IndexError: If ``get_shortest_path`` returns no rows.
    """
    rows = get_shortest_path(driver, s1_id, s2_id)
    first_path = rows[0]['p']
    return first_path.count('IS_A')

In [150]:
def shortest_path_synsets_lemmas(shortest_path):
    """Print the main-sense lemma of every synset node found in a path.

    Args:
        shortest_path: Iterable of path items. Only ``dict`` items that contain a
            ``'synsetID'`` key are looked up; everything else is skipped.
            Presumably this is the ``'p'`` list of one row returned by
            ``get_shortest_path`` - verify against the caller.

    Returns:
        None. One line is printed per synset node: its full lemma, or a description
        of the error if the lookup failed.
    """
    for item in shortest_path:
        if not (isinstance(item, dict) and 'synsetID' in item):
            continue
        try:
            print(BabelSynsetID(item['synsetID']).to_synset().main_sense().full_lemma)
        except Exception as e:
            # Best-effort listing: report the failure and move on to the next node.
            # Exceptions raised without arguments have an empty ``args`` tuple, so
            # fall back to the repr instead of failing inside the handler.
            print(e.args[0] if e.args else repr(e))

In [151]:
def _common_node_arm_lengths(row):
    """Return (hops from s1, hops from s2) to the common node of one result row."""
    # Paths mix node dicts with relationship entries; keep the node IDs only.
    node_ids = [step['synsetID'] for step in row['p'] if isinstance(step, dict)]
    meet_at = node_ids.index(row['common_node']['synsetID'])
    return meet_at, len(node_ids) - 1 - meet_at


def get_lcs_depth(driver, s1_id: str, s2_id: str):
    """Return the hop count to the closest common node of two synsets.

    Runs ``first_common_node_query`` and, for every returned row, splits the path at
    the common node to get the number of hops from each synset. When several rows
    come back, the one with the fewest total hops is used (ties go to the row whose
    longer side is shorter), so the result no longer depends on the order in which
    the database returns rows.

    Args:
        driver: Object exposing ``execute_query`` (the notebook passes its neo4j driver).
        s1_id: Value bound to the ``$synsetID_1`` query parameter.
        s2_id: Value bound to the ``$synsetID_2`` query parameter.

    Returns:
        The larger of the two hop counts (s1 -> common node, s2 -> common node) for
        the selected row.

    Raises:
        IndexError: If the query returns no rows. The exception type matches what
            indexing the empty result raised before, now with a descriptive message.
    """
    rows = driver.execute_query(
        first_common_node_query,
        {'synsetID_1': s1_id,
         'synsetID_2': s2_id},
        database_='neo4j',
        result_transformer_=neo4j.Result.data)

    if not rows:
        raise IndexError(
            f"no common node found for {s1_id!r} and {s2_id!r}")

    arm_lengths = [_common_node_arm_lengths(row) for row in rows]
    closest = min(arm_lengths, key=lambda arms: (sum(arms), max(arms)))
    return max(closest)

In [152]:
def get_distance_from_root_node(driver, s1_id: str):
    """Return the number of IS_A hops between a synset and the taxonomy root.

    The root is the module-level ``root_node_id_str``. The distance comes from
    ``shortestPath_query``, run against the ``'neo4j'`` database.

    Args:
        driver: Object exposing ``execute_query`` (the notebook passes its neo4j driver).
        s1_id: ID of the synset whose distance from the root is wanted.

    Returns:
        ``0`` when ``s1_id`` is the root itself; otherwise the number of ``'IS_A'``
        entries in the path of the first returned row.

    Raises:
        IndexError: If the query returns no rows.
    """
    if s1_id == root_node_id_str:
        # A shortest-path search needs two distinct endpoints, so asking the database
        # for root -> root cannot produce a usable row. The answer is 0 by definition.
        return 0

    rows = driver.execute_query(
        shortestPath_query,
        {'synsetID_1': s1_id,
         'synsetID_2': root_node_id_str},
        database_='neo4j',
        result_transformer_=neo4j.Result.data)
    return rows[0]['p'].count('IS_A')

In [153]:
def get_max_taxonomy_length(driver, root_node_id: str):
    """Return the IS_A hop count of the longest path found below a root synset.

    Runs ``max_length_taxonomy_query`` with ``root_node_id`` bound to ``$root_node``.
    The argument is now actually used; previously the module-level
    ``root_node_id_str`` was sent regardless of what the caller passed. The query is
    run against the ``'neo4j'`` database, like the other query helpers.

    Args:
        driver: Object exposing ``execute_query`` (the notebook passes its neo4j driver).
        root_node_id: Synset ID to use as the root of the taxonomy.

    Returns:
        Number of ``'IS_A'`` entries in the path of the first returned row.

    Raises:
        IndexError: If the query returns no rows.
    """
    rows = driver.execute_query(
        max_length_taxonomy_query,
        {'root_node': root_node_id},
        database_='neo4j',
        result_transformer_=neo4j.Result.data)
    return rows[0]['p'].count('IS_A')

In [154]:
def wup_similarity(driver, s1_id: str, s2_id: str):
    """Ratio of ``get_lcs_depth`` to the summed root distances of two synsets.

    Args:
        driver: Forwarded unchanged to the helper functions.
        s1_id: ID of the first synset.
        s2_id: ID of the second synset.

    Returns:
        ``get_lcs_depth(s1, s2) / (dist(s1, root) + dist(s2, root))``.

    Raises:
        ZeroDivisionError: If both root distances are 0.

    NOTE(review): the canonical Wu-Palmer measure is
    ``2 * depth(LCS) / (depth(s1) + depth(s2))`` with depths measured from the root.
    This function has no factor 2 and takes its numerator from ``get_lcs_depth`` -
    confirm that this is the intended formula.
    """
    numerator = get_lcs_depth(driver, s1_id, s2_id)
    root_distances = [
        get_distance_from_root_node(driver, synset_id)
        for synset_id in (s1_id, s2_id)
    ]
    return numerator / sum(root_distances)

In [156]:
def lcs_similarity(driver, s1_id: str, s2_id: str):
    """Negative natural log of the shortest-path length scaled by twice the taxonomy length.

    Args:
        driver: Forwarded unchanged to the helper functions.
        s1_id: ID of the first synset.
        s2_id: ID of the second synset.

    Returns:
        ``-ln(sp_length / (2 * d))`` where ``d`` is
        ``get_max_taxonomy_length(driver, root_node_id_str)`` and ``sp_length`` is
        ``get_shortest_path_length(driver, s1_id, s2_id)``.

    NOTE(review): the expression has the shape of the Leacock-Chodorow measure, so
    the ``lcs_`` prefix may be a slip for ``lch_`` - confirm before renaming.
    """
    taxonomy_length = get_max_taxonomy_length(driver, root_node_id_str)
    hops = get_shortest_path_length(driver, s1_id, s2_id)
    scaled = hops / (2 * taxonomy_length)
    return -np.log(scaled)

In [155]:
# wup_similarity score for the two randomly sampled synsets.
wup_similarity(driver, id1_str, id2_str)

0.5

In [157]:
# lcs_similarity score for the same pair of synsets.
lcs_similarity(driver, id1_str, id2_str)

1.0986122886681098

In [158]:
# Queries are done: close the driver.
driver.close()