In [7]:
import os
import sys
sys.path.append('..')
from utils import utils
import numpy as np

import neo4j
from neo4j import GraphDatabase

In [23]:
from babelnet import BabelSynsetID, Language

2023-09-11 09:55:06,447 [babelnet.conf] INFO: Loaded configuration from ['/home/giovanni/unimore/2_graph_analytics/tesina/babeldist/remote/babelnet_conf.yml']
2023-09-11 09:55:06,453 [babelnet.api] INFO: BabelNet Remote Procedure Call API v1.1.0


TimeoutExpired: timeout after 60s, when calling remote method version

In [2]:
# Neo4j connection settings. Each value can be overridden through an
# environment variable so that credentials do not have to live in the notebook;
# the literals are kept only as backward-compatible local defaults.
# NOTE(review): consider dropping the default password once the environment is set up.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
AUTH = (
    os.environ.get("NEO4J_USER", "giovanni"),
    os.environ.get("NEO4J_PASSWORD", "BabeldistGraph"),
)

In [3]:
# Pick one Synset at random: every Synset row gets a rand() value, rows are
# ordered by it and only the first is kept.
random_node_query = " MATCH (a:Synset) RETURN a.synsetID as synsetID, rand() as r ORDER BY r LIMIT 1"

# Graph size: number of Synset nodes and number of IS_A relationships.
count_nodes_query = " MATCH (s:Synset) RETURN count(s) as numNodes"
count_edges_query = " MATCH ()-[r:IS_A]->() RETURN count(r) as numEdges"

# Shortest path between two synsets following IS_A in either direction,
# limited to 12 hops. Parameters: $synsetID_1, $synsetID_2.
shortestPath_query = """
MATCH (s1:Synset {synsetID: $synsetID_1})
MATCH (s2:Synset {synsetID: $synsetID_2})
MATCH p = shortestPath((s1)-[:IS_A*..12]-(s2))
RETURN p as shortestPath"""

# Paths s1 -> ... -> common_node <- ... <- s2 where both sides follow outgoing
# IS_A relationships, at most 5 hops each. One row is returned per matching
# path, so several rows (and several common nodes) are possible.
# NOTE(review): `*..5` has no explicit lower bound; Cypher's default minimum is
# 1 hop, so a synset that is itself an ancestor of the other is presumably not
# reported as the common node - confirm this is intended.
first_common_node_query = """
MATCH (s1:Synset {synsetID: $synsetID_1})
MATCH (s2:Synset {synsetID: $synsetID_2})
MATCH p = (s1)-[:IS_A*..5]->(common_node:Synset)<-[:IS_A*..5]-(s2) 
RETURN p as path, common_node """

# Longest chain (1 to 12 hops) of IS_A relationships pointing towards the node
# with synsetID $root_node, starting from a node that has no incoming IS_A
# relationship. Only the single longest path is returned.
max_length_taxonomy_query = """ 
MATCH p=(root:Synset {synsetID: $root_node})<-[:IS_A*1..12]-(child:Synset) 
WHERE NOT (child)<-[:IS_A]-()
RETURN p as path ORDER BY length(p) DESC LIMIT 1 """

In [4]:
# synsetID used as the root of the taxonomy by the distance/depth helpers below.
root_node_id_str = 'bn:00062164n' # physical entity

In [7]:
def get_random_synset_id(driver):
    """Return the ``synsetID`` of one randomly selected ``Synset`` node.

    Args:
        driver: An open Neo4j driver; only ``execute_query`` is used.

    Returns:
        The ``synsetID`` value of the single row produced by
        ``random_node_query``.

    Raises:
        IndexError: If the query returns no rows (no ``Synset`` nodes).
    """
    # Target the same database as every other helper instead of relying on the
    # user's home database being resolved implicitly.
    records = driver.execute_query(
        random_node_query,
        database_='neo4j',
        result_transformer_=neo4j.Result.data)
    return records[0]['synsetID']

In [8]:
def get_shortest_path(driver, s1_id: str, s2_id: str):
    """Run ``shortestPath_query`` for a pair of synset IDs.

    Args:
        driver: An open Neo4j driver; only ``execute_query`` is used.
        s1_id: Value bound to the ``$synsetID_1`` query parameter.
        s2_id: Value bound to the ``$synsetID_2`` query parameter.

    Returns:
        The query result as transformed by ``neo4j.Result.data``; each record
        exposes the path under the ``'shortestPath'`` key.
    """
    endpoints = {'synsetID_1': s1_id, 'synsetID_2': s2_id}
    return driver.execute_query(
        shortestPath_query,
        endpoints,
        database_='neo4j',
        result_transformer_=neo4j.Result.data,
    )

In [9]:
def get_shortest_path_length(driver, s1_id: str, s2_id: str):
    """Return the number of ``IS_A`` hops on the shortest path between two synsets.

    The length is obtained by counting the ``'IS_A'`` entries in the
    ``'shortestPath'`` value of the first record returned by
    ``get_shortest_path``.

    Raises:
        IndexError: If ``get_shortest_path`` returns no records.
    """
    records = get_shortest_path(driver, s1_id, s2_id)
    path_items = records[0]['shortestPath']
    return path_items.count('IS_A')

In [10]:
def shortest_path_synsets_lemmas(driver, s1_id: str, s2_id: str):
    """Print the main lemma of every synset on the shortest path between two synsets.

    Each node of the path (the dict entries carrying a ``'synsetID'``) is
    resolved through BabelNet and its main sense's ``full_lemma`` is printed.
    A failed lookup is reported on its own line and the walk continues.

    Args:
        driver: An open Neo4j driver.
        s1_id: synsetID of the first endpoint.
        s2_id: synsetID of the second endpoint.

    Raises:
        IndexError: If no shortest path record is returned.
    """
    shortest_path = get_shortest_path(driver, s1_id, s2_id)[0]['shortestPath']
    for item in shortest_path:
        # Relationship entries are not dicts; only node entries are resolved.
        if isinstance(item, dict) and 'synsetID' in item:
            try:
                print(BabelSynsetID(item['synsetID']).to_synset().main_sense().full_lemma)
            except Exception as e:
                # Best effort: keep printing the rest of the path. Exceptions
                # raised without arguments have an empty `args`, so fall back
                # to repr() instead of failing inside the handler.
                print(e.args[0] if e.args else repr(e))

In [11]:
def get_lcs_depth(driver, s1_id: str, s2_id: str):
    """Return the hop count between two synsets and their closest common ancestor.

    ``first_common_node_query`` yields one row per path
    ``s1 -> ... -> common_node <- ... <- s2``. For every row the number of hops
    from each synset to the common node is computed; the row with the fewest
    total hops is selected (ties are broken by the smaller of the two maxima,
    so the result does not depend on row order) and the larger of its two hop
    counts is returned.

    NOTE(review): despite the name, the value is a distance from the synsets to
    the common node, not the depth of that node below the root - confirm that
    this is what ``wup_similarity`` expects.

    Args:
        driver: An open Neo4j driver; only ``execute_query`` is used.
        s1_id: synsetID of the first synset.
        s2_id: synsetID of the second synset.

    Returns:
        ``max(hops from s1, hops from s2)`` for the selected common node.

    Raises:
        IndexError: If the query finds no common node for the pair.
    """
    records = driver.execute_query(
        first_common_node_query,
        {'synsetID_1': s1_id,
         'synsetID_2': s2_id},
        database_='neo4j',
        result_transformer_=neo4j.Result.data)

    if not records:
        # Same exception type as the former `result[0]`, with a usable message.
        raise IndexError(f'no common node found for {s1_id} and {s2_id}')

    def _hops_to_common(record):
        """Return (hops from s1, hops from s2) to the record's common node."""
        # Keep only node entries of the path; relationship entries are not dicts.
        node_ids = [item['synsetID'] for item in record['path'] if type(item) is dict]
        from_s1 = node_ids.index(record['common_node']['synsetID'])
        from_s2 = len(node_ids) - 1 - from_s1
        return from_s1, from_s2

    closest = min(
        (_hops_to_common(record) for record in records),
        key=lambda hops: (sum(hops), max(hops)))
    return max(closest)

In [12]:
def get_distance_from_root_node(driver, s1_id: str):
    """Return the number of ``IS_A`` hops between a synset and the taxonomy root.

    The root is the module-level ``root_node_id_str``; the distance is the
    shortest-path length computed by ``get_shortest_path_length``.

    Args:
        driver: An open Neo4j driver.
        s1_id: synsetID of the synset to measure.

    Returns:
        The hop count, or ``0`` when ``s1_id`` is the root itself.

    Raises:
        IndexError: If no path to the root is returned.
    """
    # The root is at distance 0 from itself; shortestPath is not usable with
    # identical start and end nodes (Neo4j rejects it by default).
    if s1_id == root_node_id_str:
        return 0
    # Reuse the shared helper rather than repeating the same query call here.
    return get_shortest_path_length(driver, s1_id, root_node_id_str)

In [13]:
def get_max_taxonomy_length(driver, root_node_id: str):
    """Return the length of the longest ``IS_A`` chain ending at ``root_node_id``.

    Runs ``max_length_taxonomy_query`` and counts the ``'IS_A'`` entries of the
    single path it returns.

    Args:
        driver: An open Neo4j driver; only ``execute_query`` is used.
        root_node_id: synsetID bound to the ``$root_node`` query parameter.

    Returns:
        Number of ``IS_A`` hops on the longest path found.

    Raises:
        IndexError: If the query returns no path for ``root_node_id``.
    """
    # Use the argument: the previous version ignored it and always queried the
    # module-level root. Also pin the database like the other helpers do.
    records = driver.execute_query(
        max_length_taxonomy_query,
        {'root_node': root_node_id},
        database_='neo4j',
        result_transformer_=neo4j.Result.data)
    return records[0]['path'].count('IS_A')

In [14]:
def wup_similarity(driver, s1_id: str, s2_id: str):
    """Return a Wu-Palmer-style score for two synsets.

    The score is ``get_lcs_depth(s1, s2)`` divided by the sum of the two
    synsets' distances from the root node.

    NOTE(review): the usual Wu-Palmer definition is
    ``2 * depth(LCS) / (depth(s1) + depth(s2))``; this implementation has no
    factor 2 and uses whatever ``get_lcs_depth`` returns - confirm the intended
    formula.

    Raises:
        ZeroDivisionError: If both root distances are 0.
    """
    numerator = get_lcs_depth(driver, s1_id, s2_id)
    root_dist_1 = get_distance_from_root_node(driver, s1_id)
    root_dist_2 = get_distance_from_root_node(driver, s2_id)
    denominator = root_dist_1 + root_dist_2
    return numerator / denominator

In [15]:
def lch_similarity(driver, s1_id: str, s2_id: str):
    """Return ``-log(p / (2 * D))`` for two synsets.

    ``p`` is the shortest-path length between the synsets and ``D`` is the
    maximum taxonomy length measured from ``root_node_id_str``.
    """
    max_depth = get_max_taxonomy_length(driver, root_node_id_str)
    hops = get_shortest_path_length(driver, s1_id, s2_id)
    ratio = hops / (2 * max_depth)
    return -np.log(ratio)

In [16]:
def path_similarity(driver, s1_id: str, s2_id: str):
    """Return the reciprocal of the shortest-path length between two synsets.

    Raises:
        ZeroDivisionError: If the shortest-path length is 0.
    """
    hops = get_shortest_path_length(driver, s1_id, s2_id)
    return 1 / hops

In [18]:
# Shared Neo4j driver used by the exploratory cells below; it is closed explicitly later on.
driver = GraphDatabase.driver(URI, auth=AUTH)

In [19]:
# Draw two random synset IDs (one query per ID) and display them.
id1_str, id2_str = get_random_synset_id(driver), get_random_synset_id(driver) 
id1_str, id2_str

('bn:03724276n', 'bn:00041965n')

In [20]:
# Resolve both IDs to BabelNet synsets through the BabelNet API and display them.
id1, id2 = BabelSynsetID(id1_str), BabelSynsetID(id2_str)
s1, s2 = id1.to_synset(), id2.to_synset()
s1, s2

(77436857__WIKI:EN:Country_changes_in_figure_skating, 39088034__grout#n#1)

In [21]:
# Show the main gloss of each synset to see what is being compared.
s1.main_gloss(), s2.main_gloss()

(Country changing is a phenomenon in the figure skating world in which skaters change the country they represent in competition.,
 A thin mortar that can be poured and used to fill cracks in masonry or brickwork)

In [22]:
# Wu-Palmer-style score for the sampled pair.
wup_similarity(driver, id1_str, id2_str)

0.5

In [23]:
# Leacock-Chodorow-style score for the sampled pair.
lch_similarity(driver, id1_str, id2_str)

1.0986122886681098

In [24]:
# Print the lemma of every synset on the shortest path between the sampled pair.
shortest_path_synsets_lemmas(driver, id1_str, id2_str)

Country_changes_in_figure_skating
phenomenon
process
physical_entity
matter
building_material
grout


In [52]:
# Compare many random synset pairs and log every similarity measure to a CSV file.
import csv  # needed only by this export cell

fname = utils.get_next_logfile_number('dist_comparisons', extension='.csv')
max_comparisons = 3
sep = ';'
# One entry per column; the comma after 'lemma2' matters, otherwise Python
# concatenates the adjacent literals into a single 'lemma2wup' header.
names = sep.join(('id1', 'id2', 'lemma1', 'lemma2', 'wup', 'lch', 'path'))
# UTF-8 because lemmas contain non-ASCII characters; newline='' is what the
# csv module expects from the file object it writes to.
with open(fname, 'w', newline='', encoding='utf-8') as f:
    # csv.writer stringifies the numeric scores and quotes any lemma that
    # contains the separator, so every row keeps exactly seven fields.
    writer = csv.writer(f, delimiter=sep, lineterminator='\n')
    f.write(names + '\n')
    for _ in range(max_comparisons):
        id1_str, id2_str = get_random_synset_id(driver), get_random_synset_id(driver)
        id1, id2 = BabelSynsetID(id1_str), BabelSynsetID(id2_str)
        s1, s2 = id1.to_synset(), id2.to_synset()
        writer.writerow((
            id1_str,
            id2_str,
            s1.main_sense().full_lemma,
            s2.main_sense().full_lemma,
            wup_similarity(driver, id1_str, id2_str),
            lch_similarity(driver, id1_str, id2_str),
            path_similarity(driver, id1_str, id2_str),
        ))

In [53]:
# Close the shared driver; cells after this point must open their own driver.
driver.close()

In [2]:
import pandas as pd

In [5]:
# Load previously logged comparisons (';'-separated). Malformed lines are
# skipped silently because of on_bad_lines='skip', so the row count may be
# lower than the number of lines in the file.
# NOTE(review): the path is hard-coded and may differ from the `fname` produced
# by the logging cell - verify it points at the intended log.
df = pd.read_csv('../log/dist_comparisons.csv', sep=';', on_bad_lines='skip')
df.head(5)

Unnamed: 0,id1,id2,lemma1,lemma2,wup,lch,path
0,bn:15653856n,bn:21384886n,Cibao_Marl,Levally_Lough,0.5,1.504077,0.25
1,bn:08864746n,bn:06947256n,Kaustifikace,Laguna_Madre_Sal,0.5,1.098612,0.166667
2,bn:00081683n,bn:00075601n,wreckage,Swiss,0.5,1.098612,0.166667
3,bn:00010453n,bn:00076232n,bilocation,tax_haven,0.166667,2.197225,0.5
4,bn:10842407n,bn:06160422n,Parque_Comandante_Jacques_Cousteau,Высшая_мера,0.571429,0.944462,0.142857


In [6]:
# Order the pairs by ascending wup, then descending lch and path, and renumber
# the rows 0..n-1 so positions and labels coincide.
df_sorted = df.sort_values(by=['wup', 'lch', 'path'], ascending=[True, False, False], ignore_index=True)

In [22]:
# Show a window of consecutive rows starting at a random position.
# np.random.randint excludes its upper bound, so passing the row count keeps
# `i` within the valid labels 0..n-1 (the former `+1` could select `n`, which
# is past the last row and produced an empty window).
i = np.random.randint(0, df_sorted.shape[0])
# .loc slicing is label-based and inclusive, so this shows up to 11 rows.
df_sorted.loc[i:i+10]

Unnamed: 0,id1,id2,lemma1,lemma2,wup,lch,path
86,bn:15195477n,bn:06840746n,Junkerberg_Formation,Parque_Raposo_Tavares,0.333333,1.504077,0.25
87,bn:16555208n,bn:01078384n,toxification,DIDS,0.333333,1.504077,0.25
88,bn:17735781n,bn:13860463n,"Sanctuary_of_the_Madonna_del_Piano,_Ausonia",Cheney_Longville_Formation,0.333333,1.504077,0.25
89,bn:11683676n,bn:02031295n,Haväng,Casselman_Formation,0.333333,1.504077,0.25
90,bn:16123437n,bn:07209802n,Railroad_Canyon_Beds_Formation,Gamlem,0.333333,1.504077,0.25
91,bn:06947206n,bn:00058580n,Laguna_Carrillo,oceanfront,0.333333,1.504077,0.25
92,bn:02161027n,bn:03084281n,Animal_suicide,Hazards_of_outdoor_activities,0.375,1.280934,0.2
93,bn:04879021n,bn:00064661n,Tournoi_de_Vannes,prohibition,0.375,1.280934,0.2
94,bn:01863551n,bn:01508021n,Executions_of_Cossacks_in_Lebedin,4'-Methoxy-α-pyrrolidinopropiophenone,0.375,1.280934,0.2
95,bn:00019762n,bn:16515760n,client,Capital_punishment_in_Luxembourg,0.375,1.098612,0.166667


In [96]:
# Inspect one logged pair: resolve both IDs of row `i` to BabelNet synsets and
# show their main glosses.
i = 51
selected_row = df_sorted.loc[i]
s1 = BabelSynsetID(selected_row['id1']).to_synset()
s2 = BabelSynsetID(selected_row['id2']).to_synset()
s1.main_gloss(), s2.main_gloss()

(Someone who gratifies physical appetites (especially for food and drink) with more than the usual freedom,
 Nauruans are a nation and an ethnic group indigenous to the Pacific island country of Nauru.)

In [97]:
# Print the lemmas along the shortest path between the two inspected synsets.
# The shared `driver` has already been closed by `driver.close()` at this point
# of the notebook, so open a short-lived driver for this lookup.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    shortest_path_synsets_lemmas(driver, str(s1.id), str(s2.id))

free-liver
free_agent
person
Nauruans


In [8]:
# It looks like most of the nodes are some sort of geological formation: is that true?
# Measure the share of Synset nodes that reach the given ancestor through IS_A.
# count(DISTINCT n) is required because the variable-length pattern yields one
# row per path, so a node with several routes to the ancestor would otherwise
# be counted more than once.
number_of_nodes_with_ancestor_query = "MATCH (n:Synset)-[:IS_A*]->(ancestor:Synset {synsetID: $ancestorID}) RETURN count(DISTINCT n) AS numDescendants"
ancestor_id = 'bn:00035942n'

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    n = driver.execute_query(
        number_of_nodes_with_ancestor_query,
        {'ancestorID': ancestor_id},
        database_='neo4j',
        result_transformer_=neo4j.Result.data)[0]['numDescendants']
    # count_nodes_query aliases its result as `numNodes`, so read that key.
    tot_nodes = driver.execute_query(
        count_nodes_query,
        database_='neo4j',
        result_transformer_=neo4j.Result.data)[0]['numNodes']
print(f'{n} nodes with ancestor {ancestor_id} of {tot_nodes} nodes, {round(n*100/tot_nodes, 2)}%')

5864 nodes with ancestor bn:00035942n of 16367 nodes, 35.83%
