In [33]:
import os
import sys
sys.path.append('..')
from utils import utils
import numpy as np

import neo4j
from neo4j import GraphDatabase

In [34]:
from babelnet import BabelSynsetID

2023-09-19 15:27:02,877 [babelnet.conf] INFO: Loaded configuration from ['/home/giovanni/unimore/2_graph_analytics/tesina/babeldist/remote/babelnet_conf.yml']
2023-09-19 15:27:02,888 [babelnet.api] INFO: BabelNet Remote Procedure Call API v1.1.0


In [35]:
URI = "bolt://localhost:7687"
AUTH = ("giovanni", "BabeldistGraph")

In [107]:
random_node_query = " MATCH (a:Synset) RETURN a.synsetID as synsetID, rand() as r ORDER BY r LIMIT 1"

count_nodes_query = " MATCH (s:Synset) RETURN count(s) as numNodes"
count_edges_query = " MATCH ()-[r:IS_A]->() RETURN count(r) as numEdges"

shortestPath_query = """
MATCH (s1:Synset {synsetID: $synsetID_1})
MATCH (s2:Synset {synsetID: $synsetID_2})
MATCH p = shortestPath((s1)-[:IS_A*..12]-(s2))
RETURN p as shortestPath, length(p) as length_shortestPath """

wup_query = """
MATCH (s1:Synset {synsetID: $synsetID_1})
MATCH (s2:Synset {synsetID: $synsetID_2})
MATCH p = (s1)-[:IS_A*..]->(common_node:Synset)<-[:IS_A*..12]-(s2) 
WITH s1, s2, common_node, length(p) AS len_p ORDER BY len_p ASC LIMIT 1
MATCH (root:Synset {synsetID: $root_id})
MATCH sp_cn=shortestPath((common_node)-[:IS_A*1..12]-(root))
WITH s1, s2, root, length(sp_cn) AS LCS_depth
MATCH s1_sp = shortestPath((s1)-[:IS_A*1..12]-(root))
MATCH s2_sp = shortestPath((s2)-[:IS_A*1..12]-(root))
WITH length(s1_sp) AS dist_s1_root, length(s2_sp) AS dist_s2_root, LCS_depth
RETURN (toFloat(LCS_depth) / (dist_s1_root + dist_s2_root)) AS wup_similarity """

lch_query = """
MATCH t = (root:Synset {synsetID: $root_id})<-[:IS_A*1..12]-(child:Synset) 
WHERE NOT (child)<-[:IS_A]-()
WITH length(t) AS D ORDER BY D DESC LIMIT 1
MATCH (s1:Synset {synsetID: $synsetID_1})
MATCH (s2:Synset {synsetID: $synsetID_2})
MATCH shortest_path = shortestPath((s1)-[:IS_A*..12]-(s2))
WITH D, length(shortest_path) as length_sp
RETURN -log(toFloat(length_sp) / (2*D)) as lch_similarity, D as taxonomy_length, length_sp """

path_query = """
MATCH (s1:Synset {synsetID: $synsetID_1})
MATCH (s2:Synset {synsetID: $synsetID_2})
MATCH p = shortestPath((s1)-[:IS_A*..12]-(s2))
RETURN 1.0/length(shortest_path) as path_similarity """

In [37]:
root_node_id_str = 'bn:00062164n' # physical entity

In [38]:
def get_random_synset_id(driver):
    return driver.execute_query(random_node_query, result_transformer_=neo4j.Result.data)[0]['synsetID']

In [80]:
def get_shortest_path(driver, s1_id: str, s2_id: str):
    return driver.execute_query(
        shortestPath_query, 
        {'synsetID_1': s1_id,
         'synsetID_2': s2_id},
        database_='neo4j',
        result_transformer_=neo4j.Result.data)[0]['shortestPath']

In [92]:
def get_shortest_path_length(driver, s1_id: str, s2_id: str):
    p = driver.execute_query(
        shortestPath_query, 
        {'synsetID_1': s1_id,
         'synsetID_2': s2_id},
        database_='neo4j',
        result_transformer_=neo4j.Result.data)
    return p[0]['length_shortestPath']

In [41]:
def shortest_path_synsets_lemmas(driver, s1_id: str, s2_id: str):
    shortest_path = get_shortest_path(driver, s1_id, s2_id)[0]['shortestPath']
    for item in shortest_path:
        if type(item) is dict and 'synsetID' in item.keys():
            try:
                print(BabelSynsetID(item['synsetID']).to_synset().main_sense().full_lemma)
            except Exception as e:
                print(e.args[0])

In [112]:
def wup_similarity(driver, s1_id: str, s2_id: str):
    return driver.execute_query(
        wup_query,
        {'root_id': root_node_id_str,
         'synsetID_1': s1_id,
         'synsetID_2': s2_id},
        database_='neo4j',
        result_transformer_=neo4j.Result.data)[0]['wup_similarity']

In [109]:
def lch_similarity(driver, s1_id: str, s2_id: str):
    return driver.execute_query(
        lch_query,
        {'root_id': root_node_id_str,
         'synsetID_1': s1_id,
         'synsetID_2': s2_id},
        database_='neo4j',
        result_transformer_=neo4j.Result.data)[0]['lch_similarity']

In [110]:
def path_similarity(driver, s1_id: str, s2_id: str):
    return driver.execute_query(
        path_query,
        {'synsetID_1': s1_id,
         'synsetID_2': s2_id},
        database_='neo4j',
        result_transformer_=neo4j.Result.data)[0]['path_similarity']

In [48]:
driver = GraphDatabase.driver(URI, auth=AUTH)

In [49]:
id1_str, id2_str = get_random_synset_id(driver), get_random_synset_id(driver) 
id1_str, id2_str

('bn:19386169n', 'bn:13817527n')

In [50]:
id1, id2 = BabelSynsetID(id1_str), BabelSynsetID(id2_str)
s1, s2 = id1.to_synset(), id2.to_synset()
s1, s2

(476027474__WIKIDATA:EN:Tiouârdiouîne, 48899778__WIKI:ET:Anniallikas)

In [51]:
s1.main_gloss(), s2.main_gloss()

(ينبوع في الجزائر, ينبوع في إستونيا)

In [113]:
wup_similarity(driver, id1_str, id2_str)

0.375

In [103]:
lch_similarity(driver, id1_str, id2_str)

(2.1972245773362196, 2.1972245773362196)

In [None]:
shortest_path_synsets_lemmas(driver, id1_str, id2_str)

Lake_Canopus
body_of_water
thing
physical_entity
causal_agent
cause_of_death
airstrike
Attack_on_Broome


In [None]:
# Try to make a great quantity of comparisons
fname = utils.get_next_logfile_number('dist_comparisons', extension='.csv')
max_comparisons = 3
sep = ';'
names = sep.join(('id1', 'id2', 'lemma1', 'lemma2' 'wup', 'lch', 'path'))
with open(fname, 'w') as f:
    f.write(names + '\n')
    for _ in range(max_comparisons):
        id1_str, id2_str = get_random_synset_id(driver), get_random_synset_id(driver) 
        id1, id2 = BabelSynsetID(id1_str), BabelSynsetID(id2_str)
        s1, s2 = id1.to_synset(), id2.to_synset()
        record = sep.join((id1_str, id2_str, s1.main_sense().full_lemma, s2.main_sense().full_lemma),
                          wup_similarity(driver, id1_str, id2_str), lch_similarity(driver, id1_str, id2_str), path_similarity(driver, id1_str, id2_str))
        f.write(record + '\n')

In [None]:
driver.close()

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('../log/dist_comparisons.csv', sep=';', on_bad_lines='skip')
df.head(5)

Unnamed: 0,id1,id2,lemma1,lemma2,wup,lch,path
0,bn:15653856n,bn:21384886n,Cibao_Marl,Levally_Lough,0.5,1.504077,0.25
1,bn:08864746n,bn:06947256n,Kaustifikace,Laguna_Madre_Sal,0.5,1.098612,0.166667
2,bn:00081683n,bn:00075601n,wreckage,Swiss,0.5,1.098612,0.166667
3,bn:00010453n,bn:00076232n,bilocation,tax_haven,0.166667,2.197225,0.5
4,bn:10842407n,bn:06160422n,Parque_Comandante_Jacques_Cousteau,Высшая_мера,0.571429,0.944462,0.142857


In [None]:
df_sorted = df.sort_values(by=['wup', 'lch', 'path'], ascending=[True, False, False], ignore_index=True)

In [None]:
i = np.random.randint(0, df_sorted.shape[0])
df_sorted.loc[i:i+10]
df_sorted.loc[5:25]

Unnamed: 0,id1,id2,lemma1,lemma2,wup,lch,path
5,bn:16078159n,bn:02927037n,Sheep_Pen_Sandstone,Muav_Limestone,0.166667,2.197225,0.5
6,bn:16181756n,bn:14962530n,Tulare_Formation,Kiltorcan_Formation,0.166667,2.197225,0.5
7,bn:16123273n,bn:14967161n,White_Knob_Formation,Lärchberg_Formation,0.166667,2.197225,0.5
8,bn:16067329n,bn:02452568n,Wiota_Gravels,Red_Crag_Formation,0.166667,2.197225,0.5
9,bn:15082001n,bn:16063583n,Dessa_Dawn_Formation,Deep_River_Formation,0.166667,2.197225,0.5
10,bn:15362748n,bn:16415624n,Falun_de_Pierrefitte,Hindsville_Limestone,0.166667,2.197225,0.5
11,bn:16721865n,bn:00013475n,Absolvent,brunet,0.166667,2.197225,0.5
12,bn:03284816n,bn:06947236n,Prime_meridian,Laguna_Sábalos,0.166667,2.197225,0.5
13,bn:01353217n,bn:22621880n,Kezilesu_Group,Whitehill_Formation,0.166667,2.197225,0.5
14,bn:16670890n,bn:22328757n,Skrinkle_Sandstones_Group,Bokkeveld_Group,0.166667,2.197225,0.5


In [None]:
df_sorted[df_sorted['wup'] > 0.714]

Unnamed: 0,id1,id2,lemma1,lemma2,wup,lch,path
294,bn:06464054n,bn:00015773n,Defenestracja_wrocławska,captor,0.714286,1.280934,0.2
295,bn:06464054n,bn:03556222n,Defenestracja_wrocławska,Barbadians,0.714286,1.280934,0.2
296,bn:00018595n,bn:16391447n,chlorine_water,Clyde_Formation,0.714286,0.944462,0.142857
297,bn:05342818n,bn:17710678n,Reichenhall_Formation,Murder_of_Anastasiya_Meshcheryakova,0.714286,0.944462,0.142857
298,bn:00007311n,bn:02770575n,auto-da-fe,Sky_News_Arabia,0.714286,0.944462,0.142857
299,bn:17244499n,bn:17514528n,Copper_zinc_antimony_sulfide,"Santissima_Annunziata,_Circello",0.833333,1.098612,0.166667


In [None]:
df['wup'].unique()

array([0.5       , 0.16666667, 0.57142857, 0.33333333, 0.28571429,
       0.83333333, 0.71428571, 0.42857143, 0.375     , 0.4       ,
       0.6       , 0.25      ])

In [None]:
i = 51
s1, s2 = df_sorted.loc[i]['id1'], df_sorted.loc[i]['id2']
s1, s2 = BabelSynsetID(s1).to_synset(), BabelSynsetID(s2).to_synset()
s1.main_gloss(), s2.main_gloss()

(Someone who gratifies physical appetites (especially for food and drink) with more than the usual freedom,
 Nauruans are a nation and an ethnic group indigenous to the Pacific island country of Nauru.)

In [None]:
shortest_path_synsets_lemmas(driver, str(s1.id), str(s2.id))

free-liver
free_agent
person
Nauruans


In [None]:
# It looks like most of nodes are some sort of geological formation, is that true?
number_of_nodes_with_ancestor_query = "MATCH (n:Synset)-[:IS_A*]->(ancestor:Synset {synsetID: $ancestorID}) RETURN count(n)"

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    n = driver.execute_query(number_of_nodes_with_ancestor_query, 
                         {'ancestorID': 'bn:00035942n'},
                         result_transformer_=neo4j.Result.data)[0]['count(n)']
    tot_nodes = driver.execute_query(count_nodes_query,                          
                         result_transformer_=neo4j.Result.data)[0]['count(s)']
print(f'{n} nodes with ancestor bn:00035942n of {tot_nodes} nodes, {round(n*100/tot_nodes, 2)}%')

5864 nodes with ancestor bn:00035942n of 16367 nodes, 35.83%
