In [255]:
import math
import pickle
from statistics import mean

import networkx as nx
import numpy as np
import plwn
from networkx.algorithms.shortest_paths.generic import shortest_path
from networkx.algorithms.simple_paths import all_simple_paths

In [2]:
# Handle returned by plwn's default loader; most_similar() uses it to look up
# synsets by numeric id and read their lexical units.
# NOTE(review): presumably the plWordNet (Polish WordNet) database — confirm.
wn = plwn.load_default()

In [171]:
def save_obj(obj, name):
    """Pickle `obj` into ``wordnet/<name>.pkl``.

    Args:
        obj: Any picklable object.
        name: File stem (string); the ``wordnet/`` prefix and ``.pkl``
            suffix are added here.

    The ``wordnet`` directory must already exist; the file is overwritten
    if present. The highest available pickle protocol is used.
    """
    target = 'wordnet/' + name + '.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load and return the object pickled in ``wordnet/<name>.pkl``.

    Args:
        name: File stem (string); the ``wordnet/`` prefix and ``.pkl``
            suffix are added here.

    Returns:
        Whatever object the pickle file contains.

    Note: unpickling executes code embedded in the file, so only use this
    on files you produced yourself.
    """
    source = 'wordnet/' + name + '.pkl'
    with open(source, 'rb') as handle:
        payload = pickle.load(handle)
    return payload

def build_synsets_tree():
    """Build the synset graph and store it in ``wordnet/wn_tree.gpickle``.

    Edges are read from ``wordnet/edges_synsets.npy`` and inserted into a
    directed graph. Every node that has no incoming edge is then attached
    to a synthetic ``'root'`` node, so all such nodes share one ancestor.
    Nothing is returned; the graph is only written to disk.
    """
    tree = nx.DiGraph()
    tree.add_edges_from(np.load('wordnet/edges_synsets.npy'))

    # Collect the parentless nodes *before* adding 'root', otherwise 'root'
    # itself (which has no parent either) would be linked to itself.
    top_level = [node for node, indegree in tree.in_degree() if indegree == 0]
    tree.add_edges_from(('root', node) for node in top_level)

    # NOTE(review): write_gpickle was removed in NetworkX 3.0 — confirm the
    # pinned NetworkX version before re-running this cell.
    nx.write_gpickle(tree, "wordnet/wn_tree.gpickle")

In [175]:
# l2s_map is indexed by lemma; each value is iterated as a collection of
# synset ids that are nodes of `g` (see count_distance / most_similar).
l2s_map = load_obj('lemma_synset_mapping')
# Graph read back from the same path build_synsets_tree() writes to.
g = nx.read_gpickle("wordnet/wn_tree.gpickle")

In [261]:
def count_depths():
    """Compute the maximum and average node depth of the synset graph `g`.

    A node's depth is the number of nodes on its shortest path from
    ``'root'``, counting both endpoints (so ``'root'`` itself has depth 1).

    Returns:
        tuple: ``(max_depth, avg_depth)`` where ``avg_depth`` is rounded to
        two decimals.

    Raises:
        networkx.NetworkXNoPath: if some node of `g` cannot be reached from
            ``'root'``.
    """
    # A single BFS from 'root' yields every node's distance in edges, which
    # avoids running a separate shortest-path search per node.
    hops = nx.single_source_shortest_path_length(g, 'root')
    if len(hops) != g.number_of_nodes():
        raise nx.NetworkXNoPath("some nodes are not reachable from 'root'")

    # +1 converts an edge count into a node count (path length incl. 'root').
    node_depths = [edge_count + 1 for edge_count in hops.values()]
    max_depth = max(node_depths) # 28
    avg_depth = round(mean(node_depths), 2) # 5.88
    return max_depth, avg_depth
    
# Hard-coded copies of the figures noted next to max_depth / avg_depth in
# count_depths(); they must be updated by hand if the graph changes.
# Used as the normalisers in the two wordnet_similarity_measure functions.
MAX_DEPTH = 28
AVG_DEPTH = 5.88

In [321]:
def paths_distance(x, y):
    """Return 1 plus the number of nodes that occur in exactly one path.

    Args:
        x: Iterable of hashable nodes (a path).
        y: Iterable of hashable nodes (a path).

    Returns:
        int: size of the symmetric difference of the two node sets, plus 1.
        Identical node sets therefore give 1. Order and repetition of nodes
        within a path are ignored.
    """
    unshared = set(x) ^ set(y)
    return len(unshared) + 1
    
def count_distance(first, second):
    """Return the smallest path-based distance between two lemmas.

    If the lemmas share at least one synset the distance is 1. Otherwise
    every simple path from ``'root'`` to each synset of `first` is compared
    with every simple path from ``'root'`` to each synset of `second` using
    paths_distance(), and the minimum is returned.

    Args:
        first: Lemma used to index `l2s_map`.
        second: Lemma used to index `l2s_map`.

    Raises:
        ValueError: from ``min`` when either lemma yields no root path.
    """
    if check_if_common_synset(first, second):
        return 1

    def root_paths(lemma):
        # Gather every simple path root -> synset, over all synsets of lemma.
        collected = []
        for synset_id in l2s_map[lemma]:
            collected += all_simple_paths(g, 'root', synset_id)
        return collected

    paths_a = root_paths(first)
    paths_b = root_paths(second)
    return min(paths_distance(pa, pb) for pa in paths_a for pb in paths_b)

def check_if_common_synset(first, second):
    """Return True when the two lemmas have at least one synset id in common.

    Both arguments are used as keys into `l2s_map`.
    """
    own_synsets = set(l2s_map[first])
    return not own_synsets.isdisjoint(l2s_map[second])

def wordnet_similarity_measure(first, second):
    """Similarity of two lemmas, normalised by the maximum graph depth.

    Computes ``-log(distance / (2 * MAX_DEPTH))`` where ``distance`` comes
    from count_distance(). Smaller distances give larger scores.
    NOTE(review): the formula resembles Leacock-Chodorow similarity —
    confirm that this is the intended measure.

    Args:
        first: Lemma present in `l2s_map`.
        second: Lemma present in `l2s_map`.

    Returns:
        float: the similarity score (natural logarithm).
    """
    distance = count_distance(first, second)
    # Python 3 "/" is true division, so no float coercion is required.
    return -math.log(distance / (2 * MAX_DEPTH))

def wordnet_similarity_measure2(first, second):
    """Similarity of two lemmas, normalised by the average graph depth.

    Computes ``-log(distance / (2 * AVG_DEPTH))`` where ``distance`` comes
    from count_distance(). Because distances can exceed ``2 * AVG_DEPTH``
    the raw value may be negative; such values are clamped to zero.

    Args:
        first: Lemma present in `l2s_map`.
        second: Lemma present in `l2s_map`.

    Returns:
        float: the non-negative similarity score (natural logarithm).
    """
    distance = count_distance(first, second)
    measure = -math.log(distance / (2 * AVG_DEPTH))
    # Clamp with a float literal so the return type is always float.
    return max(measure, 0.0)

In [322]:
# Compare the two normalisations (MAX_DEPTH vs. AVG_DEPTH) on one word pair.
print(wordnet_similarity_measure("lodówka", "lodowisko"))
print(wordnet_similarity_measure2("lodówka", "lodowisko"))

1.9459101490553135
0.38526240079064494


In [300]:
def most_similar(word, k):
    """Return up to `k` distinct lemmas related to `word`.

    Lemmas that share a synset with `word` come first. If fewer than `k`
    are found, the list is topped up with lemmas from the parent synsets
    (graph predecessors) of the word's synsets. `word` itself is never
    returned and no lemma appears twice.

    Args:
        word: Lemma used to index `l2s_map`.
        k: Maximum number of lemmas to return.

    Returns:
        list: at most `k` lemmas, in first-seen order (synset order from
        `l2s_map`, then lexical-unit order within each synset).
    """
    synset_ids = l2s_map[word]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # [:k] cut is reproducible; iteration order of a set of strings changes
    # between interpreter runs.
    same_synset = dict.fromkeys(
        lex_unit.lemma
        for s_id in synset_ids
        for lex_unit in wn.synset_by_id(int(s_id)).lexical_units
    )
    same_synset.pop(word, None)
    result = list(same_synset)[:k]
    if len(result) >= k:
        return result

    taken = set(result)
    taken.add(word)
    for s_id in synset_ids:
        for parent_id in g.predecessors(s_id):
            # The synthetic 'root' node is not a real synset: it has no
            # numeric id and int('root') would raise ValueError.
            if parent_id == 'root':
                continue
            for lex_unit in wn.synset_by_id(int(parent_id)).lexical_units:
                lemma = lex_unit.lemma
                if lemma in taken:
                    continue
                taken.add(lemma)
                result.append(lemma)
                if len(result) == k:
                    return result
    return result

In [303]:
# Example query: up to three lemmas related to 'lodówka'.
most_similar('lodówka', 3)

['lodownia', 'chłodziarka', 'urządzenie kuchenne']