In [1]:
import plwn
import pickle
import math

import numpy as np
import networkx as nx
import pandas as pd

from networkx.algorithms.shortest_paths.generic import shortest_path
from statistics import mean
from datetime import datetime

In [2]:
# Notebook-wide handle returned by plwn.load_default(); later cells query it
# through wn.synsets() and wn.synset_by_id().
wn = plwn.load_default()

In [3]:
def save_obj(obj, name ):
    """Pickle ``obj`` into ``wordnet/<name>.pkl`` with the highest pickle protocol.

    The ``wordnet/`` directory must already exist; it is not created here.
    """
    target_path = 'wordnet/' + name + '.pkl'
    with open(target_path, 'wb') as out_file:
        pickle.dump(obj, out_file, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """Read back the object pickled in ``wordnet/<name>.pkl``.

    NOTE(review): unpickling can execute arbitrary code, so only use this on
    files produced locally (e.g. by ``save_obj``), never on untrusted input.
    """
    source_path = 'wordnet/' + name + '.pkl'
    with open(source_path, 'rb') as in_file:
        loaded = pickle.load(in_file)
    return loaded

def build_synsets_tree():
    """Build the synset graph under one artificial ``'root'`` node and save it.

    Steps:
      1. Load the edge list from ``wordnet/edges_synsets.npy`` into a DiGraph.
      2. Connect ``'root'`` to every node that has no predecessor.
      3. Connect ``'root'`` to every synset id reported by ``wn.synsets()``
         that appears in no edge at all, so every synset is reachable.
      4. Pickle the graph to ``wordnet/wn_tree.gpickle``.

    Returns nothing; the only result is the file written in step 4.
    """
    s_edges = np.load('wordnet/edges_synsets.npy')
    g = nx.DiGraph()
    g.add_edges_from(s_edges)

    # Collect the roots before adding any 'root' edge so the list is not
    # affected by the edges inserted below.
    roots = [n for n, in_deg in g.in_degree() if in_deg == 0]
    g.add_edges_from(('root', n) for n in roots)

    # Synsets that take part in no relation would otherwise be missing
    # from the graph entirely.
    edges_flatten = {node for edge in s_edges for node in edge}
    s_ids = {s.id for s in wn.synsets()}
    free_nodes = s_ids - edges_flatten
    g.add_edges_from(('root', n) for n in free_nodes)

    # nx.write_gpickle was removed in NetworkX 3.0. It was a thin wrapper
    # around pickle.dump, so this file stays readable by old read_gpickle too.
    with open("wordnet/wn_tree.gpickle", 'wb') as f:
        pickle.dump(g, f, pickle.HIGHEST_PROTOCOL)

In [4]:
# Word -> synset ids lookup, read from wordnet/lemma_synset_mapping.pkl.
l2s_map = load_obj('lemma_synset_mapping')

# nx.read_gpickle was removed in NetworkX 3.0. A .gpickle file is an ordinary
# pickle, so pickle.load reads graphs written by the old helper as well.
# NOTE(review): only load graph files produced locally; unpickling is unsafe
# on untrusted data.
with open("wordnet/wn_tree.gpickle", 'rb') as _tree_file:
    g = pickle.load(_tree_file)

In [5]:
def count_depths():
    """Measure how deep the nodes of the module-level graph ``g`` sit below 'root'.

    The depth of a node is the number of nodes on the shortest path from
    ``'root'`` to it, both endpoints included, so ``'root'`` itself has
    depth 1.

    Returns:
        tuple: ``(max_depth, avg_depth)``; the average is rounded to two
        decimals.

    Note: only nodes reachable from ``'root'`` are measured.
    """
    # With only a source given, shortest_path returns {target: path} for every
    # reachable node in one traversal instead of one search per node.
    paths_from_root = shortest_path(g, 'root')
    node_depths = [len(path) for path in paths_from_root.values()]
    max_depth = max(node_depths)
    avg_depth = round(mean(node_depths), 2)
    return max_depth, avg_depth


# Values previously obtained from count_depths() (max and average depth),
# hard-coded so the graph does not have to be re-scanned on every run.
MAX_DEPTH = 28
AVG_DEPTH = 5.88

In [6]:
def count_distance(first, second):
    """Distance between two words measured along predecessor chains in ``g``.

    Words that share a synset are at distance 1. Otherwise one chain is
    started for every (synset, direct predecessor) pair of each word, and all
    chains are grown one predecessor step per round. In the first round where
    a chain of ``first`` and a chain of ``second`` share a node, the result is
    the smallest ``index_in_first + index_in_second + 1`` over all shared
    nodes of all chain pairs.

    Args:
        first, second: words that are keys of ``l2s_map``.

    Returns:
        int: the distance (always >= 1).

    Raises:
        KeyError: if a word is missing from ``l2s_map``.
        ValueError: if no chain can be extended any more and the two words
            still have no node in common (previously this looped forever).
    """
    if check_if_common_synset(first, second):
        return 1

    def init_branches(word):
        # One two-node chain [synset, predecessor] per synset/predecessor pair.
        return [[node, parent] for node in l2s_map[word] for parent in g.predecessors(node)]

    def expand_branches(branches):
        # Extend every chain by one step. A chain forks when its last node has
        # several predecessors and is kept as-is when it has none.
        grown = False
        new_branches = []
        for branch in branches:
            parents = list(g.predecessors(branch[-1]))
            if not parents:
                new_branches.append(branch)
                continue
            grown = True
            new_branches.extend(branch + [parent] for parent in parents)
        return new_branches, grown

    def first_positions(path):
        # node -> index of its first occurrence (same as list.index).
        positions = {}
        for index, node in enumerate(path):
            positions.setdefault(node, index)
        return positions

    def get_distance(fst_pos, snd_pos):
        # Use the shared node with the smallest index sum. Taking an arbitrary
        # element of the intersection made the result depend on set order.
        shared = fst_pos.keys() & snd_pos.keys()
        if not shared:
            return None
        return min(fst_pos[node] + snd_pos[node] for node in shared) + 1

    fst_branches = init_branches(first)
    snd_branches = init_branches(second)

    while True:
        fst_branches, fst_grown = expand_branches(fst_branches)
        snd_branches, snd_grown = expand_branches(snd_branches)

        # Index each chain once per round rather than once per chain pair.
        fst_positions = [first_positions(branch) for branch in fst_branches]
        snd_positions = [first_positions(branch) for branch in snd_branches]

        distances = []
        for fst_pos in fst_positions:
            for snd_pos in snd_positions:
                distance = get_distance(fst_pos, snd_pos)
                if distance is not None:
                    distances.append(distance)
        if distances:
            return min(distances)

        if not (fst_grown or snd_grown):
            # Nothing changed this round, so further rounds cannot help.
            raise ValueError(
                'No common node found for %r and %r' % (first, second))
        
def check_if_common_synset(first, second):
    """Tell whether the two words share at least one synset id in ``l2s_map``."""
    first_synsets = set(l2s_map[first])
    second_synsets = set(l2s_map[second])
    return bool(first_synsets & second_synsets)

def words_exists(first, second):
    """Check that both words are keys of ``l2s_map``.

    Prints a message for the first missing word (checking ``first`` before
    ``second``) and returns False; returns True when both are present.
    """
    for word in (first, second):
        if word not in l2s_map:
            print('Word ', word, 'not exist!')
            return False
    return True

def wordnet_similarity_measure(first, second):
    """Similarity of two words as ``-log(distance / (2 * MAX_DEPTH))``.

    ``distance`` comes from ``count_distance``. The result is rounded to two
    decimals; 0 is returned when either word is unknown to ``words_exists``.
    """
    if words_exists(first, second):
        ratio = 1.0 * count_distance(first, second) / (2 * MAX_DEPTH)
        return round(-math.log(ratio), 2)
    return 0

def wordnet_similarity_measure2(first, second):
    """Similarity of two words as ``-log(distance / (2 * AVG_DEPTH))``, floored at 0.

    ``distance`` comes from ``count_distance``. Negative scores (distance
    larger than twice the average depth) are replaced by 0, and the result is
    rounded to two decimals. 0 is returned when either word is unknown to
    ``words_exists``.
    """
    if words_exists(first, second):
        ratio = 1.0 * count_distance(first, second) / (2 * AVG_DEPTH)
        score = max(-math.log(ratio), 0)
        return round(score, 2)
    return 0

In [7]:
# Spot check on one word pair: first the MAX_DEPTH-based measure, then the
# AVG_DEPTH-based one.
print(wordnet_similarity_measure("lodówka", "lodowisko"))
print(wordnet_similarity_measure2("lodówka", "lodowisko"))

1.95
0.39


In [8]:
def most_similar(word, k):
    """Return up to ``k`` lemmas related to ``word``.

    Lemmas from the word's own synsets come first. If there are fewer than
    ``k`` of them, lemmas from the direct predecessors of those synsets in
    ``g`` are appended. The result contains no duplicates, never contains
    ``word`` itself, and is in first-seen order, so repeated calls return the
    same list.

    Args:
        word: a key of ``l2s_map``.
        k: maximum number of lemmas to return; ``k <= 0`` yields ``[]``.

    Returns:
        list: at most ``k`` lemmas.
    """
    if k <= 0:
        return []

    seen = {word}
    result = []

    def collect(synset_id):
        # Append every not-yet-seen lemma of the synset, keeping order.
        for lex_unit in wn.synset_by_id(int(synset_id)).lexical_units:
            lemma = lex_unit.lemma
            if lemma not in seen:
                seen.add(lemma)
                result.append(lemma)

    for s_id in l2s_map[word]:
        collect(s_id)

    if len(result) < k:
        visited_parents = set()
        for s_id in l2s_map[word]:
            for parent in g.predecessors(s_id):
                # The artificial 'root' node is not a synset; int('root')
                # would raise, so it is skipped.
                if isinstance(parent, str) and parent == 'root':
                    continue
                if parent in visited_parents:
                    continue
                visited_parents.add(parent)
                collect(parent)

    return result[:k]

In [9]:
# Example query: ask for up to 3 lemmas related to 'lodówka'.
most_similar('lodówka', 3)

['chłodziarka', 'lodownia', 'urządzenie kuchenne']

In [10]:
def run_test():
    """Score every word pair from ``test_data.npy`` with both measures.

    The first two fields of each row are taken as the word pair. A row
    ``[word_1, word_2, measure_1, measure_2]`` is collected per pair and the
    whole list is saved with ``np.save`` under the name ``test_wordnet``.
    """
    pairs = np.load('test_data.npy')
    scored_rows = []
    for row in pairs:
        word_1, word_2 = row[0], row[1]
        score_max_depth = wordnet_similarity_measure(word_1, word_2)
        score_avg_depth = wordnet_similarity_measure2(word_1, word_2)
        scored_rows.append([word_1, word_2, score_max_depth, score_avg_depth])
    np.save('test_wordnet', scored_rows)

In [11]:
# Score all pairs in test_data.npy and write test_wordnet.npy. Each unknown
# word is reported twice because both measures call words_exists for the pair.
run_test()

Word  obładowany not exist!
Word  obładowany not exist!
Word  niedawny not exist!
Word  niedawny not exist!
Word  ubrania not exist!
Word  ubrania not exist!
Word  nasiona not exist!
Word  nasiona not exist!
Word  mężczyźni not exist!
Word  mężczyźni not exist!
Word  meble not exist!
Word  meble not exist!
Word  spowiedż not exist!
Word  spowiedż not exist!
Word  wyobrażać not exist!
Word  wyobrażać not exist!
Word  pojawiać not exist!
Word  pojawiać not exist!
Word  modlić not exist!
Word  modlić not exist!


In [12]:
def test_corr():
    test_data = np.load('test_data.npy')
    df_test = pd.DataFrame(test_data, columns = ['word_1', 'word_2', 'simi_base'])
    df_test['simi_base'] = df_test['simi_base'].astype(float)
    
    my_results = np.load('test_wordnet.npy')
    df_my_results = pd.DataFrame(my_results, columns = ['word_1', 'word_2', 'simi_1', 'simi_2'])
    df_my_results['simi_1'] = df_my_results['simi_1'].astype(float)
    df_my_results['simi_2'] = df_my_results['simi_2'].astype(float)
    
    merge = pd.concat([df_test, df_my_results], axis=1, join='inner')[['simi_base', 'simi_1', 'simi_2']]
    return merge.corr()

In [13]:
# Correlation matrix between simi_base (from test_data.npy) and the two
# computed measures simi_1 / simi_2.
test_corr()

Unnamed: 0,simi_base,simi_1,simi_2
simi_base,1.0,0.449342,0.48493
simi_1,0.449342,1.0,0.96381
simi_2,0.48493,0.96381,1.0
