In [1]:
from dictionary_graph import DictionaryGraph
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import nltk

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/marko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
graph = DictionaryGraph('OPTED', 'a-z')
print('Broj vrhova: {}\nBroj bridova: {}'.format(*graph.size()))

Broj vrhova: 106282
Broj bridova: 1315090


In [3]:
def similarity_limit(A, B, iterations=50):
    Z = np.full((B.shape[0], A.shape[0]), 1)
    for i in range(2*iterations): # Paran broj iteracija
        Z = B @ Z @ A.T + B.T @ Z @ A
        Z /= np.linalg.norm(Z, 'fro')
    return Z

In [4]:
def get_synonyms(word, n=10, **kwargs):
    vertices, adjacency_matrix = graph.adjacency_matrix(word, **kwargs)
    S = pd.DataFrame(similarity_limit(
        np.array([
            [0, 1, 0],
            [0, 0, 1],
            [0, 0, 0],
        ], dtype=np.float64),
        adjacency_matrix,
    ), index=vertices)
    return S[1].nlargest(n)

def compare_methods(word, n=10):
    df = pd.DataFrame(index=range(n))
    wordnet = [word] + list(set(j.name().replace('_', ' ') for i in nltk.corpus.wordnet.synsets(word) for j in i.lemmas()) - {word})
    df['WordNet'] = pd.Series(wordnet).reindex(df.index)
    df['0'] = get_synonyms(word, n, method=0).reset_index()['value'].reindex(df.index)
    df['1'] = get_synonyms(word, n, method=1).reset_index()['value'].reindex(df.index)
    df['2'] = get_synonyms(word, n, method=2).reset_index()['value'].reindex(df.index)
    df['2, sqrt'] = get_synonyms(word, n, method=2, f=np.sqrt).reset_index()['value'].reindex(df.index)
    df['2, log(1+x)'] = get_synonyms(word, n, method=2, f=lambda x: np.log(1+x)).reset_index()['value'].reindex(df.index)
    return df

print(compare_methods('disappear').to_latex(index=False, na_rep=''))

\begin{tabular}{llllll}
\toprule
   WordNet &          0 &          1 &            2 &    2, sqrt & 2, log(1+x) \\
\midrule
 disappear &         to &  disappear &       evanid &     evanid &   disappear \\
    vanish &         as &     vanish &       vanish &  disappear &      evanid \\
      melt &          a &       pass &    disappear &     vanish &      vanish \\
 evaporate &         or &        die &        faint &      faint &          to \\
   go away &         of &       fade &         fade &       fade &          as \\
           &        the &       wear &          die &        die &        pass \\
           &  disappear &      faint &         pass &       pass &        fade \\
           &         be &       sail &    dissipate &  eliminate &       faint \\
           &       from &  gradually &     evanesce &  dissipate &         die \\
           &         at &      light &  disappeared &   evanesce &          or \\
\bottomrule
\end{tabular}



In [5]:
# Metoda 0: Koristi sve veze među riječima
# Metoda 1: Uklanja riječi iz induciranog grafa koje se pojavljuju u definicijama više od 1000 puta
# Metoda 2: adjacency_matrix[i][j] iz metode 0 je dijelimo s brojem ukupnih pojavljivanja riječi j u definicijama
# Metoda 2, f: adjacency_matrix[i][j] iz metode 0 dijelimo s f(broj ukupnih pojavljivanja riječi j u definicijama)
for method, f in [(0, None), (1, None), (2, None), (2, np.sqrt), (2, lambda x: np.log(1+x))]:
    print((method, f))
    print(get_synonyms('disappear', method=method, f=f))

(0, None)
value
to           0.190090
as           0.187332
a            0.179362
or           0.177176
of           0.172529
the          0.160343
disappear    0.140431
be           0.132365
from         0.127341
at           0.107400
Name: 1, dtype: float64
(1, None)
value
disappear    0.369223
vanish       0.191113
pass         0.163782
die          0.118671
fade         0.103871
wear         0.098721
faint        0.097857
sail         0.087878
gradually    0.085647
light        0.066490
Name: 1, dtype: float64
(2, None)
value
evanid         0.579569
vanish         0.015435
disappear      0.015208
faint          0.007551
fade           0.001621
die            0.001493
pass           0.001353
dissipate      0.001344
evanesce       0.001343
disappeared    0.000805
Name: 1, dtype: float64
(2, <ufunc 'sqrt'>)
value
evanid       0.416421
disappear    0.265976
vanish       0.176820
faint        0.106344
fade         0.100385
die          0.072913
pass         0.063720
eliminate    0.05793