In [7]:
!python3.6 -m pip install pymagnitude

Collecting pymagnitude
  Downloading https://files.pythonhosted.org/packages/0a/a3/b9a34d22ed8c0ed59b00ff55092129641cdfa09d82f9abdc5088051a5b0c/pymagnitude-0.1.120.tar.gz (5.4MB)
[K    100% |████████████████████████████████| 5.4MB 415kB/s eta 0:00:011
[?25hBuilding wheels for collected packages: pymagnitude
  Running setup.py bdist_wheel for pymagnitude ... [?25ldone
[?25h  Stored in directory: /home/maciej/.cache/pip/wheels/a2/c7/98/cb48b9db35f8d1a7827b764dc36c5515179dc116448a47c8a1
Successfully built pymagnitude
Installing collected packages: pymagnitude
Successfully installed pymagnitude-0.1.120


In [2]:
import heapq
import math
import os

from pymagnitude import *
from scipy.spatial.distance import cityblock, cosine, euclidean
from tqdm import tqdm

In [3]:
# Directory that holds the .magnitude model files. Defaults to the original
# checkout location; set the NLP_MODELS_DIR environment variable to run the
# notebook on another machine without editing this cell.
# os.path.join(..., "") guarantees a trailing path separator, because the model
# paths in this notebook are built by plain string concatenation
# (modelsDir + file name).
modelsDir = os.path.join(
    os.environ.get("NLP_MODELS_DIR", "/home/maciej/repos/nlp-2/"), ""
)

# File names of the two filtered embedding stores inside modelsDir.
w2v_filtered_file_mag = "skip_gram_v100m8.w2v.filtered.magnitude"
ft_filtered_file_mag = "kgr10.plain.skipgram.dim100.neg10.filtered.magnitude"

In [4]:
# Open the two embedding stores; the "ft" one is opened first, then "w2v".
ft_vectors = Magnitude(modelsDir + ft_filtered_file_mag)
w2v_vectors = Magnitude(modelsDir + w2v_filtered_file_mag)

# Lookup tables so later cells can select a model / distance function by name.
models = {
    "ft": ft_vectors,
    "w2v": w2v_vectors,
}

# Distance functions from scipy.spatial.distance (smaller value = closer).
measures = {
    "cityblock": cityblock,
    "cosine": cosine,
    "euclidean": euclidean,
}

In [5]:
def test(model, measure):
    s1 = "lodówka"
    s2 = "lodowisko"
    return measure(model.query(s1), model.query(s2))

# Print the example-pair distance for every (model, measure) combination,
# models in the outer loop and measures in the inner loop.
# NOTE(review): the output saved with this cell shows exactly the same three
# values for "ft" and "w2v" - confirm that the two .magnitude files really
# contain different embeddings.
for model_str in ("ft", "w2v"):
    model = models[model_str]
    for measure_str in ("cityblock", "cosine", "euclidean"):
        measure = measures[measure_str]
        print(
            'Test for {} model and {} measure:'.format(model_str, measure_str),
            test(model, measure),
            sep="\n",
        )

Test for ft model and cityblock measure:
9.575574
Test for ft model and cosine measure:
0.69649538397789
Test for ft model and euclidean measure:
1.1802505254745483
Test for w2v model and cityblock measure:
9.575574
Test for w2v model and cosine measure:
0.69649538397789
Test for w2v model and euclidean measure:
1.1802505254745483


In [6]:
def most_similar(word, k, model, measure):
    """Return the ``k`` keys of ``model`` that are closest to ``word``.

    Every (key, vector) pair yielded by iterating ``model`` is compared with
    the vector of ``word`` using ``measure``. Candidates whose distance is
    exactly 0.0 (the query word itself, or an identical vector) or NaN are
    skipped for the whole scan, not only once the result list is full.

    Args:
        word: the query word; its vector is obtained with ``model.query``.
        k: maximum number of neighbours to return.
        model: iterable of ``(key, vector)`` pairs that also exposes
            ``query(word)``.
        measure: callable taking two vectors and returning a distance
            (smaller means more similar).

    Returns:
        A list of at most ``k`` ``(key, distance)`` tuples sorted by ascending
        distance. Candidates with equal distance keep their iteration order.
        The list is empty when ``k`` is not positive.
    """
    if k <= 0:
        return []

    word_vec = model.query(word)

    def candidates():
        # Iteration already yields each key's vector, so the vector is used
        # directly instead of issuing a second model.query(key) per key.
        # NOTE(review): assumes the yielded vector equals model.query(key),
        # as the pymagnitude docs describe - confirm for custom models.
        for key, vector in tqdm(model):
            distance = measure(word_vec, vector)
            if distance == 0.0 or math.isnan(distance):
                continue
            yield key, distance

    # nsmallest keeps only k items in memory and is stable for ties, matching
    # the ordering of the previous hand-written sorted insert.
    return heapq.nsmallest(k, candidates(), key=lambda pair: pair[1])

In [7]:
# Look up the three nearest neighbours of "lodówka" in the ft model using
# cosine distance. The progress bar may stop short of 100%; per the original
# author's note that is fine.
similars = most_similar(word="lodówka", k=3, model=ft_vectors, measure=cosine)
print("DONE", similars, sep="\n")

 37%|███▋      | 925433/2502337 [04:02<06:53, 3813.14it/s]

[('mikrofalówka', 0.09864002466201782), ('zamrażarka', 0.1100584864616394), ('zamrażalka', 0.12743932008743286)]



