In [7]:
!python3.6 -m pip install pymagnitude

Collecting pymagnitude
  Downloading https://files.pythonhosted.org/packages/0a/a3/b9a34d22ed8c0ed59b00ff55092129641cdfa09d82f9abdc5088051a5b0c/pymagnitude-0.1.120.tar.gz (5.4MB)
[K    100% |████████████████████████████████| 5.4MB 415kB/s eta 0:00:011
[?25hBuilding wheels for collected packages: pymagnitude
  Running setup.py bdist_wheel for pymagnitude ... [?25ldone
[?25h  Stored in directory: /home/maciej/.cache/pip/wheels/a2/c7/98/cb48b9db35f8d1a7827b764dc36c5515179dc116448a47c8a1
Successfully built pymagnitude
Installing collected packages: pymagnitude
Successfully installed pymagnitude-0.1.120


In [1]:
# Star import kept first so the explicit imports below take precedence over
# any identically named objects it brings into the namespace.
from pymagnitude import *

import heapq
import operator

import numpy as np
import pandas as pd
from scipy.spatial.distance import cityblock, cosine, euclidean
from tqdm import tqdm

In [2]:
# Directory that holds the embedding files; it ends with a slash because the
# file names below are appended to it by plain string concatenation.
modelsDir = "/home/maciej/repos/nlp-2/"

# File names of the two filtered embedding models, relative to modelsDir.
ft_filtered_file_mag = "kgr10.plain.skipgram.dim100.neg10.filtered.magnitude"
w2v_filtered_file_mag = "skip_gram_v100m8.w2v.filtered.magnitude"

In [3]:
# Open both embedding models; the path is the models directory followed
# directly by the file name.
ft = Magnitude(modelsDir + ft_filtered_file_mag)
w2v = Magnitude(modelsDir + w2v_filtered_file_mag)

# Lookup tables so models and distance functions can be selected by short name.
models = {"ft": ft, "w2v": w2v}
measures = {
    "cityblock": cityblock,
    "cosine": cosine,
    "euclidean": euclidean,
}

In [4]:
def similarity(model, measure, w1, w2):
    """Return the similarity of two words as the inverse of their distance.

    Args:
        model: Object exposing ``query(word)`` that returns the word's vector.
        measure: Callable ``measure(u, v)`` returning the distance between two
            vectors.
        w1: First word.
        w2: Second word.

    Returns:
        ``1.0 / distance``. When the distance is exactly zero (e.g. both words
        map to the same vector) ``float("inf")`` is returned instead of
        dividing by zero.
    """
    distance = measure(model.query(w1), model.query(w2))
    if distance == 0:
        # Avoid ZeroDivisionError (plain floats) or a divide warning (NumPy
        # floats); identical vectors are "infinitely" similar on this scale.
        return float("inf")
    return 1.0 / distance

In [14]:
def run_test(input_path='test_data.npy', output_path='test_embeddings'):
    """Score every word pair of the test set with each model and distance measure.

    Each row of the input array is expected to start with the two words to
    compare. For every row the similarity is computed with both models
    (``ft`` then ``w2v``) and, for each model, with the cityblock, cosine and
    euclidean measures, in that order.

    Args:
        input_path: ``.npy`` file with the word pairs (default keeps the
            original hard-coded ``'test_data.npy'``).
        output_path: Target passed to ``np.save`` (default keeps the original
            ``'test_embeddings'``; NumPy appends ``.npy`` when it is missing).

    The saved rows have the layout
    ``[word_1, word_2, ft_cb, ft_cos, ft_eu, w2v_cb, w2v_cos, w2v_eu]``.
    Because words and scores share one array, the scores are stored as text
    and have to be cast back to float by the reader.
    """
    test_data = np.load(input_path)
    results = []
    for data in test_data:
        word_1, word_2 = data[0], data[1]
        # Model-major, measure-minor order reproduces the original column order.
        scores = [
            similarity(model, measure, word_1, word_2)
            for model in (ft, w2v)
            for measure in (cityblock, cosine, euclidean)
        ]
        results.append([word_1, word_2] + scores)
    np.save(output_path, results)

In [15]:
# Score every word pair from test_data.npy and save the rows to test_embeddings.npy.
run_test()

In [16]:
def test_corr():
    test_data = np.load('test_data.npy')
    df_test = pd.DataFrame(test_data, columns = ['word_1', 'word_2', 'simi_base'])
    df_test['simi_base'] = df_test['simi_base'].astype(float)
    
    my_results = np.load('test_embeddings.npy')
    df_my_results = pd.DataFrame(my_results, columns = ['word_1', 'word_2', 'fasttext_cityblock', 'fasttext_cosine', 'fasttext_euclidean', 'word2vec_cityblock', 'word2vec_cosine', 'word2vec_euclidean'])
    df_my_results['fasttext_cityblock'] = df_my_results['fasttext_cityblock'].astype(float)
    df_my_results['fasttext_cosine'] = df_my_results['fasttext_cosine'].astype(float)
    df_my_results['fasttext_euclidean'] = df_my_results['fasttext_euclidean'].astype(float)
    df_my_results['word2vec_cityblock'] = df_my_results['word2vec_cityblock'].astype(float)
    df_my_results['word2vec_cosine'] = df_my_results['word2vec_cosine'].astype(float)
    df_my_results['word2vec_euclidean'] = df_my_results['word2vec_euclidean'].astype(float)
    
    merge = pd.concat([df_test, df_my_results], axis=1, join='inner')[['simi_base', 'fasttext_cityblock', 'fasttext_cosine', 'fasttext_euclidean', 'word2vec_cityblock', 'word2vec_cosine', 'word2vec_euclidean']]
    return merge.corr()

In [17]:
# Display the correlation matrix between simi_base and each model/measure score column.
test_corr()

Unnamed: 0,simi_base,fasttext_cityblock,fasttext_cosine,fasttext_euclidean,word2vec_cityblock,word2vec_cosine,word2vec_euclidean
simi_base,1.0,0.326752,0.218275,0.328461,0.24592,0.177479,0.247366
fasttext_cityblock,0.326752,1.0,0.91973,0.994925,0.689833,0.683695,0.697864
fasttext_cosine,0.218275,0.91973,1.0,0.92887,0.623891,0.695348,0.632835
fasttext_euclidean,0.328461,0.994925,0.92887,1.0,0.689955,0.687694,0.697905
word2vec_cityblock,0.24592,0.689833,0.623891,0.689955,1.0,0.957396,0.995669
word2vec_cosine,0.177479,0.683695,0.695348,0.687694,0.957396,1.0,0.96295
word2vec_euclidean,0.247366,0.697864,0.632835,0.697905,0.995669,0.96295,1.0


In [6]:
def most_similar(word, k, model, measure):
    """Return the ``k`` entries of ``model`` that are closest to ``word``.

    Args:
        word: Query word; its vector is obtained with ``model.query(word)``.
        k: Maximum number of neighbours to return.
        model: Iterable of ``(key, vector)`` pairs that also exposes
            ``query(key)``.
        measure: Callable ``measure(u, v)`` returning the distance between two
            vectors (smaller means more similar).

    Returns:
        A list of at most ``k`` ``(key, distance)`` tuples sorted by ascending
        distance. Entries at distance exactly ``0.0`` (which is how the query
        word itself is excluded) and entries whose distance is NaN are skipped.
        Ties keep the order in which the keys were encountered.
    """
    word_vec = model.query(word)

    def candidates():
        # The vector yielded by iteration is ignored and the key is re-queried,
        # exactly as before. NOTE(review): presumably both are the same vector;
        # if confirmed, using the yielded one would save a lookup per key.
        for key, _ in tqdm(model):
            distance = measure(word_vec, model.query(key))
            # Apply the filter to every candidate. Previously the first k
            # candidates bypassed it, so the query word (distance 0) or a NaN
            # distance could end up in the result.
            if distance != 0.0 and distance == distance:
                yield key, distance

    # nsmallest keeps only k items in memory and is stable for equal distances.
    return heapq.nsmallest(k, candidates(), key=operator.itemgetter(1))

In [7]:
# Three nearest neighbours of "lodówka" in the ft model under cosine distance.
# The progress bar may stop short of 100%; per the original note that is expected.
similars = most_similar("lodówka", 3, ft, cosine)
print("DONE")
print(similars)

 37%|███▋      | 925433/2502337 [04:02<06:53, 3813.14it/s]

[('mikrofalówka', 0.09864002466201782), ('zamrażarka', 0.1100584864616394), ('zamrażalka', 0.12743932008743286)]



