In [2]:
import json
import csv
import random
import time
import torch
import torch.nn as nn
import numpy as np

In [5]:

fasttext_sv = np.load('data/snips_processed/FastText-sv.npy')
fasttext_en = np.load('data/snips_processed/FastText-en.npy')
fasttext_fi = np.load('data/snips_processed/FastText-fi.npy')
fasttext_uw_sv = np.load('data/snips_processed/FastText-uw-sv.npy')
fasttext_uw_en = np.load('data/snips_processed/FastText-uw-en.npy')
fasttext_uw_fi = np.load('data/snips_processed/FastText-uw-fi.npy')

elmo_sv = np.load('data/snips_processed/ELMO-sv.npy')
elmo_en = np.load('data/snips_processed/ELMO-en.npy')
use_en = np.load('data/snips_processed/USE-en.npy')

labs = np.load('data/snips_processed/labs.npy')

fasttext_sv = np.squeeze(fasttext_sv, axis=1)
fasttext_en = np.squeeze(fasttext_en, axis=1)
fasttext_fi = np.squeeze(fasttext_fi, axis=1)
fasttext_uw_sv = np.squeeze(fasttext_uw_sv, axis=1)
fasttext_uw_en = np.squeeze(fasttext_uw_en, axis=1)
fasttext_uw_fi = np.squeeze(fasttext_uw_fi, axis=1)
elmo_sv = np.squeeze(elmo_sv, axis=1)
elmo_en = np.squeeze(elmo_en, axis=1)
use_en = np.squeeze(use_en, axis=1)


#appendd bias
fasttext_sv = np.hstack([fasttext_sv, np.array([np.ones(len(fasttext_sv))]).T])
fasttext_uw_sv = np.hstack([fasttext_uw_sv, np.array([np.ones(len(fasttext_uw_sv))]).T])

fasttext_uw_fi = np.hstack([fasttext_uw_fi, np.array([np.ones(len(fasttext_uw_fi))]).T])
elmo_sv = np.hstack([elmo_sv, np.array([np.ones(len(elmo_sv))]).T])


In [6]:
def closest_index(request, dots, forbiden_index=-1):
    dists = np.linalg.norm(dots-request, axis=1)
    res =  np.argmin(dists)
    
    if res == forbiden_index:
        dists[res] = np.inf
        return np.argmin(dists)
    return res

def test_transformation(x, test_in, test_out, test_labels, verbose = False, notransform=False):
    right = 0
    n =  len(test_in)
    for i in range(n):
        vec_in = test_in[i]
        true_lab = test_labels[i]
        
        vec_out =  vec_in if notransform else vec_in@x
        predicted_index = closest_index(vec_out, test_out, forbiden_index=i)
        predicted_lab = test_labels[predicted_index]
        
        if predicted_lab == true_lab:
            right += 1
            
        if verbose and i%100 == 0:
            print('{}/{} tested, right: {}, acc {}'.format(i, n, right, right/n))

    return right/n
        
        
    
def test_transform_pair(vecs_in, vecs_out, labels, start_test_i=0,end_test_i=5000, notransform=False):
    
    train_in = np.vstack((vecs_in[:start_test_i], vecs_in[end_test_i :]))
    train_out = np.vstack((vecs_out[:start_test_i], vecs_out[end_test_i :]))
    
    test_in = vecs_in[start_test_i: end_test_i]
    test_out = vecs_out[start_test_i: end_test_i]
    test_labels = labels[start_test_i: end_test_i]
    
    x, res, rank, sing = np.linalg.lstsq(train_in, train_out)
    
    return test_transformation(x, test_in, test_out, test_labels, notransform=notransform)
        

In [7]:
def cv(vecs_in, vecs_out, labs, folds = 5, notransform=False):
    
    delims = np.arange(0, len(vecs_in), len(vecs_in)//folds)
    results = []
    t = time.time()
    for i in range(folds):
        acc = test_transform_pair(vecs_in, vecs_out,labs,  start_test_i=delims[i],end_test_i=delims[i+1], notransform=notransform)
        
        results.append(acc)
        print('#{:3d}, {:5d} sec. acc = {:.3f}'.format(i+1, int(time.time() - t), results[-1]))

    return(sum(results)/len(results))   
    

In [71]:
cv(elmo_sv, elmo_en, labs)



#  0,    52 sec. acc = 0.901
#  1,   104 sec. acc = 0.897
#  2,   158 sec. acc = 0.897
#  3,   208 sec. acc = 0.872
#  4,   260 sec. acc = 0.892


0.891944847605225

In [81]:
cv(fasttext_sv, elmo_en, labs)



#  1,    86 sec. acc = 0.882
#  2,   158 sec. acc = 0.873
#  3,   217 sec. acc = 0.874
#  4,   264 sec. acc = 0.835
#  5,   310 sec. acc = 0.849


0.8626269956458635

In [73]:
cv(elmo_sv, use_en, labs)



#  0,    24 sec. acc = 0.910
#  1,    48 sec. acc = 0.923
#  2,    72 sec. acc = 0.919
#  3,    93 sec. acc = 0.904
#  4,   115 sec. acc = 0.923


0.9158925979680695

In [82]:
cv(fasttext_sv, use_en, labs)



#  1,    22 sec. acc = 0.890
#  2,    42 sec. acc = 0.907
#  3,    60 sec. acc = 0.906
#  4,    79 sec. acc = 0.880
#  5,    98 sec. acc = 0.899


0.896589259796807

In [85]:
cv(fasttext_uw_sv, elmo_en, labs)



#  1,    46 sec. acc = 0.903
#  2,    96 sec. acc = 0.898
#  3,   145 sec. acc = 0.890
#  4,   188 sec. acc = 0.861
#  5,   261 sec. acc = 0.892


0.8888243831640057

In [84]:
cv(fasttext_uw_sv, use_en, labs)



#  1,    22 sec. acc = 0.910
#  2,    40 sec. acc = 0.926
#  3,    60 sec. acc = 0.918
#  4,    81 sec. acc = 0.890
#  5,   107 sec. acc = 0.918


0.9123367198838895

In [8]:
cv(fasttext_uw_fi, use_en, labs)



#  1,    16 sec. acc = 0.902
#  2,    34 sec. acc = 0.933
#  3,    54 sec. acc = 0.930
#  4,    75 sec. acc = 0.897
#  5,    94 sec. acc = 0.929


0.9182148040638607

In [9]:
cv(fasttext_fi, use_en, labs)



#  1,    19 sec. acc = 0.881
#  2,    37 sec. acc = 0.898
#  3,    55 sec. acc = 0.892
#  4,    71 sec. acc = 0.872
#  5,    87 sec. acc = 0.888


0.8860667634252539

In [103]:
#notransform (single emb test)

#cv(fasttext_uw_en, fasttext_uw_en,labs, notransform=True)
print(cv(fasttext_en, fasttext_en,labs, notransform=True))
print(cv(fasttext_sv, fasttext_sv,labs, notransform=True))
print(cv(fasttext_uw_sv, fasttext_uw_sv,labs, notransform=True))
print(cv(elmo_sv, elmo_sv,labs, notransform=True))
print(cv(elmo_en, elmo_en,labs, notransform=True))



#  1,     6 sec. acc = 0.861
#  2,    15 sec. acc = 0.870
#  3,    24 sec. acc = 0.849
#  4,    31 sec. acc = 0.848
#  5,    39 sec. acc = 0.849
0.8555152394775035
#  1,    14 sec. acc = 0.814
#  2,    32 sec. acc = 0.817
#  3,    47 sec. acc = 0.817
#  4,    62 sec. acc = 0.806
#  5,    78 sec. acc = 0.798
0.8103047895500726
#  1,    16 sec. acc = 0.903
#  2,    32 sec. acc = 0.901
#  3,    47 sec. acc = 0.902
#  4,    61 sec. acc = 0.901
#  5,    75 sec. acc = 0.889
0.8992017416545718
#  1,    54 sec. acc = 0.895
#  2,   108 sec. acc = 0.904
#  3,   154 sec. acc = 0.904
#  4,   201 sec. acc = 0.892
#  5,   250 sec. acc = 0.901
0.8991291727140783
#  1,    25 sec. acc = 0.922
#  2,    51 sec. acc = 0.915
#  3,    76 sec. acc = 0.911
#  4,   102 sec. acc = 0.913
#  5,   130 sec. acc = 0.909
0.9140058055152395


In [7]:
print(cv(use_en, use_en,labs, notransform=True))



#  1,    28 sec. acc = 0.925
#  2,    61 sec. acc = 0.917
#  3,    92 sec. acc = 0.925
#  4,   127 sec. acc = 0.918
#  5,   159 sec. acc = 0.928
0.9223512336719883


In [137]:
print('Transform matrix for w2v {}, \nTransform m for elmo {}'.format(x.shape, x_e.shape))
print('train error of regression for w2w {:.3}, elmo {:.3}, \ntest error w2v {:.3} elmo {:.3}'.format(
    np.linalg.norm(en_train -(sv_train@x)),\
    np.linalg.norm(en_train -(sv_train_e@x_e)), \
    np.linalg.norm(en_val -(sv_val@x)),\
    np.linalg.norm(en_val -(sv_val_e@x_e)))) 


Transform matrix for w2v (301, 512), 
Transform m for elmo (1025, 512)
train error of regression for w2w 54.7, elmo 40.7, 
test error w2v 3.59e+10 elmo 57.0


In [10]:
 def transformation_MSE(vecs_in, vecs_out, start_test_i=0,end_test_i=5000):
    
    train_in = np.vstack((vecs_in[:start_test_i], vecs_in[end_test_i :]))
    train_out = np.vstack((vecs_out[:start_test_i], vecs_out[end_test_i :]))
    
    test_in = vecs_in[start_test_i: end_test_i]
    test_out = vecs_out[start_test_i: end_test_i]
    
    x, res, rank, sing = np.linalg.lstsq(train_in, train_out)
    res_out = test_in@x
    c =  torch.nn.MSELoss()
    i = torch.tensor(test_out).float()
    j = torch.tensor(res_out).float()
    return(c(i, j))

In [11]:
def cv_mse(vecs_in, vecs_out, folds = 5, notransform=False):
    
    delims = np.arange(0, len(vecs_in), len(vecs_in)//folds)
    results = []
    t = time.time()
    for i in range(folds):
        acc = transformation_MSE(vecs_in, vecs_out,  start_test_i=delims[i],end_test_i=delims[i+1])
        
        results.append(acc)
        print('#{:3d}, {:5d} sec. acc = {:.9f}'.format(i+1, int(time.time() - t), results[-1]))

    return(sum(results)/len(results))   


In [12]:
print(cv_mse(fasttext_fi, use_en).item())
print(cv_mse(fasttext_uw_fi, use_en).item())

  if __name__ == '__main__':


#  1,     8 sec. acc = 0.000889627
#  2,    15 sec. acc = 0.000895678
#  3,    20 sec. acc = 0.000893725
#  4,    27 sec. acc = 0.000897559
#  5,    30 sec. acc = 0.000883288
0.0008919754181988537
#  1,     5 sec. acc = 0.000905060
#  2,    12 sec. acc = 0.000930227
#  3,    15 sec. acc = 0.000922849
#  4,    18 sec. acc = 0.000934593
#  5,    21 sec. acc = 0.000937212
0.0009259882499463856


In [32]:
print(cv_mse(fasttext_sv, elmo_en).item())
print(cv_mse(fasttext_sv, use_en).item())
print(cv_mse(fasttext_uw_sv, elmo_en).item())
print(cv_mse(fasttext_uw_sv, use_en).item())
print(cv_mse(elmo_sv, elmo_en).item())
print(cv_mse(elmo_sv, use_en).item())


  if __name__ == '__main__':


#  1,     2 sec. acc = 0.007755050
#  2,     4 sec. acc = 0.007365789
#  3,     7 sec. acc = 0.006656331
#  4,    10 sec. acc = 0.007450757
#  5,    13 sec. acc = 0.008023358
0.0074502574279904366
#  1,     3 sec. acc = 0.000838520
#  2,    12 sec. acc = 0.000832682
#  3,    14 sec. acc = 0.000839353
#  4,    16 sec. acc = 0.000841297
#  5,    18 sec. acc = 0.000835262
0.0008374226163141429
#  1,     1 sec. acc = 0.008533436
#  2,     4 sec. acc = 0.009000858
#  3,     7 sec. acc = 0.008245867
#  4,     9 sec. acc = 0.009304198
#  5,    15 sec. acc = 0.008982901
0.00881345197558403
#  1,     6 sec. acc = 0.000888445
#  2,     8 sec. acc = 0.000968067
#  3,     9 sec. acc = 0.000959471
#  4,    11 sec. acc = 0.001013159
#  5,    13 sec. acc = 0.000896327
0.000945093750488013
#  1,    21 sec. acc = 0.007087712
#  2,    34 sec. acc = 0.006732171
#  3,    53 sec. acc = 0.006150343
#  4,    75 sec. acc = 0.006764330
#  5,    90 sec. acc = 0.007386914
0.006824294570833445
#  1,    17 sec. ac

In [30]:

r = cv_mse(fasttext_sv, use_en, notransform=True)

  if __name__ == '__main__':


#  1,     4 sec. acc = 0.000838520
#  2,    10 sec. acc = 0.000832682
#  3,    12 sec. acc = 0.000839353
#  4,    13 sec. acc = 0.000841297
#  5,    15 sec. acc = 0.000835262


In [31]:
r.item()

0.0008374226163141429

In [14]:
#np.hstack(
fasttext_sv.shape, np.array([np.ones(len(fasttext_sv))]).T.shape
#)

((13784, 1, 300), (13784, 1))

In [38]:
right = 0
exact = 0
n =  len(use_en)
for i in range(n):
        
    predicted_index = closest_index(use_en[i], use_en, forbiden_index=i)
    
    predicted_lab = labs[predicted_index]
        
    if predicted_lab == labs[i]:
        right += 1
            
    if predicted_index == i:
        exact += 1
        
    if  i%100 == 1:
        print('{}/{} tested, right: {}, acc {}, ex {}'.format(i, n, right, right/i, exact))



1/13784 tested, right: 2, acc 2.0, ex 0
101/13784 tested, right: 97, acc 0.9603960396039604, ex 0
201/13784 tested, right: 191, acc 0.9502487562189055, ex 0
301/13784 tested, right: 283, acc 0.9401993355481728, ex 0
401/13784 tested, right: 379, acc 0.9451371571072319, ex 0
501/13784 tested, right: 476, acc 0.9500998003992016, ex 0
601/13784 tested, right: 575, acc 0.956738768718802, ex 0
701/13784 tested, right: 669, acc 0.9543509272467903, ex 0
801/13784 tested, right: 764, acc 0.9538077403245943, ex 0
901/13784 tested, right: 858, acc 0.9522752497225305, ex 0
1001/13784 tested, right: 953, acc 0.952047952047952, ex 0
1101/13784 tested, right: 1045, acc 0.9491371480472298, ex 0
1201/13784 tested, right: 1135, acc 0.9450457951706911, ex 0
1301/13784 tested, right: 1231, acc 0.9461952344350499, ex 0
1401/13784 tested, right: 1326, acc 0.9464668094218416, ex 0
1501/13784 tested, right: 1422, acc 0.9473684210526315, ex 0
1601/13784 tested, right: 1518, acc 0.948157401623985, ex 0
1701/13

13501/13784 tested, right: 12734, acc 0.9431893933782682, ex 0
13601/13784 tested, right: 12832, acc 0.943460039702963, ex 0
13701/13784 tested, right: 12920, acc 0.942996861542953, ex 0


In [39]:
right/13784

0.9429773650609402