# Crosslingual Embeddings Study

### Counting the number of rows of embeddings

In [1]:
def nb_rows_embeddings(emb_path):
    """Count the rows of a text embedding file, excluding its first line.

    The first line of the file is not counted, so the result is the total
    number of lines minus one.

    Parameters
    ----------
    emb_path : str
        Path of the file to scan.

    Returns
    -------
    int
        Number of lines after the first one; 0 when the file is empty or
        holds only its first line.
    """
    with open(emb_path) as f:
        # Stream the file line by line so it is never held in memory as a whole.
        nb_lines = sum(1 for _ in f)
    # We don't count the 0th line. Clamping at 0 covers the empty file, which
    # previously left the loop variable unbound and raised UnboundLocalError.
    return max(nb_lines - 1, 0)

In [2]:
# The first line of this embedding file reads: 999994 300
en_emb_path = 'vecmap/en-crossemb.smalldict.vec'
nb = nb_rows_embeddings(en_emb_path)
en_report = "Number of rows of english crosslingual embeddings: {}"
print(en_report.format(nb))

Number of rows of english crosslingual embeddings: 999994


In [3]:
# The first line of this embedding file reads: 1824848 300
eu_emb_path = 'vecmap/eu-crossemb.smalldict.vec'
nb = nb_rows_embeddings(eu_emb_path)
eu_report = "Number of rows of basque crosslingual embeddings: {}"
print(eu_report.format(nb))

Number of rows of basque crosslingual embeddings: 1824848


### Loading words

In [4]:
import io
import numpy as np

def load_words(emb_path):
    """Build the word -> index vocabulary of a text embedding file.

    The first line of the file is skipped. Every following non-blank line is
    split at its first space; the part before it is the word, and the rest of
    the line is ignored. Words are numbered in file order, starting at 0.

    Parameters
    ----------
    emb_path : str
        Path of the embedding file, read as UTF-8 with undecodable bytes
        dropped (``errors='ignore'``).

    Returns
    -------
    dict
        Mapping from each word to its 0-based position among the data lines.
        Empty when the file has no data lines.

    Raises
    ------
    AssertionError
        If the same word appears on two lines.
    ValueError
        If a non-blank line contains no space separating word and vector.
    """
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        # Skip the first line; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            line = line.rstrip()
            if not line:
                # A blank line (typically a trailing newline at end of file)
                # holds no entry and used to break the two-name unpacking below.
                continue
            word, _ = line.split(' ', 1)
            assert word not in word2id, 'word found twice'
            word2id[word] = len(word2id)
    return word2id

In [5]:
# Embedding files used as source (en) and target (eu) throughout the notebook.
src_path, tgt_path = 'vecmap/en-crossemb.smalldict.vec', 'vecmap/eu-crossemb.smalldict.vec'

# Vocabulary (word -> index) of each file.
src_word2id = load_words(emb_path=src_path)
tgt_word2id = load_words(emb_path=tgt_path)

In [8]:
def display_dicts_of_words_beginning_with(src_word, tgt_word, src_word2id=src_word2id, tgt_word2id=tgt_word2id):
    """Print the vocabulary entries starting with a prefix, for both vocabularies.

    For each side, the number of matching entries is printed, followed by the
    list of ``(word, index)`` pairs in the vocabulary's iteration order.

    Parameters
    ----------
    src_word : str
        Prefix searched for in ``src_word2id``.
    tgt_word : str
        Prefix searched for in ``tgt_word2id``.
    src_word2id, tgt_word2id : dict
        Word -> index mappings. The defaults are the objects the notebook
        globals referred to when this cell was executed; rebinding those
        globals later does not change the defaults.
    """
    def entries_with_prefix(vocab, prefix):
        # Collect the (word, index) pairs whose word begins with `prefix`.
        return [entry for entry in vocab.items() if entry[0].startswith(prefix)]

    # Both lists are built before anything is printed.
    src_matches = entries_with_prefix(src_word2id, src_word)
    tgt_matches = entries_with_prefix(tgt_word2id, tgt_word)

    src_count, tgt_count = len(src_matches), len(tgt_matches)
    print("Length of src_dict beginning with {}: {}".format(src_word, src_count))
    print(src_matches)
    print("")
    print("Length of trg_dict beginning with {}: {}".format(tgt_word, tgt_count))
    print(tgt_matches)

#### "woman" and "emakume"

In [11]:
# Compare the entries starting with "woman" and with "emakume".
display_dicts_of_words_beginning_with(src_word="woman", tgt_word="emakume")

Length of src_dict beginning with woman: 31
[('woman', 992), ('womanhood', 52055), ('womanly', 81183), ('womanizing', 89162), ('womanizer', 100443), ('womans', 114365), ('womanising', 128035), ('womaniser', 150283), ('womankind', 152287), ('woman.', 216253), ('womanist', 238793), ('woman-hating', 266212), ('woman-owned', 267998), ('woman-', 295157), ('womanliness', 338554), ('womanish', 346061), ('womanism', 367866), ('woman-centered', 477397), ('woman-friendly', 494400), ('womanizers', 516549), ('womanʼs', 524065), ('woman-hater', 675899), ('woman-to-woman', 686000), ('woman-child', 744032), ('womanisers', 748589), ('woman-centred', 777472), ('woman-only', 862148), ('woman-like', 873558), ('womanless', 925523), ('woman-led', 934298), ('woman--', 962309)]

Length of trg_dict beginning with emakume: 167
[('emakume', 56), ('emakumeak', 125), ('emakumeen', 2864), ('emakumea', 2960), ('emakumezko', 4311), ('emakumeek', 5666), ('emakumezkoen', 6782), ('emakumearen', 8618), ('emakumeei', 143

### Loading embeddings

In [12]:
import io
import numpy as np

def load_embeddings(emb_path, nmax=50000):
    """Load at most `nmax` word vectors from a text embedding file.

    The first line of the file is skipped. Every following non-blank line is
    split at its first space into a word and a space-separated list of floats.

    Parameters
    ----------
    emb_path : str
        Path of the embedding file, read as UTF-8 with undecodable bytes
        dropped (``errors='ignore'``).
    nmax : int, optional
        Reading stops once this many words have been loaded. A value that the
        vocabulary size never equals (e.g. 0 or ``None``) loads the whole file.

    Returns
    -------
    embeddings : numpy.ndarray
        One row per loaded word, stacked in file order. An empty array of
        shape ``(0, 0)`` when no data line was read.
    id2word : dict
        Row index -> word.
    word2id : dict
        Word -> row index.

    Raises
    ------
    AssertionError
        If the same word appears on two lines.
    ValueError
        If a non-blank line contains no space separating word and vector.
    """
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        # Skip the first line; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            line = line.rstrip()
            if not line:
                # A blank line (typically a trailing newline at end of file)
                # holds no entry and used to break the two-name unpacking below.
                continue
            word, vect = line.split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    # np.vstack rejects an empty sequence, so return an empty 2-D array instead
    # of crashing when the file held no data line.
    embeddings = np.vstack(vectors) if vectors else np.empty((0, 0))
    return embeddings, id2word, word2id

In [13]:
# Embedding files used as source (en) and target (eu).
src_path, tgt_path = 'vecmap/en-crossemb.smalldict.vec', 'vecmap/eu-crossemb.smalldict.vec'

# Maximum number of word embeddings to load from each file.
nmax = 100000

# Note: this rebinds src_word2id / tgt_word2id to the truncated vocabularies.
src_embeddings, src_id2word, src_word2id = load_embeddings(emb_path=src_path, nmax=nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_embeddings(emb_path=tgt_path, nmax=nmax)

### Study of some embeddings

In [14]:
# Look the row up once and reuse it, so the printed vector always belongs to
# "two" even if the vocabulary order changes (the row was hard-coded as 105).
two_id = src_word2id["two"]
print(two_id)
print(src_embeddings[two_id])

105
[-4.84130e-01 -1.75258e-02  7.75707e-02  1.11623e-01  4.63866e-02
  2.31642e-01  7.40832e-03 -1.87734e-01  7.71220e-02 -2.50158e-02
 -1.41447e-01  3.00059e-03 -5.20999e-02  5.51655e-02 -4.40342e-03
 -6.80108e-02 -1.09424e-01 -1.26838e-02  6.15051e-02  7.48683e-02
 -2.85518e-03 -3.54776e-02  3.02908e-02 -2.07589e-02 -4.89974e-02
  1.00178e-01 -4.92094e-02 -8.51080e-02 -7.68687e-02  4.47452e-02
  2.01065e-02 -3.32071e-02 -3.86746e-02  2.08733e-02 -5.71891e-03
  2.56362e-02  1.39678e-02  1.21209e-02  3.59077e-03  8.15553e-02
  3.26820e-02  2.81385e-02  1.08541e-01 -2.23456e-02 -2.28488e-03
 -6.65323e-02  4.05732e-02 -2.20357e-02  2.70534e-02 -4.59819e-02
 -9.28299e-02 -3.05875e-02 -3.28451e-02 -7.15693e-03 -3.65319e-02
 -9.06376e-02 -1.13887e-02  1.53819e-02 -5.29587e-03  4.18164e-02
 -1.21071e-02 -2.09002e-02 -4.93592e-02  2.86522e-02 -1.29261e-02
  1.16579e-02  2.15741e-02 -5.36216e-02  7.47755e-02  7.19483e-02
 -2.66018e-02  6.56253e-02  3.39228e-02 -3.51786e-02 -1.75284e-02
 -2.16

In [25]:
# Look the row up once and reuse it, so the printed vector always belongs to
# "bi" even if the vocabulary order changes (the row was hard-coded as 55).
bi_id = tgt_word2id["bi"]
print(bi_id)
print(tgt_embeddings[bi_id])

55
[-3.14543e-01  6.38796e-02  9.84498e-02 -4.45751e-01 -4.67128e-01
  1.07583e-01 -1.18810e-01  8.13774e-02  1.85561e-01  1.44222e-03
 -3.37749e-02 -1.53178e-02  3.05862e-02  1.73408e-02  3.25634e-02
 -5.41374e-02 -2.32770e-02  1.15614e-02  3.83423e-02  4.77873e-02
 -2.10600e-02 -2.84459e-02 -3.72212e-02  1.54777e-02 -4.24416e-02
  3.60414e-02  4.66721e-03 -9.36004e-02 -2.85070e-02  4.05079e-03
  3.32663e-02  3.31352e-02 -4.79919e-02  2.09183e-02 -4.08186e-02
  9.07721e-03 -6.23765e-03 -5.58511e-03 -1.64784e-02  1.93405e-02
  2.97173e-02  1.97072e-03  4.71014e-02 -2.88424e-02  2.19457e-02
 -3.99940e-02 -2.08286e-02 -1.52236e-03  6.06685e-03 -2.11577e-02
 -3.40204e-02 -2.29821e-02  3.22431e-03  3.90869e-02 -5.03244e-03
 -4.22195e-02 -3.71943e-02  3.59808e-03  3.26572e-02  1.58905e-02
  2.55556e-02 -1.91363e-02 -5.30314e-02 -4.60428e-03 -1.59061e-02
  1.20528e-02 -2.23527e-02 -3.57533e-02  4.82445e-02  5.83796e-02
  3.50307e-03 -1.79639e-03 -5.87538e-04  3.00156e-02 -1.07642e-02
 -1.207

In [15]:
# Show one source embedding row picked by position rather than by word.
row_id = 75000
print(src_embeddings[row_id])

[-3.35128e-01 -2.64447e-01  1.11744e-01  3.01152e-01 -1.14684e-01
 -2.45247e-03 -6.66556e-02  1.36981e-01  7.65685e-02  8.82429e-02
  4.28904e-03 -4.89936e-02  8.57952e-03 -1.07057e-01 -5.69875e-02
  1.81897e-02  2.95213e-02 -3.75979e-02  1.15817e-01 -9.67859e-02
  4.51852e-02  1.07415e-02 -5.55106e-02 -1.98081e-02  1.81098e-02
  4.95396e-02 -2.33557e-02 -3.88754e-02  2.27424e-02 -3.50351e-02
  6.32546e-02 -6.68446e-02 -8.04029e-02  2.26518e-02  1.91795e-02
 -2.77655e-02 -1.19510e-01  1.10158e-02 -5.18892e-02 -3.14033e-02
  1.90076e-02 -2.98851e-02  1.12204e-02 -6.19478e-02 -4.51808e-02
 -2.94964e-02  5.97682e-02  5.00663e-03 -7.03018e-03  2.42868e-02
  3.82185e-02  1.68461e-02  6.78522e-02 -4.34230e-02 -4.13948e-02
  1.20345e-02 -1.86167e-02 -4.25276e-02 -2.94085e-02 -8.60062e-02
 -9.83228e-03  1.03952e-02 -2.78639e-02 -7.30548e-02  2.27501e-02
 -5.95617e-02 -6.39597e-02  1.17391e-02  4.00236e-02 -1.07287e-02
  3.00627e-02 -3.98762e-02  1.92755e-02 -1.09353e-02 -2.58424e-02
 -1.05186e

### Nearest-neighbor (NN) retrieval

In [23]:
def get_nn(word, src_emb, src_id2word, tgt_emb, tgt_id2word, K=5):
    """Print the K rows of `tgt_emb` most similar to the embedding of `word`.

    Similarity is the cosine between the source vector of `word` and every
    row of `tgt_emb`. A header line is printed first, then one
    ``score - word`` line per neighbor, best score first.

    Parameters
    ----------
    word : str
        Query word; must be a value of `src_id2word`.
    src_emb : numpy.ndarray
        Source embedding matrix, indexed by the ids of `src_id2word`.
    src_id2word : dict
        Row index -> word for `src_emb`.
    tgt_emb : numpy.ndarray
        2-D matrix of candidate embeddings, one row per candidate.
    tgt_id2word : dict
        Row index -> word for `tgt_emb`.
    K : int, optional
        Number of neighbors to print. Values <= 0 print only the header.

    Raises
    ------
    KeyError
        If `word` is not among the values of `src_id2word`.
    """
    print("Nearest neighbors of {0}:".format(word))
    # Invert the id -> word mapping to find the row of the query word.
    word2id = {v: k for k, v in src_id2word.items()}
    word_emb = src_emb[word2id[word]]
    # Cosine similarity is the dot product of the L2-normalised vectors.
    tgt_unit = tgt_emb / np.linalg.norm(tgt_emb, 2, 1)[:, None]
    scores = tgt_unit.dot(word_emb / np.linalg.norm(word_emb))
    # Reverse first, then truncate: the former `[-K:]` slice selected every
    # row when K == 0, because `-0` is `0`.
    k_best = scores.argsort()[::-1][:max(K, 0)]
    for idx in k_best:
        print('%.4f - %s' % (scores[idx], tgt_id2word[idx]))

In [25]:
# Nearest neighbors in the source space: the source embeddings are also
# passed as the candidates.
src_word = 'take'
get_nn(word=src_word, src_emb=src_embeddings, src_id2word=src_id2word,
       tgt_emb=src_embeddings, tgt_id2word=src_id2word, K=5)

Nearest neighbors of take:
1.0000 - take
0.8654 - give
0.8459 - leave
0.8454 - bring
0.8358 - make


In [26]:
# Ids of three English forms of "take"; the tuple is the cell's displayed value.
tuple(src_word2id[form] for form in ('take', 'took', 'taken'))

(226, 424, 741)

In [27]:
# Ids of three Basque forms of "hartu"; the tuple is the cell's displayed value.
tuple(tgt_word2id[form] for form in ('hartu', 'hartzen', 'hartuko'))

(179, 295, 1944)

In [28]:
# Nearest neighbors in the target space: English query, Basque candidates.
src_word = 'take'
get_nn(word=src_word, src_emb=src_embeddings, src_id2word=src_id2word,
       tgt_emb=tgt_embeddings, tgt_id2word=tgt_id2word, K=3)

Nearest neighbors of take:
0.8060 - hartzeko
0.7433 - emateko
0.7400 - lagatzeko


In [29]:
# Nearest neighbors in the target space: English query, Basque candidates.
src_word = 'took'
get_nn(word=src_word, src_emb=src_embeddings, src_id2word=src_id2word,
       tgt_emb=tgt_embeddings, tgt_id2word=tgt_id2word, K=3)

Nearest neighbors of took:
0.8007 - hartu
0.7930 - egin
0.7390 - utzi


In [30]:
# Nearest neighbors in the target space: English query, Basque candidates.
src_word = 'taken'
get_nn(word=src_word, src_emb=src_embeddings, src_id2word=src_id2word,
       tgt_emb=tgt_embeddings, tgt_id2word=tgt_id2word, K=3)

Nearest neighbors of taken:
0.7846 - hartu
0.7608 - egin
0.7131 - jaso
