# Word Embeddings Alignment

This file is used to align different word embeddings so that they are comparable.

In [30]:
# import the base word embedding model 2022/pubmed_mesh_test
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from numpy import array
from scipy.linalg import orthogonal_procrustes
import numpy as np
from scipy.spatial import procrustes
from sklearn.preprocessing import normalize

In [8]:
# Load the base fasttext model
base_2022 = KeyedVectors.load_word2vec_format('2022/pubmed_mesh_test.bin', binary=True)

# Load the other fasttext model
other_2023 = KeyedVectors.load_word2vec_format('2023/pubmed_mesh_test.bin', binary=True)


In [11]:
base_2022.most_similar('palliative', topn=10)

[('palliative-care', 0.7936036586761475),
 ('causative', 0.7582331299781799),
 ('pain-postoperative', 0.7546006441116333),
 ('sativa', 0.7240142226219177),
 ('neck-pain', 0.7199922800064087),
 ('palliative-medicine', 0.7172513604164124),
 ('dissipative', 0.7167489528656006),
 ('curative', 0.7110295295715332),
 ('putative', 0.7090983390808105),
 ('cumulative', 0.7028390169143677)]

In [12]:
other_2023.most_similar('palliative', topn=10)

[('palliation', 0.8846188187599182),
 ('palliative-care', 0.7507857084274292),
 ('pain', 0.6960252523422241),
 ('neck-pain', 0.6956680417060852),
 ('painless', 0.6946257948875427),
 ('pallipes', 0.6869878172874451),
 ('painful', 0.6772063970565796),
 ('palliative-medicine', 0.675298273563385),
 ('glossalgia', 0.6746705174446106),
 ('pain-postoperative', 0.660344123840332)]

# Align other embedding to base embeddings via Procrustes.
# Returns best distance-preserving aligned version of other_embed

In [15]:
from gensim.models import KeyedVectors
from sklearn.preprocessing import normalize
from scipy.linalg import orthogonal_procrustes

# Load the base fasttext model
base_2022 = KeyedVectors.load_word2vec_format('2022/pubmed_mesh_test.bin', binary=True)

# Load the other fasttext model
other_2023 = KeyedVectors.load_word2vec_format('2023/pubmed_mesh_test.bin', binary=True)



In [31]:

def intersection_align(base_embed, other_embed, post_normalize=True):
    """ 
        Get the intersection of two embeddings.
        Returns embeddings with common vocabulary and indices.
    """
    # Get the common vocabulary
    common_vocab = list(set(base_embed.index_to_key).intersection(other_embed.index_to_key))
    # Filter the embeddings
    base_embed = {word: base_embed[word] for word in common_vocab}
    other_embed = {word: other_embed[word] for word in common_vocab}
    # Normalize the embeddings
    if post_normalize:
        base_embed = {word: normalize(base_embed[word].reshape(1, -1))[0] for word in common_vocab}
        other_embed = {word: normalize(other_embed[word].reshape(1, -1))[0] for word in common_vocab}
    return base_embed, other_embed


In [47]:
def smart_procrustes_align(base_embed, other_embed, post_normalize=True):
    """ 
        Procrustes align two embeddings.
        Returns the aligned embeddings.
    """
    # Get the intersection of the embeddings
    base_embed, other_embed = intersection_align(base_embed, other_embed, post_normalize = True)
    
    # Get the embeddings as arrays
    base_vecs = np.array([base_embed[word] for word in base_embed])
    other_vecs = np.array([other_embed[word] for word in other_embed])

    # Use the orthogonal procrustes to get the aligned embeddings
    R, _ = orthogonal_procrustes(base_vecs, other_vecs)
    other_vecs = other_vecs @ R # apply the rotation

    # Update the other embeddings
    other_embed = {word: other_vecs[i] for i, word in enumerate(other_embed)}

    # convert the embeddings back to word2vec format
    other_embed = {word: other_embed[word] for word in other_embed}

    return base_embed, other_embed

In [48]:
# Example usage:
base_model, aligned_model = smart_procrustes_align(base_2022, other_2023)

In [49]:
base_model['cancer']

array([ 0.00553191,  0.00658249,  0.04680343,  0.06891567,  0.06009813,
        0.05494593, -0.02380475, -0.00923096, -0.12136691,  0.00614625,
        0.00608535, -0.19401045, -0.00097779,  0.02796242,  0.02396979,
       -0.05970353,  0.0457505 ,  0.02086798, -0.01560881, -0.02480836,
        0.00543756,  0.09524553,  0.11865124,  0.05304628,  0.14279479,
       -0.0574504 , -0.01200654, -0.10503849, -0.01333171,  0.05044436,
       -0.04175837, -0.0705239 , -0.05988171,  0.11426616, -0.07370894,
       -0.06616486, -0.03610885, -0.05010739, -0.02033391,  0.0265418 ,
       -0.0751792 , -0.05227342, -0.14836527,  0.09652365, -0.03234688,
        0.02281607,  0.11713756,  0.06727903, -0.0247967 , -0.02398182,
       -0.08572464,  0.01388761,  0.0417374 ,  0.0696085 , -0.1067835 ,
        0.04137561, -0.09243348, -0.00956628,  0.19666839, -0.04927925,
        0.04281805,  0.01232423,  0.07845285, -0.04139845,  0.07524712,
        0.04571496,  0.02030768, -0.05848851, -0.11516216,  0.00

In [50]:
aligned_model['cancer']

array([-1.70520812e-01,  8.41093529e-03, -8.75078421e-03, -3.57316360e-02,
        2.38331594e-03,  1.32049292e-01,  5.25648259e-02,  1.23429373e-01,
       -1.41079426e-01, -2.61299778e-03,  2.62961630e-02, -1.36666730e-01,
       -1.32432535e-01,  8.50166567e-03,  9.44644213e-02,  4.98592705e-02,
       -4.28421125e-02,  6.25300780e-02,  1.33713698e-02, -2.17253193e-02,
       -1.85960941e-02,  6.39020354e-02,  8.23876560e-02,  8.77231639e-03,
        1.54439300e-01, -3.04225273e-02, -6.47986531e-02, -1.58147842e-01,
        1.51591636e-02,  4.06501517e-02, -8.22627544e-02, -5.49315810e-02,
       -6.43004999e-02,  9.79887918e-02,  5.55504411e-02,  2.72452533e-02,
        1.32259831e-01,  2.79256925e-02,  3.41631882e-02,  3.37711885e-03,
       -5.24783432e-02,  4.10486124e-02,  2.87888069e-02,  6.99759424e-02,
        6.46798089e-02, -6.70677572e-02,  4.33315635e-02,  2.29329132e-02,
       -1.49418831e-01, -2.07917746e-02, -2.15293378e-01, -7.65179768e-02,
        5.37038669e-02,  

In [51]:
# calculate the cosine similarity between the two models
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity([base_model['cancer']], [aligned_model['cancer']])



array([[0.43116736]], dtype=float32)

In [55]:
print(cosine_similarity([base_model['cancer']], [base_model['palliative']]))
print(cosine_similarity([aligned_model['cancer']], [aligned_model['palliative']]))

[[0.31454787]]
[[0.4157464]]


In [56]:
print(cosine_similarity([base_model['support']], [base_model['palliative']]))
print(cosine_similarity([aligned_model['support']], [aligned_model['palliative']]))

[[0.2616309]]
[[0.344328]]


In [57]:
# convert the aligned_model to a Gensim KeyedVectors object, so we can use the most_similar method

def convert_to_keyed_vectors(embeddings):
    keyed_vectors = KeyedVectors(vector_size=len(next(iter(embeddings.values()))))
    for word, vector in embeddings.items():
        keyed_vectors.add_vector(word, vector)
    return keyed_vectors


In [58]:
aligned_model_embed = convert_to_keyed_vectors(aligned_model)
aligned_model_embed.most_similar('palliative', topn=10)



[('palliation', 0.8846188187599182),
 ('palliative-care', 0.750785768032074),
 ('pain', 0.6960253119468689),
 ('neck-pain', 0.69566810131073),
 ('painless', 0.6946258544921875),
 ('painful', 0.6772064566612244),
 ('palliative-medicine', 0.675298273563385),
 ('glossalgia', 0.6746706366539001),
 ('pain-postoperative', 0.660344123840332),
 ('flank-pain', 0.658454954624176)]

In [59]:
other_2023.most_similar('palliative', topn=10)

[('palliation', 0.8846188187599182),
 ('palliative-care', 0.7507857084274292),
 ('pain', 0.6960252523422241),
 ('neck-pain', 0.6956680417060852),
 ('painless', 0.6946257948875427),
 ('pallipes', 0.6869878172874451),
 ('painful', 0.6772063970565796),
 ('palliative-medicine', 0.675298273563385),
 ('glossalgia', 0.6746705174446106),
 ('pain-postoperative', 0.660344123840332)]

Some most similar words changed after alignment, but the most similar words are still similar.

In [60]:
aligned_base_embed = convert_to_keyed_vectors(base_model)
aligned_base_embed.most_similar('palliative', topn=10)



[('palliative-care', 0.7936036586761475),
 ('causative', 0.7582331895828247),
 ('pain-postoperative', 0.7546005845069885),
 ('sativa', 0.7240143418312073),
 ('neck-pain', 0.7199923396110535),
 ('palliative-medicine', 0.7172513604164124),
 ('dissipative', 0.7167490720748901),
 ('curative', 0.7110295295715332),
 ('putative', 0.7090983390808105),
 ('cumulative', 0.7028390765190125)]

In [61]:
base_2022.most_similar('palliative', topn=10)

[('palliative-care', 0.7936036586761475),
 ('causative', 0.7582331299781799),
 ('pain-postoperative', 0.7546006441116333),
 ('sativa', 0.7240142226219177),
 ('neck-pain', 0.7199922800064087),
 ('palliative-medicine', 0.7172513604164124),
 ('dissipative', 0.7167489528656006),
 ('curative', 0.7110295295715332),
 ('putative', 0.7090983390808105),
 ('cumulative', 0.7028390169143677)]

The base model's most similar words do not change after alignment.