In [1]:
# Built on the retrofitting code at https://github.com/mfaruqui/retrofitting

In [9]:
import math

import scipy
import scipy.spatial.distance

In [10]:
from copy import deepcopy

In [11]:
from sentence_transformers import SentenceTransformer

In [None]:
# Sentence-embedding model; its `encode` method is used below to embed the example sentences.
embedder = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')

In [37]:
def retrofit(paper_embs, lexicon, numIters):
    """Retrofit paper vectors to a citation network.

    Every paper that has both an embedding and a ``lexicon`` entry is
    repeatedly replaced by a weighted average of its *original* embedding
    (weight = number of usable neighbours) and the *current* embeddings of
    those neighbours (weight 1 each). Papers without a lexicon entry, or whose
    neighbours all lack an embedding, keep their original vector.

    Args:
        paper_embs: Mapping of paper id -> embedding vector. Vectors must
            support scalar multiplication, addition and division (e.g. NumPy
            arrays). The mapping and its vectors are not modified.
        lexicon: Mapping of paper id -> iterable of neighbouring paper ids.
            Ids without an embedding are ignored.
        numIters: Number of update passes over the papers.

    Returns:
        A new dict with the same keys as ``paper_embs`` holding the
        retrofitted vectors.
    """
    new_paper_embs = deepcopy(paper_embs)

    # Visit papers in the iteration order of the embeddings mapping rather
    # than in set order: updates are applied in place during a pass, so the
    # visiting order affects the result and set order of string ids changes
    # with hash randomisation between interpreter runs.
    loop_vocab = [paper for paper in new_paper_embs if paper in lexicon]

    # The usable neighbours of a paper never change between passes, so resolve
    # them once. dict.fromkeys de-duplicates while keeping the listed order.
    neighbours_of = {
        paper: [nb for nb in dict.fromkeys(lexicon[paper]) if nb in new_paper_embs]
        for paper in loop_vocab
    }

    for _ in range(numIters):
        for paper in loop_vocab:
            neighbours = neighbours_of[paper]
            # no neighbours, pass - use data estimate
            if not neighbours:
                continue
            num_neighbours = len(neighbours)
            # the weight of the data estimate is the number of neighbours
            new_emb = num_neighbours * paper_embs[paper]
            # add each neighbour with weight 1; the addition is deliberately
            # out of place so an integer-typed original vector can be combined
            # with neighbours that have already become floating point
            for nb in neighbours:
                new_emb = new_emb + new_paper_embs[nb]
            new_paper_embs[paper] = new_emb / (2 * num_neighbours)
    return new_paper_embs


In [39]:
# Embed the four example sentences; the result is indexed by sentence position below.
corpus_embeddings = embedder.encode(
    [
        "Hello my name is Mark",
        "I come from a land far away.",
        "Hi, here we go!",
        "My cat is called Dimitri",
    ]
)

Batches: 100%|██████████| 1/1 [00:00<00:00,  8.98it/s]


In [40]:
# Label each sentence embedding (in encoding order) and scale it by its L2 norm;
# the 1e-6 under the square root keeps the divisor non-zero.
sentence_embs = {
    label: vec / math.sqrt((vec**2).sum() + 1e-6)
    for label, vec in zip(["name", "land", "hi", "cat"], corpus_embeddings)
}

In [41]:
# Toy neighbour graph: only "name" has neighbours listed.
lexicon_papers = {"name": ["hi", "land"]}

In [46]:
# Retrofit with 10 passes and with 1 pass, keeping only the "name" vector of each run.
# With the lexicon above only "name" is ever updated, so its neighbours stay fixed
# and every pass recomputes the same value.
retro_name, retro_name_less = (
    retrofit(sentence_embs, lexicon_papers, n_passes)["name"] for n_passes in (10, 1)
)

In [47]:
# Element-wise comparison of the 10-pass and 1-pass retrofitted "name" vectors.
retro_name == retro_name_less

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [34]:
# Cosine distance between the 1-pass retrofitted "name" vector and its original
# embedding; cdist takes 2-D inputs, hence the wrapping lists and the [0] row pick.
distances = scipy.spatial.distance.cdist([retro_name_less], [sentence_embs["name"]], "cosine")[0]

In [35]:
# Convert cosine distance to cosine similarity.
1 - distances

array([0.89655808])

In [22]:
# Re-normalise the stored "name" embedding in place (this mutates the array held in sentence_embs).
sentence_embs["name"] /= math.sqrt((sentence_embs["name"]**2).sum() + 1e-6)

In [23]:
# Sanity check: norm of the normalised "name" embedding (with the same 1e-6 term added).
math.sqrt((sentence_embs["name"]**2).sum() + 1e-6)

0.9999999635581964

In [24]:
# Component 500 of the raw first embedding divided by the norm of sentence_embs["name"].
# NOTE(review): the divisor is the norm of the already-normalised vector, not of
# corpus_embeddings[0] — confirm this is the intended check.
corpus_embeddings[0][500] / math.sqrt((sentence_embs["name"]**2).sum() + 1e-6)

0.5246709776389387