# A lower-level implementation of sentence embeddings with PyTorch and transformers

In this notebook, we build sentence embeddings by hand: we take the model's last_hidden_state (one vector per token) and apply a mean pooling operation to turn it into a single vector per sentence.

In [None]:
!pip install -U transformers


Requirement already up-to-date: sentence-transformers in /usr/local/lib/python3.7/dist-packages (2.0.0)


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

First we initialize our model and tokenizer:


In [None]:
# Single checkpoint name shared by the tokenizer and the model, so both are
# always loaded from the same pretrained weights.
MODEL_NAME = 'sentence-transformers/paraphrase-MiniLM-L6-v2'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [None]:
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "He found a leprechaun in his walnut shell."
]

# Tokenize the whole batch in a single call. Calling the tokenizer directly
# replaces the per-sentence `encode_plus` loop (`encode_plus` is deprecated in
# favour of `__call__`) and already returns one stacked tensor per field, so
# no manual `torch.stack` is required.
encoded = tokenizer(
    sentences,
    max_length=128,         # upper bound on the sequence length
    truncation=True,        # cut sequences longer than max_length
    padding='max_length',   # pad shorter sequences up to max_length
    return_tensors='pt',    # return PyTorch torch.Tensor objects
)

# Keep only the two fields that are fed to the model: the token ids and the
# mask specifying which tokens should be attended to.
tokens = {
    'input_ids': encoded['input_ids'],
    'attention_mask': encoded['attention_mask'],
}

In [None]:
# Inference only: disable gradient tracking so no autograd graph is built
# for the forward pass.
with torch.no_grad():
    outputs = model(**tokens)

outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

The dense vector representations of our text are contained within the outputs 'last_hidden_state' tensor, which we access like so:



In [None]:
# Token-level dense vectors produced by the model, looked up by output key.
embeddings = outputs['last_hidden_state']
embeddings


tensor([[[ 9.0630e-02,  4.5589e-01, -1.5872e-01,  ..., -3.1558e-01,
           5.8567e-02, -1.7566e-01],
         [-2.1225e-01, -5.7729e-01, -2.0000e-01,  ..., -1.4099e-01,
           1.5964e-01,  7.6760e-01],
         [-2.1763e-01,  1.1818e+00, -2.7912e-01,  ..., -4.7069e-01,
           1.8733e-01,  1.8711e-01],
         ...,
         [-1.2064e-01,  6.1632e-02,  1.4978e-01,  ...,  2.3516e-01,
          -2.2291e-02, -1.6768e-01],
         [-1.4511e-01,  9.7930e-02,  1.5787e-01,  ...,  2.1651e-01,
           6.4461e-02, -1.2910e-01],
         [-1.2381e-01,  1.5829e-01,  1.6246e-01,  ...,  2.2581e-01,
           1.3747e-01, -8.8168e-02]],

        [[ 1.3628e-01,  4.8279e-01,  2.6579e-01,  ...,  5.7575e-01,
          -1.1533e-01, -9.7986e-02],
         [-5.6376e-02,  1.2054e+00,  3.1291e-01,  ...,  1.1160e+00,
           6.6876e-01,  1.0406e+00],
         [ 5.0828e-01,  5.0805e-01,  1.8537e-01,  ...,  8.1560e-01,
           1.2319e+00,  3.0367e-02],
         ...,
         [-6.3757e-02,  1

After we have produced our dense token embeddings, we need to perform a mean pooling operation to create a single vector encoding (the sentence embedding).
To do this mean pooling operation, we will need to multiply each value in our embeddings tensor by its respective attention_mask value — so that padding tokens (attention_mask value 0) are ignored.

In [None]:
# Pull the attention mask out of the tokenized batch and check its size.
attention_mask = tokens['attention_mask']
attention_mask.size()

torch.Size([4, 128])

To perform this operation, we first resize our attention_mask tensor:


In [None]:

# Append a trailing axis to the attention mask, expand it to the size of the
# embeddings tensor, and cast it to float.
mask = attention_mask[..., None].expand_as(embeddings).to(torch.float32)
mask.size()

torch.Size([4, 128, 384])

In [None]:
# Display the expanded mask tensor as the cell output.
mask

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 

Each vector above represents a single token's attention mask - each token now has a vector of size 384 representing its attention_mask status. Then we multiply the two tensors to apply the attention mask:

In [None]:
# Element-wise product of the embeddings and the mask: positions whose mask
# value is 0 are zeroed out.
masked_embeddings = torch.mul(embeddings, mask)
masked_embeddings.size()


torch.Size([4, 128, 384])

In [None]:
# Display the masked embeddings tensor as the cell output.
masked_embeddings


tensor([[[ 0.0906,  0.4559, -0.1587,  ..., -0.3156,  0.0586, -0.1757],
         [-0.2122, -0.5773, -0.2000,  ..., -0.1410,  0.1596,  0.7676],
         [-0.2176,  1.1818, -0.2791,  ..., -0.4707,  0.1873,  0.1871],
         ...,
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000, -0.0000],
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0000],
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0000]],

        [[ 0.1363,  0.4828,  0.2658,  ...,  0.5758, -0.1153, -0.0980],
         [-0.0564,  1.2054,  0.3129,  ...,  1.1160,  0.6688,  1.0406],
         [ 0.5083,  0.5080,  0.1854,  ...,  0.8156,  1.2319,  0.0304],
         ...,
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000, -0.0000],
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000, -0.0000],
         [-0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000, -0.0000]],

        [[ 0.1853,  0.1340, -0.3905,  ..., -0.4609, -0.0413, -0.1292],
         [ 0.2210,  0.4039, -0.7355,  ..., -0

Then we sum the masked embeddings along axis 1:



In [None]:
# Add up the masked embeddings over axis 1.
summed = masked_embeddings.sum(dim=1)
summed.size()


torch.Size([4, 384])

Then we count, for each position of the summed tensor, the number of tokens that must be given attention (clamping to a tiny positive minimum so that we never divide by zero):



In [None]:
# Sum the mask over axis 1, then clamp to a tiny positive floor so the
# result is never zero.
summed_mask = mask.sum(dim=1).clamp(min=1e-9)
summed_mask.size()

torch.Size([4, 384])

Finally, we calculate the mean as the sum of the embedding activations (summed) divided by the number of values that should be given attention in each position (summed_mask):



In [None]:
# Mean pooling: element-wise division of the summed embeddings by the summed mask.
mean_pooled = torch.div(summed, summed_mask)


Once we have our dense vectors, we can calculate the cosine similarity between each — which is the same logic we used before:


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Convert from PyTorch tensor to numpy array under a new name. `mean_pooled`
# itself stays a tensor, so this cell can be re-run without failing on
# `.detach()` (which a numpy array does not have).
mean_pooled_np = mean_pooled.detach().numpy()

# Cosine similarity between the first sentence and each of the others
cosine_similarity(
    [mean_pooled_np[0]],
    mean_pooled_np[1:]
)


array([[0.13260713, 0.31947678, 0.17322943]], dtype=float32)

Now we compare against the results that the sentence-transformers model produces directly:


In [None]:
!pip install -U sentence-transformers

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [None]:
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "He found a leprechaun in his walnut shell."
]

# Encode every sentence with the model, then compare the first sentence
# against each of the remaining ones.
sentence_embeddings = model.encode(sentences)

reference_embedding = sentence_embeddings[0]
other_embeddings = sentence_embeddings[1:]
cosine_similarity([reference_embedding], other_embeddings)

array([[0.13260716, 0.31947678, 0.17322943]], dtype=float32)


Continue from here:
https://towardsdatascience.com/bert-for-measuring-text-similarity-eec91c6bf9e1