In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
# Input texts to embed. They differ in length, so the shorter one is padded
# when the two are tokenized together as a batch.
sentences = ["The cat sits outside", "some other text that will use padding"]

# Single checkpoint identifier on the HuggingFace Hub, shared by the tokenizer
# and the encoder so the two can never drift apart.
MODEL_NAME = 'BAAI/bge-base-en-v1.5'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# `.eval()` returns the module itself, so loading the weights and switching to
# evaluation mode can be chained into one statement.
model = AutoModel.from_pretrained(MODEL_NAME).eval()

# Tokenize the whole batch in one call: pad to the longest sentence, truncate
# anything over the model limit, and return PyTorch tensors.
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
print(encoded_input)

# For s2p (short query to long passage) retrieval, prepend an instruction to
# each query (passages get no instruction), e.g.:
# encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')

# Forward pass only; gradient tracking is switched off because nothing is trained here.
with torch.no_grad():
    model_output = model(**encoded_input)

# CLS pooling: from the first element of the model output (the per-token
# embeddings), keep only position 0 of every sequence.
token_embeddings = model_output[0]
sentence_embeddings = token_embeddings[:, 0]

# NOTE: these are the raw, un-normalized CLS vectors. L2 normalization is NOT
# applied in this cell; it is done separately in a later cell.
print("Sentence embeddings:", sentence_embeddings)
    


{'input_ids': tensor([[  101,  1996,  4937,  7719,  2648,   102,     0,     0,     0,     0],
        [  101,  2070,  2060,  3793,  2008,  2097,  2224, 11687,  4667,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Sentence embeddings: tensor([[-0.5609, -1.5735,  0.4297,  ...,  1.5557,  0.3807,  0.5673],
        [ 0.0375,  0.1586, -0.1318,  ..., -0.0153,  0.0603,  0.1512]])


In [2]:
# Rescale every row (dim=1) to unit L2 length, so a plain dot product between
# two rows equals their cosine similarity. The un-normalized tensor is left
# untouched and the result is bound to a new name.
sentence_embeddings2 = torch.nn.functional.normalize(
    input=sentence_embeddings,
    p=2,
    dim=1,
)
print("Sentence embeddings:", sentence_embeddings2)

Sentence embeddings: tensor([[-0.0362, -0.1016,  0.0278,  ...,  0.1005,  0.0246,  0.0366],
        [ 0.0024,  0.0101, -0.0084,  ..., -0.0010,  0.0038,  0.0096]])
