In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import torch
import transformers
import sentence_transformers

In [2]:
# Record the installed torch, transformers and sentence-transformers versions.
print(torch.__version__, transformers.__version__, sentence_transformers.__version__)

2.3.1+cu121 4.42.4 3.0.1


In [35]:
def get_token_counts(sentences: list[str], tokenizer: "transformers.PreTrainedTokenizerBase") -> list[int]:
    """
    Return the number of tokens in each sentence, special tokens included.

    All sentences are encoded in a single batched call to the tokenizer's
    ``__call__`` (the supported replacement for the deprecated
    ``encode_plus``). No padding or truncation is requested, so each count is
    the full tokenized length of that sentence.

    Args:
        sentences: Input sentences (strings).
        tokenizer: A Hugging Face tokenizer, e.g. the object returned by
            ``AutoTokenizer.from_pretrained``.

    Returns:
        token_counts: A list of integers, one per sentence, in input order.
    """
    sentences = list(sentences)
    if not sentences:
        # Nothing to encode; avoid handing the tokenizer an empty batch.
        return []
    encoded = tokenizer(sentences, add_special_tokens=True)
    return [len(ids) for ids in encoded["input_ids"]]

def get_word_counts(sentences: list[str]):
    """
    Count the whitespace-separated words in each sentence.

    Args:
        sentences: A list of input sentences (strings).

    Returns:
        A list of integers, one word count per sentence, in input order.
    """
    counts = []
    for text in sentences:
        # Trim the ends, then split on runs of whitespace.
        words = text.strip().split()
        counts.append(len(words))
    return counts

In [34]:
# Requires transformers>=4.36.0

import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

input_texts = [
    "what is the capital of China?",
    "how to implement quick sort in python?",
    "Beijing",
    "sorting algorithms"
]

model_path = 'Alibaba-NLP/gte-base-en-v1.5'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to('cuda')

# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=8192, padding=True, truncation=True, return_tensors='pt').to('cuda')

# Inference only: run the forward pass without autograd so no graph is
# recorded, the same way get_embeddings() does.
with torch.no_grad():
    outputs = model(**batch_dict)
# Use the hidden state at sequence position 0 of each input as its embedding.
embeddings = outputs.last_hidden_state[:, 0]

# (Optionally) normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
# Dot product of the first text's embedding with every other text's, scaled by 100.
scores = (embeddings[:1] @ embeddings[1:].T) * 100
print(scores.tolist())

def get_embeddings(input_texts):
    """
    Embed texts with the notebook-level ``model`` and ``tokenizer``.

    Side effect: moves ``model`` to the ``'cuda'`` device on every call.

    Args:
        input_texts: Texts to embed; passed straight to ``tokenizer`` with
            padding and truncation at ``max_length=8192``.

    Returns:
        A tensor with one L2-normalized row per input text, taken from
        position 0 of ``last_hidden_state``. It lives on the CUDA device and
        carries no autograd history.
    """
    model.to('cuda')
    batch_dict = tokenizer(input_texts, max_length=8192, padding=True, truncation=True, return_tensors='pt').to('cuda')
    with torch.no_grad():
        outputs = model(**batch_dict)
    embeddings = outputs.last_hidden_state[:, 0]
    # F.normalize is out-of-place: its return value must be kept, otherwise
    # the caller receives un-normalized embeddings.
    return F.normalize(embeddings, p=2, dim=1)


[[34.50494384765625, 64.03974151611328, 19.520017623901367]]


In [4]:
# Requires sentence_transformers>=2.7.0

from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

# Load the 'Alibaba-NLP/gte-base-en-v1.5' checkpoint through sentence-transformers.
model_sentence = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)
# Encode the texts currently bound to `input_texts`.
embeddings_2 = model_sentence.encode(input_texts)
# Cosine similarity between the first and the second text.
print(cos_sim(embeddings_2[0], embeddings_2[1]))

tensor([[0.3450]])


  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [8]:
# Pairwise cosine-similarity matrix of the sentence-transformers embeddings.
cosine_similarity(embeddings_2)

array([[1.0000001 , 0.34504956, 0.6403974 , 0.19520023],
       [0.34504956, 1.0000001 , 0.21707077, 0.5514271 ],
       [0.6403974 , 0.21707077, 1.0000002 , 0.12539631],
       [0.19520023, 0.5514271 , 0.12539631, 1.0000006 ]], dtype=float32)

In [9]:
# Pairwise cosine-similarity matrix of the embeddings computed with plain transformers.
# `embeddings` was created on 'cuda'; it must be copied to host memory with
# .cpu() before .numpy() can convert it.
cosine_similarity(embeddings.detach().cpu().numpy())

array([[0.99999994, 0.34504956, 0.64039755, 0.19520023],
       [0.34504956, 1.0000002 , 0.2170707 , 0.55142707],
       [0.64039755, 0.2170707 , 1.0000001 , 0.12539637],
       [0.19520023, 0.55142707, 0.12539637, 1.0000002 ]], dtype=float32)

In [36]:
# Long passage used to build an input far longer than the short example sentences.
# NOTE(review): the leading "fAs" and the year "20022003" look like paste/encoding
# slips (stray "f", lost dash) — confirm; left untouched because the counts
# printed below depend on the exact text.
long_text = "fAs a mathematician and physicist, he made many original fundamental contributions to pure and applied mathematics, mathematical physics, and celestial mechanics.[6] In his research on the three-body problem, Poincaré became the first person to discover a chaotic deterministic system which laid the foundations of modern chaos theory. He is also considered to be one of the founders of the field of topology. Early in the 20th century he formulated the Poincaré conjecture, which became, over time, one of the famous unsolved problems in mathematics. It was solved in 20022003 by Grigori Perelman."
# Two short sentences plus the passage repeated 40 times (no separator between copies).
sentences = ["This is a sample sentence.", "Another, slightly longer sentence with multiple tokens.", long_text*40]
# Token counts per sentence, using the notebook-level `tokenizer`.
token_counts = get_token_counts(sentences, tokenizer)
print(token_counts)

# Whitespace-separated word counts for the same sentences, for comparison.
print(get_word_counts(sentences))

[8, 12, 4922]
[5, 7, 3641]


In [37]:
# Tokens per whitespace-separated word for the long text (last entry of `sentences`),
# computed from the live counts instead of numbers copied from the previous output.
token_counts[-1] / get_word_counts(sentences)[-1]

1.3518264213128262

In [None]:
# Rebind `input_texts`: the four short texts plus the long passage repeated 40 times.
input_texts = [
    "what is the capital of China?",
    "how to implement quick sort in python?",
    "Beijing",
    "sorting algorithms",
    long_text * 40
]

In [32]:
# Pairwise cosine similarities of the sentence-transformers embeddings of `input_texts`.
cosine_similarity(model_sentence.encode(input_texts))

array([[1.0000002 , 0.3450492 , 0.6403974 , 0.1952    , 0.28782588],
       [0.3450492 , 1.0000001 , 0.21707052, 0.55142707, 0.26809558],
       [0.6403974 , 0.21707052, 1.        , 0.12539633, 0.21799149],
       [0.1952    , 0.55142707, 0.12539633, 1.0000005 , 0.20562805],
       [0.28782588, 0.26809558, 0.21799149, 0.20562805, 0.9999999 ]],
      dtype=float32)

In [33]:
# Same comparison using get_embeddings(); the result is copied to the CPU
# before being converted to a NumPy array.
e = get_embeddings(input_texts)
cosine_similarity(e.detach().cpu().numpy())

array([[1.0000002 , 0.3450492 , 0.6403974 , 0.1952    , 0.28782588],
       [0.3450492 , 1.0000001 , 0.21707052, 0.55142707, 0.26809558],
       [0.6403974 , 0.21707052, 1.        , 0.12539633, 0.21799149],
       [0.1952    , 0.55142707, 0.12539633, 1.0000005 , 0.20562805],
       [0.28782588, 0.26809558, 0.21799149, 0.20562805, 0.9999999 ]],
      dtype=float32)

In [24]:
# Show which device the model is currently on.
model.device

device(type='cpu')