In [None]:
from sentence_transformers import SentenceTransformer

# Checkpoint and context-length settings live in named constants so they are
# easy to find and tune in one place.
MODEL_NAME = "nvidia/NV-Embed-v2"
# NOTE(review): presumably the longest sequence the checkpoint supports —
# confirm against the model card before changing.
MAX_SEQ_LENGTH = 32768

# trust_remote_code=True lets the loader execute Python code shipped inside the
# model repository; only keep it for sources you trust.
model = SentenceTransformer(MODEL_NAME, trust_remote_code=True)
model.max_seq_length = MAX_SEQ_LENGTH
# NOTE(review): right-side padding appears to be what this checkpoint expects
# (get_embeddings appends an EOS token to every text) — confirm with the model card.
model.tokenizer.padding_side = "right"

In [None]:
def get_embeddings(texts, batch_size=1):
    """
    Embed one or more texts with the module-level ``model``.

    The tokenizer's EOS token is appended to every text before encoding, and
    the model is asked to normalize the resulting vectors
    (``normalize_embeddings=True``).

    Args:
        texts: An iterable of strings. A bare string is treated as a single
            text instead of being iterated character by character.
        batch_size: Number of texts handed to ``model.encode`` per batch.
            Defaults to 1, the value that used to be hard-coded here.

    Returns:
        The encoder output converted with ``.tolist()`` — one embedding per
        input text. An empty input returns ``[]`` without calling the model.
    """
    # Guard against the classic mistake of passing a single string: iterating
    # it would embed every character separately.
    if isinstance(texts, str):
        texts = [texts]

    eos_token = model.tokenizer.eos_token
    texts_with_eos = [text + eos_token for text in texts]

    # Nothing to encode; skip the model call entirely.
    if not texts_with_eos:
        return []

    embeddings = model.encode(
        texts_with_eos,
        normalize_embeddings=True,
        batch_size=batch_size,
    )

    return embeddings.tolist()


In [None]:
import faiss
import numpy as np

def get_similarities(query_embedding, embeddings):
    """
    Rank every corpus embedding against a single query by inner product.

    A fresh ``faiss.IndexFlatIP`` is built from ``embeddings`` on each call and
    searched with ``k`` equal to the corpus size, so every corpus vector
    appears in the result.

    Args:
        query_embedding: One query vector (any array-like); it is flattened
            into a single row.
        embeddings: Corpus vectors as a 2-D array-like, one row per vector.
            A single 1-D vector is accepted and treated as a one-row corpus.

    Returns:
        ``(scores, indices)`` exactly as returned by ``index.search`` — per the
        FAISS documentation, two arrays with one row for the query and one
        column per corpus vector, ordered best match first. For an empty
        corpus, two empty arrays of shape ``(1, 0)`` are returned.
    """
    # FAISS works on C-contiguous float32 data; np.ascontiguousarray guarantees
    # that layout even when the caller passes a transposed/Fortran-ordered array.
    query_embedding = np.ascontiguousarray(query_embedding, dtype=np.float32).reshape(1, -1)
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    # Empty corpus: there is nothing to index or rank.
    if embeddings.size == 0:
        return np.empty((1, 0), dtype=np.float32), np.empty((1, 0), dtype=np.int64)

    # A lone 1-D vector is a corpus of exactly one row.
    if embeddings.ndim == 1:
        embeddings = embeddings.reshape(1, -1)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)  # type: ignore

    # k == corpus size, so the full ranking comes back rather than a top-k cut.
    scores, indices = index.search(query_embedding, len(embeddings))  # type: ignore

    return scores, indices


In [None]:
import numpy as np
def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between two vectors.

    Args:
        vec1: First vector (sequence or array of numbers).
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        The dot product of the two vectors divided by the product of their
        Euclidean norms. When either vector has zero norm the angle is
        undefined; 0.0 is returned instead of dividing by zero.
    """
    dot_product = np.dot(vec1, vec2)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)

    # A zero vector would otherwise produce NaN plus a RuntimeWarning.
    if denominator == 0:
        return 0.0

    return dot_product / denominator

In [None]:
# Two differently worded descriptions of the same request, used as a quick
# sanity check of the embedding pipeline.
text1 = "The user requests a rephrasing of a statement regarding the benefits and drawbacks of being a for-profit entity, focusing on scalability through capital and the risk of values misalignment."
text2 = "The individual seeks an alternative formulation of a proposition concerning the advantages and limitations of operating as a profit-driven organization, highlighting growth potential through investment and the possible misalignment of core principles."

# Embed each sentence with its own call (text1 first, then text2) and keep the
# single vector from each one-element result.
embeddings1, embeddings2 = (
    get_embeddings([sentence])[0] for sentence in (text1, text2)
)

# Compare the two vectors and print the similarity.
score = cosine_similarity(embeddings1, embeddings2)
print(score)