In [1]:
import numpy as np
import torch
import faiss
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer

In [2]:
# Check if MPS is available and set the device accordingly
device = torch.device("cpu")

In [3]:
# Load the custom tokenizer
tokenizer_dir = "archive/custom_tokenizer"  # Path to the saved tokenizer directory
custom_tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)

In [4]:
# Load the pre-trained Sentence-BERT model
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Replace the default tokenizer with the custom tokenizer
model.tokenizer = custom_tokenizer

In [5]:
# Query and document examples
query = "Donald Trump recently visited the UK."
documents = [
    "The President of the United States visited the UK.",
    "The UK was visited by the President of the United States.",
    "ONU congress decides to ban nuclear weapons.",
    "The sky is blue.",
    "The sun is shining.",
    "Recipe for a delicious cake."
]

# Generate embeddings for documents and query
doc_embeddings = model.encode(documents)
query_embedding = model.encode([query])

# Normalize embeddings
doc_embeddings = doc_embeddings / np.linalg.norm(doc_embeddings, axis=1, keepdims=True)
query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)

# Create FAISS index
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(np.array(doc_embeddings))

# Search for the most similar documents
k = len(documents)
distances, indices = index.search(np.array(query_embedding), k)

# Display results
for i, idx in enumerate(indices[0]):
    fragment = documents[idx]
    score = distances[0][i]
    print(f"Fragment: {fragment}\nGrad de similitudine: {score:.2f}")


Fragment: The sky is blue.
Grad de similitudine: 0.98
Fragment: The sun is shining.
Grad de similitudine: 0.58
Fragment: ONU congress decides to ban nuclear weapons.
Grad de similitudine: 0.55
Fragment: Recipe for a delicious cake.
Grad de similitudine: 0.47
Fragment: The President of the United States visited the UK.
Grad de similitudine: 0.43
Fragment: The UK was visited by the President of the United States.
Grad de similitudine: 0.39
