In [26]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Load the original and fine-tuned SBERT models
original_model = SentenceTransformer('stsb-bert-large')
fine_tuned_model = SentenceTransformer('fine_tuned_sbert_model')


In [2]:
# Example sentences for similarity test
query = "Donald Trump recently visited the United Kingdom."
documents = [
    "The President of the United States has just made a trip to the UK.", # Rephrased version of the query
    "The UK was visited by the President of the United States.", # Rephrased version of the query
    "Trump has made some allegations about the UK.", # Slightly modified version
    "The new prime minister of the UK has been elected.", # Predominantly unrelated
    "The ONU congress discusses about US foreign policy.", # Predominantly unrelated
    "The sun is shining.", # Completely unrelated
    "Recipe for a delicious cake." # Completely unrelated
]

# Generate embeddings for the documents and query
original_doc_embeddings = original_model.encode(documents)
fine_tuned_doc_embeddings = fine_tuned_model.encode(documents)
query_embedding = original_model.encode([query])  # Using original model for the query

# Normalize embeddings (important for cosine similarity with FAISS)
original_doc_embeddings = original_doc_embeddings / np.linalg.norm(original_doc_embeddings, axis=1, keepdims=True)
fine_tuned_doc_embeddings = fine_tuned_doc_embeddings / np.linalg.norm(fine_tuned_doc_embeddings, axis=1, keepdims=True)
query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)

# Create a FAISS index for original and fine-tuned embeddings
dimension = original_doc_embeddings.shape[1]
index_original = faiss.IndexFlatIP(dimension)  # Inner product (cosine similarity)
index_fine_tuned = faiss.IndexFlatIP(dimension)

# Add embeddings to the FAISS index
index_original.add(np.array(original_doc_embeddings))
index_fine_tuned.add(np.array(fine_tuned_doc_embeddings))

# Search the index for the most similar document
k = len(documents)
distances_original, indices_original = index_original.search(np.array(query_embedding), k)
distances_fine_tuned, indices_fine_tuned = index_fine_tuned.search(np.array(query_embedding), k)

# Print the results
print("Original SBERT Model Similarity Test:")
print(f"Query: {query}")
print(f"Top {k} most similar documents (Original SBERT):")
for i, idx in enumerate(indices_original[0]):
    print(f"{i+1}. {documents[idx]} -> {distances_original[0][i]:.4f}")
print()

print("Fine-tuned SBERT Model Similarity Test:")
print(f"Query: {query}")
print(f"Top {k} most similar documents (Fine-tuned SBERT):")
for i, idx in enumerate(indices_fine_tuned[0]):
    print(f"{i+1}. {documents[idx]} -> {distances_fine_tuned[0][i]:.4f}")

Original SBERT Model Similarity Test:
Query: Donald Trump recently visited the United Kingdom.
Top 7 most similar documents (Original SBERT):
1. Trump has made some allegations about the UK. -> 0.6444
2. The President of the United States has just made a trip to the UK. -> 0.6125
3. The UK was visited by the President of the United States. -> 0.5527
4. The new prime minister of the UK has been elected. -> 0.5304
5. The ONU congress discusses about US foreign policy. -> 0.1229
6. Recipe for a delicious cake. -> 0.0819
7. The sun is shining. -> 0.0553

Fine-tuned SBERT Model Similarity Test:
Query: Donald Trump recently visited the United Kingdom.
Top 7 most similar documents (Fine-tuned SBERT):
1. Trump has made some allegations about the UK. -> 0.5561
2. The President of the United States has just made a trip to the UK. -> 0.5218
3. The new prime minister of the UK has been elected. -> 0.5120
4. The UK was visited by the President of the United States. -> 0.5087
5. The ONU congress dis

In [25]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from IPython.core.display import display, HTML
from termcolor import colored

gt = "TikTok CEO Shou Chew has been invited to and is expected to attend President-elect Donald Trump’s inauguration Monday, according to three sources familiar with the matter, two of whom are Trump transition officials. Chew was invited by the president-elect's team and will sit on the dais in front of the Capitol where Trump will be sworn in as president. Chew is expected to sit alongside the CEOs and leaders of other Big Tech platforms, including Tesla and SpaceX CEO Elon Musk, Amazon founder Jeff Bezos and Meta CEO Mark Zuckerberg, according to one of the officials. Google CEO Sundar Pichai is also planning to attend and sit with the other executives, one Trump transition official said. NBC News has reached out to representatives of TikTok and Google for comment. TikTok declined to comment and Google did not immediately respond. The New York Times was the first outlet to report about Chew’s inauguration attendance. Chew’s attendance comes as the popular TikTok app is preparing to possibly shut down Sunday, the day a U.S. ban is set to go into effect if the Supreme Court doesn’t overrule it. The nine justices on the conservative-majority court heard oral arguments last week about whether to implement the ban that was passed by Congress and signed into law by President Joe Biden last year. Remarks and questions by the justices during the arguments suggested it was likely they would uphold the law that would effectively ban the platform."

# prompt = "TikTok CEO Shou Chew is expected to attend President-elect Donald Trump’s inauguration."

prompt = "CEO of Google Sundar Pichai is planning to attend the inauguration."

# Function to highlight and calculate top-k similar sentences
def highlight_top_k_sentences(model, prompt, ground_truth, k, display_results=True):
    """
    Highlights the top-k most similar sentences from the ground truth or calculates the highest similarity score.

    Args:
        model: SentenceTransformer model.
        prompt: The input prompt (string).
        ground_truth: The ground truth text containing multiple sentences (string).
        k: Number of top sentences to retrieve.
        display_results: Whether to display the highlighted text and scores (boolean).

    Returns:
        max_similarity: The highest similarity score (float).
    """
    # Step 1: Split the ground truth into sentences
    sentences = ground_truth.split('.')
    sentences = [s.strip() for s in sentences if s.strip()]

    # Step 2: Embed the sentences and the prompt
    sentence_embeddings = model.encode(sentences, convert_to_tensor=False, normalize_embeddings=True)
    prompt_embedding = model.encode([prompt], convert_to_tensor=False, normalize_embeddings=True)

    # Step 3: Build the FAISS index
    dimension = sentence_embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)  # Inner product (cosine similarity since embeddings are normalized)
    index.add(sentence_embeddings)

    # Step 4: Perform the search for top-k most similar sentences
    similarity_scores, indices = index.search(prompt_embedding, k)
    similarity_scores = similarity_scores[0]
    indices = indices[0]

    # Step 5: Handle highlighting and display if requested
    max_similarity = max(similarity_scores)
    if display_results:
        highlighted_text = ""
        for i, sentence in enumerate(sentences):
            if i in indices:
                score = similarity_scores[np.where(indices == i)[0][0]]
                # Determine color intensity based on similarity
                intensity = int(score * 255)
                color = f'rgb({255-intensity}, {255-intensity}, 255)'  # Light blue intensity gradient
                highlighted_sentence = f'<span style="background-color: {color}; border-radius: 5px; padding: 2px;">{sentence} <sup>({score:.2f})</sup></span>'
            else:
                highlighted_sentence = sentence
            highlighted_text += highlighted_sentence + '. '

        # Display the table
        table_html = f"""
        <table style="width: 100%; border-collapse: collapse; font-family: Arial;">
            <tr>
                <th style="width: 50%; text-align: left; border-bottom: 1px solid #ddd;">Ground Truth</th>
                <th style="width: 30%; text-align: left; border-bottom: 1px solid #ddd;">Prompt</th>
                <th style="width: 20%; text-align: left; border-bottom: 1px solid #ddd;">Highest Similarity</th>
            </tr>
            <tr>
                <td style="text-align: justify; vertical-align: top; padding: 10px; border-bottom: 1px solid #ddd;">{highlighted_text}</td>
                <td style="text-align: justify; vertical-align: top; padding: 10px; border-bottom: 1px solid #ddd;">{prompt}</td>
                <td style="text-align: justify; vertical-align: top; padding: 10px; border-bottom: 1px solid #ddd;">{max_similarity:.2f}</td>
            </tr>
        </table>
        """
        display(HTML(table_html))

    # Step 6: Return the highest similarity score
    return max_similarity

# Example usage
max_similarity = highlight_top_k_sentences(original_model, prompt, gt, k=3, display_results=True)
print(f"Highest similarity score: {max_similarity:.2f}")

  from IPython.core.display import display, HTML


Ground Truth,Prompt,Highest Similarity
"TikTok CEO Shou Chew has been invited to and is expected to attend President-elect Donald Trump’s inauguration Monday, according to three sources familiar with the matter, two of whom are Trump transition officials (0.54). Chew was invited by the president-elect's team and will sit on the dais in front of the Capitol where Trump will be sworn in as president. Chew is expected to sit alongside the CEOs and leaders of other Big Tech platforms, including Tesla and SpaceX CEO Elon Musk, Amazon founder Jeff Bezos and Meta CEO Mark Zuckerberg, according to one of the officials (0.48). Google CEO Sundar Pichai is also planning to attend and sit with the other executives, one Trump transition official said (0.70). NBC News has reached out to representatives of TikTok and Google for comment. TikTok declined to comment and Google did not immediately respond. The New York Times was the first outlet to report about Chew’s inauguration attendance. Chew’s attendance comes as the popular TikTok app is preparing to possibly shut down Sunday, the day a U. S. ban is set to go into effect if the Supreme Court doesn’t overrule it. The nine justices on the conservative-majority court heard oral arguments last week about whether to implement the ban that was passed by Congress and signed into law by President Joe Biden last year. Remarks and questions by the justices during the arguments suggested it was likely they would uphold the law that would effectively ban the platform.",CEO of Google Sundar Pichai is planning to attend the inauguration.,0.7


Highest similarity score: 0.70
