In [None]:
# save data locally as a json file
import json

# Serialise `data_` to data.json in the working directory, pretty-printed
# with a four-space indent.
with open("data.json", mode="w") as out_file:
    json.dump(obj=data_, fp=out_file, indent=4)

In [29]:
# Load data from the JSON file.
# `json` is imported in this cell too, so it runs on a fresh kernel without
# first executing the save cell (which needs `data_` to exist).
import json

# JSON text is UTF-8 by specification; do not rely on the platform default
# encoding when reading it back.
with open("data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [1]:
from sentence_transformers import SentenceTransformer, util
import heapq

# Load the Sentence Transformer model once and bind it to the global `model`;
# get_embedding() below reads this global for every encode call.
# NOTE(review): the "-cos-v1" suffix suggests the checkpoint is intended for
# cosine-similarity scoring — confirm against the model card.
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

# Function to calculate harmonic mean
def harmonic_mean(a, b):
    """Return the harmonic mean of two scores, or 0.0 when it is undefined.

    The harmonic mean ``2ab / (a + b)`` is only meaningful for positive
    inputs. With mixed-sign inputs the raw formula can change sign and grow
    without bound as ``a + b`` approaches zero (e.g. ``(-0.5, 0.4)`` gives
    ``+4.0``), which would let a partly negative match outrank genuine ones.
    Any non-positive input is therefore scored as ``0.0``.

    Args:
        a: First score (e.g. a cosine similarity).
        b: Second score.

    Returns:
        ``2 * a * b / (a + b)`` when both inputs are strictly positive,
        otherwise ``0.0``.
    """
    # Guarding on each operand also covers the a + b == 0 division case.
    if a <= 0 or b <= 0:
        return 0.0
    return (2 * a * b) / (a + b)

# Function to get embeddings
def get_embedding(text):
    """Encode ``text`` with the global ``model`` and return the embedding.

    The encoder is asked for a tensor (``convert_to_tensor=True``) so the
    result can be passed directly to ``util.cos_sim``.
    """
    embedding = model.encode(text, convert_to_tensor=True)
    return embedding

# Retrieval pipeline
def retrieve(query, data, top_n=5):
    """Rank entries of ``data`` against ``query`` in two stages.

    Stage 1 scores every entry's ``'topic'`` against the query by cosine
    similarity and keeps the ``top_n`` best. Stage 2 scores the ``'content'``
    of those survivors and ranks them by the harmonic mean of their topic
    and content similarities.

    Args:
        query: Query text.
        data: Indexable sequence of mappings, each providing ``'topic'`` and
            ``'content'`` entries that the encoder can embed.
        top_n: Number of topics kept after stage 1.

    Returns:
        A list of ``(score, item)`` tuples sorted by descending score, where
        ``item`` is the original element of ``data``. The list is empty when
        ``data`` is empty or ``top_n`` is not positive.
    """
    # Nothing can be selected; skip the encoder entirely.
    if top_n <= 0 or len(data) == 0:
        return []

    # Get query embedding
    query_embedding = get_embedding(query)

    # Step 1: Calculate similarity with topics.
    # All topics are encoded in one batched call (the encoder accepts a list
    # of texts) instead of one encoder call per entry.
    topic_embeddings = get_embedding([item['topic'] for item in data])
    # cos_sim of one query against n topics yields a 1 x n matrix; take row 0.
    topic_scores = util.cos_sim(query_embedding, topic_embeddings)[0].tolist()
    topic_similarities = [
        (similarity, idx) for idx, similarity in enumerate(topic_scores)
    ]

    # Step 2: Select top N topics
    top_topics = heapq.nlargest(top_n, topic_similarities, key=lambda x: x[0])

    # Step 3: Calculate similarity with content for top topics (batched too).
    content_embeddings = get_embedding(
        [data[idx]['content'] for _, idx in top_topics]
    )
    content_scores = util.cos_sim(query_embedding, content_embeddings)[0].tolist()

    # Step 4: Rank by harmonic mean
    results = [
        (harmonic_mean(topic_similarity, content_similarity), data[idx])
        for (topic_similarity, idx), content_similarity
        in zip(top_topics, content_scores)
    ]

    # Step 5: Sort results by harmonic mean (stable, highest first)
    results.sort(key=lambda x: x[0], reverse=True)

    return results

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Example question to run through the retrieval pipeline
query = "what is the energy in fully developed laminar flow?"

# Rank the loaded entries against the question
results = retrieve(query, data)

# Show each hit with its combined score; content is cut to 1000 characters
print("Ranked Results:")
for ranked in results:
    score, item = ranked
    print(f"Total Similarity: {score:.4f}, Topic: {item['topic']}, Content: {item['content'][:1000]}...")