In [1]:
{
  "queries": [
    {
      "id": 1,
      "query": "What is the capital of France?"
    },
    {
      "id": 2,
      "query": "How does photosynthesis work?"
    },
    {
      "id": 3,
      "query": "Python programming language features"
    }
  ]
}

{
  "documents": {
    "doc1": "Paris is the capital of France. It is known for its iconic landmarks...",
    "doc2": "Photosynthesis is the process by which green plants and some other organisms...",
    "doc3": "Python is a high-level, interpreted programming language known for its simplicity..."
  }
}

{'documents': {'doc1': 'Paris is the capital of France. It is known for its iconic landmarks...',
  'doc2': 'Photosynthesis is the process by which green plants and some other organisms...',
  'doc3': 'Python is a high-level, interpreted programming language known for its simplicity...'}}

In [11]:
import json

import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

# Load the pre-trained BERT tokenizer and encoder once; both are module-level
# objects used by embed_text.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

def embed_text(text):
    """Embed text as a mean-pooled BERT vector.

    The text is tokenized with the module-level ``tokenizer``, passed through
    the module-level ``model`` with gradient tracking disabled, and the
    final-layer hidden states are averaged over dimension 1.

    Args:
        text: The text to embed.

    Returns:
        The tensor produced by ``last_hidden_state.mean(dim=1)``.
        NOTE(review): for a single input string this is presumably of shape
        (1, hidden_size) -- confirm against the caller's expectations.
    """
    # truncation=True caps the input at the tokenizer's maximum model length,
    # so an over-long document is shortened instead of failing inside the model.
    tokens = tokenizer(text, return_tensors='pt', truncation=True)
    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        output = model(**tokens)
    return output.last_hidden_state.mean(dim=1)

def get_cosine_similarity(query_embedding, doc_embedding):
    """Return the cosine similarity of a query embedding and a document embedding.

    Both arguments are handed unchanged to scikit-learn's
    ``cosine_similarity``; the entry at row 0, column 0 of the resulting
    matrix is returned.
    """
    similarity_matrix = cosine_similarity(query_embedding, doc_embedding)
    first_row = similarity_matrix[0]
    return first_row[0]

def load_json(file_path):
    """Read a JSON file and return its parsed contents.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII text is read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def retrieve_best_matches(query, documents):
    """Rank every document by cosine similarity to the query.

    Args:
        query: The query text; it is embedded once with ``embed_text``.
        documents: Mapping of document id to document text; each text is
            embedded with ``embed_text`` and compared to the query.

    Returns:
        A list of ``(doc_id, similarity)`` tuples ordered from the highest
        similarity to the lowest.
    """
    query_vector = embed_text(query)

    # Score each document against the single, pre-computed query vector.
    scored = [
        (doc_id, get_cosine_similarity(query_vector, embed_text(doc_text)))
        for doc_id, doc_text in documents.items()
    ]

    return sorted(scored, key=lambda pair: pair[1], reverse=True)

if __name__ == "__main__":
    # Input files: the query list and the document collection.
    queries_path = "queries.json"
    documents_path = "documents.json"

    # Read the queries first, then the documents.
    queries_data = load_json(queries_path)["queries"]
    documents_data = load_json(documents_path)["documents"]

    # For each query, rank all documents and print the ranking in order.
    for query_entry in queries_data:
        query_id, query_text = query_entry["id"], query_entry["query"]
        matches = retrieve_best_matches(query_text, documents_data)

        print(f"\nQuery ID: {query_id}, Query Text: {query_text}")
        for rank, match in enumerate(matches):
            doc_id, similarity = match
            print(f"Rank {rank + 1}: Document ID: {doc_id}, Similarity: {similarity:.4f}")
            print(f"Document Text: {documents_data[doc_id]}\n")



Query ID: 1, Query Text: The patient has KRAS mutation
Rank 1: Document ID: doc1, Similarity: 0.8575
Document Text: The trial includes patients with KRAS mutation

Rank 2: Document ID: doc4, Similarity: 0.8360
Document Text: The trial includes patients with BRAC mutation

Rank 3: Document ID: doc3, Similarity: 0.7601
Document Text: The trial excludes patients with a history of heart disease

Rank 4: Document ID: doc6, Similarity: 0.7236
Document Text: The trial excludes patients with pregnancy

Rank 5: Document ID: doc5, Similarity: 0.7217
Document Text: The triale excludes patients with HIV

Rank 6: Document ID: doc7, Similarity: 0.7105
Document Text: The trial includes patients undegoing radiotherapy

Rank 7: Document ID: doc2, Similarity: 0.6666
Document Text: The trial includes patients over 18 years of age

