# Querying local PDF library
## Based on: Dartboard RAG: Retrieval-Augmented Generation with Balanced Relevance and Diversity
## https://arxiv.org/pdf/2407.12101

### Import libraries and environment variables

In [1]:
import os
import pickle
from getpass import getpass

from dotenv import load_dotenv
from dartsearch import get_context_with_dartboard
from chat_complete import answer_query

from helper_functions import *
from evalute_rag import *

In [None]:
# Load environment variables from a .env file
load_dotenv()

# Make sure an OpenAI API key is available (comment out if not using OpenAI).
# The key is requested only when it is not already set; a key that is already
# present in the environment is left untouched.
# getpass is used instead of input() so the typed key is not echoed into the
# notebook output (and therefore not saved with the notebook).
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Please enter your OpenAI API key: ")

### Create Unified Library PDF

In [2]:
# Combine the PDFs of the local library using the star-imported merge_pdf_library helper.
# The result is handed to encode_pdf in the encoding step.
# NOTE(review): presumably `merged` is the path of the merged PDF file — confirm against the helper.
merged = merge_pdf_library()

	added jenior-et-al-2021-novel-drivers-of-virulence-in-clostridioides-difficile-identified-via-context-specific-metabolic.pdf to the merger.
	added msystems.pdf to the merger.
	added leslie-et-al-2019-the-gut-microbiota-is-associated-with-clearance-of-clostridium-difficile-infection-independent-of.pdf to the merger.
	added nihms-1845454.pdf to the merger.
	added jenior-et-al-2018-clostridium-difficile-alters-the-structure-and-metabolism-of-distinct-cecal-microbiomes-during.pdf to the merger.
	added leslie-et-al-2021-protection-from-lethal-clostridioides-difficile-infection-via-intraspecies-competition-for-cogerminant.pdf to the merger.
	added journal.pcbi.1011076.pdf to the merger.
	added 2023.08.08.552483v1.full.pdf to the merger.

All PDFs have been concatenated into merged_library.2025-02-21_14-14-01.pdf


### Encode Library

In [3]:
# Encode the merged PDF into the vector library that the retrieval cells query.
# NOTE(review): encode_pdf(...) is evaluated first and only its *result* is passed to
# retry_with_exponential_backoff, so an exception raised inside encode_pdf (for example
# a rate-limit error) propagates before the retry helper ever runs. Confirm what the
# helper expects (a callable/coroutine rather than a finished result) and whether its
# return value has to be awaited.
# TODO: consider caching the encoded library on disk so a full re-run does not re-embed.
vector_library = retry_with_exponential_backoff(encode_pdf(merged, chunk_size=1000, chunk_overlap=200))

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
# Test a basic way to save the embedding locally
import pickle

def save_embedding(embedding, pkl_name):
    """Pickle a vector library to ``<pkl_name>.pkl`` on a best-effort basis.

    The object is first written to a temporary sibling file and only moved
    onto the final name once pickling has succeeded. A failed save therefore
    never leaves a truncated ``.pkl`` behind and never destroys a previously
    saved file of the same name.

    Args:
        embedding: The object to serialize; it must be picklable.
        pkl_name: Destination path without extension; ``.pkl`` is appended.

    Returns:
        None. Errors are reported with ``print`` rather than raised.
    """
    target_path = f"{pkl_name}.pkl"
    partial_path = f"{target_path}.tmp"
    try:
        with open(partial_path, "wb") as f:
            pickle.dump(embedding, f)
        # os.replace overwrites an existing target in a single step.
        os.replace(partial_path, target_path)
    except Exception as e:
        # Pickling failures surface as several exception types, so the broad
        # catch is kept to preserve the original best-effort behaviour.
        print(f"An error occurred saving embedding: {e}")
        # Drop the incomplete temporary file, if one was created.
        if os.path.exists(partial_path):
            os.remove(partial_path)



## Dartboard Context Retrieval

`get_context_with_dartboard` is the main function for dartboard retrieval. It is used instead of `get_context` (which is simple RAG). It:

1. Takes a text query, vectorizes it, gets the top k documents (and their vectors) via simple RAG
2. Uses these vectors to calculate each candidate's similarity to the query and the similarities between candidate matches
3. Runs the dartboard algorithm to refine the candidate matches to a final list of k documents
4. Returns the final list of documents and their scores

In [None]:

# Example question used by the retrieval and answering cells.
test_query = "What nutrients does C. difficile consume during infection?"


In [None]:
# Retrieve context for the test query from the encoded library with the dartboard
# retriever, asking for k=5 results; the call is unpacked into the retrieved texts
# and their scores.
# Requires `vector_library` from the encoding cell to have been created successfully.
texts, scores = get_context_with_dartboard(test_query, vector_library, k=5)

In [None]:
# Pass the query together with the retrieved texts to answer_query; as the last
# expression in the cell, its return value (if any) is shown as the cell output.
answer_query(test_query, texts)