In [4]:
### Code written based on LlamaIndex v0.5.0 and the OpenAI API shortly after the release of GPT-4.
### When used with more recent versions of OpenAI and LlamaIndex, major code changes are to be expected!
# Versions used for this code:
# PyPDF2 (PyPDF2): 3.0.1 [Pip]
# langchain (langchain): 0.0.332 [Pip]
# llama-index (llama_index): 0.8.65 [Pip]
# nest_asyncio (nest_asyncio): 1.5.8 [Pip]
# openai (openai): 1.2.0 [Pip]
# pypdf (pypdf): 3.15.1 [Pip]


# ### Cell 1: Building and Persisting the Vector Store Index

# Import Necessary Libraries
import os
import sys
import logging
import nest_asyncio  
import pandas as pd
import pypdf
import PyPDF2

# Importing classes and functions from llama_index library for indexing and querying
from llama_index import (
    VectorStoreIndex,
    LLMPredictor,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    load_index_from_storage
)
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.embeddings import OpenAIEmbedding
from llama_index.llms import OpenAI

# Importing ChatOpenAI from langchain for language model interactions
from langchain.chat_models import ChatOpenAI

import openai  # OpenAI API client

# Apply Nest Asyncio
# Lets asynchronous calls run inside the notebook, which already has its own
# event loop running.
nest_asyncio.apply()

# Configuration and Environment Setup

# Security Note: It's crucial to manage API keys securely.
# Avoid hardcoding them in your scripts. Consider using environment variables or
# a dedicated secrets manager.
#os.environ["OPENAI_API_KEY"] = 'sk-...your-api-key...'  # Replace with your actual API key securely
# os.getenv returns None when the variable is unset, so a missing key is not
# reported here; it only surfaces later, on the first OpenAI request.
openai.api_key = os.getenv("OPENAI_API_KEY")  # Fetch the API key from environment variables

# Logging Configuration
# Uncomment the following lines to enable logging for debugging purposes.
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Define Constants
INPUT_DIRECTORY = 'MRONJ'  # Directory containing the input documents (e.g., PDFs)
PERSIST_DIR = "INDEX"      # Directory the built index is written to

# LLM (Language Model) Configuration Parameters
# The embedding model is left at its default (text-embedding-ada-002 according
# to the original author's note).
LLM_MODEL_NAME = "gpt-4"  # Choose between models like "gpt-3.5-turbo" or "gpt-4"
LLM_TEMPERATURE = 0.6              # Controls the randomness of the model's output
# NOTE(review): Cell 2 uses a context window of 8192 for the same model name;
# confirm which value is intended here.
CONTEXT_WINDOW_SIZE = 4096          # Context window (in tokens) given to the service context
# NOTE(review): the service context below is also given an explicit node
# parser; confirm that this chunk size still influences how text is split.
CHUNK_SIZE = 1024                   # Chunk size passed to the service context
EMBED_BATCH_SIZE = 150              # Batch size passed to the embedding model

# Function to Attach Filename Metadata to Documents
def attach_filename_metadata(filename):
    """
    Build the metadata record that tags a document with its source file.

    Args:
        filename (str): The file name supplied by the caller.

    Returns:
        dict: A one-entry mapping whose 'file_name' key holds ``filename``.
    """
    metadata = dict(file_name=filename)
    return metadata

# Initialize LLMPredictor with ChatOpenAI
# The langchain chat model is wrapped in an LLMPredictor so it can be handed to
# the service context below.
llm_predictor = LLMPredictor(
    llm=ChatOpenAI(temperature=LLM_TEMPERATURE, model_name=LLM_MODEL_NAME)
)

# Create a Sentence Window Node Parser
# Each sentence becomes its own node; the surrounding sentences are stored in
# the node's metadata under the "window" key, and the sentence itself under
# "original_text" (per the LlamaIndex documentation; confirm for the installed
# version). Cell 2 reads the "window" key back when answering queries.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=5,  # Sentences of context kept on each side of the sentence (per LlamaIndex docs)
    window_metadata_key="window",  # Metadata key holding the surrounding-sentence window
    original_text_metadata_key="original_text",  # Metadata key holding the original sentence
)

# Set Up the Service Context
# The ServiceContext bundles the LLM predictor, embedding model, and node parser.
# NOTE(review): chunk_size is passed together with an explicit node_parser;
# confirm whether chunk_size has any effect in that combination.
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    embed_model=OpenAIEmbedding(embed_batch_size=EMBED_BATCH_SIZE),
    node_parser=node_parser,
    chunk_size=CHUNK_SIZE,
    context_window=CONTEXT_WINDOW_SIZE
)

# Load Documents and Build Index

print('Loading documents...')
# Reads every document in INPUT_DIRECTORY; attach_filename_metadata supplies the
# 'file_name' metadata entry for each file.
documents = SimpleDirectoryReader(INPUT_DIRECTORY, file_metadata=attach_filename_metadata).load_data()
print('Documents loaded.')

print('Building Vector Store Index...')
# Builds the index from scratch on every run of this cell.
# Presumably the slow, billable step, since embeddings are requested from OpenAI.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
print('Vector Store Index created.')

print(f'Persisting index to directory: {PERSIST_DIR}')
# Writes the index to PERSIST_DIR so that Cell 2 can load it without rebuilding.
index.storage_context.persist(persist_dir=PERSIST_DIR)
print('Index persisted successfully.')


Loading documents...
Documents loaded.
Building Vector Store Index...
Vector Store Index created.
Persisting index to directory: INDEX
Index persisted successfully.


In [5]:
# ### Cell 2: Querying the Vector Store Index

# Import Necessary Libraries for Application
import os
import sys
import logging
import time
import pandas as pd
import pypdf
import PyPDF2

# Importing additional classes and functions from llama_index for querying and post-processing
from llama_index import (
    GPTVectorStoreIndex,
    LLMPredictor,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    load_index_from_storage,
    QuestionAnswerPrompt,
    RefinePrompt
)
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.indices.query.query_transform import HyDEQueryTransform
from llama_index.query_engine.transform_query_engine import TransformQueryEngine
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.embeddings import OpenAIEmbedding
from llama_index.llms import OpenAI

from langchain.chat_models import ChatOpenAI

import openai  # OpenAI API client

import nest_asyncio
nest_asyncio.apply()  # Applied again so this cell also works without running Cell 1 first

# Configuration and Environment Setup

# Security Note: Ensure API keys are managed securely.
# Both lines below are commented out, so this cell does not set the key itself.
# Presumably it relies on OPENAI_API_KEY already being set in the environment
# (or on Cell 1 having been run in the same kernel).
# os.environ["OPENAI_API_KEY"] = 'sk-...your-api-key...'  # Replace with your actual API key securely
# openai.api_key = os.getenv("OPENAI_API_KEY")  # Fetch the API key from environment variables

# Logging Configuration
# Uncomment the following lines to enable logging for debugging purposes.
# Without them, the logging.info calls in this cell use Python's default
# logging setup (WARNING level), so they print nothing.
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Define Constants
PERSIST_DIR = "INDEX"  # Directory the index was persisted to (same value as in Cell 1)

# LLM Configuration Parameters for Querying
# The embedding model is left at its default (text-embedding-ada-002 according
# to the original author's note).
QUERY_MODEL = "gpt-4"                # Choose between models like "gpt-3.5-turbo" or "gpt-4"
QUERY_TEMPERATURE = 0.4               # Controls the randomness of the model's output
QUERY_CONTEXT_WINDOW = 8192           # Context window (in tokens) assumed for the query model
QUERY_NUM_OUTPUT_TOKENS = 1024        # Token budget intended for the generated answer
SIMILARITY_TOP_K = 20                 # Number of most-similar nodes retrieved per query
RESPONSE_MODE = "compact"             # Response mode passed to the query engine
# NOTE(review): API_TIMEOUT is not referenced by any code visible in this cell.
API_TIMEOUT = 120                     # Timeout for API requests in seconds

# Function to Load the Existing Index from Storage
def load_existing_index(persist_dir):
    """
    Restore a previously persisted index from disk.

    Args:
        persist_dir (str): Directory the index was persisted to.

    Returns:
        The index object produced by ``load_index_from_storage``.
    """
    logging.info(f'Loading index from storage directory: {persist_dir}')
    # Build the storage context inline and let LlamaIndex rebuild the index from it.
    restored_index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir=persist_dir)
    )
    logging.info('Index loaded successfully.')
    return restored_index

# Define Prompt Templates
# Both templates are plain strings built from adjacent string literals.
# The {placeholders} are presumably filled in by LlamaIndex at query time.

# Question-answering template (English).
# Placeholders: {context_str} (retrieved context) and {query_str} (the question).
# It asks for the reply in two labelled parts, "- Answer:" and "- Explanation:".
ENGLISH_QA_PROMPT_TEMPLATE = (
    "We have provided scientific context information below. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Given this information, please answer the following question: {query_str}\n"
    "\n---------------------\n"
    "Please follow the structure below for your response as part of a Clinical Practice Guidelines for MRONJ:\n"
    "- Answer: (Provide a concise yet comprehensive and self-contained answer summarizing the key points.)\n"
    "\n---------------------\n"
    "- Explanation: (Explain your answer by providing informative yet concise details. Describe relevant factors or actions and outline the impact.)"
    "\n---------------------\n"
)

# Wrap the template string in LlamaIndex's QuestionAnswerPrompt type
ENGLISH_QA_PROMPT = QuestionAnswerPrompt(ENGLISH_QA_PROMPT_TEMPLATE)

# Refine template (English).
# Placeholders: {query_str}, {existing_answer} (the answer so far) and
# {context_msg} (additional context).
# It requests the same Answer/Explanation structure as the QA template and
# tells the model to repeat the original answer when the new context does not help.
ENGLISH_REFINE_PROMPT_TEMPLATE = (
    "The original question is as follows: {query_str}\n"
    "We have provided an original answer: {existing_answer}\n"
    "We have the option to refine the original answer (only if necessary) with some more context below.\n"
    "If necessary, try to improve the answer and explanation by adding more details.\n"
    "------------\n"
    "{context_msg}\n"
    "------------\n"
    "Given the new context, refine the original answer to better answer the question. The answer might be changed or adopted based on new context.\n"
    "\n---------------------\n"
    "Please follow the structure below for your response as part of a Clinical Practice Guidelines for MRONJ:\n"
    "- Answer: (Provide a concise yet comprehensive and self-contained answer summarizing the key points. Modify based on new context if applicable.)\n"
    "\n---------------------\n"
    "- Explanation: (Explain your answer by providing informative yet concise details. Describe relevant factors or actions and outline the impact) Modify based on new context if applicable.\n"
    "\n---------------------\n"
    "If the new context is not useful, repeat exactly the original answer.\n"
)

# Wrap the template string in LlamaIndex's RefinePrompt type
ENGLISH_REFINE_PROMPT = RefinePrompt(ENGLISH_REFINE_PROMPT_TEMPLATE)

# Function to Initialize the Query Engine
def initialize_query_engine(index):
    """
    Build a query engine over ``index`` using this cell's query-time settings.

    The engine retrieves the SIMILARITY_TOP_K most similar nodes, replaces each
    node's text with the content of its "window" metadata entry, and answers
    with the custom QA / refine prompts in RESPONSE_MODE.

    Args:
        index (VectorStoreIndex): The vector store index to query against.

    Returns:
        QueryEngine: The configured query engine.
    """
    # Chat model that writes the answers, wrapped so LlamaIndex can drive it.
    # NOTE(review): API_TIMEOUT is defined with the other query constants but
    # is not applied to the chat model here.
    query_llm_predictor = LLMPredictor(
        llm=ChatOpenAI(temperature=QUERY_TEMPERATURE, model_name=QUERY_MODEL)
    )

    # QUERY_NUM_OUTPUT_TOKENS is an answer-length budget, so it is passed as
    # `num_output` (room reserved for the completion when context is packed
    # into the prompt). Passing it as `chunk_size` would configure document
    # splitting instead, which plays no part when querying a prebuilt index.
    query_service_context = ServiceContext.from_defaults(
        llm_predictor=query_llm_predictor,
        context_window=QUERY_CONTEXT_WINDOW,
        num_output=QUERY_NUM_OUTPUT_TOKENS,
    )

    # Swap each retrieved sentence for the sentence window stored at index
    # time, so the LLM sees the surrounding context rather than one sentence.
    window_postprocessor = MetadataReplacementPostProcessor(
        target_metadata_key="window"
    )

    query_engine = index.as_query_engine(
        service_context=query_service_context,
        similarity_top_k=SIMILARITY_TOP_K,
        response_mode=RESPONSE_MODE,
        text_qa_template=ENGLISH_QA_PROMPT,
        refine_template=ENGLISH_REFINE_PROMPT,
        node_postprocessors=[window_postprocessor],
    )

    return query_engine

# Function to Extract Filenames and Pages from Response Metadata
def get_filenames(response):
    """
    Summarize which source files (and pages) a query response was drawn from.

    Each source node's metadata is read for "file_name" and "page_label".
    Nodes without a file name are ignored. File names are reduced to their
    base name, and page labels are de-duplicated per file.

    Args:
        response (Response): The response object from the query; its
            ``source_nodes`` entries must expose ``.node.metadata`` as a dict.

    Returns:
        str: Comma-separated entries in first-seen file order. A file with
        known pages is rendered as "<file> Page: (<p1>, <p2>, ...)", numeric
        labels first in numeric order and other labels after them in string
        order. A file without page labels is rendered as just "<file>".
        Returns an empty string when there are no usable source nodes.
    """

    def _page_sort_key(label):
        # str.isdecimal() only accepts characters that int() can parse.
        # str.isdigit() also accepts e.g. superscript digits, and int() then
        # raises ValueError. The label itself is the final tie-breaker so
        # "01" and "1" always come out in the same order.
        if label.isdecimal():
            return (0, int(label), label)
        return (1, 0, label)

    pages_by_file = {}
    # Tolerate a response whose source_nodes is None as well as an empty list.
    for source in response.source_nodes or []:
        metadata = source.node.metadata
        file_name = metadata.get("file_name")
        if file_name is None:
            continue
        pages = pages_by_file.setdefault(os.path.basename(file_name), set())
        page_label = metadata.get("page_label")
        if page_label is not None:
            pages.add(f"{page_label}")

    summary = []
    for file_name, pages in pages_by_file.items():
        if pages:
            ordered_pages = sorted(pages, key=_page_sort_key)
            summary.append(f"{file_name} Page: ({', '.join(ordered_pages)})")
        else:
            summary.append(file_name)
    return ", ".join(summary)

# Function to Execute a Query and Display Results with Filename Metadata
def execute_query(query_engine, query_text):
    """
    Run one query and print the answer followed by its source files and pages.

    Args:
        query_engine (QueryEngine): Engine whose ``query`` method is called.
        query_text (str): The question to ask.
    """
    logging.info('Executing query...')
    answer = query_engine.query(query_text)
    logging.info('Query executed successfully.')

    # The answer is printed before the sources are computed, so it is shown
    # even if the source summary fails.
    print("Response:", answer, sep="\n")

    # Source summary: file names with the pages the answer drew on.
    sources = get_filenames(answer)
    print("\nSource Files and Pages:", sources, sep="\n")

# Load the Existing Index from Storage
# Rebinds `index` (also assigned in Cell 1) to the copy read back from
# PERSIST_DIR, so this cell does not need the in-memory index from Cell 1.
index = load_existing_index(PERSIST_DIR)

# Initialize the Query Engine
query_engine = initialize_query_engine(index)

# Define the Sample Query
sample_query = 'Which concomitant medications (drug classes) increase the risk of MRONJ in patients under antiresorptive therapy?'

# Execute the Query and Display Results
# Prints the answer and the source files/pages; nothing is returned.
execute_query(query_engine, sample_query)


Response:
- Answer: The use of medications with antiangiogenic properties such as glucocorticoids, thalidomide, and bortezomib may increase the risk of MRONJ in patients under antiresorptive therapy. However, it is important to note that other factors, such as underlying dental problems and dental procedures, have been identified as significant risk factors for MRONJ.

- Explanation: Medications with antiangiogenic properties, such as glucocorticoids, thalidomide, and bortezomib, have been identified as coexisting factors that may increase the risk of MRONJ in patients undergoing antiresorptive therapy. These drugs can impair the body's ability to repair and remodel bone, particularly in an environment that is trauma-intense and bacteria-laden, such as the oral cavity. This can lead to the development of osteonecrosis. However, research has shown that underlying dental problems, such as infection or dental extraction, were present in a significant percentage of patients who developed M