In [4]:
### Code written against LLAMA_INDEX v0.8 and the OpenAI client shortly after the release of GPT-4 Turbo.
### When running with more recent versions of OpenAI and LLAMA_INDEX, expect major code changes to be necessary!
# Versions used for this code:
# PyPDF2 (PyPDF2): 3.0.1 [Pip]
# langchain (langchain): 0.0.332 [Pip]
# llama-index (llama_index): 0.8.65 [Pip]
# nest_asyncio (nest_asyncio): 1.5.8 [Pip]
# openai (openai): 1.2.0 [Pip]
# pypdf (pypdf): 3.15.1 [Pip]


# ### Cell 1: Building and Persisting the Vector Store Index

# Import Necessary Libraries
import os
import sys
import logging
import nest_asyncio  
import pandas as pd
import pypdf
import PyPDF2

# Importing classes and functions from llama_index library for indexing and querying
from llama_index import (
    VectorStoreIndex,
    LLMPredictor,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    load_index_from_storage
)
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.embeddings import OpenAIEmbedding
from llama_index.llms import OpenAI

# Importing ChatOpenAI from langchain for language model interactions
from langchain.chat_models import ChatOpenAI

import openai  # OpenAI API client

# Apply Nest Asyncio
# Patches asyncio so that nested event loops work inside the Jupyter kernel.
nest_asyncio.apply()

# Configuration and Environment Setup

# Security Note: Ensure API keys are managed securely.
# os.environ["OPENAI_API_KEY"] = 'sk-...'  # Replace with your actual API key securely
openai.api_key = os.getenv("OPENAI_API_KEY")  # Read the API key from the environment (None if the variable is unset)

# Logging Configuration
# Uncomment the following lines to enable logging for debugging purposes.
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Define Constants
INPUT_DIRECTORY = 'TRAUMA'  # Directory containing your input documents (e.g., PDFs)
PERSIST_DIR = "INDEX"      # Directory where the index will be saved for persistence

# LLM (Language Model) Configuration Parameters
# The embedding model defaults to text-embedding-ada-002
LLM_MODEL_NAME = "gpt-4-1106-preview"  # Choose between models like "gpt-4" or "gpt-4-1106-preview"
LLM_TEMPERATURE = 0.4              # Controls the randomness of the model's output
CONTEXT_WINDOW_SIZE = 48192         # Token budget used as the context window in the ServiceContext
CHUNK_SIZE = 1024                   # Size of text chunks for processing
EMBED_BATCH_SIZE = 150              # Batch size for embedding generation

# Function to Attach Filename Metadata to Documents
def attach_filename_metadata(filename):
    """
    Build the per-file metadata record used when loading documents.

    Args:
        filename (str): The name (path) of the file being read.

    Returns:
        dict: A single-entry mapping with the key 'file_name' set to `filename`.
    """
    file_metadata = dict(file_name=filename)
    return file_metadata

# Initialize LLMPredictor with ChatOpenAI
# The LangChain chat model is wrapped in an LLMPredictor and handed to the ServiceContext below.
llm_predictor = LLMPredictor(
    llm=ChatOpenAI(temperature=LLM_TEMPERATURE, model_name=LLM_MODEL_NAME)
)

# Create a Sentence Window Node Parser
# NOTE(review): per the llama_index docs this parser emits one node per sentence and stores the
# surrounding sentences under `window_metadata_key` — confirm for the pinned version.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=5,  # Number of sentences surrounding the embedded sentence in each window
    window_metadata_key="window",  # Metadata key for window information
    original_text_metadata_key="original_text",  # Metadata key for original text
)

# Set Up the Service Context
# The ServiceContext combines the LLM predictor, embedding model, and node parser.
# NOTE(review): an explicit node_parser is supplied, so chunk_size may have no effect on how
# documents are split here — confirm against ServiceContext.from_defaults.
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    embed_model=OpenAIEmbedding(embed_batch_size=EMBED_BATCH_SIZE),
    node_parser=node_parser,
    chunk_size=CHUNK_SIZE,
    context_window=CONTEXT_WINDOW_SIZE
)

# Load Documents and Build Index

print('Loading documents...')
# Reads documents from the specified input directory; attach_filename_metadata supplies
# the 'file_name' metadata entry for each file.
documents = SimpleDirectoryReader(INPUT_DIRECTORY, file_metadata=attach_filename_metadata).load_data()
print('Documents loaded.')

print('Building Vector Store Index...')
# Creates a VectorStoreIndex from the loaded documents using the service context
# (this step calls the embedding model, so it needs a valid API key).
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
print('Vector Store Index created.')

print(f'Persisting index to directory: {PERSIST_DIR}')
# Saves the index to the specified persistence directory so the query cell can reload it
index.storage_context.persist(persist_dir=PERSIST_DIR)
print('Index persisted successfully.')


Loading documents...
Documents loaded.
Building Vector Store Index...
Vector Store Index created.
Persisting index to directory: INDEX
Index persisted successfully.


In [3]:
# ### Cell 2: Querying the Vector Store Index 

# Import Necessary Libraries for Application
import os
import sys
import logging
import time
import pandas as pd
import pypdf
import PyPDF2

# Importing additional classes and functions from llama_index for querying and post-processing
from llama_index import (
    GPTVectorStoreIndex,
    LLMPredictor,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    load_index_from_storage,
    QuestionAnswerPrompt,
    RefinePrompt
)
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.indices.query.query_transform import HyDEQueryTransform
from llama_index.query_engine.transform_query_engine import TransformQueryEngine
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.embeddings import OpenAIEmbedding
from llama_index.llms import OpenAI

from langchain.chat_models import ChatOpenAI

import openai  # OpenAI API client

import nest_asyncio
nest_asyncio.apply()  # Apply nest_asyncio here too, so this cell also works on its own

# Configuration and Environment Setup

# Security Note: Ensure API keys are managed securely.
# os.environ["OPENAI_API_KEY"] = 'sk-...'  # Replace with your actual API key securely
openai.api_key = os.getenv("OPENAI_API_KEY")  # Read the API key from the environment (None if the variable is unset)

# Logging Configuration
# Uncomment the following lines to enable logging for debugging purposes.
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Define Constants
PERSIST_DIR = "INDEX"  # Directory where the index is persisted

# LLM Configuration Parameters for Querying
# The embedding model defaults to text-embedding-ada-002
QUERY_MODEL = "gpt-4-1106-preview"    # Version used in paper, choose newer models if desired
QUERY_TEMPERATURE = 0.4               # Controls the randomness of the model's output
QUERY_CONTEXT_WINDOW = 48192          # Token budget used as the context window for queries
QUERY_NUM_OUTPUT_TOKENS = 1024        # Intended token budget for the response — NOTE(review): verify how the ServiceContext setup consumes it
SIMILARITY_TOP_K = 20                 # Number of most similar nodes to retrieve per query
RESPONSE_MODE = "compact"             # Response synthesis mode passed to as_query_engine


# Function to Load the Existing Index from Storage
def load_existing_index(persist_dir):
    """
    Restore a previously persisted index from disk.

    A StorageContext is created for `persist_dir` and passed to
    `load_index_from_storage`. Progress is reported through `logging.info`,
    so the messages only appear when logging is configured at INFO level.

    Args:
        persist_dir (str): The directory where the index is persisted.

    Returns:
        The index object returned by `load_index_from_storage`.
    """
    logging.info(f'Loading index from storage directory: {persist_dir}')
    restored_index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir=persist_dir)
    )
    logging.info('Index loaded successfully.')
    return restored_index

# Define Prompt Templates

## 1st Step

# --- Step 1: question-answer prompt template ---
# Asks only for the primary diagnosis of the image finding.
# Placeholders in the template: {context_str} and {query_str}.
# Used as text_qa_template of query_engine1.
ENGLISH_QA_PROMPT_TMPL1 = (
    "We have provided scientific context information below. \n"
    "---------------------\n"
    "{context_str}\n"
    "\n---------------------\n"
    "Given this information, please evaluate the following image finding: {query_str}\n"   
    "\n---------------------\n"
    "Please follow the structure below for your response:\n"
    "- Primary diagnosis: (Main pathologic finding as working diagnosis, no explanation)\n"
    "\n---------------------\n"
)

ENGLISH_QA_PROMPT1 = QuestionAnswerPrompt(ENGLISH_QA_PROMPT_TMPL1)

# --- Step 1: refine prompt template ---
# Asks the model to revise (or repeat) the primary diagnosis given additional context.
# Placeholders in the template: {query_str}, {existing_answer} and {context_msg}.
# Used as refine_template of query_engine1.
ENGLISH_REFINE_PROMPT_TMPL1 = (
    "The original case file with the initial image finding is as follows:\n"
    "---------------------\n"
    "{query_str}\n"
    "\n---------------------\n"
    "The initial evaluation of the image finding provided is:\n"
    "---------------------\n"
    "{existing_answer}\n"
    "\n---------------------\n"
    "Additional context information is provided below to potentially refine the evaluation of the image finding.\n"
    "If necessary, refine the evaluation of the image finding based on the new context.\n"
    "---------------------\n"
    "{context_msg}\n"
    "\n---------------------\n"
    "Given the new context, please refine the evaluation of the image finding. The evaluation might change or be adopted based on new context.\n"
    "\n---------------------\n"
    "Please follow the structure below for your response:\n"
    "- Primary diagnosis: (Refined pathologic finding as working diagnosis, no explanation)\n"
    "\n---------------------\n"
    "If the new context is not useful, repeat exactly the original answer.\n"
)

ENGLISH_REFINE_PROMPT1 = RefinePrompt(ENGLISH_REFINE_PROMPT_TMPL1)

## 2nd Step

# --- Step 2: question-answer prompt template ---
# Asks for the primary diagnosis plus one radiological classification system,
# one grade within it, and a short explanation.
# Placeholders in the template: {context_str} and {query_str}.
# Used as text_qa_template of query_engine2.
ENGLISH_QA_PROMPT_TMPL2 = (
    "We have provided scientific context information below. \n"
    "---------------------\n"
    "{context_str}\n"
    "\n---------------------\n"
    "Given this information, please evaluate the following image finding and suggested primary diagnosis: {query_str}\n"   
    "\n---------------------\n"
    "Please follow the structure below for your response:\n"
    "- Primary diagnosis: (Main pathologic finding as working diagnosis, no explanation)\n"
    "\n---------------------\n"
    "- Radiological classification system for this diagnosis (Please state only one classification! If there is no appropriate classification, specify this):\n"
    "\n---------------------\n"
    "- Grading in the classification system for this diagnosis (Please state only one grade! If there is no classification system, specify this):\n"
    "\n---------------------\n"
    "Explanation: Provide a concise yet comprehensive and self-contained explanation summarizing the key points of the main diagnosis, classification system and grading."
    "\n---------------------\n"
)

ENGLISH_QA_PROMPT2 = QuestionAnswerPrompt(ENGLISH_QA_PROMPT_TMPL2)

# --- Step 2: refine prompt template ---
# Asks the model to revise (or repeat) the diagnosis, classification system, grade and
# explanation given additional context.
# Placeholders in the template: {query_str}, {existing_answer} and {context_msg}.
# Used as refine_template of query_engine2.
ENGLISH_REFINE_PROMPT_TMPL2 = (
    "The original case file with the initial image finding and suggested primary diagnosis is as follows:\n"
    "---------------------\n"
    "{query_str}\n"
    "\n---------------------\n"
    "The initial evaluation of the image finding provided is:\n"
    "---------------------\n"
    "{existing_answer}\n"
    "\n---------------------\n"
    "Additional context information is provided below to potentially refine the evaluation of the image finding.\n"
    "If necessary, refine the evaluation of the image finding based on the new context.\n"
    "---------------------\n"
    "{context_msg}\n"
    "\n---------------------\n"
    "Given the new context, please refine the evaluation of the image finding. The evaluation might change or be adopted based on new context.\n"
    "\n---------------------\n"
    "Please follow the structure below for your response:\n"
    "- Primary diagnosis: (Refined pathologic finding as working diagnosis, no explanation)\n"
    "\n---------------------\n"
    "- Radiological classification system for this diagnosis (Please state only one classification! If there is no appropriate classification, specify this):\n"
    "\n---------------------\n"
    "- Grading in the classification system for this diagnosis (Please state only one grade! If there is no classification system, specify this):\n"
    "\n---------------------\n"
    "Explanation: Provide a concise yet comprehensive and self-contained explanation summarizing the key points of the refined main diagnosis, radiological classification system and grading (if applicable).\n"
    "\n---------------------\n"
    "If the new context is not useful, repeat exactly the original answer.\n"
)

ENGLISH_REFINE_PROMPT2 = RefinePrompt(ENGLISH_REFINE_PROMPT_TMPL2)

def get_filenames(response):
    """
    Summarize which source files (and pages) a query response was built from.

    The function walks `response.metadata` (a mapping of node id -> metadata
    mapping), groups the 'page_label' values by the basename of 'file_name',
    and renders one entry per file in first-seen order.

    Args:
        response (Response): The response object from the query. Its
            `metadata` attribute may be None or empty, in which case an
            empty string is returned.

    Returns:
        str: Comma-separated entries of the form
            "<file> Page: (<p1>, <p2>, ...)", or just "<file>" when no page
            labels are known for that file. Numeric page labels are sorted
            numerically and come before non-numeric labels (e.g. "iv").
    """

    def _page_sort_key(label):
        # isdecimal() only accepts characters that int() can parse; isdigit()
        # would also accept e.g. superscript digits and make int() raise.
        if label.isdecimal():
            return (0, int(label), "")
        return (1, 0, label)

    # metadata can be None when the response carries no source information.
    metadata_by_node = getattr(response, "metadata", None) or {}

    pages_by_file = {}
    for node_metadata in metadata_by_node.values():
        if not node_metadata:
            continue  # skip nodes without any metadata
        file_name = node_metadata.get("file_name")
        if file_name is None:
            continue  # a page label without a file cannot be attributed
        pages = pages_by_file.setdefault(os.path.basename(file_name), set())
        page_label = node_metadata.get("page_label")
        if page_label is not None:
            pages.add(f"{page_label}")

    summary = []
    for filename, pages in pages_by_file.items():
        if pages:
            ordered_pages = sorted(pages, key=_page_sort_key)
            summary.append(f"{filename} Page: ({', '.join(ordered_pages)})")
        else:
            summary.append(filename)
    return ", ".join(summary)

# Set up the LLMPredictor and ServiceContext for querying
# The LangChain chat model is wrapped in an LLMPredictor for use by llama_index.
llm_predictor = LLMPredictor(
    llm=ChatOpenAI(
        temperature=QUERY_TEMPERATURE,
        model_name=QUERY_MODEL,
    )
)
# QUERY_NUM_OUTPUT_TOKENS is an output-token budget, so it belongs in `num_output`.
# It was previously passed as `chunk_size`, which configures document chunking
# and is unrelated to the response length.
# NOTE(review): num_output is expected to reserve prompt space for the answer only;
# it does not set max_tokens on the ChatOpenAI model — confirm if a hard cap is wanted.
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    num_output=QUERY_NUM_OUTPUT_TOKENS,
    context_window=QUERY_CONTEXT_WINDOW
)

# Load the Existing Index from Storage ## might take some time, disable when already loaded once
index = load_existing_index(PERSIST_DIR)


def _build_query_engine(qa_template, refine_template):
    """
    Create a query engine on `index` for one step of the two-step workflow.

    Both steps share the service context, top-k, response mode and a
    MetadataReplacementPostProcessor keyed on "window"; only the prompt
    templates differ.

    Args:
        qa_template: Prompt used as text_qa_template.
        refine_template: Prompt used as refine_template.

    Returns:
        The query engine returned by `index.as_query_engine`.
    """
    window_postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")
    return index.as_query_engine(
        service_context=service_context,
        similarity_top_k=SIMILARITY_TOP_K,
        response_mode=RESPONSE_MODE,
        text_qa_template=qa_template,
        refine_template=refine_template,
        node_postprocessors=[window_postprocessor],
    )


# Initialize Query Engines for Both Steps
# Step 1: primary diagnosis only
query_engine1 = _build_query_engine(ENGLISH_QA_PROMPT1, ENGLISH_REFINE_PROMPT1)
# Step 2: classification system, grading and explanation
query_engine2 = _build_query_engine(ENGLISH_QA_PROMPT2, ENGLISH_REFINE_PROMPT2)

def _escape_newlines(text):
    """
    Return `text` with every newline replaced by the two characters backslash + n.

    This keeps a multi-line model answer on a single line. A single replace is
    equivalent to replacing triple, double and single newlines in turn.
    A missing answer (None) is treated as an empty string instead of raising.
    """
    return (text or "").replace('\n', '\\n')


# Define the Case Description
casedesc = '''
CT scan of the lumbosacral spine reveals a vertically oriented transforaminal fracture through the left sacrum. The fracture extends upward, involving the L5-S1 facets and causing significant widening of the facet joint, but no dissociation of the superior sacral facet from the medial sacrum is seen.
'''

# Prepare Input Text for the First Query
input_text = f'{casedesc}'

# Step 1: query the index for the primary diagnosis
response1 = query_engine1.query(input_text)

# Flatten the first answer onto one line (newlines become literal "\n")
output_accGPT1 = _escape_newlines(response1.response)

# Prepare Input Text for the Second Query:
# the case description followed by the (flattened) step-1 diagnosis
input_text2 = f'{casedesc} \n---------------------\n {output_accGPT1} Next step: Classification and Grading'

# Step 2: query the index for classification and grading
response2 = query_engine2.query(input_text2)

# Flatten the second answer the same way
output_accGPT2 = _escape_newlines(response2.response)

# Get the source files and pages backing the second response
filenames = get_filenames(response2)

# Display the Results
print("First Response (Primary Diagnosis):")
print(output_accGPT1)
print('----------------------------------------------------------------')
print("Second Response (Classification and Grading):")
print(output_accGPT2)
print("Source Files and Pages:")
print(filenames)


First Response (Primary Diagnosis):
- Primary diagnosis: Isler type II sacral fracture with limited lumbosacral instability
----------------------------------------------------------------
Second Response (Classification and Grading):
- Primary diagnosis: Isler type II sacral fracture with limited lumbosacral instability\n\n---------------------\n\n- Radiological classification system for this diagnosis: Isler Classification of Lumbosacral Instability\n\n---------------------\n\n- Grading in the classification system for this diagnosis: Type II\n\n---------------------\n\nExplanation: The Isler Classification of Lumbosacral Instability is used to assess the integrity and stability of the lumbosacral junction in patients with vertical sacral fractures that extend to involve the L5-S1 facets. It is particularly relevant when Denis zone 2 fractures propagate superiorly. In the case described, the CT scan shows a vertically oriented transforaminal fracture through the left sacrum with upwa