# CBCTChat Demo Script
This script provides a demonstration of the CBCTChat system that can query
two different models with a given input text and display the results.


---


**Not for medical use. Data is processed by OpenAI.**

**Ensure that the S2 CBCT Guideline is copied into the data folder and the OpenAI API key is set up before executing.**



# Required Libraries
Ensure these libraries are installed for the script to function correctly.


In [1]:
# Install the third-party packages this notebook imports; -q reduces pip's output.
# NOTE(review): no versions are pinned, so a fresh install may resolve to releases
# whose API differs from the one this notebook was written against - consider pinning.
!pip install llama_index -q
!pip install langchain -q
!pip install PyPDF2 -q
!pip install pypdf -q
!pip install backoff -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m796.4/796.4 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.8/143.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.3/43.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fol

# Setting up API Key
For security reasons, it's best to not hard-code API keys.
Consider using environment variables or external configuration.

In [2]:
import os
import openai
# Placeholder only: enter your own key here (ideally loaded from outside the
# notebook rather than typed into it).
os.environ["OPENAI_API_KEY"] = 'ENTER_API_CODE_HERE'
# The environment variable alone is sometimes not picked up, so the same
# value is also assigned directly on the openai module.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Create Index
Assumes the guideline PDF file is located in the `data/` folder (relative to the working directory).

In [20]:
import logging
import sys
import time
import pandas as pd
import pypdf
import backoff
import PyPDF2

from llama_index import (GPTVectorStoreIndex, LLMPredictor,
                         SimpleDirectoryReader, ServiceContext,
                         StorageContext, load_index_from_storage)

from llama_index.llms import OpenAI
from llama_index.embeddings import OpenAIEmbedding


# Logging setup: INFO-level records go to stdout so they appear in the
# notebook output; an explicit stdout handler is also attached to the root
# logger.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

# Constants and configuration
chunk_size = 256        # chunk size handed to the indexing service context
seconds = 6             # not referenced in the cells shown here
TIMEOUT = 120           # Timeout for API requests
INPUT_FOLDER = 'data'   # folder the guideline documents are read from



def get_pdf_metadata(filepath):
    """
    Extract metadata from a given PDF file.

    The entries of the PDF's document information are copied into a plain
    dict. The '/Title' entry is stored under the key 'title'; every other
    entry keeps its original key. If the PDF has no document information,
    or its title is missing or empty, the file's base name is used as the
    title so that every document ends up with a usable 'title'.

    Args:
    - filepath (str): Path to the PDF file.

    Returns:
    - dict: A dictionary containing metadata of the PDF; it always has a
      non-empty 'title' entry.
    """
    with open(filepath, 'rb') as f:
        # PdfReader.metadata is None when the PDF carries no document
        # information, so fall back to an empty mapping.
        info = PyPDF2.PdfReader(f).metadata or {}
        # Build the dict while the file is still open.
        metadata = {
            ('title' if key == '/Title' else key): value
            for key, value in info.items()
        }

    # A missing *or empty* title would make the document unidentifiable
    # downstream, so use the file name in both cases.
    if not metadata.get('title'):
        metadata['title'] = os.path.basename(filepath)
    return metadata

# Service context used for indexing: chunk size from the constants above and
# OpenAI embeddings with an embedding batch size of 150.
service_context = ServiceContext.from_defaults(
    chunk_size=chunk_size,
    embed_model=OpenAIEmbedding(embed_batch_size=150),
)

# Load the documents from INPUT_FOLDER (per-file metadata comes from
# get_pdf_metadata), build the vector index, and persist it to ./DVT_index.
documents = SimpleDirectoryReader(
    INPUT_FOLDER,
    file_metadata=get_pdf_metadata,
).load_data()
index = GPTVectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)
index.storage_context.persist(persist_dir="./DVT_index")


# CBCTChat

In [22]:
import time
from IPython.display import display, HTML
from llama_index import QuestionAnswerPrompt, RefinePrompt
from langchain.chat_models import ChatOpenAI


# The two chat models that are queried, and the temperature used for both.
MODEL1, MODEL2 = "gpt-3.5-turbo", "gpt-4"
TEMP = 0.6

# German prompt templates.
#
# Question-answer prompt: shows the retrieved context ({context_str}) between
# separator lines and asks the model to answer the question ({query_str}).
GERMAN_QA_PROMPT_TMPL = (
    "Wir haben unten Kontextinformationen bereitgestellt. \n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Angesichts dieser Informationen, bitte beantworte folgende Frage: {query_str}\n"
)
GERMAN_QA_PROMPT = QuestionAnswerPrompt(GERMAN_QA_PROMPT_TMPL)

# Refine prompt: restates the question ({query_str}) and the existing answer
# ({existing_answer}), shows additional context ({context_msg}), and asks the
# model to refine the answer or repeat it unchanged if the context is not
# useful.
GERMAN_REFINE_PROMPT_TMPL = (
    "Die ursprüngliche Frage lautet wie folgt: {query_str}\n"
    "Wir haben eine ursprüngliche Antwort bereitgestellt: {existing_answer}\n"
    "Wir haben die Möglichkeit, die ursprüngliche Antwort zu verfeinern (nur wenn nötig) "
    "mit etwas mehr Kontext unten.\n"
    "------------\n"
    "{context_msg}\n"
    "------------\n"
    "Angesichts des neuen Kontextes, verfeinern Sie die ursprüngliche Antwort "
    "um die Frage besser zu beantworten. Wenn der Kontext nicht nützlich ist, "
    "wiederhole exakt die ursprüngliche Antwort."
)
GERMAN_REFINE_PROMPT = RefinePrompt(GERMAN_REFINE_PROMPT_TMPL)

# Default query engine on top of the index: uses the indexing service
# context, the German prompts, "compact" response mode and
# similarity_top_k=10.
query_engine = index.as_query_engine(
    service_context=service_context,
    text_qa_template=GERMAN_QA_PROMPT,
    refine_template=GERMAN_REFINE_PROMPT,
    response_mode="compact",
    similarity_top_k=10,
)

# Utility function to extract and format filenames from a given response.
def get_filenames(response):
    """
    Build an HTML list of source documents (with page links) for a response.

    Each entry of ``response.metadata`` is expected to be a dict that may
    carry a "title" and a "page_label". Entries without a title are
    skipped. Page labels are collected per title (duplicates removed) and
    listed in natural order: numeric labels by value ("2" before "10"),
    any other labels after them in alphabetical order.

    Titles and page labels are percent-encoded inside URLs and HTML-escaped
    where shown as text, so unusual characters cannot break the markup.

    Args:
    - response (object): Response object with a ``metadata`` attribute
      (a dict of per-source metadata dicts, or None).

    Returns:
    - str: Comma-separated HTML links, one per title, each optionally
      followed by "(Seiten: ...)" with one link per page. Empty string if
      there is nothing to list.
    """
    # Local imports keep this notebook cell self-contained.
    from html import escape
    from urllib.parse import quote

    base_url = "https://register.awmf.org/assets/guidelines/"

    def page_sort_key(label):
        # Numeric labels compare by value; everything else sorts afterwards.
        if label.isdecimal():
            return (0, int(label), "")
        return (1, 0, label)

    # Group page labels by title; the dict keeps titles in first-seen order.
    pages_by_title = {}
    for metadata in (response.metadata or {}).values():
        title = metadata.get("title")
        if not title:
            continue
        pages = pages_by_title.setdefault(str(title), set())
        page_label = metadata.get("page_label")
        if page_label:
            pages.add(str(page_label))

    # Format each title as a clickable link followed by its page links.
    entries = []
    for title, pages in pages_by_title.items():
        url = base_url + quote(title)
        title_link = f'<a href="{url}" target="_blank">{escape(title)}</a>'
        if pages:
            page_links = [
                f'<a href="{url}#page={quote(page)}" target="_blank">{escape(page)}</a>'
                for page in sorted(pages, key=page_sort_key)
            ]
            entries.append(f'{title_link} (Seiten: {", ".join(page_links)})')
        else:
            entries.append(title_link)

    return ", ".join(entries)

# One LLM predictor and service context per model; both chat models are
# created with the same temperature (TEMP).

# MODEL1
llm_predictor_model1 = LLMPredictor(
    llm=ChatOpenAI(model_name=MODEL1, temperature=TEMP),
)
service_context_model1 = ServiceContext.from_defaults(
    llm_predictor=llm_predictor_model1,
)

# MODEL2
llm_predictor_model2 = LLMPredictor(
    llm=ChatOpenAI(model_name=MODEL2, temperature=TEMP),
)
service_context_model2 = ServiceContext.from_defaults(
    llm_predictor=llm_predictor_model2,
)

def _query_and_display(input_text, model_name, service_context):
    """
    Query the index with one model and display the answer and its sources.

    Shared implementation behind query_single_text_model1 and
    query_single_text_model2.

    Args:
    - input_text (str): Text input for the query.
    - model_name (str): Model name shown in the results heading.
    - service_context (object): Service context of the model to query with.
    """
    import html  # local import keeps this notebook cell self-contained

    display(HTML(f"<b>Results from CBCT using {model_name}:</b>"))
    engine = index.as_query_engine(
        service_context=service_context,
        text_qa_template=GERMAN_QA_PROMPT,
        refine_template=GERMAN_REFINE_PROMPT,
        response_mode="compact",
        similarity_top_k=10,
    )

    # Query the index and get the response
    response = engine.query(input_text)
    # The answer is free text produced by the model; escape it so characters
    # such as "<" or "&" are shown literally instead of being parsed as HTML.
    answer = html.escape(str(response.response))

    # get_filenames returns ready-made HTML links, so it is inserted as-is.
    sources = get_filenames(response)

    # Display the outputs in HTML
    display(HTML(f"<b>Result:</b> {answer}"))
    display(HTML(f"<p><b>Filenames:</b> {sources}</p>"))


def query_single_text_model1(input_text):
    """
    Query the first model with the given input text and display results.

    Args:
    - input_text (str): Text input for the query.
    """
    _query_and_display(input_text, MODEL1, service_context_model1)


def query_single_text_model2(input_text):
    """
    Query the second model with the given input text and display results.

    Args:
    - input_text (str): Text input for the query.
    """
    _query_and_display(input_text, MODEL2, service_context_model2)


# Run CBCTChat with Question Answer input

In [23]:
# Example question (German): "Should a CBCT scan be performed on children at
# every dental visit?"
input_text = "Sollte eine DVT bei Kindern bei jedem Zahnarztbesuch durchgeführt werden?"

# Put the same question to both models, first model first.
for ask_model in (query_single_text_model1, query_single_text_model2):
    ask_model(input_text)