In [1]:
from PyPDF2 import PdfReader
import os, re, io
import tempfile
from dotenv import load_dotenv
from pdf2image import convert_from_path
import fitz
import textwrap
from collections import OrderedDict

import sys
sys.path.append('../') 
from my_rag import RAG

# Pull variables from a local .env file into the process environment, then read
# the OpenAI key. `.get` yields None (rather than raising KeyError) when the
# variable is missing.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# --- File locations (relative to the notebook's working directory) ---
DBPATH = '../data/db_file.db'
question_file = '../assets/question_doc.txt'
pdf_storage_dir = 'data/pdfs'  # NOTE(review): not referenced in the visible cells

# --- Chunking / retrieval parameters handed to the RAG constructor ---
chunk_size = 500    # characters per chunk (default)
chunk_overlap = 25  # default overlap
top_k = 3           # number of chunks found in semantic search

# --- Model identifiers ---
model = 'all-MiniLM-L6-v2'     # passed to RAG as `embedding_model`
llm_model = 'gpt-3.5-turbo'    # NOTE(review): not referenced in the visible cells




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_questions(question_file):
    """Read single-line question/answer pairs from a text file.

    A line starting with ``Q:`` becomes the pending question (replacing any
    earlier question that never received an answer). The next line starting
    with ``A:`` closes the pair. Every other line is ignored, as is an ``A:``
    line with no pending question.

    Args:
        question_file: Path of the text file to read.

    Returns:
        A list of ``(question, answer)`` tuples; both strings are stripped and
        keep their ``Q:`` / ``A:`` prefixes.
    """
    with open(question_file, 'r') as handle:
        raw_lines = handle.readlines()

    collected = []
    pending = None
    for raw in raw_lines:
        if raw.startswith('Q:'):
            pending = raw.strip()
            continue
        if raw.startswith('A:') and pending:
            collected.append((pending, raw.strip()))
            pending = None

    print(f'A total of {len(collected)} has been successfully loaded.')
    return collected

In [20]:
def _parse_qa_lines(lines):
    """Group an iterable of text lines into (question, answer) tuples.

    Rules, applied line by line:
      * ``Q:`` line -- if a question is already open, it is emitted together
        with whatever answer text has accumulated (possibly empty) and the
        answer buffer is cleared; the line then becomes the open question.
      * ``A:`` line -- replaces the answer buffer.
      * any other line -- is appended to the answer buffer, separated by a
        space (this is how multi-line answers are joined).
    After the last line, the open question is emitted only if the answer
    buffer is non-empty.
    """
    pairs = []
    current_question = None
    current_answer = ""

    for line in lines:
        if line.startswith('Q:'):
            if current_question:
                pairs.append((current_question, current_answer.strip()))
                current_answer = ""
            current_question = line.strip()
        elif line.startswith('A:'):
            current_answer = line.strip()
        else:
            current_answer += " " + line.strip()

    # Add the last Q-A pair if it exists.
    if current_question and current_answer:
        pairs.append((current_question, current_answer.strip()))

    return pairs


def read_question_answer_pairs(file_path):
    """Read (possibly multi-line) question/answer pairs from a text file.

    The file is decoded as UTF-8 first (a leading byte-order mark is
    tolerated). If it is not valid UTF-8, it is re-read as ISO-8859-1.
    ISO-8859-1 assigns a character to every byte, so it has to be the
    fallback: tried first, it never fails and silently garbles UTF-8 text.
    Each attempt parses from scratch, so a decode error part-way through the
    file cannot leave partial or duplicated pairs behind.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of ``(question, answer)`` tuples. Strings are stripped and keep
        their ``Q:`` / ``A:`` prefixes; continuation lines of an answer are
        joined with single spaces.

    Raises:
        OSError: If the file cannot be opened.
    """
    try:
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            pairs = _parse_qa_lines(file)
    except UnicodeDecodeError:
        with open(file_path, 'r', encoding='ISO-8859-1') as file:
            pairs = _parse_qa_lines(file)

    print(f'Read a total of {len(pairs)} Q-A pairs.')
    return pairs


In [21]:
def generate_rag_responses(questions, ragclass):
    """Collect ``ragclass.generate_response(q)`` for every question, in order.

    Args:
        questions: Iterable of question values passed through unchanged.
        ragclass: Object exposing a ``generate_response(question)`` method.

    Returns:
        A list with one response per question, in input order.
    """
    return [ragclass.generate_response(item) for item in questions]

def generate_llm_responses(questions, ragclass):
    """Collect ``ragclass.integrate_llm(q)`` for every question, in order.

    Args:
        questions: Iterable of question values passed through unchanged.
        ragclass: Object exposing an ``integrate_llm(question)`` method.

    Returns:
        A list with one response per question, in input order.
    """
    return [ragclass.integrate_llm(item) for item in questions]


In [22]:
# Parse the question file, then echo every pair as a visual sanity check.
qa_pairs = read_question_answer_pairs(question_file)
for question_text, answer_text in qa_pairs:
    print(f"Question: {question_text}\nAnswer: {answer_text}\n")

Read a total of 50 Q-A pairs.
Question: Q: Where do I change my monitor configuration settings?
Answer: A: To adjust the windows that allow for viewing within the Visage 7 client, navigate to the upper left-hand corner and choose File > Preferences> Monitor Configuration. This will allow you to customize your monitors for viewing, displaying the study browser, study navigator, and export.

Question: Q: Where do I change my monitor configuration settings for viewing on a Visage 7 client?
Answer: A: To adjust the windows that allow for viewing within the Visage 7 client, navigate to the upper left-hand corner and choose File > Preferences> Monitor Configuration. This will allow you to customize your monitors for viewing, displaying the study browser, study navigator, and export.

Question: Q: How do I access priors for a patient?
Answer: A: Patient priors can be accessed from the Study Navigator, or from the Thumbnail Browser at the bottom of the viewer.

Question: Q: How do I access pri

In [6]:
# Build the RAG object from the configuration constants; one keyword per line
# so each setting is easy to spot and change.
rag = RAG(
    db_path=DBPATH,
    llm_api_key=OPENAI_API_KEY,
    embedding_model=model,
    chunk_size=chunk_size,
    overlap=chunk_overlap,
    top_k=top_k,
)


In [23]:
# Split the (question, answer) tuples into two index-aligned lists.
question_list = [q_text for q_text, _a_text in qa_pairs]
answer_list = [a_text for _q_text, a_text in qa_pairs]



In [8]:
# Send every question through both paths of the same RAG instance:
# `generate_response` (via generate_rag_responses) and `integrate_llm`
# (via generate_llm_responses). Results stay index-aligned with question_list.
rag_response_list = generate_rag_responses(questions=question_list, ragclass=rag)
llm_response_list = generate_llm_responses(questions=question_list, ragclass=rag)


In [10]:
# structured_output = ''
# for i, qa in enumerate(question_list):
#     structured_output += ('\n' + question_list[i])
#     structured_output += ('\n' + 'RAG response: ' + rag_response_list[i]+'\n')
#     structured_output += ('\n' + 'LLM response: ' + llm_response_list[i]+'\n') 

# with open('../assets/output_file.txt', 'w') as file:
#     file.write(structured_output)


In [25]:

# Write each question with its human, RAG, and LLM answers to an output file for grading.

# The four lists are paired by position. Fail loudly if they have drifted apart
# (e.g. response lists left over from an earlier run against a different
# question file) instead of pairing answers with the wrong question.
if not (len(question_list) == len(answer_list)
        == len(rag_response_list) == len(llm_response_list)):
    raise ValueError(
        'Question/answer/response lists differ in length: '
        f'{len(question_list)} questions, {len(answer_list)} human answers, '
        f'{len(rag_response_list)} RAG responses, {len(llm_response_list)} LLM responses. '
        'Re-run the cells that build them.'
    )

wrapper = textwrap.TextWrapper(width=80)

# One text entry per question; only the answer bodies are wrapped (the label is
# prepended afterwards, so the first line of each answer may exceed 80 columns).
structured_output = ''.join(
    '\n' + question_text
    + '\n' + 'human answer: ' + '\n'.join(wrapper.wrap(human_text))
    + '\n' + 'RAG response: ' + '\n'.join(wrapper.wrap(rag_text))
    + '\n' + 'LLM response: ' + '\n'.join(wrapper.wrap(llm_text))
    + '\n'
    for question_text, human_text, rag_text, llm_text in zip(
        question_list, answer_list, rag_response_list, llm_response_list
    )
)

# Explicit UTF-8: answers may contain non-ASCII characters, and the platform
# default encoding (e.g. cp1252 on Windows) could fail to encode them.
with open('../assets/output_file2.txt', 'w', encoding='utf-8') as file:
    file.write(structured_output)