### Use OpenAI knowledge retrieval with gpt-4-1106-preview
Retrieval augments the Assistant with knowledge from outside its model, such as proprietary product information or documents provided by your users. Once a file is uploaded and passed to the Assistant, OpenAI will automatically chunk your documents, index and store the embeddings, and implement vector search to retrieve relevant content to answer user queries.\
https://platform.openai.com/docs/assistants/tools/knowledge-retrieval

In [51]:
import os
import json
import time
import openai


def show_json(obj):
    """Show *obj* as parsed JSON data.

    *obj* must expose ``model_dump_json()``. The JSON string it returns is
    parsed back into plain Python data and passed to ``display``.

    NOTE(review): ``display`` is not defined or imported in this file —
    presumably the IPython/Jupyter built-in; confirm before running this
    outside a notebook.
    """
    serialized = obj.model_dump_json()
    parsed = json.loads(serialized)
    display(parsed)

# Pretty printing helper
def pretty_print(messages):
    """Print the role and text of every message in *messages*.

    Output starts with a ``# Messages`` header, followed by one
    ``<role>: <text>`` entry per message and a trailing blank line.

    Args:
        messages: Iterable of objects with a ``role`` attribute and a
            ``content`` sequence. Content parts that carry text expose it as
            ``part.text.value``.

    A message may have no content parts, or parts without a ``text``
    attribute. Such parts are skipped instead of raising, and a message with
    no text at all is printed with the placeholder ``[no text content]``.
    When a message has several text parts they are joined with newlines.
    """
    print("# Messages")
    for m in messages:
        texts = [
            part.text.value
            for part in (m.content or [])
            if getattr(part, "text", None) is not None
        ]
        body = "\n".join(texts) if texts else "[no text content]"
        print(f"{m.role}: {body}")
    print()

def wait_on_run(run, thread, poll_interval=0.5):
    """Poll a run until its status is no longer ``queued`` or ``in_progress``.

    Args:
        run: Run object exposing ``id`` and ``status``.
        thread: Thread object exposing ``id``.
        poll_interval: Seconds to wait between two status fetches
            (defaults to the previously hard-coded 0.5).

    Returns:
        The most recently fetched run object, or *run* itself when it was
        already outside the two waiting states.

    Uses the module-level ``client`` to re-fetch the run. Any other status
    ends the loop, so callers should inspect ``status`` on the returned
    object rather than assume the run succeeded.
    """
    while run.status in ("queued", "in_progress"):
        # Sleep before re-fetching so the function returns as soon as a
        # finished status is observed, instead of pausing once more after it.
        time.sleep(poll_interval)
        run = client.beta.threads.runs.retrieve(
            thread_id=thread.id,
            run_id=run.id,
        )
    return run

# Module-level client; wait_on_run() reads this name when it re-fetches a run.
# The API key comes from the OPENAI_API_KEY environment variable.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Join the path from its components so it resolves on any OS; the separators
# were previously hard-coded as backslashes.
file_path = os.path.join("..", "..", "data", "pdf", "test", "2308.00479.pdf")
user_prompt = "Could you please summarize the article?"

# Upload a file with an "assistants" purpose.
# The context manager closes the local file handle once the upload call returns.
with open(file_path, "rb") as pdf_file:
    file = client.files.create(
        file=pdf_file,
        purpose="assistants",
    )

# Add the file to the assistant
# You can attach a maximum of 20 files per Assistant, and they can be at most 512 MB each.
# In addition, the size of all the files uploaded by your organization should not exceed 100GB.
# You can request an increase in this storage limit via the OpenAI help center.
assistant = client.beta.assistants.create(
    instructions="You are a customer support chatbot. Use your knowledge base to best respond to customer queries.",
    model="gpt-4-1106-preview",
    tools=[{"type": "retrieval"}],
    file_ids=[file.id],
)

# Alternative (kept for reference): create an empty thread first, then add the
# user message to it in a second call.
# thread = client.beta.threads.create()

# message = client.beta.threads.messages.create(
#   thread_id=thread.id,
#   role="user",
#   content=user_prompt,
#   file_ids=[file.id]
# )

# Create the thread with the user message, and the uploaded file attached to
# that message, in a single call.
thread = client.beta.threads.create(
    messages=[
        {
            "role": "user",
            "content": user_prompt,
            "file_ids": [file.id],
        }
    ]
)

run = client.beta.threads.runs.create(
    thread_id=thread.id,
    assistant_id=assistant.id,
)

# NOTE(review): wait_on_run() returns on any status other than "queued" or
# "in_progress"; run.status is not checked here, so a run that did not
# complete would simply yield no assistant reply below.
run = wait_on_run(run, thread)
messages = client.beta.threads.messages.list(thread_id=thread.id)

pretty_print(messages)
# messages.data[0].content[0].text.value

# Messages
assistant: The article discusses the application of Retrieval Augmented Generation (RAG) enhanced by Representative Vector Summarization (RVS) for unstructured textual data, particularly in the context of medical education. Large Language Models (LLMs), known for their impressive zero-shot learning capabilities in various tasks including content generation and chatbot functionality, can run into issues when applied to domain-specific tasks, such as producing factually incorrect information or being difficult to update once trained on a vast corpus of data.

To address these challenges, the authors propose a combined extractive and abstractive summarization method using representative vectors that improves upon direct summarization by existing LLMs, which may struggle with large documents due to context window limitations and the "Lost in the Middle" problem where facts in the middle of the context are overlooked. RVS selects a predefined number of representative chunks of te

### Use Azure Document Intelligence models
https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/use-sdk-rest-api?view=doc-intel-3.0.0&preserve-view=true%3Fpivots%3Dprogramming-language-python&tabs=windows&pivots=programming-language-python

In [None]:
# step 1 read PDF to memory
import os
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

# set `<your-endpoint>` and `<your-key>` variables with the values from the Azure portal
# Both settings are required: os.environ[...] raises KeyError if a variable is unset.
endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

# Azure AI sample document
# Joined from components so the path resolves on any OS; the separators were
# previously hard-coded as backslashes.
file_path = os.path.join("..", "..", "data", "pdf", "test", "2308.00479.pdf")
# file_formUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/rest-api/read.png"

# formatting function
def format_polygon(polygon):
    """Render a polygon as ``"[x, y], [x, y], ..."``.

    Args:
        polygon: Sequence of points exposing ``x`` and ``y`` attributes, or a
            falsy value (``None`` / empty sequence).

    Returns:
        The comma-separated point list, or ``"N/A"`` when *polygon* is falsy.
    """
    if polygon:
        corners = (f"[{point.x}, {point.y}]" for point in polygon)
        return ", ".join(corners)
    return "N/A"


def _print_read_result(result):
    """Print the content, style flags, pages, lines and words of *result*."""
    print("Document contains content: ", result.content)

    for style in result.styles:
        handwriting = "handwritten" if style.is_handwritten else "no handwritten"
        print(f"Document contains {handwriting} content")

    for page in result.pages:
        print(f"----Analyzing Read from page #{page.page_number}----")
        print(
            f"Page has width: {page.width} and height: {page.height}, "
            f"measured with unit: {page.unit}"
        )

        for line_idx, line in enumerate(page.lines):
            print(
                f"...Line # {line_idx} has text content '{line.content}' "
                f"within bounding box '{format_polygon(line.polygon)}'"
            )

        for word in page.words:
            print(f"...Word '{word.content}' has a confidence of {word.confidence}")

    print("----------------------------------------")


def analyze_read(file_path):
    """Analyze a document with the ``prebuilt-read`` model and print the result.

    Args:
        file_path: Either an ``http://`` / ``https://`` URL or the path of a
            local file. URLs are sent with
            ``begin_analyze_document_from_url``; local files are opened in
            binary mode and sent with ``begin_analyze_document`` using
            ``locale="en-US"``.

    Returns:
        The object returned by ``poller.result()``.

    Uses the module-level ``endpoint`` and ``key`` to build the client.
    """
    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    # Treat the argument as a URL only when it starts with a URL scheme; a
    # substring test would also match a local path that merely contains
    # "https://", and would miss plain "http://" URLs.
    if str(file_path).startswith(("http://", "https://")):
        poller = document_analysis_client.begin_analyze_document_from_url(
            "prebuilt-read", file_path
        )
    else:
        with open(file_path, "rb") as f:
            poller = document_analysis_client.begin_analyze_document(
                "prebuilt-read", document=f, locale="en-US"
            )

    result = poller.result()
    _print_read_result(result)
    return result


# Run the analysis when this code executes as the main module; the returned
# value is kept in the module-level name `result`.
if __name__ == "__main__":
    result = analyze_read(file_path)
   

In [56]:
# step 2 abstractive summarization
from azure.ai.textanalytics import TextAnalyticsClient


def sample_abstractive_summarization(document) -> None:
    """Request abstractive summaries for *document* and print them.

    Args:
        document: Value passed unchanged to ``begin_abstract_summary``
            (the caller in this file passes a one-element list of text).

    Reads ``AZURE_LANGUAGE_ENDPOINT`` and ``AZURE_LANGUAGE_KEY`` from the
    environment (``KeyError`` if either is unset). For every result of kind
    ``"AbstractiveSummarization"`` each summary text is printed; for results
    flagged as errors the error code and message are printed instead.
    """
    # [START abstract_summary]
    endpoint = os.environ["AZURE_LANGUAGE_ENDPOINT"]
    key = os.environ["AZURE_LANGUAGE_KEY"]
    text_analytics_client = TextAnalyticsClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(key),
    )

    poller = text_analytics_client.begin_abstract_summary(document)
    abstract_summary_results = poller.result()
    for doc_result in abstract_summary_results:
        if doc_result.kind == "AbstractiveSummarization":
            print("Summaries abstracted:")
            # Plain loop: the printing is a side effect, so there is no
            # reason to build a throwaway list with a comprehension.
            for summary in doc_result.summaries:
                print(f"{summary.text}\n")
        elif doc_result.is_error:
            print("...Is an error with code '{}' and message '{}'".format(
                doc_result.error.code, doc_result.error.message
            ))
    # [END abstract_summary]

# Summarize the text extracted by the read step: `result.content` is wrapped in
# a one-element list and handed to the summarization helper defined above.
sample_abstractive_summarization([result.content])

Summaries abstracted:
The document discusses the application of Retrieval Augmented Generation (RAG) in medical education, specifically in the field of medical education. The paper introduces a combined extractive and abstractive summarization method for large unstructured textual data using representative vectors. The method, called Representative Vector Summarization (RVS), is implemented in docGPT, a document intelligence program written in Python. The document also describes the process of retrieval and summarization, and the evaluation of the methods. The authors conclude that RVS could provide efficient methods for retrieving information quickly from large knowledgebases, particularly in the field of clinical medicine.

