<a href="https://colab.research.google.com/github/manasanagendran/goto_multimodal_rag/blob/main/Company_Insurance_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
pip install pymupdf pdf2image pillow openai langchain-community




In [14]:
from openai import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
import os,fitz,base64

In [15]:
import config

In [16]:
def encode_image_to_base64(image_path):
    with open(image_path, "rb") as img_f:
        return base64.b64encode(img_f.read()).decode("utf-8")

In [17]:
def describe_image(base64_image):
    """
    Uses OpenAI's GPT-4o model to generate a description of the image.
    """
    response = client.chat.completions.create(
      model="gpt-4o",
      messages=[
        { "role": "system", "content": "Your job is to extract all the information from the images, including the text. If you are unable to extract the image, do not add that  extract and add that information.Extract all the text from the image without changing the order or structure of the information. Recheck if all the text has been extracted correctly and return in the same presentation and structure as present in the original image.If the question asked is not present in the document give a response saying that it invalid "},
         { "role": "user",
          "content": [
            {"type": "text", "text": "Extract all the text from the image in the same structure as present in the image. and then after it summarise everything in brief, do not miss anything "},
            {
              "type": "image_url",
              "image_url": {
                "url": f"data:image/png;base64,{base64_image}",
              },
            },
          ],
        }
      ],
      max_tokens=300,
    )
    return response.choices[0].message.content

In [29]:
def extract_text_and_images(folder_name,pdf_path):
    doc = fitz.open(pdf_path)
    content_blocks = []
    # Create the output folder if it doesn't exist
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    for i, page in enumerate(doc):
        # Extract text
        text = page.get_text().strip()
        if text:
            content_blocks.append(f"\n\nPage {i + 1}:\n{text}")

        # Extract and encode images
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_img = doc.extract_image(xref)
            image_bytes = base_img["image"]
            ext = base_img["ext"]
            image_path = f"page_{i+1}_img_{img_index}.{ext}"
            image_filepath = os.path.join(folder_name, image_path)
            with open(image_filepath, "wb") as f:
                f.write(image_bytes)
            # Optionally describe image with captioning model or just store image path
            base64_img = encode_image_to_base64(image_filepath)
            image_description = describe_image(base64_img)
            content_blocks.append( f"\n\n[Image: {image_path}]\n{image_description}")
    return content_blocks

In [20]:
client = OpenAI(api_key=config.API_KEY)

In [30]:
folder_name="image_base64_folder"
pdf_path= "/content/Group Mediclaim Benefits Manual-PY24-25.pdf"
img_text_list=extract_text_and_images(folder_name,pdf_path)

In [None]:
img_text_list


In [38]:
!pip install langchain_openai chromadb

Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.34.1-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelemetry-sdk>=1.2.0 (from chromadb)
  Downloading opentelemetry_sdk-1.34.1-py3-none-any.whl.metadata (1.6 k

In [36]:
from langchain_openai import OpenAIEmbeddings
import tiktoken
embedding = OpenAIEmbeddings(api_key=config.API_KEY)

In [42]:
context="\n".join(img_text_list)

In [39]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Assume content_list is list of extracted text/image notes


text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_text(context)
 # or HuggingFaceEmbeddings, etc.
vectorstore = Chroma.from_texts(chunks, embedding, persist_directory="./chroma_pdf")
vectorstore.persist()

  vectorstore.persist()


In [None]:
vectorstore.search("what is the maternity benefit without add-on?",search_type='similarity')

In [56]:
def rag_chatbot(user_input, history):
    # Step 1: Retrieve relevant context
    docs = vectorstore.similarity_search(user_input, k=4)

 # Step 2: Construct conversation with memory
    messages = [
        {"role": "system", "content": "You are a helpful assistant answering questions from a PDF."}
    ]

    for q, a in history:
        messages.append({"role": "user", "content": q})
        messages.append({"role": "assistant", "content": a})

    messages.append({
        "role": "user",
        "content": f"Use the following PDF context to answer: \n{context}\n\nQuestion: {user_input}"
    })

    # Step 3: Get GPT answer
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=1000
    )
    reply = response.choices[0].message.content.strip()

    history.append((user_input, reply))
    return "", history

In [57]:
import gradio as gr

In [59]:
with gr.Blocks() as demo:
    gr.Markdown("## 📄 RAG Chatbot with Memory (PDF + ChromaDB + GPT-4o)")

    chatbot = gr.Chatbot()
    user_input = gr.Textbox(label="Ask a question", placeholder="e.g. What is the summary on page 2?")
    send_button = gr.Button("Send")

    send_button.click(rag_chatbot, inputs=[user_input, chatbot], outputs=[user_input, chatbot])

demo.launch(debug=True)

  chatbot = gr.Chatbot()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://c9c9fd73f67fa1d447.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7861 <> https://c9c9fd73f67fa1d447.gradio.live


