MULTIMODAL RAG

IMPORTING ESSENTIAL LIBRARIES

In [1]:
from langchain import hub
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
import os
import sys
import pickle
import camelot
from pdf2image import convert_from_path
import fitz
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import io
import torch

  from .autonotebook import tqdm as notebook_tqdm


CREATING DIRECTORIES


In [2]:
# ---------------------------------------------------------------------------
# Keyword lists used to decide which modality a question needs.
# They are defined once at module level so that route_query() and any other
# cell share a single source of truth. Matching is plain substring matching
# on the lower-cased question.
# ---------------------------------------------------------------------------
TEXT_ONLY_KEYWORDS = [ "explain", "describe", "what is", "definition", "meaning", "overview", "summary", "details", "guidelines", "rules", "policy", "procedure", "steps", "process",
 "eligibility","benefits","limitations","scope","features","objectives","criteria","conditions","notes","concept","theory"]

IMAGE_ONLY_KEYWORDS = [
"image","picture","photo","figure only","just the image","show the image","display image","visual only","see the image","look at this","screenshot only"
]

IMAGE_TEXT_KEYWORDS = [
    "diagram",    "flowchart",    "workflow",    "architecture",    "as shown",    "as illustrated","as depicted","refer to the figure", 
       "with diagram",    "with image",    "illustration",    "visual explanation",    "step by step diagram",   "process diagram"
]

IMAGE_CAPTION_KEYWORDS = [
    "describe the image",
    "what does the image show",
    "image description",
    "caption",
    "explain the figure",
    "explain the image",
    "what is shown in the picture",
    "figure explanation",
    "visual description"
]

TABLE_ONLY_KEYWORDS = [
    "table","tabular","list of","comparison table","compare","difference between","rows","columns","statistics",
    "data","figures","values","rates","charges","amount","cost","price","limits","coverage","package list"
]


GRAPH_KEYWORDS = [
    "graph",
    "chart",
    "bar chart",
    "line graph",
    "pie chart",
    "histogram",
    "trend",
    "distribution",
    "growth",
    "comparison graph",
    "visual trend",
    "plot"
]

MAP_KEYWORDS = [
    "map",
    "location",
    "geographical",
    "region",
    "state wise",
    "district wise",
    "coverage map",
    "area",
    "zones",
    "boundaries",
    "spatial",
    "location based"
]

TEXT_TABLE_KEYWORDS = [
    "explain with table",
    "tabular explanation",
    "table with explanation",
    "compare and explain",
    "summarize in table",
    "data with explanation"
]


def route_query(question):
    """Classify a user question by the modality needed to answer it.

    Args:
        question: The raw question string. It is lower-cased before matching.

    Returns:
        One of:
          - "image_caption"   : the question asks for a description of an image
          - "image_required"  : the question asks to see an image
          - "text_plus_image" : the question refers to a diagram/illustration
          - "text_only"       : nothing above matched (default)
    """
    q = question.lower()

    # Caption phrases must be tested BEFORE the image-only list: that list
    # contains the bare words "image" and "picture", which are substrings of
    # most caption phrases ("describe the image", "what is shown in the
    # picture", ...). Testing image-only first made "image_caption"
    # unreachable for those phrases.
    if any(k in q for k in IMAGE_CAPTION_KEYWORDS):
        return "image_caption"

    if any(k in q for k in IMAGE_ONLY_KEYWORDS):
        return "image_required"

    # NOTE: "with image" in IMAGE_TEXT_KEYWORDS is still subsumed by the bare
    # "image" keyword above; explicit "only the image" requests take priority.
    if any(k in q for k in IMAGE_TEXT_KEYWORDS):
        return "text_plus_image"

    # TODO: GRAPH_KEYWORDS, MAP_KEYWORDS, TABLE_ONLY_KEYWORDS and
    # TEXT_TABLE_KEYWORDS are defined above but not routed yet.
    return "text_only"


In [3]:
# BLIP image-captioning model: `processor` prepares images and decodes token
# ids, `model` generates the caption tokens. Both are used by caption_image().
processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)

# Where the pickled vector store is cached, and where extracted PDF images go.
output_file = "./data/vector_store_pmjay.pkl"
os.makedirs(os.path.dirname(output_file), exist_ok=True)
os.makedirs("image_store_pmjay", exist_ok=True)

# Indexing limits.
# These used to be set behind `if len(sys.argv) > 1:`, a leftover from a
# command-line script. The recorded output of this cell shows that branch was
# taken when run in the notebook (num_files was 20), so the values are now
# stated directly instead of depending on how the kernel was launched.
index_top_k = False   # change to True to stop after `num_files` PDFs
num_files = 20        # how many PDFs to index (only used if index_top_k=True)

print(f"Indexing top k files: {num_files}, index_top_k: {index_top_k}")

# When True, the built vector store is pickled to `output_file`.
cache_index = True

# Sentence-embedding model used to build and query the FAISS index (CPU only).
embedding_model = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2", model_kwargs={"device": "cpu"})





Loading weights: 100%|██████████| 473/473 [00:00<00:00, 552.57it/s, Materializing param=vision_model.post_layernorm.weight]                                       
[1mBlipForConditionalGeneration LOAD REPORT[0m from: Salesforce/blip-image-captioning-base
Key                                       | Status     |  | 
------------------------------------------+------------+--+-
text_decoder.bert.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Indexing top k files: 20, index_top_k: False


  embedding_model = HuggingFaceEmbeddings(
Loading weights: 100%|██████████| 103/103 [00:02<00:00, 35.91it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [4]:
def caption_image(image_path):
    """Generate a caption for the image stored at ``image_path``.

    Uses the module-level BLIP ``processor`` and ``model``.

    Args:
        image_path: Path to an image file readable by PIL.

    Returns:
        A list of decoded caption strings as returned by
        ``processor.batch_decode``. Callers take element ``[0]`` for the
        single input image.
    """
    # Open inside a context manager so the file handle is released right
    # away; convert() yields an independent in-memory RGB image.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    inputs = processor(image, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=30)
    caption = processor.batch_decode(out, skip_special_tokens=True)
    return caption

FUNCTIONS

In [None]:
def split_file_to_chunks(file_path):
    """Extract text, tables and image captions from one PDF and chunk them.

    Three kinds of ``Document`` are built, each tagged in ``metadata`` with
    the source ``pdf``, a 1-based ``page`` number and a ``modality``
    ("text", "table" or "image_caption"):

      * text   - one document per page, loaded with ``PyPDFLoader``
      * tables - one document per table found by ``camelot``, rendered with
                 ``DataFrame.to_string()``
      * images - every embedded image is written to ``image_store_pmjay/``
                 and captioned with ``caption_image``; the caption becomes the
                 document content and ``image_id`` is stored in the metadata

    Args:
        file_path: Path to the PDF file.

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``
        (chunk_size=500, chunk_overlap=50) over all documents above.
    """
    docs = []

    # -------- TEXT --------
    for text_page in PyPDFLoader(file_path).load():
        docs.append(
            Document(
                page_content=text_page.page_content,
                metadata={
                    "pdf": file_path,
                    # PyPDFLoader numbers pages from 0, while the table and
                    # image documents below use 1-based pages; shift by one so
                    # every modality reports the same page number.
                    "page": text_page.metadata["page"] + 1,
                    "modality": "text"
                }
            )
        )

    # -------- TABLES --------
    tables = camelot.read_pdf(file_path, pages="all")

    for table in tables:
        docs.append(
            Document(
                page_content=table.df.to_string(),
                metadata={
                    "pdf": file_path,
                    "page": table.page,
                    "modality": "table"
                }
            )
        )

    # -------- IMAGES (PyMuPDF) --------
    # The context manager closes the PDF handle even if captioning fails.
    with fitz.open(file_path) as pdf:
        for page_index, page in enumerate(pdf):
            page_num = page_index + 1
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base = pdf.extract_image(xref)

                # The file keeps the image's native extension (base['ext']),
                # so it is not necessarily a .png.
                image_id = f"{os.path.basename(file_path)}_p{page_num}_i{img_index}"
                image_path = f"image_store_pmjay/{image_id}.{base['ext']}"

                with open(image_path, "wb") as f:
                    f.write(base["image"])

                # caption_image returns a list; take the single caption.
                caption = caption_image(image_path)[0]

                docs.append(
                    Document(
                        page_content=caption,
                        metadata={
                            "pdf": file_path,
                            "page": page_num,
                            "image_id": image_id,
                            "modality": "image_caption"
                        }
                    )
                )

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50)

    chunks = splitter.split_documents(docs)
    return chunks


: 

ITERATING OVER EVERY PDF AND BUILDING THE INDEX


In [None]:
# Walk the PM_JAY folder, chunk every PDF, and build the FAISS index.
all_chunks = []
count = 0
early_exit = False
for folder_name, _, file_names in os.walk("PM_JAY"):
    for file in file_names:
        if not file.endswith(".pdf"):
            continue
        file_path = os.path.join(folder_name, file)
        print("Loading: ", file_path)
        chunks = split_file_to_chunks(file_path)
        all_chunks.extend(chunks)
        count += 1
        # Optional cap on the number of indexed PDFs (see index_top_k / num_files).
        if index_top_k and count >= num_files:
            early_exit = True
            break
    if early_exit:
        break

print("Total chunks:", len(all_chunks))
if not all_chunks:
    raise ValueError("No document found! Check your PDF folder.")
else:
    print("Sample chunk:\n", all_chunks[0].page_content[:300])
print("Indexing files")

# The classmethod is `from_documents` (plural); `from_document` does not exist
# and raised AttributeError, leaving `vector_store_pmjay` undefined for the
# cells below.
vector_store_pmjay = FAISS.from_documents(all_chunks, embedding=embedding_model)

if cache_index:
    print("Caching index")
    # Use a context manager so the cache file is flushed and closed.
    # NOTE(review): only unpickle this file from a trusted location.
    with open(output_file, "wb") as cache_file:
        pickle.dump(vector_store_pmjay, cache_file)

print("Done...")

Loading:  PM_JAY/CLAIMS Pdfs/84_Guidelines-on-Claim-Settlement.pdf
Loading:  PM_JAY/CLAIMS Pdfs/PM-JAY Process Flow at Empanelled Hospitals.pdf
Loading:  PM_JAY/CLAIMS Pdfs/Claims adjudication manual.pdf


  if self._document_has_no_text():
  if self._document_has_no_text():
  if self._document_has_no_text():
  if self._document_has_no_text():
  if self._document_has_no_text():


Loading:  PM_JAY/CLAIMS Pdfs/TMS User Manual (National Transaction Management System).pdf




Loading:  PM_JAY/CLAIMS Pdfs/BIS Card Generation.pdf




Loading:  PM_JAY/CLAIMS Pdfs/Approvers in TMS Manual (PDF).pdf
Loading:  PM_JAY/Registration pdf's/BIS-2.0-Mobile-App-Usermanual_ver13.0.pdf




Loading:  PM_JAY/Registration pdf's/hissar_15840.pdf
Loading:  PM_JAY/Registration pdf's/DISCHARGE SUMMARY FORM.pdf
Loading:  PM_JAY/Registration pdf's/HOW_TO_APPLY_AYUSHMAN_CARD.pdf
Loading:  PM_JAY/Registration pdf's/PM-JAY Process Flow at Empanelled Hospitals.pdf
Loading:  PM_JAY/Registration pdf's/Health benefit packages & Empanelment criteria for AB-PMJAY.pdf


In [None]:
# Ask the user for a question and fetch the 5 most similar chunks.
# (A first, default `as_retriever()` call was removed: its result was
# overwritten before ever being used.)
print("input")
query = input("E: ") #"HTML forms and input types"
retriever = vector_store_pmjay.as_retriever(search_kwargs={"k": 5})
docs = retriever.invoke(query)


NameError: name 'vector_store_pmjay' is not defined

In [None]:
# Print every retrieved chunk and, for image captions, show the source image.
for i, doc in enumerate(docs):
    print(f"\n--- Result {i+1} ---")
    print("Page:", doc.metadata.get("page"))
    print(doc.page_content)

    # Use the loop variable `doc`; `docs` is the whole result list and has no
    # `.metadata` attribute. `.get` tolerates chunks without a modality tag.
    if doc.metadata.get("modality") == "image_caption":
        image_id = doc.metadata["image_id"]
        # Images are saved as "<image_id>.<ext>" with the PDF's native
        # extension, which is not always "png". Resolve the real file by
        # matching the stem instead of guessing the extension.
        matches = [
            name for name in os.listdir("image_store_pmjay")
            if os.path.splitext(name)[0] == image_id
        ]
        if matches:
            image_path = os.path.join("image_store_pmjay", matches[0])
            display(Image.open(image_path))
        else:
            print(f"(image file for {image_id} not found in image_store_pmjay)")
