MULTIMODAL RAG

IMPORTING ESSENTIAL LIBRARIES

In [None]:
from langchain import hub
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
import os
import sys
import pickle
import camelot
from pdf2image import convert_from_path
import fitz
from transformers import pipeline


FUNCTIONS

In [None]:
# Image-captioning pipeline; split_file_to_chunks calls it on every image
# extracted from a PDF to obtain a text caption.
caption_pipe = pipeline(task="image-to-text", model="Salesforce/blip-image-captioning-base")

# Output locations: create them up front so later writes cannot fail on a
# missing directory.
output_file = "./data/vector_store_pmjay.pkl"
os.makedirs(name=os.path.dirname(output_file), exist_ok=True)
os.makedirs(name="image_store_pmjay", exist_ok=True)

# TODO: replace this sys.argv check with a proper argument parser.
index_top_k = False  # change to True if needed
# How many PDFs to index (only relevant when index_top_k is True); set only
# when at least one command-line argument is present.
num_files = 10 if len(sys.argv) > 1 else None

print(f"Indexing top k files: {num_files}, index_top_k: {index_top_k}")

cache_index = True

# Sentence-embedding model, pinned to CPU.
embedding_model = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
)

def _text_documents(file_path):
    """Return one Document per page of the PDF as loaded by PyPDFLoader."""
    # NOTE(review): the page number is copied straight from PyPDFLoader's
    # metadata, which appears to be 0-based, while image pages below are
    # 1-based (and Camelot's table pages look 1-based too) -- confirm and
    # normalise if downstream code compares pages across modalities.
    return [
        Document(
            page_content=page.page_content,
            metadata={
                "pdf": file_path,
                "page": page.metadata["page"],
                "modality": "text",
            },
        )
        for page in PyPDFLoader(file_path).load()
    ]


def _table_documents(file_path):
    """Return one Document per table Camelot finds, rendered as plain text."""
    return [
        Document(
            page_content=table.df.to_string(),
            metadata={
                "pdf": file_path,
                "page": table.page,
                "modality": "table",
            },
        )
        for table in camelot.read_pdf(file_path, pages="all")
    ]


def _image_caption_documents(file_path, image_dir):
    """Save every embedded image to ``image_dir`` and return caption Documents."""
    # The directory must exist before the first image is written; the
    # module-level setup only creates "image_store_pmjay", not "image_store".
    os.makedirs(image_dir, exist_ok=True)

    pdf_name = os.path.basename(file_path)
    caption_docs = []

    # Context manager guarantees the PDF handle is released, even on error.
    with fitz.open(file_path) as pdf:
        for page_num, page in enumerate(pdf, start=1):
            for img_index, img in enumerate(page.get_images(full=True)):
                # First item of each image entry is the xref used for extraction.
                extracted = pdf.extract_image(img[0])

                image_id = f"{pdf_name}_p{page_num}_i{img_index}"
                image_path = os.path.join(
                    image_dir, f"{image_id}.{extracted['ext']}"
                )
                with open(image_path, "wb") as image_file:
                    image_file.write(extracted["image"])

                caption = caption_pipe(image_path)[0]["generated_text"]
                caption_docs.append(
                    Document(
                        page_content=caption,
                        metadata={
                            "pdf": file_path,
                            "page": page_num,
                            "image_id": image_id,
                            "modality": "image_caption",
                        },
                    )
                )

    return caption_docs


def split_file_to_chunks(
    file_path,
    *,
    chunk_size=500,
    chunk_overlap=50,
    image_dir="image_store",
):
    """Extract text, tables and image captions from a PDF and split them into chunks.

    Three kinds of Document are collected, in this order: page text
    (PyPDFLoader), tables (Camelot, rendered with ``DataFrame.to_string``)
    and captions of embedded images (extracted with PyMuPDF, captioned with
    the module-level ``caption_pipe``). Each Document's metadata carries
    ``pdf``, ``page`` and ``modality``; image captions also carry
    ``image_id``. Extracted images are written to ``image_dir`` as a side
    effect.

    Args:
        file_path: Path to the PDF file to process.
        chunk_size: Maximum chunk size passed to RecursiveCharacterTextSplitter.
        chunk_overlap: Overlap between consecutive chunks.
        image_dir: Directory where extracted images are stored; created if
            it does not exist.

    Returns:
        The list of chunked Documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    docs = [
        *_text_documents(file_path),
        *_table_documents(file_path),
        *_image_caption_documents(file_path, image_dir),
    ]

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)
