In [1]:
import fitz  # PyMuPDF
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Taking the key to connect with OpenAI to use them 

In [3]:
load_dotenv()

True

In [4]:
#Initializing the Model becuase as we need to use the model so key is loaded now we enter in the room and selected the mode which is
#CLIP and now we are initialing it

In [5]:
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_preprocess = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model.eval()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Fetching 1 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [6]:
#Now the next step is to embedd the images and text for that matter we need to make two functions which 

In [7]:
def embed_image(image_data):
    if isinstance(image_data , str):
        image =Image.open(image_data).convert("RGB")
    else:
        image = image_data

    inputs = clip_preprocess(images = image, return_tensors="pt")

    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
        features = features / features.norm(dim= -1 , keepdim=True)
        return features.squeeze().numpy()

In [8]:
#Lets do embedding for the text

In [9]:
def embed_text(text):
    inputs = clip_preprocess(text=text, return_tensors="pt" , padding=True, truncation=True, max_length=77)

    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
        features = features / features.norm(dim= -1 , keepdim=True)
        return features.squeeze().numpy()

In [10]:
#Embedding is done lets read the pdf

In [11]:
pdf_path  = "multimodal_sample.pdf"
doc = fitz.open(pdf_path)

all_docs =[]
all_embeddings = []
image_data_store = {}
splitter = RecursiveCharacterTextSplitter(chunk_size=500 , chunk_overlap=100)

In [12]:
doc

Document('multimodal_sample.pdf')

In [13]:
for i, page in enumerate(doc):
    #This is for the text
    text = page.get_text()
    if text.strip():
        temp_doc = Document(page_content=text , metadata={"page":i , "type":"text"})
        text_chunk = splitter.split_documents([temp_doc])

        for chunk in text_chunk:
            embedding = embed_text(chunk.page_content)
            all_embeddings.append(embedding)
            all_docs.append(chunk)
    #This is for the image
    ##Three major steps will be taken 
    #-> Convert PDF to PIL image
    #-> Store as a base64 string which is an text version of images
    #-> CLIP embedding, CLIP is a pre trained model from openAI, also available on Hugging Face

    #First step of converting the pdf to PIL image format
    for img_index , img in enumerate(page.get_images(full=True)):
        try:
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

            image_id = f"page_{i}_img_{img_index}"
            
      #-> Store as a base64 string which is an text version of images      

            buffered =io.BytesIO()
            pil_image.save(buffered , format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id] = img_base64
            
      #-> CLIP embedding, CLIP is a pre trained model from openAI, also available on Hugging Face          

            embedding = embed_image(pil_image)
            all_embeddings.append(embedding)

    #Lets create a document
            image_doc = Document(page_content=f"[Image: {image_id}]", metadata={"page":i , "type":"image" , "image_id" : image_id})
            all_docs.append(image_doc)
        except Exception as e:
            print(f"Error is for image {img_index} at the page {i}: {e}")
            continue

doc.close()

In [14]:
all_embeddings

[array([-2.67243781e-03,  1.28300022e-02, -5.18314056e-02,  4.14879434e-02,
        -2.33942084e-02, -7.55866105e-03, -3.67659107e-02,  1.19710736e-01,
         8.52081031e-02,  2.05425802e-03, -1.11534707e-02, -1.29592177e-02,
         5.25014549e-02, -3.65397707e-03,  4.76078540e-02,  1.58372894e-02,
         2.03388296e-02,  4.35362384e-02, -3.29167210e-03,  2.03181319e-02,
         1.88026356e-03, -4.23493981e-02,  5.44103794e-03,  3.70935835e-02,
        -1.65622961e-02,  6.48646429e-03, -4.78012413e-02,  8.67478456e-03,
         5.88859655e-02, -3.21394205e-02,  4.32440080e-02,  9.65300854e-03,
        -4.47923737e-03, -1.94858033e-02, -3.63502391e-02, -1.23471869e-02,
        -2.17928980e-02, -1.99016239e-02,  8.09619948e-02, -3.32986861e-02,
        -2.38901377e-02, -3.96138951e-02, -1.27280178e-02,  3.50381061e-02,
        -2.52217129e-02,  2.00031535e-03,  1.49660306e-02, -2.31976397e-02,
        -6.86791241e-02, -5.25778392e-04, -2.22545750e-02, -1.04103824e-02,
        -1.9

In [15]:
all_docs

[Document(metadata={'page': 0, 'type': 'text'}, page_content='Annual Revenue Overview\nThis document summarizes the revenue trends across Q1, Q2, and Q3. As illustrated in the chart\nbelow, revenue grew steadily with the highest growth recorded in Q3.\nQ1 showed a moderate increase in revenue as new product lines were introduced. Q2 outperformed\nQ1 due to marketing campaigns. Q3 had exponential growth due to global expansion.'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]')]

In [16]:
#Convert embedding into numpy array

embeddings_array = np.array(all_embeddings)

In [17]:
embeddings_array

array([[-0.00267244,  0.01283   , -0.05183141, ..., -0.00385082,
         0.02977719, -0.00010684],
       [ 0.0173234 , -0.0132769 , -0.02427034, ...,  0.0899405 ,
        -0.00272152,  0.03253041]], shape=(2, 512), dtype=float32)

In [18]:
#Let make a vector store we will use FAISS

In [19]:
vector_store = FAISS.from_embeddings(
    text_embeddings=[(doc.page_content, emb) for doc, emb in zip(all_docs , embeddings_array)],
    embedding=None,
    metadatas=[doc.metadata for doc in all_docs]
    
)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [20]:
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x25d3128d290>

In [21]:
#Lets initialize our model 

In [22]:
llm = init_chat_model("openai:gpt-4.1")

In [23]:
llm

ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x0000025D4DF83ED0>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x0000025D4E091390>, root_client=<openai.OpenAI object at 0x0000025D4DB8AE50>, root_async_client=<openai.AsyncOpenAI object at 0x0000025D4E091090>, model_name='gpt-4.1', model_kwargs={}, openai_api_key=SecretStr('**********'))

In [24]:
def retrieve_multimodal(query , k=5):
    query_embedding = embed_text(query)

    results = vector_store.similarity_search_by_vector(embedding= query_embedding , k=k)

    return results

In [25]:
#Here is the formating of the query which will later will be given to the model (GPT-4)

In [26]:
def create_multimodal_message(query , retrieved_docs):
    content= []

    #Well this is the first step when query comes, we just need to add raw query

    content.append({
        "type": "text",
        "text" : f"The Question is: {query} \n \n Context \n"
    })

    #Now lets separate the text and image from documents
    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]
    if temp_doc:
        text_content = "\n\n".join([f"[The page is: {doc.metadata['page']}]: {doc.page_content}" for doc in text_docs])
        content.append({
            "type": "text",
            "text" : f"The text Excrept:\n {text_content}\n"
        })

    #Lets do it for the images
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        if image_id and image_id in image_data_store:
            content.append({
                "type":"text",
                "text": f"\n [The image data is from page {doc.metadata['page']}]: \n"
            })
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}"
                }
            })

    content.append({
        "type": "text",
        "text": "\n\nPlease answer the question based on the provided text and images."
    })

    return HumanMessage(content=content)
            

In [27]:
def multimodal_pdf_rag_pipeline(query):
    context_docs = retrieve_multimodal(query , k =4)
    message = create_multimodal_message(query , context_docs)
    response = llm.invoke([message])

    #Print retrieved context info 
    print(f"\n Retrieved {len(context_docs)} documents: ")

    for doc in context_docs:
        doc_type = doc.metadata.get("type" , "unknown")
        page = doc.metadata.get("page" , "?")

        if doc_type == "text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
            print(f"  - Text from page {page} : {preview}")
        else:
            print(f"  - Image is from page {page}")
    print("\n")

    return response.content
            

In [28]:
if __name__ == "__main__":
    queries = [
        "Summarize the main findings from the document"
    ]

    for query in queries:
        print(f"\n Query: {query}")
        print("-" * 50)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer: \n {answer}")
        print("=" * 70)


 Query: Summarize the main findings from the document
--------------------------------------------------


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}