In [1]:
import pymupdf as fitz
import os  
import base64 
import io 
from langchain_core.documents import Document 
from transformers import CLIPProcessor, CLIPModel 
from PIL import Image 
import torch 
import numpy as np 
from langchain.chat_models import init_chat_model 
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain_community.vectorstores import FAISS 
from langchain.prompts import PromptTemplate 
from langchain.schema.messages import HumanMessage 
from sklearn.metrics.pairwise import cosine_similarity        

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
load_dotenv()

# Fail fast with a readable message when the key is missing. Assigning
# os.getenv(...) straight back into os.environ raises
# "TypeError: str expected, not NoneType" when the variable is unset and
# changes nothing when it is set, so an explicit check is used instead.
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file "
        "before running this notebook."
    )

# Pretrained CLIP checkpoint and its matching preprocessor; both are used by
# the embedding helpers defined later in the notebook.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [3]:
# Put the CLIP model in evaluation (inference) mode; the call returns the
# module itself, so the cell displays the model architecture.
clip_model.eval()

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

In [4]:
def get_img_embed(image_data):
    """Return the unit-length CLIP embedding of an image as a NumPy array.

    Args:
        image_data: Either a string, which is opened with ``Image.open`` and
            converted to RGB, or any other object, which is passed to the
            CLIP processor unchanged.

    Returns:
        The image features divided by their norm along the last axis, with
        singleton dimensions squeezed out, as a NumPy array.
    """
    image = (
        Image.open(image_data).convert('RGB')
        if isinstance(image_data, str)
        else image_data
    )
    pixel_inputs = clip_processor(images=image, return_tensors='pt')

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        raw = clip_model.get_image_features(**pixel_inputs)
        # Scale to unit length along the feature axis.
        unit = raw / raw.norm(dim=-1, keepdim=True)

    return unit.squeeze().numpy()
    
def get_text_embed(text_data):
    """Return the unit-length CLIP embedding of text as a NumPy array.

    Args:
        text_data: Text handed to the CLIP processor. It is padded and
            truncated to at most 77 tokens (CLIP's maximum token length).

    Returns:
        The text features divided by their norm along the last axis, with
        singleton dimensions squeezed out, as a NumPy array.
    """
    tokenizer_kwargs = dict(
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=77,  # CLIP's maximum token length
    )
    token_inputs = clip_processor(text=text_data, **tokenizer_kwargs)

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        raw = clip_model.get_text_features(**token_inputs)
        # Scale to unit length along the feature axis.
        unit = raw / raw.norm(dim=-1, keepdim=True)

    return unit.squeeze().numpy()


In [5]:
# Source PDF and the open PyMuPDF handle that the ingestion loop iterates over.
pdf_path = "data_incident.pdf"
doc = fitz.open(pdf_path)

# Parallel containers filled during ingestion:
#   all_docs[n] is described by all_embeddings[n];
#   image_data_store maps an image id to its base64-encoded bytes.
all_docs, all_embeddings = [], []
image_data_store = dict()

# Splits page text into chunks of up to 500 characters with a 10-character overlap.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=10,
)



In [6]:
# Walk every page of the PDF and index both its text and its embedded images.
# Each appended Document in `all_docs` has its embedding at the same position
# in `all_embeddings`.
for i, page in enumerate(doc):
    # --- Text: split the page text into chunks and embed each chunk ---
    text = page.get_text()
    if text.strip():
        temp_doc = Document(page_content=text, metadata={"page": i, "type": "text"})
        text_chunk = splitter.split_documents([temp_doc])

        for chunk in text_chunk:
            text_embeddings = get_text_embed(chunk.page_content)
            all_embeddings.append(text_embeddings)
            all_docs.append(chunk)

    # --- Images ---
    # Three steps per image:
    #   1. convert the extracted PDF image bytes to a PIL image,
    #   2. keep a base64-encoded PNG copy for the multimodal LLM prompt,
    #   3. create a CLIP embedding for retrieval.
    for img_index, img in enumerate(page.get_images(full=True)):
        try:
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image['image']

            # Convert to a PIL image.
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

            image_id = f"page_{i}_img_{img_index}"

            # Re-encode as PNG and base64 for later use in the LLM message.
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()

            # Embed the image using CLIP.
            embedding = get_img_embed(pil_image)

            # Placeholder document for the image. "page" is stored as the
            # plain int index, the same as for text chunks (a `{i}` literal
            # here would create a one-element set instead).
            image_doc = Document(
                page_content=f"[Image:{image_id}]",
                metadata={"page": i, "type": "image", "image_id": image_id}
            )

            # Record everything only after every step above succeeded, so a
            # failure part-way through cannot leave the containers out of sync.
            image_data_store[image_id] = img_base64
            all_embeddings.append(embedding)
            all_docs.append(image_doc)
        except Exception as e:
            # Best effort: report the image that failed and move on to the next one.
            print(f"Error in processing the image {img_index} on page {i}: {e}")
            continue


doc.close()


In [7]:
# Display the collected documents (text chunks and image placeholders).
all_docs

[Document(metadata={'page': 0, 'type': 'text'}, page_content='THE CENTRE FOR HUMANITARIAN DATA\n1\nAUGUST 2019\nWHAT IS A DATA INCIDENT IN HUMANITARIAN RESPONSE?\nIn the humanitarian sector, data incidents are events involving the management of data that have caused \nharm or have the potential to cause harm to crisis affected populations, humanitarian organisations \nand their operations, and other individuals or groups. These events can exploit or exacerbate existing'),
 Document(metadata={'page': 0, 'type': 'text'}, page_content='vulnerabilities.1 In some cases, they may also create new vulnerabilities that can increase the risk of future \ndata incidents. \nHumanitarians have not had a common understanding of what comprises a data incident, nor is there \na minimum technical standard for how these incidents should be prevented and managed. How the \nhumanitarian sector develops tools and implements procedures for data incident management will play'),
 Document(metadata={'page': 0, 

In [8]:
# Convert the list of per-document CLIP embeddings into a single NumPy array
# (input for the unified FAISS vector store built below) and display it.
embeddings_array = np.array(all_embeddings)
embeddings_array

array([[-0.0066789 , -0.02534097,  0.02463271, ..., -0.0367243 ,
         0.01132058,  0.00913771],
       [ 0.00192667, -0.01688765,  0.01968805, ...,  0.00987415,
        -0.01357171, -0.00770265],
       [ 0.02495551,  0.00877788,  0.00201296, ...,  0.00492413,
        -0.02641535, -0.0226235 ],
       ...,
       [ 0.05377281,  0.01607991,  0.0554263 , ..., -0.03836596,
         0.00054299,  0.01913766],
       [ 0.01667728,  0.00028056,  0.01801195, ..., -0.11729382,
        -0.03672666,  0.02014948],
       [ 0.00669684, -0.02093203, -0.0081911 , ...,  0.05437464,
        -0.00852971, -0.02943654]], dtype=float32)

In [9]:
# Display the documents together with the embedding array built from them.
(all_docs,embeddings_array)

([Document(metadata={'page': 0, 'type': 'text'}, page_content='THE CENTRE FOR HUMANITARIAN DATA\n1\nAUGUST 2019\nWHAT IS A DATA INCIDENT IN HUMANITARIAN RESPONSE?\nIn the humanitarian sector, data incidents are events involving the management of data that have caused \nharm or have the potential to cause harm to crisis affected populations, humanitarian organisations \nand their operations, and other individuals or groups. These events can exploit or exacerbate existing'),
  Document(metadata={'page': 0, 'type': 'text'}, page_content='vulnerabilities.1 In some cases, they may also create new vulnerabilities that can increase the risk of future \ndata incidents. \nHumanitarians have not had a common understanding of what comprises a data incident, nor is there \na minimum technical standard for how these incidents should be prevented and managed. How the \nhumanitarian sector develops tools and implements procedures for data incident management will play'),
  Document(metadata={'page': 

In [10]:
# Build the FAISS index directly from the precomputed CLIP vectors.
# `embedding=None`: no embedding object is supplied because every vector has
# already been computed; queries are embedded separately and looked up by vector.
vector_store = FAISS.from_embeddings(
    text_embeddings=list(
        zip((d.page_content for d in all_docs), embeddings_array)
    ),
    embedding=None,
    metadatas=[d.metadata for d in all_docs],
)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [11]:
# Chat model used to answer questions from the retrieved text and images.
llm  = init_chat_model("openai:gpt-4.1")
llm 

ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x000001C372863130>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x000001C37288DED0>, root_client=<openai.OpenAI object at 0x000001C34A847220>, root_async_client=<openai.AsyncOpenAI object at 0x000001C3728631F0>, model_name='gpt-4.1', model_kwargs={}, openai_api_key=SecretStr('**********'))

In [12]:
def retrieve_mutlimodel(query,k=5):
    """Return the ``k`` stored documents closest to ``query``.

    The query is embedded with ``get_text_embed`` and the resulting vector is
    looked up in ``vector_store``.

    Args:
        query: Query text to embed.
        k: Number of documents to return (default 5).

    Returns:
        The result of ``vector_store.similarity_search_by_vector``.
    """
    return vector_store.similarity_search_by_vector(
        embedding=get_text_embed(query),
        k=k,
    )

In [13]:
def create_multimodel_message(query,retrived_docs):
    """Assemble a ``HumanMessage`` containing the question plus retrieved text and images.

    Args:
        query: The user's question.
        retrived_docs: Retrieved documents; each is routed by its
            ``metadata["type"]`` value (``"text"`` or ``"image"``).

    Returns:
        A ``HumanMessage`` whose content is a list of parts: the question,
        the joined text excerpts (if any), one label + ``image_url`` pair per
        image found in ``image_data_store``, and a closing instruction.
    """
    def _of_type(kind):
        # Documents whose metadata "type" equals `kind`, in retrieval order.
        return [d for d in retrived_docs if d.metadata.get("type") == kind]

    text_docs = _of_type("text")
    image_docs = _of_type("image")

    # The question always comes first.
    parts = [{"type": "text", "text": f"Question:{query}\n\ncontext:\n"}]

    # All text excerpts are merged into one part, each tagged with its page.
    if text_docs:
        excerpts = [
            f"[Page {d.metadata['page']}]: {d.page_content}" for d in text_docs
        ]
        text_context = "\n\n".join(excerpts)
        parts.append({"type": "text", "text": f"Text excerpts:\n{text_context}\n"})

    # Each image contributes a caption part followed by its base64 data URL.
    for d in image_docs:
        image_id = d.metadata.get("image_id")
        if not image_id or image_id not in image_data_store:
            continue  # no stored image data for this document
        parts.extend([
            {
                "type": "text",
                "text": f"\n[Image from page {d.metadata['page']}]:\n",
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}"
                },
            },
        ])

    # Closing instruction for the model.
    parts.append({
        "type": "text",
        "text": "\n\nPlease answer the question based on the provided text and images.",
    })

    return HumanMessage(content=parts)

In [14]:
def multimodel_pdf_rag(query):
    """Answer ``query`` from the indexed PDF and print what was retrieved.

    Retrieves the 5 closest documents, builds a multimodal message from them,
    sends it to ``llm``, and prints a one-line summary per retrieved document.

    Args:
        query: The user's question.

    Returns:
        The ``content`` of the model's response.
    """
    # Retrieve, build the prompt, and query the model.
    context_docs = retrieve_mutlimodel(query,k=5)
    message = create_multimodel_message(query,context_docs)
    response = llm.invoke([message])

    # Summarise the retrieved context for the reader.
    print(f"\nRetrieved {len(context_docs)} documents:")
    for hit in context_docs:
        page = hit.metadata.get("page", "?")
        if hit.metadata.get("type", "unknown") != "text":
            print(f"  - Image from page {page}")
            continue
        body = hit.page_content
        # Show at most the first 100 characters of a text chunk.
        preview = body[:100] + "..." if len(body) > 100 else body
        print(f"  - Text from page {page}: {preview}")
    print("\n")

    return response.content

    

In [19]:
if __name__ == "__main__":
    # Example queries to run through the multimodal RAG pipeline.
    queries = ["Five steps in the treatment of security incidents "]

    for query in queries:
        # Header for this query.
        print(f"\nQuery: {query}")
        print("-" * 50)

        # Retrieval summary is printed inside the call; the answer follows.
        answer = multimodel_pdf_rag(query)
        print(f"Answer: {answer}")
        print("=" * 70)


Query: Five steps in the treatment of security incidents 
--------------------------------------------------

Retrieved 5 documents:
  - Text from page 0: •	 A data incident has four aspects: (i) a threat source, (ii) a threat event, (iii) a vulnerability...
  - Text from page 1: These scenarios demonstrate how to think about identifying causal chains that may create context-
sp...
  - Text from page 3: After clearly defining what constitutes a data incident, organizations can develop Standard Operatin...
  - Text from page 4: Use a risk model to understand the causal chain that can lead to data incidents for specific offices...
  - Text from page 3: 3.	Treatment of the incident: A technical expert decides on the necessary measures to treat the inci...


Answer: Based on the provided excerpts, **the five steps in the treatment of security incidents** (specifically data incidents) are:

1. **Notification**  
   The incident is reported or detected. This triggers the incident management