In [None]:
# --- Install/upgrade dependencies if not already installed ---
# Run this once in a cell
# !pip install --upgrade transformers huggingface_hub torch pillow scikit-learn langchain langchain-openai python-dotenv pymupdf faiss-cpu

# --- Imports ---
from langchain_openai import ChatOpenAI
import os
from dotenv import load_dotenv
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_core.messages import HumanMessage
from langchain_community.vectorstores import FAISS
import fitz  # PyMuPDF
import io
import base64

# --- Load environment variables (for OpenAI / LangChain) ---
# Read a local .env file into the process environment, then fail fast when
# the OpenAI key is missing or empty.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

if api_key is None or api_key == "":
    raise ValueError("❌ OPENAI_API_KEY not found. Please create a .env file with:\nOPENAI_API_KEY=sk-xxxx")

# Write the key back to the environment so code that reads it from there sees
# the same value.
os.environ["OPENAI_API_KEY"] = api_key

# --- Initialize CLIP model (for embeddings) ---
# The same checkpoint supplies both the preprocessing pipeline and the model
# weights; the model is switched to eval mode because it is only used for
# inference in this file.
model_id = "openai/clip-vit-base-patch32"
clip_processor = CLIPProcessor.from_pretrained(model_id)
clip_model = CLIPModel.from_pretrained(model_id).eval()
print("✅ CLIP model loaded successfully")

# === Embedding functions ===
def embed_image(image_data):
    """Embed an image with CLIP and return a unit-length vector.

    Args:
        image_data: Either a filesystem path (``str`` or ``os.PathLike``,
            e.g. ``pathlib.Path``) or an already loaded PIL image. Paths are
            opened and converted to RGB; PIL images are used as given.

    Returns:
        A NumPy array holding the L2-normalised CLIP image features, with
        size-1 dimensions squeezed out.
    """
    if isinstance(image_data, (str, os.PathLike)):
        # The context manager releases the file handle as soon as the pixel
        # data has been copied by convert().
        with Image.open(image_data) as opened:
            image = opened.convert("RGB")
    else:  # Assume a PIL image supplied by the caller
        image = image_data

    inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
        # Normalise so that inner products between embeddings are cosines.
        features = features / features.norm(dim=-1, keepdim=True)
    return features.squeeze().numpy()

def embed_text(text):
    """Embed text with CLIP and return a unit-length vector.

    The input is tokenised with padding and truncated to 77 tokens before it
    is passed to the CLIP text encoder. The resulting features are divided by
    their L2 norm and returned as a NumPy array with size-1 dimensions
    squeezed out.
    """
    tokenized = clip_processor(
        text=text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=77,
    )
    with torch.no_grad():
        raw = clip_model.get_text_features(**tokenized)
        unit = raw / raw.norm(dim=-1, keepdim=True)  # Normalize
    return unit.squeeze().numpy()

# === Process PDF ===
# Walk every page of the PDF, embed its text chunks and embedded images with
# CLIP, and collect the results in three module-level containers.
pdf_path = "multimodal_sample.pdf"   # change this to your PDF filename

all_docs = []          # LangChain Documents (text chunks and image placeholders)
all_embeddings = []    # CLIP vectors, index-aligned with all_docs
image_data_store = {}  # image_id -> base64-encoded PNG, used when prompting the LLM

# NOTE: embed_text truncates its input to 77 tokens, so the tail of a
# 500-character chunk may not contribute to its embedding.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# The context manager closes the PDF even if a page fails half-way through.
with fitz.open(pdf_path) as doc:
    for i, page in enumerate(doc):
        # --- Process Text ---
        text = page.get_text()
        if text.strip():
            temp_doc = Document(page_content=text, metadata={"page": i, "type": "text"})
            for chunk in splitter.split_documents([temp_doc]):
                embedding = embed_text(chunk.page_content)
                all_embeddings.append(embedding)
                all_docs.append(chunk)

        # --- Process Images ---
        for img_index, img in enumerate(page.get_images(full=True)):
            try:
                xref = img[0]
                image_bytes = doc.extract_image(xref)["image"]

                pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                image_id = f"page_{i}_img_{img_index}"

                # Re-encode as PNG and store as base64
                buffered = io.BytesIO()
                pil_image.save(buffered, format="PNG")
                img_base64 = base64.b64encode(buffered.getvalue()).decode()

                # Embed image
                embedding = embed_image(pil_image)

                # Create Document entry
                image_doc = Document(
                    page_content=f"[Image: {image_id}]",
                    metadata={"page": i, "type": "image", "image_id": image_id}
                )
            except Exception as e:
                # Best effort: one unreadable image must not abort the run.
                print(f"Error processing image {img_index} on page {i}: {e}")
                continue

            # Commit only after every fallible step succeeded, so the three
            # containers never disagree about which images exist.
            image_data_store[image_id] = img_base64
            all_embeddings.append(embedding)
            all_docs.append(image_doc)

print(f"✅ Processed {len(all_docs)} documents ({len(all_embeddings)} embeddings)")
print(f"✅ Stored {len(image_data_store)} images for multimodal use.")

# === Create FAISS vector store ===
from langchain_core.embeddings import Embeddings


class _ClipTextEmbeddings(Embeddings):
    """Adapter exposing embed_text through LangChain's Embeddings interface.

    Supplying a real Embeddings object (instead of None) avoids the
    "`embedding_function` is expected to be an Embeddings object" warning and
    lets the store embed text queries itself.
    """

    def embed_documents(self, texts):
        """Return one CLIP text embedding (list of floats) per input string."""
        return [embed_text(item).tolist() for item in texts]

    def embed_query(self, text):
        """Return the CLIP text embedding of a single query string."""
        return embed_text(text).tolist()


# An empty corpus cannot be indexed; fail with a clear message rather than
# letting the index construction raise something opaque.
if not all_embeddings:
    raise ValueError(
        "No text or images were extracted from the PDF; cannot build the vector store."
    )

embeddings_array = np.array(all_embeddings)

# The vectors are precomputed, so each is paired with its document text
# rather than re-embedded. The loop variable is named `entry` to avoid reusing
# the name of the PDF handle.
vector_store = FAISS.from_embeddings(
    text_embeddings=[
        (entry.page_content, emb) for entry, emb in zip(all_docs, embeddings_array)
    ],
    embedding=_ClipTextEmbeddings(),
    metadatas=[entry.metadata for entry in all_docs],
)

# === Initialize OpenAI Chat Model ===
# Chat model used to answer questions; the key validated earlier in the file
# is passed explicitly.
llm = ChatOpenAI(
    model="gpt-4.1",
    temperature=0,
    api_key=api_key,
)

# === Retrieval function ===
def retrieve_multimodal(query, k=5):
    """Return the ``k`` stored documents closest to ``query``.

    The query is embedded with ``embed_text`` and the resulting vector is
    searched against ``vector_store``.
    """
    return vector_store.similarity_search_by_vector(
        embedding=embed_text(query),
        k=k,
    )

# === Create multimodal message for the vision-capable chat model ===
def create_multimodal_message(query, retrieved_docs):
    """Assemble a HumanMessage combining the question, text excerpts and images.

    The message content is a list of parts in this order: the question header,
    one block containing all text excerpts (only when there are any), then a
    caption plus a base64 PNG data URL for each retrieved image found in
    ``image_data_store``, and finally the answering instruction.
    """
    excerpts = []
    image_parts = []

    for item in retrieved_docs:
        kind = item.metadata.get("type")
        if kind == "text":
            excerpts.append(f"[Page {item.metadata['page']}]: {item.page_content}")
        elif kind == "image":
            key = item.metadata.get("image_id")
            # Skip images with no id or no stored payload.
            if not key or key not in image_data_store:
                continue
            image_parts.append({
                "type": "text",
                "text": f"\n[Image from page {item.metadata['page']}]:\n"
            })
            image_parts.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[key]}"
                }
            })

    parts = [{
        "type": "text",
        "text": f"Question: {query}\n\nContext:\n"
    }]

    if excerpts:
        joined = "\n\n".join(excerpts)
        parts.append({
            "type": "text",
            "text": f"Text excerpts:\n{joined}\n"
        })

    parts.extend(image_parts)
    parts.append({
        "type": "text",
        "text": "\n\nPlease answer the question based on the provided text and images."
    })

    return HumanMessage(content=parts)

# === Full multimodal pipeline ===
def multimodal_pdf_rag_pipeline(query, k=5):
    """Answer ``query`` from the indexed PDF using retrieved text and images.

    Args:
        query: The user's question.
        k: Number of documents to retrieve as context (default 5).

    Returns:
        The ``content`` of the chat model's response.

    Side effects:
        Prints a summary of the retrieved documents. This happens before the
        model call so the summary is still visible if that call raises
        (for example on a rate-limit or quota error).
    """
    context_docs = retrieve_multimodal(query, k=k)

    print(f"\nRetrieved {len(context_docs)} documents:")
    for doc in context_docs:
        doc_type = doc.metadata.get("type", "unknown")
        page = doc.metadata.get("page", "?")
        if doc_type == "text":
            body = doc.page_content
            # Keep the console output short: show at most 100 characters.
            preview = body[:100] + "..." if len(body) > 100 else body
            print(f"  - Text from page {page}: {preview}")
        else:
            print(f"  - Image from page {page}")
    print("\n")

    message = create_multimodal_message(query, context_docs)
    response = llm.invoke([message])
    return response.content

# === Main Execution ===
# Run a few sample questions through the pipeline and print each answer.
if __name__ == "__main__":
    queries = [
        "What does the chart on page 1 show about revenue trends?",
        "Summarize the main findings from the document",
        "What visual elements are present in the document?",
    ]

    for query in queries:
        # Header: the question followed by a 50-character rule.
        print(f"\nQuery: {query}", "-" * 50, sep="\n")
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer: {answer}")


✅ CLIP model loaded successfully


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


✅ Processed 2 documents (2 embeddings)
✅ Stored 1 images for multimodal use.

Query: What does the chart on page 1 show about revenue trends?
--------------------------------------------------


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}