# PDF with images
PyMuPDF, which is imported as `fitz`, works better than any other library for this.

In [1]:
import fitz  # PyMuPDF
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain.chat_models import init_chat_model
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# CLIP model setup
import os
from dotenv import load_dotenv

# Load environment variables (python-dotenv reads them from a .env file if one is found).
load_dotenv()

# A single CLIP checkpoint provides both the model and its matching processor,
# so text and images can share one embedding space (unified embeddings).
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)

# Put the model in evaluation mode; as the cell's last expression this also
# makes the notebook display the model's module tree.
clip_model.eval()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [4]:
# Creating Embedding functions
def embed_image(image_data):
    """Embed one image with CLIP and return a unit-length vector.

    Args:
        image_data: Either a filesystem path (``str`` or ``os.PathLike``) to an
            image file, which is opened and converted to RGB, or an
            already-loaded image object (presumably a PIL image) that is
            passed to the CLIP processor unchanged.

    Returns:
        numpy.ndarray: The CLIP image features, L2-normalized along the last
        dimension, with singleton dimensions squeezed out.
    """
    # Only path-like inputs need to be read from disk.
    if isinstance(image_data, (str, os.PathLike)):
        image = Image.open(image_data).convert("RGB")
    else:
        image = image_data

    inputs = clip_processor(images=image, return_tensors="pt")

    # This must be `with`, not `if`: `if torch.no_grad():` merely tests a truthy
    # object and never disables autograd, so the features would still require
    # grad and the `.numpy()` call below would raise.
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
        # Normalize to a unit vector.
        features = features / features.norm(dim=-1, keepdim=True)

    return features.squeeze().numpy()
    
def embed_text(text):
    """Embed text with CLIP's text encoder and return a unit-length vector.

    Args:
        text: The text to embed. It is passed to the CLIP processor with
            padding enabled and truncated to 77 tokens (CLIP's maximum
            token length).

    Returns:
        numpy.ndarray: The CLIP text features, L2-normalized along the last
        dimension, with singleton dimensions squeezed out.
    """
    inputs = clip_processor(
        text=text,
        padding=True,
        max_length=77,  # max token length of CLIP
        truncation=True,
        return_tensors="pt",
    )

    # This must be `with`, not `if`: `if torch.no_grad():` merely tests a truthy
    # object and never disables autograd, so the features would still require
    # grad and the `.numpy()` call below would raise.
    with torch.no_grad():
        # Tokenized text has to go through the text tower; the image tower
        # does not accept the tokenizer's outputs.
        features = clip_model.get_text_features(**inputs)
        # Normalize to a unit vector.
        features = features / features.norm(dim=-1, keepdim=True)

    return features.squeeze().numpy()
    

In [5]:
#Processing pdf
# Open the source PDF with PyMuPDF.
pdf_path = "multimodal-sample.pdf"
doc = fitz.open(pdf_path)

# Text splitter: chunks of size 300 with an overlap of 30 between neighbours.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=30, chunk_size=300)

# Accumulators filled while the pages are processed: the embedding vectors
# and the documents they belong to.
all_embeddings = list()
all_docs = list()

# Keeps the actual image data so it can be handed to the LLM later.
image_data_store = dict()


In [8]:
# Inspection only: a bare expression at the end of a cell makes the notebook
# display the splitter object's repr; nothing is modified here.
splitter

<langchain_text_splitters.character.RecursiveCharacterTextSplitter at 0x1303ef020>

In [None]:
for i,page in enumerate(doc):
    # processing text 
    text = page.get_text()
    if text.strip():
        # creating a temporary document for splitting
        temp_doc = Document(page_content=text,metadata={"page":i,"type":"text"})
        text_chunks = splitter.split_documents([temp_doc])
        # embedding these chunks using clip
        for chunk in text_chunks:
            embedding = embed_text(chunk.page_content)
            all_embeddings.append(embedding)
            all_docs.append(chunk)
    
    # processing images
    