In [2]:
import os
from langchain.llms import OpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from dotenv import find_dotenv, load_dotenv
from pinecone.grpc import PineconeGRPC
from pinecone import ServerlessSpec
import pinecone

  from tqdm.autonotebook import tqdm


In [16]:
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import ChatOpenAI

# Load and process Form ADV PDF
pdf_path = "formADV.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Split the documents into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(documents)

# Embed the chunks into vector store
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

# Initialize the retriever
retriever = vectorstore.as_retriever()

# Define the format function
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Initialize GPT-4
llm = ChatOpenAI(model_name="gpt-4", openai_api_key=os.getenv("OPENAI_API_KEY"))

# Test cases for specific questions
test_queries = [
    "What question asks if the company provides continuous and regular supervisory or management services to securities portfolios? Please include the item as the prefix to the question. (Ex. If it is part a in Item 5, put 5a)",
    "What does question N say?",
    "What is in part 1 of the criminal disclosure reporting page?",
    "What does question 3 in Schedule A Directors and Executive Officers section ask for?",
    "What question asks what the approximate amount of total regulatory assets under management attributable to non-US persons are? Please include the item as the prefix to the question. (Ex. If it is part a in Item 5, put 5a)."
]

# Iterate over test queries
for query in test_queries:
    print(f"Query: {query}")
    
    # Retrieve relevant document chunks based on the query
    context = format_docs(retriever.get_relevant_documents(query))
    formatted_prompt = f"Answer the following based on the context:\n{context}\n\nQuestion: {query}"
    
    # Call GPT-4 with the formatted prompt
    response = llm.invoke(formatted_prompt)
    print(f"Response: {response.content}\n{'-'*50}")


Query: What question asks if the company provides continuous and regular supervisory or management services to securities portfolios? Please include the item as the prefix to the question. (Ex. If it is part a in Item 5, put 5a)
Response: The question that asks if the company provides continuous and regular supervisory or management services to securities portfolios is "F. (1) Do you provide continuous and regular supervisory or management services to securities portfolios?"
--------------------------------------------------
Query: What does question N say?
Response: Question N asks: "Are you a public reporting company under Sections 12 or 15(d) of the Securities Exchange Act of 1934?"
--------------------------------------------------
Query: What is in part 1 of the criminal disclosure reporting page?
Response: Part 1 of the Criminal Disclosure Reporting Page includes the identification of the person(s) or entity(ies) for whom the Disclosure Reporting Page (DRP) is being filed. This c

RAG by itself does not seem to be very good at recognizing the spatial layout of the docuemnt including how the 
text is related to the numbers and item headings. As such, querying for which specific question in terms of the
item and number will not be an effective task with this current approach. 

In [4]:
import fitz  # PyMuPDF

# Load the PDF file
pdf_path = "formADV.pdf"
pdf_document = fitz.open(pdf_path)

# Check the total number of pages
total_pages = pdf_document.page_count
print(f"Total pages in the document: {total_pages}")

# Loop through the first 5 pages and extract the text
for page_num in range(5):
    page = pdf_document.load_page(page_num)
    page_text = page.get_text("text")  # Extract as plain text
    
    # Clean the text by removing excessive white spaces
    cleaned_text = "\n".join([line.strip() for line in page_text.splitlines() if line.strip()])
    
    print(f"Page {page_num + 1} text:\n{cleaned_text}\n{'-'*80}")



Total pages in the document: 83
Page 1 text:
FORM ADV (Paper Version)
• UNIFORM APPLICATION FOR INVESTMENT ADVISER
REGISTRATION
AND
• REPORT BY EXEMPT REPORTING ADVISERS
PART 1A
Complete this form truthfully. False statements or omissions may result in
denial of your application, revocation of your registration, or criminal
prosecution. You must keep this form updated by filing periodic
amendments. See Form ADV General Instruction 4.
Check the box that indicates what you would like to do (check all that apply):
SEC or State Registration:
Submit an initial application to register as an investment adviser with the SEC.
Submit an initial application to register as an investment adviser with one or more states.
Submit an annual updating amendment to your registration for your fiscal year ended
.
Submit an other-than-annual amendment to your registration.
SEC or State Report by Exempt Reporting Advisers:
Submit an initial report to the SEC.
Submit a report to one or more state securities au

Using Unstructured Startup API to create structured JSON format from PDF

In [5]:
import os

from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig
from unstructured_ingest.v2.processes.connectors.local import (
    LocalIndexerConfig,
    LocalDownloaderConfig,
    LocalConnectionConfig,
    LocalUploaderConfig
)
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig

if __name__ == "__main__":
    Pipeline.from_configs(
        context=ProcessorConfig(),
        indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")),
        downloader_config=LocalDownloaderConfig(),
        source_connection_config=LocalConnectionConfig(),
        partitioner_config=PartitionerConfig(
            partition_by_api=True,
            api_key=os.getenv("UNSTRUCTURED_API_KEY"),
            partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
            strategy="hi_res",
            additional_partition_args={
                "split_pdf_page": True,
                "split_pdf_allow_failed": True,
                "split_pdf_concurrency_level": 15
            }
        ),
        uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR"))
    ).run()


2024-09-19 15:52:31,199 MainProcess INFO     Created index with configs: {"input_path": "formADV.pdf", "recursive": false}, connection configs: {"access_config": "**********"}
2024-09-19 15:52:31,199 MainProcess INFO     Created download with configs: {"download_dir": null}, connection configs: {"access_config": "**********"}
2024-09-19 15:52:31,200 MainProcess INFO     Created partition with configs: {"strategy": "hi_res", "ocr_languages": null, "encoding": null, "additional_partition_args": {"split_pdf_page": true, "split_pdf_allow_failed": true, "split_pdf_concurrency_level": 15}, "skip_infer_table_types": null, "fields_include": ["element_id", "text", "type", "metadata", "embeddings"], "flatten_metadata": false, "metadata_exclude": [], "metadata_include": [], "partition_endpoint": "https://api.unstructuredapp.io/general/v0/general", "partition_by_api": true, "api_key": "*******", "hi_res_model_name": null}
2024-09-19 15:52:31,200 MainProcess INFO     Created upload with configs: {"

In [6]:
# Passing in a small chunk of the structured JSON data to the LLM

# Next step is to determine whether to pass in the text iself in a chunked format or create embedding + RAG technique

import json
from langchain_openai import ChatOpenAI

# Load the structured JSON data
with open("output/formADV.pdf.json", "r") as f:
    structured_data = json.load(f)

# Extract a small chunk (for example, the first 5 entries)
small_chunk = structured_data[:20]

print(small_chunk)

# Convert the chunk into a string to pass to the LLM
json_string_chunk = json.dumps(small_chunk, indent=2)

# Initialize GPT-4
llm = ChatOpenAI(model_name="gpt-4", openai_api_key=os.getenv("OPENAI_API_KEY"))

# Define a query to ask the LLM
query = "What question is Item 1 Part C asking?"
query2 = ""

# Combine the query with the chunk of the structured JSON context
prompt = f"Given this structured document data:\n\n{json_string_chunk}\n\n{query}"
prompt2 = f"Given this structured document data:\n\n{json_string_chunk}\n\n{query}"

# Pass the prompt to the LLM
response = llm.invoke(prompt)

# Print the response from GPT-4
print(response.content)

# Define a query for determining which item and part can answer the user's question
user_input_question = "Where do I find information on the adviser's legal name?"

# Structure the prompt
prompt = f"""
You are provided with structured data that consists of item numbers and their corresponding parts. 
Your task is to answer the following question by indicating which item and part number contains the answer. 
Format your response as 'Item X, Part Y'. If the question is answer by multiple parts of an item, include all 
parts that are relevant to the question. For example, if the question asks for the number of individual clients of
the investment adviser, return 5D(a)(1) and 5D(a)(2). If the question is answered by a single part, only include that part. If 
the question is not answered by any parts of any items, respond with 'No relevant information found.' 

Given the structured data:\n\n{json_string_chunk}\n\nQuestion: {user_input_question}
"""

# Pass the prompt to the LLM
response = llm.invoke(prompt)

# Print the response from GPT-4
print(response.content)


Item 1 Part C is asking if this filing is reporting a change in your legal name (Item 1.A.) or primary business name (Item 1.B.(1)), and if so, to enter the new name and specify whether the name change is of your legal name or primary business name.
Item 1, Part A


In [23]:
# Passing in a small chunk of the structured JSON data to the LLM

# Next step is to determine whether to pass in the text iself in a chunked format or create embedding + RAG technique

import json
from langchain_openai import ChatOpenAI

# Load the structured JSON data
with open("output/formADV.pdf.json", "r") as f:
    structured_data = json.load(f)

# Extract a small chunk (for example, the first 5 entries)
small_chunk = structured_data[25:50]

print(small_chunk)

# Convert the chunk into a string to pass to the LLM
json_string_chunk = json.dumps(small_chunk, indent=2)

# Initialize GPT-4
llm = ChatOpenAI(model_name="gpt-4", openai_api_key=os.getenv("OPENAI_API_KEY"))


# Define a query for determining which item and part can answer the user's question
user_input_question = "Where do I find your SEC filing number if you are registered with the SEC as an investment advisor?"

# Structure the prompt
prompt = f"""
You are provided with structured data that consists of item numbers and their corresponding parts. 
Your task is to answer the following question by indicating which item and part number contains the answer. 
Format your response as 'Item X, Part Y'. If the question is answer by multiple parts of an item, include all 
parts that are relevant to the question. For example, if the question asks for the number of individual clients of
the investment adviser, return 5D(a)(1) and 5D(a)(2). If the question is answered by a single part, only include that part. If 
the question is not answered by any parts of any items, respond with 'No relevant information found.' 

Given the structured data:\n\n{json_string_chunk}\n\nQuestion: {user_input_question}
"""

# Pass the prompt to the LLM
response = llm.invoke(prompt)

# Print the response from GPT-4
print(response.content)


Item 1 Part C is asking if there is a change in the legal name or primary business name of the entity. If there is a change, the new name needs to be entered and specified whether the change is in the legal name or the primary business name.
Item 1D(1)


In [74]:
# Passing in a small chunk of the structured JSON data to the LLM

# Next step is to determine whether to pass in the text iself in a chunked format or create embedding + RAG technique

import json
from langchain_openai import ChatOpenAI

# Load the structured JSON data
with open("output/formADV.pdf.json", "r") as f:
    structured_data = json.load(f)

# Extract a small chunk (for example, the first 5 entries)
small_chunk = structured_data[50:75]

print(small_chunk)

# Convert the chunk into a string to pass to the LLM
json_string_chunk = json.dumps(small_chunk, indent=2)

# Initialize GPT-4
llm = ChatOpenAI(model_name="gpt-4", openai_api_key=os.getenv("OPENAI_API_KEY"))


# Define a query for determining which item and part can answer the user's question
user_input_question = "Which item and question asks if you are registered with a foreign financial regulatory authority?"

# Structure the prompt
prompt = f"""
You are provided with structured data that consists of item numbers and their corresponding parts. 
Your task is to answer the following question by indicating which item and part number contains the answer. 
Format your response as 'Item X, Part Y'. If the question is answer by multiple parts of an item, include all 
parts that are relevant to the question. For example, if the question asks for the number of individual clients of
the investment adviser, return 5D(a)(1) and 5D(a)(2). If the question is answered by a single part, only include that part. If 
the question is not answered by any parts of any items, respond with 'No relevant information found.' 

Given the structured data:\n\n{json_string_chunk}\n\nQuestion: {user_input_question}
"""

# Pass the prompt to the LLM
response = llm.invoke(prompt)

# Print the response from GPT-4
print(response.content)


[{'type': 'NarrativeText', 'element_id': 'f0650c374f397059f034621934405bb8', 'text': '(other titles, if any)', 'metadata': {'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 4, 'parent_id': 'dd6b4362f9910f8b839257f366d9d241', 'filename': 'formADV.pdf', 'data_source': {'url': None, 'version': None, 'record_locator': {'path': '/Users/nikitajha/Documents/NikitaFinal/Coding Projects/SEC/formADV.pdf'}, 'date_created': '1725737463.6450179', 'date_modified': '1725737463.6473238', 'date_processed': '1725931022.635649', 'permissions_data': [{'mode': 33188}], 'filesize_bytes': 767857}}}, {'type': 'NarrativeText', 'element_id': '851f77730bcea9bad66bcc1818fff09e', 'text': '(area code)', 'metadata': {'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 4, 'parent_id': 'dd6b4362f9910f8b839257f366d9d241', 'filename': 'formADV.pdf', 'data_source': {'url': None, 'version': None, 'record_locator': {'path': '/Users/nikitajha/Documents/NikitaFinal/Coding Projects/SEC/formA

In [65]:
import json
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import ChatOpenAI

# Load and process structured JSON data
with open("output/refinedOutput.json", "r") as f:
    structured_data = json.load(f)

# Convert JSON content to a format suitable for embedding
def extract_text_from_json(data):
    return "\n".join([str(item) for item in data])

# Extract text from the structured JSON data
json_string_chunk = extract_text_from_json(structured_data)

# Split the text into smaller chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_text(json_string_chunk)

# Embed the chunks into the vector store
vectorstore = Chroma.from_texts(texts=splits, embedding=OpenAIEmbeddings())

# Initialize the retriever
retriever = vectorstore.as_retriever()

# Initialize GPT-4
llm = ChatOpenAI(model_name="gpt-4", openai_api_key=os.getenv("OPENAI_API_KEY"))

# Define the instructions and user query
instructions = """
You are provided with structured data that consists of item numbers and their corresponding parts. 
Your task is to answer the following question by indicating which item and part number contains the answer. 
For example, if the question asks for the number of individual clients of the investment adviser, 
return 5D(a)(1) and 5D(a)(2). If the question is answered by a single part, only include that part. 
If the question is not answered by any parts of any items, respond with 'No relevant information found.'
"""

# Define the user queries (test cases)
test_queries = [
    "What question asks for which days of week that you normally conduct business at your principal office and place of business?"
]

# Iterate over the test queries and run them
for query in test_queries:
    print(f"Query: {query}")
    
    # Retrieve relevant document chunks based on the query
    context = format_docs(retriever.get_relevant_documents(query))
    
    # Combine instructions with the query for the prompt
    formatted_prompt = f"{instructions}\n\nContext: {context}\n\nQuestion: {query}"
    
    # Call GPT-4 with the formatted prompt
    response = llm.invoke(formatted_prompt)
    
    # Print the response
    print(f"Response: {response.content}\n{'-'*50}")


Query: What question asks for which days of week that you normally conduct business at your principal office and place of business?
Response: The question that asks for which days of week that you normally conduct business at your principal office and place of business is in part (2).
--------------------------------------------------


OFFICIALLY GIVING UP ON RAG

PDF to Image Approach

In [79]:
import base64
import requests
from pdf2image import convert_from_path

# OpenAI API Key
api_key = os.getenv("OPENAI_API_KEY")

# Convert PDF to images
pdf_path = 'formADV.pdf'
images = convert_from_path(pdf_path, first_page=1, last_page=26)

# Function to encode the image in base64
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

# Save and encode images
encoded_images = []
for i, image in enumerate(images):
    image_path = f'page_{i+1}.png'
    image.save(image_path, 'PNG')
    
    # Encode each image as base64
    base64_image = encode_image(image_path)
    encoded_images.append(base64_image)

# Create payload with base64 images
headers = {
  "Content-Type": "application/json",
  "Authorization": f"Bearer {api_key}"
}

# Example with two images (you can scale this up for more images)
payload = {
  "model": "gpt-4o-mini",
  "messages": [
    {
      "role": "user",
      "content": [
        {"type": "text", "text": "What’s in these images?"},
        {
          "type": "image_url",
          "image_url": {"url": f"data:image/png;base64,{encoded_images[0]}"}
        },
        {
          "type": "image_url",
          "image_url": {"url": f"data:image/png;base64,{encoded_images[1]}"}
        }
      ]
    }
  ],
  "max_tokens": 300
}

# Send the request to OpenAI API
response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

# Print the response
print(response.json())




In [81]:

# Create payload with base64 images
headers = {
  "Content-Type": "application/json",
  "Authorization": f"Bearer {api_key}"
}

# Example with two images (you can scale this up for more images)
payload = {
  "model": "gpt-4o-mini",
  "messages": [
    {
      "role": "user",
      "content": [
        {"type": "text", "text": "What’s in these images?"},
        {
          "type": "image_url",
          "image_url": {"url": f"data:image/png;base64,{encoded_images[0]}"}
        },
        {
          "type": "image_url",
          "image_url": {"url": f"data:image/png;base64,{encoded_images[1]}"}
        }
      ]
    }
  ],
  "max_tokens": 300
}

# Assuming you already have the response
response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

# Parse the response to JSON
response_json = response.json()

# Extract the message content
message_content = response_json['choices'][0]['message']['content']
print(message_content)

The images you provided are pages from Form ADV, specifically a paper version of the Uniform Application for Investment Adviser Registration and Report by Exempt Reporting Advisers. This form is used by individuals and firms to register as investment advisers with the Securities and Exchange Commission (SEC) or relevant state authorities. 

Key sections include:

1. **Part 1A** - Notifies the applicant about the importance of truthful information and outlines different registration options.
2. **Item 1** - Contains identifying information prompts, including legal names, business names, SEC file numbers, and addresses.
3. **State Registration and Reporting Requirements** - Details on submitting initial applications, updates, or amendments relevant to investment advisers.

Overall, the form collects essential information for regulatory compliance regarding financial advisory services.


In [82]:
# Create payload with base64 images
headers = {
  "Content-Type": "application/json",
  "Authorization": f"Bearer {api_key}"
}

# Define the instructions for relevant items, parts, etc.
instructions = """
You are provided with structured data that consists of item numbers and their corresponding parts. 
Your task is to answer the following question by indicating which item and part number contains the answer. 
For example, if the question asks for the number of individual clients of the investment adviser, 
return 5D(a)(1) and 5D(a)(2). If the question is answered by a single part, only include that part. 
If the question is not answered by any parts of any items, respond with 'No relevant information found.
"""

# Example with two images (you can scale this up for more images)
payload = {
  "model": "gpt-4o-mini",
  "messages": [
    {
      "role": "user",
      "content": [
        {"type": "text", "text": f"{instructions} Which questions asks for your SEC filing number if you are registered with the SEC as an investment advisor?"},
        {
          "type": "image_url",
          "image_url": {"url": f"data:image/png;base64,{encoded_images[0]}"}
        },
        {
          "type": "image_url",
          "image_url": {"url": f"data:image/png;base64,{encoded_images[1]}"}
        }
      ]
    }
  ],
  "max_tokens": 300
}

# Send the request to OpenAI API
response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

# Parse the response to JSON
response_json = response.json()

# Extract the message content
message_content = response_json['choices'][0]['message']['content']
print(message_content)


The question asks for your SEC filing number if you are registered with the SEC as an investment adviser. The relevant item and part numbers containing this information are:

**Item 1, Part D(1)**.


In [1]:
from byaldi import RAGMultiModalModel

# Step 1: Initialize the ColPali model
def initialize_colpali_model():
    model = RAGMultiModalModel.from_pretrained("vidore/colpali")
    return model

# Step 2: Index your images
def index_images(image_directory, index_name="image_index"):
    # Initialize the model
    model = initialize_colpali_model()

    # Index the images in the specified directory
    model.index(
        input_path=image_directory,  # Path to your directory containing images
        index_name=index_name,       # Name for your index
        store_collection_with_index=False,  # You can store the indexed data for later retrieval
        overwrite=True               # Overwrite existing index if any
    )

    print(f"Indexing complete. Images indexed in {index_name}.")

# Example usage:
index_images("images/")


  from .autonotebook import tqdm as notebook_tqdm


Verbosity is set to 1 (active). Pass verbose=0 to make quieter.


`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  9.41it/s]
You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Indexing file: images/page_2.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 0 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_3.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 1 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_1.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 2 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_4.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 3 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_5.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 4 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_7.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 5 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_6.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 6 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_19.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 7 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_25.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 8 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_24.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 9 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_18.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 10 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_26.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 11 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_23.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 12 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_22.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 13 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_20.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 14 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_21.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 15 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_10.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 16 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_11.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 17 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_13.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 18 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_12.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 19 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_16.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 20 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_17.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 21 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_15.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 22 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_14.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 23 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_8.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 24 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_9.png
Added page 1 of document 25 to index.
Index exported to .byaldi/image_index
Index exported to .byaldi/image_index
Indexing complete. Images indexed in image_index.


In [4]:
from byaldi import RAGMultiModalModel

# Step 1: Initialize the ColPali model and index the images (if needed)
def initialize_model_and_index(image_directory, index_name="image_index"):
    # Initialize the model
    model = RAGMultiModalModel.from_pretrained("vidore/colpali")
    
    # Index the images if the index does not already exist
    model.index(
        input_path=image_directory,
        index_name=index_name,
        store_collection_with_index=False,
        overwrite=True  # Set to False to avoid overwriting an existing index
    )

    return model

# Step 2: Search for relevant images (reusing the same model instance)
def search_images(model, query, top_k=5):
    # Perform the search using the internal index
    results = model.search(query, k=top_k)

    # Step 3: Display the results
    print(f"Top {top_k} results for query '{query}':")
    for result in results:
        print(f"Doc ID: {result.doc_id}, Page: {result.page_num}, Score: {result.score}")

# Example usage:
image_directory = "images/"
model = initialize_model_and_index(image_directory, "image_index")  # Initialize model and index images
search_images(model, "What does Item 1 part A ask?", top_k=5)  # Reuse the model for each query


Verbosity is set to 1 (active). Pass verbose=0 to make quieter.


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.42it/s]
You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


overwrite is on. Deleting existing index image_index to build a new one.
Indexing file: images/page_2.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 0 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_3.png
Added page 1 of document 1 to index.


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Index exported to .byaldi/image_index
Indexing file: images/page_1.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 2 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_4.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 3 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_5.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 4 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_7.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 5 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_6.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 6 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_19.png
Added page 1 of document 7 to index.


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Index exported to .byaldi/image_index
Indexing file: images/page_25.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 8 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_24.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 9 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_18.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 10 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_26.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 11 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_23.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 12 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_22.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 13 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_20.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 14 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_21.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 15 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_10.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 16 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_11.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 17 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_13.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 18 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_12.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 19 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_16.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 20 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_17.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 21 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_15.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 22 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_14.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 23 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_8.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 24 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_9.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 25 to index.
Index exported to .byaldi/image_index
Index exported to .byaldi/image_index
Top 5 results for query 'What does Item 1 part A ask?':
Doc ID: 16, Page: 1, Score: 19.125
Doc ID: 2, Page: 1, Score: 19.125
Doc ID: 15, Page: 1, Score: 18.875
Doc ID: 22, Page: 1, Score: 18.75
Doc ID: 21, Page: 1, Score: 18.75


In [3]:
from byaldi import RAGMultiModalModel
from PIL import Image
import os

# Initialize the ColPali model and index the images
def initialize_model_and_index(image_directory, index_name="image_index"):
    # Initialize the model
    model = RAGMultiModalModel.from_pretrained("vidore/colpali")

    # Store a mapping between Doc ID and image filename
    doc_id_to_image = {}

    # Index the images and track the Doc ID
    model.index(
        input_path=image_directory,
        index_name=index_name,
        store_collection_with_index=False,
        overwrite=True
    )

    # Assuming the model has a way to provide document IDs after indexing
    # If not, use the natural order of the files as a starting point
    image_files = sorted(os.listdir(image_directory))  # Sort to ensure matching order

    for idx, image_file in enumerate(image_files):
        doc_id_to_image[idx + 1] = image_file  # Map the index (or doc_id) to the filename

    # Save the mapping for later use
    return model, doc_id_to_image

# Search for images using the mapping
def search_and_display_images(model, doc_id_to_image, query, image_directory, top_k=3):
    # Perform the search
    results = model.search(query, k=top_k)

    print(f"Top {top_k} results for query '{query}':")
    for result in results:
        doc_id = result.doc_id
        page = result.page_num
        score = result.score
        
        # Use the doc_id to find the corresponding image file
        if doc_id in doc_id_to_image:
            image_file = doc_id_to_image[doc_id]
            image_path = os.path.join(image_directory, image_file)

            # Display the search result details
            print(f"Doc ID: {doc_id}, Image File: {image_file}, Score: {score}")

            # Load and display the image
            image = Image.open(image_path)
            image.show()
        else:
            print(f"Image for Doc ID {doc_id} not found.")

# Example usage:
image_directory = "images/"
model, doc_id_to_image = initialize_model_and_index(image_directory, "image_index")
search_and_display_images(model, doc_id_to_image, "What does Item 1 part A ask?", image_directory, top_k=3)


Verbosity is set to 1 (active). Pass verbose=0 to make quieter.


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.51it/s]
You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


overwrite is on. Deleting existing index image_index to build a new one.
Indexing file: images/page_2.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 0 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_3.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 1 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_1.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 2 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_4.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 3 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_5.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 4 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_7.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 5 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_6.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 6 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_19.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 7 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_25.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 8 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_24.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 9 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_18.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 10 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_26.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 11 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_23.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 12 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_22.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 13 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_20.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 14 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_21.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 15 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_10.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 16 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_11.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 17 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_13.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 18 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_12.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 19 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_16.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 20 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_17.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 21 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_15.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 22 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_14.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 23 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_8.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 24 to index.
Index exported to .byaldi/image_index
Indexing file: images/page_9.png


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Added page 1 of document 25 to index.
Index exported to .byaldi/image_index
Index exported to .byaldi/image_index
Top 3 results for query 'What does Item 1 part A ask?':
Doc ID: 16, Image File: page_23.png, Score: 19.125


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Doc ID: 2, Image File: page_10.png, Score: 19.125


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Doc ID: 15, Image File: page_22.png, Score: 18.875


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
search_and_display_images(model, doc_id_to_image, "Advisory Affiliates", image_directory, top_k=3)


You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.


Top 3 results for query 'Advisory Affiliates':
Doc ID: 9, Image File: page_17.png, Score: 15.0


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Doc ID: 11, Image File: page_19.png, Score: 14.75


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Doc ID: 12, Image File: page_2.png, Score: 14.1875


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
import torch
from colpali_engine.models.paligemma_colbert_architecture import ColPali
from transformers import AutoProcessor

# Initialize the ColPali model and processor
def initialize_colpali_model():
    model_name = "vidore/colpali-v1.2"
    
    # Load the base model and adapter for ColPali
    model = ColPali.from_pretrained("vidore/colpaligemma-3b-pt-448-base").eval()
    model.load_adapter(model_name)
    model.to("cuda" if torch.cuda.is_available() else "cpu")  # Move to GPU/CPU
    
    processor = AutoProcessor.from_pretrained(model_name)
    
    return model, processor

# Example usage:
model, processor = initialize_colpali_model()


Downloading shards: 100%|██████████| 2/2 [01:41<00:00, 50.75s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  7.11it/s]


In [1]:
import torch
from colpali_engine.models.paligemma_colbert_architecture import ColPali
from transformers import AutoProcessor

# Initialize the ColPali model and processor
def initialize_colpali_model():
    model_name = "vidore/colpali-v1.2"
    
    # Load the base model and adapter for ColPali
    model = ColPali.from_pretrained("vidore/colpaligemma-3b-pt-448-base", torch_dtype=torch.float32).eval()
    model.load_adapter(model_name)
    model.to("cuda" if torch.cuda.is_available() else "cpu")  # Move to GPU/CPU
    
    processor = AutoProcessor.from_pretrained(model_name)
    
    return model, processor

# Example usage:
model, processor = initialize_colpali_model()

from PIL import Image
import os
from tqdm import tqdm

# Generate embeddings for images using ColPali and map them to the corresponding image files
def index_images(image_directory, model, processor):
    image_embeddings = {}
    doc_id_to_image = {}

    # Get all image files in the directory
    image_files = sorted(os.listdir(image_directory))

    # Index each image
    for idx, image_file in enumerate(tqdm(image_files, desc="Indexing images")):
        image_path = os.path.join(image_directory, image_file)
        image = Image.open(image_path)

        # Preprocess the image and generate embeddings
        inputs = processor(images=[image], return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
        with torch.no_grad():
            embedding = model(**inputs).cpu().numpy()  # Get embeddings as float32 tensor

        # Store the embedding and map it to the image
        doc_id = idx + 1
        image_embeddings[doc_id] = embedding
        doc_id_to_image[doc_id] = image_file

    return image_embeddings, doc_id_to_image

# Example usage
image_directory = "images/"  # Replace with your actual image directory
image_embeddings, doc_id_to_image = index_images(image_directory, model, processor)


  from .autonotebook import tqdm as notebook_tqdm
`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 2/2 [00:17<00:00,  8.90s/it]
Indexing images:   0%|          | 0/26 [00:00<?, ?it/s]You are using PaliGemma without a text prefix. It will perform as a picture-captioning model.
You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.
Truncation was not explicitly activated but `max_length` is pro

In [8]:
from sklearn.decomposition import PCA
import numpy as np

# Reduce the dimensionality of all image embeddings using PCA
def reduce_image_embedding_dimensionality(image_embeddings, target_dim=26):
    # Stack all the image embeddings into a matrix of shape (n_images, embedding_dim)
    all_embeddings = np.vstack([embedding.reshape(1, -1) for embedding in image_embeddings.values()])
    
    # Apply PCA to reduce the dimensionality to target_dim (maximum is 26 in this case)
    pca = PCA(n_components=target_dim)
    reduced_embeddings = pca.fit_transform(all_embeddings)  # Reduce dimensions
    
    # Create a mapping of reduced embeddings back to their document IDs
    reduced_image_embeddings = {doc_id: reduced_embedding for doc_id, reduced_embedding in zip(image_embeddings.keys(), reduced_embeddings)}
    
    return reduced_image_embeddings

# Apply PCA to reduce image embedding dimensions to 26
reduced_image_embeddings = reduce_image_embedding_dimensionality(image_embeddings, target_dim=26)


In [10]:
from byaldi import RAGMultiModalModel
from transformers import Qwen2VLForConditionalGeneration,AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
from pdf2image import convert_from_path
import os

ImportError: cannot import name 'ColPali' from 'colpali_engine.models' (/Users/nikitajha/miniconda3/envs/localgpt-vision/lib/python3.10/site-packages/colpali_engine/models/__init__.py)

In [3]:
!pip install flash-attn


Collecting flash-attn
  Using cached flash_attn-2.6.3.tar.gz (2.6 MB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[20 lines of output][0m
  [31m   [0m fatal: not a git repository (or any of the parent directories): .git
  [31m   [0m 
  [31m   [0m 
  [31m   [0m torch.__version__  = 2.4.0
  [31m   [0m 
  [31m   [0m 
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "<string>", line 2, in <module>
  [31m   [0m   File "<pip-setuptools-caller>", line 34, in <module>
  [31m   [0m   File "/private/var/folders/dr/13q6jns56rldkk1nfqpc25240000gn/T/pip-install-fu3oh81m/flash-attn_0ef965155c564c08a5b2b08f60032699/setup.py", line 179, in <module>
  [31m   [0m     CUDAExtension(
  [31m   [0m   File "/Users/nikitajha/miniconda3/envs/localgpt-vision/lib/python3.10/site