Install the Python packages and poppler (required by `pdf2image`)



In [None]:
# System dependency: poppler-utils (pdf2image needs poppler installed to render PDF pages).
!sudo apt-get install poppler-utils -y
# Python dependencies for this notebook; colpali-engine is pinned to 0.3.1.
!pip3 install colpali-engine==0.3.1 pdf2image pypdf pyvespa vespacli requests numpy tqdm pdfminer.six

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from io import BytesIO
from colpali_engine.models import ColQwen2, ColQwen2Processor

### Load the model

We use `device_map="auto"` to load the model on a GPU if one is available, and otherwise on MPS or the CPU.


In [None]:
# Checkpoint identifier shared by the model and its processor.
model_name = "vidore/colqwen2-v0.1"

# Load the weights in bfloat16 with automatic device placement and switch
# the module to evaluation mode straight away (eval() returns the module).
model = ColQwen2.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
).eval()

# Processor that turns page images into model inputs.
processor = ColQwen2Processor.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Working with pdfs

We need to convert a PDF to an array of images. One image per page.
We use the `pdf2image` library for this task. In addition, we extract the text content of each page using `pdfminer.six`.

NOTE: This step requires that you have `poppler` installed on your system. Read more in [pdf2image](https://pdf2image.readthedocs.io/en/latest/installation.html) docs.


In [None]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
from pdf2image import convert_from_path

def get_pdf_images(pdf_path):
    """Render every page of a PDF to an image and extract each page's text.

    Args:
        pdf_path: Path of the PDF file. It is handed to both pdfminer's
            ``extract_pages`` and pdf2image's ``convert_from_path``.

    Returns:
        A ``(images, page_texts)`` tuple. ``images`` is whatever
        ``convert_from_path`` returns; ``page_texts`` is a list with one
        string per page layout yielded by ``extract_pages``.

    Raises:
        ValueError: If the number of rendered images differs from the
            number of extracted page texts.
    """
    # Concatenate the text of every text container on a page; other layout
    # elements (figures, lines, ...) are skipped.
    page_texts = [
        "".join(
            element.get_text()
            for element in page_layout
            if isinstance(element, LTTextContainer)
        )
        for page_layout in extract_pages(pdf_path)
    ]

    # Convert PDF pages to images
    images = convert_from_path(pdf_path)

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if len(images) != len(page_texts):
        raise ValueError(
            f"Page count mismatch for {pdf_path!r}: "
            f"{len(images)} images but {len(page_texts)} page texts"
        )

    return images, page_texts


Upload the PDF files


In [None]:
# Upload files through Colab's upload helper; the result is kept in `uploaded`.
# NOTE(review): the conversion loop in this notebook reads PDFs from the
# mounted Drive folder, not from this upload — confirm this cell is needed.
from google.colab import files
uploaded = files.upload()

Saving MiR 24V Battery Troubleshooting and Technical Guide 2.1_en.pdf to MiR 24V Battery Troubleshooting and Technical Guide 2.1_en.pdf


In [None]:
# Mount Google Drive at /content/drive/ so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# One entry per manual: a display title and the PDF's file name.
sample_pdfs = [
    {"title": title, "path": path}
    for title, path in (
        ("MiR200 Hook quick start", "MiR200 Hook quick start 1.1_en.pdf"),
        ("MiR200 Hook user guide", "MiR200 Hook user guide 3.1_en.pdf"),
        ("MiR200 Quick Start", "MiR200 Quick Start 2.3_en.pdf"),
        ("MiR200 User Guide", "MiR200 User Guide 3.3_en.pdf"),
    )
]

Now we can convert the PDFs to images and also extract the text content.


In [None]:
# Render and text-extract every listed PDF from the mounted Drive folder,
# storing the results on the entry itself under "images" and "texts".
for pdf in sample_pdfs:
    page_images, page_texts = get_pdf_images(
        f"/content/drive/MyDrive/MiR200/{pdf['path']}"
    )
    pdf.update(images=page_images, texts=page_texts)

Let us look at one of the extracted page images from the first PDF (index 15, i.e. its 16th page). This is the document-side input to ColPali, one image per page.


In [None]:
from IPython.display import display


def resize_image(image, max_height=800):
    """Shrink `image` proportionally so it is at most `max_height` tall.

    Args:
        image: Object exposing `.size` as `(width, height)` and a
            `.resize((width, height))` method.
        max_height: Height limit; images at or below it are returned as-is.

    Returns:
        The original object when no scaling is needed, otherwise the result
        of `image.resize` with both dimensions scaled by the same ratio and
        truncated to integers.
    """
    width, height = image.size
    if height <= max_height:
        return image

    ratio = max_height / height
    scaled_size = (int(width * ratio), int(height * ratio))
    return image.resize(scaled_size)


# Show page index 15 of the first PDF, scaled to at most 800 px high
# (resize_image's default). The "images" key is filled in by the PDF
# conversion loop; running this cell before that loop raises KeyError: 'images'.
display(resize_image(sample_pdfs[0]["images"][15]))

KeyError: 'images'

Let us also look at the extracted text content of the same page (index 15) of the first PDF.


In [None]:
# Text extracted for page index 15 of the first PDF.
print(sample_pdfs[0]["texts"][15])

3. States
l For MiR100 HW 2.0–2.2 and MiR200 HW 1.1–1.3, the adapter cable in the retrofit kit connects
the two pins via the CAN bus connection—see "Battery adapter cable 450687 and 450681 for
replacement batteries" on page 37. Additionally, a host detect wire must be installed also to
connect the two pins. If this wire is not installed, the battery does not enter Active state even if
the battery cable is connected to the robot. To install this wire, see the guide How to install the
host detect wire W318a in MiR100 and MiR200. You can find this guide on MiR Support Portal.
For the battery to be active, the following requirements must be met:
l The battery adapter cable must be connected to the battery and robot.
l The host detect cable must be installed—see "Host detect cable W318a for replacement
batteries" on page 38.
l The F1 relay must be on—see Figure 3.2.
l For MiR100 HW 3.0–5.0 and MiR200 HW 2.0–5.0, the adapter cable in the retrofit kit connects
the two pins via the CAN bus con

Now we use the ColQwen2 model (loaded above through `colpali_engine`) to generate embeddings of the page images.


In [None]:
# Embed every page image of every PDF and keep the per-page results on the
# PDF entry under "embeddings".
for pdf in sample_pdfs:
    # Two pages per batch, in document order; the processor builds the
    # model inputs for each batch.
    dataloader = DataLoader(
        pdf["images"],
        batch_size=2,
        shuffle=False,
        collate_fn=processor.process_images,
    )

    page_embeddings = []
    # Inference only: no gradients are needed anywhere in this loop.
    with torch.no_grad():
        for batch_doc in tqdm(dataloader):
            # Move every input tensor to the device the model lives on.
            batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}
            embeddings_doc = model(**batch_doc)
            # Split the batch output along its first dimension (one item
            # per page) after moving it back to the CPU.
            page_embeddings.extend(torch.unbind(embeddings_doc.to("cpu")))

    pdf["embeddings"] = page_embeddings

100%|██████████| 10/10 [01:56<00:00, 11.66s/it]
100%|██████████| 94/94 [17:51<00:00, 11.40s/it]
100%|██████████| 20/20 [03:49<00:00, 11.50s/it]
100%|██████████| 93/93 [17:39<00:00, 11.39s/it]


In [None]:
import base64
from PIL import Image
import torch

def encode_base64_image(image):
    """Serialize `image` as JPEG and return the bytes as a base64 string.

    Args:
        image: Object with a `save(file_obj, format=...)` method.

    Returns:
        The base64 text (decoded as UTF-8) of the JPEG bytes written by
        `image.save`.
    """
    jpeg_buffer = BytesIO()
    image.save(jpeg_buffer, format="JPEG")
    encoded = base64.b64encode(jpeg_buffer.getvalue())
    return encoded.decode("utf-8")


def decode_base64_image(base64_string):
    """Turn a base64-encoded image back into an image object.

    Args:
        base64_string: Base64 text holding the encoded image bytes.

    Returns:
        The object returned by `Image.open` on an in-memory buffer of the
        decoded bytes.
    """
    raw_bytes = base64.b64decode(base64_string)
    return Image.open(BytesIO(raw_bytes))

def unique_positive_hash_64bit(value):
    """Map `value` to a non-negative integer that fits in 64 bits.

    For `str` and `bytes` the result is a blake2b digest truncated to 8
    bytes, so it is the same in every Python process. The built-in `hash()`
    is salted per process for these types, which would make IDs derived
    from it change between sessions. Other hashable values keep the
    previous behaviour: the built-in hash masked to 64 bits.

    Args:
        value: A string, bytes object, or any other hashable value.

    Returns:
        An int in the range [0, 2**64). Collisions are possible; uniqueness
        is not guaranteed.
    """
    import hashlib  # local import keeps this cell self-contained

    if isinstance(value, str):
        data = value.encode("utf-8")
    elif isinstance(value, bytes):
        data = value
    else:
        # Treat the (possibly negative) built-in hash as 64-bit unsigned.
        return hash(value) & ((1 << 64) - 1)

    return int.from_bytes(hashlib.blake2b(data, digest_size=8).digest(), "big")

print(unique_positive_hash_64bit("example"))  # Non-negative integer within 64-bit range (uniqueness is not guaranteed)



12250240787110954365


In [None]:
import numpy as np

# Build one Qdrant point per PDF page.
# NOTE: `models` is qdrant_client.http.models, which this notebook imports
# in a later cell; that import must have run before this cell.
documents = []
for pdf in sample_pdfs:
    path, title = pdf["path"], pdf["title"]
    pages = zip(pdf["texts"], pdf["embeddings"], pdf["images"])
    for page_number, (page_text, embedding, image) in enumerate(pages):
        payload = {
            "path": path,
            "title": title,
            "page_number": page_number,
            # Page image limited to 640 px in height, stored as base64 JPEG.
            "image": encode_base64_image(resize_image(image, 640)),
            "text": page_text,
        }
        point = models.PointStruct(
            # ID derived from the file name plus the zero-based page index.
            id=unique_positive_hash_64bit(path + str(page_number)),
            # The page embedding converted to float32 nested Python lists.
            vector=embedding.cpu().float().numpy().tolist(),
            payload=payload,
        )
        documents.append(point)


In [None]:
!pip install uv
!uv pip install --system colpali_engine>=0.3.1 datasets huggingface_hub[hf_transfer] qdrant-client transformers>=4.45.0 stamina rich
from qdrant_client import QdrantClient
from qdrant_client.http import models

Collecting uv
  Downloading uv-0.4.26-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.4.26-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.7 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/13.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/13.7 MB[0m [31m40.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/13.7 MB[0m [31m79.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m10.0/13.7 MB[0m [31m91.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m13.7/13.7 MB[0m [31m118.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m13.7/13.7 MB[0m [31m118.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━

In [None]:
import os

# Name of the Qdrant collection that stores the manual pages.
collection_name = "robot-manuals"

# Prefer the QDRANT_API_KEY environment variable so the secret does not have
# to be pasted into the notebook; falls back to the previous empty default.
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", "")

# The client needs the cluster's API endpoint (scheme, host and port). The
# web-dashboard address ("/dashboard#/collections") is not an API URL, so
# that suffix has been removed.
qdrant_client = QdrantClient(
    url="https://c1c6b76d-3e82-4265-8850-542922eeb4b9.us-east4-0.gcp.cloud.qdrant.io:6333",
    api_key=QDRANT_API_KEY,
)

In [None]:
# Vector settings: 128-dimensional vectors compared with cosine distance,
# stored as multivectors that are compared with the MAX_SIM comparator.
# NOTE(review): the collection-creation cell in this notebook builds its own
# VectorParams inline, so this variable appears to be unused — confirm.
vector_params = models.VectorParams(
    size=128,
    distance=models.Distance.COSINE,
    multivector_config=models.MultiVectorConfig(
        comparator=models.MultiVectorComparator.MAX_SIM
    ),
)

In [None]:
# INT8 scalar quantization settings with quantile 0.99 and always_ram=False.
# NOTE(review): the collection-creation cell in this notebook builds its own
# quantization config inline (with always_ram=True), so this variable appears
# to be unused — confirm which setting is intended.
scalar_quant = models.ScalarQuantizationConfig(
    type=models.ScalarType.INT8,
    quantile=0.99,
    always_ram=False,
)

In [None]:
# Start from an empty collection. `recreate_collection` is deprecated in
# qdrant-client, so drop any existing collection explicitly and create it again.
if qdrant_client.collection_exists(collection_name=collection_name):
    qdrant_client.delete_collection(collection_name=collection_name)

qdrant_client.create_collection(
    collection_name=collection_name,  # the name of the collection
    on_disk_payload=True,  # store the payload on disk
    optimizers_config=models.OptimizersConfigDiff(
        indexing_threshold=100
    ),  # it can be useful to switch this off when doing a bulk upload and then manually trigger the indexing once the upload is done
    vectors_config=models.VectorParams(
        size=128,
        distance=models.Distance.COSINE,
        # Each point holds several vectors, compared with MAX_SIM.
        multivector_config=models.MultiVectorConfig(
            comparator=models.MultiVectorComparator.MAX_SIM
        ),
        # INT8 scalar quantization with always_ram=True.
        quantization_config=models.ScalarQuantization(
            scalar=models.ScalarQuantizationConfig(
                type=models.ScalarType.INT8,
                quantile=0.99,
                always_ram=True,
            ),
        ),
    ),
)

  qdrant_client.recreate_collection(


True

In [None]:
import stamina


@stamina.retry(on=Exception, attempts=3)
def _upsert_with_retry(batch):
    """Send one batch to Qdrant; any exception triggers a retry (3 attempts)."""
    qdrant_client.upsert(
        collection_name=collection_name,
        points=batch,
        wait=False,
    )


def upsert_to_qdrant(batch):
    """Upsert `batch` into the collection, retrying on failure.

    The retry decorator sits on an inner helper that lets exceptions
    propagate; catching them inside the decorated function would hide every
    failure from the retry logic.

    Args:
        batch: Points to upsert. This argument is what gets sent; the
            function does not depend on any module-level `points` variable.

    Returns:
        True if the upsert call succeeded, False if every attempt raised
        (the last error is printed).
    """
    try:
        _upsert_with_retry(batch)
    except Exception as e:  # deliberate boundary: report and signal failure
        print(f"Error during upsert: {e}")
        return False
    return True

In [None]:
# Inspect the ID assigned to the first point in `documents`.
documents[0].id


3521001740918857957

In [None]:
batch_size = 6  # Number of points sent to Qdrant per upsert request

failed_batches = 0

# Use tqdm to create a progress bar
with tqdm(total=len(documents), desc="Upserting Progress") as pbar:
    for i in range(0, len(documents), batch_size):
        # Keep the module-level name `points`: other cells may read it.
        points = documents[i : i + batch_size]

        # Upload points to Qdrant. A failure is signalled either by a False
        # return value or by an exception; both are counted.
        try:
            succeeded = upsert_to_qdrant(points)
        except Exception as e:
            print(f"Error during upsert: {e}")
            succeeded = False

        if not succeeded:
            failed_batches += 1

        # Advance by the real batch length so a short final batch does not
        # push the bar past its total.
        pbar.update(len(points))

if failed_batches:
    print(f"Upserting finished with {failed_batches} failed batch(es).")
else:
    print("Upserting complete!")

Upserting Progress: 100%|██████████| 432/432 [01:25<00:00,  5.08it/s]

Upserting complete!



