# OCR and RAG

[RAG on PDF from OpenAI](https://cookbook.openai.com/examples/parse_pdf_docs_for_rag) - Best results

In [None]:
# Imports
from pdf2image import convert_from_path
from pdfminer.high_level import extract_text
import base64
import io
import os
import concurrent.futures
from tqdm import tqdm
from openai import OpenAI
import re
import json
from rich import print

In [None]:
from dotenv import load_dotenv
# Load environment variables from the file named "env" in the current working directory.
# NOTE(review): the path has no leading dot; if the file is actually called ".env",
# this points at the wrong file — confirm.
load_dotenv(dotenv_path=os.getcwd()+"/env")

# Get the OpenAI API key (read from the OPENAI_API_KEY_DRACO variable)
openai_api_key = os.getenv('OPENAI_API_KEY_DRACO')
# Initializing OpenAI client - see https://platform.openai.com/docs/quickstart?context=python
client = OpenAI(api_key=openai_api_key)
# Model used by analyze_image() for the page-by-page OCR
openai_model="gpt-4o"
# Smaller model used for the clean-up and translation prompts
openai_small_model='gpt-4o-mini'

# Folder scanned for input documents
files_path = "pdfs"
# Individual documents handed to the OCR tools tried further down
pdf_file="HardyCross.pdf"
contract='Contract.pdf'

In [None]:
# Output locations: each OCR experiment writes its markdown result to the user's Desktop
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")

(
    file_path_clean,
    file_path_docling,
    file_path_pytesseract,
    file_path_mistral_ocr,
) = (
    os.path.join(desktop_path, md_name)
    for md_name in ("cleaned_content.md", "docling.md", "pytesseract.md", "mistral_ocr.md")
)

In [4]:
def convert_doc_to_images(path):
    """Return whatever ``convert_from_path`` produces for the document at ``path``."""
    return convert_from_path(path)

def extract_text_from_doc(path):
    """Return the text that ``extract_text`` reads from the document at ``path``."""
    return extract_text(path)

# Encode an image as a base64 PNG data URI, the form used for inline images with the ChatCompletions API
def get_img_uri(img):
    """Return ``img`` serialized as a ``data:image/png;base64,...`` URI.

    ``img`` must expose ``save(file_obj, format=...)``; it is written out as PNG
    into an in-memory buffer, and the buffer's bytes are base64-encoded.
    """
    with io.BytesIO() as png_stream:
        img.save(png_stream, format="PNG")
        encoded_png = base64.b64encode(png_stream.getvalue()).decode('utf-8')
    return "data:image/png;base64," + encoded_png

In [None]:
def analyze_doc_image(img):
    """Encode one page image as a data URI and return the model's analysis of it."""
    return analyze_image(get_img_uri(img))


def analyze_image(data_uri, max_tokens=500):
    """Send one page image to the vision model and return its reply text.

    Args:
        data_uri: Image encoded as a ``data:image/...;base64,...`` URI.
        max_tokens: Completion budget for the reply. The default (500) keeps the
            previously hard-coded value; raise it for dense pages, whose
            transcription would otherwise be cut off at the limit.

    Returns:
        The message content of the first completion choice.

    Reads the notebook-level ``client``, ``openai_model`` and ``system_prompt``
    at call time.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"{data_uri}"},
                }
            ],
        },
    ]
    response = client.chat.completions.create(
        model=openai_model,
        messages=messages,
        max_tokens=max_tokens,
        # Low-variance sampling: the task is transcription, not generation
        temperature=0,
        top_p=0.1,
    )
    return response.choices[0].message.content

In [10]:
system_prompt="""
Parse with OCR the contents of this page extracted from a PDF file. Include all the information, including equations in markdown format. 
If there is a latex equation, convert it to markdown format by including it between $$

# Describe visual elements in detail (but include all the information and text of the original image as is):

# - **Diagrams**: Explain each component and how they interact. For example, "The process begins with X, which then leads to Y and results in Z.". 
Include relevant information such as arrays that show direction.
  
# - **Tables**: Break down the information logically. For instance, "Product A costs X dollars, while Product B is priced at Y dollars."

# Focus on the content itself rather than the format:

# - **DO NOT** include terms referring to the content format.
  
# - **DO NOT** mention the content type. Instead, directly discuss the information presented.

# Keep your explanation comprehensive yet concise:

# - Be exhaustive in describing the content, as your audience cannot see the image.
  
# - Exclude irrelevant details such as page numbers or the position of elements on the image.
"""

In [30]:
# Only PDFs are collected: every file found here is later read with pdfminer's extract_text
# and pdf2image's convert_from_path, neither of which can open .doc/.docx/.txt files.
# Sorting makes the processing order reproducible (os.listdir returns entries in arbitrary order).
all_items = sorted(os.listdir(files_path))
files = [
    item for item in all_items
    if os.path.isfile(os.path.join(files_path, item)) and item.lower().endswith('.pdf')
]

In [None]:
docs = []

for f in files:

    path = os.path.join(files_path, f)
    doc = {
        "filename": f
    }
    doc['text'] = extract_text_from_doc(path)
    imgs = convert_doc_to_images(path)

    print(f"Analyzing pages for doc {f}")

    # Pages are analyzed concurrently; every page is submitted (none is skipped)
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        futures = [executor.submit(analyze_doc_image, img) for img in imgs]

        # The progress bar advances as pages finish, in completion order
        with tqdm(total=len(imgs)) as pbar:
            for _ in concurrent.futures.as_completed(futures):
                pbar.update(1)

        # Results are read in submission order so descriptions stay aligned with page order.
        # The loop variable is not named `f`, to avoid clobbering the filename of the outer loop.
        pages_description = [future.result() for future in futures]

    doc['pages_description'] = pages_description
    docs.append(doc)

docs

In [32]:
# Chunking content by page and merging each page's extracted text with the model description
# whose first line matches the page's first line
content = []
for doc in docs:
    # The extracted text is split on form feeds; every resulting page is kept
    text = doc['text'].split('\f')
    description = doc['pages_description']
    used_descriptions = set()
    for page_text in text:
        slide_content = page_text + '\n'
        # Trying to find a description with the same title (first line, case-insensitive)
        slide_title = page_text.split('\n')[0]
        for j, page_description in enumerate(description):
            description_title = page_description.split('\n')[0]
            if slide_title.lower() == description_title.lower():
                # Drop only the leading title (count=1): the page text already carries it,
                # and later mentions of the same words inside the body must survive
                slide_content += page_description.replace(description_title, '', 1)
                # Keeping track of the descriptions added
                used_descriptions.add(j)
        # Adding the page text + matching description to the content pieces
        content.append(slide_content)
    # Adding the descriptions that weren't matched to any page
    for j, page_description in enumerate(description):
        if j not in used_descriptions:
            content.append(page_description)

In [33]:
# Cleaning up content
# Removing trailing spaces, additional line breaks, page numbers and references to the content being a slide
def _clean_page_text(raw_text):
    """Return ``raw_text`` normalized for the clean-up prompt.

    - Trailing spaces/tabs before a newline are removed; the newline itself is
      kept, so words on consecutive lines are not glued together.
    - Runs of two or more newlines collapse to a single newline.
    - Lines consisting solely of a 1-2 digit number (page numbers) are dropped;
      lines that merely start with digits (e.g. "100 L/s") are left intact.
    - Phrases such as "this slide shows" are removed.
    """
    text = re.sub(r"[ \t]+\n", "\n", raw_text)
    text = re.sub(r"\n{2,}", "\n", text).strip()
    text = re.sub(r"(?m)^\d{1,2}[ \t]*(?:\n|\Z)", "", text)
    text = re.sub(r"\b(?:the|this)\s*slide\s*\w+\b", "", text, flags=re.IGNORECASE)
    return text


clean_content = [_clean_page_text(c) for c in content]

In [34]:
# Separate chunks with a blank line: each cleaned chunk has been stripped, so joining with ''
# would fuse the last word of one chunk with the first word of the next.
merged_string = '\n\n'.join(clean_content)

In [35]:
clean_prompt=f"""
In the following text the equations that should be shown in an markdown file are not rendered properly. Return a version where the errors in formatting are fixed and the equations 
can be displayed properly. Convert latex block equations to markdown version by including them between $$...$$. For inline math use $...$.
Return only the content that should be directly copy pasted in a md file. Do not return it in quotes, just plain code.
Text: {merged_string}
"""

In [None]:
messages = [
    {"role": "user", "content": clean_prompt}
]

# Ask OpenAI to repair the markdown/equation formatting, using a client wrapped with LangSmith's wrap_openai.
# NOTE: this rebinds the notebook-level `client` that analyze_image() reads at call time.
import openai
from langsmith.wrappers import wrap_openai
client = wrap_openai(openai.Client(api_key=openai_api_key))

response = client.chat.completions.create(
    messages=messages,
    temperature=0,
    model=openai_small_model,
    seed=42
)

answer = response.choices[0].message.content.strip()

# Explicit UTF-8: the answer contains math symbols that the platform-default encoding may not represent
with open(file_path_clean, "w", encoding="utf-8") as f:
    f.write(answer)

# Saving the raw per-document extraction (docs) to file for later
json_path = "parsed_pdf_docs.json"

with open(json_path, 'w', encoding="utf-8") as f:
    json.dump(docs, f)

OCR tools

[Marker (marker-pdf)](https://github.com/VikParuchuri/marker) - very good!

In [None]:
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

# Build the converter with the artifacts returned by create_model_dict()
converter = PdfConverter(
    artifact_dict=create_model_dict(),
)
# pdf_file is passed as-is, i.e. resolved against the working directory (not files_path)
rendered = converter(pdf_file)
# Unpack the rendered result; only the first and third values are kept
text, _, images = text_from_rendered(rendered)

In [None]:
text

In [None]:
images

If we wanted to change the image paths and display the images in the Markdown output too - Markdown inside Jupyter cells doesn't support that

In [None]:
# import os, re
# from IPython.display import display, Markdown

# def display_hardy_cross_with_images(markdown_text, image_dict=None, output_dir='/img'):
#     if image_dict is None:
#         image_dict = {}

#     os.makedirs(output_dir, exist_ok=True)

#     def replace_image(match):
#         image_path = match.group(1)
#         image_name = os.path.basename(image_path)

#         if image_name in image_dict:
#             save_path = os.path.join(output_dir, image_name)
#             image_dict[image_name].save(save_path)
#             return f" ![{image_name}]({save_path})"
#         else:
#             return f"[Image: {image_name} not found]"

#     processed_text = re.sub(r'!\[\]\(([^)]+)\)', replace_image, markdown_text)
#     display(Markdown(processed_text))
#     return processed_text

# display_hardy_cross_with_images(rendered.markdown, image_dict=images)

[Docling](https://github.com/docling-project/docling) (ok for contract, but not good for pdf with images and/or equations) - install with ```pip install docling```

In [None]:
from docling.document_converter import DocumentConverter

# source = "H"  # document per local path or URL
converter = DocumentConverter()
result = converter.convert(contract)

# NOTE: merged_string and clean_prompt reuse (and overwrite) the names from the clean-up step above
merged_string = result.document.export_to_markdown()

clean_prompt=f"""
Translate the following text from Dutch to English
Text: {merged_string}
"""
messages = [
    {"role": "user", "content": clean_prompt}
]

# Translate the docling markdown with OpenAI, using a client wrapped with LangSmith's wrap_openai.
# NOTE: this rebinds the notebook-level `client` that analyze_image() reads at call time.
import openai
from langsmith.wrappers import wrap_openai
client = wrap_openai(openai.Client(api_key=openai_api_key))

response = client.chat.completions.create(
    messages=messages,
    temperature=0,
    model=openai_small_model,
    seed=42
)

answer = response.choices[0].message.content.strip()

# Explicit UTF-8 so the output does not depend on the platform-default encoding
with open(file_path_docling, "w", encoding="utf-8") as f:
    f.write(answer)

Pytesseract - Not good for equations

In [38]:
import pytesseract
from pdf2image import convert_from_path

# NOTE: `images` reuses (and overwrites) the name produced by the Marker cell
images = convert_from_path(pdf_file)

# One OCR pass per page; each page's text is followed by a blank line
ocr_text = "".join(
    pytesseract.image_to_string(img, lang="eng") + "\n\n" for img in images
)

# Pass the OCR text through markdownify, then save it
import markdownify

markdown_text = markdownify.markdownify(ocr_text)

# Explicit UTF-8 so the output does not depend on the platform-default encoding
with open(file_path_pytesseract, "w", encoding="utf-8") as f:
    f.write(markdown_text)

[Mistral OCR](https://docs.mistral.ai/capabilities/document/) (really bad for equations, ok for contract but still errors in format and gpt4o better) - install with ```pip install mistralai```

In [4]:
import os
from mistralai import Mistral

# API key is read from the MISTRAL_API_KEY environment variable
mistral_api_key = os.getenv('MISTRAL_API_KEY')

# NOTE: this rebinds the notebook-level `client`, which analyze_image() reads at call time;
# after running this cell, the OpenAI client must be re-created before re-running the OpenAI page analysis.
client = Mistral(api_key=mistral_api_key)

In [14]:
# Upload the contract for OCR; the context manager closes the file handle once the upload call returns
with open(contract, "rb") as contract_file:
    uploaded_pdf = client.files.upload(
        file={
            "file_name": contract,
            "content": contract_file,
        },
        purpose="ocr"
    )

# Signed URL through which the OCR endpoint fetches the uploaded file
signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)

In [15]:
# Run the "mistral-ocr-latest" model on the uploaded document, addressed through its signed URL
ocr_response = client.ocr.process(
    model="mistral-ocr-latest",
    document={
        "type": "document_url",
        "document_url": signed_url.url,
    }
)

In [16]:
# Save the markdown of every OCR'd page (not just pages[0]), pages separated by a blank line
import markdownify
ocr_markdown = "\n\n".join(page.markdown for page in ocr_response.pages)
# NOTE(review): the OCR response is already markdown, while markdownify converts HTML to markdown;
# this pass may escape characters such as `_` and `*` — confirm it is wanted.
markdown_text = markdownify.markdownify(ocr_markdown)

# Explicit UTF-8 so the output does not depend on the platform-default encoding
with open(file_path_mistral_ocr, "w", encoding="utf-8") as f:
    f.write(markdown_text)

In [43]:
# # For an online doc
# ocr_response = client.ocr.process(
#     model="mistral-ocr-latest",
#     document={
#         "type": "document_url",
#         "document_url": "https://arxiv.org/pdf/2201.04234"
#     },
#     include_image_base64=True
# )

# # For an online image
# ocr_response = client.ocr.process(
#     model="mistral-ocr-latest",
#     document={
#         "type": "image_url",
#         "image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png"
#     }
# )

[surya-ocr](https://github.com/VikParuchuri/surya) - not that good, fails for equations

In [None]:
from PIL import Image
from surya.recognition import RecognitionPredictor
from surya.detection import DetectionPredictor

image = Image.open("test.png") # image extracted from HardyCross.pdf, containing equations
langs = ["en"] # Replace with your languages or pass None (recommended to use None)
recognition_predictor = RecognitionPredictor()
detection_predictor = DetectionPredictor()

# One image and one language list per batch entry; the detection predictor is handed to the recognizer
predictions = recognition_predictor([image], [langs], detection_predictor)

In [None]:
[x.text for x in predictions[0].text_lines]

[MGP Alibaba](https://huggingface.co/alibaba-damo/mgp-str-base) - very bad, just one word output

In [None]:
from transformers import MgpstrProcessor, MgpstrForSceneTextRecognition
import requests
from PIL import Image

# Processor and model both come from the 'alibaba-damo/mgp-str-base' checkpoint
processor = MgpstrProcessor.from_pretrained('alibaba-damo/mgp-str-base')
model = MgpstrForSceneTextRecognition.from_pretrained('alibaba-damo/mgp-str-base')

# Alternative input: load an image from the IIIT-5k dataset
# url = "https://i.postimg.cc/ZKwLg2Gw/367-14.png"
# image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
image = Image.open("test.png").convert("RGB")


pixel_values = processor(images=image, return_tensors="pt").pixel_values
outputs = model(pixel_values)

# Decode the logits and keep the 'generated_text' entry; shown as the cell output
generated_text = processor.batch_decode(outputs.logits)['generated_text']
generated_text

[LatexOCR](https://github.com/lukas-blecher/LaTeX-OCR) - Bad, very old implementation that doesn't output correct equations

In [None]:
from PIL import Image
from pix2tex.cli import LatexOCR

img = Image.open('test.png') # image extracted from HardyCross.pdf, containing equations
model = LatexOCR()
# The model object is called directly on the image; its result is printed
print(model(img))

# Didn't run in MacOS - RunPod used

[Zerox](https://github.com/getomni-ai/zerox) - not working in RunPod either

In [None]:
!pip install py-zerox

In [None]:
from pyzerox import zerox
import os
import json
import asyncio

### Model Setup (Use only Vision Models) Refer: https://docs.litellm.ai/docs/providers ###

## placeholder for additional model kwargs which might be required for some models
kwargs = {}

## system prompt to use for the vision model
custom_system_prompt = None

# to override
# custom_system_prompt = "For the below PDF page, do something..something..." ## example

###################### Example for OpenAI ######################
os.environ["OPENAI_API_KEY"] = openai_api_key ## your-api-key

# ###################### Example for Azure OpenAI ######################
# model = "azure/gpt-4o-mini" ## "azure/<your_deployment_name>" -> format <provider>/<model>
# os.environ["AZURE_API_KEY"] = "" # "your-azure-api-key"
# os.environ["AZURE_API_BASE"] = "" # "https://example-endpoint.openai.azure.com"
# os.environ["AZURE_API_VERSION"] = "" # "2023-05-15"

# ###################### Example for Gemini ######################
# model = "gemini/gpt-4o-mini" ## "gemini/<gemini_model>" -> format <provider>/<model>
# os.environ['GEMINI_API_KEY'] = "" # your-gemini-api-key

# ###################### Example for Anthropic ######################
# model="claude-3-opus-20240229"
# os.environ["ANTHROPIC_API_KEY"] = "" # your-anthropic-api-key

# ###################### Vertex ai ######################
# model = "vertex_ai/gemini-1.5-flash-001" ## "vertex_ai/<model_name>" -> format <provider>/<model>
# ## GET CREDENTIALS
# ## RUN ##
# # !gcloud auth application-default login - run this to add vertex credentials to your env
# ## OR ##
# file_path = 'path/to/vertex_ai_service_account.json'

# # Load the JSON file
# with open(file_path, 'r') as file:
#     vertex_credentials = json.load(file)

# # Convert to JSON string
# vertex_credentials_json = json.dumps(vertex_credentials)

# vertex_credentials=vertex_credentials_json

# ## extra args
# kwargs = {"vertex_credentials": vertex_credentials}

###################### For other providers refer: https://docs.litellm.ai/docs/providers ######################

# Define main async entrypoint
async def main():
    """Run zerox on ``pdf_file`` with the notebook's OCR ``system_prompt`` and return its result.

    Reads the notebook-level ``pdf_file``, ``openai_small_model``,
    ``system_prompt`` and ``kwargs``.
    """
    file_path = pdf_file ## local filepath and file URL supported

    ## process only some pages or all
    select_pages = None ## None for all, but could be int or list(int) page numbers (1 indexed)

    output_dir = "./output_test" ## directory to save the consolidated markdown file
    result = await zerox(file_path=file_path, model=openai_small_model, output_dir=output_dir,
                        custom_system_prompt=system_prompt,select_pages=select_pages, **kwargs)
    return result


# Run the main function.
# Jupyter already runs an event loop in the main thread, where asyncio.run() raises
# "RuntimeError: asyncio.run() cannot be called from a running event loop".
# Driving the coroutine from a worker thread gives it a fresh loop, which works both
# inside a notebook and in a plain script.
import concurrent.futures

with concurrent.futures.ThreadPoolExecutor(max_workers=1) as _runner:
    result = _runner.submit(asyncio.run, main()).result()

# print markdown result
print(result)


[Ollama OCR](https://github.com/imanoop7/Ollama-OCR) - install with ```pip install ollama-ocr``` - not great, it's chat with a document not just ocr, gives errors in results and takes 1 min per page with parallel processing

In [None]:
from ollama_ocr import OCRProcessor

# Initialize OCR processor
ocr = OCRProcessor(model_name='granite3.2-vision', base_url="http://host.docker.internal:11434/api/generate")  # You can use any vision model available on Ollama
# you can pass your custom ollama api llama3.2-vision:11b

# Process the document
# NOTE(review): pdf_file (a PDF) is passed to process_image — confirm the library accepts PDFs here
result = ocr.process_image(
    image_path=pdf_file, # path to your pdf files "path/to/your/file.pdf"
    format_type="markdown",  # Options: markdown, text, json, structured, key_value
    custom_prompt=system_prompt,#"Extract all text, focusing on dates and names.", # Optional custom prompt
    language="English" # Specify the language of the text (New! 🆕)
)
print(result)

[X-PLUG mPLUG-DocOwl](https://github.com/X-PLUG/mPLUG-DocOwl) and the mPLUG-Owl repo cloned below [here](https://github.com/X-PLUG/mPLUG-Owl) - not good, just a vision-LLM on an image

In [None]:
!git clone https://github.com/X-PLUG/mPLUG-Owl.git

In [None]:
!pip install --upgrade pip

In [None]:
cd mPLUG-Owl/mPLUG-Owl2/

In [None]:
!pip install -e .

In [None]:
!pip install sentencepiece

In [None]:
import torch
from PIL import Image
from transformers import TextStreamer

from mplug_owl2.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from mplug_owl2.conversation import conv_templates, SeparatorStyle
from mplug_owl2.model.builder import load_pretrained_model
from mplug_owl2.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria

image_file = 'test.png' # Path of the image extracted from HardyCross.pdf, containing equations
model_path = 'MAGAer13/mplug-owl2-llama2-7b'
query = "Describe the image. Give all the equations from it"

# Load the model in 8-bit on the CUDA device (a GPU is required)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name, load_8bit=True, load_4bit=False, device="cuda")

conv = conv_templates["mplug_owl2"].copy()
roles = conv.roles

image = Image.open(image_file).convert('RGB')
max_edge = max(image.size) # We recommend resizing to a square image for BEST performance.
# Both sides are set to the longer edge, so a non-square image is stretched rather than padded
image = image.resize((max_edge, max_edge))

image_tensor = process_images([image], image_processor)
image_tensor = image_tensor.to(model.device, dtype=torch.float16)

# Build the prompt: image token followed by the query, then an empty turn for the second role
inp = DEFAULT_IMAGE_TOKEN + query
conv.append_message(conv.roles[0], inp)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
# The conversation template's second separator is used as the stop keyword
stop_str = conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Sampling is enabled below (do_sample=True), so the output varies between runs
temperature = 0.7
max_new_tokens = 512

with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        images=image_tensor,
        do_sample=True,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        streamer=streamer,
        use_cache=True,
        stopping_criteria=[stopping_criteria])

# Decode only the tokens after the prompt
outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
print(outputs)

[OlmOCR](https://github.com/allenai/olmocr) - Demo seems good, decent in general but not perfect for equations, GPU needed, python=3.11, can't run in MacOS, english only

In [None]:
# !apt-get update -y
# !apt-get install poppler-utils ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools -y

In [None]:
!git clone https://github.com/allenai/olmocr.git

In [None]:
cd olmocr

In [None]:
# !pip install -e .[gpu] --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/

In [None]:
!python -m olmocr.pipeline /workspace --pdfs /workspace/HardyCross.pdf

[GOT OCR](https://huggingface.co/stepfun-ai/GOT-OCR2_0) - not good, version from [Git](https://github.com/Ucas-HaoranWei/GOT-OCR2.0.git) doesn't work

In [None]:
# !pip install torch==2.0.1 torchvision==0.15.2 transformers==4.37.2 tiktoken==0.6.0 verovio==4.3.1 accelerate==0.28.0

In [None]:
from transformers import AutoModel, AutoTokenizer

# trust_remote_code=True runs model code shipped with the 'ucaslcl/GOT-OCR2_0' checkpoint
tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
# Switch to eval mode and place the model on the GPU (a CUDA device is required)
model = model.eval().cuda()


# input your test image
image_file = "/workspace/test.png" # image of the first page of HardyCross.pdf, containing equations

# plain texts OCR
res = model.chat(tokenizer, image_file, ocr_type='ocr')

# format texts OCR:
# res = model.chat(tokenizer, image_file, ocr_type='format')

# fine-grained OCR:
# res = model.chat(tokenizer, image_file, ocr_type='ocr', ocr_box='')
# res = model.chat(tokenizer, image_file, ocr_type='format', ocr_box='')
# res = model.chat(tokenizer, image_file, ocr_type='ocr', ocr_color='')
# res = model.chat(tokenizer, image_file, ocr_type='format', ocr_color='')

# multi-crop OCR:
# res = model.chat_crop(tokenizer, image_file, ocr_type='ocr')
# res = model.chat_crop(tokenizer, image_file, ocr_type='format')

# render the formatted OCR results:
# res = model.chat(tokenizer, image_file, ocr_type='format', render=True, save_render_file = './demo.html')

print(res)

[Nougat OCR](https://facebookresearch.github.io/nougat/) - old implementation and not good

In [None]:
# !pip install nougat-ocr

In [None]:
# !pip install albumentations==1.0.0 transformers==4.38.2

In [None]:
!nougat /workspace/HardyCross.pdf -o /workspace

[MegaParse](https://github.com/QuivrHQ/MegaParse) - Decent but needs an extra LLM round to convert to proper Markdown equations

In [None]:
from megaparse.parser.megaparse_vision import MegaParseVision
from langchain_openai import ChatOpenAI

import os
from dotenv import load_dotenv
# Load environment variables from the file named "env" in the current working directory
load_dotenv(dotenv_path=os.getcwd()+"/env")

# NOTE: openai_model and openai_api_key come from the configuration cell near the top of the notebook
model = ChatOpenAI(model=openai_model, api_key=openai_api_key)  # type: ignore
parser = MegaParseVision(model=model)
# The PDF path is hard-coded here rather than taken from pdf_file
response = parser.convert("./HardyCross.pdf")
print(response)

In [None]:
from IPython.display import display, Markdown
display(Markdown(response.content[0].text))

# Tools

[Smolagents](https://huggingface.co/docs/smolagents/en/tutorials/secure_code_execution) - Excellent code agent (use cheap model since uses a lot of tokens, even for simple tasks due to many iterations)

In [None]:
# !pip install smolagents
# !pip install torch
# !pip install colpali_engine
# !pip install 'smolagents[e2b]'
# !pip install openpyxl==3.1.5

Run smolagents in an isolated environment with secure code execution - For Models check [here](https://huggingface.co/docs/smolagents/en/reference/models)

| Feature            | `executor_type="e2b"`                     | `Sandbox()` (local)               |
|--------------------|-------------------------------------------|-----------------------------------|
| Execution Location | Cloud container                          | Local Python environment          |
| Isolation          | High                                      | Medium (unless manually hardened) |
| Setup Overhead     | Needs E2B setup & keys                    | None                              |
| Latency            | Slightly higher due to remote execution  | Low                               |
| API Credentials    | Must be injected                          | Local env accessible              |
| Scaling            | Cloud-native scaling                      | Limited by local resources        |
| Best For           | Untrusted code, scalable production       | Local dev, fast prototyping       |

In [2]:
from dotenv import load_dotenv
import os
# Load environment variables from .env file
load_dotenv(dotenv_path=os.getcwd()+"/env")
openai_api_key = os.getenv('OPENAI_API_KEY_DRACO')

The approach below prints all the steps/planning of the model and any errors as streaming responses; it sometimes seems to need fewer tokens for the same task compared to the sandbox approach

In [None]:
# #This for using together AI - needs credits
# from smolagents import InferenceClientModel, CodeAgent
# agent = CodeAgent(model=InferenceClientModel(), tools=[], executor_type="e2b")
# agent.run("Can you give me the 100th Fibonacci number?")

# base64, openpyxl and StringIO are only needed by the commented-out export code at the end of this cell
import base64, openpyxl
from io import StringIO
from smolagents import CodeAgent
from smolagents import OpenAIServerModel

model = OpenAIServerModel(
    model_id="gpt-4o-mini",  # or "gpt-3.5-turbo"
    api_base="https://api.openai.com/v1",
    api_key=openai_api_key,
)

# executor_type="e2b": see the comparison table above (cloud container vs. local environment)
agent = CodeAgent(
    model=model,
    tools=[],
    executor_type="e2b"
)

b64_data = agent.run(
    # """Create a DataFrame with one column and 60 rows containing the first 60 Fibonacci numbers. 
    # Return the file contents as base64 so I can download it."""
    """
    Using Python code and relevant tools (e.g., networkx), solve the following problem: 
    
    DESCRIPTION: Consider a pipe network consisting of four junctions arranged in a diamond shape:
    junction A is on the left, junction B is at the top, junction C is at the bottom, and junction D is at the right. There's also a vertical pipe connecting junction B (top) directly 
    with junction C (bottom), dividing the network into two loops. The flows at the junctions are as follows: junction A has an inflow of 100 L/s, junction B has an outflow of 25 L/s, 
    and junction D has an outflow of 75 L/s. The pipes, their resistances (K values), and initial guesses for flow (Q) are given as follows: Pipe AB connects junctions A and B, 
    has K = 1, and an initial assumed flow of 60 L/s directed from A to B. Pipe BC connects junctions B and C (vertical pipe), has K = 3, and an initial assumed flow of 15 L/s directed
    from B to C. Pipe AC connects junctions A and C, has K = 2, and an initial assumed flow of 40 L/s directed from A to C. Pipe BD connects junctions B and D, has K = 2, and an initial 
    assumed flow of 20 L/s directed from B to D. Pipe CD connects junctions C and D, has K = 1, and an initial assumed flow of 55 L/s directed from C to D. Two loops are defined for the
    Hardy Cross method. Loop 1 is the left loop formed by junctions A-B-C-A, with clockwise flow direction taken as positive. Loop 2 is the right loop formed by junctions B-D-C-B, 
    with clockwise flow direction also taken as positive.

    TASK: Your task is to perform one iteration of the Hardy Cross method calculations for both loops, and report the corrected flows in each pipe after one iteration, clearly stating
    the direction and magnitude of the flows. OUTPUT FORMAT: The output should be formatted as a JSON object, where each key is a pipe ID and each value is the corrected flow in L/s after 
    one iteration. The sign of the flow indicates the direction, where a positive value means the flow is in the same direction as initially assumed, while a negative value means the flow 
    has reversed direction relative to the initial assumption. Return only the JSON object without any other information.
    
    """
) # assuming it returns the base64 string

# decoded = base64.b64decode(b64_data).decode('utf-8')
# with open('fibonacci.csv', 'w', encoding='utf-8') as f:
#     f.write(decoded)

# lines = decoded.strip().splitlines()
# wb = openpyxl.Workbook()
# ws = wb.active
# for i, line in enumerate(lines, start=1):
#     ws.cell(row=i, column=1).value = line

# wb.save('fibonacci.xlsx')

In [None]:
import ast
import json

# SECURITY: the agent's answer is model-generated text, so it is parsed as data instead of being
# passed to eval() (which would execute arbitrary code). JSON is tried first, as requested in the
# prompt; a Python-literal dict (e.g. single-quoted keys) is accepted as a fallback.
if isinstance(b64_data, str):
    try:
        parsed_flows = json.loads(b64_data)
    except json.JSONDecodeError:
        parsed_flows = ast.literal_eval(b64_data)
else:
    # agent.run() may already return a parsed object
    parsed_flows = b64_data
parsed_flows

The approach below is better since we have the output in one variable and the execution steps/planning of the model in another - it only prints at the end

In [None]:
from e2b_code_interpreter import Sandbox
import os

# Create the sandbox
sandbox = Sandbox()

# Install required packages
sandbox.commands.run("pip install smolagents")
sandbox.commands.run("pip install 'smolagents[openai]'") #to use openai model

def run_code_raise_errors(sandbox, code: str, verbose: bool = False) -> tuple:
    """Execute ``code`` in ``sandbox`` and return its stdout together with the execution.

    Args:
        sandbox: Object exposing ``run_code(code, envs=...)``.
        code: Source code to execute inside the sandbox.
        verbose: Accepted for compatibility; currently not used.

    Returns:
        A ``(stdout_text, execution)`` tuple, where ``stdout_text`` is the
        captured stdout entries joined with newlines and ``execution`` is the
        object returned by ``sandbox.run_code``.

    Raises:
        ValueError: If the execution reports an error; the message is the
            captured stdout followed by the error traceback.

    The notebook-level ``openai_api_key`` is injected into the sandbox as
    ``OPENAI_API_KEY``.
    """
    execution = sandbox.run_code(
        code,
        envs={#'HF_TOKEN': os.getenv('HF_TOKEN'),
              'OPENAI_API_KEY': openai_api_key}
    )
    stdout_text = "\n".join(str(log) for log in execution.logs.stdout)
    if execution.error:
        raise ValueError(stdout_text + execution.error.traceback)
    return stdout_text, execution

# #For inference with an open-source model using together AI, replace below with:
# from smolagents import CodeAgent, InferenceClientModel

# # Initialize the agents
# agent = CodeAgent(
#     model=InferenceClientModel(token=os.getenv("HF_TOKEN"), provider="together"),
#     tools=[],
#     name="coder_agent",
#     description="This agent takes care of your difficult algorithmic problems using code."
# )

# manager_agent = CodeAgent(
#     model=InferenceClientModel(token=os.getenv("HF_TOKEN"), provider="together"),
#     tools=[],
#     managed_agents=[agent],
# )

# Define your agent application
agent_code = """
import os
from smolagents import CodeAgent
from smolagents import OpenAIServerModel

model = OpenAIServerModel(
    model_id="gpt-4o-mini",
    api_base="https://api.openai.com/v1",
    api_key=os.getenv('OPENAI_API_KEY'),
)

# Initialize the agents
agent = CodeAgent(
    model=model,
    tools=[],
    name="coder_agent",
    description="This agent takes care of your difficult algorithmic problems using code.",
    additional_authorized_imports=['json']
)

manager_agent = CodeAgent(
    model=model,
    tools=[],
    managed_agents=[agent],
    additional_authorized_imports=['json']
)

# Run the agent
response = manager_agent.run(" Using Python code and relevant tools (e.g., networkx), solve the following problem: \
    DESCRIPTION: Consider a pipe network consisting of four junctions arranged in a diamond shape: \
    junction A is on the left, junction B is at the top, junction C is at the bottom, and junction D is at the right. There's also a vertical pipe connecting junction B (top) directly \
    with junction C (bottom), dividing the network into two loops. The flows at the junctions are as follows: junction A has an inflow of 100 L/s, junction B has an outflow of 25 L/s, \
    and junction D has an outflow of 75 L/s. The pipes, their resistances (K values), and initial guesses for flow (Q) are given as follows: Pipe AB connects junctions A and B, \
    has K = 1, and an initial assumed flow of 60 L/s directed from A to B. Pipe BC connects junctions B and C (vertical pipe), has K = 3, and an initial assumed flow of 15 L/s directed \
    from B to C. Pipe AC connects junctions A and C, has K = 2, and an initial assumed flow of 40 L/s directed from A to C. Pipe BD connects junctions B and D, has K = 2, and an initial \
    assumed flow of 20 L/s directed from B to D. Pipe CD connects junctions C and D, has K = 1, and an initial assumed flow of 55 L/s directed from C to D. Two loops are defined for the \
    Hardy Cross method. Loop 1 is the left loop formed by junctions A-B-C-A, with clockwise flow direction taken as positive. Loop 2 is the right loop formed by junctions B-D-C-B, \
    with clockwise flow direction also taken as positive. \
    TASK: Your task is to perform one iteration of the Hardy Cross method calculations for both loops, and report the corrected flows in each pipe after one iteration, clearly stating \
    the direction and magnitude of the flows. OUTPUT FORMAT: The output should be formatted as a JSON object, where each key is a pipe ID and each value is the corrected flow in L/s after \
    one iteration. The sign of the flow indicates the direction, where a positive value means the flow is in the same direction as initially assumed, while a negative value means the flow \
    has reversed direction relative to the initial assumption. Return only the JSON object without any other information. ")
print(response)
""" #'Create a DataFrame with one column and 60 rows containing the first 60 Fibonacci numbers. Return the file contents as base64 so I can download it.'

# Run the agent code in the sandbox
execution_logs, execution_result = run_code_raise_errors(sandbox, agent_code)
print("Logs:",execution_logs)
print("Results of execution:", execution_result)

# decoded = base64.b64decode(execution_logs).decode('utf-8')
# with open('fibonacci.csv', 'w', encoding='utf-8') as f:
#     f.write(decoded)

# lines = decoded.strip().splitlines()
# wb = openpyxl.Workbook()
# ws = wb.active
# for i, line in enumerate(lines, start=1):
#     ws.cell(row=i, column=1).value = line

# wb.save('fibonacci.xlsx')

In [None]:
import ast
import json

# SECURITY: the sandbox stdout is model-generated text, so it is parsed as data instead of being
# passed to eval() (which would execute arbitrary code). JSON is tried first, as requested in the
# prompt; a Python-literal dict (e.g. single-quoted keys) is accepted as a fallback.
_raw_output = execution_logs.strip()
try:
    sandbox_flows = json.loads(_raw_output)
except json.JSONDecodeError:
    sandbox_flows = ast.literal_eval(_raw_output)
sandbox_flows

Enable tool usage in [smolagents](https://colab.research.google.com/drive/1iSjTCsIggeevlQojsYdLmzTKZ7og7rva?usp=sharing) - need to ```pip install 'smolagents[litellm]'```

In [None]:
from smolagents import CodeAgent, LiteLLMModel #DuckDuckGoSearchTool, 

# LiteLLM-backed model using the OpenAI model name and key configured at the top of the notebook
model = LiteLLMModel(model_id=openai_model, api_key=openai_api_key)

# No custom tools are passed; only the base tools requested via add_base_tools=True are available
agent = CodeAgent(tools=[], model=model, add_base_tools=True) #add_base_tools adds by default python code interpreter to tool list

agent.run(
    "Could you give me the 118th number in the Fibonacci sequence?",
)

In [None]:
# Agent without base tools and without extra authorized imports, reusing `model` from the previous cell
agent = CodeAgent(tools=[], model=model) #only few imports by default like math. Other libraries should be passed as argument additional_authorized_imports=['requests', 'bs4']
agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?") #Might get different outputs every time we run this

In [None]:
# Switch to the small model with temperature=0 and a fixed seed to try to limit run-to-run variation
model = LiteLLMModel(model_id=openai_small_model, api_key=openai_api_key, temperature=0, seed=42) #Still not reproducible results
agent = CodeAgent(tools=[], model=model, max_steps=5)  # Steps set to 5 to avoid charging more
# Send the identical prompt five times so the outputs of the runs can be compared
for i in range(5):
    agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'? Use requests to fetch data from it") 
#Might get different outputs every time we run this

In [None]:
# Same prompt as above, but the agent is now explicitly allowed to import requests and bs4
agent = CodeAgent(tools=[], model=model, additional_authorized_imports=['requests', 'bs4']) #These authorize the use of these libraries
agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'? Use requests to fetch data from it") 

The example below works even without the Python interpreter tool!

In [None]:
# Uses whichever `agent` was created last (the requests/bs4-authorized CodeAgent if cells are run in order)
agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?")

PythonInterpreterTool is enabled by default since we set ```add_base_tools=True``` - below actually same as above

In [None]:
# print(agent.system_prompt_template) #To see what is the prompt template used
from smolagents import ToolCallingAgent, PythonInterpreterTool

# modified_prompt = "...."

# Tool-calling agent whose only tool is the Python interpreter; reuses `model` from an earlier cell
agent = ToolCallingAgent(tools=[PythonInterpreterTool()], model=model)#, system_prompt=modified_prompt)

agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?")

# NOTE(review): `agent.logs` may not exist in every smolagents release — confirm against the installed version
print(agent.logs) #To see full output
# agent.write_inner_memory_from_logs()

Create our custom tool within smolagents

In [None]:
# The decorator must come from smolagents so the resulting tool is a smolagents Tool
# that CodeAgent accepts (the `transformers` decorator builds a different Tool class).
from smolagents import tool

@tool
def do_sth_tool(task: str) -> str:
    """
    This is a template of a tool that does sth: it acknowledges the task it was given.

    Args:
        task: Description of the task the tool should carry out.

    Returns:
        A short confirmation message that mentions the task.
    """
    # Placeholder body: replace with the real logic. A string is returned so the
    # result matches the declared `-> str` return type instead of being None.
    return f"Handled task: {task}"

# Agent that can call the custom tool; `CodeAgent` and `model` come from earlier cells
agent = CodeAgent(tools=[do_sth_tool], model=model)

Import tool from HF space - ```pip install 'smolagents[gradio]'```

In [None]:
from smolagents import (
    load_tool,
    CodeAgent,
    # HfApiModel,
    GradioUI
)

# Import tool from Hub
# trust_remote_code=True lets the tool's code downloaded from the Hub run locally — only use with trusted repos
image_generation_tool = load_tool("m-ric/text-to-image", trust_remote_code=True)

# model = HfApiModel(model_id)

# Initialize the agent with the image generation tool
# (`model` is whichever model object an earlier cell created)
agent = CodeAgent(tools=[image_generation_tool], model=model)

# Start a Gradio UI wrapped around the agent
GradioUI(agent).launch()

Answer more complicated questions by enabling web search (cannot use ManagedAgent anymore) - ```pip install 'smolagents[litellm]'```

In [None]:
from smolagents import DuckDuckGoSearchTool 
from smolagents import CodeAgent, LiteLLMModel

# Rebinds `model` to the larger OpenAI model for the web-search agent
model = LiteLLMModel(model_id=openai_model, api_key=openai_api_key)
# Code agent whose only explicit tool is DuckDuckGo web search
web_agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=model)

# The value returned by run() is kept in `answer`
answer = web_agent.run("Which dynasty was ruling China at the time of the fall of Constantinople?.")

Call model from [providers supported in HF](https://huggingface.co/docs/inference-providers/index)

In [None]:
# from smolagents import InferenceClientModel

# model = InferenceClientModel(
#     model_id="deepseek-ai/DeepSeek-R1",
#     provider="together",
# )

OpenAI compatible server

In [None]:
# from smolagents import OpenAIServerModel

# model = OpenAIServerModel(
#     model_id="deepseek-ai/DeepSeek-R1",
#     api_base="https://api.together.xyz/v1/", # Leave this blank to query OpenAI servers.
#     api_key=os.environ["TOGETHER_API_KEY_DRACO"], # Switch to the API key for the server you're targeting.
# )

Local model

In [None]:
# from smolagents import TransformersModel

# model = TransformersModel(
#     model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
#     max_new_tokens=4096,
#     device_map="auto"
# )

[Google adk](https://github.com/google/adk-python) - Doesn't produce full answer

In [16]:
from dotenv import load_dotenv
import os
# Load environment variables from the "env" file in the current working directory
load_dotenv(dotenv_path=os.getcwd()+"/env")

# Copy the Gemini key to GOOGLE_API_KEY (presumably the variable google-adk reads — confirm).
# os.environ only accepts strings, so fail early with a clear message when the key is
# missing instead of surfacing an opaque TypeError from the assignment.
gemini_api_key = os.getenv("GEMINI_API_KEY")
if gemini_api_key is None:
    raise RuntimeError("GEMINI_API_KEY is not set; add it to the env file or the environment")
os.environ["GOOGLE_API_KEY"] = gemini_api_key

In [17]:
from google.adk.agents import Agent
from google.adk.tools import google_search, built_in_code_execution


# ADK agent configured with the built-in code execution tool only;
# google_search is imported but currently disabled (see the trailing comments).
root_agent = Agent(
    name="search_assistant",
    model="gemini-2.0-flash", # Or your preferred Gemini model
    instruction="You are a helpful assistant.",#, Answer user questions using Google Search when needed.",
    # description="An assistant that can search the web.",
    tools=[built_in_code_execution]#google_search]
)

In [18]:
from google.adk.sessions import InMemorySessionService
from google.adk.runners import Runner

# Identifiers used both to create the session and to address it when running the agent
APP_NAME="customer_support_agent"
USER_ID="user1234"
SESSION_ID="1234"

# Session and Runner
session_service = InMemorySessionService()
# NOTE(review): some google-adk releases may expose create_session as a coroutine — confirm for the installed version
session = session_service.create_session(app_name=APP_NAME, user_id=USER_ID, session_id=SESSION_ID)
# The runner ties `root_agent` (defined in the previous cell) to the session service
runner = Runner(agent=root_agent, app_name=APP_NAME, session_service=session_service)

In [None]:
from google.genai import types

# Agent Interaction
def call_agent(query):
    """Send `query` to the ADK runner and print the text of each final response event.

    Args:
        query: The user message to send to the agent as plain text.

    Returns:
        None. The response text is printed, prefixed with "Agent Response: ".
    """
    content = types.Content(role='user', parts=[types.Part(text=query)])
    events = runner.run(user_id=USER_ID, session_id=SESSION_ID, new_message=content)

    for event in events:
        if not event.is_final_response():
            continue
        # A final event may carry no content, and because the agent has a code-execution
        # tool the first part may not be a text part. Collect every text part instead of
        # reading parts[0] only, which could fail or drop part of the answer.
        parts = event.content.parts if event.content and event.content.parts else []
        final_response = "".join(part.text for part in parts if getattr(part, "text", None))
        print("Agent Response: ", final_response)

call_agent("Calculate the 12 fibonacci number?")

# Visual RAG - Only use with GPU/RunPod

[Visual RAG with smolagents](https://huggingface.co/blog/paultltc/deepsearch-using-visual-rag) - Didn't work

In [5]:
#!pip install accelerate wheel pdf2image
#!pip install flash-attn --no-build-isolation

#!apt update
#!apt install poppler-utils

In [24]:
from dotenv import load_dotenv
# Load environment variables from .env file
# (the path points at a file named "env" in the current working directory; `os` must already be imported)
load_dotenv(dotenv_path=os.getcwd()+"/env")

# Get the OpenAI API key
openai_api_key = os.getenv('OPENAI_API_KEY_DRACO')

In [None]:
from smolagents import load_tool
# flash_attn_varlen_func is not referenced in this cell; presumably the import only checks that flash-attn is installed — confirm
from flash_attn import flash_attn_varlen_func
# Load the visual RAG tool
# trust_remote_code=True runs the tool code downloaded from the Hub — only use with trusted repos
visual_rag_tool = load_tool(
    "vidore/visual-rag-tool",
    trust_remote_code=True,
    api_key=openai_api_key,
)

# Index the PDF document
# (`pdf_file` is a bare file name defined earlier, so it is resolved against the current working directory)
visual_rag_tool.index([pdf_file])

# Query the tool
visual_rag_tool("What is the final DQ equation?", k=3)

[Cohere Visual RAG](https://colab.research.google.com/drive/1JwZ_nWhBUFbrzJnHKmyd0qKJ3gVt5lCe?usp=sharing#scrollTo=tzTg5U8Okfjl) - Extract information from images ```pip install cohere```

In [14]:
# Changes the kernel's working directory, so the relative paths used below ("env", "img") resolve under /workspace
os.chdir("/workspace") #Use if in RunPod

In [None]:
import os
from dotenv import load_dotenv
from google import genai #pip install google-genai
import cohere
import numpy as np
import PIL
import io
import base64

# Load environment variables from .env file
# (the path points at a file named "env" in the current working directory)
load_dotenv(dotenv_path=os.getcwd()+"/env")
cohere_api_key = os.getenv('COHERE_API_KEY')
gemini_api_key = os.getenv("GEMINI_API_KEY")  #Replace with your Gemini API key

# NOTE(review): `client` is also the name of the OpenAI client created at the top of the notebook.
# After this cell runs, earlier helpers that call `client.chat.completions` will receive the Gemini
# client instead, so the OpenAI client must be re-created before re-running those cells.
client = genai.Client(api_key=gemini_api_key)
# Cohere client used for the image and query embeddings
co = cohere.ClientV2(api_key=cohere_api_key)

# Gemini model name used when generating answers
gemini_model="gemini-2.5-flash-preview-04-17"

In [None]:
# from IPython.display import HTML, display

# def set_css():
#   display(HTML('''
#   <style>
#     pre {
#         white-space: pre-wrap;
#     }
#   </style>
#   '''))
# get_ipython().events.register('pre_run_cell', set_css)

In [16]:
#Some helper functions to resize images and to convert them to base64 format
max_pixels = 1568*1568  #Max resolution for images

def resize_image(pil_image):
    """Shrink `pil_image` in place when its pixel count exceeds `max_pixels`.

    Args:
        pil_image: Object exposing a `(width, height)` `size` attribute and a
            `thumbnail(size)` method (a PIL image in this notebook).

    Returns:
        None. Images at or below the pixel budget are left untouched.
    """
    org_width, org_height = pil_image.size
    pixel_count = org_width * org_height

    # Nothing to do when the image already fits within the pixel budget
    if pixel_count <= max_pixels:
        return

    # Scale both sides by the same factor so the resulting area is about max_pixels
    scale_factor = (max_pixels / pixel_count) ** 0.5
    target_size = (int(org_width * scale_factor), int(org_height * scale_factor))
    pil_image.thumbnail(target_size)

# Convert images to a base64 string before sending it to the API
def base64_from_image(img_path):
    """Load an image, downscale it if needed and return it as a base64 data URI.

    Args:
        img_path: Path of the image file to read.

    Returns:
        A string of the form "data:image/<format>;base64,<payload>", where <format> is
        the lower-cased format reported by PIL ("png" when PIL reports none).
    """
    # A bare `import PIL` does not load the Image submodule; import it explicitly so
    # this function does not depend on another library having imported it first.
    import PIL.Image

    # The context manager closes the underlying file handle once the image is encoded
    with PIL.Image.open(img_path) as pil_image:
        img_format = pil_image.format if pil_image.format else "PNG"

        # In-place downscale of images above the module-level pixel budget
        resize_image(pil_image)

        with io.BytesIO() as img_buffer:
            pil_image.save(img_buffer, format=img_format)
            encoded = base64.b64encode(img_buffer.getvalue()).decode("utf-8")

    return f"data:image/{img_format.lower()};base64," + encoded

In [None]:
# images = { #If we want to download images from the web
#     "tesla.png": "https://substackcdn.com/image/fetch/w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fbef936e6-3efa-43b3-88d7-7ec620cdb33b_2744x1539.png",
#     "netflix.png": "https://substackcdn.com/image/fetch/w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F23bd84c9-5b62-4526-b467-3088e27e4193_2744x1539.png",
#     "nike.png": "https://substackcdn.com/image/fetch/w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa5cd33ba-ae1a-42a8-a254-d85e690d9870_2741x1541.png",
#     "google.png": "https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F395dd3b9-b38e-4d1f-91bc-d37b642ee920_2741x1541.png",
#     "accenture.png": "https://substackcdn.com/image/fetch/w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F08b2227c-7dc8-49f7-b3c5-13cab5443ba6_2741x1541.png",
#     "tecent.png": "https://substackcdn.com/image/fetch/w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0ec8448c-c4d1-4aab-a8e9-2ddebe0c95fd_2741x1541.png"
# }

In [None]:
# Compute an embedding for each image found in the image folder
# (to work from web images instead, first download each URL of an `images` dict into img_folder)
img_folder = "img"
image_extensions = (".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tif", ".tiff")

img_paths = []
doc_embeddings = []
# sorted() makes the index order deterministic; os.listdir returns entries in arbitrary order
for img_name in sorted(os.listdir(img_folder)):
    img_path = os.path.join(img_folder, img_name)

    # Skip sub-directories and non-image files (e.g. .DS_Store) that cannot be opened as images
    if not os.path.isfile(img_path) or not img_name.lower().endswith(image_extensions):
        continue

    # Get the base64 representation of the image
    api_input_document = {
        "content": [
            {"type": "image", "image": base64_from_image(img_path)},
        ]
    }

    # Call the Embed v4.0 model with the image information
    api_response = co.embed(
        model="embed-v4.0",
        input_type="search_document",
        embedding_types=["float"],
        inputs=[api_input_document],
    )

    # Record path and embedding together, only after the embedding succeeded,
    # so that img_paths[i] always corresponds to doc_embeddings[i]
    emb = np.asarray(api_response.embeddings.float[0])
    img_paths.append(img_path)
    doc_embeddings.append(emb)

# np.vstack fails with an unhelpful message on an empty list; report the real cause instead
if not doc_embeddings:
    raise ValueError(f"No image files found in folder '{img_folder}'")

doc_embeddings = np.vstack(doc_embeddings)
print("\n\nEmbeddings shape:", doc_embeddings.shape)

In [18]:
# Search allows us to find relevant images for a given question using Cohere Embed v4
def search(question, max_img_size=800):
    """Find, display and return the path of the indexed image that best matches `question`.

    Args:
        question: Natural-language query to embed.
        max_img_size: Maximum width/height in pixels of the preview that is displayed.

    Returns:
        The entry of `img_paths` with the highest score against the query embedding.
    """
    # Embed the query with the same model that was used for the documents
    query_response = co.embed(
        model="embed-v4.0",
        input_type="search_query",
        embedding_types=["float"],
        texts=[question],
    )
    query_emb = np.asarray(query_response.embeddings.float[0])

    # Dot product of the query with every document embedding
    # (equals cosine similarity only if the embeddings are unit-normalized — TODO confirm)
    cos_sim_scores = np.dot(query_emb, doc_embeddings.T)
    best_idx = cos_sim_scores.argmax()
    hit_img_path = img_paths[best_idx]

    # Report the question and the best hit
    print("Question:", question)
    print("Most relevant image:", hit_img_path)

    # Show a preview bounded by max_img_size on both sides
    preview = PIL.Image.open(hit_img_path)
    preview.thumbnail((max_img_size, max_img_size))
    display(preview)
    return hit_img_path

# Answer the question based on the information from the image
# Here we use Gemini 2.5 as powerful Vision-LLM
def answer(question, img_path):
    """Ask the Gemini model to answer `question` from the image at `img_path` and print the reply.

    Args:
        question: The question to answer.
        img_path: Path of the image that should contain the answer.

    Returns:
        None. The model's text is printed, prefixed with "LLM Answer:".
    """
    instructions = f"""Answer the question based on the following image.
Don't use markdown.
Please provide enough context for your answer.

Question: {question}"""
    page_image = PIL.Image.open(img_path)

    # Text instructions and the image are sent together as one multimodal request
    response = client.models.generate_content(
        model=gemini_model,
        contents=[instructions, page_image]
    )

    print("LLM Answer:", response.text)

In [None]:
# Define the query
question = "What is the Dq final equation?" #"What is the counter-clockwise head loss in a pipe?"

# Search for the most relevant image
# (search() also prints the question and displays a preview of the hit)
top_image_path = search(question)

# Use the image to answer the query
# (answer() prints the model's reply and returns nothing)
answer(question, top_image_path)

Colpali and Gemini Flash [(same as above)](https://colab.research.google.com/drive/1JwZ_nWhBUFbrzJnHKmyd0qKJ3gVt5lCe?usp=sharing#scrollTo=blAg0HiXA3ae). Original notebook in [here](https://colab.research.google.com/github/merveenoyan/smol-vision/blob/main/ColPali_%2B_Qwen2_VL.ipynb).

<!-- [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/merveenoyan/smol-vision/blob/main/ColPali_%2B_Qwen2_VL.ipynb) -->


[ColPali](https://huggingface.co/blog/manu/colpali) is a multimodal retriever that natively handles images and processes and encodes image patches to be compatible with text, thus removing need to do OCR, or image captioning.

![ColPali](https://cdn-uploads.huggingface.co/production/uploads/60f2e021adf471cbdf8bb660/La8vRJ_dtobqs6WQGKTzB.png)

[Byaldi](https://github.com/AnswerDotAI/byaldi) is a new library by answer.ai to easily use ColPali.

Install with the following (deactivate autoawq and transformers from initial installation and add pdf file and img folder with pdf images):
```bash
pip install --upgrade byaldi

apt-get update

apt-get install -y poppler-utils  # not working in macOS

pip install -q pdf2image git+https://github.com/huggingface/transformers.git qwen-vl-utils  # needed to get Qwen - 

pip uninstall flash-attn -y

pip install ninja

pip install packaging

git clone https://github.com/Dao-AILab/flash-attention

cd flash-attention

pip install .

pip uninstall torch torchvision -y

pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

pip install ipywidgets
```

We should initialize a `RAGMultiModalModel` object with a ColPali model from Hugging Face. By default this model is loaded on the GPU; since Qwen2-VL will share the same GPU, ColPali can be loaded on the CPU instead by passing `device="cpu"` (the cell below uses the default device).

In [1]:
import os
import base64
import io
# Work from /workspace so that relative paths such as the PDF file name resolve there
os.chdir("/workspace")

In [2]:
# PDF to index, given relative to the current working directory
pdf_file="HardyCross.pdf"

In [None]:
from byaldi import RAGMultiModalModel

# Load the ColPali retriever on the default device
RAG = RAGMultiModalModel.from_pretrained("vidore/colpali-v1.2", verbose=1) #First time takes ~70secs to download
#can also use device="cpu"

RAG.index( #index our document using RAG
    input_path=pdf_file,#"./img/", #or pdf path
    index_name="attention", # index will be saved at index_root/index_name/
    # presumably keeps the base64 page images inside the index so search results can return them — confirm
    store_collection_with_index=True,
    # replace any existing index with the same name
    overwrite=True
)

In [None]:
from PIL import Image

# Both `question` and `text_query` are bound to the same string; only `question` is used in the cells below
question = text_query="What is the Dq final equation?" # "What is the counter-clockwise head loss in a pipe?"

results = RAG.search(question, k=1) #retrieve top 1 image
# Bare expression so the notebook displays the search results
results

In [5]:
# Decode the base64 payload of the top search hit into raw image bytes
image_bytes = base64.b64decode(results[0].base64)

In [None]:
def show_results(question):
    """Retrieve the top page for `question` from the ColPali index and display it.

    Args:
        question: Query text passed to `RAG.search`.

    Returns:
        None. The retrieved page image is shown, limited to 800x800 pixels.
    """
    top_hit = RAG.search(question, k=1)[0]

    # The hit carries the page as a base64 string; decode it and load it with Pillow
    page_image = Image.open(io.BytesIO(base64.b64decode(top_hit.base64)))

    # Bound the preview size before displaying it
    page_image.thumbnail((800, 800))
    display(page_image)

show_results(question)

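After indexing and retrieving data, we will use a Qwen2-VL model to build a RAG pipeline. The code below loads the smaller `Qwen/Qwen2-VL-2B-Instruct` checkpoint; [Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) is the larger variant.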


In [7]:
# Hugging Face checkpoint used for answering; this is the 2B instruct variant of Qwen2-VL
model_name="Qwen/Qwen2-VL-2B-Instruct"

In [None]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

# Load the vision-language model in bfloat16, move it to the GPU and switch it to eval mode.
# NOTE: this rebinds `model`, which earlier cells used for the smolagents LLM wrapper.
model = Qwen2VLForConditionalGeneration.from_pretrained(model_name,
                                                        trust_remote_code=True, torch_dtype=torch.bfloat16).cuda().eval() #first time takes ~45 secs to download

In [9]:
from pdf2image import convert_from_path
# Render the PDF pages to images (presumably one PIL image per page — confirm).
# NOTE(review): the absolute path duplicates `pdf_file`, which names the same file relative to /workspace.
images = convert_from_path("/workspace/HardyCross.pdf")

In [None]:
# Processor that matches the loaded checkpoint
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

# Convert the hit's page number into a list index (assumes page_num is 1-based — TODO confirm)
image_index = results[0]["page_num"] - 1

# Single user turn containing the retrieved page image followed by the question text
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": images[image_index],
            },
            {"type": "text", "text": question},
        ],
    }
]

In [11]:
# Render the conversation as a prompt string (tokenize=False) that ends with the generation prompt
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

In [12]:
# Extract the image/video inputs referenced by the messages
image_inputs, video_inputs = process_vision_info(messages)
# Combine the prompt text and the visual inputs into a padded batch of PyTorch tensors
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Move the batch to the GPU, where the model was placed with .cuda()
inputs = inputs.to("cuda")

In [None]:
# Generation is capped at 50 new tokens, so longer answers will be cut off
generated_ids = model.generate(**inputs, max_new_tokens=50)
# Drop the first len(in_ids) tokens of each output sequence so only the tokens after the prompt remain
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Decode the remaining token ids to text, skipping special tokens
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

print(output_text)