In [88]:
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor


In [89]:
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Loading checkpoint shards: 100%|██████████| 5/5 [00:01<00:00,  4.99it/s]


In [90]:
default_prompt = """
You are a proficient OCR (Optical Character Recognition) system. Your task is to accurately extract text from provided images or text-based inputs exactly as it appears, without adding, modifying, or interpreting information. Follow these rules:

- Strict Accuracy: Reproduce text verbatim, including typos, spacing, line breaks, and punctuation.

- No Additions: Do not correct errors, fill in gaps, or infer context.

- Handle Ambiguity: If entire words are unparseable, replace them with a single _ (e.g., ▒▒▓▓▓ → _).

- Languages/Symbols: Support all languages and special characters (e.g., @, #, é, 汉字).

- Language: The image is written in French, use this as the base to guess words if necessary.

## Output Format:

Return only the extracted text, without explanations inside the <transcribed></transcribed> tag

---

Begin processing:
"""

In [91]:
few_shot_image_1 = Image.open("./images/fullpage-french-1.png")
few_shot_image_2 = Image.open("./images/sample_1.png")

alternative_default = [
    { "role": "system", "content": [
        {"type": "text", "text": default_prompt}
    ]},
    { "role": "user", "content": [
        {"type": "image" }
    ]},
    {
    "role": "assistant",
    "content": [
        {
            "type": "text", "text": "<transcribed>J'étudie français.</transcribed>"}
    ]},
    { "role": "user", "content": [
        {"type": "image" },
    ]},
    {
    "role": "assistant",
    "content": [
        {
            "type": "text", "text": "<transcribed>Scott Joplin, né le 24 novembre 1868 au Texas, est un pianiste et compositeur afro-américain. Bien qu'ayant écrit des œuvres dans plusieurs styles, y compris le classique et l'opéra, sa notoriété tient principalement à ses compositions de musique ragtime. Sa musicalité, son talent et son importance dans l’histoire du ragtime et de la musique américaine sont exceptionnels.</transcribed>"}
       ],
    }
]

In [92]:
path = "./images/fullpage-french-2.png"
image = Image.open(path)

if image: 
    print("Image loaded", path)

messages = alternative_default + [
    {"role": "user", "content": [
        {"type": "image" }
    ]}
]

Image loaded ./images/fullpage-french-2.png


In [93]:
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(
    [
        few_shot_image_2,
        few_shot_image_1, 
        image,
    ],
    input_text,
    add_special_tokens=False,
    return_tensors="pt"
).to(model.device)

output = model.generate(**inputs, max_new_tokens=2000)
for i in output:
    print(processor.decode(i))

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 12 Feb 2025

[{'type': 'text', 'text': '\nYou are a proficient OCR (Optical Character Recognition) system. Your task is to accurately extract text from provided images or text-based inputs exactly as it appears, without adding, modifying, or interpreting information. Follow these rules:\n\n- Strict Accuracy: Reproduce text verbatim, including typos, spacing, line breaks, and punctuation.\n\n- No Additions: Do not correct errors, fill in gaps, or infer context.\n\n- Handle Ambiguity: If entire words are unparseable, replace them with a single _ (e.g., ▒▒▓▓▓ → _).\n\n- Languages/Symbols: Support all languages and special characters (e.g., @, #, é, 汉字).\n\n- Language: The image is written in French, use this as the base to guess words if necessary.\n\n## Output Format:\n\nReturn only the extracted text, without explanations inside the <transcribed></transcribed> tag\n\n---\n\nBeg