In [None]:
# from IPython.display import clear_output
# !pip install transformers datasets outlines pillow accelerate qwen-vl-utils
# clear_output()

In [1]:
import transformers
print(f"Transformers version: {transformers.__version__}")

  from .autonotebook import tqdm as notebook_tqdm


Transformers version: 4.52.3


In [2]:
import outlines
import os
# NOTE(review): `transformers` was already imported in the previous cell. Hugging Face
# libraries may resolve HF_HOME at import time, in which case this assignment is too
# late to redirect the cache — confirm, and consider setting it before any HF import.
os.environ['HF_HOME'] = '/workspace/huggingface/'

# Outlines vs Usual Transformer

In [3]:
import torch
from transformers import (
    AutoProcessor,
    Qwen2_5_VLForConditionalGeneration
)

In [4]:
# Checkpoint to evaluate; uncomment exactly one MODEL_ID line.
# MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
# MODEL_ID = "yfan1997/GRIT-20-Qwen2.5-VL-3B"

# Run on GPU when one is visible, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16  # passed as `torch_dtype` when loading the plain transformers model

# Classes whose `from_pretrained` is used by the loader functions below.
MODEL_CLASS = Qwen2_5_VLForConditionalGeneration
PROCESSOR_CLASS = AutoProcessor

# Pixel bounds forwarded to the processor as `min_pixels` / `max_pixels`.
MIN_PIXELS = 20*20
MAX_PIXELS = 1024*1024

In [8]:
def get_outlines_model_processor(path, MODEL_CLASS, PROCESSOR_CLASS):
    """Load a checkpoint and wrap it with `outlines` for structured generation.

    Args:
        path: Model id or local path forwarded to both `from_pretrained` calls.
        MODEL_CLASS: Class exposing `from_pretrained` for the model weights.
        PROCESSOR_CLASS: Class exposing `from_pretrained` for the processor.

    Returns:
        Tuple `(model, processor)`: `model` is the object returned by
        `outlines.from_transformers`, `processor` is its `.processor` attribute.
    """
    model_kwargs = {
        "torch_dtype": torch.bfloat16,
        "attn_implementation": "eager",
        "device_map": "auto",
    }
    processor_kwargs = {
        "trust_remote_code": True,
        "use_fast": True,
    }

    # Load from `path`; the previous version read the global MODEL_ID here and
    # silently ignored the argument.
    hf_model = MODEL_CLASS.from_pretrained(path, **model_kwargs)
    hf_processor = PROCESSOR_CLASS.from_pretrained(path, **processor_kwargs)

    model = outlines.from_transformers(hf_model, hf_processor)
    return model, model.processor

In [9]:
def get_hf_model_processor(path, MODEL_CLASS, PROCESSOR_CLASS):
    """Load a plain transformers model (in eval mode) and its left-padding processor.

    Args:
        path: Model id or local path forwarded to both `from_pretrained` calls.
        MODEL_CLASS: Class exposing `from_pretrained` for the model weights.
        PROCESSOR_CLASS: Class exposing `from_pretrained` for the processor.

    Returns:
        Tuple `(model, processor)`.
    """
    model_options = dict(
        torch_dtype=DTYPE,
        attn_implementation="eager",
        device_map=DEVICE,
        low_cpu_mem_usage=True,
    )
    processor_options = dict(
        trust_remote_code=True,
        padding_side='left',
        use_fast=True,
        min_pixels=MIN_PIXELS,
        max_pixels=MAX_PIXELS,
    )

    hf_model = MODEL_CLASS.from_pretrained(path, **model_options)
    hf_model = hf_model.eval()

    hf_processor = PROCESSOR_CLASS.from_pretrained(path, **processor_options)
    # Set the tokenizer's padding side explicitly as well as via the kwarg above.
    hf_processor.tokenizer.padding_side = "left"

    return hf_model, hf_processor

In [10]:
## USE OUTLINES LIBRARY FOR STRUCTURED OUTPUT
# model, processor = get_outlines_model_processor(MODEL_ID, MODEL_CLASS, PROCESSOR_CLASS)

## LOAD FOR REGULAR TRANSFORMER
model, processor = get_hf_model_processor(MODEL_ID, MODEL_CLASS, PROCESSOR_CLASS)

Loading checkpoint shards: 100%|██████████| 5/5 [00:10<00:00,  2.17s/it]
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


# Define the Structured Output Schema

In [None]:
from pydantic import BaseModel, Field

class ObjectCount(BaseModel):
    """Structured-output schema holding a single integer object count."""
    count: int = Field(..., description="Number of objects in the image")

# NOTE(review): `model` is whatever the loading cell assigned. The active line there
# calls get_hf_model_processor (a plain transformers model), not the outlines-wrapped
# model from get_outlines_model_processor — confirm outlines.Generator accepts it.
object_count_generator = outlines.Generator(model, ObjectCount)

# Inference (with `outlines`)

In [11]:
from PIL import Image
from qwen_vl_utils import process_vision_info

In [None]:
prompt = "How many red circles in the image?"
image_path = "../files/img1.png"
image = Image.open(image_path).convert("RGB")

# Single-turn chat: one image followed by the question text.
messages = [
    {"role": "user", "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": prompt}
    ]}
]

image_inputs, _ = process_vision_info(messages)
# NOTE(review): with add_generation_prompt=False the rendered prompt ends after the
# user turn (no assistant header). The decoded generations in this notebook start with
# a stray " addCriterion" token — possibly related; confirm before changing the flag.
input_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)

# Move the tensors to DEVICE (the device the model was loaded on) rather than a
# hard-coded "cuda", so the cell also runs on CPU-only machines.
inputs = processor(
    text=[input_text],
    images=image_inputs,
    return_tensors="pt",
    padding=True
).to(DEVICE)

In [13]:
print(input_text)

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>How many red circles in the image?<|im_end|>



# Inference (with `transformers`)

In [14]:
MAX_NEW_TOKENS = 50
TEMP = 0.0
DO_SAMPLE = False
OUT_ATTN = True
RETURN_DICT = True

generation_kwargs = {
    "max_new_tokens": MAX_NEW_TOKENS,
    "do_sample": DO_SAMPLE,
    "output_attentions": OUT_ATTN,
    "return_dict_in_generate": RETURN_DICT,
}
# `temperature` is a sampling-only flag: passing it with do_sample=False makes
# transformers warn that it "may be ignored", so forward it only when sampling.
if DO_SAMPLE:
    generation_kwargs["temperature"] = TEMP

outputs = model.generate(**inputs, **generation_kwargs)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [15]:
# Decode the first generated sequence (prompt tokens included) via the tokenizer.
decoded_output = processor.tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)

# Same result through the processor's batch API: decode every sequence, then pick one.
decoded_outputs = processor.batch_decode(outputs.sequences, skip_special_tokens=True)
decoded_output = decoded_outputs[0] # first item in the batch; overwrites the value above

In [16]:
decoded_output

'system\nYou are a helpful assistant.\nuser\nHow many red circles in the image?\n addCriterion\nThe image contains 12 red circles.'

To strip the prompt tokens and keep only the newly generated text:

In [37]:
# The generated sequence starts with the prompt tokens; slice them off before decoding.
input_length = len(inputs['input_ids'][0])
generated_ids = outputs.sequences[0][input_length:]
decoded_output = processor.tokenizer.decode(generated_ids, skip_special_tokens=True)
decoded_output

' addCriterion\nThe image contains 12 red circles.'

## Use `outlines` library

In [18]:
# Run the schema-constrained generator on the same prompt text, re-reading the image from disk.
res = object_count_generator({
    "text": input_text,
    "images": Image.open(image_path).convert("RGB")
})

In [25]:
import ast
import json

# `res` is a JSON document (see the printed value), so parse it with the JSON parser
# rather than as a Python literal: JSON-only tokens such as true/false/null would make
# ast.literal_eval fail.
print(res)
print(json.loads(res)['count'])

{"count": 11131 }
11131


# Prepare Eval Dataset

In [7]:
from datasets import load_dataset

ds = load_dataset("patrickamadeus/multitask-diagnostic-suite-vlm")

In [79]:
from qwen_vl_utils import process_vision_info

def input_ids_splitter(processor, text_query, image_inputs, video_inputs=None):
    """Tokenize a chat prompt and classify its token positions.

    The prompt is run through `processor`, moved to DEVICE, and the first row of
    `input_ids` is split into index lists for the system turn, the image
    placeholder tokens, the text of the first non-system turn, and special tokens.

    Args:
        processor: Callable processor exposing a `.tokenizer`.
        text_query: Fully rendered prompt string (chat template already applied).
        image_inputs: Value forwarded to the processor as `images`.
        video_inputs: Optional value forwarded as `videos` when not None.

    Returns:
        dict with the processor output (`inputs`), the flat `input_ids` list, and
        for each category (`system`, `image`/`vision`, `text`, `special`) the
        positions, token ids and decoded text. `vision_start_pos` is the index just
        after `<|vision_start|>` and `vision_end_pos` the index of `<|vision_end|>`;
        both are None when the prompt contains no such marker.

    Notes:
        - Only the first system turn and the first non-system turn are inspected.
        - Role-header tokens (e.g. "system\\n") are part of the returned ranges.
        - `<|vision_end|>` is excluded from `text_indices`, like the other vision
          markers (it previously leaked into the text span).
    """
    processor_kwargs = {
        "text": [text_query],
        "images": image_inputs,
        "padding": True,
        "return_tensors": "pt",
    }
    if video_inputs is not None:
        processor_kwargs["videos"] = video_inputs

    inputs = processor(**processor_kwargs).to(DEVICE)
    input_ids = inputs['input_ids'][0].tolist()
    tokenizer = processor.tokenizer

    im_start_token_id = tokenizer.convert_tokens_to_ids('<|im_start|>')
    im_end_token_id = tokenizer.convert_tokens_to_ids('<|im_end|>')
    vision_start_token_id = tokenizer.convert_tokens_to_ids('<|vision_start|>')
    vision_end_token_id = tokenizer.convert_tokens_to_ids('<|vision_end|>')
    image_pad_token_id = tokenizer.convert_tokens_to_ids('<|image_pad|>')

    # Text-only prompts have no vision markers; report None instead of raising.
    start_pos = (
        input_ids.index(vision_start_token_id) + 1
        if vision_start_token_id in input_ids else None
    )
    end_pos = (
        input_ids.index(vision_end_token_id)
        if vision_end_token_id in input_ids else None
    )

    im_start_positions = [i for i, tid in enumerate(input_ids) if tid == im_start_token_id]
    im_end_positions = [i for i, tid in enumerate(input_ids) if tid == im_end_token_id]

    def is_system_turn(start_idx):
        # Decode a short window after <|im_start|> and check the role header.
        lookahead = input_ids[start_idx+1:start_idx+8]
        lookahead_text = tokenizer.decode(lookahead, skip_special_tokens=False)
        return lookahead_text.startswith('system\n')

    def turn_end(start_idx):
        # Position of the first <|im_end|> after this turn's <|im_start|>, if any.
        return next((e for e in im_end_positions if e > start_idx), None)

    # First system turn: everything between its <|im_start|> and <|im_end|>.
    system_indices = []
    for start_idx in im_start_positions:
        if is_system_turn(start_idx):
            end_idx = turn_end(start_idx)
            if end_idx is not None:
                system_indices = list(range(start_idx+1, end_idx))
            break

    # First non-system turn: its tokens minus the vision markers / image placeholders.
    vision_marker_ids = {vision_start_token_id, vision_end_token_id, image_pad_token_id}
    text_indices = []
    for start_idx in im_start_positions:
        if not is_system_turn(start_idx):
            end_idx = turn_end(start_idx)
            if end_idx is not None:
                text_indices = [
                    i for i in range(start_idx+1, end_idx)
                    if input_ids[i] not in vision_marker_ids
                ]
            break

    # Convert all ids in one call instead of once per token.
    token_strings = tokenizer.convert_ids_to_tokens(input_ids)
    special_token_indices = []
    image_indices = []
    for i, (token_id, token) in enumerate(zip(input_ids, token_strings)):
        if token.startswith('<|') and token.endswith('|>') and token != '<|image_pad|>':
            special_token_indices.append(i)
        elif token_id == image_pad_token_id:
            image_indices.append(i)

    system_tokens = [input_ids[i] for i in system_indices]
    vision_tokens = [input_ids[i] for i in image_indices]
    text_tokens = [input_ids[i] for i in text_indices]
    special_tokens = [input_ids[i] for i in special_token_indices]

    return {
        "inputs": inputs,
        "input_ids": input_ids,
        "system_indices": system_indices,
        "image_indices": image_indices,
        "text_indices": text_indices,
        "special_token_indices": special_token_indices,
        "system_tokens": system_tokens,
        "vision_tokens": vision_tokens,
        "text_tokens": text_tokens,
        "special_tokens": special_tokens,
        "system_text": tokenizer.decode(system_tokens, skip_special_tokens=False),
        "vision_text": tokenizer.decode(vision_tokens, skip_special_tokens=False),
        "text_text": tokenizer.decode(text_tokens, skip_special_tokens=False),
        "special_text": tokenizer.decode(special_tokens, skip_special_tokens=False),
        "vision_start_pos": start_pos,
        "vision_end_pos": end_pos,
    }

def extract_prompt_chunks(prompt: str):
    """Split a rendered chat prompt into its system, user-text and vision chunks.

    Args:
        prompt: Prompt string using `<|im_start|>` / `<|im_end|>` turn markers and
            `<|vision_start|>` / `<|vision_end|>` around image placeholders.

    Returns:
        dict with keys:
            "system": the full system turn including its markers, or None.
            "user": the user turn's text with markers and the vision chunk removed
                and surrounding whitespace stripped, or None when there is no user turn.
            "vision": the first `<|vision_start|>...<|vision_end|>` span found
                inside the user turn, or None.

    Only the first system turn, first user turn and first vision span are
    considered. A user turn without any vision span still yields its text
    (previously "user" was None in that case).
    """
    im_start = "<|im_start|>"
    im_end = "<|im_end|>"
    vision_start = "<|vision_start|>"
    vision_end = "<|vision_end|>"
    user_header = f"{im_start}user\n"

    def find_turn(header):
        # Return the first turn opened by `header`, markers included, or None.
        start = prompt.find(header)
        if start == -1:
            return None
        end = prompt.find(im_end, start)
        if end == -1:
            return None
        return prompt[start:end + len(im_end)]

    system_chunk = find_turn(f"{im_start}system\n")
    user_chunk = find_turn(user_header)

    vision_chunk = None
    user_text_chunk = None
    if user_chunk is not None:
        remainder = user_chunk
        v_start = user_chunk.find(vision_start)
        v_end = user_chunk.find(vision_end, v_start) if v_start != -1 else -1
        if v_start != -1 and v_end != -1:
            v_stop = v_end + len(vision_end)
            vision_chunk = user_chunk[v_start:v_stop]
            # Cut out exactly the span returned as the vision chunk.
            remainder = user_chunk[:v_start] + user_chunk[v_stop:]

        # Peel the turn markers off the remaining text.
        remainder = remainder.strip()
        if remainder.startswith(user_header):
            remainder = remainder[len(user_header):]
        if remainder.endswith(im_end):
            remainder = remainder[:-len(im_end)]
        user_text_chunk = remainder.strip()

    return {
        "system": system_chunk,
        "user": user_text_chunk,
        "vision": vision_chunk
    }

def reorder_prompt_chunks(prompt: str, order: list):
    """Rebuild `prompt` with its system / user / vision chunks in the given order.

    Args:
        prompt: Rendered chat prompt, split via `extract_prompt_chunks`.
        order: Sequence of chunk names ("system", "user", "vision") giving the
            output order; names whose chunk is missing or empty are skipped.

    Returns:
        The concatenated chunks, each terminated by a newline. The user text is
        re-wrapped in its `<|im_start|>user ... <|im_end|>` markers.
    """
    extracted = extract_prompt_chunks(prompt)
    pieces = {name: extracted.get(name, "") for name in ("system", "user", "vision")}

    user_text = pieces["user"]
    if user_text:
        pieces["user"] = f"<|im_start|>user\n{user_text}<|im_end|>"

    segments = []
    for name in order:
        piece = pieces[name]
        if not piece:
            continue
        # Every emitted chunk ends with exactly the newline it already had, or one added here.
        segments.append(piece if piece.endswith("\n") else piece + "\n")
    return "".join(segments)


def build_prompt(question, image, processor, ordering, add_generation_prompt=False):
    """Render, reorder and tokenize a single image+question prompt.

    Args:
        question: Question text placed in the user turn.
        image: Image object placed in the user turn before the text.
        processor: Processor providing `apply_chat_template` and tokenization.
        ordering: Chunk order passed to `reorder_prompt_chunks`
            (names among "system", "user", "vision").
        add_generation_prompt: When True, append the `<|im_start|>assistant\\n`
            header after reordering. Reordering keeps only the system, user and
            vision chunks, so the header cannot come from the chat template.
            Defaults to False, which reproduces the previous behaviour.

    Returns:
        Tuple `(tokenized_prompt, prompt)`: the processor output moved to DEVICE
        and the reordered prompt string.
    """
    messages = [
        {"role": "user", "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": question}
        ]}
    ]

    image_inputs, _ = process_vision_info(messages)
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    prompt = reorder_prompt_chunks(prompt, ordering)
    if add_generation_prompt:
        prompt += "<|im_start|>assistant\n"

    # Use DEVICE (where the model was loaded) instead of a hard-coded "cuda".
    tokenized_prompt = processor(
        text=[prompt],
        images=image_inputs,
        return_tensors="pt",
        padding=True
    ).to(DEVICE)

    return tokenized_prompt, prompt

def extract_number(text):
    """Return the first run of decimal digits in `text` as an int.

    Args:
        text: String to search.

    Returns:
        The first digit run converted with `int`.

    Raises:
        ValueError: If `text` contains no digits.
    """
    # Local import: in this notebook `re` is only imported by a later cell.
    import re

    match = re.search(r'\d+', text)
    if match is None:
        raise ValueError(f"no integer found in {text!r}")
    return int(match.group())

In [80]:
# Convert the train split to pandas and keep only the rows whose task is 'object counting'.
df = ds['train'].to_pandas()
df = df.loc[(df.task == 'object counting')]

In [81]:
from io import BytesIO  # needed below; otherwise only imported by a later cell

from PIL import Image

# Decode the first row's raw image bytes and build its question with a "Count:" suffix.
img = df.iloc[0]["image"]
img = Image.open(BytesIO(img['bytes'])).convert("RGB")
q = df.iloc[0]["question"] + " Count:"

In [82]:
q

'How many ants are in the image? Count:'

In [91]:
# Build the tokenized inputs and prompt string with chunk order system, vision, user.
inputs,prompt = build_prompt(q, img, processor, ['system','vision','user'])

In [92]:
print(prompt)

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
How many ants are in the image? Count:<|im_end|><|vision_start|><|image_pad|><|vision_end|><|im_end|>



In [66]:
# Greedy decoding. `temperature` is not passed: with do_sample=False transformers
# reports it as a generation flag that "may be ignored".
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=False,
    output_attentions=False,
    return_dict_in_generate=True,
)
# Drop the prompt tokens so only the newly generated text is decoded and parsed.
input_length = len(inputs['input_ids'][0])
res_text = processor.tokenizer.decode(outputs.sequences[0][input_length:], skip_special_tokens=True)
res = safe_int_parse(res_text)
res

10

In [67]:
res_text

' addCriterion\nThe image contains 10 ants.'

# Final Evaluation loop (toggle `USE_OUTLINES`)

In [None]:
import json
from tqdm import tqdm
import re

# True: use the outlines structured generator; False: plain `model.generate`.
USE_OUTLINES = False
# JSONL results file; the evaluation loop opens it in append mode.
OUTPUT_PATH = "../files/results/output_trf_7b_455.jsonl"
# All six orderings of the three prompt chunks; the evaluation loop runs each in turn.
PROMPT_SET_ORDERING = [
    ['system', 'user', 'vision'],
    ['system', 'vision', 'user'],
    ['user', 'system', 'vision'],
    ['user', 'vision', 'system'],
    ['vision', 'system', 'user'],
    ['vision', 'user', 'system'],
]

In [70]:
from tqdm import tqdm
import json, ast
from PIL import Image
from qwen_vl_utils import process_vision_info

def safe_int_parse(x):
    """Best-effort conversion of a model answer to an int.

    Tries, in order: a dict literal with a 'count' entry, a bare integer string,
    and the first digit run found by `extract_number`. Any exception from a
    strategy moves on to the next one.

    Returns:
        The parsed int, or None when every strategy fails.
    """
    def from_count_dict(value):
        return int(ast.literal_eval(value)['count'])

    def from_plain_int(value):
        return int(str(value).strip())

    def from_first_digits(value):
        return extract_number(str(value))

    for parse in (from_count_dict, from_plain_int, from_first_digits):
        try:
            parsed = parse(x)
        except Exception:
            continue
        else:
            return parsed
    return None

def mini_bar(curr, total, width=20):
    """Render a fixed-width text progress bar followed by "curr/total".

    Args:
        curr: Number of completed items.
        total: Total number of items; a non-positive total is drawn as no progress.
        width: Number of character cells in the bar.

    Returns:
        A string of exactly `width` bar cells, a space, then "curr/total".
        While incomplete, the bar is full blocks, one partial-block glyph and
        padding; at completion it is `width` full blocks (previously an extra
        sliver glyph overflowed the bar at 100%).
    """
    bars = "▏▎▍▌▋▊▉█"
    # Clamp to [0, 1] so out-of-range counts cannot over- or under-draw the bar.
    p = min(max(curr / total, 0.0), 1.0) if total > 0 else 0.0
    scaled = p * width
    nfull = int(scaled)
    if nfull >= width:
        body = '█' * width
    else:
        # Fractional part of the current cell selects one of the 8 partial glyphs.
        frac = bars[int((scaled % 1) * 8)]
        body = '█' * nfull + frac + ' ' * (width - nfull - 1)
    return f"{body} {curr}/{total}"

In [73]:
from tqdm import tqdm
import json, ast
from PIL import Image
from io import BytesIO

def safe_int_parse(x):
    """Best-effort conversion of a model answer to an int.

    Tries, in order: a dict literal with a 'count' entry, a bare integer string,
    and the first digit run found by `extract_number`. Any exception from a
    strategy moves on to the next one.

    Returns:
        The parsed int, or None when every strategy fails.
    """
    def from_count_dict(value):
        return int(ast.literal_eval(value)['count'])

    def from_plain_int(value):
        return int(str(value).strip())

    def from_first_digits(value):
        return extract_number(str(value))

    for parse in (from_count_dict, from_plain_int, from_first_digits):
        try:
            parsed = parse(x)
        except Exception:
            continue
        else:
            return parsed
    return None

def mini_bar(curr, total, width=20):
    """Render a fixed-width text progress bar followed by "curr/total".

    Args:
        curr: Number of completed items.
        total: Total number of items; a non-positive total is drawn as no progress.
        width: Number of character cells in the bar.

    Returns:
        A string of exactly `width` bar cells, a space, then "curr/total".
        While incomplete, the bar is full blocks, one partial-block glyph and
        padding; at completion it is `width` full blocks (previously an extra
        sliver glyph overflowed the bar at 100%).
    """
    bars = "▏▎▍▌▋▊▉█"
    # Clamp to [0, 1] so out-of-range counts cannot over- or under-draw the bar.
    p = min(max(curr / total, 0.0), 1.0) if total > 0 else 0.0
    scaled = p * width
    nfull = int(scaled)
    if nfull >= width:
        body = '█' * width
    else:
        # Fractional part of the current cell selects one of the 8 partial glyphs.
        frac = bars[int((scaled % 1) * 8)]
        body = '█' * nfull + frac + ' ' * (width - nfull - 1)
    return f"{body} {curr}/{total}"

# Evaluate every prompt ordering on every row, appending one JSON line per (order, row).
# The file is opened once and flushed after each row so partial runs keep their results.
with open(OUTPUT_PATH, "a") as f:
    with tqdm(total=len(PROMPT_SET_ORDERING), desc="Processing orders") as outer_pbar:
        for order in PROMPT_SET_ORDERING:
            for i, row in enumerate(df.itertuples(), 1):
                answer = extract_number(row.answer)
                img = Image.open(BytesIO(row.image['bytes'])).convert("RGB")
                inputs, row_prompt = build_prompt(row.question+" Count (in number):", img, processor, order)

                if USE_OUTLINES:
                    # Feed the prompt built for THIS row and ordering. The previous code
                    # passed the global `input_text` left over from an earlier demo cell,
                    # so every row was asked the same unrelated question.
                    res_text = object_count_generator({
                        "text": row_prompt,
                        "images": img
                    })
                else:
                    # Greedy decoding; `temperature` is omitted because transformers
                    # reports it as ignored when do_sample=False.
                    outputs = model.generate(
                        **inputs,
                        max_new_tokens=50,
                        do_sample=False,
                        output_attentions=False,
                        return_dict_in_generate=True,
                    )
                    # Decode only the tokens generated after the prompt.
                    input_length = len(inputs['input_ids'][0])
                    res_text = processor.tokenizer.decode(outputs.sequences[0][input_length:], skip_special_tokens=True)

                res = safe_int_parse(res_text)

                # Immediately write each row; abs_diff is -1 when no integer could be parsed.
                f.write(json.dumps({
                    "id": row.Index,
                    "question": row.question,
                    "answer": answer,
                    "raw": res_text,
                    "res": res,
                    "abs_diff": abs(res - answer) if res is not None else -1,
                    "order": order,
                }) + "\n")
                f.flush()  # make sure it's written to disk

                # Update mini inner progress bar in outer_pbar
                outer_pbar.set_postfix_str(f"{order} | {mini_bar(i, len(df))}")

            outer_pbar.update(1)

Processing orders: 100%|██████████| 6/6 [27:23<00:00, 273.91s/it, ['vision', 'user', 'system'] | ████████████████████▏ 308/308]


In [77]:
!pip install transformers==4.52.3

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting transformers==4.52.3
  Downloading transformers-4.52.3-py3-none-any.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers==4.52.3)
  Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.52.3-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m148.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m402.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.22.0
    Uninstalling tokenizers-0.22.0:
      Successfully uninstalled tokenizers-0.22.0
  Attempting uninstall: transformers
    Found existing installation: transformers 4.56.1
    Unins