In [2]:
!pip install qwen_vl_utils

Collecting qwen_vl_utils
  Downloading qwen_vl_utils-0.0.8-py3-none-any.whl.metadata (3.6 kB)
Collecting av (from qwen_vl_utils)
  Downloading av-14.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Downloading qwen_vl_utils-0.0.8-py3-none-any.whl (5.9 kB)
Downloading av-14.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.0/33.0 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av, qwen_vl_utils
Successfully installed av-14.0.1 qwen_vl_utils-0.0.8


In [13]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

# Set environment variable for expandable segments
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Load the model with reduced precision and enable flash attention
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "prithivMLmods/Qwen2-VL-OCR-2B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",  # Let accelerate handle the device mapping
)

# Default processor
processor = AutoProcessor.from_pretrained("prithivMLmods/Qwen2-VL-OCR-2B-Instruct")

# Prepare messages for inference
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "/content/card.png",
            },
            {"type": "text", "text": "Extract the text from the ID image"},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)

# Clear any cached memory
torch.cuda.empty_cache()

# Move inputs to GPU
inputs = {key: val.to("cuda") for key, val in inputs.items()}

# Inference: Generation of the output
try:
    generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Trim the generated IDs to get the output text
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    # Print the output text
    print(output_text)

except RuntimeError as e:
    if 'out of memory' in str(e):
        print("CUDA out of memory. Try reducing the batch size or using mixed precision.")
        torch.cuda.empty_cache()  # Clear cache to free up memory
    else:
        raise e

print("Final memory allocated:", torch.cuda.memory_allocated())
print("Final memory reserved:", torch.cuda.memory_reserved())

["ROYAUME DU MAROC\nCARTE NATIONALE D'IDENTITE\nMALAK\nHAFFANE\nNée le\n23.09.2006\nà MAARIF CASABLANCA ANFA\nValable jusqu'au\n26.12.2028\nFG\nBJ472690<|im_end|>"]
Final memory allocated: 5150672896
Final memory reserved: 9770631168
