# VLM + OCR Grounding

In [1]:
import os
# Point the Hugging Face cache at a local model directory. This cell runs
# before the `transformers` import further down.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
os.environ.update({"TRANSFORMERS_CACHE": "X:/Programming/Models"})

In [2]:
# %pip install -q pandas numpy matplotlib seaborn scikit-learn tqdm
# %pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
# %pip install -q transformers pillow nltk evaluate accelerate kagglehub
# %pip install -q rouge_score
# %pip install -q jupyter ipywidgets
# %pip install -q easyocr[ru]

In [3]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
from PIL import Image, ImageDraw
import easyocr
import numpy as np



In [4]:
# Report the PyTorch build and whether a CUDA device is visible to it.
cuda_ok = torch.cuda.is_available()

print("Torch Version:", torch.__version__)
print("CUDA status:", "Available" if cuda_ok else "Not Available")
print("CUDA Device Count:", torch.cuda.device_count())
# Only query the device name when CUDA is usable.
print("GPU Device:", torch.cuda.get_device_name(0) if cuda_ok else "No GPU")

Torch Version: 2.6.0+cu124
CUDA status: Available
CUDA Device Count: 1
GPU Device: NVIDIA GeForce RTX 4070 SUPER


### VLM

In [5]:
# Checkpoint used for the vision-language step of the pipeline.
model_id = "Qwen/Qwen2-VL-2B-Instruct"

# The processor builds the chat prompt and prepares text + image inputs.
processor = AutoProcessor.from_pretrained(model_id)

# Load the weights in half precision. `dtype` replaces the deprecated
# `torch_dtype` keyword (the installed transformers release warns about it).
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    dtype=torch.float16,
    device_map="auto"
)

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### OCR

In [6]:
# Language codes handed to EasyOCR; the reader is created with gpu=True.
OCR_LANGUAGES = ['ru', 'en']
reader = easyocr.Reader(OCR_LANGUAGES, gpu=True)

## Pipeline

In [7]:
# Input picture; the same path is later passed to the OCR reader.
image_path = "image_2.jpg"

# Convert to RGB inside a context manager so the file handle is released
# as soon as the converted copy exists.
with Image.open(image_path) as source_image:
    image = source_image.convert("RGB")

In [8]:
# Instruction sent to the VLM together with the image.
prompt = """
Extract ONLY sensitive text from the image.

Rules:
- Output ONLY text that appears verbatim in the image
- Sensitivity must depend on visual context
- One item per line
- No explanations
"""

# A single user turn whose content is the image followed by the instruction.
image_part = {"type": "image", "image": image}
text_part = {"type": "text", "text": prompt}

messages = [{"role": "user", "content": [image_part, text_part]}]

In [9]:
# Render the chat messages into a prompt string; tokenization happens in the
# processor call below, together with the image.
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

inputs = processor(
    text=[text],
    images=image,
    return_tensors="pt"
).to(model.device)

# Greedy decoding (do_sample=False) with a cap of 128 new tokens.
outputs = model.generate(
    **inputs,
    max_new_tokens=128,
    do_sample=False,
)

# `generate` returns the prompt tokens followed by the continuation. Slice the
# prompt off by length instead of searching the decoded text for an
# "assistant\n" marker, which would cut the reply short if that marker ever
# appeared inside the generated text.
prompt_len = inputs["input_ids"].shape[1]
reply = processor.decode(outputs[0][prompt_len:], skip_special_tokens=True)

# Break the reply into whitespace-separated tokens and keep only those longer
# than 3 characters; these tokens are what gets matched against OCR text.
output_vlm = [token for token in reply.split() if len(token) > 3]
print('\n'.join(output_vlm))

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Apache
Kafka
Spark
Streaming
Apache
HBase
elasticsearch
MySQL


In [10]:
# Run OCR on the image file itself (by path).
output_ocr = reader.readtext(image_path)

# Show the first detection to inspect the structure of one result.
first_detection = output_ocr[0]
first_detection

([[np.int32(731), np.int32(81)],
  [np.int32(835), np.int32(81)],
  [np.int32(835), np.int32(97)],
  [np.int32(731), np.int32(97)]],
 'Following Task',
 np.float64(0.9999287997170756))

In [11]:
# Each detection is a (corner points, text, confidence) triple; show the text.
first_corners, first_text, first_confidence = output_ocr[0]
first_text

'Following Task'

In [12]:
# Repack every OCR detection (corner points, text, confidence) as a dict so
# later cells can address the fields by name.
ocr_items = [
    {"text": detected_text, "bbox": corners, "score": confidence}
    for corners, detected_text, confidence in output_ocr
]

ocr_items[0]

{'text': 'Following Task',
 'bbox': [[np.int32(731), np.int32(81)],
  [np.int32(835), np.int32(81)],
  [np.int32(835), np.int32(97)],
  [np.int32(731), np.int32(97)]],
 'score': np.float64(0.9999287997170756)}

In [13]:
# Ground VLM tokens in OCR detections: a token is attached to every OCR item
# whose text contains it (case-insensitive substring match).
grounded = []

# dict.fromkeys drops repeated tokens while keeping first-seen order, so a
# token the VLM emitted twice does not yield duplicate grounded entries.
for phrase in dict.fromkeys(output_vlm):
    needle = phrase.lower()  # lowered once per token, not once per OCR item
    for item in ocr_items:
        if needle in item["text"].lower():
            grounded.append({
                "text": phrase,
                "bbox": item["bbox"]
            })

In [14]:
# Distinct VLM tokens that were matched to at least one OCR detection.
{entry['text'] for entry in grounded}

{'Kafka', 'MySQL', 'Spark', 'Streaming', 'elasticsearch'}

## Demonstration

In [15]:
def draw_results(image_pil, results):
    """Overlay a translucent, outlined polygon on each grounded bounding box.

    Args:
        image_pil: PIL image to annotate. It is drawn on in place, so pass a
            copy if the original must stay untouched.
        results: Iterable of dicts whose ``'bbox'`` value is a sequence of
            corner points, each indexable as ``point[0]`` (x) and
            ``point[1]`` (y).

    Returns:
        The same ``image_pil`` object, after drawing.
    """
    # The "RGBA" drawing mode makes Pillow alpha-blend colours into an RGB
    # image. With the default mode the alpha component of the fill is ignored
    # and every box is painted solid black.
    draw = ImageDraw.Draw(image_pil, "RGBA")

    fill_color = (0, 0, 0, 64)  # black fill at 64/255 opacity
    outline_color = (0, 0, 0)

    already_drawn = set()
    for item in results:
        # Plain Python ints: the coordinates may arrive as numpy scalars.
        polygon = tuple((int(point[0]), int(point[1])) for point in item['bbox'])

        # Several results can share one box; blending the same polygon twice
        # would darken it more than the others, so draw each polygon once.
        if polygon in already_drawn:
            continue
        already_drawn.add(polygon)

        draw.polygon(list(polygon), fill=fill_color, outline=outline_color, width=8)

    return image_pil

# Draw the grounded boxes on a copy so that `image` itself is left unchanged.
boxed_copy = image.copy()
annotated_image = draw_results(boxed_copy, grounded)

# Save the annotated result as a JPEG.
annotated_image.save('ocr_with_boxes.jpg', format='JPEG', quality=95)