In [None]:
!pip install transformers



In [None]:
!pip install qwen-vl-utils



Collecting qwen-vl-utils
  Downloading qwen_vl_utils-0.0.8-py3-none-any.whl.metadata (3.6 kB)
Collecting av (from qwen-vl-utils)
  Downloading av-13.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading qwen_vl_utils-0.0.8-py3-none-any.whl (5.9 kB)
Downloading av-13.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m64.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av, qwen-vl-utils
Successfully installed av-13.1.0 qwen-vl-utils-0.0.8


In [None]:
import requests
from PIL import Image
from io import BytesIO
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
import torch

# Identifier of the checkpoint used for both the model and its processor
model_directory = "Qwen/Qwen2-VL-7B-Instruct"

# Load the model weights.
# device_map="auto" already places the model on the available GPU/CPU, so the
# model is NOT moved again with .to('cuda') afterwards: an extra .to() is
# redundant and conflicts with the placement chosen by the loader.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_directory,
    torch_dtype="auto",  # Automatically selects the appropriate torch data type
    device_map="auto",   # Automatically maps the model to the available GPU/CPU
)

# Load the processor that matches the model checkpoint
processor = AutoProcessor.from_pretrained(model_directory)

# Function to download an image from a URL and resize it
def download_image(url, size=(512, 512), timeout=30):
    """Download an image from a URL and resize it.

    Args:
        url: Address of the image to fetch.
        size: (width, height) the image is resized to. Defaults to 512x512.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        The resized PIL image, or None when no URL was given or when the
        download / decoding failed (the error is printed in that case).
    """
    if not url:
        print("Error downloading image: no URL provided")
        return None

    try:
        response = requests.get(url, timeout=timeout)
        # Fail on 4xx/5xx instead of handing an error page to the image decoder
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        return img.resize(size)  # Resize image to ensure model compatibility
    except Exception as e:
        # Best-effort by design: callers treat None as "no image available"
        print(f"Error downloading image: {e}")
        return None

# Function to process a single image and extract all relevant details
def process_image(image, entity_name=None, max_new_tokens=256):
    """Ask the model to describe an image and return its answer as text.

    Args:
        image: The image to describe, or None if loading it failed.
        entity_name: Optional entity the prompt should put emphasis on.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated answer only (the prompt is stripped from the output),
        "No answer found." if decoding produced nothing, or an error string
        when image is None.
    """
    # Explicit None check: truthiness of an image object is not a reliable
    # "was it loaded" signal.
    if image is None:
        return "Error: Unable to process the image."

    # Customize the prompt based on the entity_name if provided, otherwise ask for all details
    text_prompt = "Tell me all things about the image."
    if entity_name:
        text_prompt = f"Tell me all things about the image, especially the entity: {entity_name}."

    messages = [{
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": text_prompt}]
    }]

    # Build the chat-formatted prompt string, then the tensors for the model
    processed_prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[processed_prompt], images=[image], padding=True, return_tensors="pt"
    ).to(model.device)  # follow the model's device instead of assuming "cuda"

    # Model inference
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt + completion; keep only the newly generated
    # tokens so the returned text does not echo the prompt.
    new_token_ids = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, output_ids)
    ]

    # Decode the model's output
    output_text = processor.batch_decode(
        new_token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    return output_text[0] if output_text else "No answer found."

# Example usage: set image_url to a direct link to an image file.
# NOTE(review): the URL below looks like a Google search redirect link
# (google.com/url?...) rather than a direct link to image bytes, so
# download_image will likely print an error and return None -- replace it
# with a direct image URL before running.
image_url = "https://www.google.com/url?sa=i&url=https%3A%2F%2Fwww.alamy.com%2Fstock-photo%2Ffood-expiration-date-label.html&psig=AOvVaw2-Rau9TNatrbiCTpX8pFRE&ust=1729455517005000&source=images&cd=vfe&opi=89978449&ved=0CBQQjRxqFwoTCOjBxIGim4kDFQAAAAAdAAAAABAE"  # Replace with a valid image URL or load an image file
entity_name = "ExampleEntity"  # Optional: entity the prompt should emphasise (placeholder value)

# Download the image, then run it through the model.
# process_image returns an error string when image is None.
image = download_image(image_url)
extracted_information = process_image(image, entity_name)

# Output the extracted information
print(f"Extracted Information:\n{extracted_information}")


In [None]:
pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-_91q2zn6
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-_91q2zn6
  Resolved https://github.com/huggingface/transformers to commit 816f4424964c1a1631e303b663fc3d68f731e923
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tokenizers<0.21,>=0.20 (from transformers==4.46.0.dev0)
  Downloading tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: transformers
  

In [None]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Default setup: load the 2B instruct checkpoint, letting the loader choose
# the dtype and place the weights on the available device(s).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype="auto",
    device_map="auto",
)

# Optional alternative: enable flash_attention_2 for better acceleration and
# memory saving, especially in multi-image and video scenarios.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2-VL-2B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# Default processor for the same checkpoint
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# Optional alternative: the default range for the number of visual tokens per
# image is 4-16384. Set min_pixels / max_pixels (e.g. a token count range of
# 256-1280) to balance speed and memory usage.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Chat-style request: one local image plus the extraction instruction
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "/content/maxresdefault.jpg",
            },
            # Alternative prompt (disabled):
            # {"type": "text", "text": "extract all the datas from the image in json format where all the keys manufacturing data or expiry date if any of data is not present then give it as."" give only these 2 keys bnot other than these. if manufacuting data is present then vaue of this key to blank, do same for expiry date if not present"},
            {"type": "text", "text": "Extract the expiry date from this Image if present and return answer as json.if multiple dates are there in images then return the data that is most recent companred ti the others."},

        ],
    }
]

# Preparation for inference: render the chat template to a prompt string and
# collect the image/video inputs referenced by the messages
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Move the tensors to wherever the model was placed (it is loaded with
# device_map="auto") instead of assuming a CUDA device is present
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
# generate() returns prompt + completion; drop the prompt tokens of each sample
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

['```json\n{\n  "expiry_date": "21/7/22"\n}\n```']
