In [3]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [10]:
import requests

from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

#workaround for unnecessary flash_attn requirement
import os
from unittest.mock import patch
from transformers.dynamic_module_utils import get_imports
from typing import Union, List

def fixed_get_imports(filename: Union[str, os.PathLike]) -> List[str]:
    """Wrap ``get_imports`` so Florence-2's modeling file does not require ``flash_attn``.

    Args:
        filename: Path of the Python file whose imports are being collected.

    Returns:
        The list produced by ``get_imports(filename)``. When the file is named
        ``modeling_florence2.py`` any ``"flash_attn"`` entry is dropped; every
        other file's import list is returned unchanged.
    """
    imports = get_imports(filename)
    # Compare the basename rather than a "/"-prefixed suffix so the check also
    # matches backslash-separated (Windows) and bare relative paths.
    if os.path.basename(str(filename)) != "modeling_florence2.py":
        return imports
    # Filter instead of list.remove(): remove() raises ValueError when the
    # modeling file does not list flash_attn at all.
    return [name for name in imports if name != "flash_attn"]


# Load the model and its processor from ONE checkpoint so the weights and the
# remote pre-/post-processing code always come from the same repository.
MODEL_ID = "microsoft/Florence-2-base"

with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports): #workaround for unnecessary flash_attn requirement
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Task token for object detection; reused below as the post-processing task.
prompt = "<OD>"

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
# Fail fast on a stalled or unsuccessful download instead of handing PIL an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

inputs = processor(text=prompt, images=image, return_tensors="pt")

generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=1024,
    num_beams=3,
    do_sample=False
)
# Special tokens are kept in the decoded text that post_process_generation parses.
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

parsed_answer = processor.post_process_generation(generated_text, task=prompt, image_size=(image.width, image.height))

print(parsed_answer)




config.json:   0%|          | 0.00/2.43k [00:00<?, ?B/s]

configuration_florence2.py:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Florence-2-base:
- configuration_florence2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_florence2.py:   0%|          | 0.00/127k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Florence-2-base:
- modeling_florence2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/464M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

processing_florence2.py:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Florence-2-large:
- processing_florence2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer_config.json:   0%|          | 0.00/34.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

{'<OD>': {'bboxes': [[34.23999786376953, 160.0800018310547, 597.4400024414062, 371.7599792480469], [272.32000732421875, 241.67999267578125, 303.67999267578125, 247.4399871826172], [454.0799865722656, 276.7200012207031, 553.9199829101562, 370.79998779296875], [96.31999969482422, 280.55999755859375, 198.0800018310547, 371.2799987792969]], 'labels': ['car', 'door handle', 'wheel', 'wheel']}}


In [11]:
# Show the raw decoded sequence (decoded with skip_special_tokens=False) that
# post_process_generation turned into the parsed answer in the previous cell.
print(generated_text)

</s><s>car<loc_53><loc_333><loc_933><loc_774>door handle<loc_425><loc_503><loc_474><loc_515>wheel<loc_709><loc_576><loc_865><loc_772><loc_150><loc_584><loc_309><loc_773></s>


In [14]:
def run_example(task_prompt, text_input=None):
    """Run one Florence-2 task on the notebook-level ``image`` and print the parsed result.

    Relies on the globals ``processor``, ``model`` and ``image`` defined in
    earlier cells.

    Args:
        task_prompt: Task token such as "<CAPTION>" or "<OD>"; it is also passed
            as ``task`` to the processor's post-processing step.
        text_input: Optional extra text appended directly after the task token.

    Returns:
        None. The parsed answer is printed.
    """
    prompt = task_prompt if text_input is None else task_prompt + text_input
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
        # Explicit, to match the other generate() calls in this notebook.
        do_sample=False,
    )
    # Special tokens are kept in the decoded text handed to post-processing.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )

    print(parsed_answer)

In [15]:
# Caption the example image loaded earlier (run_example reads the global `image`).
# Note: this rebinds the notebook-level `prompt` that previously held "<OD>".
prompt = "<CAPTION>"
run_example(prompt)

{'<CAPTION>': '\nA green car parked in front of a yellow building.\n'}


In [16]:
def filter_caption(output):
    """Return the text stored under the '<CAPTION>' key without surrounding whitespace.

    A mapping that lacks the key yields an empty string.
    """
    raw_caption = output.get('<CAPTION>', '')
    cleaned_caption = raw_caption.strip()
    return cleaned_caption

# Example usage: `output` is a hard-coded copy of the dictionary printed by
# run_example("<CAPTION>") (run_example prints its result rather than returning it).
output = {'<CAPTION>': '\nA green car parked in front of a yellow building.\n'}
caption = filter_caption(output)
print(caption)

A green car parked in front of a yellow building.


In [20]:
def generate_image_captions(input, output, folder_path):
    """Caption each listed image with the '<CAPTION>' task and write one caption per line.

    Relies on the globals ``processor`` and ``model`` defined in earlier cells.

    Args:
        input: Iterable of image file names, relative to ``folder_path``.
        output: Path of the text file to (over)write with the captions.
        folder_path: Directory that contains the image files.

    Returns:
        None. Captions are written to ``output`` in the iteration order of
        ``input``; file names are not recorded in the output.
    """
    task = '<CAPTION>'
    # Explicit UTF-8 so captions are encoded the same way on every platform
    # instead of depending on the locale's default encoding.
    with open(output, 'w', encoding='utf-8') as output_file:
        # Loop through the image files and open each one
        for image_file in input:
            image_path = os.path.join(folder_path, image_file)
            with Image.open(image_path) as source_image:
                # PNG files can be RGBA, palette or grayscale; normalise to
                # 3-channel RGB before preprocessing (a plain copy for RGB input).
                # NOTE(review): assumes the processor expects RGB images - confirm.
                image = source_image.convert("RGB")
                inputs = processor(text=task, images=image, return_tensors="pt")
                generated_ids = model.generate(
                    input_ids=inputs["input_ids"],
                    pixel_values=inputs["pixel_values"],
                    max_new_tokens=1024,
                    num_beams=3,
                    do_sample=False
                )
                generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
                parsed_answer = processor.post_process_generation(
                    generated_text,
                    task=task,
                    image_size=(image.width, image.height),
                )
                caption = parsed_answer.get('<CAPTION>', '').strip()
                output_file.write(f"{caption}\n")

# Caption every PNG in the "story" image folder, writing one caption per line.
# NOTE(review): os.listdir returns names in arbitrary order and the output file
# stores captions only - confirm downstream code can map lines back to images.
folder_path_story = '/Users/mimi/Desktop/internship/study/all_images_story'
story_files = os.listdir(folder_path_story)
story_image_files = [
    file_name
    for file_name in story_files
    if file_name.lower().endswith('.png')
]
generate_image_captions(
    story_image_files,
    '/Users/mimi/Desktop/internship/study/story_image_captions_florence.txt',
    folder_path_story,
)

In [21]:
# Caption every PNG in the "sound" image folder, writing one caption per line.
# NOTE(review): os.listdir returns names in arbitrary order and the output file
# stores captions only - confirm downstream code can map lines back to images.
folder_path_sound = '/Users/mimi/Desktop/internship/study/all_images_sound'
sound_files = os.listdir(folder_path_sound)
sound_image_files = [
    file_name
    for file_name in sound_files
    if file_name.lower().endswith('.png')
]
generate_image_captions(
    sound_image_files,
    '/Users/mimi/Desktop/internship/study/sound_image_captions_florence.txt',
    folder_path_sound,
)