# Florence-2: Open Source Vision Foundation Model
Florence-2 is a lightweight vision-language foundation model developed by Microsoft Azure AI and open-sourced under the MIT license. It aims to achieve a unified, prompt-based representation for diverse vision and vision-language tasks, including captioning, object detection, grounding, and segmentation. Despite its compact size, Florence-2 rivals much larger models like Kosmos-2 in performance. Florence-2 represents a significant advancement in vision-language models by combining lightweight architecture with robust capabilities, making it highly accessible and versatile. Its unified representation approach, supported by the extensive FLD-5B dataset, enables it to excel in multiple vision tasks without the need for separate models. This efficiency makes Florence-2 a strong contender for real-world applications, particularly on devices with limited resources.
# Paper 
https://arxiv.org/pdf/2311.06242
# Models
https://huggingface.co/microsoft/Florence-2-large
# Dataset
The FLD-5B training set contains 126M images with more than 500M text annotations, 1.3B region-text annotations, and 3.6B text-phrase-region annotations. Each image is annotated with text, region-text pairs, and text-phrase-region triplets, and each annotation type has multiple instances of varying granularity.


# OpenVINO (Open Visual Inference and Neural Network Optimization)
OpenVINO is an open-source toolkit for optimizing and deploying deep learning models from cloud to edge. It accelerates deep learning inference across various use cases, such as generative AI, video, audio, and language with models from popular frameworks like PyTorch, TensorFlow, ONNX, and more. Convert and optimize models, and deploy across a mix of Intel® hardware and environments, on-premises and on-device, in the browser or in the cloud.

- https://docs.openvino.ai/2024/index.html
- https://docs.openvino.ai/2024/documentation/openvino-ir-format.html (Intermediate Representation)
# MLflow

MLflow is an open-source platform, purpose-built to assist machine learning practitioners and teams in handling the complexities of the machine learning process. MLflow focuses on the full lifecycle for machine learning projects, ensuring that each phase is manageable, traceable, and reproducible.

https://mlflow.org/docs/latest/index.html

In [None]:
import platform

# Install the packages this notebook uses; the extra index URL points at the
# PyTorch CPU wheel repository.
%pip install -q "openvino>=2024.3.0" "einops" "torch>2.1" "torchvision" "timm>=0.9.8" "transformers>=4.41" "pillow" "gradio>=4.19" --extra-index-url https://download.pytorch.org/whl/cpu

# matplotlib is capped below 3.7 on Windows only; other platforms accept any 3.4+ release.
if platform.system() != "Windows":
    %pip install -q "matplotlib>=3.4"
else:
    %pip install -q "matplotlib>=3.4,<3.7"

In [None]:
#! pip install mlflow ov_helpers -q

In [None]:
import requests
from pathlib import Path
import os

In [None]:


# Fetch the notebook's helper modules from the openvino_notebooks repository
# when they are not already present in the working directory.
_HELPER_BASE_URL = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest"
_HELPER_SOURCES = {
    "ov_florence2_helper.py": f"{_HELPER_BASE_URL}/notebooks/florence2/ov_florence2_helper.py",
    "gradio_helper.py": f"{_HELPER_BASE_URL}/notebooks/florence2/gradio_helper.py",
    "notebook_utils.py": f"{_HELPER_BASE_URL}/utils/notebook_utils.py",
}

for helper_name, helper_url in _HELPER_SOURCES.items():
    helper_path = Path(helper_name)
    if helper_path.exists():
        continue
    r = requests.get(url=helper_url, timeout=60)
    # Fail loudly instead of saving an HTTP error page as a Python module.
    r.raise_for_status()
    # write_text opens, writes and closes the file; the previous bare
    # open(...).write(...) left closing the handle to the garbage collector.
    helper_path.write_text(r.text, encoding="utf-8")

In [None]:
# NOTE(review): the helpers are imported from the `ov_helpers` package, while the
# download cell saves ov_florence2_helper.py into the working directory —
# confirm which copy is meant to be used.
from ov_helpers.ov_florence2_helper import convert_florence2, get_model_selector

# Selector object whose `.value` is read in the next cell as the model id.
model_selector = get_model_selector()

# Trailing expression so the notebook displays the selector.
model_selector

In [None]:
# The selector's current value is the model id; the text after its last "/"
# becomes the local directory name for the converted model.
model_id = model_selector.value
model_path = Path(model_id.rsplit("/", 1)[-1])

# Uncomment the line to see conversion code
#??convert_florence2

In [None]:
convert_florence2(model_id, model_path)

In [None]:
from notebook_utils import device_widget

device = device_widget()

device

In [None]:
from ov_helpers.ov_florence2_helper import OVFlorence2Model

In [None]:
modelp = os.path.join(os.getcwd(), model_path.name)
modelp

In [None]:
model = OVFlorence2Model(modelp, device.value)

In [None]:
import requests
from PIL import Image

from transformers import AutoProcessor

# Load the processor from the converted model directory. trust_remote_code=True
# allows transformers to run custom code shipped with the model files.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

# Task token used for the first inference below (object detection).
prompt = "<OD>"

# Alternative: download a sample image instead of reading the local file.
# url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
# image = Image.open(requests.get(url, stream=True).raw)
# Sample image, given relative to the current working directory.
path = "../images/buildings.jpg"
image = Image.open(path)
#image

In [None]:
image.width

In [None]:
#processor.image_processor.crop_size = {'height': 1024, 'width': 1024}

In [None]:
inputs = processor(text=prompt, images=image, return_tensors="pt")



In [None]:
#processor

In [None]:
inputs.keys()

In [None]:
inputs["pixel_values"].shape

In [None]:
inputs["input_ids"]

In [None]:
# Generate with beam search (sampling disabled) from the tokenized prompt and
# the preprocessed image.
generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=1024,
    do_sample=False,
    num_beams=3,
)
# Special tokens are kept in the decoded text; only the first sequence of the
# batch is used.
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

# Post-process the raw text for the "<OD>" task; image.size is the
# (width, height) pair of the PIL image.
parsed_answer = processor.post_process_generation(
    generated_text,
    task="<OD>",
    image_size=image.size,
)

In [None]:
parsed_answer

In [None]:
import io
import copy
import random
import requests
from pathlib import Path

import gradio as gr
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np

from PIL import Image, ImageDraw

In [None]:
def plot_bbox(image, data):
    """Draw labelled bounding boxes over an image and return the figure.

    Args:
        image: Image to display; passed unchanged to ``Axes.imshow``.
        data: Mapping with parallel ``"bboxes"`` and ``"labels"`` sequences.
            Each bbox is unpacked as ``(x1, y1, x2, y2)`` corner coordinates.

    Returns:
        The matplotlib figure that holds the annotated image.
    """
    fig, ax = plt.subplots()
    ax.imshow(image)
    for bbox, label in zip(data["bboxes"], data["labels"]):
        x1, y1, x2, y2 = bbox
        # Rectangle takes an anchor corner plus width and height.
        rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=1, edgecolor="r", facecolor="none")
        ax.add_patch(rect)
        # Write the label through `ax` so it always lands on the axes created
        # here, not on whatever pyplot currently treats as the active axes.
        ax.text(x1, y1, label, color="white", fontsize=8, bbox=dict(facecolor="red", alpha=0.5))
    ax.axis("off")
    return fig

In [None]:
#from gradio_helper import plot_bbox

fig = plot_bbox(image, parsed_answer["<OD>"])

In [None]:
prompt = "<MORE_DETAILED_CAPTION>"
inputs = processor(text=prompt, images=image, return_tensors="pt")

In [None]:
# Generate with beam search (sampling disabled) for the caption prompt prepared
# in the previous cell.
generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=1024,
    do_sample=False,
    num_beams=3,
)
# Special tokens are kept in the decoded text; only the first sequence of the
# batch is used.
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

# Post-process the raw text for the caption task; image.size is the
# (width, height) pair of the PIL image.
parsed_answer = processor.post_process_generation(
    generated_text,
    task="<MORE_DETAILED_CAPTION>",
    image_size=image.size,
)

In [None]:
parsed_answer.get('<MORE_DETAILED_CAPTION>')

In [None]:
from dotenv import load_dotenv
import os
from transformers.utils import logging
ROOT_DIR = os.getcwd()
load_dotenv(os.path.join(ROOT_DIR,".env"))

In [None]:
from dotenv import dotenv_values
import mlflow
config = dotenv_values(os.path.join(ROOT_DIR,".env"))

In [None]:
logging.set_verbosity_error()

os.environ["TRANSFORMERS_VERBOSITY"] = "error"

In [None]:
MLFLOW_TRACKING_URI=config.get('MLFLOW_TRACKING_URI')
# Specify the workspace hostname and token
DATABRICKS_HOST=config.get('DATABRICKS_HOST')
DATABRICKS_TOKEN=config.get('DATABRICKS_TOKEN')

In [None]:

# Copy the tracking and Databricks settings into the process environment.
# config.get(...) returns None for a key that is absent from .env, and
# os.environ only accepts strings, so missing keys are reported by name here
# instead of surfacing as an opaque TypeError.
_required_settings = {
    "MLFLOW_TRACKING_URI": MLFLOW_TRACKING_URI,
    "DATABRICKS_HOST": DATABRICKS_HOST,
    "DATABRICKS_TOKEN": DATABRICKS_TOKEN,
}
_missing_settings = [name for name, value in _required_settings.items() if value is None]
if _missing_settings:
    raise ValueError(f"Missing required settings in .env: {', '.join(_missing_settings)}")

os.environ.update(_required_settings)

In [None]:
os.environ["DATABRICKS_HOST"]

In [None]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [None]:
mlflow.set_experiment(f"{config.get('USER_DATABRICKS')}/Florence2_captioning")

In [None]:
import mlflow
from mlflow.models.signature import infer_signature
from mlflow.pyfunc import PythonModel
import pprint

In [None]:
os.getcwd()

In [None]:
class Florence2_Captioner(PythonModel):
    """MLflow ``PythonModel`` wrapper around the OpenVINO Florence-2 model.

    ``load_context`` builds the model and the processor from the ``snapshot``
    artifact; ``predict`` runs one Florence-2 task prompt on each input image.
    """

    # Fallbacks used when ``params`` does not supply the corresponding key.
    DEFAULT_TASK = "<MORE_DETAILED_CAPTION>"
    DEFAULT_MAX_NEW_TOKENS = 1024
    DEFAULT_NUM_BEAMS = 3

    def load_context(self, context):
        """Create the OpenVINO model and the processor from the ``snapshot`` artifact.

        Args:
            context: MLflow context; ``context.artifacts["snapshot"]`` is the
                model directory passed to both constructors.
        """
        from ov_helpers.ov_florence2_helper import OVFlorence2Model
        from transformers import AutoProcessor

        snapshot_dir = context.artifacts["snapshot"]
        self.model = OVFlorence2Model(model_dir=snapshot_dir, device="AUTO")
        self.processor = AutoProcessor.from_pretrained(snapshot_dir, trust_remote_code=True)

    def predict(self, context, model_input, params=None):
        """Run a Florence-2 task on every image path in ``model_input``.

        Args:
            context: MLflow context (not used by this method).
            model_input: Object indexable by ``"path_image"`` that yields image
                paths, e.g. a DataFrame with that column. A bare string is
                treated as a single path.
            params: Optional overrides for ``task``, ``max_new_tokens`` and
                ``num_beams``; absent keys fall back to the class defaults.

        Returns:
            ``{task: [result, ...]}`` with one post-processed result per image,
            in input order. A single-row input yields a one-element list.
        """
        # Imported locally, like the load_context dependencies, so the method
        # does not depend on a notebook-level global being present.
        from PIL import Image

        options = params or {}
        task = options.get("task", self.DEFAULT_TASK)
        max_new_tokens = options.get("max_new_tokens", self.DEFAULT_MAX_NEW_TOKENS)
        num_beams = options.get("num_beams", self.DEFAULT_NUM_BEAMS)

        # Iterate the column's values instead of indexing with [0]: that lookup
        # is label-based on a pandas Series and silently ignored further rows.
        image_paths = model_input["path_image"]
        if isinstance(image_paths, str):
            image_paths = [image_paths]

        results = []
        for image_path in image_paths:
            # The context manager closes the file handle; convert() returns an
            # independent RGB copy that stays usable afterwards.
            with Image.open(image_path) as source_image:
                raw_image = source_image.convert("RGB")

            inputs = self.processor(text=task, images=raw_image, return_tensors="pt")
            generated_ids = self.model.generate(
                input_ids=inputs["input_ids"],
                pixel_values=inputs["pixel_values"],
                max_new_tokens=max_new_tokens,
                do_sample=False,
                num_beams=num_beams,
            )
            generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
            parsed_answer = self.processor.post_process_generation(
                generated_text,
                task=task,
                image_size=(raw_image.width, raw_image.height),
            )
            results.append(parsed_answer.get(task))

        return {task: results}

In [None]:
import numpy as np
import pandas as pd

import mlflow
from mlflow.models.signature import ModelSignature
from mlflow.types import ColSpec, DataType, ParamSchema, ParamSpec, Schema

from mlflow.models import infer_signature


# model_output= [{'<MORE_DETAILED_CAPTION>': """This is an image of a city. The city is filled with buildings. The buildings are very tall. The building in the middle is made of glass and metal.
# The sky is gray and cloudy. There are mountains in the background. The mountains are brown and gray. The trees in the foreground are green and healthy. There is a street light by 
# the buildings."""}]

# Output schema: one string column named "task".
# NOTE(review): Florence2_Captioner.predict() keys its result by the task token
# (e.g. "<MORE_DETAILED_CAPTION>"), and "<OD>" yields a dict of boxes and labels,
# so this schema does not describe every output — confirm whether to relax it.
model_output = Schema([ColSpec(DataType.string, "task")])

# Input schema: one string column holding the path of the image to process.
model_input = Schema([ColSpec(DataType.string, "path_image")])

# Inference-time parameters and their defaults.
# NOTE(review): "temperature" is declared here, but predict() only reads
# "task", "max_new_tokens" and "num_beams" — confirm whether it is still needed.
parameters = ParamSchema(
    [
        ParamSpec("temperature", DataType.float, np.float32(0.1), None),
        ParamSpec("max_new_tokens", DataType.integer, np.int32(1024), None),
        ParamSpec("num_beams", DataType.integer, np.int32(3), None),
        ParamSpec("task", DataType.string, "<MORE_DETAILED_CAPTION>", None),
    ]
)

signature = ModelSignature(inputs=model_input, outputs=model_output, params=parameters)

# Define input example
# The sample image is resolved from the same relative location the notebook
# opens earlier, instead of an absolute path that exists on one machine only.
input_example = pd.DataFrame({"path_image": [str(Path("../images/buildings.jpg").resolve())]})

In [None]:
signature

In [None]:
import datetime
now = datetime.datetime.now()
now.strftime("%Y-%m-%d_%H:%M:%S")

In [None]:
import torch
import transformers
# Get the current base version of torch that is installed, without specific version modifiers
torch_version = torch.__version__.split("+")[0]

In [None]:
#os.environ['MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR']="false"

In [None]:

# Start an MLflow run context and log the Florence model wrapper along with the param-included signature to
# allow for overriding parameters at inference time
now = datetime.datetime.now()

description= """Log Florence2 
Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks
https://huggingface.co/microsoft/Florence-2-large
```
### Caption
- task_prompt = "<CAPTION>"
- task_prompt = "<DETAILED_CAPTION>"
- task_prompt = "<MORE_DETAILED_CAPTION>"

### Object detection
OD results format: {'<OD>': { 'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['label1', 'label2', ...] } }
- task_prompt = "<OD>"

### Dense region caption
Dense region caption results format: {'<DENSE_REGION_CAPTION>': {'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['label1', 'label2', ...]}}
- task_prompt = "<DENSE_REGION_CAPTION>"

### Region proposal

Region proposal results format: {'<REGION_PROPOSAL>' : {'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['', '', ...]}}
- task_prompt = "<REGION_PROPOSAL>"

task_prompt = "<CAPTION_TO_PHRASE_GROUNDING>"

task_prompt = "<REFERRING_EXPRESSION_SEGMENTATION>"

task_prompt = "<REGION_TO_SEGMENTATION>"

task_prompt = "<OPEN_VOCABULARY_DETECTION>"

task_prompt = "<REGION_TO_CATEGORY>"

task_prompt = "<REGION_TO_DESCRIPTION>"

task_prompt = "<OCR>"

task_prompt = "<OCR_WITH_REGION>"
```
"""
# Log the wrapper, its model snapshot and the params-aware signature in one run.
with mlflow.start_run(run_name=f"florence2_captioner_log_{now.strftime('%Y-%m-%d_%H:%M:%S')}", description=description) as run:
    model_info = mlflow.pyfunc.log_model(
        "captioner",
        python_model=Florence2_Captioner(),
        # NOTE: the artifacts dictionary mapping is critical! This dict is used by the load_context()
        artifacts={"snapshot": modelp},
        pip_requirements=[
            # Pin torch to the version running in this kernel (torch_version has
            # the local "+cpu"-style suffix stripped). The previous "torch>2.4.1"
            # could require a different release than the one installed above
            # with "torch>2.1".
            f"torch=={torch_version}",
            f"transformers=={transformers.__version__}",
            "pillow",
            "openvino>=2024.3.0",
            "einops",
            "torchvision",
            "timm>=0.9.8",
            "ov_helpers",
            "nncf",
        ],
        input_example=input_example,
        signature=signature,
    )

In [None]:
run.to_dictionary()

In [None]:
model_info.signature_dict

In [None]:
model_info.model_uri

In [None]:
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)

In [None]:
loaded_model.metadata

In [None]:
# Build the request before starting the clock so only predict() is timed.
# The sample image is resolved from the same relative location the notebook
# opens earlier, instead of an absolute path that exists on one machine only.
input_image = pd.DataFrame({"path_image": [str(Path("../images/buildings.jpg").resolve())]})

time1 = datetime.datetime.now()
response = loaded_model.predict(input_image, params={"task": "<MORE_DETAILED_CAPTION>"})
time2 = datetime.datetime.now()
# Elapsed wall-clock time for one prediction, printed as a timedelta.
print(time2 - time1)

In [None]:

pprint.pprint(response["<MORE_DETAILED_CAPTION>"][0])

In [None]:
result = mlflow.register_model(
    model_info.model_uri, "florence2_captioner"
)

In [None]:
from mlflow import MlflowClient

client = MlflowClient()

In [None]:
import mlflow.pyfunc

model_name = "florence2_captioner"
# Use the version returned by mlflow.register_model(...) above; a hard-coded 1
# keeps loading the first registration after the model is registered again.
model_version = result.version

# Load the registered model through the model registry URI.
model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{model_version}")

In [None]:
response = model.predict(input_image, params={"task": "<MORE_DETAILED_CAPTION>" })
pprint.pprint(response["<MORE_DETAILED_CAPTION>"][0])

In [None]:
response = model.predict(input_image, params={"task": "<OD>" })
pprint.pprint(response["<OD>"][0])

In [None]:
f"models:/{model_name}/{model_version}"