<a href="https://colab.research.google.com/github/olonok69/LLM_Notebooks/blob/main/mlflow/custom/BLIP_Image_Captioning_save_to_MLFLOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation

Vision-Language Pre-training (VLP) has advanced the performance for many vision-language tasks. However, most existing pre-trained models only excel in either understanding-based tasks or generation-based tasks. Furthermore, performance improvement has been largely achieved by scaling up the dataset with noisy image-text pairs collected from the web, which is a suboptimal source of supervision. In this paper, we propose BLIP, a new VLP framework which transfers flexibly to both vision-language understanding and generation tasks. BLIP effectively utilizes the noisy web data by bootstrapping the captions, where a captioner generates synthetic captions and a filter removes the noisy ones. We achieve state-of-the-art results on a wide range of vision-language tasks, such as image-text retrieval (+2.7% in average recall@1), image captioning (+2.8% in CIDEr), and VQA (+1.6% in VQA score). BLIP also demonstrates strong generalization ability when directly transferred to video-language tasks in a zero-shot manner. Code, models, and datasets are released.

- https://arxiv.org/pdf/2201.12086

- https://huggingface.co/Salesforce/blip-image-captioning-large


### Citation
```
@misc{https://doi.org/10.48550/arxiv.2201.12086,
  doi = {10.48550/ARXIV.2201.12086},
  
  url = {https://arxiv.org/abs/2201.12086},
  
  author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
  
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
  
  title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
  
  publisher = {arXiv},
  
  year = {2022},
  
  copyright = {Creative Commons Attribution 4.0 International}
}
```




In [None]:
# Mount Google Drive under /content/drive; the model folder and the sample
# image used later in this notebook are read from that mount point.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
# Notebook dependencies: MLflow for tracking, plus optimum and open_clip_torch
# (neither of the latter two is imported in the cells visible here).
!pip install mlflow   optimum open_clip_torch --quiet

# NOTE(review): psutil/pynvml are presumably for system/GPU metrics — confirm
# they are still needed; no visible cell imports them directly.
! pip install psutil pynvml -q

In [None]:
# Transformers installation

! pip install transformers[torch] -q
! pip install accelerate -U -q
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

In [None]:
# Drive folder that holds the saved BLIP model and processor files.
path_model = "/content/drive/MyDrive/models/blip_pytorch"

In [None]:
from google.colab import userdata

import torch
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import os
import sys
import platform
from PIL import Image

In [None]:
import torch.nn as nn
from torch import cuda, bfloat16
from google.colab import userdata
import mlflow
import numpy as np

In [None]:

from google.colab import output
from transformers import pipeline
from transformers.utils import logging  # transformers' logging helper, not the stdlib module
import transformers

# Switch on Colab's custom widget manager for this notebook session.
output.enable_custom_widget_manager()

In [None]:
# Restrict transformers output to errors: first through its logging helper,
# then through the TRANSFORMERS_VERBOSITY environment variable.
logging.set_verbosity_error()
os.environ.update(TRANSFORMERS_VERBOSITY="error")

In [None]:


# Use the current CUDA device when one is available, otherwise fall back to CPU.
if cuda.is_available():
    device = f"cuda:{cuda.current_device()}"
else:
    device = "cpu"

# Echo the selected device as the cell output.
device


In [None]:
# Tracking URI handed to MLflow later in the notebook.
MLFLOW_TRACKING_URI = "databricks"

# Workspace hostname, and the access token read from the Colab secret store.
DATABRICKS_HOST = "https://adb-2467347032368999.19.azuredatabricks.net/"
# NOTE(review): the secret name looks misspelled ("DATABRCKS_TTOKEN") — confirm
# it matches the name actually stored in Colab before renaming anything.
DATABRICKS_TOKEN = userdata.get("DATABRCKS_TTOKEN")

In [None]:


# Export the MLflow/Databricks settings to the process environment, leaving
# any variable that is already set untouched. Written as a loop (a statement)
# so the cell does not echo any of the values as its output.
for env_name, env_value in (
    ("MLFLOW_TRACKING_URI", MLFLOW_TRACKING_URI),
    ("DATABRICKS_HOST", DATABRICKS_HOST),
    ("DATABRICKS_TOKEN", DATABRICKS_TOKEN),
):
    os.environ.setdefault(env_name, env_value)

In [None]:
# Point the MLflow client at the tracking URI defined earlier in the notebook.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [None]:

# Select the experiment that the run started later in this notebook logs to.
mlflow.set_experiment("/Users/mlengineer@test.com/blip_captioning")


In [None]:
# End the currently active MLflow run, if there is one, before starting a new one.
mlflow.end_run()

In [None]:


# Load the BLIP captioning checkpoint and caption one sample image, first with
# a text prompt (conditional) and then without one (unconditional).
checkpoint = "Salesforce/blip-image-captioning-large"

# Follow the `device` chosen earlier instead of hard-coding "cuda", so the cell
# also runs on a CPU-only runtime. Half precision is kept for CUDA only; on CPU
# the model and inputs stay in float32.
model_dtype = torch.float16 if device.startswith("cuda") else torch.float32

processor = BlipProcessor.from_pretrained(checkpoint)
model = BlipForConditionalGeneration.from_pretrained(checkpoint, torch_dtype=model_dtype).to(device)

image_path = "/content/drive/MyDrive/data/beach.jpg"
# convert() returns a new image, so the file handle can be closed right away.
with Image.open(image_path) as image_file:
    raw_image = image_file.convert("RGB")

# conditional image captioning
text = "a photography of"
inputs = processor(raw_image, text, return_tensors="pt").to(device, model_dtype)

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
# >>> a photography of a woman and her dog

# unconditional image captioning
inputs = processor(raw_image, return_tensors="pt").to(device, model_dtype)

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))

In [None]:
# Write the model weights and the processor files into the same folder so the
# directory can be reloaded as a single snapshot.
model.save_pretrained(path_model)
processor.save_pretrained(path_model)

In [None]:
# Show the sample image's pixel data as a NumPy array.
np.array(raw_image)

In [None]:
# Build an image-to-text pipeline from the saved folder, which is used both as
# the model and as the image processor location.
pipe = pipeline( model=path_model, image_processor=path_model, task= "image-to-text")

In [None]:
pipe.max_length=200
pipe.num_beams=5
pipe.do_sample=True
pipe.temperature=1.0

In [None]:
pipe.predict(raw_image.resize((224, 224)))

In [None]:
transformers.__version__

In [None]:

import mlflow
from mlflow.models.signature import infer_signature
from mlflow.pyfunc import PythonModel
import pprint

In [None]:
class BLIP_Captioner(PythonModel):
    """MLflow pyfunc wrapper that captions an image file with a saved BLIP snapshot.

    The model and processor are loaded from the directory registered under the
    ``"snapshot"`` artifact key. ``predict`` captions a single image, identified
    by the first ``path_image`` entry of the input.
    """

    # Prompt used for conditional captioning when the caller supplies none.
    DEFAULT_PROMPT = "a photography of"

    def load_context(self, context):
        """Load the BLIP model and processor from the ``snapshot`` artifact.

        Args:
            context: MLflow model context; ``context.artifacts["snapshot"]``
                must be the directory holding the saved model and processor.
        """
        # Imported lazily so the class can be defined and pickled without
        # importing transformers first.
        from transformers import BlipProcessor, BlipForConditionalGeneration

        snapshot_dir = context.artifacts["snapshot"]
        self.model = BlipForConditionalGeneration.from_pretrained(snapshot_dir)
        self.processor = BlipProcessor.from_pretrained(snapshot_dir)

    def predict(self, context, model_input, params=None):
        """Return a caption for the image referenced by ``model_input``.

        Args:
            context: MLflow model context (unused here).
            model_input: Mapping or DataFrame-like object with a ``path_image``
                entry. A plain string is used as the path directly; for a
                column or list only the first element is captioned.
            params: Optional dict of inference parameters. ``"prompt"``
                overrides the default conditioning text.

        Returns:
            The decoded caption as a string.
        """
        # Imported here so the method does not rely on a notebook-level
        # ``Image`` name being available wherever the model is loaded.
        from PIL import Image

        paths = model_input["path_image"]
        if isinstance(paths, str):
            # A bare string must not be indexed: paths[0] would be its first character.
            image_path = paths
        elif hasattr(paths, "iloc"):
            # Positional access, so a non-default pandas index cannot break the lookup.
            image_path = paths.iloc[0]
        else:
            image_path = paths[0]

        prompt = (params or {}).get("prompt", self.DEFAULT_PROMPT)

        # convert() returns a new image, so the file handle can be closed right away.
        with Image.open(image_path) as image_file:
            raw_image = image_file.convert("RGB")

        # conditional image captioning
        inputs = self.processor(raw_image, prompt, return_tensors="pt")
        out = self.model.generate(**inputs)
        return self.processor.decode(out[0], skip_special_tokens=True)


In [None]:
import numpy as np
import pandas as pd

import mlflow
from mlflow.models.signature import ModelSignature
from mlflow.types import ColSpec, DataType, ParamSchema, ParamSpec, Schema

from mlflow.models import infer_signature

model_output= [{'generated_text': 'surfers on the beach with their boards in the water'}]

model_input= {"path_image":"/content/drive/MyDrive/data/beach.jpg"}
signature = infer_signature(model_input=model_input,model_output=model_output)



# Define input example
input_example = {"path_image":"/content/drive/MyDrive/data/beach.jpg"}

In [None]:
signature

In [None]:
input_example

In [None]:
import datetime

# Capture the current time and preview the timestamp layout; the same pattern
# is used for the MLflow run name later in the notebook.
now = datetime.datetime.now()
f"{now:%Y-%m-%d_%H:%M:%S}"

In [None]:
# Keep only the release part of the installed torch version, dropping any
# local build suffix that follows a "+".
torch_version = torch.__version__.partition("+")[0]

In [None]:

# Start an MLflow run and log the BLIP captioner wrapper together with its
# signature and input example.
now = datetime.datetime.now()

description= """Log BLIP captioner model with mlflow
BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation
https://huggingface.co/Salesforce/blip-image-captioning-large

"""
with mlflow.start_run(run_name=f"blip_captioner_log_{now.strftime('%Y-%m-%d_%H:%M:%S')}", description=description) as run:
    model_info = mlflow.pyfunc.log_model(
        "captioner",
        python_model=BLIP_Captioner(),
        # NOTE: the artifacts dictionary mapping is critical! load_context()
        # looks the snapshot directory up under the "snapshot" key. path_model
        # is reused here (instead of repeating the literal path) so the logged
        # snapshot is always the folder that save_pretrained() wrote to.
        artifacts={"snapshot": path_model},
        # Pin torch and transformers to the versions installed in this session.
        pip_requirements=[
            f"torch=={torch_version}",
            f"transformers=={transformers.__version__}",
            "pillow",
        ],
        input_example=input_example,
        signature=signature,
    )

In [None]:
# Inspect the run object returned by start_run() as a dictionary.
run.to_dictionary()

In [None]:

# URI of the model that log_model() just stored.
model_info.model_uri

In [None]:
# Load the logged model back through the generic pyfunc interface.
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)

In [None]:
loaded_model

In [None]:

time1=  datetime.datetime.now()
response = loaded_model.predict({"path_image":"/content/drive/MyDrive/data/beach.jpg"})
time2=  datetime.datetime.now()
print(time2-time1)

In [None]:

pprint.pprint(response)

In [None]:
result = mlflow.register_model(
    model_info.model_uri, "blip_captioner"
)

In [None]:
from mlflow import MlflowClient

client = MlflowClient()

In [None]:
client.get_model_version(name="blip_captioner", version=2)

In [None]:
import mlflow.pyfunc

model_name = "blip_captioner"
model_version = 2

model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{model_version}")

model.predict({"path_image":"/content/drive/MyDrive/data/beach.jpg"})

In [None]:
f"models:/{model_name}/{model_version}"