# L2: Image captioning app 🖼️📝

Load your HF API key and relevant Python libraries

In [None]:
import os
import io
import IPython.display
from PIL import Image
import base64 
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
# The captioning model in this notebook is loaded and run locally, so the key
# is treated as optional: .get() yields None when HF_API_KEY is unset instead
# of aborting the whole notebook with a KeyError.
hf_api_key = os.environ.get('HF_API_KEY')

In [None]:
# Helper functions
# from transformers import pipeline
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

# Pipeline-style device index (0 = first CUDA GPU, -1 = CPU). Kept unchanged so
# any code that still reads `device` keeps working.
device = 0 if torch.cuda.is_available() else -1

# Choose the torch backend at runtime instead of hard-coding "mps", which fails
# on machines without Apple-silicon GPU support. MPS is tried first so the
# behaviour on a Mac is the same as before.
_mps_backend = getattr(torch.backends, "mps", None)  # absent on older torch builds
if _mps_backend is not None and _mps_backend.is_available():
    torch_device = "mps"
elif torch.cuda.is_available():
    torch_device = "cuda"
else:
    torch_device = "cpu"

#Image-to-text endpoint
# NOTE: despite its name, `get_completion` is the BLIP processor (image
# preprocessing + token decoding), not a function that returns a caption.
get_completion  = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(torch_device)





## Building an image captioning app 

Here we'll load `Salesforce/blip-image-captioning-base`, a 14M parameter captioning model, locally with the `transformers` library. (The same model can also be served through an [Inference Endpoint](https://huggingface.co/inference-endpoints).)

Free images are available at https://free-images.com/

In [None]:
import requests

image_url = "https://free-images.com/sm/9596/dog_animal_greyhound_983023.jpg"
display(IPython.display.Image(url=image_url))

# Download with a timeout and fail loudly on HTTP errors, so a 404/500 page is
# reported as such instead of surfacing as a PIL decoding error.
response = requests.get(image_url, timeout=30)
response.raise_for_status()
raw_image = Image.open(io.BytesIO(response.content)).convert('RGB')

# Send the inputs to whichever device the model was loaded on rather than
# assuming a specific backend.
inputs = get_completion(raw_image, return_tensors="pt").to(model.device)
out = model.generate(**inputs)
caption = get_completion.decode(out[0], skip_special_tokens=True)
print(caption)


In [None]:
## Captioning with `gr.Interface()`

#### gr.Image()
- The `type` parameter is the format that the `fn` function expects to receive as its input.  If `type` is `numpy` or `pil`, `gr.Image()` will convert the uploaded file to this format before sending it to the `fn` function.
- If `type` is `filepath`, `gr.Image()` will temporarily store the image and provide a string path to that image location as input to the `fn` function.

In [None]:
import gradio as gr 
import requests

def raw_image_to_image(image_path):
    """Load an image from an HTTP(S) URL or a local path as an RGB PIL image.

    The image is also shown in the notebook output via ``display``.

    Args:
        image_path: A string; treated as a URL when it starts with
            ``http://`` or ``https://``, otherwise as a local file path.

    Returns:
        The loaded image converted to RGB mode.

    Raises:
        requests.HTTPError: If the URL responds with an error status.
    """
    if image_path.startswith(('http://', 'https://')):
        # Bounded wait plus an explicit status check: without them a hung
        # server blocks forever and an error page fails later inside PIL.
        response = requests.get(image_path, timeout=30)
        response.raise_for_status()
        image = Image.open(io.BytesIO(response.content)).convert('RGB')
    else:
        # The context manager releases the file handle as soon as the pixel
        # data has been copied into the converted image.
        with Image.open(image_path) as source:
            image = source.convert('RGB')
    # NOTE(review): for a local path, `url=` produces an <img src=...> that the
    # browser may not be able to resolve - consider `filename=` there; confirm.
    display(IPython.display.Image(url=image_path))
    return image



def captioner(filepath):
    """Generate a caption for the image at ``filepath``.

    Args:
        filepath: Path or URL of the image, as a string. May be ``None`` or
            empty when no image was provided.

    Returns:
        The decoded caption string, or ``""`` when no image was provided.
    """
    if not filepath:
        # Submitting the form without an image yields no path; return an empty
        # caption instead of crashing on `None.startswith`.
        return ""
    image = raw_image_to_image(filepath)
    # Use the device the model actually lives on instead of hard-coding one.
    inputs = get_completion(image, return_tensors="pt").to(model.device)
    out = model.generate(**inputs)
    return get_completion.decode(out[0], skip_special_tokens=True)

# Close any Gradio apps that are still running before building a new one.
gr.close_all()

# Components are built first so the Interface call below reads as plain wiring.
caption_input = gr.Image(label="Upload image or enter URL", type="filepath")
caption_output = gr.Textbox(label="Caption")
example_images = [
    "image-gradio/christmas_dog.jpeg",
    "image-gradio/bird_flight.jpeg",
    "image-gradio/cow.jpeg",
]

demo = gr.Interface(
    fn=captioner,
    inputs=[caption_input],
    outputs=[caption_output],
    title="Image Captioning with BLIP",
    description="Caption any image using the BLIP model",
    allow_flagging="never",
    examples=example_images,
)

# share=True asks Gradio for a publicly reachable link in addition to the
# local one - anyone with that link can use the app while it is running.
demo.launch(share=True)

In [None]:
# Shut down all running Gradio apps once you are done with the demo.
gr.close_all()