# L2: Image captioning app 🖼️📝

Load your HF API key and relevant Python libraries

In [2]:
import os
import io
import IPython.display
from PIL import Image
import base64 
from dotenv import load_dotenv, find_dotenv
# Pull variables from the nearest .env file into the process environment
_ = load_dotenv(find_dotenv())
hf_api_key = os.environ["HF_API_KEY"]

# Project root (taken from the environment) and the data folder beneath it
BASE_PATH = os.environ["BASE_PATH"]
DATA_PATH = os.path.join(BASE_PATH, "data")

In [3]:
# Helper functions
import requests, json

#Image-to-text endpoint
def get_completion(image_url, timeout=60):
    """
    Get an image caption from HuggingFace's hosted BLIP captioning model.

    Args:
        image_url: The image to caption, sent verbatim as the request's
            "inputs" field. Callers in this notebook pass either a public
            image URL or a base64-encoded image string.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, `requests` waits indefinitely, so a stalled
            connection would hang the notebook cell or the Gradio callback.

    Returns:
        The decoded JSON body of the response. For a successful caption the
        notebook's sample output looks like [{'generated_text': '...'}].
        The HTTP status is deliberately not checked here; callers inspect
        the returned JSON themselves.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            `timeout` seconds.
    """
    headers = {
        "Authorization": f"Bearer {hf_api_key}",
        "Content-Type": "application/json"
    }

    API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"

    # The payload is passed through as-is, whether it is a URL or base64 data
    data = { "inputs": image_url }

    response = requests.post(API_URL, headers=headers, json=data, timeout=timeout)
    return response.json()

## Building an image captioning app 

Here we'll be using an [Inference Endpoint](https://huggingface.co/inference-endpoints) for `Salesforce/blip-image-captioning-base`, a 14M-parameter captioning model.

Free images are available at: https://free-images.com/

In [4]:
# Preview the sample image inline, then ask the hosted model to caption it
image_url = "https://free-images.com/sm/9596/dog_animal_greyhound_983023.jpg"
IPython.display.display(IPython.display.Image(url=image_url))
result = get_completion(image_url)
print(result)

[{'generated_text': 'a dog wearing a santa hat and a red scarf'}]


## Captioning with `gr.Interface()`

#### gr.Image()
- The `type` parameter is the format that the `fn` function expects to receive as its input.  If `type` is `numpy` or `pil`, `gr.Image()` will convert the uploaded file to this format before sending it to the `fn` function.
- If `type` is `filepath`, `gr.Image()` will temporarily store the image and provide a string path to that image location as input to the `fn` function.

In [14]:
# Resolve each bundled example image inside the data directory
dog_path, bird_path, cow_path = (
    os.path.join(DATA_PATH, filename)
    for filename in ("christmas_dog.jpeg", "bird_flight.jpeg", "cow.jpeg")
)

In [None]:
import gradio as gr 

def image_to_base64_str(pil_image):
    """
    Encode an image as a base64 string of its PNG representation.

    The image is written to an in-memory buffer via its `save()` method
    (format 'PNG'), and the resulting bytes are base64-encoded and returned
    as text.
    """
    with io.BytesIO() as buffer:
        pil_image.save(buffer, format='PNG')
        png_bytes = buffer.getvalue()
    encoded = base64.b64encode(png_bytes)
    return encoded.decode('utf-8')

def captioner(image):
    """
    Gradio callback: return a text caption for the given image.

    Args:
        image: An image object exposing a PIL-style `save()` method (the
            Gradio input below is configured with type="pil"). May be None
            if the callback fires with an empty image input.

    Returns:
        The caption string from the API response, or the fallback text
        "Could not generate caption" when no image was given or the
        response does not contain a 'generated_text' entry.
    """
    fallback = "Could not generate caption"

    # Nothing to encode; avoid an AttributeError on None.save(...)
    if image is None:
        return fallback

    base64_image = image_to_base64_str(image)
    result = get_completion(base64_image)

    # The API may answer with a list of predictions or a single dict.
    # An empty list or an item without the key falls through to the fallback
    # instead of raising IndexError/KeyError.
    candidate = result[0] if isinstance(result, list) and result else result
    if isinstance(candidate, dict) and 'generated_text' in candidate:
        return candidate['generated_text']

    print("Unexpected API response format:", result)  # For debugging
    return fallback

# Close any demo left over from an earlier run of this cell before relaunching
gr.close_all()

# type="pil" means captioner receives the upload already converted to a PIL image
demo = gr.Interface(
    fn=captioner,
    inputs=[gr.Image(label="Upload image", type="pil")],
    outputs=[gr.Textbox(label="Caption")],
    title="Image Captioning with BLIP",
    description="Caption any image using the BLIP model",
    flagging_mode="never",
    examples=[dog_path, bird_path, cow_path],
)

# The port comes from the PORT1 environment variable; DATA_PATH holds the example images
demo.launch(
    share=True,
    allowed_paths=[DATA_PATH],
    server_port=int(os.environ['PORT1']),
)

In [None]:
# Stop every Gradio app still running in this kernel once you are done with the demo
gr.close_all()