In [1]:
# @title Imports
from PIL import Image
import IPython.display as display
import requests
from transformers import AutoProcessor, TFBlipForConditionalGeneration
from io import BytesIO

from evaluate import load
import torch

ModuleNotFoundError: No module named 'evaluate'

# New section

In [None]:
# @title Example Image
# URL of the picture used for the single-image walkthrough.
image_url = "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRzn-z0Nh-DBQQpd1l4fGgI9ouleVFBrSnsUw&s"

# Build the inline image object from the URL, then render it in the notebook.
example_image = display.Image(url=image_url)
display.display(example_image)

# Loading processor and model

In [None]:
# The processor and the TF BLIP model are both loaded from the same checkpoint id.
checkpoint = "Salesforce/blip-image-captioning-large"
processor = AutoProcessor.from_pretrained(checkpoint)
model = TFBlipForConditionalGeneration.from_pretrained(checkpoint)

# Generating example output

In [None]:
# Download the example image. A timeout keeps a stalled host from hanging the
# cell, and raise_for_status() surfaces HTTP errors directly instead of letting
# PIL fail later on a non-image payload.
response = requests.get(image_url, timeout=30)
response.raise_for_status()

# Decode from the fully downloaded body (same approach as generate_captions)
# rather than from the raw, undecoded socket stream.
image = Image.open(BytesIO(response.content))

# Unconditional captioning: only the image is given to the processor.
# For prompt-conditioned captioning, additionally pass e.g. text="A picture of".
inputs = processor(images=image, return_tensors="tf")

# Generated token ids; shown as the cell output.
outputs = model.generate(**inputs)
outputs

# Example Caption

In [None]:
# Decode the first generated sequence into text, leaving out special tokens.
first_sequence = outputs[0]
caption = processor.decode(first_sequence, skip_special_tokens=True)
caption

# Caption generation function

In [None]:
def generate_captions(image_urls, timeout=30):
    """Download each image and return one generated caption per URL.

    Uses the module-level ``processor`` and ``model`` defined earlier in the
    notebook.

    Args:
        image_urls: Iterable of image URLs. An empty iterable yields ``[]``.
        timeout: Seconds to wait for each HTTP request before giving up.
            ``requests`` has no default timeout, so without this a stalled
            host would block the cell indefinitely.

    Returns:
        A list of caption strings, in the same order as ``image_urls``.

    Raises:
        requests.HTTPError: If a URL responds with an HTTP error status. This
            is raised before decoding so the failure points at the download
            rather than at PIL being handed an error page.
    """
    captions = []
    for url in image_urls:
        # No stream=True: the whole body is read via .content anyway.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))

        # Image-only input -> unconditional captioning (no text prompt).
        inputs = processor(images=image, return_tensors="tf")
        outputs = model.generate(**inputs)
        captions.append(processor.decode(outputs[0], skip_special_tokens=True))
    return captions

# Example batch creation of captions

In [None]:
# Sample image URLs for the batch example; the next cell passes this list to
# generate_captions.
image_urls = [
    "https://cdn.cdnparenting.com/articles/2018/06/418806355-H-1024x700.jpg",
    "https://littletikescommercial.com/wp-content/uploads/2020/12/Adventureland-Park-IA_199-scaled.jpg",
    "https://www.kidstuffplaysystems.com/wp-content/uploads/2019/06/81110-Ring-trek.jpg",
    "https://images.unsplash.com/photo-1621354598022-16599af1b8b2?q=80&w=870&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
]

In [None]:
# Caption every sample URL and show the resulting list as the cell output.
batch_captions = generate_captions(image_urls)
batch_captions

# Model evaluation

In [None]:
# Load the WER (word error rate) metric.
# `load` is imported from the `evaluate` package in the imports cell; the
# recorded output of that cell shows ModuleNotFoundError for `evaluate`, so the
# package must be installed before this cell can run.
wer = load("wer")

# Define the evaluation function
def compute_metrics(eval_pred):
    """Compute the word error rate between argmax-decoded logits and labels.

    Relies on the module-level ``processor`` (for decoding) and ``wer`` metric.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` must expose
            ``.numpy()``; the arg-max over its last axis is taken as the
            predicted token ids. ``labels`` holds the reference token ids.

    Returns:
        A dict with the single key ``"wer_score"``.
    """
    raw_logits, reference_ids = eval_pred
    # Highest-scoring vocabulary entry at every position.
    predicted_ids = raw_logits.numpy().argmax(-1)

    def _to_text(token_ids):
        # Decode a batch of id sequences, dropping special tokens.
        return processor.batch_decode(token_ids, skip_special_tokens=True)

    reference_texts = _to_text(reference_ids)
    hypothesis_texts = _to_text(predicted_ids)
    return {
        "wer_score": wer.compute(
            predictions=hypothesis_texts, references=reference_texts
        )
    }

# Example data: image URLs and their corresponding true captions.
# NOTE: the example.com URLs are placeholders. Replace them (and the captions)
# with real image/caption pairs before running this cell.
eval_image_urls = [
    "https://example.com/image1.jpg",
    "https://example.com/image2.jpg",
]
true_captions = ["A picture of a cat", "A picture of a dog"]


def _fetch_image(url, timeout=30):
    """Download `url` and open it as a PIL image, raising on HTTP errors."""
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here rather than letting PIL choke on an error page.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))


images = [_fetch_image(url) for url in eval_image_urls]

# Prepare the inputs and labels.
# Images and reference captions are encoded together so the batch carries the
# text input_ids (and attention mask) alongside pixel_values: the forward pass
# needs text input to produce decoder logits, image-only input is not enough.
inputs = processor(images=images, text=true_captions, return_tensors="tf", padding=True)
labels = inputs["input_ids"]

# Forward pass: logits are predictions conditioned on the reference captions,
# not free-running generation (use model.generate for that).
outputs = model(**inputs)
logits = outputs.logits

# Evaluate the model
eval_pred = (logits, labels)
metrics = compute_metrics(eval_pred)
print(metrics)