<a href="https://colab.research.google.com/github/pgurazada/captioning-with-multimodal-models/blob/main/newyorker_cartoon_description.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q "google-cloud-aiplatform>=1.38" datasets evaluate bert_score

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [14]:
import io
import json
import vertexai
import evaluate

from vertexai.preview.generative_models import (
    GenerationConfig,
    GenerativeModel,
    Image
)

from datasets import load_dataset

from google.colab import auth

# Setup

In [4]:
# Run Colab's user authentication flow for this runtime.
# NOTE(review): the Vertex AI calls below presumably rely on the credentials
# obtained here - confirm when running outside Colab.
auth.authenticate_user()

In [5]:
# Read the raw text of the Vertex AI settings file; it is parsed as JSON in
# the next cell.
with open("config-vertexai.json", mode="r") as config_file:
    data = config_file.read()

In [6]:
# Parse the settings text into a dict; the "project" and "location" keys are
# read from it when Vertex AI is initialised.
creds = json.loads(data)

In [7]:
# Point the Vertex AI SDK at the project and location named in the settings file.
vertexai.init(project=creds["project"], location=creds["location"])

In [8]:
# Model handle used for every generate_content call in the evaluation loop.
# NOTE(review): confirm that the "gemini-pro-vision" model id is still served
# in the configured project/location before re-running.
multimodal_model = GenerativeModel("gemini-pro-vision")

# Data

In [10]:
# Both splits come from the same dataset/configuration, so name those once.
dataset_id = "jmhessel/newyorker_caption_contest"
dataset_config = "explanation"

# Validation split: pool from which the few-shot examples are drawn.
captions_examples_ds = load_dataset(dataset_id, dataset_config, split="validation")

# Test split: pool from which the gold (evaluation) examples are drawn.
captions_gold_examples_ds = load_dataset(dataset_id, dataset_config, split="test")

Downloading readme:   0%|          | 0.00/40.3k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/125M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.97M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.80M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/2340 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/130 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/131 [00:00<?, ? examples/s]

In [11]:
# Instruction text that opens the few-shot prompt: it is the first element of
# `few_shot_prompt`, followed by the example image/description pairs and then
# `task_prompt`.
description_prompt = """
Your task is to generate a description for the cartoon presented in the input.
Write a 2-3 sentence description focusing on:
- Where is the scene taking place?
- Who/Whats in the scene? What are they doing?
- What objects and actions are being depicted?
- Is anyone particularly happy/unhappy/mad/etc?
There is no need to be formal, but please do your best to write full, grammatical sentences.
Here are a few examples to guide your generation process.
"""

In [12]:
# Closing instruction of the few-shot prompt; the cartoon to describe is
# appended right after it for each test example.
task_prompt = "Now generate a description for the following cartoon:"

In [13]:
# Accumulator for the interleaved (image, description) few-shot entries.
examples_for_prompt = []

n_examples = 5 # 5-shot as in the paper

# Fixed seed so the same few-shot examples are drawn on every run; an unseeded
# shuffle makes the prompt (and therefore the reported score) irreproducible.
few_shot_seed = 42

# Slicing the shuffled dataset yields a dict of column lists, indexed below by
# 'image' and 'image_description'.
few_shot_examples = captions_examples_ds.shuffle(seed=few_shot_seed)[0: n_examples]

In [16]:
# Start from an empty list inside this cell so that re-running it rebuilds the
# prompt instead of appending a second copy of every few-shot example.
examples_for_prompt = []

for example_image, example_description in zip(
    few_shot_examples['image'],
    few_shot_examples['image_description']):

    # Serialise the example image to JPEG bytes in memory; the buffer is only
    # needed until the bytes have been copied out.
    with io.BytesIO() as buffer:
        example_image.save(buffer, format='JPEG')
        example_image_bytes = buffer.getvalue()

    example_image_input = Image.from_bytes(example_image_bytes)

    # Each example contributes its image followed by its reference description.
    examples_for_prompt.append(example_image_input)
    examples_for_prompt.append(example_description)

# Full prompt prefix: instructions, the worked examples, then the task cue.
few_shot_prompt = [description_prompt] + examples_for_prompt + [task_prompt]


# Generation

In [15]:
# Generation settings passed to every generate_content call in the
# evaluation loop.
description_generation_config = GenerationConfig(
    max_output_tokens=64,
    temperature=0.8,
    top_p=0.95,
)

# Evaluation

In [18]:
n_test_examples = 30

# Fixed seed so the evaluation subset is the same on every run; an unseeded
# shuffle scores a different random sample each time, which makes the reported
# BERTScore impossible to reproduce or compare.
test_sample_seed = 42

# Slicing the shuffled dataset yields a dict of column lists, indexed in the
# evaluation loop by 'image' and 'image_description'.
gold_examples = captions_gold_examples_ds.shuffle(seed=test_sample_seed)[0: n_test_examples]

In [19]:
# Index-aligned results: model_predictions[i] is the generated description for
# the cartoon whose reference description is ground_truths[i].
model_predictions, ground_truths = [], []

for example_index, (gold_example_image, gold_example_description) in enumerate(
    zip(gold_examples['image'], gold_examples['image_description'])
):
    # Serialise the test image to JPEG bytes in memory; the buffer is only
    # needed until the bytes have been copied out.
    with io.BytesIO() as buffer:
        gold_example_image.save(buffer, format='JPEG')
        gold_example_image_bytes = buffer.getvalue()

    gold_example_image_input = Image.from_bytes(gold_example_image_bytes)

    # The few-shot prefix ends with the task cue, so the image to describe
    # goes last.
    gold_example_prompt = few_shot_prompt + [gold_example_image_input]

    try:
        generated_description = multimodal_model.generate_content(
            gold_example_prompt,
            generation_config=description_generation_config
        )
        # Reading `.text` can itself raise (e.g. when the response carries no
        # usable text), so it must sit inside the try block; otherwise one bad
        # response aborts the whole evaluation run.
        prediction = generated_description.text.strip()
    except Exception as e:
        # Best-effort evaluation: report the failure and skip this example so
        # that predictions and references stay index-aligned.
        print(f"Skipping test example {example_index}: {e}")
        continue

    model_predictions.append(prediction)
    ground_truths.append(gold_example_description)

In [23]:
# Persist predictions and references as two parallel text files in which line
# i of one file corresponds to line i of the other.
for output_path, descriptions in (
    ('generated-descriptions.txt', model_predictions),
    ('gold-descriptions.txt', ground_truths),
):
    # Explicit UTF-8 so the files do not depend on the platform's default
    # encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        for description in descriptions:
            # A description may contain line breaks; join its non-empty lines
            # with spaces so every description occupies exactly one line and
            # the two files stay aligned.
            single_line = ' '.join(
                line.strip() for line in description.splitlines() if line.strip()
            )
            f.write(f'{single_line}\n')

In [24]:
# Load the "bertscore" metric through the `evaluate` library.
# NOTE(review): presumably backed by the `bert_score` package installed in the
# first cell - confirm.
bert_scorer = evaluate.load("bertscore")

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [25]:
# Score every generated description against its reference. The two lists are
# index-aligned because the generation loop appends to both together and skips
# failed examples in both.
# The mean-F1 cell reads the 'f1' entry of the result as a sequence of scores.
# NOTE(review): with rescale_with_baseline=True the values are presumably
# baseline-rescaled rather than raw similarities - check the bert_score docs
# before comparing against numbers reported elsewhere.
score = bert_scorer.compute(
    predictions=model_predictions,
    references=ground_truths,
    lang="en",
    rescale_with_baseline=True
)

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Mean BERTScore F1 over the evaluated examples (displayed as the cell output).
f1_scores = score['f1']
sum(f1_scores) / len(f1_scores)

0.39895525376001995