In [2]:
import os
import shutil

# Ensure the .kaggle directory exists
os.makedirs("/root/.kaggle", exist_ok=True)

#  Move kaggle.json to the correct location

shutil.move("kaggle.json", "/root/.kaggle/kaggle.json")

# Set permissions to avoid access issues
os.chmod("/root/.kaggle/kaggle.json", 600)

print(" Kaggle API is now set up correctly!")

 Kaggle API is now set up correctly!


In [3]:
!kaggle datasets download -d adityajn105/flickr8k -p /content --unzip


Dataset URL: https://www.kaggle.com/datasets/adityajn105/flickr8k
License(s): CC0-1.0


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load captions dataset
df = pd.read_csv('/content/captions.txt', delimiter=',')
df.columns = ['image', 'caption']

# Ensure paths are correct
df['image'] = "/content/Images/" + df['image']  # Update image path

# Split 80% train, 20% test
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print(f"Training Set: {len(train_df)} images")
print(f"Testing Set: {len(test_df)} images")


Training Set: 32364 images
Testing Set: 8091 images


In [6]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load ResNet50 for feature extraction
resnet = models.resnet50(pretrained=True)
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])  # Remove final classification layer
resnet.eval()

# Load GPT-2 tokenizer & model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

model.eval()  # Set to evaluation mode
model.config.pad_token_id = model.config.eos_token_id  # Avoid warnings


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
def extract_features(image_path):
    """Extract image features using ResNet50"""
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

    image = Image.open(image_path).convert("RGB")
    image = transform(image).unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        features = resnet(image)

    return features.squeeze().numpy()


#(b) Generate Scene Graph from Caption

In [8]:
import spacy
import networkx as nx

nlp = spacy.load("en_core_web_sm")

def create_scene_graph(caption):
    """Generate scene graph from a given caption"""
    doc = nlp(caption)
    G = nx.DiGraph()

    for token in doc:
        if token.dep_ in ("nsubj", "dobj", "pobj"):  # Only key dependencies
            G.add_edge(token.head.text, token.text)

    return G


In [9]:
def train_model(train_df, num_epochs=5):
    """Train captioning model using 80% dataset"""
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    loss_fn = torch.nn.CrossEntropyLoss()

    model.train()  # Set to training mode

    for epoch in range(num_epochs):
        total_loss = 0

        for _, row in train_df.iterrows():
            image_path = row['image']
            caption = row['caption']

            # Extract image features
            image_features = extract_features(image_path)

            # Generate scene graph
            scene_graph = create_scene_graph(caption)

            # Convert scene graph to text (limit to 5 key relationships)
            scene_text = " ".join([f"{src} {dst}" for src, dst in list(scene_graph.edges)[:5]])

            # Reduce image features to first 10 values for representation
            feature_summary = np.round(image_features[:10], 2).tolist()

            # Prepare input text
            input_text = f"Image features: {feature_summary} Scene: {scene_text} Caption: {caption}"
            input_ids = tokenizer.encode(input_text, return_tensors="pt")

            # Forward pass
            outputs = model(input_ids, labels=input_ids)
            loss = outputs.loss

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_df):.4f}")

    # Save trained model
    model.save_pretrained("captioning_model")
    tokenizer.save_pretrained("captioning_model")
    print("✅ Training Completed! Model Saved.")


In [10]:
import pandas as pd

# Select only 1% of the test data (which is already 20% of the total dataset)
def sample_1_percent(test_df):
    sample_size = max(1, int(len(test_df) * 0.01))  # Ensure at least one image is tested
    sampled_test_df = test_df.sample(n=sample_size, random_state=42)  # Randomly sample
    print(f"🔹 Testing on {sample_size} images out of {len(test_df)} total test images.\n")
    return sampled_test_df

# Sample 1% from test data
test_sample_df = sample_1_percent(test_df)


🔹 Testing on 80 images out of 8091 total test images.



In [11]:
def generate_caption(image_features, scene_graph, max_new_tokens=30):
    """Generate caption using trained GPT-2 model with proper attention mask and pad token handling."""
    model.eval()  # Set to evaluation mode

    # Convert scene graph to text
    scene_text = " ".join([f"{src} {dst}" for src, dst in list(scene_graph.edges)[:5]])

    # Reduce image features to first 10 values
    feature_summary = np.round(image_features[:10], 2).tolist()

    # Prepare input text
    input_text = f"Image features: {feature_summary} Scene: {scene_text} Caption:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Set pad_token_id and attention_mask
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)  # Ensure attention mask is applied
    model.config.pad_token_id = tokenizer.eos_token_id  # Set pad token ID

    # Generate caption with attention mask
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,  # Fix warning by setting attention mask
        max_new_tokens=max_new_tokens,
        do_sample=True
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)


In [12]:
def test_model(test_sample_df):
    """Test caption generation on 1% of 20% test dataset."""
    for _, row in test_sample_df.iterrows():
        image_path = row['image']
        reference_caption = row['caption']

        # Extract image features
        image_features = extract_features(image_path)

        # Generate scene graph
        scene_graph = create_scene_graph(reference_caption)

        # Generate caption
        generated_caption = generate_caption(image_features, scene_graph)

        print(f"\n📌 Image: {image_path}")
        print(f"✅ Reference Caption: {reference_caption}")
        print(f"🚀 Generated Caption: {generated_caption}")

# Run the test on only 1% of the test data
test_model(test_sample_df)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/2174206711_11cb712a8d.jpg
✅ Reference Caption: Two asian girls cheerleading for a sporting event
🚀 Generated Caption: Image features: [0.25999999046325684, 0.44999998807907104, 2.390000104904175, 0.12999999523162842, 1.440000057220459, 0.47999998927116394, 0.07999999821186066, 0.0, 0.3499999940395355, 1.0700000524520874] Scene: cheerleading girls for event Caption: Scene: cheerleading girls for event

RAW Paste Data

Scene: cheerleading girls for event A scene of a cheering cheerleader, cheer

📌 Image: /content/Images/3477672764_7f07657a26.jpg
✅ Reference Caption: A soccer player is about to kick the ball near the goal .
🚀 Generated Caption: Image features: [1.1200000047683716, 0.5, 1.6200000047683716, 0.03999999910593033, 0.5199999809265137, 0.12999999523162842, 0.6000000238418579, 0.3499999940395355, 0.10000000149011612, 1.2799999713897705] Scene: is player kick ball near goal Caption: Frame size: 1.1b Scene: is player kick ball near goal Caption: Frame size

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3667157255_4e66d11dc2.jpg
✅ Reference Caption: An Olympic winner takes home two medals .
🚀 Generated Caption: Image features: [0.47999998927116394, 0.9800000190734863, 1.1799999475479126, 0.0, 1.149999976158142, 0.27000001072883606, 1.3799999952316284, 0.019999999552965164, 0.20999999344348907, 1.75] Scene: takes winner takes medals Caption: [Screenshot] http://i.imgur.com/QKQgJh5.png [/script/screenshots] Source: [URL


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/1130401779_8c30182e3e.jpg
✅ Reference Caption: Two brown dogs runs through the water .
🚀 Generated Caption: Image features: [0.12999999523162842, 0.2800000011920929, 0.30000001192092896, 0.1599999964237213, 0.05999999865889549, 0.4300000071525574, 0.27000001072883606, 0.6399999856948853, 0.4000000059604645, 0.75] Scene: runs dogs through water Caption: [ 0.128332216142924, 0.3300000935341537, 0.3999999834343543


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3306464579_1b16a0caf2.jpg
✅ Reference Caption: The person with the tattoos is holding a dirty frying pan .
🚀 Generated Caption: Image features: [0.15000000596046448, 0.27000001072883606, 1.659999966621399, 0.1599999964237213, 0.5, 0.949999988079071, 0.3799999952316284, 0.07000000029802322, 0.47999998927116394, 0.6499999761581421] Scene: holding person holding pan with tattoos Caption: <h2>I'm still waiting for you to find your way… (LF2)</h2> #ifihadglass #projectglass


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3015898903_70bebb8903.jpg
✅ Reference Caption: Three people wear a lot of makeup and stand together .
🚀 Generated Caption: Image features: [0.18000000715255737, 0.550000011920929, 0.41999998688697815, 0.38999998569488525, 0.30000001192092896, 0.3700000047683716, 1.059999942779541, 0.25999999046325684, 0.07999999821186066, 0.5600000023841858] Scene: wear people wear lot of makeup Caption: Download a sample size: 44.4 bytes

For a full length video of the incident, please see:

http://www.youtube


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/2097398349_ff178b3f1b.jpg
✅ Reference Caption: A person stands in the snow at the top of a mountian , arms raised .
🚀 Generated Caption: Image features: [0.07999999821186066, 0.4000000059604645, 0.2800000011920929, 0.5699999928474426, 0.10000000149011612, 0.5199999809265137, 0.18000000715255737, 0.019999999552965164, 0.28999999165534973, 0.8199999928474426] Scene: stands person in snow at top of mountian raised arms Caption: [1, 4, 5, 6, 7, 8, 9] Scene: person walking to moon during sunset Scene: person moving along snow on

📌 Image: /content/Images/2367816288_7c2d11d3c5.jpg
✅ Reference Caption: Young boy in a brown shirt doing a back flip
🚀 Generated Caption: Image features: [0.6499999761581421, 0.9200000166893005, 1.440000057220459, 0.11999999731779099, 0.9300000071525574, 0.23999999463558197, 0.07000000029802322, 0.3499999940395355, 0.11999999731779099, 1.4900000095367432] Scene: in shirt doing flip Caption: [0.6399799977291718, 1.817980125457885, 1.8500000

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/340425915_490293058f.jpg
✅ Reference Caption: A dog wearing a jacket rolls in the snow .
🚀 Generated Caption: Image features: [0.11999999731779099, 0.4399999976158142, 0.11999999731779099, 0.4000000059604645, 0.7099999785423279, 0.10000000149011612, 0.33000001311302185, 0.09000000357627869, 0.3799999952316284, 0.2199999988079071] Scene: wearing rolls in snow Caption: Photo: HANDLING HIGHS/AFP/Getty Images

In the new video, taken around 10:20am local time,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/2045109977_b00ec93491.jpg
✅ Reference Caption: The man is wearing a black shirt and holding up a blue item in a window .
🚀 Generated Caption: Image features: [0.17000000178813934, 0.6899999976158142, 1.8799999952316284, 0.23000000417232513, 1.1200000047683716, 0.009999999776482582, 0.8799999952316284, 0.7699999809265137, 0.2800000011920929, 1.4900000095367432] Scene: wearing man wearing shirt holding item in window Caption: Scene of wearing man walking to the store's door Scene of wearing man walking to the store's door, walking with his head towards the shopping cart Scene


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3155361712_2cbf59c78e.jpg
✅ Reference Caption: A man jumps while on a snowboard .
🚀 Generated Caption: Image features: [0.1899999976158142, 0.75, 0.0, 0.6200000047683716, 0.029999999329447746, 0.1899999976158142, 0.3799999952316284, 0.10000000149011612, 0.10999999940395355, 0.6800000071525574] Scene: jumps man on snowboard Caption: [0.1899999976158142, 0.75, 0.0, 0.6200000047683716, 0.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/2424398046_1a55c71376.jpg
✅ Reference Caption: A little boy in a black shirt is holding a red guitar .
🚀 Generated Caption: Image features: [0.5, 0.3799999952316284, 0.3799999952316284, 0.44999998807907104, 0.7200000286102295, 0.3199999928474426, 0.3799999952316284, 0.8199999928474426, 0.8399999737739563, 1.8700000047683716] Scene: holding boy holding guitar in shirt Caption: [0.5, 0.3799999952316284, 0.3799999952316284, 0.449999988079


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3690107455_0fdb4ecee7.jpg
✅ Reference Caption: a woman walks across a large rock .
🚀 Generated Caption: Image features: [0.38999998569488525, 0.33000001311302185, 1.5700000524520874, 0.800000011920929, 0.09000000357627869, 0.1599999964237213, 0.20000000298023224, 0.07999999821186066, 0.009999999776482582, 0.3799999952316284] Scene: walks woman across rock Caption: 0:00:00 -0300 The scene starts in the woods. There are two men in the foreground carrying a baby and three female children.

📌 Image: /content/Images/2946016853_ceca4f5a07.jpg
✅ Reference Caption: A large white and grey dog runs through a yellow tunnel in an obstacle course .
🚀 Generated Caption: Image features: [0.7699999809265137, 0.2199999988079071, 1.440000057220459, 0.6399999856948853, 0.6000000238418579, 0.36000001430511475, 0.019999999552965164, 0.07000000029802322, 1.3300000429153442, 0.5299999713897705] Scene: runs dog through tunnel in course Caption: (1) Scene: runs dog through tunnel at s

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/2666078276_f7b3056997.jpg
✅ Reference Caption: There is a brown dog leaping on the beach .
🚀 Generated Caption: Image features: [0.550000011920929, 0.36000001430511475, 0.7300000190734863, 0.5299999713897705, 0.25999999046325684, 0.5099999904632568, 0.27000001072883606, 0.0, 0.10999999940395355, 0.23000000417232513] Scene: on beach Caption: Image from http://img.photobucket.com/albums/v3/Squadron/Nebula/Gravity-g


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/2236016316_f476cbbf06.jpg
✅ Reference Caption: The young basketball player moves into the front court .
🚀 Generated Caption: Image features: [0.15000000596046448, 0.17000000178813934, 0.3700000047683716, 0.07000000029802322, 1.7400000095367432, 0.2800000011920929, 0.9100000262260437, 0.2199999988079071, 0.029999999329447746, 0.30000001192092896] Scene: moves player into court Caption: Scene Name Name Size Name Size Size Size Size Size Size Size Size Size Size Size Size Size Size Size Size Size Size Size Size Size Size Size Size Size


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/1433142189_cda8652603.jpg
✅ Reference Caption: A man is backpacking up a grassy hill .
🚀 Generated Caption: Image features: [0.10999999940395355, 0.2800000011920929, 0.029999999329447746, 0.33000001311302185, 0.11999999731779099, 0.4099999964237213, 0.47999998927116394, 0.009999999776482582, 0.029999999329447746, 0.029999999329447746] Scene: backpacking man backpacking hill Caption: <br /> <a href="http://i.imgur.com/U9kDuBZr.jpg" name="John.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/275002371_5b200e6a92.jpg
✅ Reference Caption: A dog rolls in the grass .
🚀 Generated Caption: Image features: [0.25, 0.09000000357627869, 0.8999999761581421, 0.4000000059604645, 0.2199999988079071, 1.0700000524520874, 0.2199999988079071, 0.2800000011920929, 0.25, 0.8199999928474426] Scene: in grass Caption: [0.65, 0.01000000355224858,0.8500000101752955,0.70000000845


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/2198484810_50a893824a.jpg
✅ Reference Caption: A dog runs through an obstacle course .
🚀 Generated Caption: Image features: [0.3199999928474426, 0.4000000059604645, 0.6700000166893005, 0.5299999713897705, 0.8399999737739563, 0.11999999731779099, 1.3899999856948853, 0.6200000047683716, 0.47999998927116394, 1.0] Scene: runs dog through course Caption: http://cps.souper.net/go/kitten/0 http://cps.souper.net/go/

📌 Image: /content/Images/2540326842_bb26cec999.jpg
✅ Reference Caption: A group of kids look out of a fence .
🚀 Generated Caption: Image features: [0.18000000715255737, 0.6700000166893005, 0.6499999761581421, 0.029999999329447746, 0.3700000047683716, 0.5400000214576721, 0.8399999737739563, 0.5600000023841858, 0.25, 2.509999990463257] Scene: look group of kids of fence Caption:

Scene: Crayon Dummy

Scene: Crayon Dummy [2, 2, 2] Players: [2,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/2730819220_b58af1119a.jpg
✅ Reference Caption: A young girl reaches the bottom of a slide .
🚀 Generated Caption: Image features: [0.20999999344348907, 0.4399999976158142, 1.600000023841858, 0.5600000023841858, 0.41999998688697815, 0.9300000071525574, 0.4699999988079071, 0.23999999463558197, 0.7400000095367432, 1.9600000381469727] Scene: reaches girl reaches bottom of slide Caption: [1.0.000000271438349560, 0.0000001, 0.0000001, 0.0000001, 0.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/97731718_eb7ba71fd3.jpg
✅ Reference Caption: People enjoy a horse draw open carriage in the rain .
🚀 Generated Caption: Image features: [0.15000000596046448, 0.6000000238418579, 0.20999999344348907, 0.18000000715255737, 0.23000000417232513, 0.4000000059604645, 0.03999999910593033, 0.3199999928474426, 0.11999999731779099, 0.11999999731779099] Scene: enjoy People enjoy horse draw carriage in rain Caption: People enjoy horse draw carriage in rain Scene: enjoying People enjoy horse draw carriage in rain Caption: enjoy People enjoy horse draw carriage in rain Scene: getting


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/263850317_5bb3a18a08.jpg
✅ Reference Caption: Girl wearing a mini dress and long sleve white jacket with a pink bag walking through a crowded area .
🚀 Generated Caption: Image features: [0.10000000149011612, 0.9399999976158142, 1.2300000190734863, 0.07000000029802322, 0.3100000023841858, 0.15000000596046448, 0.25, 0.05999999865889549, 0.05999999865889549, 0.550000011920929] Scene: wearing jacket with bag through area Caption: 1 - (84470.0000000117281417, 1.01337536002514, 0.5500000107


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3107463441_7c68606450.jpg
✅ Reference Caption: Boy and girl building metal structure .
🚀 Generated Caption: Image features: [1.0, 0.7900000214576721, 1.3799999952316284, 0.2800000011920929, 1.1100000143051147, 1.2599999904632568, 0.019999999552965164, 1.2100000381469727, 0.7900000214576721, 0.6499999761581421] Scene:  Caption: D.Va. Reworks [0.29, 0.60000000867361868, 1.1, 1.7,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3258874419_23fec1bdc1.jpg
✅ Reference Caption: A dog , wearing a number 8 , running in a race .
🚀 Generated Caption: Image features: [1.090000033378601, 0.8399999737739563, 0.5899999737739563, 0.30000001192092896, 0.28999999165534973, 0.20999999344348907, 0.20999999344348907, 0.30000001192092896, 1.2200000286102295, 0.7099999785423279] Scene: wearing number in race Caption: No race Caption: No race Loading... Scene: wearing number in race Show more by: All [4] 0.8299979606578

📌 Image: /content/Images/3244171699_ace4b5d999.jpg
✅ Reference Caption: Two women and a man are sitting at a round table .
🚀 Generated Caption: Image features: [0.1599999964237213, 0.5899999737739563, 2.4100000858306885, 0.05999999865889549, 2.6700000762939453, 0.5699999928474426, 1.2799999713897705, 0.05999999865889549, 0.7300000190734863, 0.8799999952316284] Scene: sitting women at table Caption: The scene at the beginning of the scene at the beginning of the scene at the end of the sc

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3534046564_4f8546e364.jpg
✅ Reference Caption: two black and white dogs running towards each other in the grass
🚀 Generated Caption: Image features: [1.4900000095367432, 0.6000000238418579, 1.059999942779541, 0.2800000011920929, 0.6800000071525574, 0.6299999952316284, 0.11999999731779099, 0.05000000074505806, 0.8700000047683716, 0.4699999988079071] Scene: towards other in grass Caption: Scene: towards other in fire Scene: about to escape from fire Caption: Scene: about to escape from fire Caption: about to escape from fire Caption


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3319405494_58dee86b21.jpg
✅ Reference Caption: Two people are going over a skii jump while one looks into the camera .
🚀 Generated Caption: Image features: [0.8799999952316284, 0.8999999761581421, 1.1299999952316284, 0.4000000059604645, 0.3400000035762787, 0.5400000214576721, 0.8399999737739563, 0.07000000029802322, 0.09000000357627869, 1.149999976158142] Scene: going people over jump looks one into camera Caption: The world: [0.9899999754038, 1.0, 2.25000000, 0.000000, 0.08990000


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/2444741900_5cb3ef3e1d.jpg
✅ Reference Caption: A bunch of people are standing or sitting in a snow valley .
🚀 Generated Caption: Image features: [0.5600000023841858, 0.6899999976158142, 0.3799999952316284, 0.17000000178813934, 0.03999999910593033, 0.09000000357627869, 0.33000001311302185, 0.14000000059604645, 0.41999998688697815, 0.5199999809265137] Scene: standing bunch of people in valley Caption: Screen capture from video capture with Canon 50D Mark IV Canon 50D Mark IV Camera: The Canon 50D Mark IV is a unique camera. Like


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3616525288_9c19223de6.jpg
✅ Reference Caption: a horses with five jockeys and five horses
🚀 Generated Caption: Image features: [0.14000000059604645, 0.9800000190734863, 0.1899999976158142, 0.2199999988079071, 0.8500000238418579, 0.10000000149011612, 0.6800000071525574, 0.15000000596046448, 0.25999999046325684, 0.15000000596046448] Scene: with jockeys Caption: <player name="Zac Sutter" onClick={type='player' onMouseover={type=player/window} > /><button

📌 Image: /content/Images/3351357065_a6a9b3d485.jpg
✅ Reference Caption: A man climbs a snowy mountain .
🚀 Generated Caption: Image features: [0.8500000238418579, 0.7300000190734863, 0.7900000214576721, 0.6299999952316284, 0.3799999952316284, 1.4700000286102295, 0.15000000596046448, 0.03999999910593033, 0.12999999523162842, 0.6600000262260437] Scene: climbs man climbs mountain Caption: scene 1 of 9

KARACHI: In a sense, we're a new country, but there's already quite a bit of work


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/2515247156_c1b759fc33.jpg
✅ Reference Caption: A boy goes down a blue slide with his eyes closed .
🚀 Generated Caption: Image features: [0.6600000262260437, 0.4300000071525574, 2.009999990463257, 0.4699999988079071, 0.23000000417232513, 0.9300000071525574, 0.029999999329447746, 0.029999999329447746, 0.3400000035762787, 1.2999999523162842] Scene: goes boy down slide with eyes Caption:

1

2

3

4

5

6

7

8

9

10


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3273091032_98f724b36b.jpg
✅ Reference Caption: A chipmunk stands on the edge of a field and road
🚀 Generated Caption: Image features: [0.20000000298023224, 0.4000000059604645, 0.019999999552965164, 0.25, 0.1899999976158142, 1.5399999618530273, 0.6200000047683716, 0.05000000074505806, 0.019999999552965164, 0.3100000023841858] Scene: stands chipmunk on edge of field Caption: [[Aerial view showing field and field of view]] Model: SES838-G0-S1-AR2-G01-


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3630332976_fdba22c50b.jpg
✅ Reference Caption: One child lifts another on his back , inside a room .
🚀 Generated Caption: Image features: [0.14000000059604645, 0.5899999737739563, 0.44999998807907104, 0.029999999329447746, 0.36000001430511475, 0.38999998569488525, 0.3700000047683716, 0.3700000047683716, 0.07999999821186066, 1.2599999904632568] Scene: lifts child lifts another on back inside room Caption: Scene: lifts child lifts another on back inside room


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/1415591512_a84644750c.jpg
✅ Reference Caption: A girl in boots is active in the grass .
🚀 Generated Caption: Image features: [0.699999988079071, 0.9700000286102295, 0.5, 0.8799999952316284, 0.75, 0.4300000071525574, 0.25, 0.25, 0.07999999821186066, 0.9900000095367432] Scene: is girl in boots in grass Caption: [0.699999988079071, 0.9700000286102295, 0.5, 0.87999999523


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/1151466868_3bc4d9580b.jpg
✅ Reference Caption: The little girl splashes through the water .
🚀 Generated Caption: Image features: [0.18000000715255737, 0.10999999940395355, 0.9100000262260437, 0.27000001072883606, 0.11999999731779099, 0.15000000596046448, 0.5400000214576721, 0.47999998927116394, 0.10000000149011612, 0.38999998569488525] Scene: splashes girl through water Caption: Splash Splash Splash Splash Splash Splash Splash Splash Splash Splash Splash Splash Splash Splash Splash Splash Splash Splash Splash Scene: 2.0012373937, 0


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3501313414_ae865b6fdf.jpg
✅ Reference Caption: A small tan dog jumps over the barbed wire fence .
🚀 Generated Caption: Image features: [1.3799999952316284, 0.949999988079071, 0.23999999463558197, 0.8500000238418579, 0.5899999737739563, 1.149999976158142, 0.36000001430511475, 0.5099999904632568, 1.1799999475479126, 0.550000011920929] Scene: jumps dog over fence Caption: image: [1.379999985263623, 0.959999996944590118, 0.249999997956

📌 Image: /content/Images/782401952_5bc5d3413a.jpg
✅ Reference Caption: Three teenagers drink Slurpees outside a convienience store .
🚀 Generated Caption: Image features: [0.28999999165534973, 0.6000000238418579, 1.190000057220459, 0.5899999737739563, 0.800000011920929, 0.7900000214576721, 0.3199999928474426, 0.3100000023841858, 0.18000000715255737, 1.399999976158142] Scene: drink teenagers drink Slurpees outside store Caption: Hide Caption: http://puu.sh/f8l9Hy/c9d8a4d.png Shirts: Black


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/307301755_48919ef1b2.jpg
✅ Reference Caption: A man in a red shirt sitting on a plastic stool in a shaded sandy area .
🚀 Generated Caption: Image features: [0.9300000071525574, 1.1799999475479126, 1.0399999618530273, 0.05000000074505806, 0.10999999940395355, 0.23999999463558197, 0.8799999952316284, 0.5400000214576721, 0.12999999523162842, 1.659999966621399] Scene: in shirt in area on stool Caption: Background: [0000001313270022, 1.00000012646036554319, 1.0000000132944709547


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/2656749876_e32495bd8c.jpg
✅ Reference Caption: A priest carrying a small blue bag walking down the street talking on a cellphone .
🚀 Generated Caption: Image features: [0.03999999910593033, 0.5099999904632568, 0.6700000166893005, 0.05999999865889549, 0.47999998927116394, 0.17000000178813934, 0.9599999785423279, 0.25999999046325684, 0.09000000357627869, 0.6100000143051147] Scene: talking priest carrying bag down street on cellphone Caption: Greetings from the player player Player encounters the same encounter when riding a vehicle. You need not get rid of the encounter. The player first encounters the


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3134586018_ae03ba20a0.jpg
✅ Reference Caption: A man has is arm around the woman who is holding a metallic object up to her face .
🚀 Generated Caption: Image features: [0.07000000029802322, 0.5799999833106995, 0.8700000047683716, 0.10000000149011612, 0.800000011920929, 0.3799999952316284, 0.8500000238418579, 0.09000000357627869, 0.15000000596046448, 0.6399999856948853] Scene: has man around woman holding who holding object to face Caption: GEM.scene.create (0x0000000000A7FAA) SceneSceneBase: has woman holding a woman and two holding objects Source


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3514194772_43ba471982.jpg
✅ Reference Caption: A brown dog with white paws is trotting through a field of green grass .
🚀 Generated Caption: Image features: [0.17000000178813934, 1.0199999809265137, 0.4099999964237213, 0.11999999731779099, 0.3700000047683716, 0.20000000298023224, 0.38999998569488525, 1.0800000429153442, 0.07000000029802322, 0.2199999988079071] Scene: trotting dog with paws through field of grass Caption: [2827.000000000000029501] Player: zhoom, type 3, type 4; id: 8, playerId: 17,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3646453252_5ebbbaa6cc.jpg
✅ Reference Caption: A child wearing a helmet riding a mountain bike very fast through a forest .
🚀 Generated Caption: Image features: [0.41999998688697815, 1.4600000381469727, 0.4300000071525574, 0.3799999952316284, 0.27000001072883606, 1.100000023841858, 0.17000000178813934, 0.009999999776482582, 0.25, 1.659999966621399] Scene: wearing riding riding bike through forest Caption: [0.45200125513571533, 0.1589223823584929, 0.480000016738

📌 Image: /content/Images/3514179514_cbc3371b92.jpg
✅ Reference Caption: An umpire in a baseball game crouches to catch a ball while an opposing team member runs to homebase .


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3424424006_98f9d1921c.jpg
✅ Reference Caption: A bride and groom jump towards each other with arms outstretched .
🚀 Generated Caption: Image features: [0.20999999344348907, 0.8100000023841858, 2.569999933242798, 0.9300000071525574, 0.5199999809265137, 1.600000023841858, 1.0, 0.11999999731779099, 0.27000001072883606, 1.350000023841858] Scene: towards other with arms Caption: Fences and foliage for F2P 2h13m16s5h20m33s0m31s3h3m18s


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3328397409_092de2bd32.jpg
✅ Reference Caption: A man is snowboarding over a pipe .
🚀 Generated Caption: Image features: [0.23999999463558197, 0.6200000047683716, 0.4399999976158142, 0.5600000023841858, 0.09000000357627869, 1.1699999570846558, 0.05999999865889549, 0.2800000011920929, 0.27000001072883606, 0.49000000953674316] Scene: snowboarding man over pipe Caption: [ 0.2222222222457869, 0.4000001222270785869, 0.509510018


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/2753531542_ace2c870b7.jpg
✅ Reference Caption: A boy jumps from a high rock cliff to the water below .
🚀 Generated Caption: Image features: [0.949999988079071, 0.6200000047683716, 0.44999998807907104, 0.20999999344348907, 0.07000000029802322, 0.5, 0.5400000214576721, 0.05000000074505806, 0.1599999964237213, 1.0399999618530273] Scene: jumps boy from cliff to water Caption: 1.8f0108b6f,0.8f1a4e0816,0.8f1c11c21


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3425887426_bf60b8afa3.jpg
✅ Reference Caption: This man is hitting a tennis ball with a tennis racket .
🚀 Generated Caption: Image features: [0.5600000023841858, 1.0700000524520874, 1.0, 0.25, 0.7300000190734863, 0.9900000095367432, 1.1299999952316284, 0.8899999856948853, 0.009999999776482582, 1.909999966621399] Scene: hitting man hitting ball with racket Caption: [06/28/2017 06:41:27 AM] [Client thread/INFO] [STDERR]: [java.lang.Throwable


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3222496967_45d468ee66.jpg
✅ Reference Caption: The blonde woman holding a newspaper is smiling at the man in the white shirt .
🚀 Generated Caption: Image features: [0.11999999731779099, 0.7900000214576721, 2.130000114440918, 0.15000000596046448, 0.4699999988079071, 0.3199999928474426, 0.4300000071525574, 0.6800000071525574, 0.15000000596046448, 1.1299999952316284] Scene: smiling woman holding newspaper at man in shirt Caption: smiling man with shirt at person in pants

Source: [Shenzhen] The Daily People

"The [Bike] [bike

📌 Image: /content/Images/2909875716_25c8652614.jpg
✅ Reference Caption: Woman in a field of tall grass and wildflowers holding up a yellow scarf
🚀 Generated Caption: Image features: [0.33000001311302185, 0.4300000071525574, 1.090000033378601, 0.5600000023841858, 0.3700000047683716, 1.7999999523162842, 0.20999999344348907, 0.18000000715255737, 0.25, 1.100000023841858] Scene: in field of grass holding scarf Caption: Filled in Image descriptio

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/2086513494_dbbcb583e7.jpg
✅ Reference Caption: A girl in a white coat takes pictures .
🚀 Generated Caption: Image features: [0.10999999940395355, 0.36000001430511475, 2.4800000190734863, 0.03999999910593033, 1.1100000143051147, 0.9599999785423279, 0.8399999737739563, 0.41999998688697815, 0.10000000149011612, 0.6600000262260437] Scene: takes girl takes pictures in coat Caption: [/a/view/f4/images/scene.png] (Filename: C:/buildslave/unity/build/artifacts/generated/


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/2261257940_449b6e6c91.jpg
✅ Reference Caption: The dogs are running in the snow .
🚀 Generated Caption: Image features: [1.5499999523162842, 0.8500000238418579, 0.07999999821186066, 0.8500000238418579, 0.20999999344348907, 0.41999998688697815, 0.9200000166893005, 0.03999999910593033, 0.2800000011920929, 0.800000011920929] Scene: running dogs in snow Caption: [1.5499999523162842, 0.8500000238418579, 0.07999999821186066


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/380041023_0dfd712ef1.jpg
✅ Reference Caption: A brown dog is running .
🚀 Generated Caption: Image features: [0.699999988079071, 0.6000000238418579, 0.23999999463558197, 0.38999998569488525, 0.1599999964237213, 0.9800000190734863, 0.6299999952316284, 0.7699999809265137, 1.159999966621399, 0.25999999046325684] Scene: running dog Caption: Scene 2: [10, 2, 2, 0] View source [0.699999988079071, 0.6000015


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/2636514498_01fcc5f501.jpg
✅ Reference Caption: A child plays with a toy on a playground .
🚀 Generated Caption: Image features: [0.25999999046325684, 0.11999999731779099, 1.0700000524520874, 0.10000000149011612, 1.100000023841858, 0.3100000023841858, 0.3100000023841858, 0.47999998927116394, 0.029999999329447746, 0.9100000262260437] Scene: plays child with toy on playground Caption: -------------------------------------------------- The two players watch out for each other. As far as they know they are just trying to get in. The girls keep to the


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/245895500_a4eb97af02.jpg
✅ Reference Caption: Person fishing in river .
🚀 Generated Caption: Image features: [0.28999999165534973, 0.38999998569488525, 0.019999999552965164, 0.38999998569488525, 0.44999998807907104, 0.44999998807907104, 0.0, 0.019999999552965164, 0.18000000715255737, 0.11999999731779099] Scene: in river Caption:

This frame captures the movement from S1 to S4, where the following values are calculated:

F - S1 to S3

📌 Image: /content/Images/3634032601_2236676cdd.jpg
✅ Reference Caption: Two women with backs to the camera .
🚀 Generated Caption: Image features: [0.029999999329447746, 0.6000000238418579, 0.8700000047683716, 0.18000000715255737, 0.6200000047683716, 0.5199999809265137, 0.38999998569488525, 0.5, 0.1599999964237213, 0.10999999940395355] Scene: with backs to camera Caption: I have an extremely large image of some scene or object on the web. Using the images to simulate the scene or object and then running the camera on


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/317488612_70ac35493b.jpg
✅ Reference Caption: A dog running through deep snow pack .
🚀 Generated Caption: Image features: [1.2200000286102295, 0.4000000059604645, 0.07000000029802322, 0.6800000071525574, 0.12999999523162842, 1.4199999570846558, 0.5099999904632568, 1.1200000047683716, 0.25, 0.07000000029802322] Scene: through pack Caption: <filename unknown name>

Saves Edit

Scene: <filename unknown name>

Effects Edit

Door: [0


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/1469000260_5d473c8283.jpg
✅ Reference Caption: A large dog sits in the grass .
🚀 Generated Caption: Image features: [0.5799999833106995, 0.9599999785423279, 0.47999998927116394, 0.20999999344348907, 0.10000000149011612, 0.6399999856948853, 0.41999998688697815, 1.2100000381469727, 0.5099999904632568, 0.4000000059604645] Scene: sits dog in grass Caption: File: #5e3bd6a4d711f0bc9fa7ae9c17085

[02:


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3005872315_e6771763bd.jpg
✅ Reference Caption: A close-up of a man shooting a handgun into the woods .
🚀 Generated Caption: Image features: [0.6800000071525574, 0.18000000715255737, 1.5099999904632568, 1.409999966621399, 0.7599999904632568, 1.9199999570846558, 0.5799999833106995, 0.9200000166893005, 0.05000000074505806, 0.15000000596046448] Scene: of man shooting handgun into woods Caption: [0.18000000715255739, 0.02010000007162746, 0.0100000007177243413


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/367964525_b1528ac6e4.jpg
✅ Reference Caption: A large German shepherd runs though deep snow .
🚀 Generated Caption: Image features: [0.20999999344348907, 0.5899999737739563, 0.05999999865889549, 0.27000001072883606, 0.20999999344348907, 0.05999999865889549, 0.3199999928474426, 0.019999999552965164, 0.15000000596046448, 0.3400000035762787] Scene: though snow Caption: _______________________________________________________ Project Name: Project Map Color: White Location: #2 File Size: ~1,983 bytes, 15 KB Image Date


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3519942322_b37d088aae.jpg
✅ Reference Caption: A golden dog is looking at a blue and yellow toy on the grass .
🚀 Generated Caption: Image features: [0.550000011920929, 0.4300000071525574, 0.6100000143051147, 0.11999999731779099, 0.20999999344348907, 0.41999998688697815, 0.20000000298023224, 0.8199999928474426, 1.8300000429153442, 0.09000000357627869] Scene: looking dog at toy on grass Caption: Scene: looking dog at toy on grass Interaction 1: I am a man. I have good friends...

RAW Paste Data

[

📌 Image: /content/Images/2920969723_83918fe909.jpg
✅ Reference Caption: A skateboarder jumping in front of a building .
🚀 Generated Caption: Image features: [0.3100000023841858, 0.9200000166893005, 1.3899999856948853, 0.30000001192092896, 0.12999999523162842, 0.07999999821186066, 0.28999999165534973, 0.019999999552965164, 0.18000000715255737, 0.7799999713897705] Scene: in front of building Caption: Map: Tileset: Scene 0.31x29: Scene 13.1a.1 Color: Black Caption: Maps:

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3628103548_2708abcda2.jpg
✅ Reference Caption: A woman playing guitar and singing in the foreground and three others performing behind her .
🚀 Generated Caption: Image features: [0.10999999940395355, 0.4300000071525574, 0.7300000190734863, 0.3499999940395355, 0.23000000417232513, 0.28999999165534973, 1.25, 0.28999999165534973, 0.3100000023841858, 1.600000023841858] Scene: playing guitar in foreground behind her Caption: [0.105339999991658, 0.32000007033, 0.3000000166060303637, 0.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/506882688_b37d549593.jpg
✅ Reference Caption: People are playing in water fountains .
🚀 Generated Caption: Image features: [0.15000000596046448, 0.5600000023841858, 0.2199999988079071, 0.27000001072883606, 0.25999999046325684, 0.8100000023841858, 0.47999998927116394, 0.3499999940395355, 0.05000000074505806, 0.23000000417232513] Scene: playing People in fountains Caption: [0.15000000596046448, 0.5600000023841858, 0.2199999988079071,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/2744690159_fe2c89e55b.jpg
✅ Reference Caption: Four dogs splashing in the water
🚀 Generated Caption: Image features: [0.550000011920929, 0.5, 0.10000000149011612, 0.4399999976158142, 0.23000000417232513, 0.7300000190734863, 1.4199999570846558, 1.0299999713897705, 0.7200000286102295, 0.6299999952316284] Scene: in water Caption: Scene: The Water Dining (G1) [0.55000000101825, 4, 0.5725, 0.4000000


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3589895574_ee08207d26.jpg
✅ Reference Caption: Child in grey sweatshirt hanging from metal bar
🚀 Generated Caption: Image features: [0.5299999713897705, 0.6700000166893005, 1.5499999523162842, 0.3199999928474426, 0.30000001192092896, 0.5299999713897705, 0.41999998688697815, 0.10000000149011612, 0.23999999463558197, 1.6399999856948853] Scene: in sweatshirt from bar Caption: [0.5299999713897205, 0.49000001668930712, 0.550000016716338521

📌 Image: /content/Images/3520079657_b828d96d50.jpg
✅ Reference Caption: A group of young people jump up in the air while on the beach .
🚀 Generated Caption: Image features: [0.49000000953674316, 0.20000000298023224, 0.25999999046325684, 0.11999999731779099, 0.20999999344348907, 0.11999999731779099, 0.49000000953674316, 0.0, 0.0, 0.5199999809265137] Scene: jump group of people in air on beach Caption: [1.200000948113829, 0.279999999046897, 0.14999999072328, 0.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/498444334_a680d318a1.jpg
✅ Reference Caption: Group of young men posing and holding cans .
🚀 Generated Caption: Image features: [0.1599999964237213, 0.9800000190734863, 1.7999999523162842, 0.029999999329447746, 0.36000001430511475, 0.019999999552965164, 0.8999999761581421, 0.23000000417232513, 0.10999999940395355, 1.1799999475479126] Scene: of men holding cans Caption: [ 0.45756723773618, 0.25671535483543, 1.0199990067141176


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/2437266971_b91a8f9a00.jpg
✅ Reference Caption: A man in a button up shirt and tie is wearing a smiling mask .
🚀 Generated Caption: Image features: [0.6899999976158142, 0.8199999928474426, 2.680000066757202, 0.3700000047683716, 0.9700000286102295, 0.07000000029802322, 2.2200000286102295, 0.05999999865889549, 0.25999999046325684, 1.1699999570846558] Scene: wearing man wearing mask in button Caption: - This does not change much in the gameplay. At any rate, I thought it was quite cool, since the two have their own special abilities,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/369186134_5eef374112.jpg
✅ Reference Caption: Two yellow dogs and a small dog walk through snow .
🚀 Generated Caption: Image features: [0.3499999940395355, 0.6000000238418579, 0.3400000035762787, 0.20000000298023224, 0.27000001072883606, 0.18000000715255737, 0.28999999165534973, 0.6800000071525574, 0.4000000059604645, 0.3199999928474426] Scene: walk dogs through snow Caption: <center><img alt="Showing (small) scene" src="https://images.yunate.cn/image-gallery/j


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3497236690_a48bf7ac42.jpg
✅ Reference Caption: Man in sleeveless shirt and shorts standing in the middle of the floor .
🚀 Generated Caption: Image features: [0.07000000029802322, 1.159999966621399, 0.7900000214576721, 0.2199999988079071, 0.6600000262260437, 0.019999999552965164, 0.8299999833106995, 0.38999998569488525, 0.14000000059604645, 0.6499999761581421] Scene: in shirt in middle of floor Caption: [0.06000001143401, 2.58232575856792, 0.582325934253873,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3029472296_d429b1586c.jpg
✅ Reference Caption: Two men holding their arms out from their bodies .
🚀 Generated Caption: Image features: [0.20999999344348907, 0.28999999165534973, 0.7900000214576721, 0.1899999976158142, 0.25, 0.5099999904632568, 1.0700000524520874, 0.7900000214576721, 0.07999999821186066, 0.5600000023841858] Scene: holding arms from bodies Caption: [17.59, 0.259999991655515, 0.70000000000006027, 0.110000000000016024,

📌 Image: /content/Images/3678098428_40c1b74cc2.jpg
✅ Reference Caption: A guy with a black tank top .
🚀 Generated Caption: Image features: [0.05000000074505806, 0.9599999785423279, 0.17000000178813934, 0.09000000357627869, 0.4000000059604645, 0.6399999856948853, 0.6000000238418579, 0.7200000286102295, 0.11999999731779099, 1.100000023841858] Scene: with top Caption: -0xF7D9C16A9-5BA1C9-46A08-8A8A-DDA40


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/453756106_711c20471a.jpg
✅ Reference Caption: Bigg playing with little dog in dirt .
🚀 Generated Caption: Image features: [0.5199999809265137, 0.3199999928474426, 0.5799999833106995, 0.6299999952316284, 0.10000000149011612, 0.5400000214576721, 0.15000000596046448, 0.18000000715255737, 0.550000011920929, 0.03999999910593033] Scene: playing Bigg with dog in dirt Caption: http://www.youtube.com/watch?v=wB3lU_gS-kV5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/1271960365_e54033f883.jpg
✅ Reference Caption: Two black dogs are running on pavement
🚀 Generated Caption: Image features: [0.05999999865889549, 0.3499999940395355, 0.14000000059604645, 0.36000001430511475, 0.23000000417232513, 0.10000000149011612, 0.6000000238418579, 0.09000000357627869, 0.07000000029802322, 0.3100000023841858] Scene: running dogs on pavement Caption: File not found. (Filename: C:/buildslave/unity/build/artifacts/StandalonePlayerGenerated/UnityEngineDebug.cpp Line


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3070011270_390e597783.jpg
✅ Reference Caption: Dog with black and tan face standing in water looking up at camera .
🚀 Generated Caption: Image features: [0.05999999865889549, 0.1899999976158142, 0.3700000047683716, 0.07999999821186066, 0.5099999904632568, 0.12999999523162842, 0.5400000214576721, 0.10999999940395355, 0.20000000298023224, 0.009999999776482582] Scene: standing Dog with face in water at camera Caption:

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/133905560_9d012b47f3.jpg
✅ Reference Caption: Two yellow labs are playing in the snow .
🚀 Generated Caption: Image features: [0.4699999988079071, 0.30000001192092896, 0.5600000023841858, 0.029999999329447746, 0.3400000035762787, 0.09000000357627869, 0.03999999910593033, 0.0, 0.8999999761581421, 0.2800000011920929] Scene: playing labs in snow Caption: [0.4699999988079071, 0.30000001192092896, 0.5600000023841858,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3760400645_3ba51d27f9.jpg
✅ Reference Caption: a girl at a wedding holding some orange flowers
🚀 Generated Caption: Image features: [0.009999999776482582, 0.5400000214576721, 0.3400000035762787, 0.019999999552965164, 0.30000001192092896, 0.33000001311302185, 1.0199999809265137, 0.2199999988079071, 0.27000001072883606, 0.7200000286102295] Scene: at wedding holding flowers Caption: [0.009999999776482582, 0.5400000214576721, 0.3400000035762787, 0


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



📌 Image: /content/Images/3724487641_d2096f10e5.jpg
✅ Reference Caption: A bunch of little boys in bathing trunks running .
🚀 Generated Caption: Image features: [0.6000000238418579, 1.3200000524520874, 0.20999999344348907, 0.25, 0.30000001192092896, 0.550000011920929, 1.0399999618530273, 0.3799999952316284, 0.029999999329447746, 0.8199999928474426] Scene: of boys in trunks Caption: [0.60000002389413544, 0.5000000238544787834, 0.1900000023941844

📌 Image: /content/Images/2815745115_c8479d560c.jpg
✅ Reference Caption: Lady in a stuffed animal store .
🚀 Generated Caption: Image features: [0.15000000596046448, 0.5400000214576721, 0.949999988079071, 0.029999999329447746, 0.5199999809265137, 0.1599999964237213, 0.4699999988079071, 0.3799999952316284, 0.2199999988079071, 0.75] Scene: in store Caption: [0.100000099493360, -0.03000014006500, -0.049999999061280,


In [13]:
!pip uninstall -y nltk
!pip install --upgrade nltk


Found existing installation: nltk 3.9.1
Uninstalling nltk-3.9.1:
  Successfully uninstalled nltk-3.9.1
Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.9.1


In [14]:
import shutil
import os
import nltk

# Delete existing NLTK data folder
nltk_data_path = "/root/nltk_data"
if os.path.exists(nltk_data_path):
    shutil.rmtree(nltk_data_path)  # Delete folder

# Recreate and set path
os.makedirs(nltk_data_path)
nltk.data.path.append(nltk_data_path)

# Redownload required resources
nltk.download('punkt', download_dir=nltk_data_path)
nltk.download('wordnet', download_dir=nltk_data_path)
nltk.download('omw-1.4', download_dir=nltk_data_path)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [18]:
import nltk

# Uninstall and reinstall NLTK
!pip uninstall -y nltk
!pip install --upgrade nltk

# Download required NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')


Found existing installation: nltk 3.9.1
Uninstalling nltk-3.9.1:
  Successfully uninstalled nltk-3.9.1
Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.9.1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [22]:
!pip install sacrebleu
import sacrebleu

def evaluate_bleu_sacrebleu(test_sample_df):
    """Calculate BLEU score using SacreBLEU instead of NLTK."""
    scores = []

    for _, row in test_sample_df.iterrows():
        reference_caption = row['caption']
        image_features = extract_features(row['image'])
        scene_graph = create_scene_graph(reference_caption)
        generated_caption = generate_caption(image_features, scene_graph)

        # Compute BLEU using sacrebleu
        score = sacrebleu.sentence_bleu(generated_caption, [reference_caption]).score
        scores.append(score)

        print(f"\n✅ Reference Caption: {reference_caption}")
        print(f"🚀 Generated Caption: {generated_caption}")
        print(f"🎯 SacreBLEU Score: {score:.4f}")

    # Calculate average BLEU score
    avg_bleu = sum(scores) / len(scores) if scores else 0
    print(f"\n📊 Average SacreBLEU Score: {avg_bleu:.4f}")

# Run the evaluation
evaluate_bleu_sacrebleu(test_sample_df)


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.1.1 sacrebleu-2.5.1


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: Two asian girls cheerleading for a sporting event
🚀 Generated Caption: Image features: [0.25999999046325684, 0.44999998807907104, 2.390000104904175, 0.12999999523162842, 1.440000057220459, 0.47999998927116394, 0.07999999821186066, 0.0, 0.3499999940395355, 1.0700000524520874] Scene: cheerleading girls for event Caption: - - - Full name: Wahlburgs - - - Year: 18th Century - Language and culture: German - Affiliation: Austria,
🎯 SacreBLEU Score: 0.9011


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: A soccer player is about to kick the ball near the goal .
🚀 Generated Caption: Image features: [1.1200000047683716, 0.5, 1.6200000047683716, 0.03999999910593033, 0.5199999809265137, 0.12999999523162842, 0.6000000238418579, 0.3499999940395355, 0.10000000149011612, 1.2799999713897705] Scene: is player kick ball near goal Caption: Loading Screen... 0.132000000, 0.2720000384039068 0.232000002, 0.304412
🎯 SacreBLEU Score: 2.2241


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: An Olympic winner takes home two medals .
🚀 Generated Caption: Image features: [0.47999998927116394, 0.9800000190734863, 1.1799999475479126, 0.0, 1.149999976158142, 0.27000001072883606, 1.3799999952316284, 0.019999999552965164, 0.20999999344348907, 1.75] Scene: takes winner takes medals Caption: CTS: [0.4800081715983636, 1.0900000136184614, 0.770000049
🎯 SacreBLEU Score: 2.0334


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: Two brown dogs runs through the water .
🚀 Generated Caption: Image features: [0.12999999523162842, 0.2800000011920929, 0.30000001192092896, 0.1599999964237213, 0.05999999865889549, 0.4300000071525574, 0.27000001072883606, 0.6399999856948853, 0.4000000059604645, 0.75] Scene: runs dogs through water Caption: [0.12999999523162842, 0.28000000119593736, 0.2700000011920929, 0
🎯 SacreBLEU Score: 1.2992


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: The person with the tattoos is holding a dirty frying pan .
🚀 Generated Caption: Image features: [0.15000000596046448, 0.27000001072883606, 1.659999966621399, 0.1599999964237213, 0.5, 0.949999988079071, 0.3799999952316284, 0.07000000029802322, 0.47999998927116394, 0.6499999761581421] Scene: holding person holding pan with tattoos Caption: View in: 3

Video: Video: CARTOON: HAYNAH / JASON WALTER GUSTA, AL
🎯 SacreBLEU Score: 1.0683


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: Three people wear a lot of makeup and stand together .
🚀 Generated Caption: Image features: [0.18000000715255737, 0.550000011920929, 0.41999998688697815, 0.38999998569488525, 0.30000001192092896, 0.3700000047683716, 1.059999942779541, 0.25999999046325684, 0.07999999821186066, 0.5600000023841858] Scene: wear people wear lot of makeup Caption: Skincare blog by Sarah Wilsen, L'Homme

[04:47:22.733000] [Client thread
🎯 SacreBLEU Score: 3.3440


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: A person stands in the snow at the top of a mountian , arms raised .
🚀 Generated Caption: Image features: [0.07999999821186066, 0.4000000059604645, 0.2800000011920929, 0.5699999928474426, 0.10000000149011612, 0.5199999809265137, 0.18000000715255737, 0.019999999552965164, 0.28999999165534973, 0.8199999928474426] Scene: stands person in snow at top of mountian raised arms Caption: The scene below from the video above shows the face of the man walking on snowy-looking snow covered ground.

[0010.00000001
🎯 SacreBLEU Score: 2.3385


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: Young boy in a brown shirt doing a back flip
🚀 Generated Caption: Image features: [0.6499999761581421, 0.9200000166893005, 1.440000057220459, 0.11999999731779099, 0.9300000071525574, 0.23999999463558197, 0.07000000029802322, 0.3499999940395355, 0.11999999731779099, 1.4900000095367432] Scene: in shirt doing flip Caption: in Shirt doing flip Scene: in Shirt doing flip 1.0: using the hand with thumb from hand of finger (this should be just the thumb
🎯 SacreBLEU Score: 1.4377


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: A dog wearing a jacket rolls in the snow .
🚀 Generated Caption: Image features: [0.11999999731779099, 0.4399999976158142, 0.11999999731779099, 0.4000000059604645, 0.7099999785423279, 0.10000000149011612, 0.33000001311302185, 0.09000000357627869, 0.3799999952316284, 0.2199999988079071] Scene: wearing rolls in snow Caption: Caption: Beret wearing snowy camo, wearing pants Caption: The scene begins inside the hut A male guard pulls a small box of cereal and a
🎯 SacreBLEU Score: 1.6536


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: The man is wearing a black shirt and holding up a blue item in a window .
🚀 Generated Caption: Image features: [0.17000000178813934, 0.6899999976158142, 1.8799999952316284, 0.23000000417232513, 1.1200000047683716, 0.009999999776482582, 0.8799999952316284, 0.7699999809265137, 0.2800000011920929, 1.4900000095367432] Scene: wearing man wearing shirt holding item in window Caption: Scene: wearing man wearing shirt holding item in window

RAW Paste Data

[02:42:19] [Server thread/INFO]:
🎯 SacreBLEU Score: 1.5729


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: A man jumps while on a snowboard .
🚀 Generated Caption: Image features: [0.1899999976158142, 0.75, 0.0, 0.6200000047683716, 0.029999999329447746, 0.1899999976158142, 0.3799999952316284, 0.10000000149011612, 0.10999999940395355, 0.6800000071525574] Scene: jumps man on snowboard Caption: "Gunslinger" Version: "1.3.11" Time (previous frame): 11:43:53 Script name: KG
🎯 SacreBLEU Score: 0.9348


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: A little boy in a black shirt is holding a red guitar .
🚀 Generated Caption: Image features: [0.5, 0.3799999952316284, 0.3799999952316284, 0.44999998807907104, 0.7200000286102295, 0.3199999928474426, 0.3799999952316284, 0.8199999928474426, 0.8399999737739563, 1.8700000047683716] Scene: holding boy holding guitar in shirt Caption: 1 2 3 4 5 0.5 0.37 0.25 0.25 0.37 0.25 0.75 0.27 0
🎯 SacreBLEU Score: 1.1373


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: a woman walks across a large rock .
🚀 Generated Caption: Image features: [0.38999998569488525, 0.33000001311302185, 1.5700000524520874, 0.800000011920929, 0.09000000357627869, 0.1599999964237213, 0.20000000298023224, 0.07999999821186066, 0.009999999776482582, 0.3799999952316284] Scene: walks woman across rock Caption: Photo by Darryl White, 3DMarking software Download this file on your computer and enjoy scenes that are based upon the images in the Image
🎯 SacreBLEU Score: 0.9011


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: A large white and grey dog runs through a yellow tunnel in an obstacle course .
🚀 Generated Caption: Image features: [0.7699999809265137, 0.2199999988079071, 1.440000057220459, 0.6399999856948853, 0.6000000238418579, 0.36000001430511475, 0.019999999552965164, 0.07000000029802322, 1.3300000429153442, 0.5299999713897705] Scene: runs dog through tunnel in course Caption: [0.7999147524387577, 0.33446909567822, 0.1400008591389
🎯 SacreBLEU Score: 2.4182


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: There is a brown dog leaping on the beach .
🚀 Generated Caption: Image features: [0.550000011920929, 0.36000001430511475, 0.7300000190734863, 0.5299999713897705, 0.25999999046325684, 0.5099999904632568, 0.27000001072883606, 0.0, 0.10999999940395355, 0.23000000417232513] Scene: on beach Caption: 5 https://d3d10.imageshack.us/img/CCAAACK6J8D/d3d10-pic
🎯 SacreBLEU Score: 1.0009


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: The young basketball player moves into the front court .
🚀 Generated Caption: Image features: [0.15000000596046448, 0.17000000178813934, 0.3700000047683716, 0.07000000029802322, 1.7400000095367432, 0.2800000011920929, 0.9100000262260437, 0.2199999988079071, 0.029999999329447746, 0.30000001192092896] Scene: moves player into court Caption: GECKING GECK (0.0000000134372428), 0.00000000, 1570534468, 2137703906
🎯 SacreBLEU Score: 1.2053


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: A man is backpacking up a grassy hill .
🚀 Generated Caption: Image features: [0.10999999940395355, 0.2800000011920929, 0.029999999329447746, 0.33000001311302185, 0.11999999731779099, 0.4099999964237213, 0.47999998927116394, 0.009999999776482582, 0.029999999329447746, 0.029999999329447746] Scene: backpacking man backpacking hill Caption: photo from Flickr user loya_diaz1 Caption: photo from Flickr user loya_diaz1 Caption: Photo from Flickr user
🎯 SacreBLEU Score: 0.8865


KeyboardInterrupt: 

In [23]:
!pip install jiwer
from jiwer import wer

def evaluate_meteor_jiwer(test_sample_df):
    """Calculate METEOR-like score using Word Error Rate (WER)."""
    scores = []

    for _, row in test_sample_df.iterrows():
        reference_caption = row['caption']
        image_features = extract_features(row['image'])
        scene_graph = create_scene_graph(reference_caption)
        generated_caption = generate_caption(image_features, scene_graph)

        # Compute WER (Lower WER means better match)
        score = 1 - wer(reference_caption, generated_caption)  # Convert WER to similarity
        scores.append(score)

        print(f"\n✅ Reference Caption: {reference_caption}")
        print(f"🚀 Generated Caption: {generated_caption}")
        print(f"🎯 WER-based METEOR Score: {score:.4f}")

    avg_meteor = sum(scores) / len(scores) if scores else 0
    print(f"\n📊 Average WER-based METEOR Score: {avg_meteor:.4f}")

# Run the evaluation
evaluate_meteor_jiwer(test_sample_df)


Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.1.0 rapidfuzz-3.12.2


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: Two asian girls cheerleading for a sporting event
🚀 Generated Caption: Image features: [0.25999999046325684, 0.44999998807907104, 2.390000104904175, 0.12999999523162842, 1.440000057220459, 0.47999998927116394, 0.07999999821186066, 0.0, 0.3499999940395355, 1.0700000524520874] Scene: cheerleading girls for event Caption: [9] <Cupcakes> 1) I just came here with more money. 2) So when the time comes for the new year,
🎯 WER-based METEOR Score: -3.5000


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: A soccer player is about to kick the ball near the goal .
🚀 Generated Caption: Image features: [1.1200000047683716, 0.5, 1.6200000047683716, 0.03999999910593033, 0.5199999809265137, 0.12999999523162842, 0.6000000238418579, 0.3499999940395355, 0.10000000149011612, 1.2799999713897705] Scene: is player kick ball near goal Caption: is not set: [8:33:19] [09:08:23.0013] [09:07:19.9957
🎯 WER-based METEOR Score: -0.8462


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: An Olympic winner takes home two medals .
🚀 Generated Caption: Image features: [0.47999998927116394, 0.9800000190734863, 1.1799999475479126, 0.0, 1.149999976158142, 0.27000001072883606, 1.3799999952316284, 0.019999999552965164, 0.20999999344348907, 1.75] Scene: takes winner takes medals Caption: Results: M1 Match Results M2 Match Results M3 Match Results M4 Match Results F1 Match Results P/O1 Match Results M5
🎯 WER-based METEOR Score: -3.5000


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



✅ Reference Caption: Two brown dogs runs through the water .
🚀 Generated Caption: Image features: [0.12999999523162842, 0.2800000011920929, 0.30000001192092896, 0.1599999964237213, 0.05999999865889549, 0.4300000071525574, 0.27000001072883606, 0.6399999856948853, 0.4000000059604645, 0.75] Scene: runs dogs through water Caption: Screen shots

RAW Paste Data

Scene #0: Water from a river [0.009700000182405, 0.10
🎯 WER-based METEOR Score: -2.6250


KeyboardInterrupt: 