In [2]:
# Clear GPU memory (run this cell to start fresh)
import torch
import gc

# Run the Python garbage collector before asking PyTorch to drop its cache
gc.collect()

# Report and reset the CUDA caching allocator (only when a GPU is present)
if not torch.cuda.is_available():
    print("No CUDA device available")
else:
    torch.cuda.empty_cache()
    print("GPU cache cleared")

    bytes_per_gib = 1024**3
    for label, probe in (
        ("allocated", torch.cuda.memory_allocated),
        ("reserved", torch.cuda.memory_reserved),
    ):
        # Each counter is queried right before it is printed
        print(f"GPU memory {label}: {probe() / bytes_per_gib:.2f} GB")

GPU cache cleared
GPU memory allocated: 0.00 GB
GPU memory reserved: 0.00 GB


In [10]:
pip install -U bitsandbytes>=0.46.1


SyntaxError: invalid syntax (ipython-input-1480869654.py, line 1)

In [1]:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Single source of truth for the checkpoint, so model and processor always match.
MODEL_ID = "Salesforce/blip2-opt-2.7b"

# Detect if running in Google Colab.
# The `try` body holds only the import probe; Colab-specific setup lives in `else`.
IN_COLAB = False
cache_dir = None  # stays None outside Colab and is passed through to from_pretrained as-is

try:
    import google.colab  # type: ignore  # noqa: F401
except ImportError:
    print("Running locally")
else:
    IN_COLAB = True
    print("Running in Google Colab")

    # Use temporary Colab storage (fast, does NOT use Drive quota)
    cache_dir = "/content/hf_cache"
    os.makedirs(cache_dir, exist_ok=True)
    print(f"Using temporary cache directory: {cache_dir}")

# Configure 8-bit quantization to reduce memory usage.
# NOTE: BitsAndBytesConfig has no compute-dtype option for 8-bit loading
# (`bnb_4bit_compute_dtype` exists, but only affects 4-bit loading). An argument
# such as `bnb_8bit_compute_dtype` is treated as an unused kwarg and ignored,
# so none is passed here.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Load model with quantization and automatic device mapping
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",
    cache_dir=cache_dir,
)

# Matching processor (image preprocessing + tokenizer) for the same checkpoint
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    cache_dir=cache_dir,
)

Using device: cuda
Running in Google Colab
Using temporary cache directory: /content/hf_cache


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/1247 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

The image processor of type `BlipImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


tokenizer_config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [3]:
# Colab-only module; this cell cannot run outside Google Colab.
from google.colab import files
# Prompts for a file upload in the cell output. The extraction cell expects the
# archive to end up at /content/uic.zip.
# NOTE(review): `uploaded` is not read again in the visible cells — presumably
# only the file saved on disk matters; confirm before removing the assignment.
uploaded = files.upload()

Saving uic.zip to uic.zip


In [4]:
import zipfile
import os
import shutil
from pathlib import Path

# --- Colab-temp paths (fast, does not use Drive quota) ---
zip_file_path = "/content/uic.zip"      # where the uploaded archive is expected
extract_to_path = "/content/data/uic"   # destination for the unpacked dataset

# Wipe any previous extraction so stale files never mix with fresh ones
if os.path.exists(extract_to_path):
    print(f"Removing existing {extract_to_path}...")
    shutil.rmtree(extract_to_path)

# Fail early with an actionable message when the archive is missing
zip_is_present = os.path.exists(zip_file_path)
if not zip_is_present:
    raise FileNotFoundError(
        f"{zip_file_path} not found. Upload uic.zip to Colab (/content) or change zip_file_path."
    )

# Unpack the whole archive into the (re-created) destination
os.makedirs(extract_to_path, exist_ok=True)
with zipfile.ZipFile(zip_file_path) as archive:
    archive.extractall(extract_to_path)
print(f"Extracted {zip_file_path} -> {extract_to_path}")

# --- Locate dataset files by searching, instead of assuming a folder name ---
extract_root = Path(extract_to_path)

captions_matches = [*extract_root.rglob("UIC-captions.txt")]
if len(captions_matches) == 0:
    raise FileNotFoundError("UIC-captions.txt not found after extraction (check zip contents).")

# The image folder is expected to sit next to the captions file
captions_file = captions_matches[0]
dataset_base = captions_file.parent
image_dir = dataset_base / "uic_224x224_image"

if not image_dir.exists():
    raise FileNotFoundError(f"Image directory not found: {image_dir}")

for label, location in (
    ("Dataset base:", dataset_base),
    ("Captions file:", captions_file),
    ("Image dir:", image_dir),
):
    print(label, location)

# --- Load captions ---
def load_captions(captions_path: Path) -> dict:
    """Parse a captions file into ``{image_filename: [caption, ...]}``.

    Each non-blank line is expected to be ``<image_id> <caption>``, where the
    id and the caption are separated by any run of whitespace (spaces or a
    tab). Everything after the first ``#`` in the id is discarded, so
    ``img.jpg#0`` and ``img.jpg#1`` both map to ``img.jpg``.

    Args:
        captions_path: Path to the UTF-8 text file (a leading BOM is tolerated).

    Returns:
        Dict mapping image filename to its captions, in file order.

    Notes:
        Lines that carry an id but no caption text are skipped instead of
        raising ``ValueError``; one warning reports how many were skipped.
    """
    import warnings

    image_captions = {}
    skipped = 0
    # "utf-8-sig" reads plain UTF-8 unchanged and strips a BOM if present,
    # so the first image id can never be prefixed with "\ufeff".
    with open(captions_path, "r", encoding="utf-8-sig") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            # split(None, 1): split on the first whitespace run, so tab-separated
            # files work and the caption never keeps leading padding.
            parts = line.split(None, 1)
            if len(parts) < 2:
                skipped += 1
                continue
            img_id, caption = parts
            img_filename = img_id.split("#")[0]
            image_captions.setdefault(img_filename, []).append(caption)

    if skipped:
        warnings.warn(
            f"Skipped {skipped} line(s) without caption text in {captions_path}"
        )
    return image_captions

captions_dict = load_captions(captions_file)

# --- Build dataset list ---
# File names (not full paths) of every .jpg in the image folder, sorted
image_paths = sorted(
    entry.name for entry in image_dir.iterdir() if entry.suffix.lower() == ".jpg"
)

# One record per image that has at least one caption; uncaptioned images are dropped
dataset = [
    {
        "image_path": str(image_dir / img_filename),
        "image_filename": img_filename,
        "captions": captions_dict[img_filename],
    }
    for img_filename in image_paths
    if captions_dict.get(img_filename)
]

print(f"Loaded {len(dataset)} images with captions")
if dataset:
    first_entry = dataset[0]
    print("Example entry:")
    print(f"  Image: {first_entry['image_filename']}")
    print(f"  Number of captions: {len(first_entry['captions'])}")
    print(f"  First caption: {first_entry['captions'][0]}")


Extracted /content/uic.zip -> /content/data/uic
Dataset base: /content/data/uic/UIC(underwater image captioning dataset)
Captions file: /content/data/uic/UIC(underwater image captioning dataset)/UIC-captions.txt
Image dir: /content/data/uic/UIC(underwater image captioning dataset)/uic_224x224_image
Loaded 3176 images with captions
Example entry:
  Image: uic_img_1.jpg
  Number of captions: 5
  First caption: A dark brown turtle paddles through the water with its limbs .


In [6]:
import random

SEED = 42

# Sort by filename first so the shuffle input never depends on listing order
dataset_sorted = sorted(dataset, key=lambda entry: entry["image_filename"])

# Private, seeded RNG -> the same permutation on every run
rng = random.Random(SEED)
rng.shuffle(dataset_sorted)

# 80/10/10 split sizes; the test split takes whatever rounding leaves over
n = len(dataset_sorted)
n_train = int(0.8 * n)
n_val = int(0.1 * n)
n_test = n - n_train - n_val

# Slice the shuffled list at the two boundaries
val_end = n_train + n_val
train_set, val_set, test_set = (
    dataset_sorted[:n_train],
    dataset_sorted[n_train:val_end],
    dataset_sorted[val_end:],
)

print("Counts:", len(train_set), len(val_set), len(test_set))
print("First test example:", test_set[0]["image_filename"])


Counts: 2540 317 319
First test example: uic_img_205.jpg


In [7]:
import torch
from PIL import Image
from tqdm import tqdm

# Reuse the notebook-wide seed (defined with the data split) instead of a second literal.
torch.manual_seed(SEED)

# One record per test image: filename, generated caption, references, image path.
predictions = []

for item in tqdm(test_set):
    # Open via a context manager so the file handle is released deterministically;
    # convert() returns a separate in-memory RGB image that outlives the handle.
    with Image.open(item["image_path"]) as image_file:
        image = image_file.convert("RGB")

    # No text prompt is passed: the model captions the image unconditionally.
    inputs = processor(images=image, return_tensors="pt").to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=50,
            num_beams=3,        # keep fixed
            do_sample=False     # deterministic
        )

    # The decoded text ends with a newline; strip here so every consumer
    # (metrics, JSON export) sees clean text without a separate clean-up step.
    caption = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True
    )[0].strip()

    predictions.append({
        "image_filename": item["image_filename"],
        "prediction": caption,
        "references": item["captions"],
        # Stored up front: the image-text similarity step needs the file again.
        "image_path": item["image_path"],
    })

print("Finished inference on test set")
print("Example output:")
print(predictions[0])


100%|██████████| 319/319 [11:21<00:00,  2.13s/it]

Finished inference on test set
Example output:
{'image_filename': 'uic_img_205.jpg', 'prediction': 'a reef with many fish swimming around it\n', 'references': ['Many brightly colored tropical fish swim among the corals .', 'A school of small fish swims in the water near the coral on the seabed .', 'There is a large area of coral on the seabed, and many small fish swim near it .', 'Groups of corals swim around near the corals on the seafloor .', 'Many small fish swim near the coral on the seabed .']}





In [8]:
# Normalise every prediction record in place: trim the caption text and attach
# the image path (the CLIP-based score reads the image again).
# Safe to re-run: stripping twice and re-assigning the same path change nothing.
path_by_name = {entry["image_filename"]: entry["image_path"] for entry in test_set}
for p in predictions:
    p.update(
        prediction=p["prediction"].strip(),
        image_path=path_by_name[p["image_filename"]],
    )

# Parallel lists for text metrics: one caption string and one list of references per image
pred_texts = [record["prediction"] for record in predictions]
ref_texts = [record["references"] for record in predictions]  # list-of-lists

In [46]:
# Save predictions to json
import json
import time

# Single definition of the output location, shared by the write and the download.
PREDICTIONS_JSON = "/content/uic_blip2_baseline_predictions.json"

out = {
    "created_utc": time.time(),  # seconds since the epoch at export time
    "seed": 42,
    "split": {
        "test_filenames": [p["image_filename"] for p in predictions],
    },
    "predictions": [
        {
            "image_filename": p["image_filename"],
            "prediction": p["prediction"].strip(),
            "references": [r.strip() for r in p["references"]],
        }
        for p in predictions
    ],
}

# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of depending on the platform's locale default.
with open(PREDICTIONS_JSON, "w", encoding="utf-8") as f:
    json.dump(out, f, ensure_ascii=False, indent=2)

# Download predictions (`files` comes from the earlier google.colab import)
files.download(PREDICTIONS_JSON)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [45]:
!pip -q install open_clip_torch
import open_clip
from PIL import Image
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the ViT-B-32 CLIP model (pretrained tag `laion2b_s34b_b79k`), its image
# preprocessing pipeline and the matching tokenizer; put the model in eval mode.
model_clip, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)
model_clip = model_clip.to(device).eval()
tokenizer = open_clip.get_tokenizer("ViT-B-32")


def _unit(features):
    """Scale `features` to unit L2 norm along the last dimension."""
    return features / features.norm(dim=-1, keepdim=True)


# One image-vs-generated-caption cosine similarity per prediction
scores = []

with torch.no_grad():
    for p in tqdm(predictions):
        rgb_image = Image.open(p["image_path"]).convert("RGB")
        img = preprocess(rgb_image).unsqueeze(0).to(device)   # add a batch dimension
        txt = tokenizer([p["prediction"]]).to(device)

        img_feat = _unit(model_clip.encode_image(img))
        txt_feat = _unit(model_clip.encode_text(txt))

        # Dot product of unit vectors == cosine similarity
        scores.append((img_feat @ txt_feat.T).item())

clipscore_mean = sum(scores) / len(scores)
clipscore_mean

KeyboardInterrupt: 

In [20]:
!pip install pycocoevalcap
!pip install pycocotools


# Build the two dictionaries handed to the scorers, keyed by the prediction's
# position in `predictions`:
#   gts[id] -> list of reference captions
#   res[id] -> single-element list holding the generated caption
gts = {
    img_id: [ref.strip() for ref in record["references"]]
    for img_id, record in enumerate(predictions)
}
res = {
    img_id: [record["prediction"].strip()]
    for img_id, record in enumerate(predictions)
}



In [51]:
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
!apt-get update -qq
!apt-get install -y openjdk-11-jre-headless

!update-alternatives --set java /usr/lib/jvm/java-11-openjdk-amd64/bin/java


cider_scorer = Cider()
cider_score, _ = cider_scorer.compute_score(gts, res)
print("CIDEr:", cider_score)

spice_scorer = Spice()
spice_score, _ = spice_scorer.compute_score(gts, res)
print("SPICE:", spice_score)

meteor_scorer = Meteor()
meteor_score, _ = meteor_scorer.compute_score(gts, res)
print("METEOR:", meteor_score)

bleu_scorer = Bleu(4)
bleu_scores, _ = bleu_scorer.compute_score(gts, res)
print("BLEU-1..4:", bleu_scores)

rouge_scorer = Rouge()
rouge_score, _ = rouge_scorer.compute_score(gts, res)
print("ROUGE-L:", rouge_score)

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
openjdk-11-jre-headless is already the newest version (11.0.30+7-1ubuntu1~22.04).
0 upgraded, 0 newly installed, 0 to remove and 43 not upgraded.
CIDEr: 0.47347128983641257
SPICE: 0.13837730475174786
METEOR: 0.18053706344795803
{'testlen': 2795, 'reflen': 3130, 'guess': [2795, 2476, 2157, 1838], 'correct': [1582, 530, 161, 48]}
ratio: 0.8929712460061044
BLEU-1..4: [0.5020783355455046, 0.30876051974881624, 0.1848085349695184, 0.10996441156300697]
ROUGE-L: 0.33061548424527143


In [52]:
# Final summary: every metric rendered with four decimal places
print("CLIP similarity (mean):", format(clipscore_mean, ".4f"))
print("CIDEr:", format(cider_score, ".4f"))
print("SPICE:", format(spice_score, ".4f"))
print("METEOR:", format(meteor_score, ".4f"))
# BLEU-1..4 on a single line, space separated
print("BLEU:", *(format(bleu_scores[i], ".4f") for i in range(4)))
print("ROUGE:", format(rouge_score, ".4f"))


CLIP similarity (mean): 0.3106
CIDEr: 0.4735
SPICE: 0.1384
METEOR: 0.1805
BLEU: 0.5021 0.3088 0.1848 0.1100
ROUGE: 0.3306
