In [3]:
!cd DI725_Final_Project/src/

In [1]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from PIL import Image
import torch
from tqdm import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# === Configuration ===
CSV_PATH = "DI725_Final_Project/data/RISCM/captions.csv"
IMAGE_DIR = "/content/drive/MyDrive/data/RISCM/resized/"
MODEL_ID = "google/paligemma-3b-mix-224"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32

In [4]:
# === Load CSV ===
df = pd.read_csv(CSV_PATH)
train_df = df[df['split'] == 'train'].reset_index(drop=True)
val_df = df[df['split'] == 'val'].reset_index(drop=True)

In [5]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
The token `hf-token` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-aut

In [6]:
# === Load Model and Processor ===
model = PaliGemmaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
    device_map="cuda:0"
).to(DEVICE)
model.gradient_checkpointing_enable()
model.train()

processor = AutoProcessor.from_pretrained(MODEL_ID)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/62.6k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/40.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.26M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

In [7]:
# === Dataset Class ===
class CaptionDataset(Dataset):
    def __init__(self, dataframe, processor, image_dir):
        self.df = dataframe
        self.processor = processor
        self.image_dir = image_dir

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = self.image_dir + row['image']
        caption = row['caption_1']
        image = Image.open(image_path).convert("RGB")

        prompt = "<image> " + processor.tokenizer.bos_token + " caption en"
        model_inputs = self.processor(
            text=prompt,
            images=image,
            suffix=caption,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=64,
            add_special_tokens=False
        )
        input_ids = model_inputs["input_ids"].squeeze(0)
        attention_mask = model_inputs["attention_mask"].squeeze(0)
        pixel_values = model_inputs["pixel_values"].squeeze(0)

        labels = self.processor(
            text=caption,
            images=image,
            return_tensors="pt",
            padding="max_length",
            max_length=64,
            truncation=True
        )["input_ids"].squeeze(0)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "pixel_values": pixel_values,
            "labels": labels,
            "text": caption
        }

In [8]:
# === DataLoader ===
train_dataset = CaptionDataset(train_df, processor, IMAGE_DIR)
val_dataset = CaptionDataset(val_df, processor, IMAGE_DIR)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)

# === Optimizer ===
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [9]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
# === Training Loop ===
for epoch in range(3):
    total_loss = 0
    model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1} - Training"):
        optimizer.zero_grad()
        with torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16):
            outputs = model(
                input_ids=batch['input_ids'].to(DEVICE),
                attention_mask=batch['attention_mask'].to(DEVICE),
                pixel_values=batch['pixel_values'].to(DEVICE),
                labels=batch['labels'].to(DEVICE)
            )
            loss = outputs.loss

        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()


        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1} | Avg Train Loss: {avg_train_loss:.4f}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 1 - Training:  86%|████████▌ | 30615/35614 [5:32:56<53:21,  1.56it/s]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.
Epoch 1 - Training:  86%|████████▌ | 30616/35614 [5:32:57<54:03,  1.54it/s]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.
Epoch 1 - Training:  86%|████████▌ | 30617/35614 [5:32:58<53:59,  1.54it/s]You are passing both `text` and `images` to 

Epoch 1 | Avg Train Loss: 0.0436


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 2 - Training:  52%|█████▏    | 18444/35614 [1:47:30<1:38:50,  2.90it/s]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.
Epoch 2 - Training:  52%|█████▏    | 18445/35614 [1:47:30<1:39:11,  2.88it/s]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.
Epoch 2 - Training:  52%|█████▏    | 18446/35614 [1:47:31<1:40:01,  2.86it/s]You are passing both `text` and `image

In [None]:
    # === Validation ===
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch + 1} - Validation"):
            with torch.cuda.amp.autocast(dtype=torch.bfloat16):
                outputs = model(
                    input_ids=batch['input_ids'].to(DEVICE),
                    attention_mask=batch['attention_mask'].to(DEVICE),
                    pixel_values=batch['pixel_values'].to(DEVICE),
                    labels=batch['labels'].to(DEVICE)
                )
                val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch + 1} | Avg Val Loss: {avg_val_loss:.4f}")

In [None]:
# === Save Final Model ===
model.save_pretrained("/content/drive/MyDrive/paligemma-finetuned")
processor.save_pretrained("/content/drive/MyDrive/paligemma-finetuned")