In [3]:
# `!cd ...` runs in a throwaway subshell, so it never changes the kernel's
# working directory or import path. Register the project's src/ directory on
# sys.path instead, which leaves the working directory (and therefore every
# relative data path used later in the notebook) untouched.
# NOTE(review): presumably this is what lets the project's flat imports
# (e.g. the `gemma` module named in the ModuleNotFoundError below) resolve --
# confirm against the repository layout.
import os
import sys

PROJECT_SRC_DIR = os.path.abspath("DI725_Final_Project/src/")
if PROJECT_SRC_DIR not in sys.path:  # keep the cell idempotent on re-run
    sys.path.append(PROJECT_SRC_DIR)

In [6]:
import os

import pandas as pd
import torch
from PIL import Image
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer

from DI725_Final_Project.src.processing_paligemma import PaliGemmaProcessor
from DI725_Final_Project.src.utils import load_hf_model

ModuleNotFoundError: No module named 'gemma'

In [1]:
from google.colab import drive

# Mount point passed to drive.mount; the recorded cell output reports
# "Mounted at /content/drive".
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# === Configuration ===
# Relative paths: the captions CSV, the image directory handed to the dataset,
# and the directory the tokenizer and model are loaded from.
CSV_PATH = "captions.csv"
IMAGE_DIR = "data/images/"
MODEL_PATH = "paligemma_model/"

# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [None]:
# === Load CSV ===
df = pd.read_csv(CSV_PATH)

# Select the rows whose 'split' column equals 'train' / 'val' and renumber
# each subset from zero so positional and label indexing agree.
train_df, val_df = (
    df[df['split'] == split_name].reset_index(drop=True)
    for split_name in ('train', 'val')
)

In [None]:
# === Tokenizer & Processor ===
# Tokenizer loaded from MODEL_PATH with right-side padding; it is the one
# handed to the processor on the next line.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, padding_side="right")
# NOTE(review): num_image_tokens=196 and image_size=224 are hard-coded here;
# confirm they match the vision configuration of the checkpoint at MODEL_PATH.
processor = PaliGemmaProcessor(tokenizer, num_image_tokens=196, image_size=224)

# === Load Model ===
# load_hf_model's result is unpacked into (model, tokenizer), so the name
# `tokenizer` is rebound here while `processor` keeps the instance created
# above.
# NOTE(review): presumably the two tokenizers are equivalent -- confirm, or
# build the processor from the one returned here.
model, tokenizer = load_hf_model(MODEL_PATH, DEVICE)
# Moves the model to DEVICE and calls .train() on the result (training mode,
# assuming a torch nn.Module -- TODO confirm). As the cell's last expression,
# the returned object is also displayed by the notebook.
model.to(DEVICE).train()

In [None]:
# === Dataset Class ===
class CaptionDataset(Dataset):
    """Map-style dataset that pairs each image file with one caption.

    Args:
        dataframe: Table with one row per sample. It must contain the columns
            named by ``image_column`` and ``caption_column``.
        processor: Callable invoked as ``processor(text=[caption],
            images=[image])``. Its result is indexed with ``"input_ids"``,
            ``"attention_mask"`` and ``"pixel_values"``, and each value must
            support ``.squeeze(0)``.
        image_dir: Directory holding the image files. A trailing path
            separator is optional.
        image_column: Column holding the image file name (default ``"image"``).
        caption_column: Column holding the caption text
            (default ``"caption_1"``).
    """

    def __init__(self, dataframe, processor, image_dir,
                 image_column="image", caption_column="caption_1"):
        self.df = dataframe
        self.processor = processor
        self.image_dir = image_dir
        self.image_column = image_column
        self.caption_column = caption_column

    def __len__(self):
        """Return the number of rows (samples) in the dataframe."""
        return len(self.df)

    def _image_path(self, filename):
        """Join ``image_dir`` and ``filename`` into the image's path."""
        # os.path.join inserts a separator only when one is missing, so both
        # "data/images" and "data/images/" resolve to the same file.
        return os.path.join(self.image_dir, filename)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return the processed model inputs.

        Returns:
            dict with ``input_ids``, ``attention_mask``, ``pixel_values`` and
            ``labels`` (the same token ids as ``input_ids``), each with the
            leading dimension removed via ``squeeze(0)``, plus the raw caption
            under ``text``.
        """
        row = self.df.iloc[idx]
        caption = row[self.caption_column]

        # The context manager closes the file handle deterministically;
        # convert() returns an independent in-memory image that stays valid
        # after the handle is closed.
        with Image.open(self._image_path(row[self.image_column])) as handle:
            image = handle.convert("RGB")

        inputs = self.processor(text=[caption], images=[image])
        input_ids = inputs["input_ids"].squeeze(0)

        return {
            "input_ids": input_ids,
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "pixel_values": inputs["pixel_values"].squeeze(0),
            "labels": input_ids,
            "text": caption
        }

In [None]:
# === DataLoader ===
BATCH_SIZE = 8

train_dataset = CaptionDataset(dataframe=train_df, processor=processor, image_dir=IMAGE_DIR)
val_dataset = CaptionDataset(dataframe=val_df, processor=processor, image_dir=IMAGE_DIR)

# Only the training loader shuffles; validation keeps the dataframe order.
# NOTE(review): no collate_fn is given, so the default collation applies --
# this assumes every sample's tensors have matching shapes; confirm that the
# processor pads or truncates to a fixed length.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# === Optimizer ===
LEARNING_RATE = 5e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [None]:
# === Training Loop ===
# Training and validation live in ONE cell: the validation code was indented
# as part of the epoch loop but had been split into its own cell, where it
# raised IndentationError and could never run once per epoch.
NUM_EPOCHS = 3


def _forward_batch(batch):
    """Move one collated batch's tensors to DEVICE and run the model on them.

    The "text" entry of the batch is not passed to the model. Returns the
    model's output object, whose ``.loss`` attribute is read by the caller.
    """
    return model(
        input_ids=batch['input_ids'].to(DEVICE),
        attention_mask=batch['attention_mask'].to(DEVICE),
        pixel_values=batch['pixel_values'].to(DEVICE),
        labels=batch['labels'].to(DEVICE)
    )


def _mean_loss(loss_sum, num_batches):
    """Average a summed loss over the batch count; NaN when there are none."""
    # An empty loader (e.g. no rows for that split) would otherwise raise
    # ZeroDivisionError in the middle of the run.
    return loss_sum / num_batches if num_batches else float("nan")


for epoch in range(NUM_EPOCHS):
    # --- Training ---
    total_loss = 0
    model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1} - Training"):
        optimizer.zero_grad()

        outputs = _forward_batch(batch)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = _mean_loss(total_loss, len(train_loader))
    print(f"Epoch {epoch + 1} | Avg Train Loss: {avg_train_loss:.4f}")

    # === Validation ===
    # eval() + no_grad(): no parameter updates and no gradient bookkeeping
    # while scoring; model.train() at the top of the loop restores training
    # mode for the next epoch.
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch + 1} - Validation"):
            outputs = _forward_batch(batch)
            val_loss += outputs.loss.item()

    avg_val_loss = _mean_loss(val_loss, len(val_loader))
    print(f"Epoch {epoch + 1} | Avg Val Loss: {avg_val_loss:.4f}")