In [None]:
!nvcc --version

In [None]:
!apt-get update
!apt-get install -y libvips42 libvips-dev
!pip install pyvips

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import torch
from transformers import MarianMTModel, MarianTokenizer
from torch.utils.data import DataLoader, Dataset
import json
import tqdm
import os

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
# Base directory on the mounted Google Drive; the dataset CSV, outputs.csv and
# the images/ folder are all addressed relative to it.
root = "/content/drive/MyDrive/MASTER_THESIS/"

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the record table, drop the two records whose images are corrupted,
# then shuffle the remaining rows and renumber the index from zero.
FULL_DATASET = (
    pd.read_csv(root + "fabritius_data_filtered_downloaded.csv")
    .loc[lambda frame: ~frame["recordID"].isin([11546, 5262])]
    .sample(frac=1.0)
    .reset_index(drop=True)
)
FULL_DATASET

In [None]:
def loadOutputs():
    """Read the saved captions table from ``outputs.csv`` under ``root``.

    Returns:
        pandas.DataFrame: the parsed contents of the file.
    """
    outputs_path = root + "outputs.csv"
    return pd.read_csv(outputs_path)

def saveOutputs(df):
    """Write ``df`` to ``outputs.csv`` under ``root`` without the index column.

    The CSV is first written to a temporary sibling file and then moved over
    the real one, so an interruption in the middle of a write does not leave a
    truncated ``outputs.csv`` behind. That file is what the notebook reloads
    to decide which records are already captioned.

    Args:
        df: DataFrame to persist.
    """
    outputs_path = root + "outputs.csv"
    temporary_path = outputs_path + ".tmp"
    df.to_csv(temporary_path, index=False)
    # os.replace overwrites an existing destination file.
    os.replace(temporary_path, outputs_path)

# Resume from an existing outputs.csv when there is one; otherwise start with
# an empty table that has the expected columns.
captions = (
    loadOutputs()
    if os.path.exists(root + "outputs.csv")
    else pd.DataFrame(columns=["recordID", "question", "caption_EN", "caption_FR"])
)

captions

In [None]:
# Keep only the records that do not have any caption row yet.
done_recordIDs = captions["recordID"].unique()
already_captioned = FULL_DATASET["recordID"].isin(done_recordIDs)
FULL_DATASET = FULL_DATASET.loc[~already_captioned]
FULL_DATASET

In [None]:
def fixPath(path):
    """Return ``path`` with every ``.././`` segment collapsed to ``../``."""
    redundant_segment, clean_segment = ".././", "../"
    return path.replace(redundant_segment, clean_segment)

def get_image_path_from_recordID(dataset, recordID):
    """
    Resolve the local image path of the record identified by ``recordID``.

    Takes the first row of ``dataset`` whose "recordID" equals ``recordID``,
    drops the first character of its "low_res_filename" value, appends the
    remainder to ``root + "images/"`` and passes the result through
    ``fixPath``.

    Returns ``None`` when no row matches.
    """
    matching_filenames = dataset.loc[dataset["recordID"] == recordID, "low_res_filename"]
    if matching_filenames.empty:
        return None

    # The leading character of the stored filename is skipped before joining.
    relative_filename = matching_filenames.iloc[0]
    return fixPath(root + "images/" + relative_filename[1:])

In [None]:
class ImageTextDataset(Dataset):
    """Dataset yielding ``(recordID, PIL image)`` pairs for the rows of a DataFrame.

    Args:
        dataframe: table with a "recordID" column; one sample per row.
        getImageFromRecordID: callable ``(dataframe, recordID) -> path`` that
            resolves a record to its image path, or ``None`` when it cannot.
    """

    # Substring substitutions applied, in this order, to every resolved path.
    # NOTE(review): presumably these align the paths stored in the CSV with
    # the folder names actually used on disk - confirm against the image folder.
    _PATH_FIXES = (
        ("internet", "Internet"),
        ("Mod", "mod"),
        ("Old", "old"),
        ("Stefaan", "stefaan"),
        ("Art-Foto", "art-foto"),
    )

    def __init__(self, dataframe, getImageFromRecordID):
        self.dataframe = dataframe
        self.getImageFromRecordID = getImageFromRecordID

    def __len__(self):
        """Number of samples, i.e. number of rows in the DataFrame."""
        return len(self.dataframe)

    @classmethod
    def _normalize_path(cls, path):
        """Apply the ``_PATH_FIXES`` substitutions to ``path`` and return it."""
        for wrong, right in cls._PATH_FIXES:
            path = path.replace(wrong, right)
        return path

    def __getitem__(self, idx):
        """Return ``(recordID, image)`` for the row at position ``idx``.

        Raises:
            FileNotFoundError: if the lookup callable returns ``None`` for the
                record, or (from PIL) if the resolved file does not exist.
        """
        row = self.dataframe.iloc[idx]

        recordID = row['recordID']

        path = self.getImageFromRecordID(self.dataframe, recordID)
        if path is None:
            # Fail with an explicit message instead of an AttributeError on
            # None.replace(...) further down.
            raise FileNotFoundError(f"No image path found for recordID {recordID}")

        image = Image.open(self._normalize_path(path))
        # Image.open is lazy: read the pixel data now so decoding problems
        # surface here and the underlying file does not stay open.
        image.load()

        return recordID, image

import os
# Verify that all image paths exist before starting the captioning run.
# Iterating the column directly (instead of a zip) lets tqdm show the total.
for recordID in tqdm.tqdm(FULL_DATASET["recordID"]):
    path = get_image_path_from_recordID(FULL_DATASET, recordID)
    assert path is not None, f"No image path for recordID: {recordID}"
    # Same substitutions that ImageTextDataset.__getitem__ applies before opening.
    path = path.replace("internet", "Internet")
    path = path.replace("Mod", "mod")
    path = path.replace("Old", "old")
    path = path.replace("Stefaan", "stefaan")
    path = path.replace("Art-Foto", "art-foto")
    assert os.path.exists(path), f"Image file not found: {path}"

# Test: build the dataset/loader and display the first image as a smoke check.
dataset = ImageTextDataset(FULL_DATASET, get_image_path_from_recordID)
dalaloader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    # Keep each batch as a (recordIDs, images) pair of tuples.
    collate_fn=lambda samples: tuple(zip(*samples)),
)
first_batch = next(iter(dalaloader), None)
if first_batch is not None:
    recordIDs, images = first_batch
    print(len(recordIDs), len(images))
    plt.imshow(images[0], cmap='gray')
    plt.show()

In [None]:
# Load the pre-trained model and tokenizer
# (MarianMT checkpoint used by translate_to_french for English -> French).
model_name = "Helsinki-NLP/opus-mt-en-fr"
T_tokenizer = MarianTokenizer.from_pretrained(model_name)
# No device placement is requested here (there is no .to(...) call).
T_model = MarianMTModel.from_pretrained(model_name)

# English to French translation function
def translate_to_french(text):
    """Translate English ``text`` to French with the MarianMT model.

    Args:
        text: English string to translate.

    Returns:
        str: the decoded French translation with special tokens removed, or
        an empty string when ``text`` is an empty / whitespace-only string.
    """
    # Nothing to translate: skip the model call instead of decoding whatever
    # the model generates for an empty prompt.
    if isinstance(text, str) and not text.strip():
        return ""

    # Tokenize input text
    inputs = T_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Keep the input tensors on the same device as the model so the function
    # keeps working if T_model is moved to the GPU.
    inputs = inputs.to(T_model.device)
    # Perform translation
    translated = T_model.generate(**inputs)
    # Decode the translated text
    return T_tokenizer.decode(translated[0], skip_special_tokens=True)

In [None]:
# Load the Moondream2 vision-language model and its tokenizer at a pinned revision.
model_id = "vikhyatk/moondream2"
revision = "2025-01-09"  # Pin to specific version
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,  # NOTE: runs modelling code shipped in the model repo
    revision=revision,
    # Place the whole model on the device detected earlier instead of a
    # hard-coded "cuda", which fails on CPU-only runtimes.
    device_map={"": device},
)
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

In [None]:
# One prompt per aspect the model is asked to describe in a short caption.
questions = [
    f"What is a short caption for this image where you speak about {topic} ?"
    for topic in ("objects", "colors", "luminosity", "emotions")
]

In [None]:
def addRow(df, recordID, question, caption_EN, caption_FR):
    """Append one caption record to ``df`` in place.

    The new row is stored under the index label ``len(df)``, with the values
    in the order recordID, question, caption_EN, caption_FR.
    """
    next_label = len(df)
    new_row = [recordID, question, caption_EN, caption_FR]
    df.loc[next_label] = new_row

# Number of batches processed between two checkpoint writes of outputs.csv.
SAVE_EVERY_N_BATCHES = 3

# IDs that already have captions. A set gives constant-time membership tests
# instead of scanning the whole "recordID" column for every image.
captioned_recordIDs = set(captions["recordID"])

batch_index = 0
try:
    for batch_index, (recordIDs, images) in enumerate(tqdm.tqdm(dalaloader), start=1):
        # Get captions
        for recordID, image in zip(recordIDs, images):
            if recordID in captioned_recordIDs:
                continue

            # Classic short caption
            caption_EN = model.caption(image, length="short")["caption"]
            caption_FR = translate_to_french(caption_EN)

            # Get additional captions using questions
            questionsOutputs = []
            for question in questions:
                answer_EN = model.query(image, question)["answer"]
                answer_FR = translate_to_french(answer_EN)
                questionsOutputs.append((question, answer_EN, answer_FR))

            # Add all at once, only after every model call for this record
            # succeeded, so a failing call never leaves a half-captioned record.
            addRow(captions, recordID, "caption", caption_EN, caption_FR)
            for question, answer_EN, answer_FR in questionsOutputs:
                addRow(captions, recordID, question, answer_EN, answer_FR)
            captioned_recordIDs.add(recordID)

        if batch_index % SAVE_EVERY_N_BATCHES == 0:
            saveOutputs(captions)
finally:
    # Persist whatever has been produced, including when the run is
    # interrupted or a model call raises, so no completed captions are lost.
    saveOutputs(captions)

captions

In [None]:
# Spot-check: print every prompt with its French caption for the record
# stored under index label 0 of the captions table.
recordID = captions["recordID"][0]
rows = captions.loc[captions["recordID"] == recordID]
for question, cFR in zip(rows["question"], rows["caption_FR"]):
    # Prompt, French caption, then a blank separator line.
    print(question, cFR, "", sep="\n")