In [None]:
# Refresh the apt package index, then install the libvips system packages and
# the pyvips Python package.
# NOTE(review): nothing in this notebook imports pyvips directly — presumably it is
# needed by the remotely loaded vision model; confirm before removing.
!apt-get update
!apt-get install -y libvips42 libvips-dev
!pip install pyvips

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import torch
from transformers import MarianMTModel, MarianTokenizer
from torch.utils.data import DataLoader, Dataset
import json

In [None]:
# Select the compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# Bare expression so the notebook displays which device was picked.
device

In [None]:
# Google Drive has to be mounted before any cell reads from `root`: the dataset
# CSV is loaded from this folder further down, ahead of the notebook's own
# `drive.mount` cell, so a fresh "Restart & Run All" would otherwise fail.
try:
    from google.colab import drive
except ImportError:
    # Not running on Colab: assume the folder below is already reachable.
    pass
else:
    drive.mount('/content/drive')

# Base folder (on the mounted Drive) holding the CSV files, images and outputs.
root = "/content/drive/MyDrive/MASTER_THESIS/"

In [None]:
# Record IDs whose image files are corrupted; their rows are dropped.
CORRUPTED_RECORD_IDS = [11546, 5262]
# Fixed seed so the shuffle below yields the same row order on every run, which
# keeps batches (and partially saved outputs) comparable across kernel restarts.
SHUFFLE_SEED = 42

FULL_DATASET = (
    pd.read_csv(root + "private_data/CSV/fabritius_data_filtered_downloaded.csv")
    # Remove rows with corrupted images
    .loc[lambda df: ~df["recordID"].isin(CORRUPTED_RECORD_IDS)]
    # Shuffle every row, then renumber the index from 0
    .sample(frac=1.0, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
FULL_DATASET

In [None]:
def fixPath(path):
    """
    Return `path` with every ".././" sequence collapsed to "../".
    """
    pieces = path.split(".././")
    return "../".join(pieces)

def get_image_path_from_recordID(dataset, recordID):
    """
    Resolve the local image path of the record identified by `recordID`.

    Looks up the "low_res_filename" of the first row of `dataset` whose
    "recordID" equals `recordID`, drops that filename's first character,
    prefixes it with `root + "images/"` and normalises it with `fixPath`.

    Returns None when no row matches.
    """
    matches = dataset.loc[dataset["recordID"] == recordID, "low_res_filename"].values
    if len(matches) == 0:
        return None

    # The stored filename's leading character is skipped before joining.
    relative = matches[0][1:]
    return fixPath(root + "images/" + relative)

In [None]:
# Mount Google Drive at /content/drive, the prefix that `root` points into.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
class ImageTextDataset(Dataset):
    """
    Map-style dataset yielding ``(recordID, image)`` pairs, one per DataFrame row.

    Parameters
    ----------
    dataframe
        DataFrame with a "recordID" column; each row is one item.
    getImageFromRecordID
        Callable ``(dataframe, recordID) -> path`` that resolves the image file
        of a record. It may return None when no path is known.
    """

    def __init__(self, dataframe, getImageFromRecordID):
        self.dataframe = dataframe
        self.getImageFromRecordID = getImageFromRecordID

    def __len__(self):
        """Number of items, i.e. number of rows in the DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """
        Return ``(recordID, image)`` for the row at position `idx`.

        Raises
        ------
        FileNotFoundError
            If the path resolver returns None for this record.
        """
        row = self.dataframe.iloc[idx]
        recordID = row['recordID']

        path = self.getImageFromRecordID(self.dataframe, recordID)
        if path is None:
            # Fail with an explicit message instead of an opaque error inside PIL.
            raise FileNotFoundError(f"No image path found for recordID {recordID!r}")

        # Image.open is lazy and keeps the file open until the pixels are read.
        # Loading inside the context manager reads the data and releases the file
        # handle, so iterating over the whole dataset does not leak descriptors.
        with Image.open(path) as image:
            image.load()

        return recordID, image
    
def collate_as_lists(batch):
    """
    Turn a list of ``(recordID, image)`` samples into ``(recordIDs, images)`` lists.

    The DataLoader's default collate function only knows how to batch tensors,
    arrays, numbers, dicts and lists, so it raises a TypeError on PIL images.
    Keeping the samples as plain Python lists avoids that.
    """
    recordIDs, images = zip(*batch)
    return list(recordIDs), list(images)

# Test
dataset = ImageTextDataset(FULL_DATASET, get_image_path_from_recordID)
# The name `dalaloader` (sic) is kept because a later cell iterates over it.
dalaloader = DataLoader(dataset, batch_size=16, shuffle=False, collate_fn=collate_as_lists)
# Smoke test: fetch a single batch, report its size and show its first image.
for recordIDs, images in dalaloader:
    print(len(recordIDs), len(images))
    plt.imshow(images[0], cmap='gray')
    plt.show()
    break

In [None]:
# Load the pre-trained translation model and its tokenizer (MarianMT checkpoint
# "Helsinki-NLP/opus-mt-en-fr"); later cells use them to turn the English
# captions into French ones.
# No device placement is done here: the model stays wherever from_pretrained puts it.
model_name = "Helsinki-NLP/opus-mt-en-fr"
T_tokenizer = MarianTokenizer.from_pretrained(model_name)
T_model = MarianMTModel.from_pretrained(model_name)

In [None]:
# Vision-language model used for captioning and object detection.
model_id = "vikhyatk/moondream2"
revision = "2025-01-09"  # Pin to specific version
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # Runs modelling code shipped with the checkpoint; the pinned revision above
    # limits that to a known version.
    trust_remote_code=True,
    revision=revision,
    # Place the whole model on the device selected earlier in the notebook
    # instead of hardcoding "cuda", so the CPU fallback is honoured.
    device_map={"": device}
)
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

In [None]:
"""
Animal Bird, Butterfly, Cat, Chicken, Cow, Dog, Donkey, Fish, Horse, Insect, Mouse, Rabbit, Reptile, Sheep
Architecture Bridge, Castle, Church, Door, House, Mill, Pillar, Staircase, Window
Christianity Angel, Cross, Devil, God, Jesus Christ, Saint, Virgin Mary
Clothing Bag, Belt, Cane, Crown, Dress, Gloves, Hat, Jewellery, Mask, Shoes, Tie, Umbrella
Food Apple, Banana, Bread, Cheese, Grapes, Lobster, Orange, Pineapple, Vegetable, Watermelon, Wine
Furniture Bathtub, Bed, Chair, Easel, Sofa, Table
Human Baby, Child, Face, Hand, Man, Woman
Instrument Drum, Flute, Guitar, Harp, Piano, Violin
Interior Bird Cage, Book, Bottle, Bow, Cup, Drapery, Flag, Globe, Lamp, Mirror, Paper, Vase
Nature Bush, Cloud, Fire, Flower, Lake, Lightning, Moon, Mountain, Plant, Rock, Sea, Sky, Sun, Tree
Occultism Demon, Ghost, Skeleton, Skull, Star
Vehicle Airplane, Bicycle, Boat, Car, Carriage, Ship, Train, Wheel
Weaponry Armor, Arrow, Bow, Firearm, Hammer, Helmet, Rope, Shield, Spear, Sword
"""

In [None]:
# Object names passed one by one to the detector; the order of this list is
# also the order of the per-object results stored for every record.
objectsList = [
    "man",
    "woman",
    "tree",
    "house",
    "sea",
    "river",
    "sun",
    "flower",
    "dog",
    "cat",
    "bird",
    "horse",
    "chicken",
    "castle",
    "church",
    "door",
    "mountain",  # was misspelt "moutain", which was sent verbatim to the detector
    "cloud",
    "boat"
]

In [None]:
def saveOutputs(outputs):
    """
    Serialise `outputs` as JSON into "outputs.json" under `root`,
    overwriting any previous content of that file.
    """
    destination = root + "outputs.json"
    with open(destination, "w") as handle:
        json.dump(outputs, handle)

In [None]:
# Number of batches to process before stopping. The loop previously always
# stopped after the first batch; set this to None to run over the whole dataset.
MAX_BATCHES = 1
# Write a checkpoint of `outputs` to disk every this many batches.
SAVE_EVERY = 10

# recordID -> {"caption_EN", "caption_FR", "objects"}
outputs = {}

batch_index = 0
for recordIDs, images in dalaloader:
    # Get captions. The vision model is called once per image, so that
    # captions_EN[i] is the caption of images[i].
    # NOTE(review): every caption/detect call below is given the raw image; if the
    # model can reuse a pre-computed image encoding, encoding once per image
    # would avoid repeated work — verify against the model's documentation.
    captions_EN = [model.caption(image, length="short")["caption"] for image in images]
    # Get translated captions (inputs are moved to wherever the translation model lives)
    inputs = T_tokenizer(
        captions_EN, return_tensors="pt", padding=True, truncation=True
    ).to(T_model.device)
    # Perform translation
    translated = T_model.generate(**inputs)
    # Decode the translated text. `generate` returns one token sequence per
    # caption, so the batch variant is needed to get one string per caption.
    captions_FR = T_tokenizer.batch_decode(translated, skip_special_tokens=True)
    # Get objects: for each object name, one detection result per image
    objsOutputs = {
        obj: [model.detect(image, obj)["objects"] for image in images]
        for obj in objectsList
    }
    # Save outputs
    for i, recordID in enumerate(recordIDs):
        # numpy / tensor scalars are not valid JSON keys; unwrap to a Python value.
        key = recordID.item() if hasattr(recordID, "item") else recordID
        outputs[key] = {
            "caption_EN": captions_EN[i],
            "caption_FR": captions_FR[i],
            # Same order as objectsList
            "objects": [objsOutputs[obj][i] for obj in objectsList]
        }

    batch_index += 1

    if batch_index % SAVE_EVERY == 0:
        saveOutputs(outputs)

    if MAX_BATCHES is not None and batch_index >= MAX_BATCHES:
        break

print(len(outputs))

saveOutputs(outputs)

In [None]:
# Show the collected results as the cell output.
outputs