In [25]:
import os
import pickle
from ast import literal_eval

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from dotenv import load_dotenv
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel

In [26]:
# Load the project variables from the private .env file
load_dotenv("../../private_data/.env")

# PARENT gets us to the root of the project
PARENT = "./../../"


def _project_path(env_var):
    """Return PARENT followed by the relative path stored in environment variable `env_var`.

    Raises:
        KeyError: if `env_var` is not set. Without this check the string
            concatenation below would fail with an unhelpful
            "can only concatenate str (not NoneType)" TypeError.
    """
    relative_path = os.getenv(env_var)
    if relative_path is None:
        raise KeyError(
            f"Environment variable {env_var!r} is not set; "
            "check that ../../private_data/.env exists and defines it"
        )
    return PARENT + relative_path


FOLDER_TABLE = _project_path("FOLDER_TABLE")
FILE_FABRITIUS_DATA = _project_path("FILE_FABRITIUS_DATA")
FILE_FABRITIUS_DATA_FILTERED = _project_path("FILE_FABRITIUS_DATA_FILTERED")
FILE_FABRITIUS_DATA_FILTERED_DOWNLOADED = _project_path("FILE_FABRITIUS_DATA_FILTERED_DOWNLOADED")
FOLDER_FIGURES = _project_path("FOLDER_FIGURES")
IMAGES_FOLDER = _project_path("IMAGES_FOLDER")

DB_INPUT_ARTPIECES = _project_path("DB_INPUT_ARTPIECES")
DB_INPUT_ARTISTS = _project_path("DB_INPUT_ARTISTS")

BENCHMARK_2_ATTACHED = _project_path("BENCHMARK_2_ATTACHED")
BENCHMARK_2_EXPLODED = _project_path("BENCHMARK_2_EXPLODED")

FILE_SUBJECTMATTERS_PARSED = _project_path("FILE_SUBJECTMATTERS_PARSED")

# Use the first GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [27]:
# Get the artworks data
# Later cells read its "recordID" and "imageLowResFilename" columns.
ARTWORKS = pd.read_csv(DB_INPUT_ARTPIECES)

In [28]:
# Methods to get an image from a recordID
def fixPath(path):
    """Return `path` with every ".././" segment collapsed into "../"."""
    pieces = path.split(".././")
    return "../".join(pieces)

# Substring substitutions applied, in this exact order, to every rebuilt image path
_PATH_SUBSTITUTIONS = (
    ("internet", "Internet"),
    ("Mod", "mod"),
    ("MOD", "mod"),
    ("Old", "old"),
    ("Stefaan", "stefaan"),
    ("Art-Foto", "art-foto"),
    ("\\", "/"),
)

# Lookup table: recordID -> path of the low-resolution image
recordID_to_imageLowResFilename = {}
for recordID, raw_filename in zip(ARTWORKS["recordID"], ARTWORKS["imageLowResFilename"]):
    # Fix the imageLowResFilename: drop its first character, prefix the images folder,
    # then normalise the spelling of the folder names and the path separators
    path = fixPath(IMAGES_FOLDER + raw_filename[1:])
    for old, new in _PATH_SUBSTITUTIONS:
        path = path.replace(old, new)

    recordID_to_imageLowResFilename[recordID] = path

# Test it: report every rebuilt path that is not present on disk
missing_paths = [
    path for path in recordID_to_imageLowResFilename.values() if not os.path.exists(path)
]
for path in missing_paths:
    print(f"Path does not exist: {path}")

In [29]:
def v_literal_eval(val):
    """Parse `val` as a Python literal, returning NaN when it is not a valid one.

    Args:
        val: text expected to hold a Python literal (e.g. "[6622, 7295]").

    Returns:
        The evaluated literal, or np.nan if `val` cannot be evaluated.
    """
    try:
        return literal_eval(val)
    # TypeError covers literals that parse but cannot be built,
    # e.g. a dict key or set member that is an unhashable list.
    except (ValueError, SyntaxError, TypeError):
        return np.nan

In [30]:
# "-ATTACHED" benchmark table. Later cells use its "recordID" column and its
# "proper_nouns" column (the text embedded for each row).
DATA_ATTACHED = pd.read_csv(BENCHMARK_2_ATTACHED)
DATA_ATTACHED

Unnamed: 0,recordID,proper_nouns,tokenized_length
0,64,"Jésus, Evangiles, Calvaire, Jérusalem, Passion...",31
1,78,Louise van der Hecht,7
2,79,Robert Schumann,5
3,81,Marguerite Khnopff,8
4,105,"Cupidon, Ariane, Bacchus",11
...,...,...,...
1896,11252,Namur,4
1897,11521,"Andromède, Céto, Persée",12
1898,11525,"Nicolas-Henri Tardieu, Marie-Anne Hortemels",15
1899,11533,"Rik Wouters, Nel Wouters, Amsterdam",13


In [31]:
# "-EXPLODED" benchmark table. The "recordID" cells are parsed with v_literal_eval
# while reading, so each becomes a Python object (NaN when the cell cannot be parsed).
DATA_EXPLODED = pd.read_csv(BENCHMARK_2_EXPLODED, converters={'recordID': v_literal_eval})
# Sanity check on the parsed column's dtype
print(DATA_EXPLODED["recordID"].sample(1).dtype)
DATA_EXPLODED

object


Unnamed: 0,proper_nouns,recordID
0,Aar,[8437]
1,Ab Urbe Condita,[1369]
2,Abigaïl,[5299]
3,Abraham,"[6622, 7295]"
4,Abraham Bloemaert,[5354]
...,...,...
1660,Zélande,"[625, 1598, 7587, 8256]"
1661,Égypte,[1062]
1662,Élysée Reclus,[10189]
1663,Énée,[10241]


# Get the model

In [32]:
# Hugging Face checkpoint names.
# NOTE(review): the two names below are not referenced by the model-loading cell, which
# uses base_model_base for the february/march fine-tuned weights — confirm before removing.
base_model_february_finetuned = "openai/clip-vit-large-patch14" # art-base
base_model_march_finetuned = "openai/clip-vit-large-patch14" # art-base

# Architectures on top of which the fine-tuned "art-*" weight files are loaded
base_model_mini = "openai/clip-vit-base-patch32" # art-mini
base_model_base = "openai/clip-vit-large-patch14" # art-base
base_model_large = "openai/clip-vit-large-patch14-336" # art-large

# Same checkpoints, used as-is (no extra weights loaded) for the "basic-*" models
basic_mini = "openai/clip-vit-base-patch32"
basic_base = "openai/clip-vit-large-patch14"
basic_large = "openai/clip-vit-large-patch14-336"

In [33]:
# Model to benchmark. Names with a dedicated branch in the model-loading cell
# (each was marked "OK" in the previous version of this cell):
#   "basic-mini", "basic-base", "basic-large",
#   "art-mini", "art-base", "art-large",
#   "february_finetuned", "march_finetuned"
# Any other name is loaded from "<model_name>.pt" on top of the art-base architecture.
model_name = "art-base-TextFT"

print(f"Running benchmark on: {model_name}")

Running benchmark on: art-base-TextFT


In [34]:
# Create folder to export the results
RESULT_FOLDER = "../benchmarks/benchmark_2"
os.makedirs(RESULT_FOLDER, exist_ok=True)

In [35]:
# Folder containing the fine-tuned weight files (*.pt) read by the model-loading cell
root = "../../private_data/MODELS/"

In [36]:
# Per-model settings:
#   (Hugging Face checkpoint, weights file under `root` or None to keep the
#    checkpoint's own weights, batch size)
# The trailing comments keep the larger batch sizes noted in the previous version of this cell.
_MODEL_SETTINGS = {
    "february_finetuned": (base_model_base, "2025-02-05 17_09_07_allFocus_5.pt", 2),  # 32
    "march_finetuned": (base_model_base, "2025-03-29 16 59 53_allFocus_5.pt", 2),  # 32
    "art-mini": (base_model_mini, "art-mini.pt", 8),  # 256
    "art-base": (base_model_base, "art-base.pt", 2),  # 32
    "art-large": (base_model_large, "art-large.pt", 1),  # 16
    "basic-mini": (basic_mini, None, 8),  # 256
    "basic-base": (basic_base, None, 2),  # 32
    "basic-large": (basic_large, None, 1),  # 16
}

# Any other name: art-base architecture with the weights stored in "<model_name>.pt"
_fallback_settings = (base_model_base, f"{model_name}.pt", 2)  # 32
_checkpoint, _weights_filename, BATCH_SIZE = _MODEL_SETTINGS.get(model_name, _fallback_settings)

processor = CLIPProcessor.from_pretrained(_checkpoint)
model = CLIPModel.from_pretrained(_checkpoint).to(device)

if _weights_filename is not None:
    model_weights_path = root + _weights_filename
    # map_location keeps a checkpoint that was saved from a GPU loadable
    # when `device` is the CPU.
    _state_dict = torch.load(model_weights_path, weights_only=True, map_location=device)
    model.load_state_dict(_state_dict)

print(f"Running benchmark on: {model_name} with batch size: {BATCH_SIZE}")

Running benchmark on: art-base-TextFT with batch size: 2


# Configure the benchmark

In [37]:
# Value passed as `num_workers` to every DataLoader built below
best_workers = 0 #10
# NOTE(review): `criterion` is not referenced by any later cell of this notebook — confirm before removing
criterion = nn.CrossEntropyLoss()

# Create the -ATTACHED and -EXPLODED datasets

In [38]:
# recordIDs referenced by the -ATTACHED table (unique, ascending)
artworks_recordIDs_attached = sorted(set(DATA_ATTACHED["recordID"]))

# recordIDs referenced by the -EXPLODED table: flatten the per-row lists first
artworks_recordIDs_exploded = sorted(
    {recordID for recordIDs in DATA_EXPLODED["recordID"] for recordID in recordIDs}
)

# Merge them together
artworks_recordIDs = sorted(
    set(artworks_recordIDs_attached) | set(artworks_recordIDs_exploded)
)

## Compute the image embeddings

In [39]:
class ArtworksImages(Dataset):
    """Dataset returning the low-resolution image of each artwork, in `recordIDs` order.

    Image paths are looked up in the module-level `recordID_to_imageLowResFilename` table.
    """

    def __init__(self, recordIDs):
        """
        Args:
            recordIDs: indexable collection of recordIDs; each one must be a key of
                `recordID_to_imageLowResFilename`.
        """
        self.recordIDs = recordIDs

    def __len__(self):
        """Number of artworks in the dataset."""
        return len(self.recordIDs)

    def __getitem__(self, idx):
        """Return the fully decoded RGB image of the artwork at position `idx`."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        recordID = self.recordIDs[idx]
        path = recordID_to_imageLowResFilename[recordID]

        # Image.open is lazy and keeps the file open until the pixels are read.
        # Decode inside the context manager so the handle is released here.
        # convert() returns an independent in-memory image that stays valid after the
        # file is closed.
        # NOTE(review): assumes the processor would convert to RGB anyway — confirm.
        with Image.open(path) as image:
            return image.convert("RGB")

def ArtworksImagesBBuilder(images):
    """Collate a batch of images into the tensors produced by `processor`.

    One empty caption is paired with every image, so the returned batch carries
    text entries alongside the pixel values.
    """
    blank_captions = [""] * len(images)
    return processor(
        text=blank_captions,
        images=images,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

# Image dataset / dataloader over every artwork used by either benchmark variant.
# shuffle=False keeps the batches in artworks_recordIDs order; the embeddings computed
# from this loader are later matched to recordIDs by position.
artworks_dataset = ArtworksImages(artworks_recordIDs)
artworks_dataloader = DataLoader(
    artworks_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=best_workers,
    collate_fn=ArtworksImagesBBuilder,
    pin_memory=True,
    #prefetch_factor=2,
    #persistent_workers=True,
)

def compute_images_embeddings(dataloader, model, device):
    """Embed every image batch of `dataloader` with `model` and L2-normalise the result.

    Args:
        dataloader: iterable of batches; each batch must provide batch['pixel_values'].
        model: model exposing eval() and get_image_features(pixel_values=...).
        device: device the pixel values are moved to before the forward pass.

    Returns:
        numpy array holding one normalised embedding row per image, in batch order.
    """
    model.eval()
    embeddings = []
    with torch.no_grad():
        for sample in tqdm(dataloader, desc="Computing artworks embeddings", unit="batch"):
            # Only the pixel values are needed here; the text entries of the batch
            # are neither read nor transferred to the device.
            pixel_values = sample['pixel_values'].to(device)

            # Compute image embeddings and scale each one to unit length
            image_features = model.get_image_features(pixel_values=pixel_values)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            image_features = image_features.flatten(1)

            embeddings.append(image_features)

    embeddings = torch.cat(embeddings, dim=0)
    embeddings = embeddings.cpu().numpy()
    return embeddings

artworks_embeddings_np_array = compute_images_embeddings(artworks_dataloader, model, device)

# recordID -> embedding row (rows follow the order of artworks_recordIDs)
artworks_embeddings = {
    recordID: artworks_embeddings_np_array[row]
    for row, recordID in enumerate(artworks_recordIDs)
}

# Get the embeddings for the attached artworks, stacked into a numpy array
artworks_embeddings_attached = np.array(
    [artworks_embeddings[recordID] for recordID in artworks_recordIDs_attached]
)
print(artworks_embeddings_attached.shape)

# Get the embeddings for the exploded artworks, stacked into a numpy array
artworks_embeddings_exploded = np.array(
    [artworks_embeddings[recordID] for recordID in artworks_recordIDs_exploded]
)
print(artworks_embeddings_exploded.shape)

Computing artworks embeddings: 100%|██████████| 953/953 [00:54<00:00, 17.34batch/s]


(1901, 768)
(1905, 768)


## Compute the proper-noun embeddings

In [40]:
def compute_textual_embeddings(dataloader, model, device):
    """Embed every text batch of `dataloader` with `model` and L2-normalise the result.

    Each batch must provide 'input_ids' and 'attention_mask'. Returns a numpy
    array with one normalised embedding row per text, in batch order.
    """
    model.eval()
    batch_outputs = []
    with torch.no_grad():
        progress = tqdm(dataloader, desc="Computing textual embeddings", unit="batch")
        for batch in progress:
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)

            # Text-tower forward pass, then scale every embedding to unit length
            features = model.get_text_features(input_ids=token_ids, attention_mask=mask)
            features = (features / features.norm(dim=-1, keepdim=True)).flatten(1)

            batch_outputs.append(features)

    return torch.cat(batch_outputs, dim=0).cpu().numpy()

def TextBBuilder(samples):
    """Collate a batch of texts into padded, truncated tensors via `processor`."""
    return processor(
        text=samples,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

class TextDataset(Dataset):
    """Dataset returning one text per position of `texts`."""

    def __init__(self, texts):
        """
        Args:
            texts: list-like of strings, or a pandas Series of strings.
        """
        self.texts = texts

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the text stored at position `idx` (0-based)."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # A DataLoader asks for positions 0..len-1. Plain [] on a Series looks up
        # index *labels*, which fails as soon as the Series does not carry the
        # default 0..n-1 index (e.g. after filtering rows), so index by position.
        if isinstance(self.texts, pd.Series):
            return self.texts.iloc[idx]
        return self.texts[idx]

In [41]:
# Embed the "proper_nouns" text of every -ATTACHED row.
# shuffle=False keeps the embeddings in the row order of DATA_ATTACHED.
attached_dataset = TextDataset(DATA_ATTACHED["proper_nouns"])
attached_dataloader = DataLoader(
    attached_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=best_workers,
    collate_fn=TextBBuilder,
    pin_memory=True,
    #prefetch_factor=2,
    #persistent_workers=True,
)

attached_embeddings = compute_textual_embeddings(attached_dataloader, model, device)

Computing textual embeddings: 100%|██████████| 951/951 [00:08<00:00, 110.61batch/s]


In [42]:
# Embed the "proper_nouns" text of every -EXPLODED row.
# shuffle=False keeps the embeddings in the row order of DATA_EXPLODED.
exploded_dataset = TextDataset(DATA_EXPLODED["proper_nouns"])
exploded_dataloader = DataLoader(
    exploded_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=best_workers,
    collate_fn=TextBBuilder,
    pin_memory=True,
    #prefetch_factor=2,
    #persistent_workers=True,
)

exploded_embeddings = compute_textual_embeddings(exploded_dataloader, model, device)

Computing textual embeddings: 100%|██████████| 833/833 [00:07<00:00, 109.81batch/s]


# -ATTACHED variant

In [43]:
def get_average_position(cosine_similarities):
    """Mean 1-based rank of the diagonal item, plus the full ranking of every row.

    Row i is ranked by decreasing similarity; its expected match is column i.

    Returns:
        (average_position, positions) where positions[i] lists the column
        indices of row i from most to least similar, as Python ints.
    """
    n_queries = len(cosine_similarities)
    rank_total = 0
    positions = []
    for query in range(n_queries):
        ordering = np.argsort(cosine_similarities[query])[::-1]
        rank_total += np.flatnonzero(ordering == query)[0] + 1
        positions.append(ordering.tolist())
    return rank_total / n_queries, positions

def get_MRR(cosine_similarities):
    """Mean reciprocal rank, where the expected match of row i is column i."""
    n_queries = len(cosine_similarities)
    reciprocal_total = 0
    for query in range(n_queries):
        ordering = np.argsort(cosine_similarities[query])[::-1]
        rank = np.flatnonzero(ordering == query)[0] + 1
        reciprocal_total += 1 / rank
    return reciprocal_total / n_queries

def get_recall_at_k(cosine_similarities, k):
    """Fraction of rows whose expected match (column i for row i) is in the top `k`."""
    n_queries = len(cosine_similarities)
    hits = 0
    for query in range(n_queries):
        top_k = np.argsort(cosine_similarities[query])[::-1][:k]
        if query in top_k:
            hits += 1
    return hits / n_queries

def get_nDCG_at_k(cosine_similarities, k):
    """Mean nDCG@k with a single relevant item per row (column i for row i).

    A row contributes 1 / log2(rank + 1) when its match is ranked within the
    top `k`, and nothing otherwise.
    """
    n_queries = len(cosine_similarities)
    gain_total = 0
    for query in range(n_queries):
        ordering = np.argsort(cosine_similarities[query])[::-1]
        rank = np.flatnonzero(ordering == query)[0] + 1
        if rank <= k:
            gain_total += 1 / np.log2(rank + 1)
    return gain_total / n_queries

def get_metrics_row(cosine_similarities):
    """Compute all retrieval metrics for one similarity matrix.

    Returns:
        (metrics, positions): `metrics` is the list
        [average_position, mrr, recall@1, recall@3, recall@5, recall@10,
         nDCG@1, nDCG@3, nDCG@5, nDCG@10]; `positions` is the per-row ranking
        returned by get_average_position.
    """
    cutoffs = (1, 3, 5, 10)

    average_position, positions = get_average_position(cosine_similarities)
    metrics = [average_position, get_MRR(cosine_similarities)]
    metrics.extend(get_recall_at_k(cosine_similarities, k) for k in cutoffs)
    metrics.extend(get_nDCG_at_k(cosine_similarities, k) for k in cutoffs)

    return metrics, positions

In [44]:
def run_attached_benchmark():
    """Run the -ATTACHED benchmark: retrieve each artwork from its proper-nouns text.

    Returns:
        (results, positions): `results` is a one-row DataFrame of metrics and
        `positions` is the per-query ranking of the artworks (column indices of
        artworks_embeddings_attached, most similar first).

    Raises:
        ValueError: if the rows of DATA_ATTACHED are not in the same order as
            artworks_recordIDs_attached.
    """
    metric_names = [
        "average_position",
        "mrr",
        "recall@1",
        "recall@3",
        "recall@5",
        "recall@10",
        "nDCG@1",
        "nDCG@3",
        "nDCG@5",
        "nDCG@10"
    ]

    # Row i of attached_embeddings comes from row i of DATA_ATTACHED, while row i of
    # artworks_embeddings_attached belongs to artworks_recordIDs_attached[i] (sorted,
    # de-duplicated). The metrics treat the diagonal as ground truth, so both
    # orders must be identical; otherwise every number below would be meaningless.
    if list(DATA_ATTACHED["recordID"]) != list(artworks_recordIDs_attached):
        raise ValueError(
            "DATA_ATTACHED['recordID'] must be sorted and free of duplicates so that "
            "query i matches artwork i"
        )

    # Get the cosine similarities (both sides are L2-normalised embeddings)
    cosine_similarities = attached_embeddings @ artworks_embeddings_attached.T
    # Measure the metrics
    metrics, positions = get_metrics_row(cosine_similarities)

    results = pd.DataFrame([metrics], columns=metric_names)

    return results, positions

# -EXPLODED variant

In [45]:
def run_exploded_benchmark():
    """Run the -EXPLODED benchmark: rank all artworks against each proper noun.

    Returns:
        DataFrame with one row per row of DATA_EXPLODED and the columns
        - "overall_average_rank": mean of "average_rank" over all rows (repeated on every row)
        - "average_rank": mean rank of the artworks linked to the proper noun
        - "ranks": 1-based rank of each linked artwork, in the order of its recordID list
        - "positions": column indices of artworks_embeddings_exploded, most similar first

    Raises:
        KeyError: if a linked recordID is missing from artworks_recordIDs_exploded.
    """
    # Get the cosine similarities
    cosine_similarities = exploded_embeddings @ artworks_embeddings_exploded.T

    # recordID -> column index, built once instead of a linear list.index() per lookup.
    # setdefault keeps the first occurrence, exactly like list.index().
    column_of_recordID = {}
    for column, recordID in enumerate(artworks_recordIDs_exploded):
        column_of_recordID.setdefault(recordID, column)

    # recordIDs linked to each proper noun, extracted once rather than through
    # DATA_EXPLODED.iloc[i] (which builds a whole row Series) at every iteration
    linked_recordIDs = DATA_EXPLODED["recordID"].tolist()

    average_ranks = []
    ranks = []
    positions = []
    overall_average_rank = 0
    for i in range(len(cosine_similarities)):
        sorted_indices = np.argsort(cosine_similarities[i])[::-1] # shape = (len(artworks_recordIDs_exploded),)

        ranks_i = []
        for recordID in linked_recordIDs[i]:
            index = column_of_recordID[recordID]
            rank = np.where(sorted_indices == index)[0][0] + 1
            ranks_i.append(int(rank))

        mean_rank_i = np.mean(ranks_i)

        average_ranks.append(mean_rank_i)
        ranks.append(ranks_i)
        positions.append([int(p) for p in sorted_indices])
        overall_average_rank += mean_rank_i

    overall_average_rank /= len(cosine_similarities)

    # Build the frame in one go; appending rows one at a time copies the frame on every insert
    results = pd.DataFrame({
        "overall_average_rank": overall_average_rank,
        "average_rank": average_ranks,
        "ranks": ranks,
        "positions": positions,
    })

    return results

In [46]:
# Run the benchmarks !

# Attached benchmark: metrics go to a CSV named after the model
results_attached, positions_attached = run_attached_benchmark()
results_attached.to_csv(RESULT_FOLDER + f"/{model_name}_attached.csv", index=False)
# Save positions_attached (the per-query ranking of the artworks) next to the metrics
with open(RESULT_FOLDER + f"/{model_name}_positions_attached.pkl", "wb") as f:
    pickle.dump(positions_attached, f)

results_attached

Unnamed: 0,average_position,mrr,recall@1,recall@3,recall@5,recall@10,nDCG@1,nDCG@3,nDCG@5,nDCG@10
0,291.999474,0.082631,0.036823,0.082062,0.114676,0.168858,0.036823,0.062817,0.076103,0.093278


In [47]:
# Exploded benchmark
results_exploded = run_exploded_benchmark()
results_exploded.to_csv(RESULT_FOLDER + f"/{model_name}_exploded.csv", index=False)
results_exploded

Unnamed: 0,overall_average_rank,average_rank,ranks,positions
0,340.931638,524.0,[524],"[1225, 176, 1601, 131, 1439, 1655, 1056, 322, ..."
1,340.931638,131.0,[131],"[35, 1059, 859, 864, 187, 1875, 34, 888, 816, ..."
2,340.931638,4.0,[4],"[864, 48, 857, 932, 1507, 1045, 367, 263, 863,..."
3,340.931638,40.5,"[61, 20]","[816, 364, 876, 456, 44, 1613, 263, 1595, 1589..."
4,340.931638,423.0,[423],"[1613, 1278, 887, 1773, 841, 364, 1581, 1730, ..."
...,...,...,...,...
1660,340.931638,57.5,"[152, 1, 19, 58]","[352, 23, 1689, 112, 1568, 1429, 249, 1600, 12..."
1661,340.931638,153.0,[153],"[1174, 1898, 413, 1507, 137, 835, 1811, 12, 33..."
1662,340.931638,926.0,[926],"[1601, 1513, 187, 817, 17, 1598, 341, 1468, 44..."
1663,340.931638,287.0,[287],"[834, 864, 857, 906, 341, 1421, 1099, 528, 176..."


In [None]:
# Best (lowest) rank reached by any artwork linked to each proper noun.
# NOTE: this adds a "min_rank" column to results_exploded in place; the CSV written
# by the previous cell does not contain it.
results_exploded["min_rank"] = results_exploded["ranks"].apply(lambda x: min(x))
results_exploded["min_rank"].describe()

count    1665.000000
mean      288.645646
std       398.543299
min         1.000000
25%        14.000000
50%       103.000000
75%       400.000000
max      1882.000000
Name: min_rank, dtype: float64

: 