In [1]:
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
from dotenv import load_dotenv 
import random
import numpy as np
import torch
import pandas as pd
import json
import transformers

In [2]:
# Load variables from the private .env file into the process environment
load_dotenv("../../private_data/.env")

# PARENT gets us to the root of the project
PARENT = "./../../"


def _env_path(variable_name):
    """
    Return PARENT concatenated with the value of an environment variable.

    Raises:
        KeyError: if the variable is not set. Without this check the
            concatenation fails with a TypeError that does not say which
            variable is missing.
    """
    value = os.getenv(variable_name)
    if value is None:
        raise KeyError(
            f"Environment variable {variable_name!r} is not set; "
            "check ../../private_data/.env"
        )
    return PARENT + value


FOLDER_TABLE = _env_path("FOLDER_TABLE")
FILE_FABRITIUS_DATA = _env_path("FILE_FABRITIUS_DATA")
FILE_FABRITIUS_DATA_FILTERED = _env_path("FILE_FABRITIUS_DATA_FILTERED")
FILE_FABRITIUS_DATA_FILTERED_DOWNLOADED = _env_path("FILE_FABRITIUS_DATA_FILTERED_DOWNLOADED")
FOLDER_FIGURES = _env_path("FOLDER_FIGURES")
IMAGES_FOLDER = _env_path("IMAGES_FOLDER")

DB_INPUT_ARTPIECES = _env_path("DB_INPUT_ARTPIECES")
DB_INPUT_ARTISTS = _env_path("DB_INPUT_ARTISTS")

BENCHMARK_2_ATTACHED = _env_path("BENCHMARK_2_ATTACHED")
BENCHMARK_2_EXPLODED = _env_path("BENCHMARK_2_EXPLODED")

FILE_SUBJECTMATTERS_PARSED = _env_path("FILE_SUBJECTMATTERS_PARSED")

In [3]:
def fixPath(path):
    """
    Return `path` with every ".././" occurrence collapsed to "../".
    """
    redundant_segment = ".././"
    # Splitting on the segment and re-joining with "../" substitutes each
    # non-overlapping occurrence, scanning left to right
    return "../".join(path.split(redundant_segment))

# Table read once at cell execution; get_image_path_from_recordID looks up
# its "recordID" and "low_res_filename" columns
filtered_data_downloaded = pd.read_csv(FILE_FABRITIUS_DATA_FILTERED_DOWNLOADED)

def get_image_path_from_recordID(recordID):
    """
    Resolve a recordID to the local path of its low-resolution image.

    Looks the recordID up in `filtered_data_downloaded` and uses the first
    matching "low_res_filename". Returns None when no row matches.
    """
    matches_record = filtered_data_downloaded["recordID"] == recordID
    filenames = filtered_data_downloaded.loc[matches_record, "low_res_filename"].values

    if len(filenames) > 0:
        # The first character of the stored filename is dropped before joining
        # (presumably a leading "." or "/" - TODO confirm against the CSV)
        relative_path = filenames[0][1:]
        return fixPath(os.path.join(IMAGES_FOLDER, relative_path))

    return None

In [2]:
# Get the artworks data
# NOTE: uses `pd` and DB_INPUT_ARTPIECES, so the imports and configuration
# cells must have been run first on a fresh kernel
ARTWORKS = pd.read_csv(DB_INPUT_ARTPIECES)
# Display only the first row as a quick look at the loaded columns
ARTWORKS.head(1)

NameError: name 'pd' is not defined

In [5]:
# Get the artists data
ARTISTS = pd.read_csv(DB_INPUT_ARTISTS)
# Display only the first row as a quick look at the loaded columns
ARTISTS.head(1)

Unnamed: 0,creatorID,creatorLastName,creatorFirstName,creatorBirthAndDeathDescription,creatorNationality,creatorDeathDate,creatorBirthDate
0,Auth:509:309,Bouts,Dirk,Haarlem (Pays-Bas) vers 1410 ? - Louvain 1475,,1475.0,1410.0


In [6]:
# Map each recordID to its row position in ARTWORKS; if a recordID appears
# more than once, the last position wins
recordID_to_index = {
    record_id: position
    for position, record_id in enumerate(ARTWORKS["recordID"])
}
len(recordID_to_index)

5301

In [7]:
# Load the parsed subject-matter records. The context manager closes the file
# handle as soon as the JSON has been read instead of leaving it open.
with open(FILE_SUBJECTMATTERS_PARSED, "r", encoding="utf-8") as subjectmatter_file:
    subjectmatter_json = json.load(subjectmatter_file)

# dict_keys()
def get_structured_flattened(recordID):
    """
    Return the capitalised subject-matter terms recorded for an artwork.

    The recordID is translated to a position through `recordID_to_index`, and
    the "flattened" lists of the three structured term groups at that position
    in `subjectmatter_json` are merged without duplicates.

    Returns:
        list: the unique terms whose first character is uppercase, in
        first-seen order (subject, then iconographic, then conceptual terms).

    Raises:
        KeyError: if `recordID` is not in `recordID_to_index`.
    """
    structured = subjectmatter_json[recordID_to_index[recordID]]["structured"]

    # dict.fromkeys de-duplicates while keeping first-seen order. A plain set
    # iterates strings in a hash-dependent order that can differ between
    # interpreter runs, which made the saved benchmark files irreproducible.
    unique_terms = {}
    for key in (
        "subjectMatterSubjectTerms",
        "subjectMatterIconographicTerms",
        "subjectMatterConceptualTerms",
    ):
        unique_terms.update(dict.fromkeys(structured[key]["flattened"]))

    # Keep only words that start with a capital letter. word[:1] is "" for an
    # empty term, so empty strings are dropped instead of raising IndexError.
    return [word for word in unique_terms if word[:1].isupper()]

In [8]:
# Collect the proper nouns of every artwork, keyed by recordID
proper_nouns_per_recordID = {
    record_id: get_structured_flattened(record_id)
    for record_id in tqdm(ARTWORKS["recordID"])
}

# Turn the mapping into a two-column DataFrame
proper_nouns_df = pd.DataFrame(
    {
        "recordID": list(proper_nouns_per_recordID.keys()),
        "proper_nouns": list(proper_nouns_per_recordID.values()),
    },
    columns=["recordID", "proper_nouns"],
)

# Keep only the artworks that have at least one proper noun, renumbering rows
has_proper_nouns = proper_nouns_df["proper_nouns"].str.len() > 0
proper_nouns_df = proper_nouns_df[has_proper_nouns].reset_index(drop=True)
proper_nouns_df

100%|██████████| 5301/5301 [00:00<00:00, 331272.34it/s]


Unnamed: 0,recordID,proper_nouns
0,64,"[Jésus, Nouveau Testament, Passion, Evangiles,..."
1,78,[Louise van der Hecht]
2,79,[Robert Schumann]
3,81,[Marguerite Khnopff]
4,105,"[Cupidon, Ariane, Bacchus]"
...,...,...
1900,11252,[Namur]
1901,11521,"[Andromède, Persée, Céto]"
1902,11525,"[Nicolas-Henri Tardieu, Marie-Anne Hortemels]"
1903,11533,"[Rik Wouters, Nel Wouters, Amsterdam]"


In [9]:
# Tokenizer of the "openai/clip-vit-base-patch32" checkpoint, used below to
# measure text lengths.
# NOTE(review): the original author states that all tokenizers tokenize the
# same way - confirm this for every model the benchmark is used with.
tokenizer = transformers.AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

def get_tokenized_text(text):
    """
    Return how many token ids the module-level `tokenizer` produces for `text`.

    Despite the name, the result is a count (the length of "input_ids"),
    not the tokenized text itself.
    """
    encoding = tokenizer(text)
    token_ids = encoding["input_ids"]
    return len(token_ids)

# -ATTACHED

In [10]:
# "Attached" benchmark: one row per artwork, with its proper nouns merged
# into a single comma-separated string
attached_dataset = proper_nouns_df.assign(
    proper_nouns=proper_nouns_df["proper_nouns"].apply(", ".join)
)
attached_dataset["tokenized_length"] = attached_dataset["proper_nouns"].apply(get_tokenized_text)

# Drop rows whose tokenized length exceeds 75, then renumber the rows.
# NOTE(review): the tokenizer warning reports a model maximum of 77; confirm
# that 75 is the intended limit for the count returned by get_tokenized_text.
fits_token_limit = attached_dataset["tokenized_length"] <= 75
attached_dataset = attached_dataset.loc[fits_token_limit].reset_index(drop=True)

# Persist the benchmark as CSV
attached_dataset.to_csv(BENCHMARK_2_ATTACHED, index=False)

# Print every field of the first row as an example
columns = list(attached_dataset.columns)
example_row = attached_dataset.iloc[0]
for column in columns:
    print(f"{column}: {example_row[column]}")

Token indices sequence length is longer than the specified maximum sequence length for this model (84 > 77). Running this sequence through the model will result in indexing errors


recordID: 64
proper_nouns: Jésus, Nouveau Testament, Passion, Evangiles, Vierge, Christ, Calvaire, Crucifixion, Jérusalem
tokenized_length: 31


# -EXPLODED

In [11]:
# "Exploded" benchmark: one row per proper noun, with the list of recordIDs
# of the artworks that mention it. explode() already returns a new DataFrame,
# so proper_nouns_df itself is left untouched.
exploded_dataset = (
    proper_nouns_df
    .explode("proper_nouns")
    .groupby("proper_nouns")["recordID"]
    .apply(list)
    .reset_index()
)

# Persist the benchmark as CSV
exploded_dataset.to_csv(BENCHMARK_2_EXPLODED, index=False)

columns = list(exploded_dataset.columns)

# Position of the first proper noun attached to more than 10 artworks, or
# None when there is no such row. The bounded search replaces an open-ended
# scan that raised IndexError once it ran past the last row.
example_row_index = next(
    (
        position
        for position, record_ids in enumerate(exploded_dataset["recordID"])
        if len(record_ids) > 10
    ),
    None,
)

if example_row_index is None:
    example_row = None
    print("No proper noun is linked to more than 10 records.")
else:
    example_row = exploded_dataset.iloc[example_row_index]
    for column in columns:
        print(f"{column}: {example_row[column]}")

proper_nouns: Afrique
recordID: [137, 717, 1465, 1848, 1849, 1854, 4672, 6163, 6164, 6165, 6166, 6167, 6168, 6460, 6462, 6463, 6464, 8445, 8720]
