In [1]:
from database import MockDB
import pandas as pd
from dotenv import load_dotenv
import os
import random
import torch
import json
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# loading variables from .env file
load_dotenv("../../../private_data/.env") 

# PARENT gets us to the root of the project
PARENT = "./../../../"

FOLDER_TABLE = PARENT + os.getenv("FOLDER_TABLE")
FILE_FABRITIUS_DATA = PARENT + os.getenv("FILE_FABRITIUS_DATA")
FILE_FABRITIUS_DATA_FILTERED = PARENT + os.getenv("FILE_FABRITIUS_DATA_FILTERED")
FILE_FABRITIUS_DATA_FILTERED_DOWNLOADED = PARENT + os.getenv("FILE_FABRITIUS_DATA_FILTERED_DOWNLOADED")
FOLDER_FIGURES = PARENT + os.getenv("FOLDER_FIGURES")
IMAGES_FOLDER = PARENT + os.getenv("IMAGES_FOLDER")
RECORD_IDS_TESTING_SET = PARENT + os.getenv("RECORD_IDS_TESTING_SET")
RECORD_IDS_VALIDATION_SET = PARENT + os.getenv("RECORD_IDS_VALIDATION_SET")
WRITTEN_CAPTIONS_TESTING_SET = PARENT + os.getenv("WRITTEN_CAPTIONS_TESTING_SET")
WRITTEN_CAPTIONS_VALIDATION_SET = PARENT + os.getenv("WRITTEN_CAPTIONS_VALIDATION_SET")
FILE_FABRITIUS_ICONOGRAPHIES_JSON = PARENT + os.getenv("FILE_FABRITIUS_ICONOGRAPHIES_JSON")
EMBEDDINGS_FOLDER = PARENT + os.getenv("EMBEDDINGS_FOLDER")
MODELS_FOLDER = PARENT + os.getenv("MODELS_FOLDER")

##

safeFormat = lambda x : x.replace("/", "_").replace(":", "_").replace(" ", "_")
model_name = "ViT-L/14"
embedding_name = safeFormat(model_name) + "_embeddings.npy"
path_imagesEmbeddings = os.path.join(EMBEDDINGS_FOLDER, "images_" + embedding_name)
path_objectsEmbeddings = os.path.join(EMBEDDINGS_FOLDER, "objects_" + embedding_name)
path_othersEmbeddings = os.path.join(EMBEDDINGS_FOLDER, "others_" + embedding_name)

In [3]:
FULL_DATASET = pd.read_csv(FILE_FABRITIUS_DATA_FILTERED_DOWNLOADED)
# Remove rows with corrupted images
FULL_DATASET = FULL_DATASET[FULL_DATASET["recordID"] != 11546]
FULL_DATASET = FULL_DATASET[FULL_DATASET["recordID"] != 5262]

In [4]:
with open(FILE_FABRITIUS_ICONOGRAPHIES_JSON, "r", encoding="utf-8") as f:
    ICONOGRAPHIES = json.load(f)

In [5]:
ICONOGRAPHIES

{'10002': {'value': 'root',
  'children': [{'value': 'figure',
    'children': [{'value': 'homme', 'children': [], 'depth': 2},
     {'value': 'barbe', 'children': [], 'depth': 2}],
    'depth': 1}],
  'depth': 0},
 '10004': {'value': 'root',
  'children': [{'value': 'groupe de figures',
    'children': [{'value': 'homme',
      'children': [{'value': 'chapeau', 'children': [], 'depth': 3}],
      'depth': 2},
     {'value': 'femme',
      'children': [{'value': 'assis', 'children': [], 'depth': 3}],
      'depth': 2},
     {'value': 'pinceau', 'children': [], 'depth': 2}],
    'depth': 1},
   {'value': 'intérieur',
    'children': [{'value': 'atelier', 'children': [], 'depth': 2},
     {'value': 'chevalet', 'children': [], 'depth': 2},
     {'value': 'tableau', 'children': [], 'depth': 2},
     {'value': 'toile', 'children': [], 'depth': 2},
     {'value': 'table', 'children': [], 'depth': 2},
     {'value': 'livre', 'children': [], 'depth': 2},
     {'value': 'fleur', 'children': [],

In [6]:
imagesEmbeddings = np.load(path_imagesEmbeddings)
objectsEmbeddings = np.load(path_objectsEmbeddings)
othersEmbeddings = np.load(path_othersEmbeddings)
print(np.mean(imagesEmbeddings), np.std(imagesEmbeddings))
print(np.mean(objectsEmbeddings), np.std(objectsEmbeddings))
print(np.mean(othersEmbeddings), np.std(othersEmbeddings))

0.0009384602 0.036072187
0.00024383477 0.036083568
0.00022491075 0.03608369


In [7]:
DB = MockDB()
# Insert mock data
DB.insert_mock_data(FULL_DATASET.to_dict(orient="records"), imagesEmbeddings, ICONOGRAPHIES)
average_ico_length = [len(ico) for ico in ICONOGRAPHIES]
print("Average iconography length:", np.mean(average_ico_length))

Average iconography length: 4.003322259136213


In [8]:
DB.get_columns()

['recordLanguage',
 'recordID',
 'DatabaseId',
 'LinkToVubis',
 'objectWork.creatorDescription',
 'objectWork.termClassification',
 'objectWork.workID',
 'objectWork.titleText',
 'objectWork.objectWorkType',
 'objectWork.measurementsDescription',
 'objectWork.termMaterialsTech',
 'objectWork.inscriptionDescription',
 'objectWork.creationDateDescription',
 'creator.creatorAuthID',
 'creator.lastNameCreator',
 'creator.firstNameCreator',
 'creator.birthDeathDatesPlacesCreatorDescription',
 'creator.nationalityCreator',
 'creator.deathDateCreator',
 'creator.earliestActivityCreator',
 'creator.copyrightHolderName',
 'creator.copyrightStatement',
 'measurements',
 'creation.earliestDate',
 'creation.latestDate',
 'formalDescription.physicalAppearanceDescription',
 'subjectMatter.subjectTerms',
 'imageType',
 'imageColor',
 'low_res_filename',
 'high_res_filename',
 'DcaProjectNumber.DcaProjectID',
 'objectWork.exhibitionDescription',
 'creator.birthDateCreator',
 'subjectMatter.iconographi

In [9]:
L = DB.get_unique_for_column("creation.earliestDate")
np.amin(L), np.amax(L)

(np.float64(1401.0), np.float64(1949.0))

In [None]:
FILTERS_SIMPLE = (
    {
        "type": "contains",
        "location": "iconography",
        "term": "homme"
    },
    "OR",
    (
        {
            "type": "interval",
            "location": "metadatas",
            "column": "creation.earliestDate",
            "min": 1948,
            "max": 1949
        },
        "OR",
        {
            "type": "interval",
            "location": "metadatas",
            "column": "creation.earliestDate",
            "min": 1910,
            "max": 1910
        }
    )
)

def formatResult(result):
    return {
        "recordID": result["recordID"],
        "title": result["objectWork.titleText"],
        "dates": (result["creation.earliestDate"], result["creation.latestDate"]),
        "iconography": result["iconography"],
    }

results = DB.query(filters=FILTERS_SIMPLE, query_embedding=None, page=0, page_size=5000)
print(len(results))

results = [formatResult(result) for result in results[:5]]
for result in results:
    print(result["title"], result["dates"])
    print(result["iconography"])
    print("")

1490
Portrait de Paul Verlaine (nan, nan)
['homme', 'barbe', 'figure']

Les collègues (1884.0, 1884.0)
['intérieur', 'groupe de figures', 'chevalet', 'tableau', 'pinceau', 'homme', 'assis', 'femme', 'toile', 'livre', 'chapeau', 'atelier', 'table', 'fleur']

Christ aux outrages (1878.0, 1930.0)
['animal', 'groupe de figures', 'soldat', 'couronne', 'cheval', 'coiffure', 'épine', 'homme', 'femme', 'arme', 'nu', 'casque', 'chien', 'lance', 'foule']

Autoportrait (1855.0, 1865.0)
['pinceau', 'veste', 'homme', 'habit', 'livre', 'figure']

L'homme au chapeau buse. Etude pour Roma (1846.0, 1912.0)
['vêtement', 'en pied', 'homme', 'chapeau', 'figure']

