In [126]:
import json
import os
import pprint
import random

import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

In [131]:
# Load the project settings from the private .env file
load_dotenv("../../private_data/.env")

# PARENT gets us to the root of the project
PARENT = "./../../"


def _project_path(env_name):
    """Return PARENT followed by the relative path stored in environment variable `env_name`.

    Raises:
        KeyError: if the variable is not defined. Concatenating PARENT with the
            None returned by os.getenv would otherwise fail with a TypeError
            that does not say which variable is missing.
    """
    relative_path = os.getenv(env_name)
    if relative_path is None:
        raise KeyError(f"Environment variable {env_name} is not set; check the .env file")
    return PARENT + relative_path


FOLDER_TABLE = _project_path("FOLDER_TABLE")
FILE_FABRITIUS_DATA = _project_path("FILE_FABRITIUS_DATA")
FILE_FABRITIUS_DATA_FILTERED = _project_path("FILE_FABRITIUS_DATA_FILTERED")
FILE_FABRITIUS_DATA_FILTERED_DOWNLOADED = _project_path("FILE_FABRITIUS_DATA_FILTERED_DOWNLOADED")
FOLDER_FIGURES = _project_path("FOLDER_FIGURES")
IMAGES_FOLDER = _project_path("IMAGES_FOLDER")
RECORD_IDS_TESTING_SET = _project_path("RECORD_IDS_TESTING_SET")
RECORD_IDS_VALIDATION_SET = _project_path("RECORD_IDS_VALIDATION_SET")
WRITTEN_CAPTIONS_TESTING_SET = _project_path("WRITTEN_CAPTIONS_TESTING_SET")
WRITTEN_CAPTIONS_VALIDATION_SET = _project_path("WRITTEN_CAPTIONS_VALIDATION_SET")
FILE_FABRITIUS_ICONOGRAPHIES_JSON = _project_path("FILE_FABRITIUS_ICONOGRAPHIES_JSON")
EMBEDDINGS_FOLDER = _project_path("EMBEDDINGS_FOLDER")
MODELS_FOLDER = _project_path("MODELS_FOLDER")

DB_INPUT_ARTPIECES = _project_path("DB_INPUT_ARTPIECES")
DB_INPUT_ARTISTS = _project_path("DB_INPUT_ARTISTS")
DB_INPUT_SUBJECTMATTER = _project_path("DB_INPUT_SUBJECTMATTER")
FILE_SUBJECTMATTERS_PARSED = _project_path("FILE_SUBJECTMATTERS_PARSED")

def get_db_config():
    """Collect the database connection settings from the environment.

    Returns:
        dict: keys "host", "port", "name", "user" and "password", each holding
        the value of the matching DB_* environment variable (None when unset).
    """
    env_names_by_key = (
        ("host", "DB_HOST"),
        ("port", "DB_PORT"),
        ("name", "DB_NAME"),
        ("user", "DB_USER"),
        ("password", "DB_PASSWORD"),
    )
    return {key: os.getenv(env_name) for key, env_name in env_names_by_key}

In [118]:
# Characters that structure an iconography string: "(" and ")" open and close a
# group, ";" separates sibling terms and ":" attaches a term to the previous one.
delimiters = ["(", ")", ";", ":"]

class Node:
    """One term of an iconography tree, linked to its parent and its children."""

    def __init__(self, value, children=None, parent=None):
        """Create a node.

        Args:
            value: label of the node; leading spaces are removed.
            children: optional iterable of Node objects to attach to this node.
            parent: optional parent Node.
        """
        # lstrip(" ") removes the same leading spaces as the former
        # `while value[0] == " "` loop, but does not raise IndexError when the
        # value is empty or made only of spaces.
        self.value = value.lstrip(" ")
        self.parent = parent

        # Build our own list so that add_child never mutates the caller's list.
        self.children = []
        if children is not None:
            for child in children:
                self.add_child(child)

        self.forgotten = False

    def setForgotten(self, forgotten):
        """Set the `forgotten` flag of this node."""
        self.forgotten = forgotten

    def __hash__(self):
        """Hash on the label only, consistently with __eq__."""
        return hash(self.value)

    def __eq__(self, other):
        """Two nodes are equal when their labels are equal (children are ignored)."""
        if not isinstance(other, Node):
            # Let Python fall back to its default comparison instead of
            # failing with AttributeError on `other.value`.
            return NotImplemented
        return self.value == other.value

    def get_as_string(self, level=0):
        """Return a compact string such as 'a'['b','c'] describing the subtree.

        Nodes whose label starts with "<depth" are transparent: only their
        children are rendered.
        """
        st = f"'{self.value}'"
        children_st = ",".join([child.get_as_string(level+1) for child in self.children])

        if self.value.startswith("<depth"):
            return children_st

        if len(self.children) > 0:
            st += f"[{children_st}]"

        return st

    def set_parent(self, parent):
        """Set the parent of this node (the parent's children list is not touched)."""
        self.parent = parent

    def get_parent(self):
        """Return the parent node, or None when no parent was set."""
        return self.parent

    def add_child(self, child):
        """Append `child` to this node's children and make this node its parent."""
        self.children.append(child)
        child.set_parent(self)

    def print_tree(self, level=0):
        """Print the subtree, one label per line, indented with one tab per level."""
        print("\t"*level + self.value)
        for child in self.children:
            child.print_tree(level+1)

    def get_children(self):
        """Return the list of children (the internal list, not a copy)."""
        return self.children

    def getUniqueLabels(self):
        """Return the set of all labels found in the subtree, this node included."""
        labels = set()
        labels.add(self.value)
        for child in self.children:
            labels.update(child.getUniqueLabels())
        return labels

    def remove_empty_children(self):
        """Drop the direct children that have no children of their own."""
        newChildren = []
        for child in self.children:
            if len(child.children) > 0:
                newChildren.append(child)
        self.children = newChildren

    def as_json(self, depth=0):
        """Return the subtree as nested dicts of the form {"name": ..., "children": [...]}."""
        return {
            "name": self.value,
            "children": [child.as_json(depth=depth + 1) for child in self.children],
        }

    def as_flattened_values(self):
        """Return the labels of the subtree in pre-order.

        The placeholder labels "<group>", "<depth>" and "root" are skipped.
        """
        if self.value in ["<group>", "<depth>", "root"]:
            values = []
        else:
            values = [self.value]
        for child in self.children:
            values += child.as_flattened_values()
        return values
        

def cleanIconography(iconography):
    """Normalise the spacing of a raw iconography string.

    Runs of spaces are first collapsed into a single space, then every
    delimiter (the module-level `delimiters` plus "/") is surrounded by one
    space on each side.

    Returns:
        str: the re-spaced string.
    """
    cleaned = iconography
    # Keep halving the runs of spaces until no double space is left
    while "  " in cleaned:
        cleaned = cleaned.replace("  ", " ")
    # Pad each separator on both sides in a single pass
    for symbol in [*delimiters, "/"]:
        cleaned = cleaned.replace(symbol, f" {symbol} ")
    return cleaned

def l1(elements, parent):
    """Handle a token list of length one.

    The single token is attached to `parent` as a new node, unless it is a
    delimiter, in which case nothing happens.
    """
    token = elements[0]
    if token in delimiters:
        return
    parent.add_child(Node(token))
    
def l2(elements, parent):
    """Handle a token list of length two (the tail of a token stream).

    With S standing for a term (any token that is not a delimiter):

        S <delimiter>  -> S is added to `parent`. This covers the closing
                          "S )" as well as a dangling "S ;", "S :" or "S (",
                          which used to drop S silently.
        ; S            -> S is added to `parent`.
        : S            -> S is added under the last child of `parent`; when
                          `parent` has no child yet, S is added to `parent`
                          itself instead of raising IndexError.
        ( S            -> S is added under a new "<group>" child of `parent`
                          (a group that was opened but never closed).

    Any other pair adds nothing.

    Args:
        elements: list whose first two items are the tokens to interpret.
        parent: Node receiving the new node(s).
    """
    first, second = elements[0], elements[1]
    first_is_term = first not in delimiters
    second_is_term = second not in delimiters

    # S followed by any delimiter: keep the term, the delimiter carries no
    # further information at the end of the stream.
    if first_is_term and not second_is_term:
        parent.add_child(Node(first))
        return

    if first == ";" and second_is_term:
        parent.add_child(Node(second))
        return

    if first == ":" and second_is_term:
        siblings = parent.get_children()
        # Without a previous sibling there is nothing to refine: fall back to
        # the parent so the term is not lost.
        target = siblings[-1] if siblings else parent
        target.add_child(Node(second))
        return

    if first == "(" and second_is_term:
        # Same placeholder node that explorer creates for an anonymous group
        group = Node("<group>")
        parent.add_child(group)
        group.add_child(Node(second))
        return

def explorer(elements, parent):
    """Recursively turn the token list `elements` into nodes attached under `parent`.

    The function looks at the first three tokens and applies the first rule
    that matches, so the order of the rules matters. In the rule names, S is a
    term (a token that is not in `delimiters`) and E is any token. Lists of
    fewer than three tokens are delegated to l1 / l2 (nothing is done for an
    empty list).

    Args:
        elements: list of tokens (terms and delimiter characters).
        parent: Node under which the next terms are attached.

    Raises:
        Exception: when no rule matches the first three tokens.
    """

    L = len(elements)
    if L==0:
        return
    if L==1:
        return l1(elements, parent)
    if L==2:
        return l2(elements, parent)

    e1 = elements[0]
    e2 = elements[1]
    e3 = elements[2]

    # S ; S
    # e1 is a plain sibling: add it and continue from e3 (the ";" is skipped)
    rule1 = (e1 not in delimiters) and (e2==";") and (e3 not in delimiters)
    if rule1:
        parent.add_child(Node(e1))
        return explorer(elements[2:], parent)
    
    # S : S
    # e3 becomes a child of e1; parsing continues after e3 with the same parent
    rule2 = (e1 not in delimiters) and (e2==":") and (e3 not in delimiters)
    if rule2:
        newNode = Node(e1)
        parent.add_child(newNode)
        newNode.add_child(Node(e3))
        return explorer(elements[3:], parent)
    
    # S ( S
    # e1 opens a group: the following tokens (from e3) are parsed inside e1
    rule3 = (e1 not in delimiters) and (e2=="(") and (e3 not in delimiters)
    if rule3:
        newNode = Node(e1)
        parent.add_child(newNode)
        return explorer(elements[2:], newNode)

    # S ) E
    # e1 is the last term of the group: add it and go back up one level
    # NOTE(review): parent.get_parent() is None when parent has no parent set;
    # the recursive call would then fail as soon as it tries to add a child.
    rule4 = (e1 not in delimiters) and (e2==")")
    if rule4:
        parent.add_child(Node(e1))
        return explorer(elements[2:], parent.get_parent())
    
    # S ; E
    # Reached only when rule1 did not match, i.e. e3 is a delimiter
    rule5 = (e1 not in delimiters) and (e2==";")
    if rule5:
        # A little trick: We add a new node after ';' since we know that E is not a S
        # We add <group> after ;
        parent.add_child(Node(e1))
        return explorer(["<group>"] + elements[2:], parent)

    # ; ( E
    rule6 = (e1==";") and (e2=="(")
    if rule6:
        # A little trick: We add a new node after ';' to create a new group
        # We add <group> after ;
        # (the leading ";" is replaced by the "<group>" token, the "(" is kept)
        return explorer(["<group>"] + elements[1:], parent)

    # ; S )
    # e2 is the last term of the group: add it and go back up one level
    rule7 = (e1==";") and (e2 not in delimiters) and (e3==")")
    if rule7:
        parent.add_child(Node(e2))
        return explorer(elements[3:], parent.get_parent())
    
    # ; S E
    # The leading ";" carries no information: drop it and parse from e2
    rule8 = (e1==";") and (e2 not in delimiters)
    if rule8:
        return explorer(elements[1:], parent)

    # : S E
    # e2 is attached to the most recently added child of parent
    # NOTE(review): parent.children[-1] raises IndexError if parent has no child yet
    rule9 = (e1==":") and (e2 not in delimiters)
    if rule9:
        lastChild = parent.children[-1]
        lastChild.add_child(Node(e2))
        return explorer(elements[2:], parent)

    # ) ; E
    # The group is closed: continue after the ";" one level up
    rule10 = (e1==")") and (e2==";")
    if rule10:
        return explorer(elements[2:], parent.get_parent())

    # ( E E
    # A group without a leading term: create a "<group>" node and parse inside it
    rule11 = (e1=="(")
    if rule11:
        newNode = Node("<group>")
        parent.add_child(newNode)
        return explorer(elements[1:], newNode)

    # Error correction
    # :/; :/; S
    # `in ";:"` is a substring test on the string ";:"
    rule12 = (e1 in ";:") and (e2 in ";:") and (e3 not in delimiters)
    if rule12:
        # We discard the first separator
        return explorer(elements[1:], parent)

    raise Exception(f"Unknown rule: {elements}")

def raw_iconography_to_tree(iconography):
    """Parse a raw iconography string into a tree of Node objects.

    Only the part of the string located before the first "/" is parsed; the
    rest is ignored. Two kinds of mistakes are repaired before parsing: a ")"
    directly followed by "(" gets a ";" inserted in between, and a ")" that
    closes no open "(" is removed.

    Args:
        iconography: the raw string.

    Returns:
        Node: the "root" node under which the parsed terms are attached.
    """
    # Re-space the string, then keep only the first "/"-separated segment
    # (more than one segment should not happen)
    segment = cleanIconography(iconography).split("/")[0]

    root = Node("root")

    # Cut the segment into an alternation of text chunks and delimiter characters
    pieces = []
    buffer = []
    for char in segment:
        if char in delimiters:
            pieces.extend(("".join(buffer), char))
            buffer = []
        else:
            buffer.append(char)
    pieces.append("".join(buffer))

    # Trim every piece and drop the ones that end up empty
    tokens = [piece for piece in (raw.strip() for raw in pieces) if piece != ""]

    # Correct mistakes
    # 1) Correct ) ( to ) ; (
    separated = []
    last_position = len(tokens) - 1
    for position, token in enumerate(tokens):
        separated.append(token)
        if token == ")" and position != last_position and tokens[position + 1] == "(":
            separated.append(";")

    # 2) Correct unopened groups: a ")" seen while no group is open is dropped
    balanced = []
    open_groups = 0
    for token in separated:
        if token == "(":
            open_groups += 1
        elif token == ")":
            if open_groups == 0:
                continue
            open_groups -= 1
        balanced.append(token)

    # Create the nodes
    explorer(balanced, root)

    return root

# Test: parse one hand-picked iconography and show its tree and flattened forms.
# The sample contains a ")" that closes no group (after "Les apocryphes"), which
# exercises the error correction of raw_iconography_to_tree.
# Other inputs that were tried here (each was overwritten before being parsed):
#   "scène religieuse ([SO] : Jacques de Voragine : Légende dorée :  ; saint Sébastien : martyre ; ange)/groupe de figures (homme : blessé ; être imaginaire : aile ; arme : épée ; flèche ; cuirasse)"
#   "[SO] : Térence : l'Eunuque"
#   "paysage"
iconography = "scène religieuse (sainte Anne) : Les apocryphes) ; figure biblique (sainte Marie)"
tree = raw_iconography_to_tree(iconography)
print(iconography)
print(tree.as_json())
print(tree.as_flattened_values())


scène religieuse (sainte Anne) : Les apocryphes) ; figure biblique (sainte Marie)
{'name': 'root', 'children': [{'name': 'scène religieuse', 'children': [{'name': 'sainte Anne', 'children': []}, {'name': 'Les apocryphes', 'children': []}]}, {'name': 'figure biblique', 'children': [{'name': 'sainte Marie', 'children': []}]}]}
['scène religieuse', 'sainte Anne', 'Les apocryphes', 'figure biblique', 'sainte Marie']


In [16]:
# Read the subject-matter CSV; DB_INPUT_SUBJECTMATTER already holds PARENT + the
# DB_INPUT_SUBJECTMATTER entry of the .env file, so the path is not rebuilt here.
SUBJECTMATTER_CSV = DB_INPUT_SUBJECTMATTER
SUBJECTMATTER_DF = pd.read_csv(SUBJECTMATTER_CSV)

# Show a few randomly sampled non-null values for every column except the key
for column in SUBJECTMATTER_DF.columns:
    if column=="recordID":
        continue
    non_null_values = SUBJECTMATTER_DF[column].dropna()
    # Series.sample raises ValueError when n exceeds the number of values, so
    # cap the sample size for sparsely filled columns.
    values = non_null_values.sample(n=min(5, len(non_null_values)))
    print(column)
    print(values)
    print()

subjectMatterSubjectTerms
275     groupe de figures (homme ; femme ; enfant : fi...
3337                        paysage (pont ; eau ; maison)
4018                      figure (homme : tête ; chapeau)
2739    scène (homme : barbe ; foule) ; architecture (...
1370                  figure (femme : nu ; fruit : pomme)
Name: subjectMatterSubjectTerms, dtype: object

subjectMatterIconographicTerms
2916     figure mythologique (Uranie ; astronomie ; muse)
4611    scène religieuse (sainte Marie ; Vierge ; Jésu...
55      figure religieuse (saint Antoine ; ermite ; te...
878     scène biblique ([SO] : Nouveau Testament : Mon...
1436        portrait (François Fétis ; artiste : musique)
Name: subjectMatterIconographicTerms, dtype: object

subjectMatterConceptualTerms
3538                                  Pays-Bas :  Haarlem
3012    Lamoral ; comte d'Egmont ; comte de Hornes ; G...
1287                            bataille des Eperons d'or
92                                               réalisme
4

The subject-matter columns are either structured or unstructured (structured = respecting the Garnier thesaurus).
Here are the structured subject matters:
- subjectMatterSubjectTerms
- subjectMatterIconographicTerms
- subjectMatterConceptualTerms

Here are the unstructured subject matters:
- subjectMatterIconographicInterpretation
- subjectMatterGeneralSubjectDescription
- subjectMatterSpecificSubjectIdentification

For each structured subject matter, we have to produce two JSON representations:
1) A flattened list of the terms
2) A json tree representation

In [116]:
def generate_structured_subject_matter_tables(columnName, recordID):
    """Parse one structured subject-matter cell of SUBJECTMATTER_DF.

    Args:
        columnName: name of the column to read.
        recordID: value of the "recordID" column identifying the row; the
            first matching row is used.

    Returns:
        tuple: (tree as nested dicts, flattened list of terms). An empty dict
        and an empty list are returned when the cell is missing.
    """
    matching_rows = SUBJECTMATTER_DF.loc[SUBJECTMATTER_DF["recordID"]==recordID]
    raw_value = matching_rows[columnName].values[0]

    # Nothing to parse for a missing cell
    if pd.isna(raw_value):
        return {}, []

    parsed_tree = raw_iconography_to_tree(str(raw_value))
    return parsed_tree.as_json(), parsed_tree.as_flattened_values()

# Test: parse one record for each structured column.
# TEST_RECORD_ID pins the record to inspect; set it to None to draw, for each
# column, a random record whose value is not null.
TEST_RECORD_ID = 230

for columnName in ["subjectMatterSubjectTerms", "subjectMatterIconographicTerms", "subjectMatterConceptualTerms"]:
    if TEST_RECORD_ID is None:
        sampled_index = SUBJECTMATTER_DF[columnName].dropna().sample(n=1).index[0]
        test_recordID = SUBJECTMATTER_DF.loc[sampled_index, "recordID"]
    else:
        test_recordID = TEST_RECORD_ID

    print("Result for column: ", columnName)
    print(SUBJECTMATTER_DF.loc[SUBJECTMATTER_DF["recordID"]==test_recordID][columnName].values[0])

    tree, flattened = generate_structured_subject_matter_tables(columnName, test_recordID)

    print(tree)
    print(flattened)
    print()

Result for column:  subjectMatterSubjectTerms
scène (femme ; homme ; enfant : nourrisson ; livre ; fond de paysage) ; scène (être imaginaire : aile ; animal : chien) ; scène (intérieur ; lit ; chandelle)scène (femme ; homme ; enfant : nourrisson ; livre ; fond de paysage) ; scène (être imaginaire : aile ; animal : chien) ; scène (intérieur ; lit ; chandelle)
['scène', '(', 'femme', ';', 'homme', ';', 'enfant', ':', 'nourrisson', ';', 'livre', ';', 'fond de paysage', ')', ';', 'scène', '(', 'être imaginaire', ':', 'aile', ';', 'animal', ':', 'chien', ')', ';', 'scène', '(', 'intérieur', ';', 'lit', ';', 'chandelle', ')', 'scène', '(', 'femme', ';', 'homme', ';', 'enfant', ':', 'nourrisson', ';', 'livre', ';', 'fond de paysage', ')', ';', 'scène', '(', 'être imaginaire', ':', 'aile', ';', 'animal', ':', 'chien', ')', ';', 'scène', '(', 'intérieur', ';', 'lit', ';', 'chandelle', ')']
{'name': 'root', 'children': [{'name': 'scène', 'children': [{'name': 'femme', 'children': []}, {'name': '

In [137]:
# For each recordID, we create a json with the following structure:
#
# {
#     "recordID": "1234567890",
#     "structured": {
#         "subjectMatterSubjectTerms": {
#             "tree": {...},
#             "flattened": [...]
#         },
#         "subjectMatterIconographicTerms": {
#             "tree": {...},
#             "flattened": [...]
#         },
#         "subjectMatterConceptualTerms": {
#             "tree": {...},
#             "flattened": [...]
#         }
#     },
#     "unstructured": {
#         "subjectMatterIconographicInterpretation": "...",
#         "subjectMatterGeneralSubjectDescription": "...",
#         "subjectMatterSpecificSubjectIdentification": "..."
#     }
# }
subjectMattersParsed = []
for recordID in tqdm(SUBJECTMATTER_DF["recordID"]):
    data = {}
    data["recordID"] = recordID
    data["structured"] = {}
    for columnName in ["subjectMatterSubjectTerms", "subjectMatterIconographicTerms", "subjectMatterConceptualTerms"]:
        tree, flattened = generate_structured_subject_matter_tables(columnName, recordID)
        data["structured"][columnName] = {
            "tree": tree,
            "flattened": flattened
        }

    data["unstructured"] = {}
    # Select the rows of this record once instead of once per column
    record_rows = SUBJECTMATTER_DF.loc[SUBJECTMATTER_DF["recordID"]==recordID]
    for columnName in ["subjectMatterIconographicInterpretation", "subjectMatterGeneralSubjectDescription", "subjectMatterSpecificSubjectIdentification"]:
        value = record_rows[columnName].values[0]
        # A missing cell is a NaN float, which json.dump would write as the
        # non-standard token NaN; store None so the saved file is valid JSON (null).
        data["unstructured"][columnName] = None if pd.isna(value) else value
    subjectMattersParsed.append(data)

subjectMattersParsed[:5]

  0%|          | 0/5301 [00:00<?, ?it/s]

100%|██████████| 5301/5301 [00:06<00:00, 859.65it/s]


[{'recordID': 64,
  'structured': {'subjectMatterSubjectTerms': {'tree': {'name': 'root',
     'children': [{'name': 'scène',
       'children': [{'name': 'homme', 'children': []},
        {'name': 'femme', 'children': []},
        {'name': 'enfant', 'children': []},
        {'name': 'vêtement', 'children': [{'name': 'habit', 'children': []}]},
        {'name': 'robe', 'children': []},
        {'name': 'ceinture', 'children': []},
        {'name': 'couvre-chef',
         'children': [{'name': 'chapeau', 'children': []}]},
        {'name': 'coiffe', 'children': []},
        {'name': 'arme', 'children': [{'name': 'épée', 'children': []}]},
        {'name': 'animal', 'children': [{'name': 'cheval', 'children': []}]},
        {'name': 'croix', 'children': []},
        {'name': 'mort', 'children': []}]},
      {'name': 'fond de paysage',
       'children': [{'name': 'ville', 'children': []},
        {'name': 'tour', 'children': []},
        {'name': 'montagne', 'children': []},
        {'na

In [138]:
# Persist the parsed subject matters as one JSON document
with open(FILE_SUBJECTMATTERS_PARSED, mode="w") as output_file:
    output_file.write(json.dumps(subjectMattersParsed))

In [139]:
# Reload the saved file (the with-block closes the handle, which the former
# json.load(open(...)) never did) and inspect one random entry.
with open(FILE_SUBJECTMATTERS_PARSED) as f:
    subjectMattersParsed = json.load(f)
# Get a random entry
random_entry = random.choice(subjectMattersParsed)
pprint.pprint(random_entry)

{'recordID': 7648,
 'structured': {'subjectMatterConceptualTerms': {'flattened': [], 'tree': {}},
                'subjectMatterIconographicTerms': {'flattened': ['portrait',
                                                                 'Ernest '
                                                                 'Meissonier',
                                                                 'Puvis de '
                                                                 'Chavannes',
                                                                 'Léon Bonnat',
                                                                 'Jean-Jacques '
                                                                 'Henner',
                                                                 'Alexandre '
                                                                 'Dumas',
                                                                 'Guy De '
                                                     