# Convert LIDO XML to CSV format (Bremen repository)

### Create a csv file for all data ###

In [9]:
import os
import csv
import pandas as pd
import numpy as np
from lxml import etree
import requests

# Directory containing the XML files
folder_path = "../data/bremen/lido"
namespace = {"lido": "http://www.lido-schema.org"}

# Output CSV file path
output_csv = "../output_csv/bremen_output_with_placeOfBirth.csv"

# Initialize a list to store all record data
data = []
artist_info_dict = {}

# Loop through each subfolder in the lido directory
for subfolder in os.listdir(folder_path):
    subfolder_path = os.path.join(folder_path, subfolder)
    if os.path.isdir(subfolder_path):
        # Loop through each XML file in the subfolder
        for filename in os.listdir(subfolder_path):
            if filename.endswith(".xml"):
                filepath = os.path.join(subfolder_path, filename)
                tree = etree.parse(filepath)

                # Fields
                lido_record_id = tree.xpath("string(//lido:lidoRecID)", namespaces=namespace)
                image_urls = tree.xpath("//lido:linkResource/text()", namespaces=namespace)
                types = tree.xpath("//lido:objectWorkType/lido:term/text()", namespaces=namespace)
                materials = tree.xpath("//lido:termMaterialsTech/lido:term/text()", namespaces=namespace)
                display_date = tree.xpath("string(//lido:eventDate/lido:displayDate)", namespaces=namespace)
                earliest_date = tree.xpath("string(//lido:eventDate/lido:date/lido:earliestDate)", namespaces=namespace)
                latest_date = tree.xpath("string(//lido:eventDate/lido:date/lido:latestDate)", namespaces=namespace)
                subjects = tree.xpath("//lido:subjectConcept/lido:term/text()", namespaces=namespace)
                artist_name = tree.xpath("string(//lido:nameActorSet/lido:appellationValue[@lido:pref='preferred'])", namespaces=namespace)
                actor_gnd_id = tree.xpath("string(//lido:actorID[@lido:pref='preferred'])", namespaces=namespace)
                gender = None
                placeOfBirth = None
                placeOfDeath = None
                placeOfActivity = None
                associatedCountry = None
                role = tree.xpath("string(//lido:roleActor/lido:term)", namespaces=namespace)
                birth = tree.xpath("string(//lido:vitalDatesActor/lido:earliestDate)", namespaces=namespace)
                death = tree.xpath("string(//lido:vitalDatesActor/lido:latestDate)", namespaces=namespace)
                location = tree.xpath("string(//lido:repositoryLocation/lido:namePlaceSet/lido:appellationValue)", namespaces=namespace)
                classifications = tree.xpath("//lido:classification/lido:term/text()", namespaces=namespace)
                titles = tree.xpath("//lido:titleSet/lido:appellationValue/text()", namespaces=namespace)
                rights_statements = tree.xpath("//lido:rightsResource/lido:rightsType/lido:conceptID[@lido:type='http://terminology.lido-schema.org/lido00099']/text()", namespaces=namespace)
                work_id = tree.xpath("string(//lido:workID)", namespaces=namespace)
                repository_name = tree.xpath("//lido:legalBodyName/lido:appellationValue/text()", namespaces=namespace)
                legal_body_link = tree.xpath("string(//lido:legalBodyWeblink)", namespaces=namespace)
                record_id = tree.xpath("string(//lido:recordID)", namespaces=namespace)
                record_links = tree.xpath("//lido:recordInfoLink/text()", namespaces=namespace)
                record_metadata_date = tree.xpath("string(//lido:recordMetadataDate)", namespaces=namespace)
                event_type = tree.xpath("string(//lido:eventType/lido:term)", namespaces=namespace)
                display_materials = tree.xpath("//lido:displayMaterialsTech/text()", namespaces=namespace)
                inscriptions = tree.xpath("//lido:inscriptionDescription/lido:descriptiveNoteValue/text()", namespaces=namespace)
                related_work_notes = tree.xpath("//lido:objectNote/text()", namespaces=namespace)
                resource_types = tree.xpath("//lido:resourceType/lido:term/text()", namespaces=namespace)
                credit_line = tree.xpath("string(//lido:creditLine)", namespaces=namespace)
                object_description = tree.xpath("string(//lido:objectDescriptionSet/lido:descriptiveNoteValue)", namespaces=namespace)
                display_measurements = tree.xpath("string(//lido:displayObjectMeasurements)", namespaces=namespace)


                # Get gender and nationality of the artist from entity facts with his/her gnd id
                if actor_gnd_id != "":
                    actor_gnd_id = actor_gnd_id.split("/")[-1]

                    if (actor_gnd_id in artist_info_dict):
                        gender = artist_info_dict[actor_gnd_id]["gender"]
                        placeOfBirth = artist_info_dict[actor_gnd_id]["placeOfBirth"]
                        placeOfDeath = artist_info_dict[actor_gnd_id]["placeOfDeath"]
                        placeOfActivity = artist_info_dict[actor_gnd_id]["placeOfActivity"]
                        associatedCountry = artist_info_dict[actor_gnd_id]["associatedCountry"]
                    else:
                        url = f"https://hub.culturegraph.org/entityfacts/{actor_gnd_id}"
                        response = requests.get(url)

                        if response.status_code == 200:
                            gnd_data = response.json()
                            gender = gnd_data.get("gender", {}).get("label", "")
                            placeOfBirth = "; ".join(
                                [entry.get("preferredName", "") for entry in gnd_data.get("placeOfBirth", [])]
                            ) or ""

                            placeOfDeath = "; ".join(
                                [entry.get("preferredName", "") for entry in gnd_data.get("placeOfDeath", [])]
                            ) or ""

                            placeOfActivity = "; ".join(
                                [entry.get("preferredName", "") for entry in gnd_data.get("placeOfActivity", [])]
                            ) or ""

                            associatedCountry_list = gnd_data.get("associatedCountry", [])
                            associatedCountry = "; ".join([n.get("preferredName", "") for n in associatedCountry_list]) if associatedCountry_list else ""
                            
                            artist_info_dict[actor_gnd_id] = {"gender": gender, "placeOfBirth": placeOfBirth, "placeOfDeath": placeOfDeath, "placeOfActivity": placeOfActivity, "associatedCountry": associatedCountry}
                        else:
                            print(f"Failed to fetch GND data: {response.status_code} for {url}")

                data.append({
                    "lidoRecordId": lido_record_id,
                    "imageUrl": "; ".join(image_urls),
                    "type": "; ".join(types),
                    "material": "; ".join(materials),
                    "displayDate": display_date,
                    "earliestDate": earliest_date,
                    "latestDate": latest_date,
                    "subject": "; ".join(subjects),
                    "artistName (preferred)": artist_name,
                    "artistGNDId": actor_gnd_id,
                    "genderOfArtist": gender,
                    "placeOfBirth": placeOfBirth,
                    "placeOfDeath": placeOfDeath,
                    "placeOfActivity": placeOfActivity,
                    "associatedCountry": associatedCountry,
                    "role": role,
                    "birth year": birth,
                    "death year": death,
                    "location": location,
                    "title": "; ".join(titles),
                    "classification": "; ".join(classifications),
                    "rightsStatement": "; ".join(rights_statements),
                    "workID": work_id,
                    "repositoryName": repository_name,
                    "recordID": record_id,
                    "recordLinks": "; ".join(record_links),
                    "recordMetadataDate": record_metadata_date,
                    "eventType": event_type,
                    "displayMaterialsTech": "; ".join(display_materials),
                    "displayMeasurements": display_measurements,
                    "objectDescription": object_description,
                    "inscriptions": "; ".join(inscriptions),
                    "relatedWorkNotes": "; ".join(related_work_notes),
                    "resourceType": "; ".join(resource_types),
                    "creditLine": credit_line
                })

# Convert to DataFrame and save to CSV
df = pd.DataFrame(data)
df.replace("", np.nan, inplace=True)
df.to_csv(output_csv, index=False, encoding="utf-8")

Failed to fetch GND data: 404 for https://hub.culturegraph.org/entityfacts/143146866
Failed to fetch GND data: 404 for https://hub.culturegraph.org/entityfacts/160413494
Failed to fetch GND data: 404 for https://hub.culturegraph.org/entityfacts/160413494
Failed to fetch GND data: 404 for https://hub.culturegraph.org/entityfacts/160413494
Failed to fetch GND data: 404 for https://hub.culturegraph.org/entityfacts/160413494
Failed to fetch GND data: 404 for https://hub.culturegraph.org/entityfacts/160413494
Failed to fetch GND data: 404 for https://hub.culturegraph.org/entityfacts/160413494
Failed to fetch GND data: 404 for https://hub.culturegraph.org/entityfacts/160413494
Failed to fetch GND data: 404 for https://hub.culturegraph.org/entityfacts/160413494
Failed to fetch GND data: 404 for https://hub.culturegraph.org/entityfacts/160413494
Failed to fetch GND data: 404 for https://hub.culturegraph.org/entityfacts/160413494
Failed to fetch GND data: 404 for https://hub.culturegraph.org/en

In [11]:
import pandas as pd

# Output CSV file path
output_csv = "../output_csv/bremen_output_with_placeOfBirth.csv"

# Load and display the CSV as a table
df_bremen = pd.read_csv(output_csv)
print(f"The full dataset has {df_bremen.shape[0]} rows and {df_bremen.shape[1]} columns.")
print("First ten rows of the dataset")
df_bremen.head(10)

The full dataset has 29740 rows and 35 columns.
First ten rows of the dataset


Unnamed: 0,lidoRecordId,imageUrl,type,material,displayDate,earliestDate,latestDate,subject,artistName (preferred),artistGNDId,...,recordLinks,recordMetadataDate,eventType,displayMaterialsTech,displayMeasurements,objectDescription,inscriptions,relatedWorkNotes,resourceType,creditLine
0,DE-MUS-027614/lido/100104,https://onlinekatalog.kunsthalle-bremen.de/DE-...,Druckgraphik; Druckgraphik,Lithographie; Lithographie (Druckverfahren),1838,1838.0,1838.0,Stadtansicht (allgemein); Vedute (+ Stadt(ansi...,"Raffet, Denis Auguste Marie",118787683,...,https://onlinekatalog.kunsthalle-bremen.de/DE-...,2024-11-19T07:56:39.539,Herstellung,Lithographie,Darstellung: 169 x 275 mm,,verso: Johann Friedrich Lahmann; Lugt 1656c; r...,,image,"Kunsthalle Bremen - Der Kunstverein in Bremen,..."
1,DE-MUS-027614/lido/100109,https://onlinekatalog.kunsthalle-bremen.de/DE-...,Druckgraphik; Druckgraphik,Radierung; Radierung (Druckverfahren),1874,1874.0,1874.0,"Jäger; männlich; auf einem Pferd, Esel oder Ma...","Detaille, Édouard",118671685,...,https://onlinekatalog.kunsthalle-bremen.de/DE-...,2024-11-19T08:13:18.072,Herstellung,Radierung,Platte: 317 x 237 mm,,verso: E.M.; Lugt nicht bei Lugt; oben links o...,,image,"Kunsthalle Bremen - Der Kunstverein in Bremen,..."
2,DE-MUS-027614/lido/100112,https://onlinekatalog.kunsthalle-bremen.de/DE-...,Druckgraphik; Druckgraphik,Kupferstich; Kupferstich (Druckverfahren); Kup...,1850,1850.0,1850.0,"historische Person (MEMLING, Hans) - Porträt e...","Desvachez, David",1166576817,...,https://onlinekatalog.kunsthalle-bremen.de/DE-...,2024-11-19T05:59:05.529,Herstellung,Kupferstich,Platte: 370 x 280 mm,,unten links unterhalb der Darstellung bezeichn...,,image,"Kunsthalle Bremen - Der Kunstverein in Bremen,..."
3,DE-MUS-027614/lido/100113,https://onlinekatalog.kunsthalle-bremen.de/DE-...,Druckgraphik; Druckgraphik,,1880,1880.0,1880.0,Soldat ohne Dienstgrad; Soldat; Krieger; Schuß...,"Detaille, Édouard",118671685,...,https://onlinekatalog.kunsthalle-bremen.de/DE-...,2024-11-19T08:04:34.935,Herstellung,"Lithographie in Schwarz, Weiß, Rot; Tonplatte ...",Darstellung: 290 x 221 mm,,oben Mitte oberhalb der Darstellung bezeichnet...,,image,"Kunsthalle Bremen - Der Kunstverein in Bremen,..."
4,DE-MUS-027614/lido/100114,https://onlinekatalog.kunsthalle-bremen.de/DE-...,Druckgraphik; Druckgraphik,Lithographie; Lithographie (Druckverfahren),1816,1816.0,1816.0,Liebespaar; Paar; Szene aus: Bernardin de Sain...,"Desenne, Alexandre Joseph",121242986,...,https://onlinekatalog.kunsthalle-bremen.de/DE-...,2024-11-19T05:59:23.453,Herstellung,Lithographie,Darstellung: 290 x 221 mm,,unten links: Kunsthalle Bremen; Lugt 292; unte...,,image,"Kunsthalle Bremen - Der Kunstverein in Bremen,..."
5,DE-MUS-027614/lido/100115,https://onlinekatalog.kunsthalle-bremen.de/DE-...,Druckgraphik; Druckgraphik,Lithographie; Lithographie (Druckverfahren),1827-1880,1827.0,1880.0,"Volkstracht, regionale Tracht; Folklore; Kostü...","Desmaisons, Emilien",123098556,...,https://onlinekatalog.kunsthalle-bremen.de/DE-...,2024-11-19T05:58:40.088,Herstellung,Lithographie,Darstellung: 270 x 213 mm,,unten links: Kunsthalle Bremen; Lugt 292; unte...,,image,"Kunsthalle Bremen - Der Kunstverein in Bremen,..."
6,DE-MUS-027614/lido/100116,https://onlinekatalog.kunsthalle-bremen.de/DE-...,Druckgraphik; Druckgraphik,Radierung; Radierung (Druckverfahren),1863-1868,1863.0,1868.0,Namen von Städten und Dörfern (MEUDON); Stadt;...,"Desbrosses, Léopold",121907120,...,https://onlinekatalog.kunsthalle-bremen.de/DE-...,2024-11-19T05:59:14.887,Herstellung,Radierung,Platte: 155 x 238 mm,,oben links oberhalb der Darstellung am Blattra...,,image,"Kunsthalle Bremen - Der Kunstverein in Bremen,..."
7,DE-MUS-027614/lido/100117,https://onlinekatalog.kunsthalle-bremen.de/DE-...,Druckgraphik; Druckgraphik,Lithographie; Lithographie (Druckverfahren),1818,1818.0,1818.0,Stadtmauern; Wehrmauer; Stadtbefestigung; Name...,"Deseynes, A.",117500518,...,https://onlinekatalog.kunsthalle-bremen.de/DE-...,2024-11-19T08:12:53.82,Herstellung,Lithographie,Darstellung: 199 x 263 mm,,oben Mitte: J. G. Heyse Bremen; Lugt nicht bei...,,image,"Kunsthalle Bremen - Der Kunstverein in Bremen,..."
8,DE-MUS-027614/lido/100118,https://onlinekatalog.kunsthalle-bremen.de/DE-...,Druckgraphik; Druckgraphik,Radierung; Radierung (Druckverfahren),1879,1879.0,1879.0,historische Personen - BB - Frau (+ Dreivierte...,"Desboutin, Marcellin",116081708,...,https://onlinekatalog.kunsthalle-bremen.de/DE-...,2024-11-19T08:10:17.454,Herstellung,Radierung,Platte: 120 x 81 mm,,unten rechts unterhalb der Darstellung monogra...,,image,"Kunsthalle Bremen - Der Kunstverein in Bremen,..."
9,DE-MUS-027614/lido/100119,https://onlinekatalog.kunsthalle-bremen.de/DE-...,Druckgraphik; Druckgraphik,Radierung; Radierung (Druckverfahren); Aquatin...,1879,1879.0,1879.0,"historische Person (BEREND, Edward) - Porträt ...","Desboutin, Marcellin",116081708,...,https://onlinekatalog.kunsthalle-bremen.de/DE-...,2024-11-19T08:03:28.473,Herstellung,Radierung und Aquatinta,Platte: 240 x 160 mm,,verso: E.M.; Lugt nicht bei Lugt; oben links o...,,image,"Kunsthalle Bremen - Der Kunstverein in Bremen,..."
