In [None]:
import pandas as pd

## Step 1: Read CSV files into DataFrames

In [None]:
# Load the final metadata table of each collection that needs no special
# read options; the targets are assigned in the same order as the paths.
met_df, semart_df, rijksmuseum_df, wikiart_df, gac_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './Met/output/met_metadata_final.csv',
        './SemArt/output/semart_metadata_final.csv',
        './Rijksmuseum/output/rijksmuseum_metadata_final.csv',
        './Wikiart/output/wikiart_metadata_final.csv',
        './GAC/output/gac_metadata_final.csv',
    )
)

# For the Ukiyo-e file the first CSV column is used as the DataFrame index.
ukiyoe_df = pd.read_csv(
    './Ukiyo-e/output/ukiyoe_metadata_final.csv',
    index_col=0,
)

In [None]:
# Preview the loaded Met metadata table (a bare last expression is rendered
# by the notebook as the cell's output).
met_df

## Step 2: Rename overlapping fields

In [None]:
# Map each dataset's source column names onto the shared field names.
# Columns that already carry the shared name (the former identity entries such
# as 'title': 'title') need no mapping, because renaming a column to itself
# changes nothing. Each result is reassigned instead of mutated with
# inplace=True, so every statement reads as "new frame from old frame".
met_df = met_df.rename(columns={'medium': 'technique'})

semart_df = semart_df.rename(columns={
    'IMAGE_FILE': 'image_file',
    'DESCRIPTION': 'description',
    'AUTHOR': 'artist',
    'TITLE': 'title',
    'TECHNIQUE': 'technique',
    'DATE': 'date',
    'TYPE': 'type',
    'SCHOOL': 'school',
    'SPLIT': 'split',
    'TIMEFRAME': 'timeframe',
})

rijksmuseum_df = rijksmuseum_df.rename(columns={
    'filename': 'image_file',
    'creator': 'artist',
})

ukiyoe_df = ukiyoe_df.rename(columns={'artistString': 'artist'})
# Drop the 'Unnamed: 42' column. errors='ignore' keeps this cell re-runnable:
# without it, a second execution raises KeyError because the column has
# already been removed.
ukiyoe_df = ukiyoe_df.drop(columns='Unnamed: 42', errors='ignore')

wikiart_df = wikiart_df.rename(columns={
    'filename': 'image_file',
    'genre': 'type',
})

gac_df = gac_df.rename(columns={
    'artwork_path': 'image_file',
    'main_text': 'description',
    'creator': 'artist',
})

## Step 3: Convert each DataFrame to XML 

In [None]:
import xml.etree.ElementTree as ET 

# Code points below U+0020 that XML 1.0 does not allow in character data
# (everything except tab, line feed and carriage return).
_XML_FORBIDDEN_CONTROLS = {
    code: None for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
}


def _xml_tag(label):
    """Turn an arbitrary column label into a parseable XML element name.

    ASCII characters other than letters, digits, '_', '-' and '.' (for example
    spaces, or ':' which would be read as a namespace prefix) are replaced by
    '_'. Non-ASCII characters are left untouched. A leading ASCII character
    that is not a letter or '_' gets a '_' prefix, as does an empty label.
    """
    name = ''.join(
        ch if (not ch.isascii() or ch.isalnum() or ch in '_-.') else '_'
        for ch in str(label)
    )
    if not name:
        return '_'
    first = name[0]
    if first.isascii() and not (first.isalpha() or first == '_'):
        name = '_' + name
    return name


def dataframe_to_xml(df, root_name, row_name):
    """Convert a DataFrame to an XML string.

    Every row of ``df`` becomes one ``row_name`` element under a single
    ``root_name`` root; every column becomes a child element of that row whose
    text is ``str(value)``.

    Compared with a naive conversion this function guarantees that the result
    can be parsed back:

    * column labels are passed through ``_xml_tag`` so they are valid tags;
    * missing values (None / NaN / NaT / pd.NA) yield an empty element
      instead of the literal text "nan";
    * control characters forbidden by XML 1.0 are removed from the text.

    Rows are read with ``itertuples`` so each column keeps its own dtype
    (``iterrows`` would upcast mixed numeric rows, turning 1 into "1.0").

    Args:
        df: The pandas DataFrame to serialise.
        root_name: Tag of the root element (used as given).
        row_name: Tag of each per-row element (used as given).

    Returns:
        The XML document as a ``str`` (no XML declaration).
    """
    # Sanitise the column labels once instead of once per row.
    tags = [_xml_tag(column) for column in df.columns]

    root = ET.Element(root_name)
    for values in df.itertuples(index=False, name=None):
        row_elem = ET.SubElement(root, row_name)
        for tag, value in zip(tags, values):
            field_elem = ET.SubElement(row_elem, tag)
            # is_scalar guards pd.isna against list-like cell values, for
            # which it would return an array rather than a single bool.
            if pd.api.types.is_scalar(value) and pd.isna(value):
                continue  # leave the element empty for missing data
            field_elem.text = str(value).translate(_XML_FORBIDDEN_CONTROLS)
    return ET.tostring(root, encoding='unicode')
    
# Serialise each dataset: the root tag names the collection and every record
# is emitted as an <artwork> element.
met_xml = dataframe_to_xml(df=met_df, root_name='Met', row_name='artwork')
semart_xml = dataframe_to_xml(df=semart_df, root_name='SemArt', row_name='artwork')
rijksmuseum_xml = dataframe_to_xml(df=rijksmuseum_df, root_name='Rijksmuseum', row_name='artwork')
ukiyoe_xml = dataframe_to_xml(df=ukiyoe_df, root_name='Ukiyo-e', row_name='artwork')
wikiart_xml = dataframe_to_xml(df=wikiart_df, root_name='WikiArt', row_name='artwork')
gac_df_xml = dataframe_to_xml(df=gac_df, root_name='GAC', row_name='artwork')

## Step 4: Merge the XMLs into a hierarchical structure

In [None]:
# Parse every serialised dataset back into its own ElementTree.
(met_tree, semart_tree, rijksmuseum_tree,
 ukiyoe_tree, wikiart_tree, gac_tree) = [
    ET.ElementTree(ET.fromstring(xml_text))
    for xml_text in (
        met_xml,
        semart_xml,
        rijksmuseum_xml,
        ukiyoe_xml,
        wikiart_xml,
        gac_df_xml,
    )
]

# Attach the per-dataset roots, in order, under one <Datasets> element.
# NOTE(review): met_tree is parsed above but deliberately not attached here
# (its append was commented out), so Met records are absent from the merged
# tree - confirm that this is intended.
datasets_root = ET.Element('Datasets')
datasets_root.extend([
    dataset_tree.getroot()
    for dataset_tree in (
        semart_tree,
        rijksmuseum_tree,
        ukiyoe_tree,
        wikiart_tree,
        gac_tree,
    )
])

## Step 5: Save the combined XML to a file

In [None]:
# Wrap the merged <Datasets> element in a tree and write it to disk.
# The file is written as explicit UTF-8 with an XML declaration so that its
# encoding is stated in the file itself; encoding='unicode' emits no
# declaration and leaves the byte encoding of the file implicit.
combined_tree = ET.ElementTree(datasets_root)
combined_tree.write('merged_datasets.xml', encoding='utf-8', xml_declaration=True)

print("Merged XML saved as 'merged_datasets.xml'.")