# SGV 12 Comments & Keywords

A notebook to create a data set of images from the Ernst Brunner collection that have both comments and keywords.

In [2]:
import os
import json
from collections import defaultdict

# Path to the complete, local collection data.
# Set the SGV_12N_METADATA_DIR environment variable to point the notebook at a
# different copy of the data; without it the original local path is used.
input_dir = os.environ.get(
    "SGV_12N_METADATA_DIR",
    "/Users/maxfrischknecht/mfvk_cloud/A_PhD/07_Data/SGV_12N_Metadata_full",
)

In [3]:
def reduce_record(obj):
    """Reduce one collection record to its identifier, comments and keywords.

    Returns None when the record lacks any of 'schema:identifier',
    'schema:about' or 'schema:comment'. Otherwise returns a dict with:
      - 'schema:identifier': the '@value' of the first identifier entry
      - 'schema:comment': the '@value' of every comment entry that has one
      - 'schema:about': the 'display_title' of every keyword entry that has one
    """
    required_keys = ('schema:identifier', 'schema:about', 'schema:comment')
    if not all(key in obj for key in required_keys):
        return None

    return {
        # the first identifier value is used as the id of the image
        'schema:identifier': obj['schema:identifier'][0]['@value'],
        'schema:comment': [
            entry['@value'] for entry in obj['schema:comment'] if '@value' in entry
        ],
        'schema:about': [
            entry['display_title']
            for entry in obj['schema:about']
            if 'display_title' in entry
        ],
    }


# Collect the reduced records of every JSON file in input_dir.
result = []
# os.listdir returns entries in arbitrary order; sorting keeps the order of
# `result` (and therefore of the export) the same on every run.
for file_name in sorted(os.listdir(input_dir)):
    if not file_name.endswith('.json'):
        continue

    # Read as UTF-8 explicitly instead of relying on the platform default,
    # since the metadata contains non-ASCII text.
    with open(os.path.join(input_dir, file_name), 'r', encoding='utf-8') as f:
        data = json.load(f)

    for obj in data:
        reduced_obj = reduce_record(obj)
        if reduced_obj is not None:
            result.append(reduced_obj)

In [4]:
# Spot check: show the record at index 5, then the total number of records
sample_record = result[5]
print(sample_record)
record_count = len(result)
print(record_count)

{'schema:identifier': 'SGV_12N_08021', 'schema:about': ['Murbacherstrasse 31'], 'schema:comment': ['Auf der Negativhülle (Pergamin) befindet sich folgender Stempel: B']}
18007


Aggregate all objects that share the same keywords and the same comments into a single entry, to reduce the file size.

In [6]:
# Group records that share the same keywords and the same comments
def _new_group():
    """Return an empty group that only holds a list of identifiers so far."""
    return {'schema:identifier': []}


aggregated_result = defaultdict(_new_group)

for record in result:
    keywords = record['schema:about']
    comments = record['schema:comment']
    # lists are not hashable, so the group key is built from tuples
    group = aggregated_result[(tuple(keywords), tuple(comments))]
    group['schema:identifier'].append(record['schema:identifier'])
    group['schema:about'] = keywords
    group['schema:comment'] = comments

# Keep only the grouped objects, in insertion order, as a plain list
final_result = [*aggregated_result.values()]
print(len(final_result))
print(final_result[6])

6606
{'schema:identifier': ['SGV_12N_08298'], 'schema:about': ['Valorisierung: Publikation Ausstellung passadis- über alle Berge'], 'schema:comment': ['Auf der Negativhülle (Pergamin) befindet sich oben rechts und in der Mitte je ein roter Punkt.']}


In [7]:
# Write the aggregated records to disk as JSON.
export_path = './export/sgv-12_comments_keywords.json'

# open() does not create missing directories, so make sure the export folder
# exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(os.path.dirname(export_path), exist_ok=True)

# ensure_ascii=False writes umlauts and other non-ASCII characters as-is
# instead of \uXXXX escapes, which keeps the file smaller and readable. This
# requires an explicit UTF-8 encoding on the file handle.
with open(export_path, 'w', encoding='utf-8') as json_file:
    json.dump(final_result, json_file, indent=4, ensure_ascii=False)

print("Data has been exported")

Data has been exported
