# SGV 12 Comments & Dates

A notebook to create a data set of images from the Ernst Brunner collection that contain comments and dates.

In [8]:
import os
import json
from collections import defaultdict

# Path to the complete, local collection data.
# Set the SGV_12N_METADATA_DIR environment variable to point the notebook at
# another copy of the data; otherwise the original local path is used.
input_dir = os.environ.get(
    "SGV_12N_METADATA_DIR",
    "/Users/maxfrischknecht/mfvk_cloud/A_PhD/07_Data/SGV_12N_Metadata_full",
)

In [3]:
# Collect, from every JSON file in input_dir, the records that have an
# identifier, dates ('schema:temporal') and comments ('schema:comment').
result = []

# os.listdir() yields entries in arbitrary order; sorting makes the order of
# `result` (and positional samples such as result[5]) reproducible across runs.
for file_name in sorted(os.listdir(input_dir)):
    if not file_name.endswith('.json'):
        continue

    # Decode explicitly as UTF-8 rather than with the platform default, so
    # non-ASCII comment text is read identically on every machine.
    with open(os.path.join(input_dir, file_name), 'r', encoding='utf-8') as f:
        data = json.load(f)

    # place to save the data extracted from this file
    reduced_data = []

    for obj in data:

        # keep only images that have an id, dates and comments
        if 'schema:identifier' in obj and 'schema:temporal' in obj and 'schema:comment' in obj:
            reduced_obj = {
                # use the first identifier value as id
                'schema:identifier': obj['schema:identifier'][0]['@value'],
                # dates: keep only the entries that carry an '@value'
                'schema:temporal': [
                    item['@value'] for item in obj['schema:temporal'] if '@value' in item
                ],
                # comments: keep only the entries that carry an '@value'
                'schema:comment': [
                    item['@value'] for item in obj['schema:comment'] if '@value' in item
                ],
            }

            # save the new object with its properties in the per-file list
            reduced_data.append(reduced_obj)

    # save the list of reduced objects in the global list
    result.extend(reduced_data)

In [7]:
# Sanity check: show one sample record, then the total number of records.
print(result[5], len(result), sep="\n")

{'schema:identifier': 'SGV_12N_07517', 'schema:temporal': ['1942-05'], 'schema:comment': ['Auf der Negativhülle (Pergamin) befindet sich folgender Stempel: B']}
28675


Aggregate all objects that share the same comment and the same date into a single entry to reduce the file size.

In [10]:
# Group records by their (dates, comments) combination; each group collects
# the identifiers of all records sharing that combination.
aggregated_result = defaultdict(lambda: {'schema:identifier': []})

for record in result:
    dates, comments = record['schema:temporal'], record['schema:comment']
    # lists are not hashable, so the grouping key is built from tuples
    group = aggregated_result[(tuple(dates), tuple(comments))]
    group['schema:identifier'].append(record['schema:identifier'])
    group.update({'schema:temporal': dates, 'schema:comment': comments})

# Flatten the groups into a list and show its size plus one sample group
final_result = [*aggregated_result.values()]
print(len(final_result), final_result[5], sep="\n")

3391
{'schema:identifier': ['SGV_12N_07561', 'SGV_12N_07562', 'SGV_12N_07563', 'SGV_12N_07564', 'SGV_12N_07565', 'SGV_12N_07566', 'SGV_12N_07567', 'SGV_12N_07568', 'SGV_12N_07569', 'SGV_12N_07570', 'SGV_12N_07571', 'SGV_12N_07572', 'SGV_12N_07573', 'SGV_12N_07574', 'SGV_12N_07575', 'SGV_12N_07576', 'SGV_12N_07577', 'SGV_12N_07578', 'SGV_12N_07579', 'SGV_12N_07580', 'SGV_12N_07581'], 'schema:temporal': ['1942-06'], 'schema:comment': ['Auf der Negativhülle (Pergamin) befindet sich folgender Stempel: B']}


In [11]:
# Write the aggregated records to disk as JSON.
export_path = './export/sgv-12_comments_dates.json'

# Create the export folder if it is missing, so a fresh checkout does not
# fail with FileNotFoundError.
os.makedirs(os.path.dirname(export_path), exist_ok=True)

# ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX
# escapes (smaller, human-readable file); the file is therefore opened with an
# explicit UTF-8 encoding rather than the platform default.
with open(export_path, 'w', encoding='utf-8') as json_file:
    json.dump(final_result, json_file, indent=4, ensure_ascii=False)

print("Data has been exported")

Data has been exported
