# SGV12 Dates

A notebook that creates a data set of images from the Ernst Brunner collection, grouped by the dates recorded in their `schema:temporal` field.

In [2]:
import os
import json
import csv
from collections import defaultdict

# Path to the complete, local collection data (a directory of .json files).
# Set the SGV_12N_METADATA_DIR environment variable to run the notebook on
# another machine; the original author's local path is kept as the default.
input_dir = os.environ.get(
    "SGV_12N_METADATA_DIR",
    "/Users/maxfrischknecht/mfvk_cloud/A_PhD/07_Data/SGV_12N_Metadata_full",
)

In [3]:
# Aggregate every date found in `schema:temporal` across the collection:
# for each distinct date string, count how often it occurs and collect the
# identifier of each image that carries it.
result = []
temp_result = defaultdict(lambda: {"count": 0, "image_ids": []})

# Variables to store total counts
total_objects = 0
total_objects_with_dates = 0
total_dates_given = 0
total_dates_unique = 0

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# the aggregated labels (and therefore the exported file) stable across runs.
for file_name in sorted(os.listdir(input_dir)):
    if not file_name.endswith('.json'):
        continue

    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(os.path.join(input_dir, file_name), 'r', encoding='utf-8') as f:
        data = json.load(f)

    for item in data:
        total_objects += 1

        # Objects without a 'schema:temporal' field carry no date at all
        if 'schema:temporal' not in item:
            continue
        total_objects_with_dates += 1

        for about in item['schema:temporal']:
            # Extract the date string; entries with an empty/missing value are skipped
            temporal = about.get('@value')
            if not temporal:
                continue

            # Increment the count for the temporal
            temp_result[temporal]['count'] += 1
            total_dates_given += 1

            # Record the object's first identifier as the image id. A missing
            # or empty 'schema:identifier' is skipped, so 'image_ids' can be
            # shorter than 'count'.
            identifiers = item.get('schema:identifier')
            if identifiers:
                temp_result[temporal]['image_ids'].append(identifiers[0]['@value'])

# Convert the temp_result dictionary to the required list-of-records format.
# The loop variable is not called `data`, so the loaded JSON is not clobbered.
result.extend(
    {
        "label": label,
        "count": entry["count"],
        "image_ids": entry["image_ids"],
    }
    for label, entry in temp_result.items()
)

total_dates_unique = len(temp_result)

In [4]:
# Summary statistics of the aggregation.
print(f"Total objects: {total_objects}")
print(f"Total objects with dates: {total_objects_with_dates}")
print(f"Total dates given: {total_dates_given}")
print(f"Total unique dates given: {total_dates_unique}")

# Share of objects that have a 'schema:temporal' field. Guard the division so
# an empty (or wrong) input directory produces a readable message instead of
# a ZeroDivisionError.
percentage = (total_objects_with_dates / total_objects) * 100 if total_objects else None
if percentage is None:
    print("Percentage of dates to images: n/a (no objects found)")
else:
    print(f"Percentage of dates to images: {percentage:.2f}%")

Total objects: 47837
Total objects with dates: 33539
Total dates given: 39489
Total unique dates given: 574
Percentage of dates to images: 70.11%


In [5]:
# Write the aggregated records to disk as pretty-printed JSON.
export_path = './export/sgv-12_dates.json'

# Create the target directory on a fresh checkout; open(..., 'w') does not
# create missing parent directories and would raise FileNotFoundError.
os.makedirs(os.path.dirname(export_path), exist_ok=True)

with open(export_path, 'w', encoding='utf-8') as json_file:
    json.dump(result, json_file, indent=4)

print("Data has been exported")

Data has been exported
