# SGV 12 Locations

A notebook to create a data set of images from the Ernst Brunner collection that contain location information.

In [2]:
import os
import json
from collections import defaultdict

# Path to the complete, local, collection data.
# Set the SGV_INPUT_DIR environment variable to point the notebook at a
# different directory; when it is unset or empty the original path is used.
input_dir = os.environ.get("SGV_INPUT_DIR") or "/Users/maxfrischknecht/mfvk_cloud/A_PhD/07_Data/SGV_12N_Metadata_full"

In [3]:
# Aggregate every object that carries a location and/or coordinates, grouped
# by its location label.
#   result      -- list of export records, one per distinct label
#   temp_result -- label -> {"count", "image_ids", "coordinates", "location"}
result = []
temp_result = defaultdict(lambda: {"count": 0, "image_ids": []})

# Variables to store total counts
total_objects = 0
total_objects_with_locations = 0
total_objects_with_coordinates = 0
total_locations_given = 0
total_locations_unique = 0


def _last_value(entries, key):
    """Return the last truthy ``entry.get(key)`` among ``entries``, or None."""
    found = None
    for entry in entries:
        value = entry.get(key)
        if value:
            found = value
    return found


# os.listdir() returns entries in arbitrary order; sorting makes the order of
# `result` (and which coordinates win for a repeated label) reproducible.
for file_name in sorted(os.listdir(input_dir)):
    if not file_name.endswith('.json'):
        continue

    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(os.path.join(input_dir, file_name), 'r', encoding='utf-8') as f:
        items = json.load(f)

    for item in items:
        total_objects += 1
        location = None
        coordinates = None

        # Get the 'schema:location' field, if it exists. The object is counted
        # as soon as the key is present, even if no entry has a display title.
        if 'schema:location' in item:
            total_objects_with_locations += 1
            location = _last_value(item['schema:location'], 'display_title')

        # Get the 'schema:geo' field, if it exists (counted the same way).
        if 'schema:geo' in item:
            total_objects_with_coordinates += 1
            coordinates = _last_value(item['schema:geo'], '@value')

        # Objects with neither a location nor coordinates are not exported.
        if not (location or coordinates):
            continue

        # Fill the missing half with a placeholder.
        # NOTE: every object that has coordinates but no location ends up under
        # the single label "unknown".
        if not coordinates:
            coordinates = "unknown"
        elif not location:
            location = "unknown"

        entry = temp_result[location]
        entry['count'] += 1
        # Overwritten on every hit: the last object seen for a label decides
        # the coordinates stored for it.
        entry['coordinates'] = coordinates
        entry['location'] = location
        total_locations_given += 1

        # Record the first identifier's @value as the image id. An empty
        # identifier list or a first entry without '@value' is skipped instead
        # of raising IndexError/KeyError.
        identifiers = item.get('schema:identifier')
        if identifiers and '@value' in identifiers[0]:
            entry['image_ids'].append(identifiers[0]['@value'])


# Convert the temp_result dictionary to the required format
for label, entry in temp_result.items():
    result.append({
        "label": label,
        "count": entry["count"],
        "image_ids": entry["image_ids"],
        "coordinates": entry["coordinates"],
        "location": entry["location"]
    })

total_locations_unique = len(temp_result)

In [6]:
# Summary of the aggregation: raw totals first, then the share of objects
# that carry a location / coordinates.
print(f"Total objects: {total_objects}")
print(f"Total objects with locations: {total_objects_with_locations}")
print(f"Total objects with coordinates: {total_objects_with_coordinates}")
print(f"Total locations given: {total_locations_given}")
print(f"Total unique locations/coordinates given: {total_locations_unique}")

# Guard against an empty collection (no objects read): report 0.00% instead
# of raising ZeroDivisionError.
percentageLoc = (total_objects_with_locations / total_objects) * 100 if total_objects else 0.0
print(f"Percentage of locations to images: {percentageLoc:.2f}%")

percentageGeo = (total_objects_with_coordinates / total_objects) * 100 if total_objects else 0.0
print(f"Percentage of coordinates to images: {percentageGeo:.2f}%")

Total objects: 47837
Total objects with locations: 42107
Total objects with coordinates: 42107
Total locations given: 42107
Total unique locations/coordinates given: 1087
Percentage of locations to images: 88.02%
Percentage of coordinates to images: 88.02%


In [5]:
# Write the aggregated location records to disk as pretty-printed JSON.
export_path = './export/sgv-12_locations.json'

# Create the export directory on first run so open() does not fail with
# FileNotFoundError; exist_ok makes re-running the cell safe.
os.makedirs(os.path.dirname(export_path), exist_ok=True)

with open(export_path, 'w', encoding='utf-8') as json_file:
    json.dump(result, json_file, indent=4)

print("Data has been exported")

Data has been exported
