# SGV 12 Keyword Co-Occurence Network Graph

A notebook that analyses co-occurences of keywords and exports a json file for network graph visualization. 

In [56]:
import pandas as pd
from collections import defaultdict
import json

input_dir = "./export/sgv-12_keywords_ids.json"

## Load the Objects with Keywords

In [57]:
with open(input_dir, "r") as file:
    data = json.load(file)

print("Available images with keywords:", len(data))

# Convert the data to a pandas DataFrame
df = pd.DataFrame(data)
print(df[:5])

Available images with keywords: 22364
  schema:identifier                      schema:about
0     SGV_12N_07501             [Murbacherstrasse 31]
1     SGV_12N_07502             [Murbacherstrasse 31]
2     SGV_12N_07503             [Murbacherstrasse 31]
3     SGV_12N_07773  [Valorisierung: Museum Burgrain]
4     SGV_12N_07833                           [Suone]


## Count Co-Occurences

In [58]:
# Initialize a defaultdict to store the count of co-occurrences
co_occurrence = defaultdict(int)
keywords = set()  # A set to store unique keywords

# Loop through each list in the data
for keywords_list in df["schema:about"]:
    # Loop through each keyword in the current list
    for i in range(len(keywords_list)):
        # Add each keyword to the keywords set
        keywords.add(keywords_list[i])
        # Compare the current keyword with every other keyword in the same list
        for j in range(i + 1, len(keywords_list)):
            # Create a sorted tuple (pair) so that the order doesn't matter
            pair = tuple(sorted([keywords_list[i], keywords_list[j]]))
            # Increment the count for this co-occurrence pair
            co_occurrence[pair] += 1

print(f"found {len(co_occurrence)} co-occurences")
print(co_occurrence)

# Print the co-occurrence dictionary
for pair, count in co_occurrence.items():
    print(f"Keywords {pair} occurred together {count} times.")

found 16804 co-occurences
defaultdict(<class 'int'>, {('Heuen', 'Mähen'): 7, ('Heuen', 'Sense (Werkzeug)'): 18, ('Feldarbeit', 'Heuen'): 13, ('Heuen', 'Wiese'): 9, ('Heuen', 'Steilhang'): 1, ('Heuen', 'Mann'): 9, ('Gras', 'Heuen'): 1, ('Dorf', 'Heuen'): 1, ('Heuen', 'Landwirtschaft'): 7, ('Heuen', 'Valorisierung: Museum Burgrain'): 3, ('Mähen', 'Sense (Werkzeug)'): 19, ('Feldarbeit', 'Mähen'): 11, ('Mähen', 'Wiese'): 2, ('Mähen', 'Steilhang'): 1, ('Mann', 'Mähen'): 2, ('Gras', 'Mähen'): 2, ('Dorf', 'Mähen'): 1, ('Landwirtschaft', 'Mähen'): 12, ('Mähen', 'Valorisierung: Museum Burgrain'): 3, ('Feldarbeit', 'Sense (Werkzeug)'): 15, ('Sense (Werkzeug)', 'Wiese'): 6, ('Sense (Werkzeug)', 'Steilhang'): 1, ('Mann', 'Sense (Werkzeug)'): 7, ('Gras', 'Sense (Werkzeug)'): 2, ('Dorf', 'Sense (Werkzeug)'): 1, ('Landwirtschaft', 'Sense (Werkzeug)'): 12, ('Sense (Werkzeug)', 'Valorisierung: Museum Burgrain'): 6, ('Feldarbeit', 'Wiese'): 9, ('Feldarbeit', 'Steilhang'): 1, ('Feldarbeit', 'Mann'): 14, 

## Reduce Co-Occurences to a Minimum

This step is done to reduce the amount of nodes and links in the network graph:

- Without reduction a data set of 1578 nodes (keywords) and 16804 links (co-occurences) ins produced. 
- With a threshold of `5` a data set of 835 nodes and 4160 links is produced. 
- With a threshold of `10` a data set of 568 nodes and 2248 links is produced. 


In [65]:
# Set the threshold for minimum co-occurrences
threshold = 5

# Remove co-occurrences below the threshold
co_occurrence_reduced = {pair: count for pair, count in co_occurrence.items() if count >= threshold}
print(len(co_occurrence_reduced))

# Collect only the keywords that are still in the co-occurrence pairs after filtering
keywords_in_links = set()
for pair in co_occurrence_reduced.keys():
    keywords_in_links.update(pair)

4160


In [66]:

# Prepare nodes and links for JSON
# 'nodes' is a list of unique keywords, each represented by a dictionary with an 'id' field
nodes = [{'id': keyword} for keyword in keywords_in_links]

# 'links' is a list of dictionaries that represents the connections (co-occurrences)
# Each link contains a source, target, and value (co-occurrence count)
links = [{'source': k[0], 'target': k[1], 'value': v} for k, v in co_occurrence_reduced.items()]

# Create the final JSON structure with nodes and links
graph_data = {
    'nodes': nodes,
    'links': links
}

# Output the number of nodes (unique keywords)
print("Node count:", len(graph_data["nodes"]))
print("Links count:", len(graph_data["links"]))

# Convert the graph data to a JSON string with pretty formatting
graph_json = json.dumps(graph_data, indent=4)

# Print the JSON string (or save to a file, if needed)
print(graph_json)

Node count: 835
Links count: 4160
{
    "nodes": [
        {
            "id": "Karte (Kartografie)"
        },
        {
            "id": "Garbe (Getreide)"
        },
        {
            "id": "Viehmarkt"
        },
        {
            "id": "Seedamm von Rapperswil"
        },
        {
            "id": "1. Mai"
        },
        {
            "id": "Fluss"
        },
        {
            "id": "Holzschuh"
        },
        {
            "id": "Kupferkessel"
        },
        {
            "id": "Chalet"
        },
        {
            "id": "Chorgest\u00fchl"
        },
        {
            "id": "Nasenschild"
        },
        {
            "id": "Rechen"
        },
        {
            "id": "Silberschmied/-in"
        },
        {
            "id": "Weinrebe"
        },
        {
            "id": "Viehschau"
        },
        {
            "id": "Butter"
        },
        {
            "id": "Talboden"
        },
        {
            "id": "Christbaum"
        }

In [67]:
# Export JSON to a file
with open('./export/sgv-12-keywords_graph_data_threshold5.json', 'w') as json_file:
    json_file.write(graph_json)