# Convert JSON to Turtle

In [29]:
import os

import json
from urllib.parse import unquote_plus,urlparse
from langdetect import detect, LangDetectException
import re
from urllib.parse import urlparse, urlunparse, quote

def clean_integer(value):
    """Return the digits contained in ``value`` as a string.

    ``value`` is stringified and trimmed, then every character that is not a
    regex digit (commas, letters, signs, dots, whitespace, ...) is discarded.
    When no digit is left, ``"0"`` is returned so callers never receive an
    empty string.
    """
    digits = "".join(re.findall(r"\d", str(value).strip()))
    return digits or "0"


def _quote_preserving_escapes(text, safe):
    """Percent-encode ``text`` without re-encoding existing ``%XX`` escapes."""
    # A '%' that does not introduce a valid two-digit hex escape is data and
    # must itself become %25; valid escapes are then protected by keeping '%'
    # in the safe set so "a%20b" is not turned into "a%2520b".
    text = re.sub(r"%(?![0-9A-Fa-f]{2})", "%25", text)
    return quote(text, safe=safe + "%")


def clean_url(url):
    """Return a sanitised version of ``url`` suitable for use as an IRI.

    Only the first whitespace-separated token is kept (several URLs are
    sometimes concatenated in the input). Characters outside the URI
    character set are removed from the scheme, network location and params;
    path, query and fragment are percent-encoded. Existing percent escapes
    are left alone, so an already encoded URL is returned unchanged.

    Args:
        url: The raw URL. Anything that is not a non-blank string yields "".

    Returns:
        The cleaned URL, or an empty string when there is nothing to clean.
    """
    if not isinstance(url, str):
        return ""

    tokens = url.split()
    if not tokens:
        return ""

    parsed_url = urlparse(tokens[0])

    # Matches every character that is NOT allowed to appear in a URI.
    disallowed = r"[^A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]"

    return urlunparse((
        re.sub(disallowed, "", parsed_url.scheme),
        re.sub(disallowed, "", parsed_url.netloc),
        _quote_preserving_escapes(parsed_url.path.strip(), safe="/:"),
        re.sub(disallowed, "", parsed_url.params),
        _quote_preserving_escapes(parsed_url.query, safe="&="),
        _quote_preserving_escapes(parsed_url.fragment, safe=""),
    ))



# Turtle prefix declarations prepended to every generated file.
# The adms prefix must be bound to the ADMS namespace IRI
# (http://www.w3.org/ns/adms#), not to the URL of the specification document;
# otherwise adms:status expands to an IRI that no ADMS-based shape refers to.
prefixes = """
@prefix : <http://example.org/> . 
@prefix dct: <http://purl.org/dc/terms/> . 
@prefix void: <http://rdfs.org/ns/void#> . 
@prefix dcat: <http://www.w3.org/ns/dcat#> . 
@prefix foaf: <http://xmlns.com/foaf/0.1/> . 
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix schema: <http://schema.org/> .
@prefix adms: <http://www.w3.org/ns/adms#> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix vcard: <http://www.w3.org/2006/vcard/ns#> .  # vCard for contact details
"""
ttl_data = ""

def fix_email(email):
    """Normalise an e-mail address taken from the source metadata.

    Quotation marks and spaces are removed. If the address contains no "@",
    a single obfuscated separator -- the word "at" surrounded by whitespace
    or wrapped in brackets, e.g. "user at host" or "user [at] host" -- is
    turned into "@". Addresses that already contain "@" are never rewritten,
    so letters "at" inside names or domains stay intact.

    Args:
        email: The raw address; falsy values are returned unchanged.

    Returns:
        The normalised address (or the original falsy value).
    """
    if not email:
        return email

    # Remove any quotation marks (either single or double)
    email = email.replace("\"", "").replace("'", "")

    if "@" not in email:
        # De-obfuscate before spaces are stripped: once the spaces are gone,
        # a standalone "at" can no longer be told apart from the same letters
        # inside a word.
        email = re.sub(
            r"\s*[\[\(\{<]\s*at\s*[\]\)\}>]\s*|\s+at\s+",
            "@",
            email,
            count=1,
            flags=re.IGNORECASE,
        )

    # Remove spaces
    return email.replace(" ", "")

def fix_name(name):
    """Strip quotes and " - " separators from ``name`` and join words with "_".

    Falsy input (``None``, ``""``) is handed back untouched.
    """
    if not name:
        return name

    # Order matters: " - " has to be removed before the remaining spaces are
    # turned into underscores.
    for old, new in (("\"", ""), ("'", ""), (" - ", ""), (" ", "_")):
        name = name.replace(old, new)
    return name


# How the sub-keys of a nested entry are rendered. Each value is
# (kind, predicate): "url" -> IRI after validation/cleaning, "name" -> short
# literal passed through fix_name, "long" -> triple-quoted literal,
# "short" -> short literal.
_ITEM_FIELDS_BASE = {
    "access_url": ("url", "dcat:accessURL"),
    "title": ("name", "dct:title"),
    "description": ("long", "dct:description"),
    "media_type": ("name", "dcat:mediaType"),
    "status": ("long", "adms:status"),
}
_ITEM_FIELDS_WITH_ID = {
    **_ITEM_FIELDS_BASE,
    "mirror": ("short", "dct:mirror"),
    "_id": ("short", "dct:identifier"),
}
# Dataset keys whose value is a list of entries, each written as a blank node:
# key -> (predicate linking the dataset to the blank node, sub-key table).
_NESTED_FIELDS = {
    "full_download": (
        "dcat:distribution",
        {**_ITEM_FIELDS_WITH_ID, "download_url": ("url", "dcat:downloadURL")},
    ),
    "other_download": ("dcat:distribution", _ITEM_FIELDS_WITH_ID),
    "sparql": (
        "void:sparqlEndpoint",
        {**_ITEM_FIELDS_WITH_ID, "description": ("long", "dcat:endpointDescription")},
    ),
    "example": ("void:exampleResource", _ITEM_FIELDS_BASE),
}


def _turtle_escape(value, multiline=False):
    """Return ``value`` as text that is safe inside a Turtle string literal."""
    # Backslashes first, otherwise the backslashes added for quotes would be
    # escaped a second time.
    text = str(value).replace("\\", "\\\\").replace('"', '\\"')
    if not multiline:
        # Raw line breaks are only legal inside triple-quoted literals.
        text = text.replace("\n", "\\n").replace("\r", "\\r")
    return text


def _http_url(value):
    """Return the cleaned http(s) URL for ``value``, or None to skip it."""
    if not isinstance(value, str):
        return None
    candidate = value.strip()
    if not candidate.startswith(("http://", "https://")):
        return None
    return clean_url(candidate)


def _blank_node(predicate, item, fields):
    """Render one nested entry as ``predicate [ ... ] ;``."""
    parts = [f"    {predicate} [\n"]
    if isinstance(item, dict):
        for subkey, subvalue in item.items():
            kind, sub_predicate = fields.get(subkey, (None, None))
            if kind == "url":
                url = _http_url(subvalue)
                if url is not None:  # empty or non-http values are skipped
                    parts.append(f"        {sub_predicate} <{url}> ;\n")
            elif kind == "name":
                text = _turtle_escape(fix_name(subvalue))
                parts.append(f'        {sub_predicate} "{text}" ;\n')
            elif kind == "long":
                text = _turtle_escape(subvalue, multiline=True)
                parts.append(f'        {sub_predicate} """\n{text}\n""" ;\n')
            elif kind == "short":
                text = _turtle_escape(subvalue)
                parts.append(f'        {sub_predicate} "{text}" ;\n')
    parts.append("    ] ;\n")
    return "".join(parts)


def _agent_description(agent_id, value):
    """Render the ``prov:Agent`` block for an owner/contact dict, or ""."""
    lines = []

    names = value.get("name")
    if names:
        if isinstance(names, str):  # a single name is treated like a list of one
            names = [names]
        for name in names:
            lines.append(
                '     foaf:name "{}"^^xsd:string ;\n'.format(_turtle_escape(fix_name(name)))
            )

    email = fix_email(value.get("email", ""))
    if email:
        lines.append("     foaf:mbox <mailto:{}> ; \n".format(email))

    if not lines:
        # Neither a name nor an e-mail: do not emit an empty agent block.
        return ""
    return f"\n:{agent_id} a prov:Agent ;\n" + "".join(lines) + " .\n\n"


def json_to_ttl(json_data, dataset_id, sanitized_id):
    """Serialise one dataset record as Turtle (without prefix declarations).

    The output consists of the ``dcat:Dataset`` description, followed by one
    ``void:Linkset`` per entry of the record's "links" list, followed by the
    owner and contact agent descriptions when they carry a name or e-mail.
    Literal values are escaped so that backslashes, double quotes and line
    breaks in the source metadata cannot produce unparseable Turtle.

    Args:
        json_data: Mapping of dataset id to dataset record.
        dataset_id: Key of the record to convert.
        sanitized_id: Local name used for the dataset resource (``:<id>``).

    Returns:
        The Turtle text for the dataset.

    Raises:
        KeyError: If ``dataset_id`` is not present in ``json_data``.
    """
    dataset = json_data[dataset_id]
    parts = [f":{sanitized_id} a dcat:Dataset ;\n"]
    owner_details = ""
    contact_details = ""

    for key, value in dataset.items():
        # Empty values are skipped; links are written after the dataset block.
        if not value or key == "links":
            continue

        if key == "title":
            parts.append(f'    dct:title "{_turtle_escape(fix_name(value))}" ;\n')
        elif key == "description":
            if isinstance(value, dict):
                for lang, desc in value.items():
                    text = _turtle_escape(desc, multiline=True)
                    # Language tag is taken from the dict key.
                    parts.append(f'    dct:description """\n{text}\n"""@{lang} ;\n\n')
        elif key in _NESTED_FIELDS:
            if isinstance(value, list):
                predicate, fields = _NESTED_FIELDS[key]
                parts.extend(_blank_node(predicate, item, fields) for item in value)
        elif key == "contact_point":
            if isinstance(value, dict):
                parts.append(
                    "    prov:qualifiedAttribution [\n"
                    "        prov:agent :contactAgent ;\n"
                    "        dcat:hadRole :contact_point ;\n"
                    "    ] ;\n"
                )
                # Agent details are appended after the dataset block.
                contact_details = _agent_description("contactAgent", value)
        elif key == "owner":
            if isinstance(value, dict):
                parts.append(
                    "    prov:qualifiedAttribution [\n"
                    "        prov:agent  :OwnerAgent ;\n"
                    "        dcat:hadRole :owner ;\n"
                    "    ] ;\n"
                )
                owner_details = _agent_description("OwnerAgent", value)
        elif key == "website":
            parts.append(f"    foaf:page <{value}> ;\n")
        elif key == "triples":
            parts.append(f'    void:triples "{clean_integer(value)}"^^xsd:integer ;\n')
        elif key == "license":
            parts.append(f"    dct:license <{value}> ;\n")
        elif key == "namespace":
            parts.append(f"    void:uriSpace <{fix_name(value)}> ;\n")
        elif key == "doi":
            parts.append(f'    dct:identifier "{_turtle_escape(value)}" ;\n')
        elif key == "domain":
            parts.append(f'    dcat:theme "{_turtle_escape(fix_name(value))}" ;\n')
        elif key == "keywords":
            if isinstance(value, list):
                keywords = ", ".join(f'"{_turtle_escape(fix_name(kw))}"' for kw in value)
                parts.append("    dcat:keyword " + keywords + " ;\n")
        elif key == "image":
            parts.append(f"    foaf:depiction <{value}> ;\n")

    # Replace the trailing " ;" of the last predicate with the closing " ."
    ttl_data = "".join(parts).rstrip(" ;\n") + " .\n\n"

    # Links are written as separate void:Linkset resources after the dataset.
    link_parts = []
    links = dataset.get("links") if "links" in dataset else None
    if isinstance(links, list):
        link_parts.append("@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . \n")
        for item in links:
            if isinstance(item, dict) and "target" in item:
                target = fix_name(item["target"])
                link_parts.append(f":{target} a void:Linkset ;\n")
                link_parts.append(f"    void:target :{sanitized_id} ;\n")
                if "value" in item:
                    link_parts.append(
                        f'    void:triples "{clean_integer(item["value"])}"^^xsd:integer ;\n'
                    )
                link_parts.append(" .\n\n")

    return ttl_data + "".join(link_parts) + owner_details + contact_details


def sanitize_filename(dataset_id):
    """Return ``dataset_id`` with every character outside ASCII letters,
    digits, "_" and "-" replaced by an underscore, so it can serve as a
    file name."""
    unsafe_chars = re.compile(r'[^a-zA-Z0-9_-]')
    return unsafe_chars.sub('_', dataset_id)

# Read the JSON file.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which breaks on non-ASCII metadata and can
# produce Turtle files that are not UTF-8.
input_file = "lod.json"
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)  # Load the JSON data

# Create output directory
output_dir = "ttl_files"
os.makedirs(output_dir, exist_ok=True)

# Convert each dataset to TTL and save it as <sanitized id>.ttl
for dataset_id in data:
    sanitized_id = sanitize_filename(dataset_id)
    ttl_content = prefixes + "\n" + json_to_ttl(data, dataset_id, sanitized_id)
    ttl_output_path = os.path.join(output_dir, f"{sanitized_id}.ttl")
    with open(ttl_output_path, "w", encoding="utf-8") as file:
        file.write(ttl_content)
        
def is_valid_email(email):
    """Return True when ``email`` is a string of the form local@domain.tld.

    This is a simple format check, not full address validation. The whole
    string must match: ``re.fullmatch`` is used because ``re.match`` with a
    trailing ``$`` would also accept an address followed by a newline.
    """
    if not isinstance(email, str):
        return False  # If the email is not a string, it's invalid
    pattern = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"
    return re.fullmatch(pattern, email) is not None


# Report the directory the generated Turtle files were written to.
print(f"TTL files saved in the folder: {output_dir}")


TTL files saved in the folder: ttl_files


# Run SHACL validation against the Turtle files

In [30]:
import pyshacl
import rdflib
import os

# Path to the SHACL file
shacl_file_path = "LOD_v4_mary-dct_shacl-Copy1.ttl"  # Your SHACL file here

# Directory containing the TTL files
ttl_dir_path = "ttl_files2"  # Your TTL files directory

# Output report file
output_txt = "output_report_test_shacl.txt"

# SHACL terms used when reading the validation report
SH_VALIDATION_RESULT = rdflib.URIRef("http://www.w3.org/ns/shacl#ValidationResult")
SH_FOCUS_NODE = rdflib.URIRef("http://www.w3.org/ns/shacl#focusNode")
SH_RESULT_MESSAGE = rdflib.URIRef("http://www.w3.org/ns/shacl#resultMessage")
SH_RESULT_PATH = rdflib.URIRef("http://www.w3.org/ns/shacl#resultPath")

valid_count = 0
invalid_count = 0

# The shapes are identical for every data file, so they are parsed once
# instead of once per TTL file. A broken shapes file therefore stops the run
# immediately rather than being reported again for every data file.
shapes_graph = rdflib.Graph()
shapes_graph.parse(shacl_file_path, format="turtle")

with open(output_txt, "w", encoding="utf-8") as result_file:
    # sorted() makes the report order reproducible; os.listdir() order is
    # arbitrary.
    for ttl_file in sorted(os.listdir(ttl_dir_path)):
        if not ttl_file.endswith(".ttl"):
            continue
        ttl_file_path = os.path.join(ttl_dir_path, ttl_file)

        try:
            # Load the TTL file into a graph
            data_graph = rdflib.Graph()
            data_graph.parse(ttl_file_path, format="turtle")

            # Validate using pyshacl
            results = pyshacl.validate(
                data_graph,
                shacl_graph=shapes_graph,
                data_graph_format="ttl",
                shacl_graph_format="ttl",
                inference="rdfs",
                debug=False,
                serialize_report_graph="ttl",
            )

            conforms, report_ttl, report_text = results

            if conforms:
                valid_count += 1
                result_file.write(f"{ttl_file} - VALID\n")
            else:
                invalid_count += 1
                result_file.write(f"{ttl_file} - INVALID\n")
                # The report was requested as serialized Turtle; parse it
                # back into a graph to list the individual violations.
                report_graph = rdflib.Graph().parse(data=report_ttl, format="turtle")
                violation_count = 0
                for s in report_graph.subjects(predicate=rdflib.RDF.type, object=SH_VALIDATION_RESULT):
                    violation_count += 1
                    focus_node = report_graph.value(subject=s, predicate=SH_FOCUS_NODE)
                    result_message = report_graph.value(subject=s, predicate=SH_RESULT_MESSAGE)
                    result_path = report_graph.value(subject=s, predicate=SH_RESULT_PATH)
                    result_file.write(f"  Violation {violation_count}:\n")
                    result_file.write(f"    Focus Node: {focus_node}\n")
                    result_file.write(f"    Message: {result_message}\n")
                    if result_path:
                        result_file.write(f"    Result Path: {result_path}\n")

                result_file.write(f"  Total Violations: {violation_count}\n")

        except Exception as e:
            # Files that cannot be parsed or validated are logged and counted
            # together with the non-conforming ones.
            result_file.write(f"{ttl_file} - ERROR: {e}\n")
            invalid_count += 1

# Print the final validation summary
print(f"Validation complete. {valid_count} TTL files were validated successfully.")
print(f"{invalid_count} TTL files could not be validated.")

Validation complete. 0 TTL files were validated successfully.
1 TTL files could not be validated.


# Show the violation report for 17 elements of the LOD

In [23]:
import statistics

# Prefixed name -> full IRI of every vocabulary term tracked in the report
vocabularies = {
    'dct:title': 'http://purl.org/dc/terms/title',
    'dct:description': 'http://purl.org/dc/terms/description',
    'foaf:page': 'http://xmlns.com/foaf/0.1/page',
    'void:triples': 'http://rdfs.org/ns/void#triples',
    'dct:license': 'http://purl.org/dc/terms/license',
    'void:uriSpace': 'http://rdfs.org/ns/void#uriSpace',
    'dct:identifier': 'http://purl.org/dc/terms/identifier',
    'foaf:depiction': 'http://xmlns.com/foaf/0.1/depiction',
    'dcat:keyword': 'http://www.w3.org/ns/dcat#keyword',
    'dcat:theme': 'http://www.w3.org/ns/dcat#theme',
    'void:sparqlEndpoint': 'http://rdfs.org/ns/void#sparqlEndpoint',
    'dcat:mediaType': 'http://www.w3.org/ns/dcat#mediaType',
    'dcat:downloadURL': 'http://www.w3.org/ns/dcat#downloadURL',
    'adms:status': 'http://www.w3.org/ns/adms#status',
    'dcat:accessURL': 'http://www.w3.org/ns/dcat#accessURL',
    'dcat:endpointURL': 'http://www.w3.org/ns/dcat#endpointURL',
    'dcat:endpointDescription': 'http://www.w3.org/ns/dcat#endpointDescription',
    'dcat:distribution': 'http://www.w3.org/ns/dcat#distribution',
    'void:exampleResource': 'http://rdfs.org/ns/void#exampleResource',
    'dct:mirror': 'http://purl.org/dc/terms/mirror',
    'prov:qualifiedAttribution': 'http://www.w3.org/ns/prov#qualifiedAttribution',
    'prov:agent': 'http://www.w3.org/ns/prov#agent',
    'foaf:name': 'http://xmlns.com/foaf/0.1/name',
    'foaf:mbox': 'http://xmlns.com/foaf/0.1/mbox',
    'void:Linkset': 'http://rdfs.org/ns/void#Linkset',
    'void:target': 'http://rdfs.org/ns/void#target',
}

# Validation report to analyse
file_path = 'output_report_new7.txt'

# One error counter per vocabulary term, all starting at zero
vocabulary_errors = dict.fromkeys(vocabularies, 0)

# Scan the report: every "Result Path:" line names the property that failed
with open(file_path, 'r') as file:
    for line in file:
        if 'Result Path:' not in line:
            continue
        # Everything after the first colon is the reported path
        result_path = line.partition(':')[2].strip()
        for vocab, url in vocabularies.items():
            if url in result_path:
                vocabulary_errors[vocab] += 1

# Summary statistics over the per-term counters
error_counts = list(vocabulary_errors.values())
total_errors = sum(error_counts)
average_errors = total_errors / len(vocabularies) if vocabularies else 0
max_errors = max(error_counts)
min_errors = min(error_counts)
median_errors = statistics.median(error_counts)

# Output results
print("Vocabulary Errors Summary:")
for vocab, errors in vocabulary_errors.items():
    print(f"{vocab} ({vocabularies[vocab]}): {errors} errors")

print("\nStatistics:")
print(f"Total Errors: {total_errors}")
print(f"Average Errors per Vocabulary: {average_errors:.2f}")
print(f"Maximum Errors in a Vocabulary: {max_errors}")
print(f"Minimum Errors in a Vocabulary: {min_errors}")
print(f"Median Errors: {median_errors}")


Vocabulary Errors Summary:
dct:title (http://purl.org/dc/terms/title): 8 errors
dct:description (http://purl.org/dc/terms/description): 14 errors
foaf:page (http://xmlns.com/foaf/0.1/page): 318 errors
void:triples (http://rdfs.org/ns/void#triples): 0 errors
dct:license (http://purl.org/dc/terms/license): 738 errors
void:uriSpace (http://rdfs.org/ns/void#uriSpace): 0 errors
dct:identifier (http://purl.org/dc/terms/identifier): 0 errors
foaf:depiction (http://xmlns.com/foaf/0.1/depiction): 0 errors
dcat:keyword (http://www.w3.org/ns/dcat#keyword): 80 errors
dcat:theme (http://www.w3.org/ns/dcat#theme): 0 errors
void:sparqlEndpoint (http://rdfs.org/ns/void#sparqlEndpoint): 0 errors
dcat:mediaType (http://www.w3.org/ns/dcat#mediaType): 1 errors
dcat:downloadURL (http://www.w3.org/ns/dcat#downloadURL): 3489 errors
adms:status (http://www.w3.org/ns/adms#status): 0 errors
dcat:accessURL (http://www.w3.org/ns/dcat#accessURL): 522 errors
dcat:endpointURL (http://www.w3.org/ns/dcat#endpointURL):

# Show the total number of key-value pairs in the LOD JSON file

In [97]:
import json
from collections import defaultdict

# JSON key -> human-readable LOD field label
json_key_mapping = {
    "title": "Title",
    "description": "Description",
    "full_download": "Full Download",
    "other_download": "Other Download",
    "sparql": "SPARQL",
    "example": "Example",
    "keywords": "Keywords",
    "domain": "Domain",
    "owner": "Owner",
    "website": "Website",
    "triples": "Triples",
    "license": "License",
    "namespace": "Namespace",
    "doi": "DOI",
    "image": "Image URL",
    "links": "Links",
    "contact_point": "Contact point",
}

# LOD field labels in reporting order (identical to the mapping's order)
lod_fields = list(json_key_mapping.values())

# Load the dataset records
file_path = "lod.json"
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Counters filled by the counting loop
field_counts = defaultdict(int)
value_counts = defaultdict(int)
subelement_counts = defaultdict(lambda: defaultdict(int))

# Tell whether a dictionary carries at least one value that is not empty
def has_non_empty_value(d):
    """Return True if ``d`` is a dict with a value other than None, "", [] or {}."""
    if not isinstance(d, dict):
        return False
    for v in d.values():
        if v not in [None, "", [], {}]:
            return True
    return False

# Values treated as "empty" throughout the counting below
_EMPTY_VALUES = [None, "", [], {}]


def _count_subelements(lod_field, mapping):
    """Add one to the sub-element counter of every non-empty entry of ``mapping``."""
    for subkey, subvalue in mapping.items():
        if subvalue not in _EMPTY_VALUES:
            subelement_counts[lod_field][subkey] += 1


# Count occurrences and values
for dataset in data.values():
    for key, lod_field in json_key_mapping.items():
        if key not in dataset:
            continue
        value = dataset[key]

        # A field occurrence is counted only when the value is not empty
        if value not in _EMPTY_VALUES:
            field_counts[lod_field] += 1

        if isinstance(value, list):
            # Every list entry counts as a value; dict entries also feed the
            # sub-element counters
            value_counts[lod_field] += len(value)
            for item in value:
                if isinstance(item, dict):
                    _count_subelements(lod_field, item)
        elif isinstance(value, dict):
            # A dictionary counts as one value, and only if something in it
            # is non-empty
            if has_non_empty_value(value):
                value_counts[lod_field] += 1
                _count_subelements(lod_field, value)
        elif isinstance(value, (str, int, float)) and value != "":
            # Scalars (strings, numbers) count once unless they are empty
            value_counts[lod_field] += 1

# Print the results
print("\n--- Field Occurrences ---")
for field in lod_fields:
    print(f"{field}: {field_counts.get(field, 0)} occurrences")

print("\n--- Value Counts ---")
for field in lod_fields:
    print(f"{field}: {value_counts.get(field, 0)} total values")

print("\n--- Subelement Counts ---")
for field, subfields in subelement_counts.items():
    print(f"{field}:")
    for subkey, count in subfields.items():
        print(f"  {subkey}: {count} occurrences")



--- Field Occurrences ---
Title: 1573 occurrences
Description: 1573 occurrences
Full Download: 282 occurrences
Other Download: 1090 occurrences
SPARQL: 657 occurrences
Example: 828 occurrences
Keywords: 1493 occurrences
Domain: 1244 occurrences
Owner: 298 occurrences
Website: 1255 occurrences
Triples: 1568 occurrences
License: 835 occurrences
Namespace: 705 occurrences
DOI: 17 occurrences
Image URL: 67 occurrences
Links: 1295 occurrences
Contact point: 1573 occurrences

--- Value Counts ---
Title: 1573 total values
Description: 1456 total values
Full Download: 512 total values
Other Download: 3544 total values
SPARQL: 668 total values
Example: 1173 total values
Keywords: 15745 total values
Domain: 1244 total values
Owner: 298 total values
Website: 1255 total values
Triples: 1568 total values
License: 835 total values
Namespace: 705 total values
DOI: 17 total values
Image URL: 67 total values
Links: 17882 total values
Contact point: 1326 total values

--- Subelement Counts ---
Descript

In [14]:
import json
from collections import defaultdict

# JSON key -> human-readable LOD field label
json_key_mapping = {
    "title": "Title",
    "description": "Description",
    "full_download": "Full Download",
    "other_download": "Other Download",
    "sparql": "SPARQL",
    "example": "Example",
    "keywords": "Keywords",
    "domain": "Domain",
    "owner": "Owner",
    "website": "Website",
    "triples": "Triples",
    "license": "License",
    "namespace": "Namespace",
    "doi": "DOI",
    "image": "Image URL",
    "links": "Links",
    "contact_point": "Contact point",
}

# LOD field labels in reporting order (identical to the mapping's order)
lod_fields = list(json_key_mapping.values())

# Load the dataset records
file_path = "lod.json"
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Counters filled by the counting loop:
#   field_counts      - number of datasets in which the field is non-empty
#   value_counts      - number of values found inside the field
#   subelement_counts - per field, how often each nested key is non-empty
field_counts = defaultdict(int)
value_counts = defaultdict(int)
subelement_counts = defaultdict(lambda: defaultdict(int))

# Tell whether a dictionary carries at least one value that is not empty
def has_non_empty_value(d):
    """Return True if ``d`` is a dict with a value other than None, "", [] or {}."""
    if not isinstance(d, dict):
        return False
    non_empty = [v for v in d.values() if v not in [None, "", [], {}]]
    return len(non_empty) > 0

# Values treated as "empty" throughout the counting below
_EMPTY_VALUES = [None, "", [], {}]


def _count_subelements(lod_field, mapping):
    """Add one to the sub-element counter of every non-empty entry of ``mapping``."""
    for subkey, subvalue in mapping.items():
        if subvalue not in _EMPTY_VALUES:
            subelement_counts[lod_field][subkey] += 1


# Count occurrences and values
for dataset in data.values():
    for key, lod_field in json_key_mapping.items():
        if key not in dataset:
            continue
        value = dataset[key]

        # A dataset contributes to field_counts only when the field is non-empty
        if value not in _EMPTY_VALUES:
            field_counts[lod_field] += 1

        if isinstance(value, list):
            # Only the non-empty list entries are counted as values
            filled_items = [item for item in value if item not in _EMPTY_VALUES]
            value_counts[lod_field] += len(filled_items)
            for item in filled_items:
                if isinstance(item, dict):
                    _count_subelements(lod_field, item)
        elif isinstance(value, dict):
            if has_non_empty_value(value):
                # Each non-empty sub-field counts as one value
                filled_keys = [k for k, v in value.items() if v not in _EMPTY_VALUES]
                value_counts[lod_field] += len(filled_keys) or 1
                _count_subelements(lod_field, value)
        elif isinstance(value, (str, int, float)) and value != "":
            # Scalars (strings, numbers) count once unless they are empty
            value_counts[lod_field] += 1

# Print the results
print("\n--- Field Occurrences (Number of datasets that have this field) ---")
for field in lod_fields:
    print(f"{field}: {field_counts.get(field, 0)} datasets")

print("\n--- Value Counts (Total instances of this field) ---")
for field in lod_fields:
    print(f"{field}: {value_counts.get(field, 0)} values")

print("\n--- Subelement Counts (Details of nested elements) ---")
for field, subfields in subelement_counts.items():
    print(f"{field}:")
    for subkey, count in subfields.items():
        print(f"  {subkey}: {count} occurrences")



--- Field Occurrences (Number of datasets that have this field) ---
Title: 1573 datasets
Description: 1573 datasets
Full Download: 282 datasets
Other Download: 1090 datasets
SPARQL: 657 datasets
Example: 828 datasets
Keywords: 1493 datasets
Domain: 1244 datasets
Owner: 298 datasets
Website: 1255 datasets
Triples: 1568 datasets
License: 835 datasets
Namespace: 705 datasets
DOI: 17 datasets
Image URL: 67 datasets
Links: 1295 datasets
Contact point: 1573 datasets

--- Value Counts (Total instances of this field) ---
Title: 1573 values
Description: 1456 values
Full Download: 512 values
Other Download: 3544 values
SPARQL: 668 values
Example: 1173 values
Keywords: 15743 values
Domain: 1244 values
Owner: 318 values
Website: 1255 values
Triples: 1568 values
License: 835 values
Namespace: 705 values
DOI: 17 values
Image URL: 67 values
Links: 17882 values
Contact point: 2276 values

--- Subelement Counts (Details of nested elements) ---
Description:
  en: 1456 occurrences
Full Download:
  media

In [None]:
Full Download:
  media_type: 511 occurrences
  description: 415 occurrences
  status: 512 occurrences
  title: 449 occurrences
  download_url: 511 occurrences
  mirror: 47 occurrences
  _id: 267 occurrences

In [None]:
ttl_files/wikilinks-rdf-nif.ttl