In [21]:
import json
import os
import re  # Import regex module for sanitizing file names

# Load the initial JSON file
file_path = "lod-data.json"
with open(file_path, "r") as file:
    data = json.load(file)

# Step 1: Replace spaces with hyphens in dataset IDs
data = {key.replace(" ", "-"): value for key, value in data.items()}

# Step 2: Define fields to remove
fields_to_remove = {
    "full_download": ["title", "description", "status", "mirror", "media_type", "_id"],
    "sparql": ["_id", "title", "description", "status"],
    "other_download": ["title", "description", "media_type", "status", "mirror", "_id"],
    "example": ["title", "description", "media_type", "status"],
    "links": ["value"]
}

def remove_fields(obj, fields_to_remove):
    """Recursively remove specified fields from the JSON object."""
    if isinstance(obj, dict):
        return {k: remove_fields(v, fields_to_remove) for k, v in obj.items() if k not in fields_to_remove}
    elif isinstance(obj, list):
        return [remove_fields(item, fields_to_remove) for item in obj]
    else:
        return obj

# Process each main element in the data
for key, element in data.items():
    for field, subfields in fields_to_remove.items():
        if field in element:
            element[field] = remove_fields(element[field], subfields)

# Step 3: Flatten nested fields
def flatten_fields_in_place(element):
    """Flatten nested fields in place for all fields."""
    for key, value in element.items():
        if isinstance(value, list) and all(isinstance(item, dict) for item in value):
            # Flatten a list of dictionaries to a list of values from all keys
            flattened = []
            for item in value:
                if isinstance(item, dict):
                    flattened.extend(item.values())
            element[key] = flattened
        elif isinstance(value, dict):
            # Replace the field with its values if it's a dictionary
            flattened = list(value.values())
            # Use the first value if there's only one, otherwise keep the list
            element[key] = flattened[0] if len(flattened) == 1 else flattened

# Adjust the JSON structure for all fields
for key, element in data.items():
    if isinstance(element, dict):
        flatten_fields_in_place(element)

# Save the intermediate processed JSON
intermediate_json_path = "lod-data-processed.json"
with open(intermediate_json_path, "w") as file:
    json.dump(data, file, indent=4)

# Step 4: Convert JSON to TTL
prefixes = """
@prefix : <http://example.org/> . 
@prefix dct: <http://purl.org/dc/terms/> . 
@prefix void: <http://rdfs.org/ns/void#> . 
@prefix dcat: <http://www.w3.org/ns/dcat#> . 
@prefix foaf: <http://xmlns.com/foaf/0.1/> . 
@prefix schema: <http://schema.org/> . 
"""

def json_to_ttl(json_data, dataset_id):
    ttl_data = f":{dataset_id} a dcat:Dataset ;\n"
    dataset = json_data[dataset_id]
    for key, value in dataset.items():
        if not value:  # Skip keys with empty or None values
            continue
        if key == "title":
            ttl_data += f"    dct:title \"{value}\" ;\n"
        elif key == "description":
            ttl_data += f"    dct:description \"\"\"{value}\"\"\" ;\n"  # Multiline string
        elif key == "full_download" and isinstance(value, list):
            ttl_data += "    dct:distribution " + ", ".join(f'<{kw}>' for kw in value) + " ;\n"
        elif key == "sparql" and isinstance(value, list):
            ttl_data += "    void:sparqlEndpoint " + ", ".join(f'<{kw}>' for kw in value) + " ;\n"
        elif key == "other_download" and isinstance(value, list):
            ttl_data += "    dct:distribution " + ", ".join(f'<{kw}>' for kw in value) + " ;\n"
        elif key == "example" and isinstance(value, list):
            ttl_data += "    void:exampleResource " + ", ".join(f'"{item}"' for item in value) + " ;\n"
        elif key == "keywords" and isinstance(value, list):
            ttl_data += "    dcat:keyword " + ", ".join(f'"{kw}"' for kw in value) + " ;\n"
        elif key == "owner" and isinstance(value, list):
            ttl_data += "    dct:creator " + ", ".join(f'"{item}"' for item in value) + " ;\n"
        elif key == "website":
            ttl_data += f"    foaf:page <{value}> ;\n"
        elif key == "triples":
            ttl_data += f"    void:triples \"{value}\" ;\n"
        elif key == "license":
            ttl_data += f"    dct:license <{value}> ;\n"
        elif key == "namespace":
            ttl_data += f"    void:uriSpace <{value}> ;\n"
        elif key == "doi":
            ttl_data += f"    dct:identifier \"{value}\" ;\n"
        elif key == "contact_point" and isinstance(value, list):
            ttl_data += "    dcat:contactPoint " + ", ".join(f'"{item}"' for item in value) + " ;\n"
        elif key == "domain":
            ttl_data += f"    dcat:keyword \"{value}\" ;\n"
        elif key == "image":
            ttl_data += f"    foaf:depiction <{value}> ;\n"
        elif key == "links" and isinstance(value, list):
            ttl_data += "    void:target " + ", ".join(f'"{item}"' for item in value) + " ;\n"
    # Remove trailing semicolon for the last predicate
    ttl_data = ttl_data.rstrip(" ;\n") + " .\n\n"
    return ttl_data

# Save each dataset in a separate TTL file
output_dir = "ttl_files"  # Define the folder to save TTL files
os.makedirs(output_dir, exist_ok=True)  # Create the folder if it doesn't exist

def sanitize_filename(dataset_id):
    # Replace any non-alphanumeric characters (including ':', '/', etc.) with underscores
    return re.sub(r'[^a-zA-Z0-9_-]', '_', dataset_id)

for dataset_id in data:
    sanitized_id = sanitize_filename(dataset_id)  # Sanitize dataset ID to a valid filename
    ttl_content = prefixes + "\n" + json_to_ttl(data, dataset_id)
    ttl_output_path = os.path.join(output_dir, f"{sanitized_id}.ttl")  # Path for the TTL file
    with open(ttl_output_path, "w") as file:
        file.write(ttl_content)

print(f"TTL files saved in the folder: {output_dir}")


TTL files saved in the folder: ttl_files


In [27]:
import json
import os
import re
import rdflib
import pyshacl

# Load the initial JSON file
file_path = "lod-data.json"
with open(file_path, "r") as file:
    data = json.load(file)

# Step 1: Replace spaces with hyphens in dataset IDs
data = {key.replace(" ", "-"): value for key, value in data.items()}

# Step 2: Define fields to remove
fields_to_remove = {
    "full_download": ["title", "description", "status", "mirror", "media_type", "_id"],
    "sparql": ["_id", "title", "description", "status"],
    "other_download": ["title", "description", "media_type", "status", "mirror", "_id"],
    "example": ["title", "description", "media_type", "status"],
    "links": ["value"]
}

def remove_fields(obj, fields_to_remove):
    """Recursively remove specified fields from the JSON object."""
    if isinstance(obj, dict):
        return {k: remove_fields(v, fields_to_remove) for k, v in obj.items() if k not in fields_to_remove}
    elif isinstance(obj, list):
        return [remove_fields(item, fields_to_remove) for item in obj]
    else:
        return obj

# Process each main element in the data
for key, element in data.items():
    for field, subfields in fields_to_remove.items():
        if field in element:
            element[field] = remove_fields(element[field], subfields)

# Step 3: Flatten nested fields
def flatten_fields_in_place(element):
    """Flatten nested fields in place for all fields."""
    for key, value in element.items():
        if isinstance(value, list) and all(isinstance(item, dict) for item in value):
            # Flatten a list of dictionaries to a list of values from all keys
            flattened = []
            for item in value:
                if isinstance(item, dict):
                    flattened.extend(item.values())
            element[key] = flattened
        elif isinstance(value, dict):
            # Replace the field with its values if it's a dictionary
            flattened = list(value.values())
            # Use the first value if there's only one, otherwise keep the list
            element[key] = flattened[0] if len(flattened) == 1 else flattened

# Adjust the JSON structure for all fields
for key, element in data.items():
    if isinstance(element, dict):
        flatten_fields_in_place(element)

# Save the intermediate processed JSON
intermediate_json_path = "lod-data-processed.json"
with open(intermediate_json_path, "w") as file:
    json.dump(data, file, indent=4)

# Step 4: Convert JSON to TTL
prefixes = """
@prefix : <http://example.org/> . 
@prefix dct: <http://purl.org/dc/terms/> . 
@prefix void: <http://rdfs.org/ns/void#> . 
@prefix dcat: <http://www.w3.org/ns/dcat#> . 
@prefix foaf: <http://xmlns.com/foaf/0.1/> . 
@prefix schema: <http://schema.org/> . 
"""

def json_to_ttl(json_data, dataset_id):
    ttl_data = f":{dataset_id} a dcat:Dataset ;\n"
    dataset = json_data[dataset_id]
    for key, value in dataset.items():
        if not value:  # Skip keys with empty or None values
            continue
        if key == "title":
            ttl_data += f"    dct:title \"{value}\" ;\n"
        elif key == "description":
            ttl_data += f"    dct:description \"\"\"{value}\"\"\" ;\n"  # Multiline string
        elif key == "full_download" and isinstance(value, list):
            ttl_data += "    dct:distribution " + ", ".join(f'"{kw}"' for kw in value) + " ;\n"
        elif key == "sparql" and isinstance(value, list):
            ttl_data += "    void:sparqlEndpoint " + ", ".join(f'"{kw}"' for kw in value) + " ;\n"
        elif key == "other_download" and isinstance(value, list):
            ttl_data += "    dct:distribution " + ", ".join(f'"{kw}"' for kw in value) + " ;\n"
        elif key == "example" and isinstance(value, list):
            ttl_data += "    void:exampleResource " + ", ".join(f'"{item}"' for item in value) + " ;\n"
        elif key == "keywords" and isinstance(value, list):
            ttl_data += "    dcat:keyword " + ", ".join(f'"{kw}"' for kw in value) + " ;\n"
        elif key == "owner" and isinstance(value, list):
            ttl_data += "    dct:creator " + ", ".join(f'"{item}"' for item in value) + " ;\n"
        elif key == "website":
            ttl_data += f"    foaf:page \"{value}\" ;\n"
        elif key == "triples":
            ttl_data += f"    void:triples \"{value}\" ;\n"
        elif key == "license":
            ttl_data += f"    dct:license \"{value}\" ;\n"
        elif key == "namespace":
            ttl_data += f"    void:uriSpace \"{value}\" ;\n"
        elif key == "doi":
            ttl_data += f"    dct:identifier \"{value}\" ;\n"
        elif key == "contact_point" and isinstance(value, list):
            ttl_data += "    dcat:contactPoint " + ", ".join(f'"{item}"' for item in value) + " ;\n"
        elif key == "domain":
            ttl_data += f"    dcat:keyword \"{value}\" ;\n"
        elif key == "image":
            ttl_data += f"    foaf:depiction \"{value}\" ;\n"
        elif key == "links" and isinstance(value, list):
            ttl_data += "    void:target " + ", ".join(f'"{item}"' for item in value) + " ;\n"
    # Remove trailing semicolon for the last predicate
    ttl_data = ttl_data.rstrip(" ;\n") + " .\n\n"
    return ttl_data

# Save each dataset in a separate TTL file
output_dir = "ttl_files"  # Define the folder to save TTL files
os.makedirs(output_dir, exist_ok=True)  # Create the folder if it doesn't exist

def sanitize_filename(dataset_id):
    # Replace any non-alphanumeric characters (including ':', '/', etc.) with underscores
    return re.sub(r'[^a-zA-Z0-9_-]', '_', dataset_id)

for dataset_id in data:
    sanitized_id = sanitize_filename(dataset_id)  # Sanitize dataset ID to a valid filename
    ttl_content = prefixes + "\n" + json_to_ttl(data, dataset_id)
    ttl_output_path = os.path.join(output_dir, f"{sanitized_id}.ttl")  # Path for the TTL file
    with open(ttl_output_path, "w") as file:
        file.write(ttl_content)

# Step 5: Validate each TTL file using SHACL
output_txt = "output_report.txt"
shacl_file_path = "mary-dct_shacl.ttl"  # Specify your SHACL file here

valid_count = 0
invalid_count = 0

with open(output_txt, "w") as result_file:
    for ttl_file in os.listdir(output_dir):
        if ttl_file.endswith(".ttl"):
            ttl_file_path = os.path.join(output_dir, ttl_file)
            
            try:
                data_graph = rdflib.Graph()
                data_graph.parse(ttl_file_path, format="turtle")

                # Create a SHACL graph
                shapes_graph = rdflib.Graph()
                shapes_graph.parse(shacl_file_path, format="turtle")

                results = pyshacl.validate(
                    data_graph,
                    shacl_graph=shapes_graph,
                    data_graph_format="ttl",
                    shacl_graph_format="ttl",
                    inference="rdfs",
                    debug=False,
                    serialize_report_graph="ttl",
                )

                conforms, report_graph, report_text = results

                if conforms:
                    valid_count += 1
                    result_file.write(f"{ttl_file} - VALID\n")
                else:
                    invalid_count += 1
                    result_file.write(f"{ttl_file} - INVALID\n")
                    # You can add more detailed violation info if needed
                    report_graph = rdflib.Graph().parse(data=report_graph, format="turtle")
                    for s in report_graph.subjects(predicate=rdflib.RDF.type, object=rdflib.URIRef("http://www.w3.org/ns/shacl#ValidationResult")):
                        violation_message = report_graph.value(subject=s, predicate=rdflib.URIRef("http://www.w3.org/ns/shacl#resultMessage"))
                        result_file.write(f"  Violation: {violation_message}\n")
                
            except Exception as e:
                result_file.write(f"{ttl_file} - ERROR: {e}\n")
                invalid_count += 1

# Print the final validation summary
print(f"Validation complete. {valid_count} TTL files were validated successfully.")
print(f"{invalid_count} TTL files could not be validated.")


Validation complete. 0 TTL files were validated successfully.
1573 TTL files could not be validated.


In [28]:
import pyshacl
import rdflib
import os

# Path to the SHACL file
shacl_file_path = "mary-dct_shacl.ttl"  # Your SHACL file here

# Directory containing the TTL files
ttl_dir_path = "ttl_files"  # Your TTL files directory

# Output report file
output_txt = "output_report.txt"

valid_count = 0
invalid_count = 0

with open(output_txt, "w") as result_file:
    for ttl_file in os.listdir(ttl_dir_path):
        if ttl_file.endswith(".ttl"):
            ttl_file_path = os.path.join(ttl_dir_path, ttl_file)

            try:
                # Load the TTL file into a graph
                data_graph = rdflib.Graph()
                data_graph.parse(ttl_file_path, format="turtle")

                # Create a SHACL graph
                shapes_graph = rdflib.Graph()
                shapes_graph.parse(shacl_file_path, format="turtle")

                # Validate using pyshacl
                results = pyshacl.validate(
                    data_graph,
                    shacl_graph=shapes_graph,
                    data_graph_format="ttl",
                    shacl_graph_format="ttl",
                    inference="rdfs",
                    debug=False,
                    serialize_report_graph="ttl",
                )

                conforms, report_graph, report_text = results

                if conforms:
                    valid_count += 1
                    result_file.write(f"{ttl_file} - VALID\n")
                else:
                    invalid_count += 1
                    result_file.write(f"{ttl_file} - INVALID\n")
                    # Parse the report graph for violations
                    report_graph = rdflib.Graph().parse(data=report_graph, format="turtle")
                    violation_count = 0
                    for s in report_graph.subjects(predicate=rdflib.RDF.type, object=rdflib.URIRef("http://www.w3.org/ns/shacl#ValidationResult")):
                        violation_count += 1
                        focus_node = report_graph.value(subject=s, predicate=rdflib.URIRef("http://www.w3.org/ns/shacl#focusNode"))
                        result_message = report_graph.value(subject=s, predicate=rdflib.URIRef("http://www.w3.org/ns/shacl#resultMessage"))
                        result_path = report_graph.value(subject=s, predicate=rdflib.URIRef("http://www.w3.org/ns/shacl#resultPath"))
                        result_file.write(f"  Violation {violation_count}:\n")
                        result_file.write(f"    Focus Node: {focus_node}\n")
                        result_file.write(f"    Message: {result_message}\n")
                        if result_path:
                            result_file.write(f"    Result Path: {result_path}\n")

                    result_file.write(f"  Total Violations: {violation_count}\n")
            
            except Exception as e:
                result_file.write(f"{ttl_file} - ERROR: {e}\n")
                invalid_count += 1

# Print the final validation summary
print(f"Validation complete. {valid_count} TTL files were validated successfully.")
print(f"{invalid_count} TTL files could not be validated.")


Validation complete. 0 TTL files were validated successfully.
1573 TTL files could not be validated.


In [29]:
# Open and analyze the provided file to count the datasets and summarize the content
file_path = 'output_report.txt'

# Initialize counters and structures for analysis
total_datasets = 0
violation_summary = []

# Reading the file content
with open(file_path, 'r') as file:
    for line in file:
        # Identify dataset lines and increment the counter
        if line.strip().endswith('- INVALID'):
            total_datasets += 1
        # Identify total violations for each dataset
        if line.strip().startswith('Total Violations:'):
            violations = int(line.strip().split(':')[-1].strip())
            violation_summary.append(violations)

# Compile results
summary = {
    'Total Datasets': total_datasets,
    'Total Violations': sum(violation_summary),
    'Average Violations per Dataset': sum(violation_summary) / total_datasets if total_datasets > 0 else 0,
    'Max Violations in a Single Dataset': max(violation_summary) if violation_summary else 0,
    'Min Violations in a Single Dataset': min(violation_summary) if violation_summary else 0
}

summary


{'Total Datasets': 1536,
 'Total Violations': 16227,
 'Average Violations per Dataset': 10.564453125,
 'Max Violations in a Single Dataset': 25,
 'Min Violations in a Single Dataset': 8}