In [2]:
import json
import rdflib

# Namespaces used to mint the URIs written to the generated graph.
# EXAMPLE_NS is not referenced by the code visible in this notebook.
EXAMPLE_NS = rdflib.Namespace("http://example.org/")
# Entity namespace: generate_ttl builds subject, object and type URIs from it.
WIKIDATA_NS = rdflib.Namespace("http://www.wikidata.org/entity/")
# Predicate namespace: generate_ttl builds predicate URIs (including P31) from it.
PREDICATE_NS = rdflib.Namespace("http://www.wikidata.org/prop/direct/")

def load_json(file_path):
    """Load a JSON file and return its decoded content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without it, open() falls back to the
    # platform's default encoding, which can fail on (or silently mangle)
    # non-ASCII text such as labels.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def parse_spo_file(spo_file_path):
    """Parse a tab-separated subject-predicate-object file.

    Each non-blank line is stripped of surrounding whitespace and split on
    tab characters. A line is accepted only when it yields exactly three
    non-empty fields.

    Args:
        spo_file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A list of ``(subject, predicate, object)`` string tuples, in file
        order.

    Raises:
        OSError: If the file cannot be opened.
    """
    triples = []
    with open(spo_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no data; they are not malformed rows, so skip
                # them without the "invalid line" noise.
                continue

            parts = stripped.split('\t')
            # all(parts) rejects rows such as "Q1\t\tQ2" whose middle field
            # is empty; they would otherwise become a triple with an empty
            # identifier.
            if len(parts) == 3 and all(parts):
                triples.append(tuple(parts))
            else:
                # Report the problematic line for debugging
                print(f"Skipping invalid line: {stripped}")
    return triples

def _add_label_once(graph, uri, key, metadata, labelled):
    """Add an rdfs:label for ``uri`` from ``metadata[key]['label']``, at most once per URI."""
    if uri in labelled:
        return
    # Remember the URI even when no label exists, so the lookup is not
    # repeated for every further triple that mentions it.
    labelled.add(uri)
    if key not in metadata:
        return
    label = metadata[key].get('label', '')
    if label:
        graph.add((uri, rdflib.RDFS.label, rdflib.Literal(label)))


def generate_ttl(triples, entity_data, relation_data, type_data, entity2type_data, output_ttl_path):
    """Generate a TTL file based on triples and additional info from JSON files.

    For every input triple, a statement is added whose subject and object
    URIs are built from ``WIKIDATA_NS`` and whose predicate URI is built from
    ``PREDICATE_NS``. An ``rdfs:label`` literal is added for each subject,
    object and predicate that has a non-empty ``'label'`` entry in the
    corresponding metadata mapping. Finally, a ``P31`` statement is added for
    every (entity, type) pair in ``entity2type_data``, including entities
    that do not occur in ``triples``.

    Args:
        triples: Iterable of ``(subject, predicate, object)`` identifier
            strings.
        entity_data: Mapping from entity identifier to a mapping that may
            contain a ``'label'`` key.
        relation_data: Mapping from predicate identifier to a mapping that
            may contain a ``'label'`` key.
        type_data: Not read by this function; kept so existing callers that
            pass it keep working.
        entity2type_data: Mapping from entity identifier to an iterable of
            type identifiers.
        output_ttl_path: Destination path of the serialized Turtle file.

    Returns:
        None. The graph is written to ``output_ttl_path``.
    """
    g = rdflib.Graph()

    # URIs whose label lookup has already been performed. The graph is a set
    # of triples, so re-adding a label would not change the result; skipping
    # the repeat only avoids redundant work.
    labelled = set()

    for subj, pred, obj in triples:
        subject_uri = WIKIDATA_NS[subj]
        predicate_uri = PREDICATE_NS[pred]
        object_uri = WIKIDATA_NS[obj]

        g.add((subject_uri, predicate_uri, object_uri))

        # Labels for both entities come from entity_data, the predicate's
        # label from relation_data.
        _add_label_once(g, subject_uri, subj, entity_data, labelled)
        _add_label_once(g, object_uri, obj, entity_data, labelled)
        _add_label_once(g, predicate_uri, pred, relation_data, labelled)

    # Add type relations using P31 from entity2type data
    instance_of = PREDICATE_NS['P31']  # loop-invariant, build it once
    for entity, types in entity2type_data.items():
        entity_uri = WIKIDATA_NS[entity]
        for entity_type in types:
            g.add((entity_uri, instance_of, WIKIDATA_NS[entity_type]))

    # Serialize the graph to a Turtle file
    g.serialize(destination=output_ttl_path, format='turtle')

        
if __name__ == "__main__":
    import os

    # Root directories. They can be overridden through environment variables
    # so the notebook also runs on machines with a different checkout
    # location; the defaults reproduce the original hard-coded paths exactly.
    codex_data_dir = os.environ.get(
        "CODEX_DATA_DIR", "/Users/ozgeerten/Documents/GitHub/codex/data"
    )
    codex_output_dir = os.environ.get(
        "CODEX_OUTPUT_DIR", "/Users/ozgeerten/Documents/GitHub/rudik-docker/Codex"
    )

    # Input file paths
    entity_json_path = f"{codex_data_dir}/entities/en/entities.json"
    relation_json_path = f"{codex_data_dir}/relations/en/relations.json"
    type_json_path = f"{codex_data_dir}/types/en/types.json"
    entity2type_json_path = f"{codex_data_dir}/types/entity2types.json"
    spo_file_path = f"{codex_data_dir}/triples/codex-m/graph.txt"

    # Output file path (kept as a plain string; the next cell parses it back)
    output_ttl_path = f"{codex_output_dir}/codexM-output.ttl"

    # Load JSON data
    entity_data = load_json(entity_json_path)
    relation_data = load_json(relation_json_path)
    type_data = load_json(type_json_path)
    entity2type_data = load_json(entity2type_json_path)

    # Parse the SPO file
    triples = parse_spo_file(spo_file_path)

    # Generate TTL file
    generate_ttl(triples, entity_data, relation_data, type_data, entity2type_data, output_ttl_path)

    print(f"Turtle file generated: {output_ttl_path}")


Skipping invalid line: 
Skipping invalid line: 
Turtle file generated: /Users/ozgeerten/Documents/GitHub/rudik-docker/Codex/codexM-output.ttl


In [4]:
# Read the generated Turtle file back into a fresh in-memory graph
g = rdflib.Graph()
g.parse(output_ttl_path, format="turtle")

# SPARQL: for every P136 statement, follow its object (?o1) to that object's
# P31 value (?o), then keep the ten most frequent ?o values with their counts
sparql_query = """
SELECT ?o (COUNT(?o) AS ?c)
WHERE {
    ?s1 <http://www.wikidata.org/prop/direct/P136> ?o1.
    ?o1 <http://www.wikidata.org/prop/direct/P31> ?o .
}
GROUP BY ?o
ORDER BY DESC(?c)
LIMIT 10

"""

# Run the query against the loaded graph
query_result = g.query(sparql_query)

# One output line per result row. Note that the value printed after
# "Subject:" is the ?o binding (the P31 value), not the ?s1 subject.
for row in query_result:
    print(f"Subject: {row['o']}, Count: {row['c']}")


Subject: http://www.wikidata.org/entity/Q188451, Count: 5597
Subject: http://www.wikidata.org/entity/Q201658, Count: 4346
Subject: http://www.wikidata.org/entity/Q11424, Count: 2507
Subject: http://www.wikidata.org/entity/Q373342, Count: 2189
Subject: http://www.wikidata.org/entity/Q223393, Count: 1041
Subject: http://www.wikidata.org/entity/Q11399, Count: 1041
Subject: http://www.wikidata.org/entity/Q4263830, Count: 843
Subject: http://www.wikidata.org/entity/Q21010853, Count: 801
Subject: http://www.wikidata.org/entity/Q37073, Count: 755
Subject: http://www.wikidata.org/entity/Q483394, Count: 750
