# This notebook is used to manipulate the dataset - extract labels and similar lookups

In [1]:
# Append the directory two levels above the current working directory to sys.path
# so that the `app.*` packages imported in the next cell can be found.
# NOTE(review): relies on the kernel's working directory being this notebook's folder - confirm.
import os
import sys
sys.path.append(os.path.join(os.getcwd(), './../../'))

# Load IPython's autoreload extension in mode 2 (re-import modules before each cell run),
# which makes it easier to test edited classes without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
from app.services.sparql_graph import SPARQLGraph
from app.config.enums import Environment
# Build the project's graph wrapper for the PROD environment. In the recorded run below
# this loaded ~2.06M triples and took a little over a minute.
# NOTE(review): the meaning of the second positional argument (False) is not visible in
# this notebook - check SPARQLGraph.__init__ before changing it.
graph = SPARQLGraph(Environment.PROD, False)

Metadata loaded successfully from JSON files.
Initializing SPARQLGraph
Graph loaded with 2056777 triples after 0:01:17.033118


In [3]:
# Shorthand for the graph object held by the wrapper (`graph.graph`); the cells below
# iterate and query it directly. Presumably an rdflib Graph - TODO confirm.
sparql_graph = graph.graph

In [4]:
import numpy as np
import csv
import rdflib

from collections import defaultdict
from rdflib.namespace import Namespace, RDF, RDFS, XSD
WD = Namespace('http://www.wikidata.org/entity/')
WDT = Namespace('http://www.wikidata.org/prop/direct/')

## Create label dictionaries (this section does not need to be executed for the second part to work)

In [5]:
# Load the URI <-> integer-id dictionaries that ship with the graph embeddings.
def _load_id_maps(path):
    """Read a two-column, tab-separated id file and build both lookup directions.

    Each row is unpacked as ``(index, uri)``: the first column is converted with
    ``int`` and the second is wrapped in ``rdflib.term.URIRef``.

    Parameters:
        path: location of the ``.del`` file to read.

    Returns:
        A tuple ``(uri2id, id2uri)`` where ``uri2id`` maps URIRef -> int and
        ``id2uri`` is its inverse.

    Raises:
        ValueError: if a row does not have exactly two columns or the first
            column is not an integer.
    """
    # The csv module asks for files to be opened with newline='' so that the
    # reader, not the text layer, interprets line endings.
    with open(path, 'r', newline='') as ifile:
        uri2id = {rdflib.term.URIRef(uri): int(idx) for idx, uri in csv.reader(ifile, delimiter='\t')}
    id2uri = {idx: uri for uri, idx in uri2id.items()}
    return uri2id, id2uri


ent2id, id2ent = _load_id_maps('./../too_large_dataset/ddis-graph-embeddings/entity_ids.del')
rel2id, id2rel = _load_id_maps('./../too_large_dataset/ddis-graph-embeddings/relation_ids.del')

In [6]:
# Entity <-> label lookups built from every rdfs:label triple in the graph.
# When an entity carries several labels only the last one iterated is kept, and
# when several entities share a label the reverse lookup keeps just one of them.
ent2lbl = dict((ent, str(lbl)) for ent, lbl in sparql_graph.subject_objects(RDFS.label))
lbl2ent = dict(zip(ent2lbl.values(), ent2lbl.keys()))

# Every distinct predicate used anywhere in the graph (full scan over all triples).
predicates = {pred for _subj, pred, _obj in sparql_graph}

# Predicate -> label; a predicate without a (truthy) rdfs:label is labelled with its own URI.
rel2lbl = {}
for pred in predicates:
    label = sparql_graph.value(subject=pred, predicate=RDFS.label)
    rel2lbl[pred] = str(label or pred)

# Label -> predicate (inverse of the mapping above).
lbl2rel = {rel_label: rel for rel, rel_label in rel2lbl.items()}

In [7]:
# Persist the four label lookups as JSON files; URI keys/values are turned into plain strings first.
import json


def _dump_json(path, payload):
    """Write ``payload`` to ``path`` as JSON, replacing any existing file."""
    with open(path, 'w') as ofile:
        json.dump(payload, ofile)


_dump_json('./../useful_dataset/graph/ent2lbl.json', {str(ent): lbl for ent, lbl in ent2lbl.items()})
_dump_json('./../useful_dataset/graph/lbl2ent.json', {lbl: str(ent) for lbl, ent in lbl2ent.items()})
_dump_json('./../useful_dataset/graph/rel2lbl.json', {str(rel): lbl for rel, lbl in rel2lbl.items()})
_dump_json('./../useful_dataset/graph/lbl2rel.json', {lbl: str(rel) for lbl, rel in lbl2rel.items()})

# Extract Only Movie Labels and Movie-based relations

In [7]:
from collections import defaultdict
from rdflib import RDFS, Literal

# Accumulators filled by the extraction loop further down in this cell
unique_movies = set()          # distinct film labels
unique_relationships = set()   # distinct labels of predicates that occur on film entities
movie2ids = {}                 # film label -> list of entity URI strings carrying that label

# Resolve an English label to the URI of an entity carrying it
def get_movie_id_by_label(movie_label):
    """Return the URI string of the first subject whose rdfs:label is ``movie_label``.

    The label is matched as an English-tagged literal. The lookup is not limited
    to films: any subject with that label can be returned.

    Returns:
        The first matching subject converted to ``str``, or ``None`` when no
        subject has that label.
    """
    english_label = Literal(movie_label, lang="en")
    candidates = iter(graph.graph.subjects(predicate=RDFS.label, object=english_label))
    first_match = next(candidates, None)
    return None if first_match is None else str(first_match)

# `graph` is the SPARQLGraph wrapper created at the top of the notebook.
# WD['Q11424'] is the entity used as the film class in this graph.
film_class = WD['Q11424']

# Step 1: collect every direct or indirect subclass of the film class.
# wdt:P279 is followed transitively by the `+` property path.
# Plain string on purpose: nothing is interpolated, so no f-string brace escaping is needed.
subclass_query = """
    prefix wdt: <http://www.wikidata.org/prop/direct/>
    prefix wd: <http://www.wikidata.org/entity/>
    SELECT ?subclass WHERE {
        ?subclass wdt:P279+ wd:Q11424 .  # P279 is typically the property for 'subclass of'
    }
"""
results = graph.graph.query(subclass_query)
subclasses = {result['subclass'] for result in results}

# The film class itself plus all of its subclasses: an entity counts as a movie
# when it is an instance (P31) of any of them.
film_classes = subclasses | {film_class}

# Step 2: walk all `instance of` (P31) triples and keep the film instances.
for s, o in graph.graph.subject_objects(WDT.P31):
    if o not in film_classes:
        continue

    film_uri = str(s)
    film_label = graph.get_lbl_for_ent(film_uri)
    # "Unknown Label" is treated as the wrapper's marker for "no label found";
    # such entities are skipped entirely.
    if film_label == "Unknown Label":
        continue

    unique_movies.add(film_label)

    # Several entities can share one label, so keep a de-duplicated list of URIs per label.
    ids = movie2ids.setdefault(film_label, [])
    if film_uri not in ids:
        ids.append(film_uri)

    # Record the label of every predicate used on this film. The object is not
    # needed, and it gets its own name so the outer loop variable `o` is not rebound.
    for p, _obj in graph.graph.predicate_objects(s):
        rel_label = graph.get_lbl_for_rel(str(p))
        if rel_label != "Unknown Label":
            unique_relationships.add(rel_label)

# Convert the sets to lists for final output
list_of_movies = list(unique_movies)
list_of_relationships = list(unique_relationships)

In [8]:
# Sanity check: resolve the label "Weathering with You" to an entity URI
get_movie_id_by_label("Weathering with You")

'http://www.wikidata.org/entity/Q59692464'

In [10]:
# Split the label -> URIs mapping into
#   movie2id        : label -> URI, for labels carried by a single entity
#   movie2ids_final : label -> {URI: English description}, for labels shared by several entities
# and report which label is shared by the most entities.
max_ids = 0
max_label = ""
movie2id = {}
movie2ids_final = {}
for label, ids in movie2ids.items():
    if len(ids) > 1:
        movie2ids_final[label] = {}
        for movie_uri in ids:
            # Query by the full IRI rather than `wd:` + last path segment, so an entity
            # whose URI lies outside the wd: namespace is still looked up correctly.
            query = f"""PREFIX schema: <http://schema.org/>
                        SELECT ?description
                        WHERE {{
                            OPTIONAL {{
                                <{movie_uri}> schema:description ?description .
                                FILTER(LANG(?description) = "en")
                            }}
                        }}"""
            result = sparql_graph.query(query)
            # One line per result row, columns tab-separated; no rows gives an empty string.
            description = '\n'.join('\t'.join(str(item) for item in row) for row in result)
            movie2ids_final[label][movie_uri] = description
        # Track the most ambiguous label for the summary printed below.
        if len(ids) > max_ids:
            max_ids = len(ids)
            max_label = label
    else:
        movie2id[label] = ids[0]
print(f"{max_label}: {max_ids}")

Hamlet: 6


In [11]:
movie2ids_final

{'Bully': {'http://www.wikidata.org/entity/Q1004657': '2001 film by Larry Clark',
  'http://www.wikidata.org/entity/Q4997083': '2011 documentary film directed by Lee Hirsch'},
 'The Big Hit': {'http://www.wikidata.org/entity/Q100573244': '2020 film by Emmanuel Courcol',
  'http://www.wikidata.org/entity/Q1703148': '1998 film by Kirk Wong'},
 'Cash': {'http://www.wikidata.org/entity/Q1024660': '2008 film by Éric Besnard',
  'http://www.wikidata.org/entity/Q1024664': '2010 crime thriller film directed by Stephen Milburn Anderson'},
 'Caged': {'http://www.wikidata.org/entity/Q1025721': '2010 film by Yann Gozlan',
  'http://www.wikidata.org/entity/Q2541458': '2011 film by Stephan Brenninkmeijer',
  'http://www.wikidata.org/entity/Q1451738': '1950 film by John Cromwell'},
 'Elena': {'http://www.wikidata.org/entity/Q10271690': '2012 film directed by Petra Costa',
  'http://www.wikidata.org/entity/Q678643': '2011 film by Andrey Zvyagintsev'},
 'The Cave': {'http://www.wikidata.org/entity/Q102

In [12]:
movie2id

{'Jan Dara': 'http://www.wikidata.org/entity/Q1000825',
 'Moondram Pirai': 'http://www.wikidata.org/entity/Q1001777',
 "Buffalo Bill and the Indians, or Sitting Bull's History Lesson": 'http://www.wikidata.org/entity/Q1001943',
 'What We Wanted': 'http://www.wikidata.org/entity/Q100232971',
 'Wanted: Dead or Alive': 'http://www.wikidata.org/entity/Q1002480',
 'Linger': 'http://www.wikidata.org/entity/Q1003106',
 'Eastern Condors': 'http://www.wikidata.org/entity/Q1003128',
 'Amerika': 'http://www.wikidata.org/entity/Q100323695',
 'Bukowski: Born into This': 'http://www.wikidata.org/entity/Q1003411',
 'Fatal Move': 'http://www.wikidata.org/entity/Q1003782',
 'On the Mountain of Tai Hang': 'http://www.wikidata.org/entity/Q10041645',
 'Forever Enthralled': 'http://www.wikidata.org/entity/Q1004392',
 'Kill the Irishman': 'http://www.wikidata.org/entity/Q1004440',
 'Bullseye!': 'http://www.wikidata.org/entity/Q1004567',
 'Dry Wind': 'http://www.wikidata.org/entity/Q100506077',
 'Angels Fall

In [13]:
import json

# Sets are not JSON-serialisable, so materialise them as lists first
list_of_movies = list(unique_movies)
list_of_relationships = list(unique_relationships)

# Output locations, one JSON file per structure
movies_path = './../useful_dataset/graph/unique_movies.json'
relationships_path = './../useful_dataset/graph/unique_relationships.json'
movie2id_path = './../useful_dataset/graph/movie2id.json'
movie2ids_path = './../useful_dataset/graph/movie2ids.json'

# Write each structure to its file, in this order:
# unique movie labels, unique relationship labels, label -> URI, label -> {URI: description}
for target_path, payload in (
    (movies_path, list_of_movies),
    (relationships_path, list_of_relationships),
    (movie2id_path, movie2id),
    (movie2ids_path, movie2ids_final),
):
    with open(target_path, 'w') as ofile:
        json.dump(payload, ofile)

# Extract Persons

In [14]:
from collections import defaultdict

# Accumulators filled by the extraction loop further down in this cell
unique_persons = set()   # distinct person labels
person2ids = {}          # person label -> list of entity URI strings carrying that label

# Resolve an English label to the URI of an entity carrying it
def get_person_id_by_label(person_label):
    """Return the URI string of the first subject whose rdfs:label is ``person_label``.

    The label is matched as an English-tagged literal. The lookup is not limited
    to persons: any subject with that label can be returned.

    Returns:
        The first matching subject converted to ``str``, or ``None`` when no
        subject has that label.
    """
    wanted = Literal(person_label, lang="en")
    for subject in graph.graph.subjects(predicate=RDFS.label, object=wanted):
        # Only the first hit is of interest.
        return str(subject)
    return None

# `graph` is the SPARQLGraph wrapper created at the top of the notebook.
# WD['Q5'] is the entity used as the person class in this graph.
person_class = WD['Q5']

# Walk all `instance of` (P31) triples and keep those whose object is the person class.
for s, o in graph.graph.subject_objects(WDT.P31):
    if o != person_class:
        continue

    person_uri = str(s)
    person_label = graph.get_lbl_for_ent(person_uri)
    # Entities for which the wrapper reports "Unknown Label" are skipped.
    if person_label == "Unknown Label":
        continue

    unique_persons.add(person_label)

    # Several persons can share a name, so keep a de-duplicated list of URIs per label.
    ids = person2ids.setdefault(person_label, [])
    if person_uri not in ids:
        ids.append(person_uri)

# Convert the set to a list for final output
list_of_persons = list(unique_persons)

In [15]:
# Split the label -> URIs mapping into
#   person2id        : label -> URI, for names carried by a single entity
#   person2ids_final : label -> {URI: English description}, for names shared by several entities
# and report which name is shared by the most entities.
max_ids = 0
max_label = ""
person2id = {}
person2ids_final = {}
for label, ids in person2ids.items():
    if len(ids) > 1:
        person2ids_final[label] = {}
        for person_uri in ids:
            # Query by the full IRI rather than `wd:` + last path segment, so an entity
            # whose URI lies outside the wd: namespace is still looked up correctly.
            query = f"""PREFIX schema: <http://schema.org/>
                        SELECT ?description
                        WHERE {{
                            OPTIONAL {{
                                <{person_uri}> schema:description ?description .
                                FILTER(LANG(?description) = "en")
                            }}
                        }}"""
            result = sparql_graph.query(query)
            # One line per result row, columns tab-separated; no rows gives an empty string.
            description = '\n'.join('\t'.join(str(item) for item in row) for row in result)
            if not description:
                # Report the gap and store a placeholder so every URI still gets an entry.
                print(f"MISSING DESCRIPTION: {label}: {person_uri}")
                description = "Unfortunately, no description is available"
            person2ids_final[label][person_uri] = description
        # Track the most ambiguous name for the summary printed below.
        if len(ids) > max_ids:
            max_ids = len(ids)
            max_label = label
    else:
        person2id[label] = ids[0]
print(f"{max_label}: {max_ids}")

MISSING DESCRIPTION: Viktor Petrov: http://www.wikidata.org/entity/Q18640608
MISSING DESCRIPTION: David Schneider: http://www.wikidata.org/entity/Q101081610
MISSING DESCRIPTION: Michael Johnson: http://www.wikidata.org/entity/Q101209967
MISSING DESCRIPTION: Pavel Marek: http://www.wikidata.org/entity/Q106118623
MISSING DESCRIPTION: Pavel Marek: http://www.wikidata.org/entity/Q95451452
MISSING DESCRIPTION: Frank Kerr: http://www.wikidata.org/entity/Q106375399
MISSING DESCRIPTION: Frank Kerr: http://www.wikidata.org/entity/Q64489500
MISSING DESCRIPTION: Anjana: http://www.wikidata.org/entity/Q106376333
MISSING DESCRIPTION: Bob Goodman: http://www.wikidata.org/entity/Q106411340
MISSING DESCRIPTION: Éric Barbeau: http://www.wikidata.org/entity/Q50011199
MISSING DESCRIPTION: Pierre Deschamps: http://www.wikidata.org/entity/Q106712717
MISSING DESCRIPTION: Anna Kratochvílová: http://www.wikidata.org/entity/Q107349451
MISSING DESCRIPTION: Christian Becker: http://www.wikidata.org/entity/Q55140

In [16]:
person2ids_final

{'Glenn Miller': {'http://www.wikidata.org/entity/Q103651': 'American big band musician, arranger, composer and bandleader (1904-1944)',
  'http://www.wikidata.org/entity/Q66817768': 'American actor, film director'},
 'Gerald Green': {'http://www.wikidata.org/entity/Q1374400': 'American author, journalist, and television writer',
  'http://www.wikidata.org/entity/Q722629': 'American basketball player'},
 'Carlos Fuentes': {'http://www.wikidata.org/entity/Q154691': 'Mexican writer',
  'http://www.wikidata.org/entity/Q5750388': 'Spanish actor'},
 'Richard Shepherd': {'http://www.wikidata.org/entity/Q15962220': 'producer',
  'http://www.wikidata.org/entity/Q392533': 'British politician (born 1942)'},
 'John McCarthy': {'http://www.wikidata.org/entity/Q16194192': 'Canadian composer for film and television',
  'http://www.wikidata.org/entity/Q92739': 'American computer scientist and cognitive scientist',
  'http://www.wikidata.org/entity/Q21213333': 'actor (20th century)'},
 'James Weaver':

In [17]:
person2id

{'Viktor Krištof': 'http://www.wikidata.org/entity/Q100423423',
 'Yuji Nomi': 'http://www.wikidata.org/entity/Q1012658',
 'Béatrice Thiriet': 'http://www.wikidata.org/entity/Q1019375',
 'Oleg Kapanets': 'http://www.wikidata.org/entity/Q102290694',
 'Ram Lee': 'http://www.wikidata.org/entity/Q102443065',
 'Gabriel Migliori': 'http://www.wikidata.org/entity/Q10287874',
 'Alexander Geringas': 'http://www.wikidata.org/entity/Q102963',
 'José Miguel Wisnik': 'http://www.wikidata.org/entity/Q10310143',
 'Marco Antônio Guimarães': 'http://www.wikidata.org/entity/Q10325285',
 'Mauro Senise': 'http://www.wikidata.org/entity/Q10327605',
 'Oscar Francisco Nascimento': 'http://www.wikidata.org/entity/Q10342176',
 'Pierre Volto': 'http://www.wikidata.org/entity/Q103899296',
 'Carlo Siliotto': 'http://www.wikidata.org/entity/Q1042512',
 'Braguinha': 'http://www.wikidata.org/entity/Q1042647',
 'Veronika Schwarczová': 'http://www.wikidata.org/entity/Q104581329',
 'Jaroslav Bouček': 'http://www.wikidat

In [27]:
import json

# Sets are not JSON-serialisable, so materialise the labels as a list first
list_of_persons = list(unique_persons)

# Output locations, one JSON file per structure
persons_path = './../useful_dataset/graph/unique_persons.json'
person2id_path = './../useful_dataset/graph/person2id.json'
person2ids_path = './../useful_dataset/graph/person2ids.json'

# Write each structure to its file, in this order:
# unique person labels, label -> URI, label -> {URI: description}
person_outputs = {
    persons_path: list_of_persons,
    person2id_path: person2id,
    person2ids_path: person2ids_final,
}
for target_path, payload in person_outputs.items():
    with open(target_path, 'w') as ofile:
        json.dump(payload, ofile)


# Random dataset testing

In [28]:
from rdflib.term import Literal

# Entity to inspect (in the recorded output below its label is "The Godfather");
# swap in another WD[...] identifier to explore a different node.
entity_id = WD['Q47703']

# (relationship label, object label or literal value) for every outgoing edge of the entity
relationship_and_entity_pairs = []

for p, o in sparql_graph.predicate_objects(entity_id):
    rel_label = graph.get_lbl_for_rel(str(p))

    # Literals (dates, numbers, text) are shown as-is; anything else is treated as an
    # entity and resolved to its label.
    entity_or_value_label = str(o) if isinstance(o, Literal) else graph.get_lbl_for_ent(str(o))

    # Drop the pair when either side could not be resolved to a label.
    if rel_label == "Unknown Label" or entity_or_value_label == "Unknown Label":
        continue
    relationship_and_entity_pairs.append((rel_label, entity_or_value_label))

# Print a header followed by one "relationship: value" line per pair
print(f"Relationships and Entities/Values for entity {entity_id}:")
for rel_label, entity_or_value_label in relationship_and_entity_pairs:
    print(f"{rel_label}: {entity_or_value_label}")

Relationships and Entities/Values for entity http://www.wikidata.org/entity/Q47703:
node label: The Godfather
http://ddis.ch/atai/tag: action
http://ddis.ch/atai/tag: atmospheric
http://ddis.ch/atai/tag: boring
http://ddis.ch/atai/tag: cult
http://ddis.ch/atai/tag: dramatic
http://ddis.ch/atai/tag: entertaining
http://ddis.ch/atai/tag: murder
http://ddis.ch/atai/tag: revenge
http://ddis.ch/atai/tag: violence
node description: 1972 American film directed by Francis Ford Coppola
film editor: Peter Zinner
film editor: William H. Reynolds
movement: New Hollywood
genre: drama
genre: film based on a novel
genre: gangster film
genre: crime film
nominated for: Academy Award for Best Picture
nominated for: Academy Award for Best Director
nominated for: Academy Award for Best Actor
nominated for: Academy Award for Best Supporting Actor
nominated for: Academy Award for Best Writing, Adapted Screenplay
nominated for: Academy Award for Best Original Dramatic Score
nominated for: Academy Award for B