# Embeddings for the DDIS Movie Graph

In [5]:
%pip install numpy pandas scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
    --------------------------------------- 0.3/11.0 MB ? eta -:--:--
    --------------------------------------- 0.3/11.0 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.0 MB 599.9 kB/s eta

## Setup

In [1]:
# imports
import csv
import numpy as np
import os
import rdflib
import pandas as pd
from sklearn.metrics import pairwise_distances

In [2]:
# define some prefixes
WD = rdflib.Namespace('http://www.wikidata.org/entity/')
WDT = rdflib.Namespace('http://www.wikidata.org/prop/direct/')
DDIS = rdflib.Namespace('http://ddis.ch/atai/')
RDFS = rdflib.namespace.RDFS
SCHEMA = rdflib.Namespace('http://schema.org/')

## Load the data

In [3]:
# load the graph
graph = rdflib.Graph().parse('./../too_large_dataset/ddis-movie-graph.nt', format='turtle')

In [6]:
# load the embeddings
entity_emb = np.load('./../too_large_dataset/ddis-graph-embeddings/entity_embeds.npy')
relation_emb = np.load('./../too_large_dataset/ddis-graph-embeddings/relation_embeds.npy')

In [7]:
# load the dictionaries
with open('./../too_large_dataset/ddis-graph-embeddings/entity_ids.del', 'r') as ifile:
    ent2id = {rdflib.term.URIRef(ent): int(idx) for idx, ent in csv.reader(ifile, delimiter='\t')}
    id2ent = {v: k for k, v in ent2id.items()}
with open('./../too_large_dataset/ddis-graph-embeddings/relation_ids.del', 'r') as ifile:
    rel2id = {rdflib.term.URIRef(rel): int(idx) for idx, rel in csv.reader(ifile, delimiter='\t')}
    id2rel = {v: k for k, v in rel2id.items()}

In [8]:
ent2lbl = {ent: str(lbl) for ent, lbl in graph.subject_objects(RDFS.label)}
lbl2ent = {lbl: ent for ent, lbl in ent2lbl.items()}

In [9]:
from rdflib import URIRef
predicates = set()

for s, p, o in graph:  # Iterate over all triples in the graph
    predicates.add(p)  # Collect the unique predicates

# Now map each predicate to its label (if available)
rel2lbl = {}

for pred in predicates:
    # Find the label for the predicate, if it exists
    label = graph.value(subject=pred, predicate=RDFS.label)
    if label:
        rel2lbl[pred] = str(label)
    else:
        # If no label is available, use the URI as the fallback
        rel2lbl[pred] = str(pred)

# Reverse dictionary: labels to relationships
lbl2rel = {lbl: rel for rel, lbl in rel2lbl.items()}

In [10]:
from rdflib import URIRef
from collections import defaultdict
from rdflib.namespace import RDFS

# Initialize a dictionary to store the count of each predicate
predicate_counts = defaultdict(int)

# Initialize a set to collect unique predicates
predicates = set()

# Iterate over all triples in the graph
for s, p, o in graph:
    predicates.add(p)  # Collect the unique predicates
    predicate_counts[p] += 1  # Count how often each predicate occurs

# Now map each predicate to its label (if available)
rel2lbl = {}

for pred in predicates:
    # Find the label for the predicate, if it exists
    label = graph.value(subject=pred, predicate=RDFS.label)
    if label:
        rel2lbl[pred] = str(label)
    else:
        # If no label is available, use the URI as the fallback
        rel2lbl[pred] = str(pred)

# Reverse dictionary: labels to relationships
lbl2rel = {lbl: rel for rel, lbl in rel2lbl.items()}

# Now, let's also map the label to its count (for easy reading)
label_counts = {rel2lbl[pred]: predicate_counts[pred] for pred in predicates}

# Output the label counts for inspection
for label, count in label_counts.items():
    print(f"{label}: {count} occurrences")

# show in sorted table
df = pd.DataFrame(label_counts.items(), columns=['label', 'count']).sort_values('count', ascending=False)


RARS rating: 223 occurrences
director: 28796 occurrences
followed by: 2315 occurrences
voice actor: 12366 occurrences
sport: 3476 occurrences
director of photography: 16801 occurrences
archives at: 49 occurrences
EIRIN film rating: 940 occurrences
contributor to the creative work or subject: 2406 occurrences
religion: 1193 occurrences
ancestral home: 44 occurrences
musical conductor: 34 occurrences
twinned administrative body: 3704 occurrences
interested in: 54 occurrences
based on: 5490 occurrences
Medierådet rating: 5941 occurrences
languages spoken, written or signed: 61107 occurrences
choreographer: 64 occurrences
JMK film rating: 1988 occurrences
occupation: 81617 occurrences
historic county: 162 occurrences
quotes work: 39 occurrences
member of: 741 occurrences
screenwriter: 32150 occurrences
sex or gender: 84 occurrences
original language of film or TV show: 31223 occurrences
distribution format: 21556 occurrences
participant: 348 occurrences
partially coincident with: 46 occurr

## Inspect the data

In [11]:
# number of triples in the graph
len(graph)

2056777

In [12]:
# number of entities in the graph
triples = {(s, p, o) for s,p,o in graph.triples((None, None, None)) if isinstance(o, rdflib.term.URIRef)}
len({s for s,p,o in triples} | {o for s,p,o in triples})

158900

In [13]:
# entity embedding size
entity_emb.shape

(158901, 256)

In [14]:
# relation embedding size
relation_emb.shape

(248, 256)

## Finding errors

In [15]:
# let's see what our graph thinks the occupation of Jean Van Hamme is
professions = set(graph.query('''
    prefix wdt: <http://www.wikidata.org/prop/direct/>
    prefix wd: <http://www.wikidata.org/entity/>
    
    SELECT ?obj ?lbl WHERE {
        ?ent rdfs:label "Jean Van Hamme"@en .
        ?ent wdt:P106 ?obj .
        ?obj rdfs:label ?lbl .
    }
    '''))
{ent[len(WD):]: str(lbl) for ent, lbl in professions}

{'Q329737': 'butcher'}

In [22]:
# "Jean Van Hamme" entity
head = entity_emb[ent2id[WD.Q428160]]

print(head)
# "occupation" relation
pred = relation_emb[rel2id[WDT.P106]]
# add vectors according to TransE scoring function.
lhs = head + pred
# compute distance to *any* entity
dist = pairwise_distances(lhs.reshape(1, -1), entity_emb).reshape(-1)
# find most plausible entities
most_likely = dist.argsort()
# compute ranks of entities
ranks = dist.argsort().argsort()

[-2.15648285e+02 -3.43045105e+02  1.62101669e+01  1.32401404e+03
 -2.56250954e+01  5.74138298e+01  2.46062515e+02 -1.21514114e+02
 -3.26596100e+02 -1.34717758e+02 -1.47574738e+02 -1.48140198e+02
  1.05306747e+02 -2.79783844e+02 -2.55474396e+02 -6.30456177e+02
  4.98109894e+01  2.15843735e+02  8.40045929e+01  1.12588898e+02
 -2.35675705e+02  6.25411377e+02  2.43842072e+02 -3.12706909e+02
 -3.54335236e+02  1.53338882e+02  7.79959641e+01 -8.29916382e+01
  6.28118172e+01 -1.43937668e+02 -1.63557709e+02  2.05709229e+02
 -2.15649292e+02  1.72930313e+02 -1.67236511e+02 -1.09734216e+03
  1.32000702e+02 -3.79493866e+02  9.74276581e+01 -3.76270905e+02
  2.08139205e+00 -1.30563705e+02  5.71233292e+01 -2.62667053e+02
 -5.63552513e+01 -2.70164032e+01 -4.57405128e+01  1.09456078e+02
 -9.02489662e+00 -2.83179871e+02  6.51567917e+01 -1.39847443e+02
 -4.29040009e+02  4.61027184e+01 -7.60779495e+01  2.24534729e+02
 -1.18828796e+02  3.03453186e+02 -1.85770660e+02  6.11215286e+01
 -1.20118322e+01 -4.48816

In [18]:
# show scores for (Jean Van Hamme, occupation, butcher)
pd.DataFrame([(str(lbl), dist[ent2id[ent]], ranks[ent2id[ent]]) for ent, lbl in professions],
        columns=('Occupation', 'Score', 'Rank'))

Unnamed: 0,Occupation,Score,Rank
0,butcher,5196.63623,86817


In [19]:
# what would be more plausible occupations?
pd.DataFrame([
    (id2ent[idx][len(WD):], ent2lbl[id2ent[idx]], dist[idx], rank+1)
    for rank, idx in enumerate(most_likely[:10])],
    columns=('Entity', 'Label', 'Score', 'Rank'))

Unnamed: 0,Entity,Label,Score,Rank
0,Q36180,writer,3223.151367,1
1,Q33999,actor,3382.129883,2
2,Q6625963,novelist,3397.58252,3
3,Q1930187,journalist,3492.735352,4
4,Q4610556,model,3549.961182,5
5,Q639669,musician,3616.397949,6
6,Q67311526,Obalky knih.cz,3628.230957,7
7,Q150,French,3667.630615,8
8,Q1028181,painter,3694.66748,9
9,Q245068,comedian,3744.303955,10


## Entity Similarity

In [20]:
# which entities are similar to "Harry Potter and the Goblet of Fire"
ent = ent2id[WD.Q102225]
# we compare the embedding of the query entity to all other entity embeddings
dist = pairwise_distances(entity_emb[ent].reshape(1, -1), entity_emb).reshape(-1)
# order by plausibility
most_likely = dist.argsort()

pd.DataFrame([
    (
        id2ent[idx][len(WD):], # qid
        ent2lbl[id2ent[idx]],  # label
        dist[idx],             # score
        rank+1,                # rank
    )
    for rank, idx in enumerate(most_likely[:15])],
    columns=('Entity', 'Label', 'Score', 'Rank'))

Unnamed: 0,Entity,Label,Score,Rank
0,Q102225,Harry Potter and the Goblet of Fire,0.0,1
1,Q102235,Harry Potter and the Order of the Phoenix,1839.364746,2
2,Q161678,Harry Potter and the Deathly Hallows – Part 1,1864.835327,3
3,Q161687,Harry Potter and the Half-Blood Prince,1895.976562,4
4,Q232009,Harry Potter and the Deathly Hallows – Part 2,1941.686646,5
5,Q102244,Harry Potter and the Chamber of Secrets,2000.969482,6
6,Q102438,Harry Potter and the Philosopher's Stone,2059.114502,7
7,Q102448,Harry Potter and the Prisoner of Azkaban,2140.694092,8
8,Q18199330,Fantastic Beasts and Where to Find Them,2350.040771,9
9,Q1880543,Harry Potter and the Deathly Hallows,2419.553467,10


## Recovering categories

In [16]:
# hmm, our graph contains no parent class of bridge (Q12280)...
set(graph.objects(WD.Q12280, WDT.P279))

set()

In [17]:
# maybe an indirect subclass?
set(graph.objects(WD.Q12280, DDIS.indirectSubclassOf))

set()

In [18]:
# Let's see if we can recover this from embeddings...

# set the head entity to bridge
head = entity_emb[ent2id[WD.Q12280]]
# let's try with wdt:P279 (subClassOf) first
pred = relation_emb[rel2id[WDT.P279]]
# combine according to the TransE scoring function
lhs = head + pred
# compute distance to *any* entity
dist = pairwise_distances(lhs.reshape(1, -1), entity_emb).reshape(-1)
# find most plausible tails
most_likely = dist.argsort()
# show most likely entities
pd.DataFrame([
    (id2ent[idx][len(WD):], ent2lbl[id2ent[idx]], dist[idx], rank+1)
    for rank, idx in enumerate(most_likely[:10])],
    columns=('Entity', 'Label', 'Score', 'Rank'))

Unnamed: 0,Entity,Label,Score,Rank
0,Q12280,bridge,3145.157227,1
1,Q61457040,Ramsar site in Australia,5208.453125,2
2,Q6502866,cliffed coast,5241.154785,3
3,Q595452,baseball venue,5261.777832,4
4,Q19368170,Pont d'en Gómez,5276.720215,5
5,Q2463705,Special Protection Area,5281.016113,6
6,Q2066754,manor,5301.109863,7
7,Q17468479,district of Oulu,5323.046875,8
8,Q1049757,multi-purpose stadium,5326.45752,9
9,Q202570,Ferris wheel,5341.973633,10


In [19]:
# ... didn't really help.
# Let's try ddis:indirectSubclassOf next

# set the head entity to bridge
head = entity_emb[ent2id[WD.Q12280]]
# now we try ddis:indirectSubclassOf
pred = relation_emb[rel2id[DDIS.indirectSubclassOf]]
# combine according to the TransE scoring function
lhs = head + pred
# compute distance to *any* entity
dist = pairwise_distances(lhs.reshape(1, -1), entity_emb).reshape(-1)
# find most plausible tails
most_likely = dist.argsort()
# show most likely entities
pd.DataFrame([
    (id2ent[idx][len(WD):], ent2lbl[id2ent[idx]], dist[idx], rank+1)
    for rank, idx in enumerate(most_likely[:10])],
    columns=('Entity', 'Label', 'Score', 'Rank'))

Unnamed: 0,Entity,Label,Score,Rank
0,Q12280,bridge,4617.544922,1
1,Q27096213,geographic entity,5803.001465,2
2,Q95074,fictional character,6175.950195,3
3,Q19368170,Pont d'en Gómez,6219.216797,4
4,Q2282230,River Kwai bridge,6309.948242,5
5,Q1323635,Petit Pont,6325.95166,6
6,Q6502866,cliffed coast,6327.05127,7
7,Q1497364,building complex,6341.214355,8
8,Q2151232,townland,6382.717773,9
9,Q2080521,market hall,6385.13916,10
