In [1]:
# combine datasets.

from thefuzz import process
from thefuzz import fuzz
import numpy
import pandas
import pathlib
import pydash
import rdflib
import tqdm
import uuid

def print_label(a, g):

    ''' Lookup a label for an entity. '''

    a_label = [o for s,p,o in g.triples((a, rdflib.RDFS.label, None))]
    return str(a_label[0])

def graph_to_dataframe(g, title_predicate):

    ''' Build agent and work dataframe out of a graph. '''

    df = pandas.DataFrame(columns=['agent_uri', 'agent_label'])
    for a, b, c in g.triples((None, None, ont.Person)):
        for d, e ,f in g.triples((a, rdflib.RDFS.label, None)):
            df.loc[len(df)] = [(a), (f)]

    activity = pandas.DataFrame()
    for x in [ont.hasActor, ont.hasDirector, ont.hasProducer]:
        connection = pandas.DataFrame(columns=['agent_uri', 'work_uri'])
        for a, b, c in g.triples((None, x, None)):
            connection.loc[len(connection)] = [(c), (a)]
        activity = pandas.concat([activity, connection])
    df = pandas.merge(df, activity, on='agent_uri', how='left')

    title = pandas.DataFrame(columns=['work_uri', 'work_label'])
    for a, b, c in g.triples((None, None, ont.Work)):
        for d, e ,f in g.triples((a, title_predicate, None)):
            title.loc[len(title)] = [(a), (f)]
    df = pandas.merge(df, title, on='work_uri', how='left')

    return df

def merge_entities(match_df, g):

    ''' Create new graph with merged nodes. '''

    matched_uri = pydash.uniq(list(match_df.a.unique())+list(match_df.b.unique()))
    merge_instances = dict()
    for x in matched_uri:
        family = match_df.loc[match_df.a.isin([x]) | match_df.b.isin([x])]
        if len(family):
            family_members = pydash.uniq(list(family.a.unique())+list(family.b.unique()))
            merge_uri = rdflib.URIRef(f'https://merge/{str(uuid.uuid4())}')
            for f in family_members:
                merge_instances[f] = merge_uri

            match_df = match_df.loc[~match_df.a.isin(family_members)]
            match_df = match_df.loc[~match_df.b.isin(family_members)]

    n_graph = rdflib.Graph()
    for s,p,o in g.triples((None, None, None)):
        if s in merge_instances.keys():
            s = merge_instances[s]
        if o in merge_instances.keys():
            o = merge_instances[o]
        n_graph.add((s,p,o))

    return n_graph

graph = rdflib.Graph()
ont = rdflib.Namespace("https://australian-filmography.wiki/ontology/")

graph.parse(pathlib.Path.cwd().parents[0] / 'source' / 'pike-cooper' / 'pike-cooper.ttl')
graph.parse(pathlib.Path.cwd().parents[0] / 'source' / 'ozmovies' / 'ozmovies.ttl')

dataframe = graph_to_dataframe(graph, ont.hasTitle)

print(len(dataframe))
dataframe.head()

16567


Unnamed: 0,agent_uri,agent_label,work_uri,work_label
0,https://pikecooper/005bc095-ffc1-41d3-9afa-e81...,Dick Ross,https://pikecooper/e5e2bc2d-239f-4871-8f13-fd8...,Shadow of the Boomerang
1,https://pikecooper/02045c46-2cb9-45ae-ab7e-4ca...,George Dean,https://pikecooper/7bdc0f62-cfd3-497d-94bb-4a3...,"A Long, Long Way to Tipperary"
2,https://pikecooper/046937df-0b34-41f9-b6c3-f41...,Lee Robinson,https://pikecooper/27f0b2f2-c6bd-4aaf-b64e-6fe...,The Phantom Stockman
3,https://pikecooper/046937df-0b34-41f9-b6c3-f41...,Lee Robinson,https://pikecooper/6f0567f2-16e7-4ea9-a651-15f...,King of the Coral Sea
4,https://pikecooper/046937df-0b34-41f9-b6c3-f41...,Lee Robinson,https://pikecooper/817a62a8-9a47-4516-abcb-59f...,The Intruders


In [2]:
# agent matching.

sample = dataframe.copy()

match_dataframe = pandas.DataFrame(columns=['a', 'a_label', 'b', 'b_label'])
for sub in tqdm.tqdm(sample.agent_uri.unique()):
    sub_filmography = sample.loc[sample.agent_uri.isin([sub])].to_dict('records')
    sub_labels = pydash.uniq([str(x['agent_label']) for x in sub_filmography])
    label_match = pydash.flatten([(process.extract(x, sample.agent_label.unique(), limit=1000, scorer=fuzz.token_sort_ratio)) for x in sub_labels])
    label_match = [x[0] for x in label_match if x[1] > 70]

    candidates = sample.loc[sample.agent_label.isin(label_match)]
    for can in candidates.agent_uri.unique():
        if can != sub: # note you could also force distinctions here!
            can_filmography = sample.loc[sample.agent_uri.isin([can])].to_dict('records')
            if len(sub_filmography) >= 1 and len(can_filmography) >= 1: # higher numbers return fewer, more confident matches.
                
                can_score = list()
                for f in sub_filmography:
                    match = process.extractOne(str(f['work_label']), list([x['work_label'] for x in can_filmography]), scorer=fuzz.token_sort_ratio)
                    can_score.append(match[1])

                if numpy.median(can_score) == 100:
                    match_dataframe.loc[len(match_dataframe)] = [(can), (print_label(can, graph)), (sub), (print_label(sub, graph))] 

graph = merge_entities(match_dataframe, graph)
print(len(graph))

100%|██████████| 7262/7262 [09:05<00:00, 13.31it/s]


37683


In [3]:
# exact title match.

dataframe = graph_to_dataframe(graph, rdflib.RDFS.label)
overlap = dataframe.copy().drop_duplicates()
overlap = overlap[['agent_uri',  'work_label']]
overlap = overlap[overlap.duplicated(keep=False)].drop_duplicates()

match_dataframe = pandas.DataFrame(columns=['a', 'b'])
for o in overlap.to_dict('records'):
    block = dataframe.copy()
    block = block.loc[block.agent_uri.isin([o['agent_uri']])]
    block = block.loc[block.work_label.isin([o['work_label']])]
    same_values = list(block.work_uri.unique())
    for a in same_values[1:]:
        match_dataframe.loc[len(match_dataframe)] = [(same_values[0]), (a)]

graph = merge_entities(match_dataframe, graph)
print(len(graph))

36760


In [4]:
# exact agent match.

dataframe = graph_to_dataframe(graph, rdflib.RDFS.label)
overlap = dataframe.copy().drop_duplicates()
overlap = overlap[['agent_label', 'work_uri']]
overlap = overlap[overlap.duplicated(keep=False)].drop_duplicates()

match_dataframe = pandas.DataFrame(columns=['a', 'b'])
for o in overlap.to_dict('records'):
    block = dataframe.copy()
    block = block.loc[block.agent_label.isin([o['agent_label']])]
    block = block.loc[block.work_uri.isin([o['work_uri']])]
    same_values = list(block.agent_uri.unique())
    for a in same_values[1:]:
        match_dataframe.loc[len(match_dataframe)] = [(same_values[0]), (a)]

graph = merge_entities(match_dataframe, graph)
print(len(graph)) 

36757


In [5]:
graph.serialize(destination=str(pathlib.Path.cwd() / 'merge.ttl'), format="turtle")
print('all done.')

all done.
