In [1]:

# merge data


In [2]:

# import libraries

import rdflib, pathlib, pydash, uuid
import pandas, unidecode, json
import datetime, numpy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process


In [3]:

# define graph and namespace

# Namespaces used by later cells to build property, item and resource URIs.
name_fiaf = rdflib.Namespace('https://www.fiafnet.org/')
name_wb = rdflib.Namespace('http://wikibas.se/ontology')

# Empty graph; the loading cell adds the ontology and every contribution to it.
graph = rdflib.Graph()


In [4]:

# naive data merge

def concat_graph(address, graph_a):
    """Parse a Turtle file and add its triples to an existing graph.

    Args:
        address: location of the Turtle file (a path object or string).
        graph_a: graph that receives the parsed triples; it is updated in place.

    Returns:
        graph_a, now also containing the triples parsed from address.
    """
    graph_b = rdflib.Graph().parse(str(address), format="turtle")
    graph_a += graph_b
    # Take the file name through pathlib rather than splitting on '/', so the
    # report line stays short on platforms whose path separator is not '/'.
    print(pathlib.PurePath(str(address)).name, '-', len(graph_b), 'triples.')
    return graph_a

# Source files are located relative to the parent of the working directory.
repo_root = pathlib.Path.cwd().resolve().parents[0]

# Ontology first, then each contributor's Turtle file in turn.
graph = concat_graph(repo_root / '1-ontology' / 'ontology.ttl', graph)

contributors = (
    'academy_film_archive',
    'british_film_institute',
    'eye_film_institute',
    'german_federal_archives',
    'library_of_congress',
    'museum_of_modern_art',
    'national_film_archive',
    'swedish_film_institute',
)
for contributor in contributors:
    contribution_file = repo_root / '2-contributions' / contributor / f'{contributor}.ttl'
    graph = concat_graph(contribution_file, graph)

print(datetime.datetime.now(), ';', len(graph), 'triples.')
    

ontology.ttl - 321 triples.
academy_film_archive.ttl - 4685 triples.
british_film_institute.ttl - 12438 triples.
eye_film_institute.ttl - 5394 triples.
german_federal_archives.ttl - 11962 triples.
library_of_congress.ttl - 2415 triples.
museum_of_modern_art.ttl - 7669 triples.
national_film_archive.ttl - 5034 triples.
swedish_film_institute.ttl - 9316 triples.
2021-01-28 13:02:00.761853 ; 59234 triples.


In [5]:

# convert graph to dataframe

# Build the frame with a single constructor call. Appending one row per triple
# through `.loc[len(dataframe)]` re-grows the frame on every insert, which is
# quadratic in the number of triples.
# dtype=object keeps the rdflib terms themselves in the cells; a later cell
# reads the `.language` attribute of label literals straight from this frame.
dataframe = pandas.DataFrame(list(graph), columns=['s','p','o'], dtype=object)

print(datetime.datetime.now(), ';', len(dataframe), 'triples.')
dataframe.head()


2021-01-28 13:09:06.913294 ; 59234 triples.


Unnamed: 0,s,p,o
0,https://www.bfi.org.uk/resource/reference/1991...,https://www.fiafnet.org/ontology/property/cont...,https://www.bfi.org.uk/ontology/item/bfi
1,https://www.bfi.org.uk/resource/claim/9a36aa1b...,https://www.fiafnet.org/ontology/property/mani...,https://www.bfi.org.uk/resource/work/46936
2,https://www.moma.org/resource/claim/7ca0de6f-f...,https://www.fiafnet.org/ontology/property/item_of,https://www.moma.org/resource/manifestation/50...
3,https://www.bfi.org.uk/resource/agent/184808,http://wikibas.se/ontology#claim,https://www.bfi.org.uk/resource/claim/fa39964a...
4,https://www.bundesarchiv.de/resource/item/BSN-...,http://wikibas.se/ontology#claim,https://www.bundesarchiv.de/resource/claim/e51...


In [6]:

# merge work

def merger(data, item):
    """Collect every triple whose subject belongs to the given ontology class.

    Args:
        data: triple frame with columns s, p, o.
        item: class name, inserted into the 'ontology/item/{item}' URI.

    Returns:
        The matching rows of data with the columns renamed to a, b, c.
    """
    instance_of = name_fiaf['ontology/property/instance_of']
    target_class = name_fiaf[f'ontology/item/{item}']

    # rows stating "instance_of <target class>"
    typing_rows = data.loc[data.p.isin([instance_of]) & data.o.isin([target_class])]

    # rows whose object is the subject of one of those typing rows
    member_rows = data.loc[data.o.isin(list(typing_rows.s))]

    # every row sharing a subject with the member rows
    member_triples = data.loc[data.s.isin(list(member_rows.s))]
    member_triples.columns = ['a','b','c']
    return member_triples

def pull_property(data, preprocessed, prop):
    """Look up the values of one property for the rows produced by merger().

    Args:
        data: triple frame with columns s, p, o.
        preprocessed: frame with columns a, b, c, as returned by merger().
        prop: property name, inserted into the 'ontology/property/{prop}' URI.

    Returns:
        Frame with columns e (property value) and a (the matching `a` entry
        of preprocessed), one row per matching property triple.
    """
    wanted = name_fiaf[f'ontology/property/{prop}']

    # property triples whose subject appears in the c column of preprocessed
    hits = data.loc[data.s.isin(list(preprocessed.c)) & data.p.isin([wanted])]
    hits.columns = ['c','d','e']

    # right join: keep one row per property triple that was found
    joined = pandas.merge(preprocessed, hits, on='c', how='right')
    return joined[['e','a']]
    
# One row per title found for a work: e = title value, a = the work's URI.
work_data = pull_property(dataframe, merger(dataframe, 'work'), 'title')

# f will hold the merged URI for each row; '' means "not assigned yet".
work_data['f'] = ''
for n,x in enumerate(work_data.e):
    # Rows already placed in a family by an earlier iteration are not used as a seed.
    if work_data.iloc[n]['f'] == '':

        match_work = work_data.copy()
        # Score title x against every title listed after row n.
        produce = process.extract(x, list(work_data.e)[n+1:], limit=10000, scorer=fuzz.token_set_ratio)
        # Keep every row (at any position) whose title scored 60 or more.
        match_work = match_work.loc[match_work.e.isin([a for a, b in produce if b >= 60])]

        # The seed row's work URI plus the work URIs of all matching rows.
        family = pydash.uniq([work_data.iloc[n]['a']]+list(match_work.a.unique()))
        new_uri = rdflib.URIRef(name_fiaf[f'resource/{str(uuid.uuid4())}'])
        # NOTE(review): this assigns to every row of the family, including rows
        # that already received a URI in an earlier iteration.
        work_data.loc[work_data.a.isin(family), 'f'] = new_uri

# Replace each original work URI with its merged URI wherever it appears as
# subject or object in the triple frame.
for k, v in dict(zip(work_data.a, work_data.f)).items():
    dataframe.loc[dataframe.s.isin([k]), 's'] = v
    dataframe.loc[dataframe.o.isin([k]), 'o'] = v

print(datetime.datetime.now(), ';', len(dataframe), 'triples.')
work_data.head()


2021-01-28 13:09:10.812424 ; 59234 triples.


Unnamed: 0,e,a,f
0,Tabu,https://www.nfa.cz/resource/work/0201933,https://www.fiafnet.org/resource/dac0b04f-78af...
1,Sunrise,https://www.eyefilm.nl/resource/work/FLM64568,https://www.fiafnet.org/resource/eb0024a5-e1e1...
2,Tabu,https://www.eyefilm.nl/resource/work/FLM65135,https://www.fiafnet.org/resource/dac0b04f-78af...
3,Nosferatu,https://www.eyefilm.nl/resource/work/FLM47528,https://www.fiafnet.org/resource/bdea923b-b9a9...
4,SATANAS,https://www.bfi.org.uk/resource/work/276788,https://www.fiafnet.org/resource/c6135953-989b...


In [7]:

# apply work label
    
# Count how often each title occurs per (merged) work, then keep the title
# with the highest count for every work.
work_label_data = (
    pull_property(dataframe, merger(dataframe, 'work'), 'title')
    .assign(x='1')
    .pivot_table(index=['a','e'], aggfunc=lambda group: len(group))
    .reset_index()
    .sort_values(by='x', ascending=False)
    .drop_duplicates(subset='a', keep='first')
)

# Append one rdfs:label triple per work, tagged 'en'.
for work_uri, title in zip(work_label_data['a'], work_label_data['e']):
    dataframe.loc[len(dataframe)] = [work_uri, rdflib.RDFS.label, rdflib.Literal(title, lang='en')]

print(datetime.datetime.now(), ';', len(dataframe), 'triples.')
work_label_data.head()


2021-01-28 13:09:11.390144 ; 59255 triples.


Unnamed: 0,a,e,x
30,https://www.fiafnet.org/resource/bc4c2ca8-4ec2...,Der letzte Mann,5
45,https://www.fiafnet.org/resource/dac0b04f-78af...,Tabu,5
5,https://www.fiafnet.org/resource/14298782-2157...,Phantom,5
26,https://www.fiafnet.org/resource/b1b478e2-3930...,City Girl,4
41,https://www.fiafnet.org/resource/d5257266-4d45...,Faust,4


In [8]:

# merge agent

def merge_name(row):
    """Join the 'fore' and 'sur' entries of a row with a space, trimming the ends."""
    full_name = row['fore'] + ' ' + row['sur']
    return full_name.strip()

# Forenames and surnames found for agents: a = agent URI, fore / sur = value.
forename_data = pull_property(dataframe, merger(dataframe, 'agent'), 'forename').rename(columns={'e':'fore'})
surname_data = pull_property(dataframe, merger(dataframe, 'agent'), 'surname').rename(columns={'e':'sur'})
# Outer join on the agent URI: an agent with several forenames or surnames
# yields one row per combination, and a missing side becomes ''.
name_data = pandas.merge(forename_data, surname_data, on='a', how='outer').drop_duplicates().fillna('')

# e = full display name built by merge_name.
name_data['e'] = name_data.apply(merge_name, axis=1)
name_data = name_data[['e','a']].drop_duplicates()

# f will hold the merged URI for each row; '' means "not assigned yet".
name_data['f'] = ''
for n,x in enumerate(name_data.e):
    # Rows already placed in a family by an earlier iteration are not used as a seed.
    if name_data.iloc[n]['f'] == '':

        match_name = name_data.copy()
        # Score name x against every name listed after position n.
        produce = process.extract(x, list(name_data.e)[n+1:], limit=10000, scorer=fuzz.partial_token_sort_ratio)
        # Keep every row (at any position) whose name scored 82 or more.
        match_name = match_name.loc[match_name.e.isin([a for a, b in produce if b >= 82])]

        # The seed row's agent URI plus the agent URIs of all matching rows.
        family = pydash.uniq([name_data.iloc[n]['a']]+list(match_name.a.unique()))
        new_uri = rdflib.URIRef(name_fiaf[f'resource/{str(uuid.uuid4())}'])
        # NOTE(review): this assigns to every row of the family, including rows
        # that already received a URI in an earlier iteration.
        name_data.loc[name_data.a.isin(family), 'f'] = new_uri

# Replace each original agent URI with its merged URI wherever it appears as
# subject or object in the triple frame.
for k, v in dict(zip(name_data.a, name_data.f)).items():
    dataframe.loc[dataframe.s.isin([k]), 's'] = v
    dataframe.loc[dataframe.o.isin([k]), 'o'] = v

print(datetime.datetime.now(), ';', len(dataframe), 'triples.')
name_data.head()


2021-01-28 13:10:02.797013 ; 59255 triples.


Unnamed: 0,e,a,f
0,Alfred Abel,https://www.nfa.cz/resource/agent/03aff1cf-e8f...,https://www.fiafnet.org/resource/b5057690-6d04...
1,Heinrich Witte,https://www.nfa.cz/resource/agent/a38ca830-52f...,https://www.fiafnet.org/resource/601c7c97-4ed0...
2,Friedrich Wilhelm Murnau,https://www.eyefilm.nl/resource/agent/PER62116,https://www.fiafnet.org/resource/8e2d4943-8bd2...
102,Elsa Wagner,https://www.bundesarchiv.de/resource/agent/113798,https://www.fiafnet.org/resource/8cd482b9-370a...
103,Emil Jannings,https://www.moma.org/resource/agent/26716,https://www.fiafnet.org/resource/717f24e8-c906...


In [9]:

# apply agent label

def _most_frequent(prop, column):
    """Return, per agent URI (a), the value of prop with the highest count."""
    counted = (
        pull_property(dataframe, merger(dataframe, 'agent'), prop)
        .rename(columns={'e': column})
        .assign(x='1')
        .pivot_table(index=['a', column], aggfunc=lambda group: len(group))
        .reset_index()
    )
    ranked = counted.sort_values(by='x', ascending=False).drop_duplicates(subset='a', keep='first')
    return ranked[['a', column]].fillna('')

forename_data = _most_frequent('forename', 'fore')
surname_data = _most_frequent('surname', 'sur')

# Combine the chosen forename and surname into one display name per agent.
name_label_data = pandas.merge(forename_data, surname_data, on='a', how='outer').drop_duplicates().fillna('')
name_label_data['e'] = name_label_data.apply(merge_name, axis=1)
name_label_data = name_label_data[['e','a']].drop_duplicates()

# Append one rdfs:label triple per agent, tagged 'en'.
for agent_uri, full_name in zip(name_label_data['a'], name_label_data['e']):
    dataframe.loc[len(dataframe)] = [agent_uri, rdflib.RDFS.label, rdflib.Literal(full_name, lang='en')]

print(datetime.datetime.now(), ';', len(dataframe), 'triples.')
name_label_data.head()


2021-01-28 13:10:07.768278 ; 59555 triples.


Unnamed: 0,e,a
0,Friedrich Wilhelm Murnau,https://www.fiafnet.org/resource/8e2d4943-8bd2...
1,Karl Freund,https://www.fiafnet.org/resource/9ba1d807-7cb4...
2,Carl Mayer,https://www.fiafnet.org/resource/690ae3f0-67cc...
3,Erich Pommer,https://www.fiafnet.org/resource/40b167d0-2248...
4,Emil Jannings,https://www.fiafnet.org/resource/717f24e8-c906...


In [10]:

# merge manifestations

def pull_label(data):
    """Return the English rdfs:label triples of a triple frame.

    Args:
        data: triple frame with columns s, p, o.

    Returns:
        Frame with columns e (labelled subject), f (predicate), g (label
        value) and l (language tag), restricted to rows where l is 'en'.
    """
    labels = data.loc[data.p.isin([rdflib.RDFS.label])].rename(columns={'s':'e', 'p':'f', 'o':'g'})
    # Read the language tag column-wise. getattr tolerates label objects that
    # carry no `.language` attribute (they are filtered out below), and
    # mapping a single column also works when there are no label rows at all,
    # where a row-wise apply would not return an assignable column.
    labels['l'] = labels['g'].map(lambda term: getattr(term, 'language', None))
    labels = labels.loc[labels.l.isin(['en'])]
    return labels

def merge_label(row):
    """Build the text '<specific> <base> <carrier> of <label>' from a row.

    Surrounding whitespace is stripped after every step, so empty parts do
    not leave stray spaces at either end.
    """
    text = (row['specific'] + ' ' + row['base']).strip()
    for joiner, column in ((' ', 'carrier'), (' of ', 'label')):
        text = (text + joiner + row[column]).strip()
    return text

# For each item (a): the English label of its specific_carrier, base and
# carrier values. Values without an English label come through as missing.
specific_data = pull_property(dataframe, merger(dataframe, 'item'), 'specific_carrier')
specific_data = pandas.merge(specific_data, pull_label(dataframe), on='e', how='left')[['a','g']].rename(columns={'g':'specific'})

base_data = pull_property(dataframe, merger(dataframe, 'item'), 'base')
base_data = pandas.merge(base_data, pull_label(dataframe), on='e', how='left')[['a','g']].rename(columns={'g':'base'})

carrier_data = pull_property(dataframe, merger(dataframe, 'item'), 'carrier')
carrier_data = pandas.merge(carrier_data, pull_label(dataframe), on='e', how='left')[['a','g']].rename(columns={'g':'carrier'})

# Item (a) -> the value of its item_of property (m).
item_manifestation = pull_property(dataframe, merger(dataframe, 'item'), 'item_of').rename(columns={'e':'m'})

# manifestation_of triples renamed so that their subject is 'o' and their
# object is 'e'. Joining on 'o' finds the triples that point at that subject,
# and joining on 'e' attaches the English label of the object.
# Result: m (subject of the pointing triple) -> label.
work_label = dataframe.copy()
work_label = work_label.loc[work_label.p.isin([name_fiaf['ontology/property/manifestation_of']])]
work_label.columns = ['o','q','e']
work_label = pandas.merge(dataframe, work_label, on='o', how='right')
work_label = pandas.merge(work_label, pull_label(dataframe), on='e', how='left')[['s','g']].rename(columns={'s':'m','g':'label'})

# Combine all parts with outer joins; anything missing becomes ''.
manifest_data = pandas.merge(specific_data, base_data, on='a', how='outer').drop_duplicates().fillna('')
manifest_data = pandas.merge(manifest_data, carrier_data, on='a', how='outer').drop_duplicates().fillna('')
manifest_data = pandas.merge(manifest_data, item_manifestation, on='a', how='outer').drop_duplicates().fillna('')
manifest_data = pandas.merge(manifest_data, work_label, on='m', how='outer').drop_duplicates().fillna('')

# e = descriptive text built by merge_label; keep one row per (text, m) pair.
manifest_data['e'] = manifest_data.apply(merge_label, axis=1)
manifest_data = manifest_data[['e','m']].drop_duplicates()

# f will hold the merged URI for each row; '' means "not assigned yet".
manifest_data['f'] = ''
for n,x in enumerate(manifest_data.e):
    # Rows already placed in a family by an earlier iteration are not used as a seed.
    if manifest_data.iloc[n]['f'] == '':

        match_manifest = manifest_data.copy()
        # Score text x against every text listed after position n.
        produce = process.extract(x, list(manifest_data.e)[n+1:], limit=10000, scorer=fuzz.token_sort_ratio)
        # Keep every row (at any position) whose text scored 99 or more.
        match_manifest = match_manifest.loc[match_manifest.e.isin([a for a, b in produce if b >= 99])]

        # The seed row's m value plus the m values of all matching rows.
        family = pydash.uniq([manifest_data.iloc[n]['m']]+list(match_manifest.m.unique()))
        new_uri = rdflib.URIRef(name_fiaf[f'resource/{str(uuid.uuid4())}'])
        # NOTE(review): this assigns to every row of the family, including rows
        # that already received a URI in an earlier iteration.
        manifest_data.loc[manifest_data.m.isin(family), 'f'] = new_uri

# Replace each original m value with its merged URI wherever it appears as
# subject or object in the triple frame.
for k, v in dict(zip(manifest_data.m, manifest_data.f)).items():
    dataframe.loc[dataframe.s.isin([k]), 's'] = v
    dataframe.loc[dataframe.o.isin([k]), 'o'] = v

print(datetime.datetime.now(), ';', len(dataframe), 'triples.')
manifest_data.head()


2021-01-28 13:10:29.000820 ; 59555 triples.


Unnamed: 0,e,m,f
0,35mm Acetate Film of Der letzte Mann,https://www.bundesarchiv.de/resource/manifesta...,https://www.fiafnet.org/resource/a738b44e-b78d...
1,VHS Acetate Videotape of Nosferatu,https://www.bundesarchiv.de/resource/manifesta...,https://www.fiafnet.org/resource/2b170ee4-5891...
2,35mm Nitrate Film of Tabu,https://www.eyefilm.nl/resource/manifestation/...,https://www.fiafnet.org/resource/2c7be1e0-4dbf...
3,VHS Acetate Videotape of Der letzte Mann,https://www.bundesarchiv.de/resource/manifesta...,https://www.fiafnet.org/resource/6589153c-d638...
4,35mm Nitrate Film of Der brennende Acker,https://www.bundesarchiv.de/resource/manifesta...,https://www.fiafnet.org/resource/4abb58a0-edea...


In [11]:

# apply manifestation label

# Append one rdfs:label triple (tagged 'en') per row of manifest_data, using
# the merged URI in column f as subject and the text in column e as label.
for merged_uri, label_text in zip(manifest_data['f'], manifest_data['e']):
    dataframe.loc[len(dataframe)] = [merged_uri, rdflib.RDFS.label, rdflib.Literal(label_text, lang='en')]

print(datetime.datetime.now(), ';', len(dataframe), 'triples.')
manifest_data.head()


2021-01-28 13:10:35.126721 ; 60048 triples.


Unnamed: 0,e,m,f
0,35mm Acetate Film of Der letzte Mann,https://www.bundesarchiv.de/resource/manifesta...,https://www.fiafnet.org/resource/a738b44e-b78d...
1,VHS Acetate Videotape of Nosferatu,https://www.bundesarchiv.de/resource/manifesta...,https://www.fiafnet.org/resource/2b170ee4-5891...
2,35mm Nitrate Film of Tabu,https://www.eyefilm.nl/resource/manifestation/...,https://www.fiafnet.org/resource/2c7be1e0-4dbf...
3,VHS Acetate Videotape of Der letzte Mann,https://www.bundesarchiv.de/resource/manifesta...,https://www.fiafnet.org/resource/6589153c-d638...
4,35mm Nitrate Film of Der brennende Acker,https://www.bundesarchiv.de/resource/manifesta...,https://www.fiafnet.org/resource/4abb58a0-edea...


In [12]:

# apply item label

def renamer(row):
    """Return '<held> item <ext_id>' for a row, with surrounding whitespace removed."""
    parts = [str(row['held']), str(row['ext_id'])]
    return ' item '.join(parts).strip()

# External identifier of every item (a -> ext_id).
id_data = (
    pull_property(dataframe, merger(dataframe, 'item'), 'external_id')
    .rename(columns={'e':'ext_id'})
    .drop_duplicates()
)

# English label of the held_at value of every item (a -> held).
held_at = pull_property(dataframe, merger(dataframe, 'item'), 'held_at')
held_labelled = pandas.merge(held_at, pull_label(dataframe), on='e', how='left')
held_data = held_labelled[['a','g']].rename(columns={'g':'held'}).drop_duplicates()

# One display label per item, built by renamer from both pieces.
item_data = pandas.merge(held_data, id_data, on='a', how='outer')
item_data['combined'] = item_data.apply(renamer, axis=1)
for item_uri, label_text in zip(item_data['a'], item_data['combined']):
    dataframe.loc[len(dataframe)] = [item_uri, rdflib.RDFS.label, rdflib.Literal(label_text, lang='en')]

print(datetime.datetime.now(), ';', len(dataframe), 'triples.')
item_data.head()


2021-01-28 13:10:40.827398 ; 60540 triples.


Unnamed: 0,a,held,ext_id,combined
0,https://www.bundesarchiv.de/resource/item/BDL-...,German Federal Archives,BDL-50104,German Federal Archives item BDL-50104
1,https://www.oscars.org/film-archiveresource/it...,Academy Film Archive,I247037,Academy Film Archive item I247037
2,https://www.eyefilm.nl/resource/item/KOP53501,Eye Film Institute,KOP53501,Eye Film Institute item KOP53501
3,https://loc.gov/resource/item/4374161,Library of Congress,4374161,Library of Congress item 4374161
4,https://www.bundesarchiv.de/resource/item/BSN-...,German Federal Archives,BSN-1790,German Federal Archives item BSN-1790


In [13]:

# merge claims

def rewrite(row):
    """Replace a group of equivalent claim nodes with one freshly minted URI.

    Args:
        row: a row whose 'c' entry is a comma-separated string of claim URIs.

    Side effects:
        Updates the module-level `dataframe` in place: every subject or
        object equal to one of the listed claims becomes the new URI.
    """
    new_uri = rdflib.URIRef(name_fiaf[f'resource/{str(uuid.uuid4())}'])
    # Compare whole terms instead of using `str.contains`. That method treats
    # the URI as a regular expression ('.' matches any character) and also
    # matches any longer value that merely contains the URI text.
    # Terms are wrapped in URIRef because rdflib only treats terms of the
    # same type as equal. This assumes claim nodes are URIRefs; verify if
    # blank-node claims can occur.
    claims = [rdflib.URIRef(x) for x in row['c'].split(',')]
    dataframe.loc[dataframe.s.isin(claims), 's'] = new_uri
    dataframe.loc[dataframe.o.isin(claims), 'o'] = new_uri

# Claim links: a = subject, c = claim node (rows whose predicate is wb '#claim').
dataframe1 = dataframe.copy()
dataframe1 = dataframe1.loc[dataframe1.p.isin([name_wb['#claim']])]
dataframe1.columns = ['a','b','c']

# Every triple, renamed so that its subject lines up with the claim node
# column: c = subject, d = predicate, e = object.
dataframe2 = dataframe.copy()
dataframe2.columns = ['c','d','e']

# For each (subject, claim node) pair, attach the triples the claim node carries.
dataframe3 = pandas.merge(dataframe1, dataframe2, on='c', how='left')
dataframe3 = dataframe3[['a', 'd', 'e', 'c']].drop_duplicates()
# Group by (subject, predicate, object) and list the claim nodes sharing that
# combination as one sorted, comma-separated string.
dataframe3 = dataframe3.pivot_table(index=['a','d','e'], aggfunc=lambda x: ','.join(sorted(x.unique()))).reset_index()
# A comma means more than one claim node says the same thing about the same subject.
dataframe3 = dataframe3.loc[dataframe3.c.str.contains(',', na=False)]
# rewrite() is called for its side effect on `dataframe`; the return value is unused.
dataframe3.apply(rewrite, axis=1)

# Once claim nodes share a URI their triples become identical rows; drop the repeats.
dataframe = dataframe.drop_duplicates()
print(datetime.datetime.now(), ';', len(dataframe), 'triples.')
dataframe.head()


2021-01-28 13:27:07.226644 ; 47375 triples.


Unnamed: 0,s,p,o
0,https://www.bfi.org.uk/resource/reference/1991...,https://www.fiafnet.org/ontology/property/cont...,https://www.bfi.org.uk/ontology/item/bfi
1,https://www.fiafnet.org/resource/7e122d98-1363...,https://www.fiafnet.org/ontology/property/mani...,https://www.fiafnet.org/resource/eb0024a5-e1e1...
2,https://www.moma.org/resource/claim/7ca0de6f-f...,https://www.fiafnet.org/ontology/property/item_of,https://www.fiafnet.org/resource/1e7acd02-f396...
3,https://www.fiafnet.org/resource/40b167d0-2248...,http://wikibas.se/ontology#claim,https://www.bfi.org.uk/resource/claim/fa39964a...
4,https://www.bundesarchiv.de/resource/item/BSN-...,http://wikibas.se/ontology#claim,https://www.bundesarchiv.de/resource/claim/e51...


In [14]:

# convert dataframe to graph

# Rebuild an rdflib graph from the three columns of the triple frame.
export_graph = rdflib.Graph()
for triple in zip(dataframe['s'], dataframe['p'], dataframe['o']):
    export_graph.add(triple)

print(datetime.datetime.now(), ';', len(dataframe), 'triples.')
dataframe.head()


2021-01-28 13:27:16.876664 ; 47375 triples.


Unnamed: 0,s,p,o
0,https://www.bfi.org.uk/resource/reference/1991...,https://www.fiafnet.org/ontology/property/cont...,https://www.bfi.org.uk/ontology/item/bfi
1,https://www.fiafnet.org/resource/7e122d98-1363...,https://www.fiafnet.org/ontology/property/mani...,https://www.fiafnet.org/resource/eb0024a5-e1e1...
2,https://www.moma.org/resource/claim/7ca0de6f-f...,https://www.fiafnet.org/ontology/property/item_of,https://www.fiafnet.org/resource/1e7acd02-f396...
3,https://www.fiafnet.org/resource/40b167d0-2248...,http://wikibas.se/ontology#claim,https://www.bfi.org.uk/resource/claim/fa39964a...
4,https://www.bundesarchiv.de/resource/item/BSN-...,http://wikibas.se/ontology#claim,https://www.bundesarchiv.de/resource/claim/e51...


In [15]:

# write graph

# Serialise the merged graph as Turtle into the current working directory.
output_path = pathlib.Path.cwd() / 'merge.ttl'
export_graph.serialize(destination=str(output_path), format="turtle")
print(datetime.datetime.now(), ';', len(export_graph), 'triples.')


2021-01-28 13:27:32.367924 ; 47375 triples.
