In [1]:

# merge, add labelling to work, then add labelling to agent and manifestations.
# then merge manifestations based on label



In [2]:

# naive merge of triples

import rdflib, pathlib, pydash, uuid
import pandas, unidecode, json, numpy
import datetime
from fuzzywuzzy import fuzz

def concat_graph(address, graph_a):
    graph_b = rdflib.Graph().parse(str(address), format="turtle")
    graph_a += graph_b
    return graph_a

graph = rdflib.Graph()
graph = concat_graph(pathlib.Path.cwd().resolve().parents[0] / '1-ontology' / 'fiaf.ttl', graph) 
# for x in ['eye', 'nfa', 'sfi', 'moma', 'loc', 'barch', 'afa']:

for x in ['sfi', 'moma', 'eye', 'nfa', 'barch', 'loc']:
    graph = concat_graph(pathlib.Path.cwd().resolve().parents[0] / '2-contributors' / x / f'{x}.ttl', graph) 

print(datetime.datetime.now(), len(graph))


2021-01-16 07:28:15.527503 42006


In [3]:

# merging works, currently performed by just exploiting title similarities


name_wb = rdflib.Namespace('http://wikibas.se/ontology')
name_fiaf = rdflib.Namespace("https://www.fiafnet.org/")

def pull_subject(p, o):
    table = pandas.DataFrame(columns=['SUBJECT'])
    for a,b,c in graph.triples((None, name_fiaf['ontology/property/instance_of'], o)):
        for d,e,f in graph.triples((None, name_wb['#claim'], a)):
            table.loc[len(table)] = [(d)]            
    return table

def pull_property(p):
    table = pandas.DataFrame(columns=['SUBJECT', 'OBJECT'])
    for a,b,c in graph.triples((None, p, None)):
        for d,e,f in graph.triples((None, name_wb['#claim'], a)):
            table.loc[len(table)] = [(d), (c)]
    return table

data_a = pull_subject(name_fiaf['ontology/property/instance_of'], name_fiaf['ontology/item/work']) 
data_b = pull_property(name_fiaf['ontology/property/title'])
works = pandas.merge(data_a, data_b, on='SUBJECT', how='left').drop_duplicates()    

works['hinge'] = '.'
works = pandas.merge(works.rename(columns={'SUBJECT':'SUBJECT_a', 'OBJECT':'OBJECT_a'}), 
                     works.rename(columns={'SUBJECT':'SUBJECT_b', 'OBJECT':'OBJECT_b'}),
                     on='hinge', how='outer')

def title_matching(row):
    left = unidecode.unidecode(row['OBJECT_a']).upper()
    right = unidecode.unidecode(row['OBJECT_b']).upper()
    ratio = fuzz.partial_ratio(left, right)
    return ratio

works['fuzz'] = works.apply(title_matching, axis=1)
works = works.sort_values(by='fuzz', ascending=False)
works = works.loc[works.fuzz > 85].drop_duplicates()
works = works[['SUBJECT_a', 'SUBJECT_b']]
works = works.loc[works.SUBJECT_a != works.SUBJECT_b].drop_duplicates()

owl_graph = rdflib.Graph()

for x in range(len(works)):
    section = works.iloc[x]
    owl_graph.add((section['SUBJECT_a'], rdflib.OWL.sameAs, section['SUBJECT_b']))

print(datetime.datetime.now(), len(works))
works.head()



2021-01-16 07:28:16.207484 230


Unnamed: 0,SUBJECT_a,SUBJECT_b
1537,https://loc.gov/resource/work/1208391,https://www.bundesarchiv.de/resource/work/5431
1726,https://www.nfa.cz/resource/work/0201933,https://www.bundesarchiv.de/resource/work/77258
1717,https://www.nfa.cz/resource/work/0201933,https://www.eyefilm.nl/resource/work/FLM65135
1714,https://www.nfa.cz/resource/work/0201933,https://www.moma.org/resource/work/W2770
1698,https://www.nfa.cz/resource/work/0201933,https://loc.gov/resource/work/1675


In [4]:

# restart process, # create a list 
# can you link all synonyms and then do a replacer
# what I want to do is string or things, where if there is any way to link it can be done

trails = list()
for s,p,o in owl_graph:
    trails.append([s,o])

for k in range(27):    
    for n, t in enumerate(trails):
        for x in trails:
            if set(t).intersection(set(x)):
                trails[n].extend(x)
        trails[n] = sorted(pydash.uniq(trails[n]))   

graph_a = graph

for t in pydash.uniq(trails):
    graph_b = rdflib.Graph()
    new_uri = rdflib.URIRef(name_fiaf[f'resource/{str(uuid.uuid4())}'])
    
    for s,p,o in graph_a:
        if s in t:
            s = new_uri
        if p in t:
            p = new_uri            
        if o in t:
            o = new_uri   
        graph_b.add((s,p,o))
    graph_a = graph_b
    
graph = graph_a
        
print(datetime.datetime.now(), len(graph))


2021-01-16 07:28:37.603159 42006


In [5]:

# add major title as work label

work_dataframe = pandas.DataFrame(columns=['a','b','c']) # 47 instances of works (from claim)
for a,b,c in graph.triples((None, name_fiaf['ontology/property/instance_of'], name_fiaf['ontology/item/work'])):
    work_dataframe.loc[len(work_dataframe)] = [(a),(b),(c)]
    
claim_dataframe = pandas.DataFrame(columns=['a','b','c']) # 7004 claims in total
for a,b,c in graph.triples((None, name_wb['#claim'], None)):
    claim_dataframe.loc[len(claim_dataframe)] = [(a),(b),(c)]
    
ref_dataframe = pandas.DataFrame(columns=['c','d','e']) # 5307 reference claims
for a,b,c in graph.triples((None, name_wb['#reference'], None)):
    ref_dataframe.loc[len(ref_dataframe)] = [(a),(b),(c)]    
    
contrib_dataframe = pandas.DataFrame(columns=['e','f','institute']) # 
for a,b,c in graph.triples((None, name_fiaf['ontology/property/contributed_by'], None)):
    contrib_dataframe.loc[len(contrib_dataframe)] = [(a),(b),(c)]    
    
contrib_dataframe = pandas.merge(ref_dataframe, contrib_dataframe, on='e', how='right')  # 5307, a is the claim, e is the institute
contrib_dataframe = contrib_dataframe[['c', 'institute']].drop_duplicates()
     
work_subject = claim_dataframe.loc[claim_dataframe.c.isin(list(work_dataframe.a))] # 47, so this is all claims which turn into instance of work
work_claim = claim_dataframe.loc[claim_dataframe.a.isin(list(work_subject.a))] # 1121 all claims from "the works"
    
title_dataframe = pandas.DataFrame(columns=['c','d','e']) # all title claims
for a,b,c in graph.triples((None, name_fiaf['ontology/property/title'], None)):
    title_dataframe.loc[len(title_dataframe)] = [(a),(b),(c)]  
    
title_dataframe = pandas.merge(work_claim, title_dataframe, on='c', how='right') # this is the guy, now we need to blend in referneces
title_dataframe = pandas.merge(title_dataframe, contrib_dataframe, on='c', how='left') # this is the guy, now we need to blend in referneces
title_dataframe = title_dataframe[['a', 'e', 'institute']].drop_duplicates()
title_dataframe = title_dataframe.pivot_table(index=['a','e'], aggfunc=lambda x: len(x.unique())).reset_index()
title_dataframe = title_dataframe.sort_values(by='institute', ascending=False)
title_dataframe = title_dataframe.drop_duplicates(subset='a', keep='first')

# # okay now we can write english lables for each of these, hooray

for x in range(len(title_dataframe)):
    line = title_dataframe.iloc[x]
    graph.add((line['a'], rdflib.RDFS.label, rdflib.Literal(line['e'], lang='en'))) 

print(datetime.datetime.now(), len(graph))


2021-01-16 07:30:00.836712 42025


In [6]:

# # can you tap into the same system for agents?

# okay this is not accurate enough, you need to drill down by work, and by agent-type to find "likes"
# you could probs have different ratios (eg crew can be loose, but cast has to be a bit more strict)

def pull_direct(g, p, o):
    table = pandas.DataFrame(columns=['SUBJECT'])
    for a,b,c in g.triples((None, p, o)):
        table.loc[len(table)] = [(a)]            
    return table

def pull_subject(g, p, o):
    table = pandas.DataFrame(columns=['SUBJECT'])
    for a,b,c in g.triples((None, p, o)):
        for d,e,f in g.triples((None, name_wb['#claim'], a)):
            table.loc[len(table)] = [(d)]            
    return table

def pull_qual(g, p, col):
    table = pandas.DataFrame(columns=['CLAIM', col])
    for a,b,c in g.triples((None, p, None)):
        for d,e,f in g.triples((None, name_wb['#qualifier'], a)):
            table.loc[len(table)] = [(d), (c)]            
    return table

def pull_property_claim(g, p, col, col2):
    table = pandas.DataFrame(columns=['SUBJECT', col, col2])
    for a,b,c in g.triples((None, p, None)):
        for d,e,f in g.triples((None, name_wb['#claim'], a)):
            table.loc[len(table)] = [(d), (c), (f)]
    return table

def pull_property(p, col2, col1):
    table = pandas.DataFrame(columns=[col1, col2])
    for a,b,c in graph.triples((None, p, None)):
        for d,e,f in graph.triples((None, name_wb['#claim'], a)):
            table.loc[len(table)] = [(d), (c)]
    return table


agents = pull_property_claim(graph, name_fiaf['ontology/property/agent'], 'AGENT', 'CLAIM') 
agent_qual = pull_qual(graph, name_fiaf['ontology/property/agent_type'], 'AGENT_TYPE')
agent_dataframe = pandas.merge(agents, agent_qual, on='CLAIM', how='left')

forename = pull_property(name_fiaf['ontology/property/forename'], 'FORENAME', 'AGENT').drop_duplicates()
agent_dataframe = pandas.merge(agent_dataframe, forename, on='AGENT', how='left')

surname = pull_property(name_fiaf['ontology/property/surname'], 'SURNAME', 'AGENT').drop_duplicates()
agent_dataframe = pandas.merge(agent_dataframe, surname, on='AGENT', how='left')


# okay now you can carve up based on agent type and work and see how you go


agent_type = pull_direct(graph, name_fiaf['ontology/property/instance_of'], name_fiaf['ontology/item/agent_type']) 
print(len(agent_type))
works = pull_subject(graph, name_fiaf['ontology/property/instance_of'], name_fiaf['ontology/item/work']) 
print(len(works))

owl_graph = rdflib.Graph()

# for a in [x for x in list(agent_type.SUBJECT)]:
#     for w in pydash.uniq([x for x in list(works.SUBJECT)]):
# #         print(a, w)
        
section = agent_dataframe.copy()
#         section = section.loc[section.SUBJECT.isin([w])]
#         section = section.loc[section.AGENT_TYPE.isin([a])] 
section = section[['AGENT', 'FORENAME', 'SURNAME']].drop_duplicates() # now hinge this!
section['FORENAME'] = section['FORENAME'].str.replace('.','').str.upper()
section['SURNAME'] = section['SURNAME'].str.replace('.','').str.upper()        



section['hinge'] = '.'

section = pandas.merge(section.rename(columns={'AGENT':'AGENT_a', 'FORENAME':'FORENAME_a', 'SURNAME':'SURNAME_a'}), 
             section.rename(columns={'AGENT':'AGENT_b', 'FORENAME':'FORENAME_b', 'SURNAME':'SURNAME_b' }),
             on='hinge', how='outer')

# if len(section):

def matching(row, a, b):
    ratio = fuzz.partial_ratio(str(row[a]), str(row[b]))
    return ratio

section['fuzz_forename'] = section.apply(matching, a = 'FORENAME_a', b = 'FORENAME_b', axis=1)        
section['fuzz_surname'] = section.apply(matching, a = 'SURNAME_a', b = 'SURNAME_b', axis=1)  

section['fuzz_mean'] = (section['fuzz_forename']+section['fuzz_surname'])/2


# agents = agents.sort_values(by='fuzz', ascending=False)

section = section.loc[section.AGENT_a != section.AGENT_b].drop_duplicates()
section = section.loc[section.fuzz_mean >= 80].drop_duplicates()
section = section[['AGENT_a', 'AGENT_b']]

section = section.loc[section.AGENT_a != section.AGENT_b].drop_duplicates()

for x in range(len(section)):
    sect = section.iloc[x]
    owl_graph.add((sect['AGENT_a'], rdflib.OWL.sameAs, sect['AGENT_b']))

print(len(owl_graph))

print(datetime.datetime.now(), len(graph))


7
64
3056
2021-01-16 07:31:13.017530 42025


In [7]:

# restart process, # create a list 
# can you link all synonyms and then do a replacer
# what I want to do is string or things, where if there is any way to link it can be done

trails = list()
for s,p,o in owl_graph:
    trails.append([s,o])

for k in range(27):    
    for n, t in enumerate(trails):
        for x in trails:
            if set(t).intersection(set(x)):
                trails[n].extend(x)
        trails[n] = sorted(pydash.uniq(trails[n]))   

graph_a = graph

for t in pydash.uniq(trails):
    graph_b = rdflib.Graph()
    new_uri = rdflib.URIRef(name_fiaf[f'resource/{str(uuid.uuid4())}'])
    
    for s,p,o in graph_a:
        if s in t:
            s = new_uri
        if p in t:
            p = new_uri            
        if o in t:
            o = new_uri   
        graph_b.add((s,p,o))
    graph_a = graph_b
    
    
graph = graph_a

print(datetime.datetime.now(), len(graph))
        

KeyboardInterrupt: 

In [None]:

# and here we do agent labels
# agent labels are a bit more complex, because you actually have to run twice for fore and sur

agent_dataframe = pandas.DataFrame(columns=['a','b','c']) # 746 instances of agent (from claim)
for a,b,c in graph.triples((None, name_fiaf['ontology/property/instance_of'], name_fiaf['ontology/item/agent'])):
    agent_dataframe.loc[len(agent_dataframe)] = [(a),(b),(c)]
    
claim_dataframe = pandas.DataFrame(columns=['a','b','c']) # 7004 claims in total
for a,b,c in graph.triples((None, name_wb['#claim'], None)):
    claim_dataframe.loc[len(claim_dataframe)] = [(a),(b),(c)]
    
# ref_dataframe = pandas.DataFrame(columns=['c','d','e']) # 5307 reference claims
# for a,b,c in graph.triples((None, name_wb['#reference'], None)):
#     ref_dataframe.loc[len(ref_dataframe)] = [(a),(b),(c)]    
    
# contrib_dataframe = pandas.DataFrame(columns=['e','f','institute']) # 
# for a,b,c in graph.triples((None, name_fiaf['ontology/property/contributed_by'], None)):
#     contrib_dataframe.loc[len(contrib_dataframe)] = [(a),(b),(c)]    
    
# contrib_dataframe = pandas.merge(ref_dataframe, contrib_dataframe, on='e', how='right')  # 5307, a is the claim, e is the institute
# contrib_dataframe = contrib_dataframe[['c', 'institute']].drop_duplicates()
     
agent_subject = claim_dataframe.loc[claim_dataframe.c.isin(list(agent_dataframe.a))] # 746, so this is all claims which turn into instance of work
agent_claim = claim_dataframe.loc[claim_dataframe.a.isin(list(agent_subject.a))] # 3684 all claims from "the works"
    
fore_dataframe = pandas.DataFrame(columns=['c','d','e']) # all fore claims
for a,b,c in graph.triples((None, name_fiaf['ontology/property/forename'], None)):
    fore_dataframe.loc[len(fore_dataframe)] = [(a),(b),(c)]  
    
fore_dataframe = pandas.merge(agent_claim, fore_dataframe, on='c', how='right') # 731 first names
fore_dataframe = pandas.merge(fore_dataframe, contrib_dataframe, on='c', how='left') # this is the guy, now we need to blend in referneces
fore_dataframe = fore_dataframe[['a', 'e', 'institute']].drop_duplicates()
fore_dataframe = fore_dataframe.pivot_table(index=['a','e'], aggfunc=lambda x: len(x.unique())).reset_index()
fore_dataframe = fore_dataframe.sort_values(by='institute', ascending=False)
fore_dataframe = fore_dataframe.drop_duplicates(subset='a', keep='first') # 264 down
fore_dataframe = fore_dataframe[['a','e']].rename(columns={'e':'forename'})

surn_dataframe = pandas.DataFrame(columns=['c','d','e']) # all sur claims
for a,b,c in graph.triples((None, name_fiaf['ontology/property/surname'], None)):
    surn_dataframe.loc[len(surn_dataframe)] = [(a),(b),(c)]  
    
surn_dataframe = pandas.merge(agent_claim, surn_dataframe, on='c', how='right') # 731 first names
surn_dataframe = pandas.merge(surn_dataframe, contrib_dataframe, on='c', how='left') # this is the guy, now we need to blend in referneces
surn_dataframe = surn_dataframe[['a', 'e', 'institute']].drop_duplicates()
surn_dataframe = surn_dataframe.pivot_table(index=['a','e'], aggfunc=lambda x: len(x.unique())).reset_index()
surn_dataframe = surn_dataframe.sort_values(by='institute', ascending=False)
surn_dataframe = surn_dataframe.drop_duplicates(subset='a', keep='first') # 273 down
surn_dataframe = surn_dataframe[['a','e']].rename(columns={'e':'surname'})

names_dataframe = pandas.merge(fore_dataframe, surn_dataframe, on='a', how='outer').fillna('')

# # # okay now we can write english lables for each of these, hooray

for x in range(len(names_dataframe)):
    line = names_dataframe.iloc[x]
    spliced = (line['forename']+' '+line['surname']).strip()
    graph.add((line['a'], rdflib.RDFS.label, rdflib.Literal(spliced, lang='en'))) 

print(datetime.datetime.now(), len(graph))


In [None]:

# okay now lets do manifestation labels. way this works is go down to item level,
# and then return gauge, base, 

# base, sepcific carrier, carrier

manifest_dataframe = pandas.DataFrame(columns=['a','b','c']) # 156 manifests
for a,b,c in graph.triples((None, name_fiaf['ontology/property/instance_of'], name_fiaf['ontology/item/manifestation'])):
    manifest_dataframe.loc[len(manifest_dataframe)] = [(a),(b),(c)]

claim_dataframe = pandas.DataFrame(columns=['a','b','c']) # 7004 claims in total
for a,b,c in graph.triples((None, name_wb['#claim'], None)):
    claim_dataframe.loc[len(claim_dataframe)] = [(a),(b),(c)]
        
    
manifest_subject = claim_dataframe.loc[claim_dataframe.c.isin(list(manifest_dataframe.a))] # 156 manifests
manifest_claim = claim_dataframe.loc[claim_dataframe.a.isin(list(manifest_subject.a))] # 468 totals, now we want to follow down to items

# print(manifest_claim.b.unique())
    
item_property = pandas.DataFrame(columns=['c','d','e']) # all fore claims
for a,b,c in graph.triples((None, name_fiaf['ontology/property/item'], None)):
    item_property.loc[len(item_property)] = [(a),(b),(c)]      
    
item_property = pandas.merge(manifest_claim, item_property, on='c', how='right') # 156 items first names  

secondary_claim = claim_dataframe.copy()
secondary_claim.columns = ['e','f','g']
item_property = pandas.merge(item_property, secondary_claim, on='e', how='left')

# first grab, specific carrier
sc_property = pandas.DataFrame(columns=['g','h','i']) # all fore claims
for a,b,c in graph.triples((None, name_fiaf['ontology/property/specific_carrier'], None)):
    sc_property.loc[len(sc_property)] = [(a),(b),(c)]  
sc_property = pandas.merge(item_property, sc_property, on='g', how='right') # 123 bases items first names   
sc_property = sc_property[['a','i']].drop_duplicates().rename(columns={'i':'sc'})

# second grab, base
base_property = pandas.DataFrame(columns=['g','h','i']) # all fore claims
for a,b,c in graph.triples((None, name_fiaf['ontology/property/base'], None)):
    base_property.loc[len(base_property)] = [(a),(b),(c)]  
base_property = pandas.merge(item_property, base_property, on='g', how='right') # 123 bases items first names   
base_property = base_property[['a','i']].drop_duplicates().rename(columns={'i':'b'})

# third grab, carrier
carr_property = pandas.DataFrame(columns=['g','h','i']) # all fore claims
for a,b,c in graph.triples((None, name_fiaf['ontology/property/carrier'], None)):
    carr_property.loc[len(carr_property)] = [(a),(b),(c)]  
carr_property = pandas.merge(item_property, carr_property, on='g', how='right') # 123 bases items first names   
carr_property = carr_property[['a','i']].drop_duplicates().rename(columns={'i':'c'})

manifestation_rename = pandas.merge(sc_property, base_property, on='a', how='outer')
manifestation_rename = pandas.merge(manifestation_rename, carr_property, on='a', how='outer')


manifestation_property = pandas.DataFrame(columns=['c','d','e']) # all fore claims
for a,b,c in graph.triples((None, name_fiaf['ontology/property/manifestation_of'], None)):
    manifestation_property.loc[len(manifestation_property)] = [(a),(b),(c)] 

manifestation_property = pandas.merge(claim_dataframe, manifestation_property, on='c', how='right')

labels = pandas.DataFrame(columns=['e','f','g']) # all fore claims
for a,b,c in graph.triples((None, rdflib.RDFS.label, None)):
    labels.loc[len(labels)] = [(a),(b),(c)] 
    
manifestation_property = pandas.merge(labels, manifestation_property, on='e', how='right')  
manifestation_property = manifestation_property[['a', 'g']].rename(columns={'g':'label'})

manifestation_rename = pandas.merge(manifestation_property, manifestation_rename, on='a', how='left')

# okay we need to add label as a column

for x in range(len(manifestation_rename)):
    line = manifestation_rename.iloc[x]
    
    spliced = str(line['sc']).split('/')[-1].replace('digibeta', 'Digibeta').replace('mxf', 'MXF').replace('quarter-inch', '1/4-inch')+' '
    if line['b'] is not numpy.nan:
        spliced += str(line['b']).split('/')[-1].title()+' '  
    spliced += str(line['c']).split('/')[-1].replace('_','').title()+' '
    spliced += f"of {line['label']}."#     spliced
    spliced = spliced.strip()
    graph.add((line['a'], rdflib.RDFS.label, rdflib.Literal(spliced, lang='en'))) 

print(datetime.datetime.now(), len(graph))


In [None]:

# okay now attempt to join all like named manifestations
# you really just need a two way hinged data frame

manifest = pandas.DataFrame(columns=['c','d','e']) # all fore claims
for a,b,c in graph.triples((None, name_fiaf['ontology/property/instance_of'], name_fiaf['ontology/item/manifestation'])):
    manifest.loc[len(manifest)] = [(a),(b),(c)] 
    
manifest = pandas.merge(claim_dataframe, manifest, on='c', how='right')    

labels = pandas.DataFrame(columns=['a','label']) # all fore claims
for a,b,c in graph.triples((None, rdflib.RDFS.label, None)):
    labels.loc[len(labels)] = [(a),(c)] 
    
manifest = pandas.merge(manifest, labels, on='a', how='left')    
manifest = manifest[['a', 'label']].drop_duplicates().rename(columns={'a':'manifest'})

manifest['hinge'] = '.'
manifest = pandas.merge(manifest.rename(columns={'manifest':'manifest_a', 'label':'label_a'}), 
                     manifest.rename(columns={'manifest':'manifest_b', 'label':'label_b'}),
                     on='hinge', how='outer')

manifest = manifest.loc[manifest.manifest_a != manifest.manifest_b].drop_duplicates()
manifest = manifest.loc[manifest.label_a == manifest.label_b].drop_duplicates()

owl_graph = rdflib.Graph()
for x in range(len(manifest)):
    section = manifest.iloc[x]
    owl_graph.add((section['manifest_a'], rdflib.OWL.sameAs, section['manifest_b']))
    
    
# so this is the owl filtering code spliced directly - should really be a function huh?    
    
trails = list()
for s,p,o in owl_graph:
    trails.append([s,o])

for k in range(27):    
    for n, t in enumerate(trails):
        for x in trails:
            if set(t).intersection(set(x)):
                trails[n].extend(x)
        trails[n] = sorted(pydash.uniq(trails[n]))   

graph_a = graph

for t in pydash.uniq(trails):
    graph_b = rdflib.Graph()
    new_uri = rdflib.URIRef(name_fiaf[f'resource/{str(uuid.uuid4())}'])
    
    for s,p,o in graph_a:
        if s in t:
            s = new_uri
        if p in t:
            p = new_uri            
        if o in t:
            o = new_uri   
        graph_b.add((s,p,o))
    graph_a = graph_b
    
graph = graph_a
        
print(datetime.datetime.now(), len(graph))



In [None]:

# LAST THING TO DO HERE, add some item labels which will just be institutuion and external id as label, easy
# then write this to local wikibase.
# then write this to online wikibase.


item_labels = pandas.DataFrame(columns=['c','d','e']) # all fore claims
for a,b,c in graph.triples((None, name_fiaf['ontology/property/instance_of'], name_fiaf['ontology/item/item'])):
    item_labels.loc[len(item_labels)] = [(a),(b),(c)] 
    
item_subject = claim_dataframe.loc[claim_dataframe.c.isin(list(item_labels.c))] # 156 manifests
item_claim = claim_dataframe.loc[claim_dataframe.a.isin(list(item_subject.a))] # 1626 totals, now we want to follow down to items

# okay so now we need to issue two requests, the external id and the held at


held_dataframe = pandas.DataFrame(columns=['c','d','e']) # all fore claims
for a,b,c in graph.triples((None, name_fiaf['ontology/property/held_at'], None)):
    held_dataframe.loc[len(held_dataframe)] = [(a),(b),(c)]  
    
held_dataframe = pandas.merge(item_claim, held_dataframe, on='c', how='right') # 731 first names
held_dataframe = held_dataframe[['a','e']].drop_duplicates().rename(columns={'e':'institute'})

exid_dataframe = pandas.DataFrame(columns=['c','d','e']) # all fore claims
for a,b,c in graph.triples((None, name_fiaf['ontology/property/external_id'], None)):
    exid_dataframe.loc[len(exid_dataframe)] = [(a),(b),(c)]  
    
exid_dataframe = pandas.merge(item_claim, exid_dataframe, on='c', how='right') # 731 first names
exid_dataframe = exid_dataframe[['a','e']].drop_duplicates().rename(columns={'e':'external'})

dataframe = pandas.merge(held_dataframe, exid_dataframe, on='a', how='left')

def inst_rename(row):
    if str(row['institute']) == 'https://www.nfa.cz/ontology/item/nfa':
        return 'National Film Archive'
    if str(row['institute']) == 'https://www.filminstitutet.se/ontology/item/sfi':
        return 'Swedish Film Institute'
    if str(row['institute']) == 'https://www.moma.org/ontology/item/moma':
        return 'Museum of Modern Art'
    if str(row['institute']) == 'https://www.eyefilm.nl/ontology/item/eye':
        return 'Eye Film Institute'    
    if str(row['institute']) == 'https://www.bundesarchiv.de/ontology/item/barch':
        return 'Bundesarchiv' 
    if str(row['institute']) == 'https://loc.gov/ontology/item/loc':
        return 'Library of Congress'     

dataframe['institute'] = dataframe.apply(inst_rename, axis=1)

print(dataframe.institute.unique())

for x in range(len(dataframe)):
    line = dataframe.iloc[x]
    spliced = (line['institute']+' '+line['external']).strip()
    graph.add((line['a'], rdflib.RDFS.label, rdflib.Literal(spliced, lang='en'))) 

print(datetime.datetime.now(), len(graph))


In [None]:

# try and link claims, same method as above, find matching claims
# genius idea, can you do a pivot for claim ids?

claim_dataframe = pandas.DataFrame(columns=['S', 'P', 'O', 'C'])

for a,b,c in graph.triples((None, name_wb['#claim'], None)): # this is all yr claims
    for d,e,f in graph.triples((c, None, None)):
        claim_dataframe.loc[len(claim_dataframe)] = [(a), (e), (f), (c)]

claim_dataframe = claim_dataframe.pivot_table(index=['S','P','O'], aggfunc=lambda x: ','.join(sorted(x.unique()))).reset_index()
claim_dataframe = claim_dataframe.loc[claim_dataframe.C.str.contains(',', na=False)]
claim_overlap = list(claim_dataframe.C)
claim_overlap = [x.split(',') for x in claim_overlap]

for n, x in enumerate(claim_overlap):
    claim_overlap[n] = [rdflib.URIRef(y) for y in x]
    
graph_a = graph

for t in pydash.uniq(claim_overlap):
    graph_b = rdflib.Graph()
    new_uri = rdflib.URIRef(name_fiaf[f'resource/{str(uuid.uuid4())}'])
    
    for s,p,o in graph_a:
        if s in t:
            s = new_uri
        if p in t:
            p = new_uri            
        if o in t:
            o = new_uri   
        graph_b.add((s,p,o))
    graph_a = graph_b
        
graph = graph_a
print(datetime.datetime.now(), len(graph))


In [None]:

graph.serialize(destination=str(pathlib.Path.cwd() / 'merge.ttl'), format="turtle")
print(datetime.datetime.now(), len(graph))
