In [None]:

# Bundesarchiv


In [2]:

# import libraries

import rdflib, pandas, pathlib, json
import numpy, uuid, xmltodict, pydash


In [3]:

# Namespaces used to mint URIs, and the single RDF graph every later cell writes into.
# NOTE(review): the Wikibase ontology is normally published under 'http://wikiba.se/ontology#';
# confirm that the 'wikibas.se' host below is intentional before changing it.

name_fiaf = rdflib.Namespace("https://www.fiafnet.org/")
name_wb = rdflib.Namespace("http://wikibas.se/ontology")
name_ba = rdflib.Namespace("https://www.bundesarchiv.de/")
graph = rdflib.Graph()


In [4]:

# useful functions

def make_claim(s, p, o):
    """Attach a new claim node to `s` and record `p` / `o` on that node.

    A claim URI is minted under ``resource/claim/`` in the ``name_ba``
    namespace from a random UUID. Two triples are added to the module-level
    ``graph``: ``(s, name_wb['#claim'], claim)`` and ``(claim, p, o)``.

    Returns the URI of the newly minted claim node.
    """
    statement = name_ba['resource/claim/' + str(uuid.uuid4())]
    for triple in ((s, name_wb['#claim'], statement), (statement, p, o)):
        graph.add(triple)
    return statement

def make_qual(s, p, o):
    """Attach a new qualifier node to `s` and record `p` / `o` on that node.

    A qualifier URI is minted under ``resource/qualifier/`` in the ``name_ba``
    namespace from a random UUID. `s` is linked to it with
    ``name_wb['#qualifier']`` and the triple ``(qualifier, p, o)`` is added to
    the module-level ``graph``.

    Returns the URI of the newly minted qualifier node.
    """
    node = name_ba['resource/qualifier/%s' % uuid.uuid4()]
    link = (s, name_wb['#qualifier'], node)
    graph.add(link)
    graph.add((node, p, o))
    return node

def reference(claim_id, institute):
    """Attach a reference node to a claim, naming the contributing institution.

    A reference URI is minted under ``resource/reference/`` in the ``name_ba``
    namespace from a random UUID. The claim is linked to it with
    ``name_wb['#reference']`` and the reference node points at `institute`
    through the FIAF ``contributed_by`` property.

    Parameters:
        claim_id: URI of the claim being referenced.
        institute: URI of the institution the statement is attributed to.

    Returns the URI of the newly minted reference node, in line with
    ``make_claim`` and ``make_qual``. Existing callers that ignore the
    return value are unaffected.
    """
    ref_id = name_ba[f"resource/reference/{uuid.uuid4()}"]
    graph.add((claim_id, name_wb['#reference'], ref_id))
    graph.add((ref_id, name_fiaf['ontology/property/contributed_by'], institute))
    return ref_id
    
def single_list(data):
    """Return `data` itself when it is a list, otherwise wrap it in a one-element list."""
    return data if isinstance(data, list) else [data]


In [5]:

# define institution: bilingual labels plus type and location claims for the Bundesarchiv item

for label_text, label_lang in (('German Federal Archives', 'en'), ('Bundesarchiv', 'de')):
    graph.add((name_ba['ontology/item/barch'], rdflib.RDFS.label, rdflib.Literal(label_text, lang=label_lang)))

for prop, target in (('instance_of', 'holding_institution'), ('located_in', 'germany')):
    make_claim(name_ba['ontology/item/barch'], name_fiaf[f'ontology/property/{prop}'], name_fiaf[f'ontology/item/{target}'])

print(len(graph))


6


In [6]:

# format data: read the export as strings, keep rows whose MURNAU_ROLE is
# 'Regie / Spielleitung / Realisation', and remap two WORK_ID values

data = (
    pandas.read_csv(pathlib.Path.cwd() / 'Murnau_BArch_210106.csv', dtype='str')
    .loc[lambda frame: frame.MURNAU_ROLE.isin(['Regie / Spielleitung / Realisation'])]
    .replace({'WORK_ID': {'307190': '1210', '307191': '5431'}})
)

print(len(graph))


6


In [7]:

# write work: one work resource per distinct WORK_ID, carrying its Bundesarchiv identifier

for work_id in data.WORK_ID.unique():
    work = name_ba['resource/work/' + str(work_id)]

    make_claim(work, name_fiaf['ontology/property/instance_of'], name_fiaf['ontology/item/work'])

    # external identifier, qualified with the issuing institution and referenced to it
    id_claim = make_claim(work, name_fiaf['ontology/property/external_id'], rdflib.Literal(work_id))
    make_qual(id_claim, name_fiaf['ontology/property/institution'], name_ba['ontology/item/barch'])
    reference(id_claim, name_ba['ontology/item/barch'])

print(len(graph))


86


In [8]:

# write title: a single 'Originaltitel' per work

# NOTE(review): the CSV is read with dtype='str', so this descending sort on TITLE_ID is
# lexicographic ('99' sorts above '100') — confirm that is the intended way to pick the kept title.
title_data = (
    data[['WORK_ID', 'TITLE_ID', 'TITLE', 'TITLE_TYPE']]
    .drop_duplicates()
    .loc[lambda frame: frame.TITLE_TYPE.isin(['Originaltitel'])]
    .sort_values(by='TITLE_ID', ascending=False)
    .drop_duplicates(subset=['WORK_ID'], keep='first')
)

for record in title_data.to_dict(orient='records'):
    work = name_ba[f"resource/work/{record['WORK_ID']}"]

    title_claim = make_claim(work, name_fiaf['ontology/property/title'], rdflib.Literal(record['TITLE']))
    make_qual(title_claim, name_fiaf['ontology/property/title_type'], name_fiaf['ontology/item/original_title'])
    reference(title_claim, name_ba['ontology/item/barch'])

print(len(graph))


146


In [9]:

# write agents

# distinct combinations of work, credit function, person id and credited name
agent_columns = ['WORK_ID', 'CC_FUNCTION', 'CC_PERSON_ID', 'CREDIT_CAST_NAME']
agent_data = data.loc[:, agent_columns].drop_duplicates()

def write_credit(role, uri):
    """Write the credits of one function for every matching row of ``agent_data``.

    Parameters:
        role: value of ``CC_FUNCTION`` selecting the rows to process.
        uri: agent-type item attached as a qualifier to each work -> agent claim.

    A row is skipped when its credited name is missing, contains 'Ufa', or is
    four characters or shorter after clean-up. For every remaining row the
    function writes the work -> agent claim (qualified with `uri`), the
    agent's type, external identifier, forename (when more than one name part
    exists), surname, and the agent -> work claim. Sourced claims are
    referenced to the Bundesarchiv item.
    """
    for row in agent_data.to_dict(orient='records'):
        raw_name = row['CREDIT_CAST_NAME']

        # pandas.notna instead of an identity test against numpy.nan: a missing value
        # is not guaranteed to be that exact object, and a float reaching the
        # substring test would raise TypeError.
        if not (row['CC_FUNCTION'] == role and pandas.notna(raw_name) and 'Ufa' not in raw_name):
            continue

        name = raw_name.replace('°', '').replace('(ZDF-Fassg. 1983)', '')
        if '.' in name:
            # keep only the text after the last full stop
            name = name.split('.')[-1]
        if ',' in name:
            # 'Surname, Forename' -> 'Forename Surname'
            name = name.split(',')[1].strip() + ' ' + name.split(',')[0].strip()
        name = name.strip()

        if len(name) <= 4:
            continue
        name_parts = [part for part in name.split(' ') if part != '']

        work_id = row['WORK_ID']
        work = name_ba[f"resource/work/{work_id}"]

        agent_id = row['CC_PERSON_ID']
        agent = name_ba[f"resource/agent/{agent_id}"]

        # work -> agent, qualified with the kind of credit
        claim_id = make_claim(work, name_fiaf['ontology/property/agent'], agent)
        make_qual(claim_id, name_fiaf['ontology/property/agent_type'], uri)
        reference(claim_id, name_ba['ontology/item/barch'])

        make_claim(agent, name_fiaf['ontology/property/instance_of'], name_fiaf['ontology/item/agent'])

        claim_id = make_claim(agent, name_fiaf['ontology/property/external_id'], rdflib.Literal(agent_id))
        make_qual(claim_id, name_fiaf['ontology/property/institution'], name_ba['ontology/item/barch'])
        reference(claim_id, name_ba['ontology/item/barch'])

        # everything before the last name part is treated as the forename
        if len(name_parts) > 1:
            claim_id = make_claim(agent, name_fiaf['ontology/property/forename'], rdflib.Literal(' '.join(name_parts[:-1])))
            reference(claim_id, name_ba['ontology/item/barch'])

        claim_id = make_claim(agent, name_fiaf['ontology/property/surname'], rdflib.Literal(name_parts[-1]))
        reference(claim_id, name_ba['ontology/item/barch'])

        # agent -> work back-link
        claim_id = make_claim(agent, name_fiaf['ontology/property/work'], work)
        reference(claim_id, name_ba['ontology/item/barch'])

# CC_FUNCTION value in the source data -> slug of the FIAF agent-type item, in processing order
credit_roles = (
    ('Darsteller', 'cast'),
    ('Regie / Spielleitung / Realisation', 'director'),
    ('Produzent', 'producer'),
    ('Kamera/Bild/Bildgestaltung/Fotografie', 'cinematographer'),
    ('Redaktion', 'editor'),
    ('Drehbuch', 'screenwriter'),
    ('Musik (Filmkomponist)', 'composer'),
)

for source_role, agent_type in credit_roles:
    write_credit(source_role, name_fiaf[f'ontology/item/{agent_type}'])

print(len(graph))


3622


In [10]:

# write events

event_col = ['WORK_ID', 'EVENT_TYPE', 'EVENT_AGENT_FUNCTION', 'DATE_FROM', 'DECISION']


def _write_events(events, event_type, with_certificate=False):
    """Add one dated event claim per row of `events` to the work it belongs to.

    Parameters:
        events: DataFrame with at least WORK_ID and DATE_FROM (and DECISION
            when `with_certificate` is set).
        event_type: item URI written as the ``event_type`` qualifier.
        with_certificate: when True, also write the row's DECISION as a
            ``certificate`` qualifier, provided it is not missing.

    Every event is qualified with country Germany and referenced to the
    Bundesarchiv item. DATE_FROM is normalised to ``YYYY-MM-DD``.
    """
    for row in events.to_dict(orient='records'):
        work = name_ba[f"resource/work/{row['WORK_ID']}"]
        date = pandas.to_datetime(row['DATE_FROM']).strftime('%Y-%m-%d')

        claim_id = make_claim(work, name_fiaf['ontology/property/event'], rdflib.Literal(date))
        make_qual(claim_id, name_fiaf['ontology/property/event_type'], event_type)
        make_qual(claim_id, name_fiaf['ontology/property/country'], name_fiaf['ontology/item/germany'])
        # Without this guard a missing DECISION would be written to the graph as a NaN literal.
        if with_certificate and pandas.notna(row['DECISION']):
            make_qual(claim_id, name_fiaf['ontology/property/certificate'], rdflib.Literal(row['DECISION']))
        reference(claim_id, name_ba['ontology/item/barch'])


# rows whose EVENT_AGENT_FUNCTION is 'Prüfstelle' are written as censorship decisions
decision_event = data.loc[data.EVENT_AGENT_FUNCTION.isin(['Prüfstelle'])][event_col].drop_duplicates().dropna(subset=['DATE_FROM'])
_write_events(decision_event, name_fiaf['ontology/item/decision_censorship'], with_certificate=True)

# rows whose EVENT_TYPE is 'PUBLIKATION' are written as publication events
publication_event = data.loc[data.EVENT_TYPE.isin(['PUBLIKATION'])][event_col].drop_duplicates().dropna(subset=['DATE_FROM'])
_write_events(publication_event, name_fiaf['ontology/item/publication'])

print(len(graph))


3890


In [11]:

# write manifestations / items

item_data = data[['WORK_ID', 'SIGNATUR', 'CARRIER', 'MEDIENART', 'COLOUR', 'MEDIA_TYPE', 'MATERIAL_TYPE',
                  'FILM_FORMAT', 'DURATION_MIN', 'LENGTH_M']].drop_duplicates()

# Per column: source value -> slug of the FIAF ontology item that replaces it.
# Source values are matched exactly, so trailing spaces in the keys are significant.
item_vocabulary = {
    'CARRIER': {
        'Triazetatzellulose': 'acetate',
        'Zellulosenitrat': 'nitrate',
        'Polyethylenterephtalat (Polyester) ': 'polyester',
    },
    'MEDIENART': {
        'FILM': 'film',
        'VIDEO': 'video_tape',
        'DATEN': 'digital',
    },
    'COLOUR': {
        'schwarz-weiß': 'black_and_white',
        'Farbe': 'colour',
        'schwarz-weiß und Farbe': 'black_and_white_and_colour',
    },
    'MEDIA_TYPE': {
        'Bild': 'silent',
        'Bild/Ton': 'sound',
    },
    'MATERIAL_TYPE': {
        'Stumme Kopie ': 'print',
        'VHS': 'vhs',
        'Tonnegativ': 'negative',
        'Bildduplikatnegativ': 'duplicate_negative',
        'Stummes Duplikatnegativ': 'duplicate_negative',
        'Kombinierte Kopie': 'print',
        'VHS/BA': 'vhs',
        'Umatic': 'umatic',
        'Stummes Duplikatpositiv': 'duplicate_positive',
        'Stummes Originalnegativ': 'original_negative',
        'DVD/TC': 'dvd',
        'DVD': 'dvd',
        'Bildnegativ': 'negative',
        'Intermediate Positive': 'duplicate_positive',
        'Stummes Internegativ': 'duplicate_negative',
        'VHS/BA + TC': 'vhs',
        'VHS/TC': 'vhs',
        'Bildpositiv': 'duplicate_positive',
        'kombiniertes Duplikatnegativ': 'duplicate_negative',
    },
    'FILM_FORMAT': {
        '35 mm': '35mm',
        '16 mm': '16mm',
        '9,5 mm': '9mm',
    },
}

# one replace call per column, swapping each known source value for its item URI
for column, slugs in item_vocabulary.items():
    uris = {source: name_fiaf[f'ontology/item/{slug}'] for source, slug in slugs.items()}
    item_data = item_data.replace({column: uris})

def _referenced_claim(subject, prop, value):
    """Add ``subject --prop--> value`` as a claim referenced to the Bundesarchiv item.

    `prop` is the slug of a FIAF ontology property. Returns the claim URI.
    """
    claim_id = make_claim(subject, name_fiaf[f'ontology/property/{prop}'], value)
    reference(claim_id, name_ba['ontology/item/barch'])
    return claim_id


# MATERIAL_TYPE values written as a specific carrier instead of an element
video_formats = [name_fiaf['ontology/item/vhs'], name_fiaf['ontology/item/umatic'], name_fiaf['ontology/item/dvd']]

for row in item_data.to_dict(orient='records'):

    work = name_ba[f"resource/work/{row['WORK_ID']}"]

    # every row gets its own manifestation, identified by a random UUID
    manifestation = name_ba[f"resource/manifestation/{uuid.uuid4()}"]

    make_claim(manifestation, name_fiaf['ontology/property/instance_of'], name_fiaf['ontology/item/manifestation'])
    make_claim(manifestation, name_fiaf['ontology/property/manifestation_of'], work)

    item_id = row['SIGNATUR']
    item = name_ba[f"resource/item/{item_id}"]

    make_claim(item, name_fiaf['ontology/property/instance_of'], name_fiaf['ontology/item/item'])
    make_claim(item, name_fiaf['ontology/property/item_of'], manifestation)

    _referenced_claim(item, 'held_at', name_ba['ontology/item/barch'])

    id_claim = _referenced_claim(item, 'external_id', rdflib.Literal(item_id))
    make_qual(id_claim, name_fiaf['ontology/property/institution'], name_ba['ontology/item/barch'])

    # vocabulary columns: only values that were replaced by a URI are written
    for column, prop in (('CARRIER', 'base'), ('MEDIENART', 'carrier'), ('FILM_FORMAT', 'specific_carrier'),
                         ('COLOUR', 'colour'), ('MEDIA_TYPE', 'sound')):
        if isinstance(row[column], rdflib.URIRef):
            _referenced_claim(item, prop, row[column])

    material = row['MATERIAL_TYPE']
    if isinstance(material, rdflib.URIRef):
        _referenced_claim(item, 'specific_carrier' if material in video_formats else 'element', material)

    # measurements: skip missing values and the literal string '0'.
    # pandas.notna is used instead of an identity test against numpy.nan, which only
    # works when the missing value happens to be that exact object.
    for column, prop in (('LENGTH_M', 'extent_metres'), ('DURATION_MIN', 'duration')):
        value = row[column]
        if pandas.notna(value) and str(value) != '0':
            _referenced_claim(item, prop, rdflib.Literal(value))

    # links from work to manifestation and from manifestation to item
    make_claim(work, name_fiaf['ontology/property/manifestation'], manifestation)
    make_claim(manifestation, name_fiaf['ontology/property/item'], item)

print(len(graph))


11962


In [12]:

# write the finished graph as Turtle into the current working directory
turtle_path = pathlib.Path.cwd() / 'barch.ttl'
graph.serialize(destination=str(turtle_path), format="turtle")
print(len(graph))


11962
