In [16]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from tqdm import tqdm
import regex as re

from rdflib import Graph, Namespace, URIRef, Literal, BNode
from rdflib.namespace import RDF, RDFS

In [73]:
# init operations
# RDF namespaces: ELB is used for the resources minted by this notebook,
# BF for the BIBFRAME classes and properties.
ELB = Namespace("http://literarybibliography.eu/")
BF = Namespace("http://id.loc.gov/ontologies/bibframe/")

# Single module-level graph; preprocess_row() adds every triple to it.
g = Graph()
g.bind("elb", ELB)
g.bind("bf", BF)

uri_base = 'http://literarybibliography.eu/'
output_df = pd.DataFrame(columns=['subject', 'type', 'predicate', 'object'])

# Lookup tables filled by the preprocess_* functions, one empty dict per kind.
entities_dict = {kind: {} for kind in ('authors', 'subjects', 'genreforms')}

# Last numeric id handed out per resource kind; every counter starts at 0.
flow_control = dict.fromkeys(
    (
        'work_last_id',
        'instance_last_id',
        'item_last_id',
        'topic_last_id',
        'genreform_last_id',
        'author_last_id',
    ),
    0,
)

In [72]:
def load_input_df(df_name):
    """Read a CSV file and keep only the rows flagged in the 'do PBL' column.

    Args:
        df_name: Path (or any other source accepted by ``pandas.read_csv``).

    Returns:
        DataFrame in which missing cells were replaced by ``''`` and only the
        rows whose ``'do PBL'`` value equals ``True`` remain.
    """
    frame = pd.read_csv(df_name)
    frame = frame.fillna('')
    # Element-wise comparison on purpose: cells that became '' must not match.
    flagged = frame['do PBL'] == True
    return frame[flagged]

def clear_viaf_uri(url):
    """Reduce a VIAF link to its canonical ``https://viaf.org/viaf/<digits>/`` form.

    Args:
        url: Text expected to start with an ``https://viaf.org/viaf/<digits>`` link.

    Returns:
        The matched prefix with a trailing slash appended, or ``None`` when
        ``url`` does not start with such a link.
    """
    found = re.match(r'https://viaf\.org/viaf/\d+', url)
    if found is None:
        return None
    return found.group(0) + '/'
    
def get_viaf_label(url, timeout=10):
    """Fetch the first main heading of a VIAF record as a display label.

    Args:
        url: VIAF link; it is normalised with ``clear_viaf_uri`` first.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests.get`` can block indefinitely on a stalled connection.

    Returns:
        The ``text`` of the first entry under ``mainHeadings.data``, or
        ``None`` when the link is not a VIAF link, the request fails, the
        record is a redirect, or the payload lacks the expected keys.
    """
    viaf_uri = clear_viaf_uri(url)
    if not viaf_uri:
        return None

    try:
        response = requests.get(viaf_uri + 'viaf.json', timeout=timeout)
    except requests.RequestException:
        # Best-effort enrichment: a network failure just means "no label".
        return None
    if not response.ok:
        return None

    try:
        payload = response.json()  # parse the body once and reuse it
    except ValueError:
        return None
    if not isinstance(payload, dict) or 'redirect' in payload:
        return None

    headings = payload.get('mainHeadings')
    data = headings.get('data') if isinstance(headings, dict) else None
    # 'data' is either a list of headings or a single heading object.
    if isinstance(data, list):
        data = data[0] if data else None
    if isinstance(data, dict):
        return data.get('text')
    return None
    
def get_filmpolski_label(url, timeout=10):
    """Fetch a page and return the text of the ``<h1>`` inside ``<article id="film">``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests.get`` can block indefinitely on a stalled connection.

    Returns:
        The heading text, or ``None`` when the request fails, the response is
        not OK, or the page does not contain the expected article/heading.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best-effort lookup: a network failure just means "no label".
        return None
    if not response.ok:
        return None

    response.encoding = 'utf-8'
    soup = bs(response.text, 'lxml')
    article = soup.find('article', {'id': 'film'})
    if article is None:
        return None
    heading = article.find('h1')
    if heading is None:
        return None
    return heading.text

def preprocess_authors(df, with_viaf=False):
    """Register every author named in ``df`` in ``entities_dict['authors']``.

    The 'Autor' cell holds one or more names separated by '|'. The n-th name
    (n <= 3) is paired with the 'VIAF autor n' cell of the same row; names
    beyond the third are registered without a VIAF link.

    Side effects:
        Adds one entry per new, non-empty author label to
        ``entities_dict['authors']`` and advances
        ``flow_control['author_last_id']`` once per added entry.

    Args:
        df: DataFrame with the columns 'Autor', 'VIAF autor 1',
            'VIAF autor 2' and 'VIAF autor 3'.
        with_viaf: When true, ``get_viaf_label`` is called for each new author
            that has a VIAF link; otherwise ``viaf_label`` is stored as ``None``.
    """
    viaf_columns = ('VIAF autor 1', 'VIAF autor 2', 'VIAF autor 3')
    # dict.fromkeys de-duplicates like a set but keeps first-seen order, so
    # the ids assigned below are the same on every run.
    author_rows = dict.fromkeys(zip(df['Autor'], *(df[col] for col in viaf_columns)))
    registry = entities_dict['authors']

    for names, *viaf_urls in author_rows:
        for position, raw_name in enumerate(names.split('|')):
            label = raw_name.strip()
            # Skip empty cells and authors already known before doing any
            # (possibly remote) VIAF work for them.
            if not label or label in registry:
                continue

            viaf_url = viaf_urls[position] if position < len(viaf_urls) else ''
            if viaf_url:
                viaf_label = get_viaf_label(viaf_url) if with_viaf else None
                viaf_uri = clear_viaf_uri(viaf_url)
            else:
                viaf_label = None
                viaf_uri = None

            flow_control['author_last_id'] += 1
            registry[label] = {
                    'author_id': str(flow_control['author_last_id']).zfill(8),
                    'viaf_uri': viaf_uri,
                    'viaf_label': viaf_label,
                    'bibframe_type': 'bf:Agent',
                }
            
def preprocess_topics(df):
    """Register the subjects referenced by ``df`` in ``entities_dict['subjects']``.

    Two kinds of subjects are registered:

    * entities: ('byt N', 'zewnętrzny identyfikator bytu N') pairs whose
      identifier is a filmpolski.pl link; they are keyed by that link and
      labelled with ``get_filmpolski_label``. VIAF-identified entities are
      recognised but not resolved yet and are skipped.
    * sections: every distinct, non-empty, stripped 'Sekcja' value, keyed by
      the stripped text (the same form ``preprocess_row`` looks up).

    Side effects:
        Adds entries to ``entities_dict['subjects']`` and advances
        ``flow_control['topic_last_id']`` once per added entry.

    Args:
        df: DataFrame with the columns 'byt 1..3',
            'zewnętrzny identyfikator bytu 1..3' and 'Sekcja'.

    Raises:
        KeyError: If a labelled filmpolski.pl entity has a 'byt N' kind that
            is missing from ``topics_map``.
    """
    topics_map = {
            'czasopismo': 'Work',
            'film': 'MovingImage',
            'instytucja': 'Organization',
            'kraj': 'Place',
            'książka': 'Work',
            'miejscowość': 'Place',
            'osoba': 'Person',
            'spektakl': 'Work',
            'wydarzenie': 'Event',
        }
    subjects = entities_dict['subjects']

    # entities
    entity_pairs = []
    for n in (1, 2, 3):
        entity_pairs.extend(zip(df[f'byt {n}'], df[f'zewnętrzny identyfikator bytu {n}']))
    # Keep complete pairs only; drop repeats (order preserved) so every
    # external page is requested at most once.
    entity_pairs = list(dict.fromkeys(pair for pair in entity_pairs if pair[0] and pair[1]))

    for entity_kind, external_id in tqdm(entity_pairs):
        if external_id in subjects:
            continue  # already registered, no need to fetch the label again
        if external_id.startswith('https://viaf.org/'):
            # VIAF-identified entities are not resolved yet.
            continue
        if not external_id.startswith('https://filmpolski.pl/'):
            continue  # unknown identifier source

        label = get_filmpolski_label(external_id)
        if not label:
            continue
        bibframe_type = 'bf:' + topics_map[entity_kind]

        flow_control['topic_last_id'] += 1
        topic_id = str(flow_control['topic_last_id']).zfill(8)
        subjects[external_id] = {
                'topic_id': topic_id,
                'label': label,
                'uri': uri_base + 'subjects/subject' + topic_id,
                'external_uri': external_id,
                'bibframe_type': bibframe_type,
            }

    # other subjects
    # dict.fromkeys instead of set(): first-seen order keeps ids reproducible.
    for section in dict.fromkeys(df['Sekcja']):
        key = section.strip()
        if not key or key in subjects:
            continue
        flow_control['topic_last_id'] += 1
        subjects[key] = {
                'topic_id': str(flow_control['topic_last_id']).zfill(8),
                'label': key,
                'external_uri': None,
                'bibframe_type': 'bf:Topic',
            }
        
def preprocess_forms(df):
    """Fill ``entities_dict['genreforms']`` with one URI per known form/genre.

    The URI is ``uri_base + 'genreForms/genreform'`` followed by the 1-based
    position of the form in the list below, zero-padded to eight digits.

    Args:
        df: Not read by this function.
    """
    forms = (
        'artykuł',
        'esej',
        'felieton',
        'inne',
        'kalendarium',
        'kult',
        'list',
        'miniatura prozą',
        'nota',
        'opowiadanie',
        'poemat',
        'proza',
        'proza poetycka',
        'recenzja',
        'reportaż',
        'rozmyślanie religijne',
        'scenariusz',
        'słownik',
        'sprostowanie',
        'szkic',
        'teksty dramatyczne',
        'wiersz',
        'wpis blogowy',
        'wspomnienie',
        'wypowiedź',
        'wywiad',
        'zgon',
    )

    entities_dict['genreforms'].update(
        (form, uri_base + 'genreForms/genreform' + f'{position:08d}')
        for position, form in enumerate(forms, start=1)
    )
    
def preprocess_row(idx, row):

    work_id = str(flow_control.get('work_last_id') + 1).zfill(8)
    work = ELB[f'works/{work_id}']
    flow_control['work_last_id'] += 1
    g.add((work, RDF.type, BF.Work))
    
    instance_id = str(flow_control.get('instance_last_id') + 1).zfill(8)
    instance = ELB[f'instances/{instance_id}']
    flow_control['instance_last_id'] += 1
    g.add((work, BF.hasInstance, instance))
    g.add((instance, RDF.type, BF.Instance))
    g.add((instance, BF.instanceOf, work))
    
    item_id = str(flow_control.get('item_last_id') + 1).zfill(8)
    item = ELB[f'items/{item_id}']
    flow_control['item_last_id'] += 1
    g.add((instance, BF.hasItem, item))
    g.add((item, RDF.type, BF.Item))
    g.add((item, BF.itemOf, instance))

    for col, value in row.items():
        # value = value.item()
        match col:
            case 'Link':
                g.add((item, BF.electronicLocator, URIRef(value.strip())))
                
            case 'Data publikacji':
                g.add((instance, BF.originDate, Literal(value.strip())))
                
            case 'Autor':
                for author in value.split('|'):
                    author_dct = entities_dict['authors'].get(author.strip())
                    if author_dct:
                        author_id, viaf_uri, viaf_label = author_dct['author_id'], author_dct['viaf_uri'], author_dct['viaf_label'] # it is possible to use locals().update(author_dct)
                        label = viaf_label if viaf_label else author
                        
                        # create an Agent
                        agent = ELB[f'agents/{author_id}']
                        g.add((agent, RDF.type, BF.Agent))
                        g.add((agent, RDF.type, BF.Person))
                        g.add((agent, RDFS.label, Literal(label)))
                        if viaf_uri:
                            identifier = BNode()
                            g.add((identifier, RDF.type, BF.Identifier))
                            g.add((identifier, RDF.value, Literal(viaf_uri)))
                            g.add((agent, BF.identifiedBy, identifier))
        
                        # add Agent
                        contribution = BNode()
                        g.add((work, BF.contribution, contribution))
                        g.add((contribution, RDF.type, BF.Contribution))
                        g.add((contribution, RDF.type, BF.PrimaryContribution))
                        g.add((contribution, BF.agent, agent))

                        # add role
                        author_role = URIRef('http://id.loc.gov/vocabulary/relators/aut')
                        g.add((contribution, BF.role, author_role))
                        g.add((author_role, RDF.type, BF.Role))
                        g.add((author_role, RDFS.label, Literal('author')))
                
            case 'do PBL':
                pass
            
            case 'VIAF autor 1':
                pass
            
            case 'VIAF autor 2':
                pass
            
            case 'VIAF autor 3':
                pass
            
            case 'Sekcja':
                topic_dct = entities_dict['subjects'].get(value.strip())
                if topic_dct:
                    topic_id = topic_dct['topic_id']
                    topic = ELB[f'subjects/{topic_id}']
                    g.add((work, BF.subject, topic))
                    g.add((topic, RDF.type, BF.Topic))
                    g.add((topic, RDFS.label, Literal(value.strip())))
                
            case 'Tytuł artykułu':                
                title = BNode()
                g.add((work, BF.title, title))
                g.add((title, RDF.type, BF.Title))
                g.add((title, BF.mainTitle, Literal(value.strip())))
                
            case 'Opis':
                summary = BNode()
                g.add((work, BF.summary, summary))
                g.add((summary, RDF.type, BF.Summary))
                g.add((summary, RDFS.label, Literal(value.strip())))
                
            case 'Numer':
                enumeration = BNode()
                g.add((item, BF.enumerationAndChronology, enumeration))
                g.add((enumeration, RDF.type, BF.Enumeration))
                if isinstance(value, float): value = int(value)
                g.add((enumeration, RDFS.label, Literal(str(value).strip())))
            
            case 'Tagi':
                for tag in value.split('|'):
                    topic_dct = entities_dict['subjects'].get(tag.strip())
                    if topic_dct:
                        topic_id = topic_dct['topic_id']
                        topic = ELB[f'subjects/{topic_id}']
                        g.add((work, BF.subject, topic))
                        g.add((topic, RDF.type, BF.Topic))
                        g.add((topic, RDFS.label, Literal(value.strip())))

            case 'forma/gatunek':
                genreform = BNode()
                g.add((work, BF.genreForm, genreform))
                g.add((genreform, RDF.type, BF.GenreForm))
                g.add((genreform, RDFS.label, Literal(value.strip())))
                
            case 'hasła przedmiotowe':
                # matching with lcsh
                pass
            
            case 'zewnętrzny identyfikator bytu 1' | 'zewnętrzny identyfikator bytu 2' | 'zewnętrzny identyfikator bytu 3':
                topic_dct = entities_dict['subjects'].get(value.strip())
                if topic_dct:
                    match topic_dct['bibframe_type']:
                        case 'bf:Work':
                            pass
                        case 'bf:MovingImage':
                            uri = topic_dct['external_uri']
                            movie_work = URIRef(uri)
                            g.add((work, BF.subject, movie_work))
                            g.add((movie_work, RDF.type, BF.Work))
                            g.add((movie_work, RDF.type, BF.MovingImage))
                            title = BNode()
                            g.add((movie_work, BF.title, title))
                            g.add((title, RDF.type, BF.Title))
                            g.add((title, BF.mainTitle, Literal(topic_dct['label'])))
                        case 'bf:Organization':
                            pass
                        case 'bf:Place':
                            pass
                        case 'bf:Person':
                            pass
                        case 'bf:Event':
                            pass
            
            case 'byt 1':
                pass

            case 'byt 2':
                pass
            
            case 'byt 3':
                pass
            
            case 'adnotacje':
                pass
            
            case 'Linki zewnętrzne':
                pass
            
            case 'Linki do zdjęć':
                pass

            case _:
                pass


def preprocess_df(df):
    """Placeholder with no implementation yet: ignores ``df`` and returns ``None``."""

def save_graph():
    """Placeholder with no implementation yet: does nothing and returns ``None``."""

In [49]:
# main
# CSV exports to process; only the first entry is loaded below.
input_df_names = ['dwutygodnik_2024-05-06 - Posts.csv']
# load_input_df() fills missing cells with '' and keeps the rows whose 'do PBL' is True.
df = load_input_df(input_df_names[0])
# Two-row slice (positions 8 and 9 of the filtered frame) used for the trial run below.
sample = df.iloc[8:10]

In [74]:
# Fill the lookup tables in entities_dict first: preprocess_row() resolves
# authors and subjects through them.
preprocess_authors(sample)
preprocess_topics(sample)
preprocess_forms(sample)
# Each call mints a new Work/Instance/Item in the module-level graph `g` and
# advances the flow_control counters, so re-running this cell without
# re-running the init cell adds a second set of resources.
for index, row in sample.iterrows():
    preprocess_row(index, row)

100%|██████████| 2/2 [00:00<00:00,  8.65it/s]


In [75]:
# Print the whole graph serialised with rdflib's "pretty-xml" format for inspection.
print(g.serialize(format="pretty-xml"))

<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF
  xmlns:bf="http://id.loc.gov/ontologies/bibframe/"
  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
>
  <bf:Item rdf:about="http://literarybibliography.eu/items/00000002">
    <bf:itemOf>
      <bf:Instance rdf:about="http://literarybibliography.eu/instances/00000002">
        <bf:instanceOf rdf:resource="http://literarybibliography.eu/works/00000002"/>
        <bf:hasItem rdf:resource="http://literarybibliography.eu/items/00000002"/>
        <bf:originDate>2024-04-01</bf:originDate>
      </bf:Instance>
    </bf:itemOf>
    <bf:electronicLocator rdf:resource="https://www.dwutygodnik.com/artykul/11208-moliwoci.html"/>
    <bf:enumerationAndChronology>
      <bf:Enumeration rdf:nodeID="N5bd093491b8547afb2694319aa5c0d33">
        <rdfs:label>383</rdfs:label>
      </bf:Enumeration>
    </bf:enumerationAndChronology>
  </bf:Item>
  <bf:Work rdf:about="http://literarybibliography.