In [None]:
import json
import csv
from pathlib import Path
import re
from rdflib import Namespace, Graph, Literal, URIRef, RDF, SKOS, DCTERMS

In [None]:
# Location of the pipe-separated keyword export, resolved against the current working directory.
csv_path = Path.cwd().joinpath("data", "csv", "schlagworte.csv")


def open_csv(csv_path, delimiter="|", encoding=None) -> list:
    """
    Read a delimited text file into a list of row dicts.

    Args:
        csv_path: Path of the file to read.
        delimiter: Column separator used in the file; defaults to "|".
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform's default encoding.

    Returns:
        One dict per data row, keyed by the column names of the header row.
    """
    # newline="" leaves line-ending handling to the csv module, as its docs recommend
    with open(csv_path, newline="", encoding=encoding) as f:
        return list(csv.DictReader(f, delimiter=delimiter))

# Schlagwort: Definierter Begriff, kann aus einem oder mehreren Wörtern bestehen, kann Deskriptor oder Nicht-Deskriptor sein
# Art: DS = Deskriptor, ND = Nicht-Deskriptor (Verweis mit "s." im Feld "Text"), D = Deskriptor mit Verweis im Feld "Text"
# Feldbegr: Feldbegriff, Kategorisierung der Deskriptoren
# Text: Definition des Deskriptors oder Querverweis mit "s." und "s.a."


def findAllNodes(data):
    """
    Collect the main nodes of the keyword table.

    A row counts as a main node when its "ART" is exactly "DS" and its
    "FELDBEGR" (discipline) is not empty. Each match is returned as a new
    dict with the keys "schlagwort", "dokument-id", "text" and "discipline".
    """
    return [
        {
            "schlagwort": row["SCHLAGWORT"],
            "dokument-id": row["DOKUMENT-ID"],
            "text": row["TEXT"],
            "discipline": row["FELDBEGR"],
        }
        for row in data
        # descriptors without a discipline are left out
        if row["ART"] == "DS" and row["FELDBEGR"] != ""
    ]

# Load the keyword table and collect the disciplines that occur among the main ("DS") nodes.
csv_data = open_csv(csv_path)
root = findAllNodes(csv_data)
disciplines = {node["discipline"] for node in root}

# Sort before printing: set iteration order for strings differs between kernel runs,
# which would make this cell's output change on every Restart & Run All.
print(f"Distinct disciplines: {*sorted(disciplines),}")
print(f"Number of distinct disciplines: {len(disciplines)}")

## Append all nodes to their respective discipline

In [None]:
def parseTree(item, data) -> "dict | None":
    """
    Resolve the "DS" row that a cross-referencing ``item`` points to.

    The referenced keyword is whatever follows the last "s." / "s.a." marker in
    the item's "TEXT" column. Rows whose stripped "SCHLAGWORT" equals that
    keyword are looked up in ``data``; if there is no exact match, rows whose
    "SCHLAGWORT" merely contains the keyword are used instead. When none of the
    matched rows is a "DS" row, their own references are followed in turn until
    a "DS" row is found.

    Args:
        item: Row dict with at least a "TEXT" key.
        data: All row dicts; each needs "SCHLAGWORT", "ART" and "TEXT".

    Returns:
        The first "DS" row reached, or ``None`` (after printing a short
        diagnostic) when the reference chain never reaches one.
    """
    # split after s. or s.a. in TEXT column
    descriptor_split = re.compile(r"s\.a\.|s\.")

    def referencedKeyword(node):
        # the text after the last marker is taken as the referenced keyword
        return descriptor_split.split(node["TEXT"])[-1].strip()

    def findCandidates(descriptor):
        exact = [node for node in data if node["SCHLAGWORT"].strip() == descriptor]
        if exact:
            return exact
        # fall back to the less strict "keyword appears inside SCHLAGWORT" match
        # NOTE(review): an empty descriptor is a substring of every keyword, so a
        # row with an empty TEXT matches all rows here.
        return [node for node in data if descriptor in node["SCHLAGWORT"].strip()]

    # look for the descriptor in data
    def findDescriptor(item, data):
        pending = [item]
        # rows are dicts (unhashable), so identity is used to detect reference cycles
        visited = {id(item)}
        while pending:
            current = pending.pop(0)
            candidates = findCandidates(referencedKeyword(current))
            # ART is stripped so padded values are recognised as well
            target_node = next(
                (node for node in candidates if node["ART"].strip() == "DS"), None
            )
            if target_node is not None:
                return target_node
            # no DS row among the matches: follow their own references
            for node in candidates:
                if id(node) not in visited:
                    visited.add(id(node))
                    pending.append(node)
        return None

    ds_node = findDescriptor(item, data)
    if ds_node is None:
        print(f"item :{item}")
        print(f"descriptor: {referencedKeyword(item)}")
        print("no DS descriptor found")
    return ds_node
    

def buildListOfDisciplines(data) -> list:
    """
    Group all rows by discipline.

    Rows whose stripped "ART" is "DS" are filed under their own "FELDBEGR"
    (rows with an empty "FELDBEGR" are skipped) and receive a "discipline" key.
    Every other row is resolved to its "DS" row via ``parseTree``, receives that
    row's "DOKUMENT-ID" under the key "broader", and is filed under that row's
    "FELDBEGR". Rows for which ``parseTree`` finds no "DS" row are reported and
    left out instead of aborting the whole run.

    The row dicts in ``data`` are modified in place; the returned structure
    references the same dict objects.

    Args:
        data: Row dicts with "ART", "FELDBEGR" and "DOKUMENT-ID" keys, plus the
            keys ``parseTree`` needs.

    Returns:
        A list of single-key dicts ``{discipline: [rows...]}``, in order of first
        appearance of each discipline.
    """
    discipline_list = []
    # discipline name -> the row list stored inside discipline_list, for direct lookup
    rows_by_discipline = {}

    def rowsFor(discipline):
        if discipline not in rows_by_discipline:
            rows_by_discipline[discipline] = []
            discipline_list.append({discipline: rows_by_discipline[discipline]})
        return rows_by_discipline[discipline]

    for item in data:
        if item["ART"].strip() == "DS":
            discipline = item["FELDBEGR"]
            if discipline == "":
                continue

            item["discipline"] = discipline
            rowsFor(discipline).append(item)

        # row is not a main descriptor: attach it to the DS row it refers to
        else:
            parent_node = parseTree(item, data)
            if parent_node is None:
                print(f"no DS descriptor found for {item.get('SCHLAGWORT')!r}, skipping")
                continue

            item["broader"] = parent_node["DOKUMENT-ID"]
            rowsFor(parent_node["FELDBEGR"]).append(item)

    return discipline_list

# Group all rows of csv_data by discipline. The row dicts in csv_data are modified in place:
# DS rows gain a "discipline" key, all other rows gain a "broader" key.
discipline_list = buildListOfDisciplines(csv_data)

## Build csv files for each discipline

In [None]:
def buildCSV(data, csv_out_path=None):
    """
    Write one CSV file per discipline.

    Args:
        data: List of single-key dicts ``{discipline: [row dicts...]}``.
        csv_out_path: Directory to write into; created if missing. Defaults to
            ``<cwd>/data/csv_out``.

    Each file is named ``<discipline>.csv`` and contains the columns listed in
    ``fieldnames``; keys of a row that are not listed are ignored, listed
    columns a row lacks are left empty. Prints the number of rows per
    discipline and the overall total.
    """
    if csv_out_path is None:
        csv_out_path = Path.cwd() / "data" / "csv_out"
    csv_out_path = Path(csv_out_path)
    csv_out_path.mkdir(exist_ok=True, parents=True)

    fieldnames = ['DOKUMENT-ID', 'SCHLAGWORT', 'ART', 'TEXT', 'discipline', 'narrower', 'broader']
    counter = 0

    for d in data:
        discipline = next(iter(d))
        items = d[discipline]
        out_file = csv_out_path / (str(discipline) + ".csv")

        # mode 'w' creates the file itself, so no separate touch() is needed
        with open(out_file, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(items)

        counter += len(items)
        # report the real row count (the last enumerate index is one too small
        # and does not exist at all for an empty discipline)
        print(f"{discipline} has {len(items)} items")
    print(counter)
# Write one CSV file per discipline collected in discipline_list.
buildCSV(discipline_list)

## build skos-model

In [None]:
def buildGraph(data, only_disciplines=("Mathematik", "Physik")):
    """
    Build a SKOS concept scheme from the rows grouped by discipline and write it
    as Turtle to ``<cwd>/data/eaf-graph-by-subject.ttl``.

    Every discipline becomes a top concept of the scheme. Every row becomes a
    concept identified by its "DOKUMENT-ID" with its "SCHLAGWORT" as German
    prefLabel. Rows whose "ART" is "DS" are linked to their discipline via
    broader/narrower; rows carrying a "broader" key are linked to the concept
    with that id.

    Args:
        data: List of single-key dicts ``{discipline: [row dicts...]}``.
        only_disciplines: Collection of discipline names to include. The default
            keeps the restriction to "Mathematik" and "Physik" that is marked as
            a testing TODO; pass ``None`` to include every discipline.
    """
    systematics_name = "eaf-schlagwortverzeichnis-all"
    g = Graph()
    namespace = Namespace("http://w3id.org/openeduhub/vocabs/" + systematics_name + "/")
    base_url = URIRef(namespace)

    g.add((base_url, RDF.type, SKOS.ConceptScheme))
    g.add((base_url, DCTERMS.title, Literal(systematics_name, lang="de")))

    def addItem(item):
        """Add one row as a concept and return its URL."""
        item_url = URIRef(namespace + item["DOKUMENT-ID"])

        g.add((item_url, RDF.type, SKOS.Concept))
        g.add((item_url, SKOS.prefLabel, Literal(item["SCHLAGWORT"], lang="de")))
        g.add((item_url, SKOS.inScheme, base_url))

        if "broader" in item:
            # "broader" holds the id of the concept this row belongs under
            broader_url = URIRef(namespace + item["broader"])
            g.add((item_url, SKOS.broader, broader_url))
            g.add((broader_url, SKOS.narrower, item_url))

        return item_url

    for subject in data:
        discipline = next(iter(subject))
        # TODO for testing: the default of only_disciplines limits the graph to two disciplines
        if only_disciplines is not None and str(discipline) not in only_disciplines:
            continue
        print(discipline)
        # END TODO

        discipline_url = URIRef(namespace + str(discipline.lower().replace(" ", "-")))
        g.add((discipline_url, RDF.type, SKOS.Concept))
        g.add((discipline_url, SKOS.prefLabel, Literal(str(discipline), lang="de")))

        # add topConceptOf
        g.add((discipline_url, SKOS.topConceptOf, base_url))
        g.add((base_url, SKOS.hasTopConcept, discipline_url))

        for item in subject[discipline]:
            item_url = addItem(item)

            # main descriptors hang directly below their discipline
            if item["ART"] == "DS":
                g.add((discipline_url, SKOS.narrower, item_url))
                g.add((item_url, SKOS.broader, discipline_url))

    g.bind("skos", SKOS)
    g.bind("dct", DCTERMS)
    g.bind("eaf", base_url)

    output = g.serialize(format='turtle', base=base_url)
    # older rdflib releases return bytes here, newer ones return str
    if isinstance(output, bytes):
        output = output.decode("utf-8")

    data_path = Path.cwd() / "data"
    data_path.mkdir(exist_ok=True)
    # write as UTF-8 explicitly so non-ASCII labels do not depend on the platform default
    with open(data_path / "eaf-graph-by-subject.ttl", "w", encoding="utf-8") as f:
        f.write(output)

# Build the SKOS graph from discipline_list and write it to data/eaf-graph-by-subject.ttl.
buildGraph(discipline_list)

## TODO: check GND or Wikidata for a matching entry?

In [None]:
import json
# Load the node export. JSON text is UTF-8 by specification, so the encoding is given
# explicitly instead of relying on the platform default.
with open('outputfile.json', encoding="utf-8") as js_file:
    data = json.load(js_file)

def buildDisciplineNodes(nodes):
    """
    Return the set of distinct German discipline labels found in ``nodes``.

    Args:
        nodes: Iterable of dicts, each with a "discipline" dict that has a "de" entry.

    Returns:
        A set of the ``item["discipline"]["de"]`` values.
    """
    # iterate over the argument, not over a module-level variable
    disciplineNodes = {item["discipline"]["de"] for item in nodes}
    return disciplineNodes

# NOTE: this rebinds `disciplines`, which an earlier cell already set from the CSV data.
disciplines = buildDisciplineNodes(data)

In [None]:
# NOTE(review): `output` is not assigned at module level anywhere in the visible notebook
# (buildGraph only uses a local variable of that name), so on a fresh kernel this cell
# would raise NameError — confirm where `output` is supposed to come from.
with open("eaf-graph-all.ttl", "w") as f:
    f.write(output)