In [1]:
import json
import csv
from pathlib import Path
import re
from rdflib import Namespace, Graph, Literal, URIRef, RDF, SKOS, DCTERMS

In [18]:
# TODO add notation eaf datatype

csv_path = Path.cwd() / "data" / "csv" / "schlagworte.csv"


def open_csv(csv_path) -> list:
    csv_data = []
    with open(csv_path, newline="") as f:
        reader = csv.DictReader(f, delimiter="|")
        for row in reader:
            csv_data.append(row)
    return csv_data

# Schlagwort: Definierter Begriff, kann aus einem oder mehreren Wörtern bestehen, kann Deskriptor oder Nicht-Desckriptor sein
# Art: DS = Deskriptor, ND = Nicht-Deskriptor, (Verweis mit s. im Feld "Text"), D = Deskriptor mit Verweis im Feld "Text"
# Feldbegr: Feldbegriff, Kategorisierung der Deskriptoren
# Text: Definition des Deskriptors oder Querverweis mit "s." und "s.a."


def findAllNodes(data):
    for item in data:
        if item["ART"] == "DS":
            # item is main node append it
            node = {
                "schlagwort": item["SCHLAGWORT"],
                "dokument-id": item["DOKUMENT-ID"],
                "text": item["TEXT"],
                "discipline": item["FELDBEGR"]
            }
            root.append(node)

csv_data = open_csv(csv_path)
root = []

findAllNodes(csv_data)
disciplines = set([node['discipline'] for node in root])
print(f"Distict disciplines: {*disciplines,}")
print(f"Number of distinct disciplines: {len(disciplines)}")

Distict disciplines: ('', 'Verkehrserziehung', 'Interkulturelle Bildung', 'Ethik', 'Musik', 'Arbeitslehre', 'Grundschule', 'Gesundheit', 'Fremdsprachen', 'Spiel- und Dokumentarfilm', 'Geschichte', 'Berufliche Bildung', 'Informationstechnische Bildung', 'Pädagogik', 'Politische Bildung', 'Religion', 'Geographie', 'Freizeit', 'Bildende Kunst', 'Biologie', 'Sucht und Prävention', 'Physik', 'Deutsch', 'Chemie', 'Retten, Helfen, Schützen', 'Elementarbereich, Vorschulerziehung', 'Umweltgefährdung, Umweltschutz', 'Wirtschaftskunde', 'Medienpädagogik', 'Leben', 'Sexualerziehung', 'Sport', 'Mathematik')
Number of distinct disciplines: 33


## Append all all Nodes to their respective discipline

In [19]:
def buildListOfDisciplines(data):
    discipline_list = []
    for item in data:
        # second iteration when all mein node exists
        # if item is a Non-Deskriptor find its Deskriptor (DS) and append it to DS as hiddenLabel
        try:
            if item["ART"] == "DS":
                discipline = node["discipline"]

                # if list empty, append the discipline
                if len(discipline_list) == 0:
                    discipline_list.append({discipline: []})
                # check if the discipline is already there, otherwise append
                elif len([d for d in discipline_list if discipline in d.keys()]) == 0:
                    discipline_list.append({discipline: []})
                
                next(d[discipline] for d in discipline_list if discipline in d.keys()).append({
                    "schlagwort": item["SCHLAGWORT"],
                    "art": item["ART"],
                    "dokument-id": item["DOKUMENT-ID"],
                    "text": item["TEXT"],
                    "discipline": item["FELDBEGR"]
                })

            else:
                # find the descriptor
                descriptor = re.split(descriptor_split, item["TEXT"])[-1].strip()

                # find the node it belongs to
                node = next(n for n in root if descriptor in n["schlagwort"])
                if not node:
                    node = next(n for n in root if n["schlagwort"] == descriptor)

                discipline = node["discipline"]
                
                # if list empty, append the discipline
                if len(discipline_list) == 0:
                    discipline_list.append({discipline: []})
                # check if the discipline is already there, otherwise append
                elif len([d for d in discipline_list if discipline in d.keys()]) == 0:
                    discipline_list.append({discipline: []})
                
                next(d[discipline] for d in discipline_list if discipline in d.keys()).append({
                    "schlagwort": item["SCHLAGWORT"],
                    "art": item["ART"],
                    "dokument-id": item["DOKUMENT-ID"],
                    "text": item["TEXT"],
                    "discipline": item["FELDBEGR"]
                    })

        except:
            print(
                f"did not find root: {descriptor} at item: {item['DOKUMENT-ID']}")
    return discipline_list

descriptor_split = re.compile(r"s\.a\.|s\.")


discipline_list = buildListOfDisciplines(csv_data)

## Build csv files for each discipline

In [6]:
csv_out_path = Path.cwd() / "data" / "csv_out"
csv_out_path.mkdir(exist_ok=True, parents=True)

for d in discipline_list:
    discipline = next(iter(d))
    csv_path = Path(csv_out_path / (str(discipline) + ".csv"))
    csv_path.touch(exist_ok=True)
    with open(Path(csv_out_path / (str(discipline) + ".csv")), 'w', newline='') as csvfile:
        fieldnames = ['notation', 'prefLabel', 'art', 'definition']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames,extrasaction='ignore')

        writer.writeheader()
        
        for i, item in enumerate(d[discipline]):
            writer.writerow(item)
        print(f"{discipline} has {i} items")

Religion has 1809 items
Geschichte has 2749 items
Geographie has 8440 items
Deutsch has 970 items
Musik has 1014 items
Verkehrserziehung has 803 items
Physik has 2215 items
Wirtschaftskunde has 1361 items
Medienpädagogik has 478 items
Ethik has 86 items
Leben has 1761 items
Politische Bildung has 1434 items
Gesundheit has 1979 items
Sport has 2675 items
Biologie has 6321 items
Bildende Kunst has 1663 items
Freizeit has 530 items
Chemie has 1267 items
Mathematik has 576 items
Umweltgefährdung, Umweltschutz has 809 items
Arbeitslehre has 2055 items
Pädagogik has 847 items
Spiel- und Dokumentarfilm has 432 items
Grundschule has 32 items
Informationstechnische Bildung has 841 items
Sucht und Prävention has 160 items
Berufliche Bildung has 1332 items
Fremdsprachen has 606 items
Sexualerziehung has 171 items
Interkulturelle Bildung has 260 items
Retten, Helfen, Schützen has 180 items
Elementarbereich, Vorschulerziehung has 41 items
 has 0 items


## check gnd or wikidata for entry

In [None]:
def appendToRootNodes(data):
    for item in data:
        # second iteration when all mein node exists
        # if item is a Non-Deskriptor find its Deskriptor (DS) and append it to DS as hiddenLabel
        try:
            if item["ART"] == "ND":
                # find the descriptor
                descriptor = re.split(descriptor_split, item["TEXT"])[-1].strip()

                node = next(n for n in root if descriptor in n["prefLabel"]["de"])
                if not node:
                    node = next(n for n in root if n["prefLabel"]["de"] == descriptor)

                node.setdefault("hiddenLabel", []).append({"de": item["SCHLAGWORT"]})

            elif item["ART"] == "D":
                descriptor = re.split(descriptor_split, item["TEXT"])[-1].strip()

                node = next(n for n in root if descriptor in n["prefLabel"]["de"])
                if not node:
                    node = next(n for n in root if n["prefLabel"]["de"] == descriptor)
                node.setdefault("altLabel", []).append({"de": item["SCHLAGWORT"]})
        except:
            print(
                f"did not find root: {descriptor} at item: {item['DOKUMENT-ID']}")


descriptor_split = re.compile(r"s\.a\.|s\.")

appendToRootNodes(csv_data)

with open('outputfile.json', 'w') as fout:
    json.dump(root, fout)

In [3]:
import json
with open('outputfile.json') as js_file:
    data = json.load(js_file)

def buildDisciplineNodes(nodes):
    disciplineNodes = set(item["discipline"]["de"] for item in data)
    return disciplineNodes

disciplines = buildDisciplineNodes(data)

In [4]:
root_disciplines = {}
for item in data:
    root_disciplines.setdefault(item["discipline"]["de"], []).append(item)

In [4]:
from pprint import pprint
pprint(root_disciplines.keys())

dict_keys(['Geographie', 'Bildende Kunst', 'Religion', 'Biologie', 'Sport', 'Physik', 'Verkehrserziehung', 'Spiel- und Dokumentarfilm', 'Deutsch', 'Grundschule', 'Umweltgefährdung, Umweltschutz', 'Politische Bildung', 'Informationstechnische Bildung', 'Musik', 'Interkulturelle Bildung', 'Geschichte', 'Pädagogik', 'Gesundheit', 'Mathematik', 'Wirtschaftskunde', 'Fremdsprachen', 'Leben', 'Chemie', 'Berufliche Bildung', 'Arbeitslehre', 'Medienpädagogik', 'Sucht und Prävention', 'Ethik', 'Freizeit', 'Elementarbereich, Vorschulerziehung', 'Retten, Helfen, Schützen', 'Sexualerziehung'])


In [60]:
systematics_name = "eaf-schlagwortsystematik-all"
g = Graph()
namespace = Namespace("http://w3id.org/openeduhub/vocabs/" + systematics_name + "/")
base_url = URIRef(namespace)

g.add(( base_url, RDF.type, SKOS.ConceptScheme))
g.add(( base_url, DCTERMS.Title, Literal(systematics_name, lang="de")))


for discipline in root_disciplines.keys():
    discipline_url = URIRef(namespace + str(discipline.lower().replace(" ", "-")))
    g.add( (discipline_url, RDF.type, SKOS.Concept) )
    g.add( (discipline_url, SKOS.prefLabel, Literal(str(discipline.lower()), lang="de" ) ) )

    # add topConceptOf
    g.add(( discipline_url, SKOS.topConceptOf, base_url) )
    g.add(( base_url, SKOS.hasTopConcept, discipline_url) )
    
    for item in root_disciplines[discipline]:
        item_url = URIRef(namespace + str(item["notation"]["de"].lower().replace(" ", "-")))

        # add item to discipline narrower
        g.add((discipline_url, SKOS.narrower, item_url))
        # and add discipline narrower to item broader
        g.add((item_url, SKOS.broader, discipline_url))
        
        g.add(( item_url, SKOS.inScheme, base_url))
        

        g.add( (item_url, RDF.type, SKOS.Concept) )
        g.add( (item_url, SKOS.prefLabel, Literal(item["prefLabel"]["de"], lang="de")) )
        
        if "altLabel" in item:
            for label in item["altLabel"]:
                g.add( (item_url, SKOS.altLabel, Literal(label["de"], lang="de") ) )
        if "hiddenLabel" in item:
            for label in item["hiddenLabel"]:
                g.add( (item_url, SKOS.hiddenLabel, Literal(label["de"], lang="de") ) )

g.bind("skos", SKOS)
g.bind("dct", DCTERMS)
g.bind("eaf", base_url)

output = g.serialize(format='turtle', base=base_url).decode("utf-8")

In [61]:
with open("eaf-graph-all.ttl", "w") as f:
    f.write(output)