Starting from the LAM mapping Excel file, generate the SRC-AP representation for the LAM project, suitable to be loaded into VocBench3.

In [12]:
%load_ext autoreload
%autoreload

import rdflib as rdf
from rdflib.namespace import RDF, RDFS, SKOS, DCTERMS, FOAF, OWL, XMLNS, XSD
import pathlib
import pandas as pd
import re 
from lam_utils import *
# from pprint import pprint as print
import warnings
import uuid

# Excel workbook holding the LAM mapping; the path is relative to the working directory.
INPUT_FILE = pathlib.Path("LAM-mapping-input.xlsx")

# Short alias for the Dublin Core terms namespace imported from rdflib.
DCT = DCTERMS
# Namespaces that are not imported from rdflib.namespace above and are therefore declared here.
EUVOC = rdf.Namespace("http://publications.europa.eu/ontology/euvoc#")
SKOSXL = rdf.Namespace("http://www.w3.org/2008/05/skos-xl#")
SHACL = rdf.Namespace("http://www.w3.org/ns/shacl#")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Read Excel

In [18]:
# Load the first four sheets of the workbook, one data frame per sheet.
# header=[0,1,2] makes pandas read the first three rows as a three-level column header
# (a MultiIndex), which is the layout the triple makers below expect.
# In the code shown here only the description and constraints frames are used later on.
lam_df_description = pd.read_excel(INPUT_FILE, sheet_name=0 , header = [0,1,2], )
lam_df_constraints = pd.read_excel(INPUT_FILE, sheet_name=1 ,header = [0,1,2], )
lam_df_constraints_with_annotations = pd.read_excel(INPUT_FILE, sheet_name=2 ,header = [0,1,2], )
lam_df_constraints_unclear = pd.read_excel(INPUT_FILE, sheet_name=3 ,header = [0,1,2], )

# Defining the generative procedures

In [28]:
from abc import ABC, abstractmethod

class ColumnTripleMaker(ABC):
    """
        Create triples in a controlled manner for a specified column.

        Given a target column name present in a descriptive dataframe, generate all the RDF triples for the column.
        This functionality is similar to what Sheet2RDF does, and in addition it allows for multi-level columns,
        which permit the specification of additional parameters.

        Current convention for the multi level index is as follows:
            level 0 is the user friendly column name
            level 1 is the qualified name of the RDF property to be used (e.g. skos:prefLabel@en)
            level 2 is the URI indicating the controlled list of values (property range). This column can also be used as
                    a base URI prefix for column values. (currently unimplemented/unused)

        The rows represent descriptions of LAM concepts. The values in the URI column (usually "ID") represent
        the concept URIs and function as subjects. The values from the rest of the columns represent
        property values that function as objects. The subjects and objects are connected by the predicate
        specified at level 1 of the index.

        Subclasses decide, in make_cell_triples, which triples are produced for a single cell.

        Constructor arguments:
            df: data frame object with a three-level column index
            uri_column: the column with subject uris, usually the first column
            graph: the RDF graph where the triples should be added, also the source of namespace
                   records needed for qname resolution
    """
    def __init__(self,df,uri_column, graph):
        self.df = df
        self.graph = graph
        self.uri_column = uri_column

    def make_column_triples(self, target_column, uri_value=False, error_bad_lines=True):
        """
            For a given target_column, iterate uri and target values and create triples for them.

            target_column: the column (level 0 name) from which triples should be created
            uri_value: indicates whether the column values are expected to be URIs or literals;
                       ignored when the predicate header carries a language tag
            error_bad_lines: if true, the first erroneous row raises and nothing is added to the graph;
                             else erroneous rows are reported as warnings and skipped

            Returns None; the triples are added to self.graph.
            Raises Exception when the column cannot be accessed, or when a row fails and
            error_bad_lines is true. The original error is attached as the cause.
        """
        result_triples = []
        self.target_column = target_column
        try:
            name, pred, controled_list_uri = multiindex_colum_header(self.df.columns, target_column)
            # pandas names empty header cells "Unnamed: ..."; treat those as "no controlled list"
            self.controled_list_uri = None if "Unnamed:" in controled_list_uri else controled_list_uri
        except Exception as err:
            raise Exception(f"Could not access {target_column} column.") from err

        self.predicate = qname_uri(pred, self.graph.namespaces())
        self.language = qname_lang(pred)

        # create triples for each value in the column
        subject_values = multiindex_colum_values(self.df, self.uri_column)
        object_values = multiindex_colum_values(self.df, target_column)
        for quri, obj in zip(subject_values, object_values):
            # skip empty cells
            if pd.isna(obj):
                continue
            try:
                subject = qname_uri(quri, self.graph.namespaces())
                oobject = self._make_object(obj, uri_value)
                # if everything went well so far, record the triples
                result_triples.extend(self.make_cell_triples(subject, self.predicate, oobject))
            except Exception as err:
                if error_bad_lines:
                    raise Exception(f"Could not create triples for the column {target_column}. There is an error ar the row {quri}.") from err
                warnings.warn(f"There is an error ar the row {quri} column {name}. The value {obj} was skipped.")

        # Add the triples only once, after every row has been processed. This avoids re-adding
        # the accumulated list on each row and guarantees that a failure with
        # error_bad_lines=True leaves the graph untouched.
        for triple in result_triples:
            self.graph.add(triple)

    def _make_object(self, obj, uri_value):
        """
            Build the RDF object node for a single cell value.

            A language tag on the predicate header wins and yields a language-tagged literal.
            Otherwise the value becomes a plain literal, or a URI when uri_value is true
            (first resolved as a qualified name, then taken as a full URI).
        """
        if self.language:
            return rdf.Literal(obj, lang=self.language)
        if not uri_value:
            return rdf.Literal(obj)
        try:
            return qname_uri(obj, self.graph.namespaces())
        except Exception:
            # not a resolvable qualified name; fall back to using the value as a full URI
            try:
                return rdf.URIRef(obj)
            except Exception as err:
                raise Exception(f"Cannot create URI from {obj}.") from err

    @abstractmethod
    def make_cell_triples(self, subject, predicate, oobject, controled_list_uri=None):
        """
            For a given subject, predicate and object create the triples for the RDF graph.
            The triple can be simple or reified.

            controled_list_uri is optional: make_column_triples calls this method with
            (subject, predicate, oobject) only, and that is the signature the subclasses implement.

            @return a list of triple tuples
        """
        pass


class SimpleTripleMaker(ColumnTripleMaker):
    """
        Triple maker that emits exactly one plain statement per cell: (s, p, o).
    """
    def make_cell_triples(self, subject, predicate, oobject):
        # a one-element list holding the direct subject-predicate-object tuple
        plain_triple = (subject, predicate, oobject)
        return [plain_triple]
        
    
class ReifiedTripleMaker(ColumnTripleMaker):
    """
        Triple maker that puts an intermediate node between subject and value:
            (s, p, node),
            (node, rdf:type, reification_class),
            (node, reification_property, o),

        reification_class and reification_property are given as qualified names and are
        resolved against the namespaces of the graph each time a cell is processed.
    """
    def __init__(self, df, uri_column, graph, reification_class="skosxl:Label", reification_property="skos:literalForm",):
        super().__init__(df, uri_column, graph)
        self.reification_class = reification_class
        self.reification_property = reification_property

    def make_cell_triples(self, subject, predicate, oobject, ):
        node_type = qname_uri(self.reification_class, self.graph.namespaces())
        value_property = qname_uri(self.reification_property, self.graph.namespaces())
        # the intermediate node gets a fresh random UUID as local name under the default (empty) prefix
        node = qname_uri(f":{uuid.uuid4()}", self.graph.namespaces())
        link = (subject, predicate, node)
        typing = (node, RDF.type, node_type)
        value = (node, value_property, oobject)
        return [link, typing, value]

    
class SimpleConstraintTripleMaker(ColumnTripleMaker):
    """
        Treat the row URI as a class and turn each non-empty cell into a SHACL property shape.

        Convention for the multi level column index:
            - level 0: the user friendly column name, used in sh:name and sh:description
            - level 1: the qualified name used as sh:path of the property shape
            - level 2: the URI of the controlled list of values (property range);
                       no sh:class triple is emitted for it by this implementation

        Interpretation of the cell value:
            - empty cell: skipped, no constraint is created
            - "y" / "yes" (any letter case): sh:minCount 1
            - "n" / "no" (any letter case): sh:maxCount 0
            - any value: quoted in sh:description

        Triples produced for a cell:
            (s, sh:property, shape),
            (shape, rdf:type, sh:PropertyShape),
            (shape, sh:path, p),
            (shape, sh:name, "<column> on <p as qname>"),
            (shape, sh:description, text quoting the cell value),
            (shape, rdfs:comment, fixed pointer to the documentation),
            plus the optional sh:minCount / sh:maxCount triple described above
    """
    def make_cell_triples(self, subject, predicate, oobject, ):
        # nothing to constrain when the cell carries no value
        if not oobject:
            return []

        # the shape node gets a fresh random UUID as local name under the default (empty) prefix
        shape = qname_uri(f":{uuid.uuid4()}", self.graph.namespaces())
        path_qname = str(self.graph.namespace_manager.qname(predicate))

        shape_name = rdf.Literal(f"{self.target_column} on {path_qname}")
        shape_description = rdf.Literal(f"The {self.target_column} column on {path_qname} is '{oobject}'")
        shape_comment = rdf.Literal(
            "For full documentation prease refer to "
            "LAM-SRC-AP (recommendations for LAM project) "
            "and SHACL (language specification) documentation"
        )

        triples = [
            (subject, SHACL.property, shape),
            (shape, RDF.type, SHACL.PropertyShape),
            (shape, SHACL.path, predicate),
            (shape, SHACL.name, shape_name),
            (shape, SHACL.description, shape_description),
            (shape, RDFS.comment, shape_comment),
        ]

        # map the yes/no answers onto cardinality bounds
        answer = str.lower(oobject)
        if answer in ["y", "yes"]:
            triples.append((shape, SHACL.minCount, rdf.Literal(1)))
        if answer in ["n", "no"]:
            triples.append((shape, SHACL.maxCount, rdf.Literal(0)))

        return triples
    


# LAM project settings and variables

In [7]:
# Project namespaces: the CDM ontology and the LAM authority table.
# LAM is bound below both as the default (empty) prefix and as "lam".
CDM = rdf.Namespace("http://publications.europa.eu/ontology/cdm#")
LAM = rdf.Namespace("http://publications.europa.eu/resources/authority/lam/")

def make_lam_graph():
    """
        Build an empty rdflib graph with all prefixes used by the LAM project bound.

        @return the new graph
    """
    # (prefix, namespace) pairs, bound in exactly this order
    prefix_bindings = (
        ("", LAM),
        ("lam", LAM),
        ("cdm", CDM),
        ("euvoc", EUVOC),
        ("skos", SKOS),
        ("skosxl", SKOSXL),
        ("dct", DCTERMS),
        ("sh", SHACL),
        ("rdf", RDF),
        ("rdfs", RDFS),
        ("xsd", XSD),
    )

    graph = rdf.Graph()
    for prefix, namespace in prefix_bindings:
        graph.bind(prefix, namespace)
    return graph

# module-level graph instance with the project prefixes bound
# (a fresh graph is built again right before generation further below)
lam_graph = make_lam_graph()
# URI of the project concept scheme; note it has no trailing slash, unlike the LAM namespace
LAM_CS = rdf.URIRef("http://publications.europa.eu/resources/authority/lam")
# level 0 name of the column that holds the subject (concept) identifiers
URI_COLUMN = 'ID'

# all descriptive columns
# DESCRIPTIVE_COLUMNS_ALL = ['CDM_class', 'CDM_class_comment', 'DTS', 'DTT', 'Resource type',
#                            'OJ type', 'Keywords', 'Example EN', 'Example FR', 'Additional comment',
#                            'Author', 'Examples from EUR-Lex']
# all descriptive columns which have URI values
# DESCRIPTIVE_COLUMNS_URIS = ['CDM_class']
# all descriptive columns which have literal values
# DESCRIPTIVE_COLUMNS_LITERALS = [x for x in DESCRIPTIVE_COLUMNS_ALL if x not in DESCRIPTIVE_COLUMNS_URIS]

# Creating the project

In [38]:

def create_cs(graph = None):
    """
        Create the concept scheme definition.

        Adds the rdf:type and skos:prefLabel statements for LAM_CS to the graph.

        graph: the graph to add to; when omitted, the module-level lam_graph is used.
               It is looked up at call time, so a lam_graph rebuilt after this function
               was defined is still the one that gets written to.
    """
    if graph is None:
        graph = lam_graph
    graph.add( (LAM_CS, RDF.type, SKOS.ConceptScheme) )
    # NOTE(review): the label text is misspelled ("Analisys"); left unchanged here because it is emitted data
    graph.add( (LAM_CS, SKOS.prefLabel, rdf.Literal("Legal Analisys Methodology") ) )
        
def create_concepts(df, graph):
    """
        For each row declare a SKOS concept and create triples for all descriptive columns.

        df: the descriptive data frame with a three-level column index
        graph: the graph that receives the triples and supplies the namespaces for qname resolution
    """
    # declare every row as a concept in the LAM concept scheme
    for _, row in df.iterrows():
        # row[URI_COLUMN] is a Series (the remaining header levels of the ID column);
        # take its first value by position explicitly, since integer keys on a
        # non-integer index are deprecated as positional access in pandas 2.x
        subject = qname_uri(row[URI_COLUMN].iloc[0], graph.namespaces())

        graph.add( (subject, RDF.type, SKOS.Concept) )
        graph.add( (subject, SKOS.inScheme, LAM_CS) )

    # plain (s, p, o) triples; the same maker serves URI-valued and literal-valued columns
    simple_maker = SimpleTripleMaker(df, uri_column=URI_COLUMN, graph=graph)

    # add descriptive triples with URI values for each column
    for column_name in ["CDM_class"]:
        simple_maker.make_column_triples(column_name, uri_value=True)

    # add descriptive triples with literal values for each column
    for column_name in ['DTS', 'DTT', 'Resource type', 'OJ type']:
        simple_maker.make_column_triples(column_name, uri_value=False)

    # add reified labels for each column
    reified_label_maker = ReifiedTripleMaker(df, uri_column=URI_COLUMN, graph=graph,
                                             reification_class="skosxl:Label",
                                             reification_property="skos:literalForm")
    for column_name in ["Keywords"]:
        reified_label_maker.make_column_triples(column_name)

    # add reified notes for each column
    reified_note_maker = ReifiedTripleMaker(df, uri_column=URI_COLUMN, graph=graph,
                                            reification_class="euvoc:XlNote",
                                            reification_property="rdf:value")
    for column_name in ["CDM_class_comment", "Example EN", "Example FR", "Additional comment", "Examples from EUR-Lex"]:
        reified_note_maker.make_column_triples(column_name)

        
def create_constraints(df, graph ):
    """
        Generate the SHACL property constraints: one pass over the rows per constraint column.

        df: the constraints data frame with a three-level column index
        graph: the graph that receives the triples
    """
    shape_maker = SimpleConstraintTripleMaker(df, uri_column=URI_COLUMN, graph=graph)

    # level 0 names of the columns that carry constraints, processed in this order
    for column_name in (
        'EuroVoc',
        'Subject Matter',
        'Directory Code',
        'EU Case law directory code (before 2009)',
        'EU Case law directory code',
        'NF', 'VO', 'DB', 'LO', 'VV', 'REP', 'RS',
        'AS', 'MI',
        'LG (and subproperties)',
        'RI',
        'DP (and subproperties)',
        'LF', 'IC', 'CM', 'NS', 'TT', 'LB', 'CI', 'JR', 'AG', 'NC', 'NO',
    ):
        shape_maker.make_column_triples(column_name)
        
# display(lam_df_constraints.columns.get_level_values(0))
        
# generation: start from a fresh graph so that re-running this cell does not accumulate triples
lam_graph = make_lam_graph()

# concept scheme first, then the concepts from the description sheet,
# then the SHACL constraints from the constraints sheet
create_cs(lam_graph)
create_concepts(df=lam_df_description, graph = lam_graph)
create_constraints(df=lam_df_constraints, graph = lam_graph)

# write the result as Turtle into the working directory
lam_graph.serialize("./lam_project.ttl",format='turtle',)