Starting from LAM mapping Excel file generate the SRC-AP representation for the LAM project suitable to be laded into VocBench3

In [26]:
%load_ext autoreload
%autoreload

import rdflib as rdf
from rdflib.namespace import RDF, RDFS, SKOS, DCTERMS, FOAF, OWL, XMLNS, XSD
import pathlib
import pandas as pd
import re 
from lam_utils import *
# from pprint import pprint as print
import warnings

INPUT_FILE = pathlib.Path("LAM-mapping-input.xlsx")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Read Excel

In [2]:
lam_df_description = pd.read_excel(INPUT_FILE, sheet_name=0 , header = [0,1,2], )
lam_df_constraints = pd.read_excel(INPUT_FILE, sheet_name=1 ,header = [0,1,2], )

# Setup RDF graph

In [36]:
def lam_graph_instance():
    
    lam_graph = rdf.Graph()

    CDM = rdf.Namespace("http://publications.europa.eu/ontology/cdm#")
    LAM = rdf.Namespace("http://publications.europa.eu/resources/authority/lam/")
    DCT = DCTERMS
    EUVOC = rdf.Namespace("http://publications.europa.eu/ontology/euvoc#")
    SKOSXL = rdf.Namespace("http://www.w3.org/2008/05/skos-xl#")

    lam_graph.bind("", LAM)
    lam_graph.bind("lam", LAM)
    
    lam_graph.bind("cdm", CDM)
    lam_graph.bind("lam", EUVOC)
        
    lam_graph.bind("skos", SKOS)
    lam_graph.bind("lam", SKOSXL)
    lam_graph.bind("dct", DCTERMS)

    lam_graph.bind("rdf", RDF)
    lam_graph.bind("rdfs", RDFS)
    lam_graph.bind("xsd", XSD)
    
    return lam_graph

lam_graph = lam_graph_instance()
# NAMESPACES = lam_graph.namespaces()

# Defining the generative procedires

In [32]:
from abc import ABC, abstractmethod

class ColumnTripleMaker(ABC):
    """
        Create triples in a controlled manned  for a specified column.
        
        Given a target column name present in a descriptive dataframe generate all the RDF triples for the column.
        This functionality is similar to what Sheet2RDF does, and in addition this function allows for multi-level columns, 
        which permit specification for additional parameters.
        
        Current convention for the multi level index is as follows: 
            level 0 is the user friendly column name 
            level 1 is the qualified name of the RDF property to be used (e.g. skos:prefLabel@en)
            level 2 is the URI indicating the controlled list of values (property range). This column can also be used as 
                    a base URI prefix for column values. (currently unimplemented/unused)
        
        The rows represent descriptions of LAM concepts. The values in the "ID" column represent 
        the concept URIs and function as subjects. The values from the rest of the columns represent 
        property values that function as objects. The subject and objects are connected by predicates 
        specified at the level 2 of the index.
        
        This function creates the triples for all the rows and only one non-ID column.
        
        df: data frame object
        target_column: the column from which triples should be created
        uri_column: the column with subject uris, usually the first column
        graph: the RDF graph where the triples should be added, also the source of namespace records needed for qname resolution
        uri_value: indicates whetehr the column values are expected to be URIs or literals
        error_bad_lines: if true then no result is generated, else the errorneous rows are reported as warnings 
    
    """
    def __init__(self,df,uri_column, 
                         graph, uri_value=False, 
                         error_bad_lines = True):
        self.df = df 
        self.uri_column = uri_column
        self.graph = graph
        self.uri_value = uri_value
        self.uri_column = uri_column
        self.error_bad_lines = error_bad_lines
        self.NAMESPACES = self.graph.namespaces()
    
    
    def make_column_triples(self, target_column):
        """
            For a given target_column, iterate uri and target values and create triples for them. 
        """
        result_triples = []
        try:
            name, pred, controled_list_uri = multiindex_colum_header(self.df.columns, target_column)
            controled_list_uri = None if "Unnamed:" in controled_list_uri else controled_list_uri
        except:
            raise Exception(f"Could not access {target_column} column.")
        predicate = qname_uri(pred, self.NAMESPACES)
        language = qname_lang(pred)

        # create triples for each value in the column
        for quri, obj in zip(multiindex_colum_values(self.df,self.uri_column),multiindex_colum_values(self.df,target_column)):
            try:
                subject = qname_uri(quri, self.NAMESPACES)
                # if not language
                if language:
                    oobject = rdf.Literal(obj, lang=language)
                else:
                    oobject = rdf.Literal(obj)
                    # try to make a URI for this object value and overwite the variable value
                    if uri_value:
                        try:
                            oobject = qname_uri(obj, self.NAMESPACES)
                        except:
                            try:
                                oobject = rdf.URIRef(obj)
                            except:
                                raise Exception(f"Cannot create URI from {obj}.")

                # if everything went well so far, record the triples
                result_triples.extend( self.make_row_triples(subject, predicate, oobject) ) 
            except:
                if self.error_bad_lines:
                    raise Exception(f"Could not create triples for the column {target_column}. There is an error ar the row {quri}.")
                else:
                    warnings.warn(f"There is an error ar the row {quri} column {name}. The value {obj} was skipped.")
                    continue

            #  add triples to the graph
            for triple in result_triples:
                self.graph.add(triple)

    
    @abstractmethod
    def make_row_triples(self, subject, predicate, oobject):
        """
            for a given subject, predicate, object create the triples gor RDF graph.
            The triple can be simple or reified. 
            @return a list of triple tuples
        """
        pass


class SimpleTripleMaker(ColumnTripleMaker):
    
#     def __init__(self,df,uri_column=URI_COLUMN, 
#                          graph = lam_graph, uri_value=False, 
#                          error_bad_lines = True):
#         super().__init__(self,df,uri_column=URI_COLUMN, 
#                          graph = lam_graph, uri_value=False, 
#                          error_bad_lines = True)
    
    def make_row_triples(self, subject, predicate, oobject):
        return tuple(subject, predicate, oobject)
    


In [17]:
  
def create_cs(graph = lam_graph):
    """
        create the concept scheme definition
    """
    graph.add( (LAM_CS, RDF.type, SKOS.ConceptScheme) )
    graph.add( (LAM_CS, SKOS.prefLabel, rdf.Literal("Legal Analisys Methodology") ) )


def simple_triple_maker(subject, predicate, oobject):
    """
        return a triple representing the sunbect, predicate , 
    """
    
def create_descriptive_column_triples(df, target_column, uri_column, 
                                      graph, uri_value=False, 
                                      error_bad_lines = True, triple_maker=simple_triple_maker):
    """
        Given a target column name present in a descriptive dataframe generate all the RDF triples for the column.
        This functionality is similar to what Sheet2RDF does, and in addition this function allows for multi-level columns, 
        which permit specification for additional parameters.
        
        Current convention for the multi level index is as follows: 
            level 0 is the user friendly column name 
            level 1 is the qualified name of the RDF property to be used (e.g. skos:prefLabel@en)
            level 2 is the URI indicating the controlled list of values (property range). This column can also be used as 
                    a base URI prefix for column values. (currently unimplemented/unused)
        
        The rows represent descriptions of LAM concepts. The values in the "ID" column represent 
        the concept URIs and function as subjects. The values from the rest of the columns represent 
        property values that function as objects. The subject and objects are connected by predicates 
        specified at the level 2 of the index.
        
        This function creates the triples for all the rows and only one non-ID column.
        
        df: data frame object
        target_column: the column from which triples should be created
        uri_column: the column with subject uris, usually the first column
        graph: the RDF graph where the triples should be added, also the source of namespace records needed for qname resolution
        uri_value: indicates whetehr the column values are expected to be URIs or literals
        error_bad_lines: if true then no result is generated, else the errorneous rows are reported as warnings 
    """
    NAMESPACES = graph.namespaces()
    result_triples = []
    try:
        name, pred, controled_list_uri = multiindex_colum_header(df.columns, target_column)
        controled_list_uri = None if "Unnamed:" in controled_list_uri else controled_list_uri
    except:
        raise Exception(f"Could not access {target_column} column.")
    predicate = qname_uri(pred, NAMESPACES)
    language = qname_lang(pred, NAMESPACES)
    
    # create triples for each value in the column
    for quri, obj in zip(multiindex_colum_values(df,uri_column),multiindex_colum_values(df,target_column)):
        try:
            subject = qname_uri(quri, NAMESPACES)
            # if not language
            if language:
                oobject = rdf.Literal(obj, lang=language)
            else:
                oobject = rdf.Literal(obj)
                # try to make a URI for this object value and overwite the variable value
                if uri_value:
                    try:
                        oobject = qname_uri(obj, NAMESPACES)
                    except:
                        try:
                            oobject = rdf.URIRef(obj)
                        except:
                            raise Exception(f"Cannot create URI from {obj}.")
                    
            # if everything went well so far, record the triples
            result_triples.append ( triple_maker(subject, predicate, oobject) ) 
        except:
            if error_bad_lines:
                raise Exception(f"Could not create triples for the column {target_column}. There is an error ar the row {quri}.")
            else:
                warnings.warn(f"There is an error ar the row {quri} column {name}. The value {obj} was skipped.")
                continue
        
        #  add triples to the graph
        for triple in result_triples:
            graph.add(triple)


def create_descriptive_reified_column_triples(df, target_column, uri_column, graph, 
                             reification_class="skosxl:Label", value_propoerty="skosxl:literalForm", ):            
    """                        
        Given a target column name present in a descriptive dataframe generate all the RDF triples 
        for the column instantiating a reified structure such as for example skosxl:Label.
        
        This function is similar to `create_descriptive_column_triples` only that the resulting triples 
        are a reified structure rather than a direct value statement. 
    """
        
            
def create_reified_skos_labels(df, target_column, uri_column, graph , uri_value=False, error_bad_lines = True, ):
    """
        TODO:
    """
    pass

def create_reified_skos_notes():
    """
        TODO:
    """
    pass

def create_constraint_column_triples(df, target_column, uri_column, graph , error_bad_lines = True, ):
    """
        TODO:
    """
    pass

def create_concepts(df, graph):
    """
        for each row create triples for all columns. 
    """
    NAMESPACES = graph.namespaces()
    for idx, row in df.iterrows():
        subject =  qname_uri(row["ID"][0], NAMESPACES)
        #         print(row["ID"])
            
        graph.add( (subject, RDF.type, SKOS.Concept) )
        graph.add( (subject, SKOS.inScheme, LAM_CS) )
    # add descriptive triples for each column, uris
    for column_name in DESCRIPTIVE_COLUMNS_URIS:
        create_descriptive_column_triples(lam_df_description, 
                                              target_column=column_name, 
                                              uri_column=uri_column, 
                                              graph = graph, 
                                              uri_value=True) 
        # add descriptive triples for each column, literals
    for column_name in DESCRIPTIVE_COLUMNS_LITERALS:
        create_descriptive_column_triples(lam_df_description, 
                                              target_column=column_name, 
                                              uri_column=uri_column, 
                                              graph = graph, 
                                              uri_value=False) 

# Creating the project

In [35]:
# create the graph instance
lam_graph = lam_graph_instance()
print(list(lam_graph.namespaces()))
# the project concept scheme URI
LAM_CS = rdf.URIRef("http://publications.europa.eu/resources/authority/lam")
# a little bit of column management 
URI_COLUMN = 'ID'
# all descriptive columns
DESCRIPTIVE_COLUMNS_ALL = ['CDM_class', 'CDM_class_comment', 'DTS', 'DTT', 'Resource type',
                           'OJ type', 'Keywords', 'Example EN', 'Example FR', 'Additional comment',
                           'Author', 'Examples from EUR-Lex']
# all descriptive columns which have URI values
DESCRIPTIVE_COLUMNS_URIS = ['CDM_class']
# all descriptive columns which have literal values
DESCRIPTIVE_COLUMNS_LITERALS = [x for x in DESCRIPTIVE_COLUMNS_ALL if x not in DESCRIPTIVE_COLUMNS_URIS]


# geneation        
# create_cs(lam_graph)
# create_concepts(df=lam_df_description, graph = lam_graph)
# lam_graph.serialize("./lam_project.ttl",format='turtle',)

tm = SimpleTripleMaker(df=lam_df_description,uri_column=URI_COLUMN, 
                         graph=lam_graph, uri_value=False, 
                         error_bad_lines = True)

tm.make_column_triples("CDM_class_comment")

# testing
# create_descriptive_column_triples(lam_df_description, target_column="CDM_class_comment", uri_column="ID", graph = lam_graph, uri_value=False) 
print (list( lam_graph.triples( (LAM.c_014, None, None) ) ))

[('xml', rdflib.term.URIRef('http://www.w3.org/XML/1998/namespace')), ('rdf', rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#')), ('rdfs', rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#')), ('xsd', rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#')), ('', rdflib.term.URIRef('http://publications.europa.eu/resources/authority/lam/')), ('lam', rdflib.term.URIRef('http://publications.europa.eu/resources/authority/lam/')), ('cdm', rdflib.term.URIRef('http://publications.europa.eu/ontology/cdm#')), ('dct', rdflib.term.URIRef('http://purl.org/dc/terms/')), ('skos', rdflib.term.URIRef('http://www.w3.org/2004/02/skos/core#'))]


Exception: Could not create triples for the column CDM_class_comment. There is an error ar the row lam:c_001.