In [1]:
import kglab
# Show the installed kglab version (this notebook was run against 0.6.1).
kglab.__version__

'0.6.1'

## Construction of Code Type Subgraph for 2Kilos KG
This notebook takes the tables pulled from the Table Munger and, using kglab and the codetypes.ttl file, builds a knowledge graph for merging into 2Kilos-KG. The result serves as a constituent part of the "2Kilos Vocabulary" layer of the Ontology Design Pyramid, allowing instantaneous dereferencing of the pertinent codes in the dataset for querying and future ML usage.

Using these inputs, this notebook constructs a "subgraph" consisting of a lightweight typology that maps each of the code types as subclasses of a master CodeType class, with properties containing the text that appears in the dataset (codeText) and the description provided by the JFMM (codeDescription). In addition, a stub is left for each of the codes, allowing us to integrate deeper semantics for future inferencing (e.g. a CauseCodeType 'hasCause' a Cause; from there we can further structure semantics on causes).

Note: Make sure you are using kglab 0.6.1 (the latest version at the time of writing). It should install with pip without issue.

In [2]:
# Prefix -> IRI bindings registered on the graph in addition to the prefixes
# kglab binds by default (dct, owl, prov, rdf, rdfs, schema, sh, skos, xsd).
namespaces = {
    # 2Kilos / MOO project namespaces
    "twokkg": "http://w3id.org/moo/kg/2k/",
    "twokvoc": "http://w3id.org/moo/ont/2k/",
    # Fixed: the host was mistyped as "w3id/org", so every moo: term used in
    # this notebook was being minted under a non-existent authority.
    "moo": "http://w3id.org/moo/ont/",
    # Wikidata / Wikibase namespaces
    "wd": "http://www.wikidata.org/entity/",
    "wds": "http://www.wikidata.org/entity/statement/",
    "wdv": "http://www.wikidata.org/value/",
    "wdt": "http://www.wikidata.org/prop/direct/",
    "wikibase": "http://wikiba.se/ontology#",
    "p": "http://www.wikidata.org/prop/",
    "ps": "http://www.wikidata.org/prop/statement/",
    # Fixed: the path was mistyped as "pr+op".
    "pq": "http://www.wikidata.org/prop/qualifier/",
    "bd": "http://www.bigdata.com/rdf#",
    # W3C OWL-Time
    "time": "http://www.w3.org/2006/time#",
}

# The graph that every later cell adds triples to.
kg = kglab.KnowledgeGraph(
    name = "Codes Vocabulary extracted from JFMM and Ship's 3M Manual",
    base_uri = "https://w3id.org/moo/voc/mooVoc",
    namespaces = namespaces,
    )
# Last expression of the cell: display the prefix/namespace table.
kg.describe_ns()

Unnamed: 0,prefix,namespace
0,dct,http://purl.org/dc/terms/
1,owl,http://www.w3.org/2002/07/owl#
2,prov,http://www.w3.org/ns/prov#
3,rdf,http://www.w3.org/1999/02/22-rdf-syntax-ns#
4,rdfs,http://www.w3.org/2000/01/rdf-schema#
5,schema,http://schema.org/
6,sh,http://www.w3.org/ns/shacl#
7,skos,http://www.w3.org/2004/02/skos/core#
8,xsd,http://www.w3.org/2001/XMLSchema#
9,twokkg,http://w3id.org/moo/kg/2k/


Loading in the codetypes.ttl file to be appended by KgLab. We're using uuids for each instance of the ___CodeTypes pulled from the CSV files- these will be linked over in 2Kilos-KG to corresponding instances in the data.

In [3]:
import pandas as pd
import uuid
import rdflib
from rdflib.namespace import XSD
# Load the base code-type ontology; the triples built in the cells below are added on top of it.
kg.load_rdf('./decoder-ring/codetypes.ttl')

<kglab.kglab.KnowledgeGraph at 0x7f48fc372da0>

We begin by creating the top-level CodeType node. This serves as both a collection of codes and the master class of all code types. We can reuse this structure to further integrate any other codes we deem necessary for both 2Kilos and future datasets.

In [4]:
# TODO: create a build-code-voc function that is passed each of the csv files.
# Top-level code type: one UUID-named node that is typed both as a moo:Code
# and as a prov:Collection, and titled "Naval Maintenance Code".
CodeCSV = str(uuid.uuid4())
CodeType = rdflib.URIRef("https://w3id.org/moo/voc/mooVoc" + CodeCSV)
for _predicate, _object in (
    (kg.get_ns("rdf").type, kg.get_ns("moo").Code),
    (kg.get_ns("rdf").type, kg.get_ns("prov").Collection),
    (kg.get_ns("dct").title, rdflib.Literal("Naval Maintenance Code")),
):
    kg.add(CodeType, _predicate, _object)

Then, we iterate across each of the pertinent CodeTypes, adding nodes for each and mapping the string literals from the CSV files to populate the text and description columns. Note that we have to use PurePath to switch to each CSV before building the vocabulary. I have each individually broken out into separate code blocks here for sanity checking purposes, but this can be formalized as a loop.

In [5]:
#Load CSV
from pathlib import PurePath, Path
p = PurePath('./tables/causeCode.csv')
# Read every column as text so each code keeps the exact form it has in the CSV
# (type inference would turn e.g. "01" into 1 before it is stringified).
# astype(str) replaces the deprecated DataFrame.applymap(str).
df = pd.read_csv(p, dtype=str)
df = df.astype(str)
def build_cause_vocabulary(kg,index,row):
    """Add one cause-code node, built from a single CSV row, to the graph.

    A fresh UUID-based IRI is minted for the row and described with its type,
    title, superclass link, code text and description.

    Args:
        kg: the kglab.KnowledgeGraph the triples are added to.
        index: row label yielded by ``df.iterrows()``; not used.
        row: one row of the cause-code table; its 'Code' and 'Description'
            entries are read.
    """
    moo = kg.get_ns("moo")
    #Create Cause Code Type
    CauseCodeType = rdflib.URIRef("https://w3id.org/moo/voc/mooVoc{}".format(uuid.uuid4()))
    kg.add(CauseCodeType, kg.get_ns("rdf").type, moo.Code)
    kg.add(CauseCodeType, kg.get_ns("dct").title, rdflib.Literal("Cause Code"))
    #Add subclass relationship to CodeType.
    #subClassOf is defined in the rdfs: vocabulary; rdf:subClassOf does not exist.
    kg.add(CauseCodeType, kg.get_ns("rdfs").subClassOf, moo.CodeType)
    #Add string literals for the dataframe
    kg.add(CauseCodeType, moo.codeText, rdflib.Literal(row['Code']))
    kg.add(CauseCodeType, moo.codeDescription, rdflib.Literal(row['Description']))
    #Cause is a stub: every code points at the generic moo:Cause class for now.
    kg.add(CauseCodeType, moo.hasCause, moo.Cause)
#build cause vocabulary
for index, row in df.iterrows():
    build_cause_vocabulary(kg,index,row)


In [6]:
#Load CSV
from pathlib import PurePath, Path
p = PurePath('./tables/actionTakenCodes/actionTakenPrimaryCode.csv')
# Read every column as text so each code keeps the exact form it has in the CSV
# (type inference would turn e.g. "01" into 1 before it is stringified).
# astype(str) replaces the deprecated DataFrame.applymap(str).
df = pd.read_csv(p, dtype=str)
df = df.astype(str)
def build_actionTaken_vocabulary(kg,index,row):
    """Add one action-taken-code node, built from a single CSV row, to the graph.

    A fresh UUID-based IRI is minted for the row and described with its type,
    title, superclass link, code text and description.

    Args:
        kg: the kglab.KnowledgeGraph the triples are added to.
        index: row label yielded by ``df.iterrows()``; not used.
        row: one row of the action-taken table; its 'Code' and 'Description'
            entries are read.
    """
    moo = kg.get_ns("moo")
    #Create ActionTaken Code Type
    ActionTakenCodeType = rdflib.URIRef("https://w3id.org/moo/voc/mooVoc{}".format(uuid.uuid4()))
    kg.add(ActionTakenCodeType, kg.get_ns("rdf").type, moo.Code)
    kg.add(ActionTakenCodeType, kg.get_ns("dct").title, rdflib.Literal("Action Taken Code"))
    #Add subclass relationship to CodeType.
    #subClassOf is defined in the rdfs: vocabulary; rdf:subClassOf does not exist.
    kg.add(ActionTakenCodeType, kg.get_ns("rdfs").subClassOf, moo.CodeType)
    #Add string literals for the dataframe
    kg.add(ActionTakenCodeType, moo.codeText, rdflib.Literal(row['Code']))
    kg.add(ActionTakenCodeType, moo.codeDescription, rdflib.Literal(row['Description']))
    #Action is a stub: every code points at the generic moo:Action class for now.
    kg.add(ActionTakenCodeType, moo.hasAction, moo.Action)
#build vocabulary
for index, row in df.iterrows():
    build_actionTaken_vocabulary(kg,index,row)

In [7]:
#Load CSV
from pathlib import PurePath, Path
p = PurePath('./tables/alterationCode.csv')
# Read every column as text so each code keeps the exact form it has in the CSV
# (type inference would turn e.g. "01" into 1 before it is stringified).
# astype(str) replaces the deprecated DataFrame.applymap(str).
df = pd.read_csv(p, dtype=str)
df = df.astype(str)
def build_alteration_vocabulary(kg,index,row):
    """Add one alteration-code node, built from a single CSV row, to the graph.

    A fresh UUID-based IRI is minted for the row and described with its type,
    title, superclass link, code text and description.

    Args:
        kg: the kglab.KnowledgeGraph the triples are added to.
        index: row label yielded by ``df.iterrows()``; not used.
        row: one row of the alteration table; its 'Code' and 'Description'
            entries are read.
    """
    moo = kg.get_ns("moo")
    #Create Alteration Code Type
    AlterationCodeType = rdflib.URIRef("https://w3id.org/moo/voc/mooVoc{}".format(uuid.uuid4()))
    kg.add(AlterationCodeType, kg.get_ns("rdf").type, moo.Code)
    kg.add(AlterationCodeType, kg.get_ns("dct").title, rdflib.Literal("Alteration Code"))
    #Add subclass relationship to CodeType.
    #subClassOf is defined in the rdfs: vocabulary; rdf:subClassOf does not exist.
    kg.add(AlterationCodeType, kg.get_ns("rdfs").subClassOf, moo.CodeType)
    #Add string literals for the dataframe
    kg.add(AlterationCodeType, moo.codeText, rdflib.Literal(row['Code']))
    kg.add(AlterationCodeType, moo.codeDescription, rdflib.Literal(row['Description']))
    #Alteration is a stub: every code points at the generic moo:Alteration class for now.
    #The predicate used to be moo:hasCause, copied over from the cause-code cell.
    #NOTE(review): confirm moo:hasAlteration is the name declared in codetypes.ttl.
    kg.add(AlterationCodeType, moo.hasAlteration, moo.Alteration)
#build vocabulary
for index, row in df.iterrows():
    build_alteration_vocabulary(kg,index,row)



In [8]:
#Load CSV
from pathlib import PurePath, Path
p = PurePath('./tables/deferralCode.csv')
# Read every column as text so each code keeps the exact form it has in the CSV
# (type inference would turn e.g. "01" into 1 before it is stringified).
# astype(str) replaces the deprecated DataFrame.applymap(str).
df = pd.read_csv(p, dtype=str)
df = df.astype(str)
def build_deferral_vocabulary(kg,index,row):
    """Add one deferral-code node, built from a single CSV row, to the graph.

    A fresh UUID-based IRI is minted for the row and described with its type,
    title, superclass link, code text and description.

    Args:
        kg: the kglab.KnowledgeGraph the triples are added to.
        index: row label yielded by ``df.iterrows()``; not used.
        row: one row of the deferral table; its 'Code' and 'Description'
            entries are read.
    """
    moo = kg.get_ns("moo")
    #Create Deferral Code Type
    DeferralCodeType = rdflib.URIRef("https://w3id.org/moo/voc/mooVoc{}".format(uuid.uuid4()))
    kg.add(DeferralCodeType, kg.get_ns("rdf").type, moo.Code)
    kg.add(DeferralCodeType, kg.get_ns("dct").title, rdflib.Literal("Deferral Code"))
    #Add subclass relationship to CodeType.
    #subClassOf is defined in the rdfs: vocabulary; rdf:subClassOf does not exist.
    kg.add(DeferralCodeType, kg.get_ns("rdfs").subClassOf, moo.CodeType)
    #Add string literals for the dataframe
    kg.add(DeferralCodeType, moo.codeText, rdflib.Literal(row['Code']))
    kg.add(DeferralCodeType, moo.codeDescription, rdflib.Literal(row['Description']))
    #Deferral is a stub: every code points at the generic moo:Deferral class for now.
    kg.add(DeferralCodeType, moo.hasDeferral, moo.Deferral)
#build vocabulary
for index, row in df.iterrows():
    build_deferral_vocabulary(kg,index,row)
    

In [9]:
#Load CSV
from pathlib import PurePath, Path
p = PurePath('./tables/priorityCode.csv')
# Read every column as text so each code keeps the exact form it has in the CSV
# (type inference would turn e.g. "01" into 1 before it is stringified).
# astype(str) replaces the deprecated DataFrame.applymap(str).
df = pd.read_csv(p, dtype=str)
df = df.astype(str)
def build_priority_vocabulary(kg,index,row):
    """Add one priority-code node, built from a single CSV row, to the graph.

    A fresh UUID-based IRI is minted for the row and described with its type,
    title, superclass link, code text and description.

    Args:
        kg: the kglab.KnowledgeGraph the triples are added to.
        index: row label yielded by ``df.iterrows()``; not used.
        row: one row of the priority table; its 'Code' and 'Description'
            entries are read.
    """
    moo = kg.get_ns("moo")
    #Create Priority Code Type
    PriorityCodeType = rdflib.URIRef("https://w3id.org/moo/voc/mooVoc{}".format(uuid.uuid4()))
    kg.add(PriorityCodeType, kg.get_ns("rdf").type, moo.Code)
    kg.add(PriorityCodeType, kg.get_ns("dct").title, rdflib.Literal("Priority Code"))
    #Add subclass relationship to CodeType.
    #subClassOf is defined in the rdfs: vocabulary; rdf:subClassOf does not exist.
    kg.add(PriorityCodeType, kg.get_ns("rdfs").subClassOf, moo.CodeType)
    #Add string literals for the dataframe
    kg.add(PriorityCodeType, moo.codeText, rdflib.Literal(row['Code']))
    kg.add(PriorityCodeType, moo.codeDescription, rdflib.Literal(row['Description']))
    #Priority is a stub: every code points at the generic moo:Priority class for now.
    #The object used to be moo:Deferral, copied over from the deferral-code cell.
    kg.add(PriorityCodeType, moo.hasPriority, moo.Priority)
#build vocabulary
for index, row in df.iterrows():
    build_priority_vocabulary(kg,index,row)

In [10]:
#Load CSV
from pathlib import PurePath, Path
p = PurePath('./tables/rateCode.csv')
# Read every column as text so each code keeps the exact form it has in the CSV
# (type inference would turn e.g. "01" into 1 before it is stringified).
# astype(str) replaces the deprecated DataFrame.applymap(str).
df = pd.read_csv(p, dtype=str)
df = df.astype(str)
def build_rate_vocabulary(kg,index,row):
    """Add one rate-code node, built from a single CSV row, to the graph.

    A fresh UUID-based IRI is minted for the row and described with its type,
    title, superclass link, code text and description.

    Args:
        kg: the kglab.KnowledgeGraph the triples are added to.
        index: row label yielded by ``df.iterrows()``; not used.
        row: one row of the rate table; its 'Code' and 'Description'
            entries are read.
    """
    moo = kg.get_ns("moo")
    #Create Rate Code Type
    RateCodeType = rdflib.URIRef("https://w3id.org/moo/voc/mooVoc{}".format(uuid.uuid4()))
    kg.add(RateCodeType, kg.get_ns("rdf").type, moo.Code)
    kg.add(RateCodeType, kg.get_ns("dct").title, rdflib.Literal("Rate Code"))
    #Add subclass relationship to CodeType.
    #subClassOf is defined in the rdfs: vocabulary; rdf:subClassOf does not exist.
    kg.add(RateCodeType, kg.get_ns("rdfs").subClassOf, moo.CodeType)
    #Add string literals for the dataframe
    kg.add(RateCodeType, moo.codeText, rdflib.Literal(row['Code']))
    kg.add(RateCodeType, moo.codeDescription, rdflib.Literal(row['Description']))
    #Rank is a stub: every rate code points at the generic moo:Rank class for now.
    kg.add(RateCodeType, moo.hasRank, moo.Rank)
#build vocabulary
for index, row in df.iterrows():
    build_rate_vocabulary(kg,index,row)

In [11]:
#Load CSV
from pathlib import PurePath, Path
p = PurePath('./tables/safetyHazardCode.csv')
# Read every column as text so each code keeps the exact form it has in the CSV
# (type inference would turn e.g. "01" into 1 before it is stringified).
# astype(str) replaces the deprecated DataFrame.applymap(str).
df = pd.read_csv(p, dtype=str)
df = df.astype(str)
def build_safety_vocabulary(kg,index,row):
    """Add one safety-hazard-code node, built from a single CSV row, to the graph.

    A fresh UUID-based IRI is minted for the row and described with its type,
    title, superclass link, code text and description.

    Args:
        kg: the kglab.KnowledgeGraph the triples are added to.
        index: row label yielded by ``df.iterrows()``; not used.
        row: one row of the safety-hazard table; its 'Code' and 'Description'
            entries are read.
    """
    moo = kg.get_ns("moo")
    #Create Safety Code Type
    SafetyCodeType = rdflib.URIRef("https://w3id.org/moo/voc/mooVoc{}".format(uuid.uuid4()))
    kg.add(SafetyCodeType, kg.get_ns("rdf").type, moo.Code)
    kg.add(SafetyCodeType, kg.get_ns("dct").title, rdflib.Literal("Safety Code"))
    #Add subclass relationship to CodeType.
    #subClassOf is defined in the rdfs: vocabulary; rdf:subClassOf does not exist.
    kg.add(SafetyCodeType, kg.get_ns("rdfs").subClassOf, moo.CodeType)
    #Add string literals for the dataframe
    kg.add(SafetyCodeType, moo.codeText, rdflib.Literal(row['Code']))
    kg.add(SafetyCodeType, moo.codeDescription, rdflib.Literal(row['Description']))
    #SafetyLevel is a stub: every code points at the generic moo:SafetyLevel class for now.
    kg.add(SafetyCodeType, moo.hasSafetyLevel, moo.SafetyLevel)
#build vocabulary
for index, row in df.iterrows():
    build_safety_vocabulary(kg,index,row)

In [12]:
#Load CSV
from pathlib import PurePath, Path
p = PurePath('./tables/typeAvailabilityCode.csv')
# Read every column as text so each code keeps the exact form it has in the CSV
# (type inference would turn e.g. "01" into 1 before it is stringified).
# astype(str) replaces the deprecated DataFrame.applymap(str).
df = pd.read_csv(p, dtype=str)
df = df.astype(str)
def build_typeAvailability_vocabulary(kg,index,row):
    """Add one type-availability-code node, built from a single CSV row, to the graph.

    A fresh UUID-based IRI is minted for the row and described with its type,
    title, superclass link, code text and description.

    Args:
        kg: the kglab.KnowledgeGraph the triples are added to.
        index: row label yielded by ``df.iterrows()``; not used.
        row: one row of the type-availability table; its 'Code' and
            'Description' entries are read.
    """
    moo = kg.get_ns("moo")
    #Create typeAvailability Code Type
    typeAvailabilityCodeType = rdflib.URIRef("https://w3id.org/moo/voc/mooVoc{}".format(uuid.uuid4()))
    kg.add(typeAvailabilityCodeType, kg.get_ns("rdf").type, moo.Code)
    kg.add(typeAvailabilityCodeType, kg.get_ns("dct").title, rdflib.Literal("Type Availability Code"))
    #Add subclass relationship to CodeType.
    #subClassOf is defined in the rdfs: vocabulary; rdf:subClassOf does not exist.
    kg.add(typeAvailabilityCodeType, kg.get_ns("rdfs").subClassOf, moo.CodeType)
    #Add string literals for the dataframe
    kg.add(typeAvailabilityCodeType, moo.codeText, rdflib.Literal(row['Code']))
    kg.add(typeAvailabilityCodeType, moo.codeDescription, rdflib.Literal(row['Description']))
    #TypeAvailability is a stub: every code points at the generic moo:TypeAvailability class for now.
    #NOTE(review): "hasTypeAvailabilty" looks like a misspelling of "hasTypeAvailability";
    #left unchanged here because it may have to match the name declared in codetypes.ttl.
    kg.add(typeAvailabilityCodeType, moo.hasTypeAvailabilty, moo.TypeAvailability)
#build vocabulary
for index, row in df.iterrows():
    build_typeAvailability_vocabulary(kg,index,row)

In [13]:
#Load CSV
from pathlib import PurePath, Path
p = PurePath('./tables/whenDiscoveredCode.csv')
# Read every column as text so each code keeps the exact form it has in the CSV
# (type inference would turn e.g. "01" into 1 before it is stringified).
# astype(str) replaces the deprecated DataFrame.applymap(str).
df = pd.read_csv(p, dtype=str)
df = df.astype(str)
def build_whenDiscovered_vocabulary(kg,index,row):
    """Add one when-discovered-code node, built from a single CSV row, to the graph.

    A fresh UUID-based IRI is minted for the row and described with its type,
    title, superclass link, code text and description.

    Args:
        kg: the kglab.KnowledgeGraph the triples are added to.
        index: row label yielded by ``df.iterrows()``; not used.
        row: one row of the when-discovered table; its 'Code' and
            'Description' entries are read.
    """
    moo = kg.get_ns("moo")
    #Create whenDiscovered Code Type
    whenDiscoveredCodeType = rdflib.URIRef("https://w3id.org/moo/voc/mooVoc{}".format(uuid.uuid4()))
    kg.add(whenDiscoveredCodeType, kg.get_ns("rdf").type, moo.Code)
    kg.add(whenDiscoveredCodeType, kg.get_ns("dct").title, rdflib.Literal("whenDiscovered Code"))
    #Add subclass relationship to CodeType.
    #subClassOf is defined in the rdfs: vocabulary; rdf:subClassOf does not exist.
    kg.add(whenDiscoveredCodeType, kg.get_ns("rdfs").subClassOf, moo.CodeType)
    #Add string literals for the dataframe
    kg.add(whenDiscoveredCodeType, moo.codeText, rdflib.Literal(row['Code']))
    kg.add(whenDiscoveredCodeType, moo.codeDescription, rdflib.Literal(row['Description']))
    #SituationWhenDiscovered is a stub: every code points at that generic moo class for now.
    kg.add(whenDiscoveredCodeType, moo.hasSituation, moo.SituationWhenDiscovered)
#build vocabulary
for index, row in df.iterrows():
    build_whenDiscovered_vocabulary(kg,index,row)

Finally, the completed .ttl file that contains the subgraph is saved under ./kg/codeVocabulary.ttl. I'd recommend using RDFpreview in VSCode or scrolling through the .ttl file to see how UUIDs are used to map each code to its codeType.

In [14]:
# Uncomment the next two lines to preview the serialized graph as text before saving:
# text = kg.save_rdf_text()
# print(text)
# Write the finished subgraph to disk.
# NOTE(review): presumably the ./kg/ directory must already exist — confirm.
kg.save_rdf("./kg/codeVocabulary.ttl")