<a href="https://colab.research.google.com/github/mech0s/nodehenge/blob/main/doc2rdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Notebook revision marker; printed so the saved output records which revision ran.
revision = 14
print("Revision", revision)

Revision 14


##  Install steps
These steps need to be re-run in every fresh Google Colab session.

In [2]:
### rdfpandas is left commented out - it does not appear to be needed
###  %pip install rdfpandas
# pandas / openpyxl installs are disabled; uncomment them if the runtime does not already provide them
#  %pip install pandas
#  %pip install openpyxl
# rdflib is used for the RDF graphs; spaCy is used for noun-phrase extraction
%pip install rdflib
%pip install -U spacy



In [3]:
# Fetch the small English spaCy pipeline that spacy.load("en_core_web_sm") loads in the next cell
!python -m spacy download en_core_web_sm

2023-07-19 10:05:35.730942: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
import spacy
# Shared spaCy pipeline; the cleansing cell further down uses it to pull noun chunks out of text cells
nlp = spacy.load("en_core_web_sm")

In [5]:
# file structure creation
from pathlib import Path
import os, shutil

# Colab does not pull the repository folders from GitHub when it opens the
# notebook, so every working folder is created here (no error if it exists).
for folderName in ("gen", "resource", "publish/nodehenge.gov", "publish/nodehenge.org"):
    Path(folderName).mkdir(parents=True, exist_ok=True)

# When the ontology file is available locally (i.e. not in Colab), copy it
# into the publish area.
if os.path.isfile("resource/ont.ttl"):
    shutil.copy("resource/ont.ttl", "publish/nodehenge.org/")

##   Imports

In [6]:
import numpy as np
import pandas as pd
import os
import re
import json

### Optional Google Colab enhancements

In [7]:
#optional
# Only when the COLAB_JUPYTER_TOKEN environment variable is present: enable
# Colab's data_table formatter for DataFrame output.
if 'COLAB_JUPYTER_TOKEN' in os.environ:
  from google.colab import data_table
  data_table.enable_dataframe_formatter()



## Source Data Read/Setup

### Read dodcio DevSecOpsActivitesToolsGuidebookTables.xlsx

In [8]:
def cleanCamel(r):
  """Turn free text into a lower-camel-case token.

  Every non-word character becomes a separator, runs of separators collapse,
  each remaining word is title-cased and the words are joined; the first
  character of the result is then lower-cased.
  e.g. "Project/Release planning" -> "projectReleasePlanning".

  Note that str.title() also lower-cases interior capitals, so "DevSecOps"
  becomes "devsecops", and applying this function to an already camel-cased
  value flattens it.

  Parameters:
    r: the source text (str).

  Returns:
    The camel-cased token, or "" when the text contains no word characters
    (empty cell, punctuation only), instead of raising IndexError.
  """
  # Raw string: '\W' in a plain literal is an invalid escape sequence.
  r = re.sub(r'\W|^(?=\d)', '_', r)  ## replace anything that is not a word character
  r = re.sub(r"(_|-|\n|!)+", " ", r).title().replace(" ", "")  ## turns to camel-case
  if not r:
    # Nothing left to lower-case; r[0] would fail on an empty string.
    return r
  return r[0].lower() + r[1:]

In [9]:
# Location of the source workbook; also recorded later as the dcterms:source of the concept scheme.
sourceURI = "https://dodcio.defense.gov/Portals/0/Documents/Library/DevSecOpsActivitesToolsGuidebookTables.xlsx"
# Open the workbook once; individual sheets are read from this handle by the cells below.
xls = pd.ExcelFile(sourceURI)

### Setup fixed data

In [10]:
# Phase names in lifecycle order; each one is also the name of a worksheet.
phaseNames = [
    'Plan', 'Develop', 'Build', 'Test', 'Release',
    'Deliver', 'Deploy', 'Operate', 'Monitor', 'Feedback',
]
# Camel-cased identifier for each phase, in the same order as phaseNames.
phaseIDs = [cleanCamel(name) for name in phaseNames]

##  Build phaseActivityDataFrame

One named worksheet per phase: Pull these into a list of DataFrames, adjusting column names to create valid identifiers. Turn NaN entries into blank strings.

In [11]:
# Column headings vary slightly between sheets; map them onto one set of names.
columnAliases = {
    "Activities": "Activity",
    "Security / Testing / CM": "SecurityTestingCM",
    "Tool Dependencies": "ToolDependency",
    "Tool Dependency": "ToolDependency",
}

phaseDataList = []
phaseOrder = 0
for phName in phaseNames:
  # Phase / PhaseOrder / OrderInPhase are plain columns - this avoids a tricky
  # MultiIndex when the per-phase frames are concatenated later.
  df = (
      pd.read_excel(xls, phName)
      .rename(columns=columnAliases)
      .assign(
          Phase=phName,
          PhaseOrder=phaseOrder,
          OrderInPhase=lambda sheet: sheet.index,
      )
  )
  phaseOrder += 1
  # NaN entries become blank strings in the stored copy.
  phaseDataList.append(df.replace(np.nan, ""))

`phaseDataList` is a list of DataFrames (one per phase); concatenate them into a single DataFrame.

In [12]:
# Stack the per-phase frames and renumber the rows 0..n-1 in one step.
phaseActivityDataFrame = pd.concat(phaseDataList, ignore_index=True)
# MultiIndex example: phaseActivityDataFrame = pd.concat(phaseDataList,keys=phaseNames, names=["Phase","IDinPhase"])

In [13]:
# Derive a camel-cased identifier from each activity name.
phaseActivityDataFrame["ActivityIdentifier"] = phaseActivityDataFrame["Activity"].map(cleanCamel)

In [14]:
# Inspection only: render the frame as JSON in the "table" orientation
# (enables processing using GPT3/4 possibilities).
phaseActivityDataFrame.to_json(orient="table")



In [15]:
# Mark the raw text columns as "...Text" so the parsed lists can take the short names.
phaseActivityDataFrame.rename(
    columns={
        "ToolDependency": "ToolDependenciesText",
        "Inputs": "InputsText",
        "Outputs": "OutputsText",
    },
    inplace=True,
)
## prepare empty lists to accept values parsed from text fields
# A separate list object per row, so the rows never share one list.
for listColumn in ("ToolsList", "InputsList", "OutputsList"):
    phaseActivityDataFrame[listColumn] = [[] for _ in range(len(phaseActivityDataFrame))]

In [16]:
# just checking sheet names
# (the phase sheets and the "Tools" sheet read in this notebook all appear in this list)
xls.sheet_names

['Cover',
 'Overview',
 'Continuous Activities',
 'Plan',
 'Develop',
 'Build',
 'Test',
 'Release',
 'Deliver',
 'Deploy',
 'Operate',
 'Monitor',
 'Feedback',
 'Tools']

In [17]:
# Read the "Tools" sheet and blank out NaN entries in one expression.
toolsDataFrame = pd.read_excel(xls, "Tools").replace(np.nan, "")

In [18]:
# Same convention as the activity frame: raw text keeps a "...Text" name and
# an empty, per-row list column is prepared for the parsed values.
toolsDataFrame.rename(
    columns={"Inputs": "InputsText", "Outputs": "OutputsText"},
    inplace=True,
)
for listColumn in ("InputsList", "OutputsList"):
    toolsDataFrame[listColumn] = [[] for _ in range(len(toolsDataFrame))]

### Dataframe select, filter, order examples

In [19]:
### Dataframe slicing examples
# Positional slice: rows 3 up to (not including) 39, all columns.
phaseActivityDataFrame.iloc[3:39, :]

Unnamed: 0,Activity,Baseline,SSDF,Description,InputsText,OutputsText,ToolDependenciesText,SecurityTestingCM,Phase,PhaseOrder,OrderInPhase,ActivityIdentifier,ToolsList,InputsList,OutputsList
3,Database design,PREFERRED,"PO.1.2, PO.3.1, PO.5.2, PW.1.1, PW.5.1",Data modeling; \nDatabase selection;\nDatabase...,System requirement;\nSystem design,- Database design document\n-,Data modeling tool;\nTeams collaboration system,,Plan,0,3,databaseDesign,[],[],[]
4,Design review,PREFERRED,"PO.1.2, PW.1.2, PW.2.1, PW.8.2, RV.2.2",Review and approve plans and documents,Plans and design documents;,Review comments;\nAction items,Team collaboration system,Configuration Management,Plan,0,4,designReview,[],[],[]
5,DevSecOps process design,REQUIRED,PO.1.1,Design the DevSecOps process workflows that ar...,Change management process;\nSystem design;\nRe...,DevSecOps process flow chart;\nDevSecOps ecosy...,Team collaboration system,,Plan,0,5,devsecopsProcessDesign,[],[],[]
6,Documentation version control,REQUIRED,"PO.1.1, PO.1.2, PO.1.3, PS.1.1",Track design changes,Plans and design documents;,Version controlled documents,Team collaboration system,Configuration Management,Plan,0,6,documentationVersionControl,[],[],[]
7,IaC deployment,REQUIRED,"PO.3.2, PO.3.3",Deploy infrastructure and set up environment u...,Artifacts (Infrastructure as Code)\nInfrastruc...,The environment ready,Configuration automation tool;\nIaC,,Plan,0,7,iacDeployment,[],[],[]
8,Mission-Based Cyber Risk Assessments,REQUIRED,"PW.7.2, RV.1.1, RV.1.2, RV.2.1, RV.3.1, RV.3.2...",An assessment of risks based upon the stated m...,NIST 800-53 RMF Control Implementations\nFIPS ...,Risk assessment,Risk assessment tool,Security,Plan,0,8,missionBasedCyberRiskAssessments,[],[],[]
9,Project/Release planning,REQUIRED,"PS.3.1, PS.3.2",Project task management\nRelease planning,Project charter\nProject constraints,Project Plan\nTask plan & schedule\nRelease pl...,Team collaboration system;\nProject management...,,Plan,0,9,projectReleasePlanning,[],[],[]
10,Project team onboarding planning,REQUIRED,"PO.2.1, PO.2.2, PO.2.3","Plan the project team onboarding process, inte...",Organization policy,Onboarding plan,Team collaboration system,,Plan,0,10,projectTeamOnboardingPlanning,[],[],[]
11,Risk management,REQUIRED,"PO.1.2, PO.3.1, PO.4.1, PW.1.1, PW.1.2, PW.2.1...",Risk assessment,System architecture;\nSupply chain information...,Risk management plan,Team collaboration system;,,Plan,0,11,riskManagement,[],[],[]
12,Software requirement analysis,REQUIRED,"PO.1.1, PO.1.2, PO.1.3",Gather the requirements from all stakeholders,Stakeholder inputs or feedback;\nOperation mon...,Requirements Documents:\n- Feature requirement...,Requirements tool;\nTeam collaboration system;...,,Plan,0,12,softwareRequirementAnalysis,[],[],[]


In [20]:
### Dataframe ordering and filtering examples
# Sort first, then filter the sorted frame itself (string comparison on the
# phase name).
(
    phaseActivityDataFrame
    .sort_values(["PhaseOrder", "OrderInPhase"])
    .loc[lambda ordered: ordered["Phase"] >= "Plan"]
)

Unnamed: 0,Activity,Baseline,SSDF,Description,InputsText,OutputsText,ToolDependenciesText,SecurityTestingCM,Phase,PhaseOrder,OrderInPhase,ActivityIdentifier,ToolsList,InputsList,OutputsList
0,Change management planning,REQUIRED,"PO.1.1, PS.1.1, PS.3.1, PW.6.1",Plan the change control process,Organizational policy;\nSoftware development b...,Change control procedures;\nReview procedures;...,Team collaboration system;\nIssue tracking system,,Plan,0,0,changeManagementPlanning,[],[],[]
1,Configuration identification,REQUIRED,"PO.2.1, PS.1.1, PW.2.1, PW.4.1, PW.4.2, PW.6.2",Discover or manual input configuration items i...,IT infrastructure asset;\nSoftware system comp...,Configuration items,CMDB;\nSource code repository;\nArtifact repos...,Configuration Management,Plan,0,1,configurationIdentification,[],[],[]
2,Configuration management (CM) planning,REQUIRED,"PO.3.1, PO.3.3, PO.4.1, PO.4.2, PW.2.1",Plan the configuration control process;\nIdent...,"Software development, security and operations ...",CM processes and plan;\nCM tool selection;\nRe...,Team collaboration system;\nIssue tracking system,Configuration Management,Plan,0,2,configurationManagementCmPlanning,[],[],[]
3,Database design,PREFERRED,"PO.1.2, PO.3.1, PO.5.2, PW.1.1, PW.5.1",Data modeling; \nDatabase selection;\nDatabase...,System requirement;\nSystem design,- Database design document\n-,Data modeling tool;\nTeams collaboration system,,Plan,0,3,databaseDesign,[],[],[]
4,Design review,PREFERRED,"PO.1.2, PW.1.2, PW.2.1, PW.8.2, RV.2.2",Review and approve plans and documents,Plans and design documents;,Review comments;\nAction items,Team collaboration system,Configuration Management,Plan,0,4,designReview,[],[],[]
5,DevSecOps process design,REQUIRED,PO.1.1,Design the DevSecOps process workflows that ar...,Change management process;\nSystem design;\nRe...,DevSecOps process flow chart;\nDevSecOps ecosy...,Team collaboration system,,Plan,0,5,devsecopsProcessDesign,[],[],[]
6,Documentation version control,REQUIRED,"PO.1.1, PO.1.2, PO.1.3, PS.1.1",Track design changes,Plans and design documents;,Version controlled documents,Team collaboration system,Configuration Management,Plan,0,6,documentationVersionControl,[],[],[]
7,IaC deployment,REQUIRED,"PO.3.2, PO.3.3",Deploy infrastructure and set up environment u...,Artifacts (Infrastructure as Code)\nInfrastruc...,The environment ready,Configuration automation tool;\nIaC,,Plan,0,7,iacDeployment,[],[],[]
8,Mission-Based Cyber Risk Assessments,REQUIRED,"PW.7.2, RV.1.1, RV.1.2, RV.2.1, RV.3.1, RV.3.2...",An assessment of risks based upon the stated m...,NIST 800-53 RMF Control Implementations\nFIPS ...,Risk assessment,Risk assessment tool,Security,Plan,0,8,missionBasedCyberRiskAssessments,[],[],[]
9,Project/Release planning,REQUIRED,"PS.3.1, PS.3.2",Project task management\nRelease planning,Project charter\nProject constraints,Project Plan\nTask plan & schedule\nRelease pl...,Team collaboration system;\nProject management...,,Plan,0,9,projectReleasePlanning,[],[],[]


In [21]:
### Dataframe selection examples
# Pick two columns by label, keeping every row.
phaseActivityDataFrame.loc[:, ["Phase", "Baseline"]]

Unnamed: 0,Phase,Baseline
0,Plan,REQUIRED
1,Plan,REQUIRED
2,Plan,REQUIRED
3,Plan,PREFERRED
4,Plan,PREFERRED
...,...,...
137,Monitor,REQUIRED
138,Monitor,PREFERRED
139,Monitor,REQUIRED
140,Feedback,REQUIRED


In [22]:
### access to lists within cells
# Demonstration only: append a value to the list held in row 0's InputsList
# cell, then empty that list again, so the row is displayed with [] as before.
toolsDataFrame.iloc[0].InputsList.append(123)
toolsDataFrame.iloc[0].InputsList.clear()
toolsDataFrame.iloc[0]

Tool                                   Alerting and notification
Features       Notify security teams and/or administrators ab...
Benefits       Improve visibility of system events\nReduce sy...
InputsText     Aggregated filtered logs from the Log Aggregat...
OutputsText    Alert messages, emails, etc.\nRemediation repo...
InputsList                                                    []
OutputsList                                                   []
Name: 0, dtype: object

### Parse out text cells

In [23]:
#helper function - check what text isn't being extracted from a string during parsing
def remaining_text(txt, strs):
    """Return what is left of `txt` once every string in `strs` is removed.

    Each entry of `strs` is deleted from the text in turn (all occurrences),
    and newlines are stripped last, so the result shows the separators and
    stray words that the extraction did not capture.
    """
    leftover = txt
    # Newline removal is simply the final deletion in the sequence.
    for fragment in [*strs, "\n"]:
        leftover = leftover.replace(fragment, '')
    return leftover

### Cleansing of text list cells

In [24]:
# Keep copies of both frames as they are when this cell runs, before the
# replacements below rewrite their text columns.
phaseActivityDataFramePreMods = phaseActivityDataFrame.copy()
# NOTE(review): "PreMdods" looks like a typo for "PreMods"; the name is left unchanged in case it is referenced elsewhere.
toolsDataFramePreMdods = toolsDataFrame.copy()

# Literal phrase rewrites applied to the Inputs/Outputs text before noun-phrase
# extraction: they hyphenate multi-word terms, insert commas between items and
# split "A or B" / "A and B" phrases into separate items.
# Order matters: a longer phrase must come before any shorter phrase it
# contains (e.g. "Artifacts (Infrastructure as Code)" before
# "Infrastructure as Code").
inputOutputReplaces = {
    "Change management": "Change-management",
    "IT infrastructure asset": "IT-infrastructure-asset",
    "Artifacts (Infrastructure as Code)": "IAC Artifacts,",
    "Infrastructure as Code": "IAC",
    "NIST 800-53 RMF Control Implementations": "NIST80053RMFControl-Implementations,",
    "FIPS 199 system categorization": "FIPS-199-system-categorization,",
    "Stakeholder inputs or feedback": "Stakeholder-inputs, Stakeholder feedback",
    "Requirements database or documents": "Requirements database, Requirements documents",
    "Test environment applications and infrastructure": "Test environment applications, Test environment infrastructure",
    "Developer coding and appropriate unit, integration, etc. testing input": "Developer coding, Unit test input, Integration test input, Other test input",
    "- Review Comments": "Review-Comments,",
    "- Source Code Weakness Findings": "Source Code Weakness Findings,",
    "- Version-Controlled Source Code": "Version-Controlled Source Code,",
    "- Security Findings and Warnings": "Security Findings, Security Warnings,",
}
for oldText, newText in inputOutputReplaces.items():
    for textColumn in ("InputsText", "OutputsText"):
        # regex=False: the keys are literal phrases (some contain parentheses).
        phaseActivityDataFrame[textColumn] = phaseActivityDataFrame[textColumn].str.replace(
            oldText, newText, regex=False
        )

# Literal rewrites for tool names, applied both to the activities' tool
# dependency text and to the Tool column of the tools frame, so both sides
# receive the same edits.
# Order matters: "Logging" -> "Logging-tool" turns "Logging tool" into
# "Logging-tool tool", which the following entry collapses again.
# NOTE: the "Logging" rule is not idempotent - applying these rewrites a second
# time to already rewritten text yields "Logging-tool-tool".
toolReplaces = {
    "Monitoring tool suite": "Monitoring-tool suite",
    "Test tool suite": "Test-tool suite",
    "Log aggregator": "Log-aggregator",
    "Log analysis & auditing": "Log-analysis, Log-auditing, ",
    "Logging": "Logging-tool",
    "Logging-tool tool": "Logging-tool",
}
for oldText, newText in toolReplaces.items():
    # regex=False: the keys are literal phrases, not patterns.
    phaseActivityDataFrame["ToolDependenciesText"] = phaseActivityDataFrame["ToolDependenciesText"].str.replace(
        oldText, newText, regex=False
    )
    toolsDataFrame["Tool"] = toolsDataFrame["Tool"].str.replace(oldText, newText, regex=False)


# nlp extraction of noun phrases
def _nounPhrases(text):
    """Return the text of every noun chunk spaCy finds in `text`."""
    return [chunk.text for chunk in nlp(text).noun_chunks]

for textColumn, listColumn in (
    ("InputsText", "InputsList"),
    ("OutputsText", "OutputsList"),
    ("ToolDependenciesText", "ToolsList"),
):
    phaseActivityDataFrame[listColumn] = phaseActivityDataFrame[textColumn].map(_nounPhrases)

# text left behind: whatever part of each text cell the noun chunks did not cover
for textColumn, listColumn, remainderColumn in (
    ("InputsText", "InputsList", "InputsTextRemainder"),
    ("OutputsText", "OutputsList", "OutputsTextRemainder"),
    ("ToolDependenciesText", "ToolsList", "ToolsTextRemainder"),
):
    phaseActivityDataFrame[remainderColumn] = [
        remaining_text(text, phrases)
        for text, phrases in zip(phaseActivityDataFrame[textColumn], phaseActivityDataFrame[listColumn])
    ]


### Cleansed - Now turn item names into identifiers - camelCase

In [25]:
# Tool names become identifiers in a new column; the extracted phrase lists
# are converted in place, item by item.
# NOTE: run this once per extraction - cleanCamel applied to an already
# camel-cased item lower-cases its interior capitals.
toolsDataFrame["ToolIdentifier"] = toolsDataFrame["Tool"].map(cleanCamel)
for listColumn in ("InputsList", "OutputsList", "ToolsList"):
    phaseActivityDataFrame[listColumn] = phaseActivityDataFrame[listColumn].map(
        lambda phrases: [cleanCamel(phrase) for phrase in phrases]
    )



In [26]:

#filtered column view to aid visual inspection post-cleansing
# Each view pairs the source text with the extracted list and the unused remainder.
checkPAInputs = phaseActivityDataFrame.loc[:, ["Phase", "Activity", "InputsText", "InputsList", "InputsTextRemainder"]]
checkPAOutputs = phaseActivityDataFrame.loc[:, ["Phase", "Activity", "OutputsText", "OutputsList", "OutputsTextRemainder"]]
checkPATools = phaseActivityDataFrame.loc[:, ["Phase", "Activity", "ToolDependenciesText", "ToolsList", "ToolsTextRemainder"]]

## TODO : Other sheets / regions

# RDF creation

In [27]:
from rdflib import Graph, Namespace, URIRef, Literal, BNode
from rdflib.namespace import SKOS, RDF, RDFS, XSD, NamespaceManager, DCTERMS # DC, DOAP, FOAF, VOID, XMLNS





In [28]:
# setup namespaces
# ONT holds the ontology terms (classes and properties); the others hold the
# generated instances: INST for the scheme-level resources, and PHASE / TOOL /
# ACT / ART for phases, tools, activities and artifacts respectively.
ONT = Namespace("http://nodehenge.org/ont#")
INST = Namespace("http://nodehenge.org/inst#")
PHASE = Namespace("http://nodehenge.org/inst/phase#")
TOOL = Namespace("http://nodehenge.org/inst/tool#")
ACT = Namespace("http://nodehenge.org/inst/activity#")
ART = Namespace("http://nodehenge.org/inst/artifact#")
# create graph and bind namespaces
def newNodehengeGraph():
  """Return a new, empty rdflib Graph with the notebook's prefixes bound.

  The bindings cover the rdf, rdfs, skos and xsd vocabularies plus the
  nodehenge ont / inst / phase / tool / act / art namespaces. No base IRI is
  set on the graph.
  """
  graph = Graph()
  prefixBindings = (
      ("rdf", RDF),
      ("rdfs", RDFS),
      ("skos", SKOS),
      ("xsd", XSD),
      ("ont", ONT),
      ("inst", INST),
      ("phase", PHASE),
      ("tool", TOOL),
      ("act", ACT),
      ("art", ART),
  )
  for prefix, namespace in prefixBindings:
    graph.bind(prefix, namespace)
  return graph

In [29]:
# One graph per generated Turtle file; all four start with the same prefix bindings.
gTopLevel, gPhases, gToolsArts, gPhaseActsArts = (newNodehengeGraph() for _ in range(4))


In [30]:
# populate the top level INST namespace
# The concept scheme itself: its type, title and source workbook.
for predicate, value in (
    (RDF.type, SKOS.ConceptScheme),
    (DCTERMS.title, Literal("DoD DevSecOps Abstract Phase Activites and Tools Scheme")),
    (DCTERMS.source, URIRef(sourceURI)),
):
    gTopLevel.add((INST.dodDsopScheme, predicate, value))

# One blank node per phase; these become the cells of the ordered member list.
orderedPhaseBNodes = {ph: BNode() for ph in phaseIDs}

# The ordered collection of phases points at the list cell of the first phase.
gTopLevel.add((INST.phasing, RDF.type, SKOS.OrderedCollection))
gTopLevel.add((INST.phasing, SKOS.inScheme, INST.dodDsopScheme))
gTopLevel.add((INST.phasing, SKOS.memberList, orderedPhaseBNodes[phaseIDs[0]]))

<Graph identifier=N8ff75345dd684122908ec4955e1b04a6 (<class 'rdflib.graph.Graph'>)>

In [31]:
#populate the PHASE namespace
# Each phase is described as an ont:Phase / skos:Concept in the scheme, and its
# blank node becomes one cell of an RDF list: rdf:first is the phase, rdf:rest
# is the next phase's cell, or rdf:nil after the last phase.
for phaseID, phaseName, followingID in zip(phaseIDs, phaseNames, phaseIDs[1:] + [None]):
    phaseNode = PHASE[phaseID]
    listCell = orderedPhaseBNodes[phaseID]
    restCell = orderedPhaseBNodes[followingID] if followingID is not None else RDF.nil

    gPhases.add((phaseNode, RDF.type, ONT.Phase))
    gPhases.add((phaseNode, SKOS.inScheme, INST.dodDsopScheme))
    gPhases.add((phaseNode, RDF.type, SKOS.Concept))
    gPhases.add((phaseNode, SKOS.prefLabel, Literal(phaseName)))
    gPhases.add((listCell, RDF.first, phaseNode))    #list head
    gPhases.add((listCell, RDF.rest, restCell))

#print(gPhases.serialize(format="turtle"))

### Load

TODO: pre-load the ontology from `ont.ttl` here? For example:
`g.parse('publish/nodehenge.org/ont.ttl')`

In [32]:
# Populate the TOOL namespace: one set of descriptive triples per row of the Tools sheet.
for _, row in toolsDataFrame.iterrows():
    toolNode = TOOL[row.ToolIdentifier]
    for predicate, value in (
        (RDF.type, ONT.AbstractTool),
        (RDF.type, SKOS.Concept),
        (SKOS.inScheme, INST.dodDsopScheme),
        (SKOS.prefLabel, Literal(row.Tool)),
        (SKOS.definition, Literal(row.Benefits)),
        (SKOS.scopeNote, Literal(row.Features)),
        (SKOS.editorialNote, Literal("InputsText and OutputText source fields need cleansed and turned into related Atrifact entities")),
    ):
        gToolsArts.add((toolNode, predicate, value))

### TODO : Tools : the InputsText and OutputsText source fields need to be cleansed and turned into related Artifact entities

In [33]:
# Populate the activity graph: one pass per activity row, adding
#   - the description of the activity itself,
#   - every input/output artifact and tool extracted for it, linked to the
#     activity in both directions with the same predicate,
#   - the link between the activity and its phase (also in both directions),
#   - an editorial note for any source text the extraction left unused.
for _, row in phaseActivityDataFrame.iterrows():
    activity = ACT[row.ActivityIdentifier]
    phase = PHASE[cleanCamel(row.Phase)]

    for predicate, value in (
        (RDF.type, ONT.AbstractActivity),
        (RDF.type, SKOS.Concept),
        (SKOS.inScheme, INST.dodDsopScheme),
        (SKOS.prefLabel, Literal(row.Activity)),
        (SKOS.definition, Literal(row.Description)),
        (SKOS.scopeNote, Literal("Baseline: " + row.Baseline)),
        (SKOS.scopeNote, Literal("Relevance: " + row.SecurityTestingCM)),
        (SKOS.scopeNote, Literal("SSDF: " + row.SSDF)),
        (SKOS.editorialNote, Literal("skos:scopeNote currently holding three fields. Could normalize")),
    ):
        gPhaseActsArts.add((activity, predicate, value))

    # Tools are typed ont:AbstractTool, the same class the Tools sheet rows
    # use, rather than a class IRI inside the tool instance namespace.
    for items, namespace, itemClass, linkPredicate in (
        (row.InputsList, ART, ONT.AbstractArtifact, ONT.input),
        (row.OutputsList, ART, ONT.AbstractArtifact, ONT.output),
        (row.ToolsList, TOOL, ONT.AbstractTool, ONT.activityTool),
    ):
        for item in items:
            itemNode = namespace[item]
            gPhaseActsArts.add((itemNode, RDF.type, itemClass))
            gPhaseActsArts.add((itemNode, RDF.type, SKOS.Concept))
            gPhaseActsArts.add((itemNode, SKOS.inScheme, INST.dodDsopScheme))
            gPhaseActsArts.add((activity, linkPredicate, itemNode))
            gPhaseActsArts.add((itemNode, linkPredicate, activity))

    gPhaseActsArts.add((activity, ONT.phaseActivity, phase))
    gPhaseActsArts.add((phase, ONT.phaseActivity, activity))

    # Only record a remainder that still contains letters; bare separators are ignored.
    for notePrefix, remainder in (
        ("Unused Inputs text::", row.InputsTextRemainder),
        ("Unused Outputs text::", row.OutputsTextRemainder),
        ("Unused Tools text::", row.ToolsTextRemainder),
    ):
        if any(letter.isalpha() for letter in remainder):
            gPhaseActsArts.add((activity, SKOS.editorialNote, Literal(notePrefix + remainder)))

In [34]:

# Write each graph to its own Turtle file under gen/.
gTopLevel.serialize(format="turtle",destination="gen/topLevel.ttl")
gPhases.serialize(format="turtle",destination="gen/phases.ttl")
gToolsArts.serialize(format="turtle",destination="gen/toolsArts.ttl")
gPhaseActsArts.serialize(format="turtle",destination="gen/phaseActsArts.ttl")

# Concatenate the generated files, in this order, into the published instance file.
# The files are read and written explicitly as UTF-8 so the copy does not
# depend on the platform's default text encoding.
filenames = ['gen/topLevel.ttl', 'gen/phases.ttl', 'gen/toolsArts.ttl', 'gen/phaseActsArts.ttl', ]
with open('publish/nodehenge.org/inst.ttl', 'w', encoding='utf-8') as outfile:
    for fname in filenames:
        with open(fname, encoding='utf-8') as infile:
            outfile.write(infile.read())

In [36]:
# publish combined graph
# Every Turtle file found under gen/ is parsed into one graph, which is then
# written to the published instance file.
import glob

gCombined = Graph()
for ttlPath in glob.glob("gen/*.ttl"):
    gCombined.parse(ttlPath)

gCombined.serialize(destination="publish/nodehenge.org/inst.ttl", format="turtle")

<Graph identifier=Ne6fd465ac9c843be8c70d6b6e0657228 (<class 'rdflib.graph.Graph'>)>

In [None]:
%%script false --no-raise-error
# some commented-out RDF exploration tests
# (the %%script cell magic must be the first line of the cell, so this note sits below it)

# ONT = Namespace("http://nodehenge.org/ont#")
# INST = Namespace("http://nodehenge.org/inst#")
# PHASE = Namespace("http://nodehenge.org/inst/phase#")
# TOOL = Namespace("http://nodehenge.org/inst/tool#")
# ACT = Namespace("http://nodehenge.org/inst/activity#")
# ART = Namespace("http://nodehenge.org/inst/artifact#")
context = {"@ont": "http://nodehenge.org/ont#",
           "@inst": "http://nodehenge.org/inst#",
           "@phase": "http://nodehenge.org/inst/phase#",
           "@tool": "http://nodehenge.org/inst/tool#",
           "@act": "http://nodehenge.org/inst/activity#",
           "@artifact": "http://nodehenge.org/inst/artifact#",
           "@skos" :  "http://www.w3.org/2004/02/skos/core#",
           "@language": "en"}
#print(g.serialize(format="json-ld", context=context ))  # example with manually constructed @context
jsontxt=(g.serialize(format="json-ld", auto_compact=True))  #auto_compact create @context entries
jsonobj = json.loads(jsontxt)
json.dumps(jsonobj, separators=(',', ':')) # minified output

In [None]:
%%script false --no-raise-error
# some commented-out RDF exploration tests
# (the %%script cell magic must be the first line of the cell, so this note sits below it)

g.add((
    URIRef("#nick"),
    SKOS.prefLabel,
    Literal("Nick") #, datatype=XSD.string)
    )
)
g.add((
    URIRef("#bob"),
    SKOS.prefLabel,
    Literal("Bob") #, datatype=XSD.string)
    )
)

print(g.serialize(format="turtle"))

my_query = """
SELECT DISTINCT ?a ?b
WHERE {
    ?a skos:prefLabel "Nick" .
}"""

qres = g.query(my_query)
for row in qres:
    print(f"{row.a} ")

bob=URIRef("#bob")
print(g.value(bob,SKOS.prefLabel))
#```

In [None]:
%%script false --no-raise-error
# Disabled exploration: add a label to a phase node and read it back.
# NOTE(review): `g` is not defined in the cells visible above - confirm before re-enabling.

g.add( (PHASE.plan, SKOS.prefLabel, Literal("Plan")))
print(  PHASE.plan )
print( g.value( PHASE.plan, SKOS.prefLabel ) )

In [None]:
%%script false --no-raise-error
# Disabled exploration: build a phase IRI from a messy string via cleanCamel,
# clear its triples, relate it to a second phase and read the relation back.
# NOTE(review): `g` is not defined in the cells visible above - confirm before re-enabling.

phaseID = "pha+--=sfgsdfg   \n se13"
phaseID2 = "phase13432"
g.remove( (PHASE[cleanCamel(phaseID)], None, None) )  #dict notation as alternative to explicit value and dot notation
g.add( (PHASE[cleanCamel(phaseID)], SKOS.related, PHASE[phaseID2]))
print( g.value( PHASE[cleanCamel(phaseID)], SKOS.related ) )