<a href="https://colab.research.google.com/github/mech0s/nodehenge/blob/main/doc2rdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
# Revision counter for this notebook; it is also written into the combined published graph as a label.
scriptRevision = 17
print("Revision", scriptRevision)

Revision 17


##  Install steps
Need to re-run for each fresh google colab session

In [42]:
### commented out - rdfpandas not needed?
###  %pip install rdfpandas
#  %pip install pandas
#  %pip install openpyxl
%pip install rdflib
%pip install -U spacy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [43]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
                                              0.0/12.8 MB ? eta -:--:--
     -                                        0.3/12.8 MB 10.2 MB/s eta 0:00:02
     --                                       0.7/12.8 MB 10.4 MB/s eta 0:00:02
     ---                                      1.0/12.8 MB 9.0 MB/s eta 0:00:02
     ----                                     1.3/12.8 MB 9.2 MB/s eta 0:00:02
     -----                                    1.6/12.8 MB 9.4 MB/s eta 0:00:02
     ------                                   1.9/12.8 MB 9.5 MB/s eta 0:00:02
     -------                                  2.3/12.8 MB 9.0 MB/s eta 0:00:02
     --------                                 2.6/12.8 MB 9.1 MB/s eta 0:00:02
     ---------                                2.9/12.8 MB 9.2 MB/s eta 0:00:02
     ----------                        


[notice] A new release of pip is available: 23.1.2 -> 23.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [44]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [45]:
# file structure creation
from pathlib import Path
import os, shutil

# Colab does not pull the repository folders from github when opening the notebook,
# so (re)create the working folders in the session; existing folders are left untouched.
for folderName in ("gen", "resource", "publish/nodehenge.gov", "publish/nodehenge.org"):
    Path(folderName).mkdir(parents=True, exist_ok=True)

# When the ontology file is available locally (i.e. not in a bare Colab session), copy it into the publish area.
if os.path.isfile("resource/ont.ttl"):
    shutil.copy("resource/ont.ttl", "publish/nodehenge.org/")

##   Imports

In [46]:
import numpy as np
import pandas as pd
import os
import re
import json

### Optional Google Colab enhancements

In [47]:
# Optional: only when the Colab token variable is present in the environment,
# switch on Colab's DataFrame formatter.
if os.environ.get('COLAB_JUPYTER_TOKEN') is not None:
  from google.colab import data_table
  data_table.enable_dataframe_formatter()



## Source Data Read/Setup

### Read dodcio DevSecOpsActivitesToolsGuidebookTables.xlsx

In [48]:
def cleanCamel(r):
  """Turn free text into a lower-camel-case identifier.

  Every non-word character becomes a separator, runs of separators
  (including `_`, `-`, newline and `!`) delimit words, each word is
  title-cased (so letters after the first are lower-cased, e.g. "CM" -> "Cm"),
  the words are joined, and the first character is lower-cased.

  Args:
    r: the text to convert (str).

  Returns:
    The camel-case string, or "" when `r` is empty or contains no word
    characters (previously this raised IndexError).
  """
  # Raw string: '\W' and '\d' are regex escapes, not Python string escapes.
  r = re.sub(r'\W|^(?=\d)', '_', r)  ## cleans to make a valid identifier
  r = re.sub(r"(_|-|\n|!)+", " ", r).title().replace(" ", "")  ## turns to camel-case
  if not r:
    # Nothing left to lower-case; avoid indexing an empty string.
    return r
  return r[0].lower() + r[1:]

In [49]:
# Location of the source workbook; also recorded later as dcterms:source of the concept scheme.
sourceURI = "https://dodcio.defense.gov/Portals/0/Documents/Library/DevSecOpsActivitesToolsGuidebookTables.xlsx"
# Open the workbook once; the read_excel calls below all read sheets from this handle.
xls = pd.ExcelFile(sourceURI)

### Setup fixed data

In [50]:
# Phase names in lifecycle order; each one is also the name of a worksheet in the workbook.
phaseNames = [
    'Plan', 'Develop', 'Build', 'Test', 'Release',
    'Deliver', 'Deploy', 'Operate', 'Monitor', 'Feedback',
]
# Identifier form of each phase name, in the same order.
phaseIDs = [cleanCamel(phaseName) for phaseName in phaseNames]

###  Build phaseActivityDataFrame

One named worksheet per phase: Pull these into a list of DataFrames, adjusting column names to create valid identifiers. Turn NaN entries into blank strings.

In [51]:
# Header variants found on the phase sheets, mapped onto identifier-safe column names.
phaseColumnRenames = {
    "Activities": "Activity",
    "Security / Testing / CM": "SecurityTestingCM",
    "Tool Dependencies": "ToolDependency",
    "Tool Dependency": "ToolDependency",
}

phaseDataList = []
phaseOrder = 0
for phName in phaseNames:
  df = pd.read_excel(xls, phName).rename(columns=phaseColumnRenames)
  # Keep phase and ordering as plain columns - an alternative that avoids a tricky MultiIndex when concatenating below
  df["Phase"] = phName
  df["PhaseOrder"] = phaseOrder
  df["OrderInPhase"] = df.index
  # Store the frame with NaN cells turned into blank strings.
  phaseDataList.append(df.replace(np.nan, ""))
  phaseOrder += 1

`phaseDataList` is a list of DataFrames, one per phase: concatenate them into a single DataFrame.

In [52]:
# Stack the per-phase frames and renumber the rows so the index is unique across phases.
phaseActivityDataFrame = pd.concat(phaseDataList).reset_index(drop=True)
# MultiIndex example: phaseActivityDataFrame = pd.concat(phaseDataList,keys=phaseNames, names=["Phase","IDinPhase"])

In [53]:
# Identifier form of each activity name; used later as the local name of the activity's URI.
phaseActivityDataFrame["ActivityIdentifier"] = phaseActivityDataFrame["Activity"].map(cleanCamel)

In [54]:
# just inspecting as json (enables processing using GPT3/4 possibilities)
# The returned JSON string is shown as the cell output; nothing is stored.
phaseActivityDataFrame.to_json(orient='table')



In [55]:
# Mark the free-text columns as "...Text"; the parsed values go into the "...List" columns created below.
phaseTextColumnRenames = {"ToolDependency":"ToolDependenciesText", "Inputs":"InputsText", "Outputs":"OutputsText"}
phaseActivityDataFrame.rename(columns=phaseTextColumnRenames, inplace=True)
## prepare empty lists to accept values parsed from text fields
# (one fresh list object per row, so rows never share a list)
for listColumn in ("ToolsList", "InputsList", "OutputsList"):
    phaseActivityDataFrame[listColumn] = [[] for _ in range(len(phaseActivityDataFrame))]

In [56]:
# just checking sheet names (the phase sheets, "Tools" and "Continuous Activities" are the ones read in this notebook)
xls.sheet_names

['Cover',
 'Overview',
 'Continuous Activities',
 'Plan',
 'Develop',
 'Build',
 'Test',
 'Release',
 'Deliver',
 'Deploy',
 'Operate',
 'Monitor',
 'Feedback',
 'Tools']

### Build Tools data-frame

In [57]:
# Read the "Tools" sheet and blank out NaN cells in one step.
toolsDataFrame = pd.read_excel(xls, "Tools").replace(np.nan, "")

In [58]:
# Same convention as for the phase activities: "...Text" holds the raw cell text, "...List" the parsed items.
toolsDataFrame.rename(columns={ "Inputs":"InputsText", "Outputs":"OutputsText"}, inplace=True)
for listColumn in ("InputsList", "OutputsList"):
    # one fresh list object per row
    toolsDataFrame[listColumn] = [[] for _ in range(len(toolsDataFrame))]

### Import "Continuous Activities" sheet regions

In [118]:
# The "Continuous Activities" sheet is read as one frame and then split into three
# row regions by index. The boundaries (<=23, 26..59, >=62) are 0-based DataFrame
# index values tied to the current workbook layout - NOTE(review): re-check them
# whenever the source workbook is revised.
contActDataFrame = pd.read_excel(xls, "Continuous Activities")
contActDataFrame = contActDataFrame.rename(columns={ "Security Activities":"Activity", "Cyber Tool Dependencies":"Tool Dependencies"})
contActDataFrame = contActDataFrame.drop('Column1', axis=1) # extra hidden Excel column
contActDataFrame.loc[contActDataFrame.index <= 23 , "Aspect"] = "Security"
contActDataFrame.loc[(contActDataFrame.index >= 26 ) & ( contActDataFrame.index <= 59) , "Aspect"] = "Test"
contActDataFrame.loc[contActDataFrame.index >= 62 , "Aspect"] = "CM"
# Rows outside the three regions never received an Aspect and are dropped here.
# replace() is chained onto the filtered result rather than applied in place:
# mutating a boolean-filtered slice in place can raise SettingWithCopyWarning.
contActDataFrame = contActDataFrame[contActDataFrame.Aspect.notnull()].replace(np.nan, "")


Unnamed: 0,Activity,Phase,Tool Dependencies,Aspect
0,Mission Based Cyber Risk Assessments,All Phases,Mission Cyber Risk Assessment tool,Security
1,Threat modeling,Plan,Threat modeling tool,Security
2,Code commit scan,Develop,Source code repository security plugin,Security
3,Security code development,Develop,IDE,Security
4,Static code scan before commit,Develop,IDE security plugins,Security
...,...,...,...,...
71,Test configuration control,Test,Team collaboration system; Issue tracking syst...,CM
72,Infrastructure provisioning automation,Deploy,Configuration automation tool; IaC,CM
73,Asset inventory to include SBOMs,Monitor,Inventory Management,CM
74,System performance monitoring,Monitor,Operation monitoring; Issue tracking system; A...,CM


### Dataframe select, filter, order examples

In [None]:
### Dataframe slicing examples
# Positional slice: rows at positions 3 up to (not including) 39.
phaseActivityDataFrame.iloc[3:39]

In [None]:
### Dataframe ordering and filtering examples
# Filter first, then sort. The original built the boolean mask on the unsorted frame
# and applied it to the sorted one, which relies on pandas re-aligning the mask by
# index (and warns when the two indexes differ). The rows and their order are unchanged.
phaseActivityDataFrame[phaseActivityDataFrame["Phase"] >= "Plan"].sort_values(["PhaseOrder","OrderInPhase"])

In [None]:
### Dataframe selection examples
# A list of column names selects those columns as a new DataFrame.
phaseActivityDataFrame[["Phase","Baseline"]]

In [None]:
### access to lists within cells
# The list stored in a cell is an ordinary Python object, so it can be mutated through the row;
# append then clear leaves the first row's InputsList empty again.
toolsDataFrame.iloc[0].InputsList.append(123)
toolsDataFrame.iloc[0].InputsList.clear()
toolsDataFrame.iloc[0]

### Parse out text cells

In [None]:
# helper function - shows which text is NOT being extracted from a string during parsing
def remaining_text(txt, strs):
    """Return what is left of `txt` after deleting every string in `strs`.

    The strings are removed one after another, in the order given, and
    newlines are stripped from the result last.
    """
    leftover = txt
    for extracted in strs:
        leftover = leftover.replace(extracted, '')
    return leftover.replace("\n", "")

### Cleansing of text list cells

In [None]:
# Keep untouched copies of both frames so the (possibly destructive) cleansing
# below can be re-run from the same starting point.
toolsDataFramePreMods, phaseActivityDataFramePreMods = (
    toolsDataFrame.copy(),
    phaseActivityDataFrame.copy(),
)


In [None]:
# Begin (again) from the stashed, uncleansed frames.
toolsDataFrame, phaseActivityDataFrame = (
    toolsDataFramePreMods.copy(),
    phaseActivityDataFramePreMods.copy(),
)

# Phrase substitutions applied to the Inputs/Outputs text before noun-chunk extraction.
# The pairs are applied in insertion order and some keys are substrings of earlier keys
# (e.g. "Infrastructure as Code" inside "Artifacts (Infrastructure as Code)"), so keep the order.
# Keys must be unique: a repeated key in a dict literal silently discards the earlier value.
inputOutputReplaces = {    # "is real" phrase addition leads NLP into treating elements a noun chunk
    "Change management" : "Change-management" ,
    "IT infrastructure asset" : "IT-infrastructure-asset",
    "Artifacts (Infrastructure as Code)" : "IAC Artifacts,",
    "Infrastructure as Code" : "IAC is real",
    "Infrastructure as code" : "IAC is real",
    "NIST 800-53 RMF Control Implementations" : "NIST80053RMFControl-Implementation,",
    "FIPS 199 system categorization" : "FIPS-199-system-categorization,",
    "Stakeholder inputs or feedback" : "Stakeholder-input, Stakeholder feedback",
    # This key used to be listed twice; the second value was the one in effect and is kept here.
    # The discarded first value was "Requirements database, Requirements document".
    "Requirements database or documents" : "Requirements documents. Requirements database",
    "Test environment applications and infrastructure" : "Test environment application, Test environment infrastructure",
    "Developer coding and appropriate unit, integration, etc. testing input" : "Code-Development input, Unit test input, Integration test input, Other test input",
    "- Review Comments" : "Review-Comment,",
    "- Source Code Weakness Findings" : "Source Code Weakness Findings,",
    "- Version-Controlled Source Code" : "Version-Controlled Source Code,",
    "- Security Findings and Warnings" : "Security Findings, Security Warning,",
    'Technical feedback as to “is the system built right” and operational feedback as to “was the right system built”' : "Technical feedback. Operational feedback.",
    "Database traffic, event, and activities" : "Database traffic. Database events. Database activities.",
    "Artifacts in the repository;" : "Artifacts.",
    "Requirements documents and/or database;" : "Requirements documents. Requirements database.",
    "Requirement documents and/or database" : "Requirements documents. Requirements database",
    "Software Factory controls" : "Software-Factory controls are real.",
    "Software Bill of Materials" : "SBOM",
    "Functional and non-functional regression test cases" : "Regression-test cases.",
    "and the software system" : ". Software system",
    "\nThe software system" : ". Software system",
    "\nDatabase artifacts;" : ". Database artifacts are real .",
    "\nRelease notes" : ". Release notes are real.",
    "APIs for integrated systems" : "APIs are real",
    "\nDependency checking report" : ". Dependency-checking report is real",
    "individual software unit under test (a function, method or an interface)" : "software unit",
    "expected output data" : "expected-output-data is real",
    "Updates to the Product Backlog" : "Product-backlog updates",
    "Performance KPI measures" : "KPI-measures are real",
    "and remediation reports" : ". Remediation reports are real",
    "Point-in-time recovered file" : "Recovered file",
    "Reports of observed performance" : "Observed-performance reports",
    "Reports of test results" : "Test-result Reports",
    "New release in the artifact repository" : "Release-To-Repo is real",
    "\nPush go/no-go decision" : ". Push go/no-go decision",
    "go / no-go decision; \nArtifacts are tagged with release tag if go decision is made" : "Go/no-go decision is real. Artifacts-release-tag is real",
    "Artifacts in all regional artifact repositories" : "Replicated Artifacts",
    "Test report to determine whether the individual software unit performs as designed." : "Test report is real",
    "test scripts, the software units under test, test input data, and expected output data" : "test scripts. Software-units under test. Test-input data. Expected-output data",
    "- Auto generated Application Programming Interface (API) documentation" : "Generated-API documentation",

    "Aggregated filtered logs from the Log Aggregator" : "Aggregated filtered logs are real",
    "Vulnerability and non-compliance findings from Information Security Continuous Monitoring" : "Vulnerability findings . non-compliance findings",
    "Recommendations from Information Security Continuous Monitoring" : "information security recommendations",
    "Performance statistics from Operations Monitoring" : "Performance statistics",
    "Performance alerts from Operations Monitoring" : "Performance-alert are real",
    "IT assets (applications, software licenses, libraries, operating systems, and versioning information)" : "IT-infrastructure-asset",
    "Access to the backup source" : "backup-source access",
    "Source code under version control" : "Version-Controlled Source Code",
    "IT hardware and software components information" : "IT hardware information . Software component information is real",
    "Everything as Code" : "EAC is real",
    "Cyber threat condition feeds" : "Cyber-threat-condition feed",
    "Running status and events" : "Database status . Database-event",
    "BOM, including:\nDependency list\nLicensing" : "BOM, including:\nDependency list\n Licensing is real",
    "Running software application" : "running software application is real",
    "Fuzz inputs" : "Fuzz inputs are real",
    "Developer coding input" : "Code-Development input",
    "Running application and operating systems" : "Running software application is real . Running operating system is real .",
    "Root cause analysis" : "Root-cause analysis",
    "Feature/change request" : "Feature request . change request",
    "security audit logs" : "security audit logs are real",
    "Event Logs" : "Event Log is real",
    "Event logs" : "Event Log is real",
    "All user, network, application, and data activities" : "user activities. network activities. application activities. data activities",
    "All operational monitoring status, alerts" : "Operational-monitoring status, Operational-monitoring alert",
    "Performance KPI" : "KPI is real. ",
    "Service Level Agreement (SLA)" : "SLA is real.",
    "Service Level Agreements" : "SLA is real.",
    "Software instances" : "Software instances are real",

    "\nSystem VM or container snapshot" : "\nSystem VM snapshot . container snapshot",
    "Binary artifacts stored in the Artifact repository" : "Binary artifacts",
    "code comments" : "code comments are real",
    "Compliance reports" : "Compliance reports are real",
    "Recommend changes in CSRP" : "CSRP-change recommendations",
    "Audit logs;" : "Audit logs are real .",
    "Recommended mitigation actions" : ". Mitigation recommendations.",
    "Dynamic code scan report " : "Dynamic-code-scan report.",
    "and recommended mitigation" : ". Mitigation recommendations.",
    "Issue resolution tracking history" : "Issue-resolution-tracking history",
    "Remediation report and log" : "Remediation report . Remediation log is real",
    "Performance alerts" : "Performance alerts are real",
    "Release package with checksum and digital signature (a bundle of artifacts, such as a self-extracting software installer, or a tar file, etc.)" : "Release package is real",
    "Security findings and warnings" : "Security findings. Security warnings",
    "Static code scan report" : "Static-code-scan report",
    "The percentage of code that is exercised by the tests." : "Code-Test-Coverage-Pct is real",
    "Input data for the system under test" : "Test data",
    "Test results statistics" : "Test-result statistics",
    "Running VM" : "Running-VM is real",
}
# Working copies of the raw text columns; the substitutions are applied to these,
# leaving the original "...Text" columns untouched.
toolsDataFrame["InputsTextNlpPrepped"] = toolsDataFrame["InputsText"]
toolsDataFrame["OutputsTextNlpPrepped"] = toolsDataFrame["OutputsText"]
phaseActivityDataFrame["InputsTextNlpPrepped"] = phaseActivityDataFrame["InputsText"]
phaseActivityDataFrame["OutputsTextNlpPrepped"] = phaseActivityDataFrame["OutputsText"]
phaseActivityDataFrame["ToolsTextNlpPrepped"] = phaseActivityDataFrame["ToolDependenciesText"]

# Apply every substitution, in dictionary order, to the prepped Inputs/Outputs text of both frames.
for searchText, replacementText in inputOutputReplaces.items():
    for frame in (toolsDataFrame, phaseActivityDataFrame):
        for preppedColumn in ("InputsTextNlpPrepped", "OutputsTextNlpPrepped"):
            frame[preppedColumn] = frame[preppedColumn].map(
                lambda cellText: cellText.replace(searchText, replacementText)
            )

# Phrase substitutions applied to tool names / tool-dependency text before noun-chunk extraction.
# Applied in insertion order (e.g. "Logging" is rewritten first, then the resulting
# "Logging-tool tool" is collapsed), so keep the order. Keys must be unique.
toolReplaces = {
    "Monitoring tool suite" : "Monitoring-tool suite",
    "Test tool suite" : "Test-tool suite",
    "Log aggregator" : "Log-aggregator",
    "Log analysis & auditing" : "Log-analysis, Log-auditing, ",
    "Logging" : "Logging-tool",
    "Logging-tool tool" : "Logging-tool",
    "Track test and security scan results" : "test-and-scan tracker",
    "\nAlerting and notification" : ";. Alerting-notification system. ",
    "\nIssue tracking system" : ";. Issue-tracking system;",
    "Compliance as Code" : "Compliance-as-Code is real",
    "Non-security compliance scan" : "Non-security-compliance-scan tool",
    "Artifacts repositories (release, regional)" : "Artifacts repository",
    # The next four phrases are deleted outright (replaced by an empty string).
    "Vulnerability findings;\nRecommended mitigation actions" : "",
    "Test results documenting the performance of the system." : "",
    "Test results documenting the functioning of the system." : "",
    "Test report documenting the performance of the integrated unit." : "",
    "Dependency checking / BOM checking tool" : "Dependency-BOM-checking tool",
    "IDE or document editor or build tool" : "IDE is real. Document editor. Build tool.",
    "IDE or tools come with the database software" : "IDE is real. Database-tools are real.",
}
# Apply every tool substitution, in dictionary order. Note that the tools frame's own
# "Tool" column is rewritten directly, while the activities frame uses its prepped copy.
for searchText, replacementText in toolReplaces.items():
    toolsDataFrame["Tool"] = toolsDataFrame["Tool"].map(
        lambda toolName: toolName.replace(searchText, replacementText)
    )
    phaseActivityDataFrame["ToolsTextNlpPrepped"] = phaseActivityDataFrame["ToolsTextNlpPrepped"].map(
        lambda toolsText: toolsText.replace(searchText, replacementText)
    )


# nlp extraction of noun phrases
def nounChunkTexts(preppedText):
    """Return the text of every noun chunk spaCy finds in `preppedText`."""
    # a period works better than a newline as separator between noun chunks
    return [chunk.text for chunk in nlp(preppedText.replace("\n", " . ")).noun_chunks]

toolsDataFrame["InputsList"] = toolsDataFrame["InputsTextNlpPrepped"].map(nounChunkTexts)
toolsDataFrame["OutputsList"] = toolsDataFrame["OutputsTextNlpPrepped"].map(nounChunkTexts)
phaseActivityDataFrame["InputsList"] = phaseActivityDataFrame["InputsTextNlpPrepped"].map(nounChunkTexts)
phaseActivityDataFrame["OutputsList"] = phaseActivityDataFrame["OutputsTextNlpPrepped"].map(nounChunkTexts)
phaseActivityDataFrame["ToolsList"] = phaseActivityDataFrame["ToolsTextNlpPrepped"].map(nounChunkTexts)

# text left behind: for each row, the ORIGINAL cell text minus every extracted noun chunk
for frame, textColumn, listColumn, remainderColumn in (
    (toolsDataFrame, "InputsText", "InputsList", "InputsTextRemainder"),
    (toolsDataFrame, "OutputsText", "OutputsList", "OutputsTextRemainder"),
    (phaseActivityDataFrame, "InputsText", "InputsList", "InputsTextRemainder"),
    (phaseActivityDataFrame, "OutputsText", "OutputsList", "OutputsTextRemainder"),
    (phaseActivityDataFrame, "ToolDependenciesText", "ToolsList", "ToolsTextRemainder"),
):
    frame[remainderColumn] = [
        remaining_text(cellText, chunkTexts)
        for cellText, chunkTexts in zip(frame[textColumn], frame[listColumn])
    ]


### Cleansed - Now turn item names into identifiers - camelCase

In [None]:
# Identifier for each tool, derived from its (cleansed) name.
toolsDataFrame["ToolIdentifier"] = toolsDataFrame["Tool"].map(cleanCamel)

# Replace every extracted phrase in the list columns by its camelCase identifier.
for frame, listColumns in (
    (toolsDataFrame, ("InputsList", "OutputsList")),
    (phaseActivityDataFrame, ("InputsList", "OutputsList", "ToolsList")),
):
    for listColumn in listColumns:
        frame[listColumn] = frame[listColumn].map(
            lambda phrases: [cleanCamel(phrase) for phrase in phrases]
        )



### Inspection post-cleansing

In [None]:

# Filtered column views to aid visual inspection post-cleansing.
# Each view puts the raw text, the extracted identifier list, the unextracted remainder
# and the NLP-prepped text side by side for one kind of field.
checkToolInputs = toolsDataFrame[["Tool","InputsText","InputsList","InputsTextRemainder", "InputsTextNlpPrepped",]]
checkToolOutputs = toolsDataFrame[["Tool","OutputsText","OutputsList","OutputsTextRemainder", "OutputsTextNlpPrepped"]]
checkPAInputs = phaseActivityDataFrame[["Phase","Activity","InputsText","InputsList","InputsTextRemainder", "InputsTextNlpPrepped"]]
checkPAOutputs = phaseActivityDataFrame[["Phase","Activity","OutputsText","OutputsList","OutputsTextRemainder", "OutputsTextNlpPrepped"]]
checkPATools = phaseActivityDataFrame[["Phase","Activity","ToolDependenciesText","ToolsList","ToolsTextRemainder", "ToolsTextNlpPrepped"]]

## uncomment a line to inspect cleansing results

#json.loads(checkToolInputs.to_json(orient='table'))
#json.loads(checkToolOutputs.to_json(orient='table'))
#json.loads(checkPAInputs.to_json(orient='table'))
#json.loads(checkPAOutputs.to_json(orient='table'))
#json.loads(checkPATools.to_json(orient='table'))


## TODO : Other sheets / regions

# RDF creation

In [None]:
from rdflib import Graph, Namespace, URIRef, Literal, BNode
from rdflib.namespace import SKOS, RDF, RDFS, XSD, NamespaceManager, DCTERMS # DC, DOAP, FOAF, VOID, XMLNS





In [None]:
# setup namespaces
# ONT is the ontology (classes and properties); the others hold instance data,
# one namespace per kind of thing (phases, tools, activities, artifacts).
ONT = Namespace("http://nodehenge.org/ont#")
INST = Namespace("http://nodehenge.org/inst#")
PHASE = Namespace("http://nodehenge.org/inst/phase#")
TOOL = Namespace("http://nodehenge.org/inst/tool#")
ACT = Namespace("http://nodehenge.org/inst/activity#")
ART = Namespace("http://nodehenge.org/inst/artifact#")
# create graph and bind namespaces
def newNodehengeGraph():
  """Return a new, empty Graph with the notebook's namespace prefixes bound."""
  g = Graph() ###base="http://nodehenge.org/inst/")
  prefixBindings = (
      ("rdf", RDF),
      ("rdfs", RDFS),
      ("skos", SKOS),
      ("xsd", XSD),
      ("ont", ONT),
      ("inst", INST),
      ("phase", PHASE),
      ("tool", TOOL),
      ("act", ACT),
      ("art", ART),
  )
  for prefix, namespace in prefixBindings:
    g.bind(prefix, namespace)
  # g.parse("resource/ont.ttl")  ## optional actually
  return g

In [None]:
# populate the top level INST namespace: the concept scheme every generated concept is placed in
gTopLevel = newNodehengeGraph()
schemeStatements = (
    (RDF.type, SKOS.ConceptScheme),
    (DCTERMS.title, Literal("DoD DevSecOps Abstract Phase Activites and Tools Scheme")),
    (DCTERMS.source, URIRef(sourceURI)),
)
for predicate, value in schemeStatements:
    gTopLevel.add((INST.dodDsopScheme, predicate, value))


In [None]:
# populate the PHASE namespace
gPhases = newNodehengeGraph()

# One blank node per phase: the cells of the RDF linked list behind the ordered collection.
orderedPhaseBNodes = {ph: BNode() for ph in phaseIDs}

# The ordered collection itself; its member list starts at the first phase's list cell.
gPhases.add((PHASE.phasing, RDF.type, SKOS.OrderedCollection))
gPhases.add((PHASE.phasing, SKOS.inScheme, INST.dodDsopScheme))
gPhases.add((PHASE.phasing, SKOS.memberList, orderedPhaseBNodes[phaseIDs[0]]))

# Traverse : zip current with the following phase, until it is None. Allows for linked list creation.
# (A plain loop replaces the earlier list comprehension that was built only for its side effects.)
for current, currentName, following in zip(phaseIDs, phaseNames, phaseIDs[1:] + [None]):
    phaseNode = PHASE[current]
    listCell = orderedPhaseBNodes[current]

    gPhases.add((phaseNode, RDF.type, ONT.Phase))
    gPhases.add((phaseNode, SKOS.inScheme, INST.dodDsopScheme))
    gPhases.add((phaseNode, RDF.type, SKOS.Concept))
    gPhases.add((phaseNode, SKOS.prefLabel, Literal(currentName)))

    # list head, then the pointer to the next cell (rdf:nil terminates the list)
    gPhases.add((listCell, RDF.first, phaseNode))
    nextCell = orderedPhaseBNodes[following] if following is not None else RDF.nil
    gPhases.add((listCell, RDF.rest, nextCell))

# Write the file, then show the Turtle text. serialize() called with a destination does
# not return the serialized text, so printing that call's result did not show the graph.
gPhases.serialize(format="turtle", destination="gen/phases.ttl")
print(gPhases.serialize(format="turtle"))

### Load

In [None]:


# Graph of tools and the artifacts they consume / produce.
gToolsArts = newNodehengeGraph()
for rowIndex, row in toolsDataFrame.iterrows():
    toolNode = TOOL[row.ToolIdentifier]

    # The tool itself: typed, placed in the scheme, labelled and described.
    gToolsArts.add((toolNode, RDF.type, ONT.AbstractTool))
    gToolsArts.add((toolNode, RDF.type, SKOS.Concept))
    gToolsArts.add((toolNode, SKOS.inScheme, INST.dodDsopScheme))
    gToolsArts.add((toolNode, SKOS.prefLabel, Literal(row.Tool)))
    gToolsArts.add((toolNode, SKOS.definition, Literal(row.Benefits)))
    gToolsArts.add((toolNode, SKOS.scopeNote, Literal(row.Features)))

    # Every input / output artifact is declared and linked to the tool in both directions
    # using the same predicate.
    for linkPredicate, artifactIds in ((ONT.input, row.InputsList), (ONT.output, row.OutputsList)):
        for item in artifactIds:
            artifactNode = ART[item]
            gToolsArts.add((artifactNode, RDF.type, ONT.AbstractArtifact))
            gToolsArts.add((artifactNode, RDF.type, SKOS.Concept))
            gToolsArts.add((artifactNode, SKOS.inScheme, INST.dodDsopScheme))
            gToolsArts.add((toolNode, linkPredicate, artifactNode))
            gToolsArts.add((artifactNode, linkPredicate, toolNode))

In [None]:
# Graph of activities, linked to their phase, their input/output artifacts and their tools.
gPhaseActsArts = newNodehengeGraph()
for rowIndex, row in phaseActivityDataFrame.iterrows():
    activityNode = ACT[row.ActivityIdentifier]
    phaseNode = PHASE[cleanCamel(row.Phase)]

    # The activity itself: typed, placed in the scheme, labelled and described.
    gPhaseActsArts.add((activityNode, RDF.type, ONT.AbstractActivity))
    gPhaseActsArts.add((activityNode, RDF.type, SKOS.Concept))
    gPhaseActsArts.add((activityNode, SKOS.inScheme, INST.dodDsopScheme))
    gPhaseActsArts.add((activityNode, SKOS.prefLabel, Literal(row.Activity)))
    gPhaseActsArts.add((activityNode, SKOS.definition, Literal(row.Description)))
    gPhaseActsArts.add((activityNode, SKOS.scopeNote, Literal("Baseline: " + row.Baseline)))
    gPhaseActsArts.add((activityNode, SKOS.scopeNote, Literal("Relevance: " + row.SecurityTestingCM)))
    gPhaseActsArts.add((activityNode, SKOS.scopeNote, Literal("SSDF: " + row.SSDF)))
    gPhaseActsArts.add((activityNode, SKOS.editorialNote, Literal("skos:scopeNote currently holding three fields. Could normalize")))

    # Input / output artifacts: declared, then linked to the activity in both directions
    # using the same predicate. NOTE(review): confirm against the ontology that one
    # predicate is meant for both directions.
    for linkPredicate, artifactIds in ((ONT.input, row.InputsList), (ONT.output, row.OutputsList)):
        for item in artifactIds:
            artifactNode = ART[item]
            gPhaseActsArts.add((artifactNode, RDF.type, ONT.AbstractArtifact))
            gPhaseActsArts.add((artifactNode, RDF.type, SKOS.Concept))
            gPhaseActsArts.add((artifactNode, SKOS.inScheme, INST.dodDsopScheme))
            gPhaseActsArts.add((activityNode, linkPredicate, artifactNode))
            gPhaseActsArts.add((artifactNode, linkPredicate, activityNode))

    # Tools the activity depends on.
    for item in row.ToolsList:
        toolNode = TOOL[item]
        # Fix: the class lives in the ontology namespace (ONT.AbstractTool), as used for the
        # tools graph; TOOL.AbstractTool minted a URI in the tool *instance* namespace.
        gPhaseActsArts.add((toolNode, RDF.type, ONT.AbstractTool))
        gPhaseActsArts.add((toolNode, RDF.type, SKOS.Concept))
        gPhaseActsArts.add((toolNode, SKOS.inScheme, INST.dodDsopScheme))
        gPhaseActsArts.add((activityNode, ONT.activityTool, toolNode))
        gPhaseActsArts.add((toolNode, ONT.activityTool, activityNode))

    # Activity <-> phase, in both directions.
    gPhaseActsArts.add((activityNode, ONT.phaseActivity, phaseNode))
    gPhaseActsArts.add((phaseNode, ONT.phaseActivity, activityNode))

    # Record text that extraction left behind, but only when it still contains a letter
    # (leftover punctuation and whitespace are not worth a note).
    for notePrefix, leftoverText in (
        ("Unused Inputs text::", row.InputsTextRemainder),
        ("Unused Outputs text::", row.OutputsTextRemainder),
        ("Unused Tools text::", row.ToolsTextRemainder),
    ):
        if any(letter.isalpha() for letter in leftoverText):
            gPhaseActsArts.add((activityNode, SKOS.editorialNote, Literal(notePrefix + leftoverText)))

In [None]:

# Write each generated graph to its own Turtle file under gen/.
for generatedGraph, destinationPath in (
    (gTopLevel, "gen/topLevel.ttl"),
    (gPhases, "gen/phases.ttl"),
    (gToolsArts, "gen/toolsArts.ttl"),
    (gPhaseActsArts, "gen/phaseActsArts.ttl"),
):
    generatedGraph.serialize(format="turtle", destination=destinationPath)

## Publish the combined generated graph contributions

In [None]:
# publish combined graph
# Parsing every generated Turtle file into one graph merges their triples.
import glob

gCombined = Graph()
# Stamp the output with the script revision (a label on an anonymous node).
versionLabel = Literal("Script Version: " + str(scriptRevision))
gCombined.add((BNode(), RDFS.label, versionLabel))
for generatedFile in glob.glob("gen/*.ttl"):
    gCombined.parse(generatedFile)
gCombined.serialize(format="turtle",destination="publish/nodehenge.org/inst.ttl")

## Some commented-out RDF exploration tests

In [None]:
%%script false --no-raise-error
# some commented-out RDF exploration tests
# The %%script line has to be the very first line of the cell: IPython only recognises a
# cell magic there. With it in place the whole cell body is handed to `false` and never
# runs as Python (note that `g` is not defined anywhere in this notebook).

# ONT = Namespace("http://nodehenge.org/ont#")
# INST = Namespace("http://nodehenge.org/inst#")
# PHASE = Namespace("http://nodehenge.org/inst/phase#")
# TOOL = Namespace("http://nodehenge.org/inst/tool#")
# ACT = Namespace("http://nodehenge.org/inst/activity#")
# ART = Namespace("http://nodehenge.org/inst/artifact#")
context = {"@ont": "http://nodehenge.org/ont#",
           "@inst": "http://nodehenge.org/inst#",
           "@phase": "http://nodehenge.org/inst/phase#",
           "@tool": "http://nodehenge.org/inst/tool#",
           "@act": "http://nodehenge.org/inst/activity#",
           "@artifact": "http://nodehenge.org/inst/artifact#",
           "@skos" :  "http://www.w3.org/2004/02/skos/core#",
           "@language": "en"}
#print(g.serialize(format="json-ld", context=context ))  # example with manually constructed @context
jsontxt=(g.serialize(format="json-ld", auto_compact=True))  #auto_compact create @context entries
jsonobj = json.loads(jsontxt)
json.dumps(jsonobj, separators=(',', ':')) # minified output

In [None]:
%%script false --no-raise-error
# some commented-out RDF exploration tests
# The %%script line has to be the very first line of the cell: IPython only recognises a
# cell magic there. With it in place the whole cell body is handed to `false` and never
# runs as Python (note that `g` is not defined anywhere in this notebook).

g.add((
    URIRef("#nick"),
    SKOS.prefLabel,
    Literal("Nick") #, datatype=XSD.string)
    )
)
g.add((
    URIRef("#bob"),
    SKOS.prefLabel,
    Literal("Bob") #, datatype=XSD.string)
    )
)

print(g.serialize(format="turtle"))

my_query = """
SELECT DISTINCT ?a ?b
WHERE {
    ?a skos:prefLabel "Nick" .
}"""

qres = g.query(my_query)
for row in qres:
    print(f"{row.a} ")

bob=URIRef("#bob")
print(g.value(bob,SKOS.prefLabel))

In [None]:
%%script false --no-raise-error
# Disabled scratch cell: the body is handed to `false`, so none of it runs as Python.
# `g` is not defined anywhere in this notebook.

g.add( (PHASE.plan, SKOS.prefLabel, Literal("Plan")))
print(  PHASE.plan )
print( g.value( PHASE.plan, SKOS.prefLabel ) )

In [None]:
%%script false --no-raise-error
# Disabled scratch cell: the body is handed to `false`, so none of it runs as Python.
# `g` is not defined anywhere in this notebook.

phaseID = "pha+--=sfgsdfg   \n se13"
phaseID2 = "phase13432"
g.remove( (PHASE[cleanCamel(phaseID)], None, None) )  #dict notation as alternative to explicit value and dot notation
g.add( (PHASE[cleanCamel(phaseID)], SKOS.related, PHASE[phaseID2]))
print( g.value( PHASE[cleanCamel(phaseID)], SKOS.related ) )