<a href="https://colab.research.google.com/github/mech0s/nodehenge/blob/main/doc2rdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
revision = 7
print ("Revision", revision)

##  Install steps
Remember to re-run for each fresh google colab session

In [None]:
### commented out - rdfpandas not needed?
#  %pip install rdfpandas
#  %pip install openpyxl
%pip install rdflib

##   Imports

In [204]:
import numpy as np
import pandas as pd
import os
import re
import json

### Optional    google  colab   enhancements

In [None]:
#optional
if 'COLAB_JUPYTER_TOKEN' in os.environ:
  from google.colab import data_table
  data_table.enable_dataframe_formatter()

## Source Data Read/Setup

### Read dodcio DevSecOpsActivitesToolsGuidebookTables.xlsx

In [None]:
def cleanCamel(s):
  r = re.sub('\W|^(?=\d)','_', s) ## cleans to make a valid identifier
  r = re.sub(r"(_|-|\n|!)+", " ", r).title().replace(" ", "")  ## turns to camel-case
  return ''.join([s[0].lower(), r[1:]])

In [None]:
sourceURI = "https://dodcio.defense.gov/Portals/0/Documents/Library/DevSecOpsActivitesToolsGuidebookTables.xlsx"
xls = pd.ExcelFile(sourceURI)

### Setup fixed data

In [None]:
phaseNames = [ 'Plan',
 'Develop',
 'Build',
 'Test',
 'Release',
 'Deliver',
 'Deploy',
 'Operate',
 'Monitor',
 'Feedback']

##  Build phaseActivityDataFrame

One named worksheet per phase: Pull these into a list of DataFrames, adjusting column names to create valid identifiers. Turn NaN entries into blank strings.

In [None]:
phaseDataList = []
phaseOrder = 0
for phName in phaseNames:
  df = pd.read_excel(xls,phName)
  df = df.rename(columns={"Activities":"Activity",
                     "Security / Testing / CM": "SecurityTestingCM",
                     "Tool Dependencies": "ToolDependency",
                     "Tool Dependency": "ToolDependency"
                     })
  # add Phase and order columns - alternative avoid tricky MultiIndex when concatening below
  df["Phase"] = phName
  df["PhaseOrder"] = phaseOrder
  df["OrderInPhase"] = df.index
  phaseOrder+=1
  #
  phaseDataList.append(df.replace(np.nan, ""))

phaseDataList : list of dataframes :- concatenate into one dataframe

In [198]:
phaseActivityDataFrame = pd.concat(phaseDataList)
phaseActivityDataFrame.reset_index(drop=True, inplace=True)
# MultiIndex example: phaseActivityDataFrame = pd.concat(phaseDataList,keys=phaseNames, names=["Phase","IDinPhase"])

In [199]:
phaseActivityDataFrame["ActivityIdentifier"] = phaseActivityDataFrame["Activity"].apply( cleanCamel )

In [200]:
phaseActivityDataFrame.to_json()



In [208]:
phaseActivityDataFrame.rename(columns={"ToolDependency":"ToolDependenciesText", "Inputs":"InputsText", "Outputs":"OutputsText"},inplace=True)
## prepare empty lists to accept values parsed from text fields
phaseActivityDataFrame["ToolsList"]=[[] for _ in range(len(phaseActivityDataFrame))]
phaseActivityDataFrame["InputsList"]=[[] for _ in range(len(phaseActivityDataFrame))]
phaseActivityDataFrame["OutputsList"]=[[] for _ in range(len(phaseActivityDataFrame))]
pajson = phaseActivityDataFrame[["Activity","Phase","ToolDependenciesText","InputsText","OutputsText","ToolsList","InputsList","OutputsList"]].to_json(orient='records')


{'ActivityData': [{'Activity': 'Change management planning',
   'Phase': 'Plan',
   'ToolDependenciesText': 'Team collaboration system;\nIssue tracking system',
   'InputsText': 'Organizational policy;\nSoftware development best practices',
   'OutputsText': 'Change control procedures;\nReview procedures;\nControl review board;\nChange management plan',
   'ToolsList': [],
   'InputsList': [],
   'OutputsList': []},
  {'Activity': 'Configuration identification',
   'Phase': 'Plan',
   'ToolDependenciesText': 'CMDB;\nSource code repository;\nArtifact repository;\nTeam collaboration system',
   'InputsText': 'IT infrastructure asset;\nSoftware system components (include DevSecOps tools);\ncode baselines;\ndocument baselines',
   'OutputsText': 'Configuration items',
   'ToolsList': [],
   'InputsList': [],
   'OutputsList': []},
  {'Activity': 'Configuration management (CM) planning',
   'Phase': 'Plan',
   'ToolDependenciesText': 'Team collaboration system;\nIssue tracking system',
   '

In [152]:
# check cheet names
xls.sheet_names

['Cover',
 'Overview',
 'Continuous Activities',
 'Plan',
 'Develop',
 'Build',
 'Test',
 'Release',
 'Deliver',
 'Deploy',
 'Operate',
 'Monitor',
 'Feedback',
 'Tools']

In [209]:
toolsDataFrame = pd.read_excel(xls, "Tools")
toolsDataFrame.replace(np.nan, "", inplace=True)

In [215]:
toolsDataFrame["ToolIdentifier"]=toolsDataFrame["Tool"].apply( cleanCamel )
toolsDataFrame.rename(columns={ "Inputs":"InputsText", "Outputs":"OutputsText"},inplace=True)
toolsDataFrame["InputsList"]=[[] for _ in range(len(toolsDataFrame))]
toolsDataFrame["OutputsList"]=[[] for _ in range(len(toolsDataFrame))]
tjson = toolsDataFrame[["Tool","InputsText","OutputsText","InputsList","OutputsList"]].to_json(orient='records')


In [216]:
allData = {"ActivitiesData" : json.loads(pajson) ,
 "ToolsData" : json.loads(tjson) }
allDataJson = json.dumps(allData)

In [240]:
toolsDataFrame.iloc[0].InputsList.append(123)
toolsDataFrame.iloc[0].InputsList.clear()
toolsDataFrame.iloc[0]

Tool                                      Alerting and notification
Features          Notify security teams and/or administrators ab...
Benefits          Improve visibility of system events\nReduce sy...
InputsText        Aggregated filtered logs from the Log Aggregat...
OutputsText       Alert messages, emails, etc.\nRemediation repo...
ToolIdentifier                              alertingAndNotification
InputsList                                [123, 123, 123, 123, 123]
OutputsList                                                      []
Name: 0, dtype: object

### Dataframe select, filter, order examples

In [None]:
### Dataframe slicing examples
phaseActivityDataFrame.iloc[3:39]

In [None]:
### Dataframe ordering and filtering examples
phaseActivityDataFrame.sort_values(["PhaseOrder","OrderInPhase"])[phaseActivityDataFrame["Phase"]>="Plan"]

In [None]:
### Dataframe selection examples
phaseActivityDataFrame[["Phase","Baseline"]]

### more...

## TODO : Other sheets / regions

# RDF creation

In [150]:
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import SKOS, RDF, RDFS, XSD, NamespaceManager # DC, DCTERMS, DOAP, FOAF, SKOS, OWL, RDF, RDFS, VOID, XMLNS, XSD

ONT = Namespace("http://nodehenge.org/ont#")
PHASE = Namespace("http://nodehenge.org/inst/phase#")
TOOL = Namespace("http://nodehenge.org/inst/tool#")
ACT = Namespace("http://nodehenge.org/inst/activity#")
ART = Namespace("http://nodehenge.org/inst/artifact#")
g = Graph() ###base="http://nodehenge.org/inst/")
g.bind("rdf", RDF)
g.bind("rdfs", RDFS)
g.bind("skos", SKOS)
g.bind("xsd", XSD)
g.bind("ont", ONT)
g.bind("phase", PHASE)
g.bind("tool", TOOL)
g.bind("act", ACT)
g.bind("art", ART)

In [143]:
# some commented-out tests
%%script false --no-raise-error

g.add((
    URIRef("#nick"),
    SKOS.prefLabel,
    Literal("Nick") #, datatype=XSD.string)
    )
)
g.add((
    URIRef("#bob"),
    SKOS.prefLabel,
    Literal("Bob") #, datatype=XSD.string)
    )
)

print(g.serialize(format="turtle"))

my_query = """
SELECT DISTINCT ?a ?b
WHERE {
    ?a skos:prefLabel "Nick" .
}"""

qres = g.query(my_query)
for row in qres:
    print(f"{row.a} ")

bob=URIRef("#bob")
print(g.value(bob,SKOS.prefLabel))
#```

UsageError: Line magic function `%%script` not found.


In [None]:
%%script false --no-raise-error

g.add( (PHASE.plan, SKOS.prefLabel, Literal("Plan")))
print(  PHASE.plan )
print( g.value( PHASE.plan, SKOS.prefLabel ) )

In [None]:
%%script false --no-raise-error

phaseID = "pha+--=sfgsdfg   \n se13"
phaseID2 = "phase13432"
g.remove( (PHASE[cleanCamel(phaseID)], None, None) )  #dict notation as alternative to explicit value and dot notation
g.add( (PHASE[cleanCamel(phaseID)], SKOS.related, PHASE[phaseID2]))
print( g.value( PHASE[cleanCamel(phaseID)], SKOS.related ) )

### Load

... pre-load from onto.ttl here ?
g.parse('publish/nodehenge.org/ont.ttl')

In [151]:
print(g.serialize(format="turtle"))



