<a href="https://colab.research.google.com/github/mech0s/nodehenge/blob/main/doc2rdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
revision = 7
print ("Revision", revision)

##  Install steps
Remember to re-run for each fresh google colab session

In [None]:
### commented out - rdfpandas not needed?
#  %pip install rdfpandas
#  %pip install openpyxl
%pip install rdflib

##   Imports

In [204]:
import numpy as np
import pandas as pd
import os
import re
import json

### Optional    google  colab   enhancements

In [None]:
#optional
if 'COLAB_JUPYTER_TOKEN' in os.environ:
  from google.colab import data_table
  data_table.enable_dataframe_formatter()

## Source Data Read/Setup

### Read dodcio DevSecOpsActivitesToolsGuidebookTables.xlsx

In [None]:
def cleanCamel(s):
  r = re.sub('\W|^(?=\d)','_', s) ## cleans to make a valid identifier
  r = re.sub(r"(_|-|\n|!)+", " ", r).title().replace(" ", "")  ## turns to camel-case
  return ''.join([s[0].lower(), r[1:]])

In [None]:
sourceURI = "https://dodcio.defense.gov/Portals/0/Documents/Library/DevSecOpsActivitesToolsGuidebookTables.xlsx"
xls = pd.ExcelFile(sourceURI)

### Setup fixed data

In [None]:
phaseNames = [ 'Plan',
 'Develop',
 'Build',
 'Test',
 'Release',
 'Deliver',
 'Deploy',
 'Operate',
 'Monitor',
 'Feedback']

##  Build phaseActivityDataFrame

One named worksheet per phase: Pull these into a list of DataFrames, adjusting column names to create valid identifiers. Turn NaN entries into blank strings.

In [None]:
phaseDataList = []
phaseOrder = 0
for phName in phaseNames:
  df = pd.read_excel(xls,phName)
  df = df.rename(columns={"Activities":"Activity",
                     "Security / Testing / CM": "SecurityTestingCM",
                     "Tool Dependencies": "ToolDependency",
                     "Tool Dependency": "ToolDependency"
                     })
  # add Phase and order columns - alternative avoid tricky MultiIndex when concatening below
  df["Phase"] = phName
  df["PhaseOrder"] = phaseOrder
  df["OrderInPhase"] = df.index
  phaseOrder+=1
  #
  phaseDataList.append(df.replace(np.nan, ""))

phaseDataList : list of dataframes :- concatenate into one dataframe

In [198]:
phaseActivityDataFrame = pd.concat(phaseDataList)
phaseActivityDataFrame.reset_index(drop=True, inplace=True)
# MultiIndex example: phaseActivityDataFrame = pd.concat(phaseDataList,keys=phaseNames, names=["Phase","IDinPhase"])

In [199]:
phaseActivityDataFrame["ActivityIdentifier"] = phaseActivityDataFrame["Activity"].apply( cleanCamel )

In [200]:
phaseActivityDataFrame.to_json()



In [259]:
phaseActivityDataFrame.rename(columns={"ToolDependency":"ToolDependenciesText", "Inputs":"InputsText", "Outputs":"OutputsText"},inplace=True)
## prepare empty lists to accept values parsed from text fields
phaseActivityDataFrame["ToolsList"]=[[] for _ in range(len(phaseActivityDataFrame))]
phaseActivityDataFrame["InputsList"]=[[] for _ in range(len(phaseActivityDataFrame))]
phaseActivityDataFrame["OutputsList"]=[[] for _ in range(len(phaseActivityDataFrame))]
pajson = phaseActivityDataFrame[["ActivityIdentifier","Phase","ToolDependenciesText","InputsText","OutputsText","ToolsList","InputsList","OutputsList"]].to_json(orient='records')


In [152]:
# check cheet names
xls.sheet_names

['Cover',
 'Overview',
 'Continuous Activities',
 'Plan',
 'Develop',
 'Build',
 'Test',
 'Release',
 'Deliver',
 'Deploy',
 'Operate',
 'Monitor',
 'Feedback',
 'Tools']

In [209]:
toolsDataFrame = pd.read_excel(xls, "Tools")
toolsDataFrame.replace(np.nan, "", inplace=True)

In [261]:
toolsDataFrame["ToolIdentifier"]=toolsDataFrame["Tool"].apply( cleanCamel )
toolsDataFrame.rename(columns={ "Inputs":"InputsText", "Outputs":"OutputsText"},inplace=True)
toolsDataFrame["InputsList"]=[[] for _ in range(len(toolsDataFrame))]
toolsDataFrame["OutputsList"]=[[] for _ in range(len(toolsDataFrame))]
tjson = toolsDataFrame[["ToolIdentifier","InputsText","OutputsText","InputsList","OutputsList"]].to_json(orient='records')


In [262]:
allData = {"ActivitiesData" : json.loads(pajson) ,
 "ToolsData" : json.loads(tjson) }
allDataJson = json.dumps(allData)

In [240]:
toolsDataFrame.iloc[0].InputsList.append(123)
toolsDataFrame.iloc[0].InputsList.clear()
toolsDataFrame.iloc[0]

Tool                                      Alerting and notification
Features          Notify security teams and/or administrators ab...
Benefits          Improve visibility of system events\nReduce sy...
InputsText        Aggregated filtered logs from the Log Aggregat...
OutputsText       Alert messages, emails, etc.\nRemediation repo...
ToolIdentifier                              alertingAndNotification
InputsList                                [123, 123, 123, 123, 123]
OutputsList                                                      []
Name: 0, dtype: object

In [241]:
%pip install openai

Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
                                              0.0/73.6 kB ? eta -:--:--
     ---------------                        30.7/73.6 kB 660.6 kB/s eta 0:00:01
     ---------------------------------------- 73.6/73.6 kB 1.0 MB/s eta 0:00:00
Collecting requests>=2.20 (from openai)
  Using cached requests-2.31.0-py3-none-any.whl (62 kB)
Collecting tqdm (from openai)
  Downloading tqdm-4.65.0-py3-none-any.whl (77 kB)
                                              0.0/77.1 kB ? eta -:--:--
     ---------------------------------------- 77.1/77.1 kB 4.2 MB/s eta 0:00:00
Collecting aiohttp (from openai)
  Downloading aiohttp-3.8.4-cp310-cp310-win_amd64.whl (319 kB)
                                              0.0/319.8 kB ? eta -:--:--
     ---------                               81.9/319.8 kB 2.3 MB/s eta 0:00:01
     -----------------------                194.6/319.8 kB 2.4 MB/s eta 0:00:01
     ----------------------------------

In [243]:
import openai
from getpass import getpass
openai.api_key = getpass("Paste your OpenAI API Key here (https://platform.openai.com/account/api-keys):")

In [246]:
tmp  = openai.Model.list()

In [256]:
[x.id for x in tmp.data if x.id.startswith('gpt')]

['gpt-3.5-turbo-16k-0613',
 'gpt-3.5-turbo-16k',
 'gpt-3.5-turbo-0301',
 'gpt-3.5-turbo',
 'gpt-3.5-turbo-0613']

In [293]:
result = openai.ChatCompletion.create(
  model="gpt-3.5-turbo-16k",
  messages=[
        {"role": "system", "content": "You methodically process every input row of JSON data, returning every output row of JSON as specified."},
        {"role": "user", "content": pajson + """

The above table describes typical activities, tools, inputs and outputs for DevSecOps phases. 
Plain text fields need to be interpreted as item lists: item separators may include 'and', newlines and punctuation such as comma and semi-colon.
Return a JSON table with the fields:
        Phase;
        ActivityIdentifier;
        InputsList : a list of input artifacts for the activity, parsed from the InputsText plain text field;
        OutputsList : a list of output artifacts from the activity, parsed from the OutputsText plain text field;
        ToolsList : a list of tools used for processing artifacts during the activity, patsed from the ToolDependenciesText plain text field."""},
    ]
)
result

<OpenAIObject chat.completion id=chatcmpl-7aYB91UEAJ58kC6TeG7WUfeLrrOrb at 0x24910e68cc0> JSON: {
  "id": "chatcmpl-7aYB91UEAJ58kC6TeG7WUfeLrrOrb",
  "object": "chat.completion",
  "created": 1688945547,
  "model": "gpt-3.5-turbo-16k-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
      },
      "finish_reason": "length"
    }
  ],
  "usage": {
    "prompt_tokens": 8326,
    "completion_tokens": 8059,
    "total_tokens": 16385
  }
}

In [272]:
result1 = result
print(result1.choices[0].message.content)

The processed JSON data is as follows:



In [275]:
result2 = result
print(result2.choices[0].message.content)

Processed JSON data follows:

[
  {
    "ActivityIdentifier": "changeManagementPlanning",
    "Phase": "Plan",
    "ToolDependenciesList": ["Team collaboration system", "Issue tracking system"],
    "InputsList": ["Organizational policy", "Software development best practices"],
    "OutputsList": ["Change control procedures", "Review procedures", "Control review board", "Change management plan"]
  },
  {
    "ActivityIdentifier": "configurationIdentification",
    "Phase": "Plan",
    "ToolDependenciesList": ["CMDB", "Source code repository", "Artifact repository", "Team collaboration system"],
    "InputsList": ["IT infrastructure asset", "Software system components (include DevSecOps tools)", "code baselines", "document baselines"],
    "OutputsList": ["Configuration items"]
  },
  {
    "ActivityIdentifier": "configurationManagementCmPlanning",
    "Phase": "Plan",
    "ToolDependenciesList": ["Team collaboration system", "Issue tracking system"],
    "InputsList": ["Software develo

In [291]:
result3 = result
print(result3.choices[0].message.content)

{
  "Data": [
    {
      "Phase": "Plan",
      "ActivityIdentifier": "changeManagementPlanning",
      "InputsList": [
        "Organizational policy",
        "Software development best practices"
      ],
      "OutputsList": [
        "Change control procedures",
        "Review procedures",
        "Control review board",
        "Change management plan"
      ],
      "ToolsList": [
        "Team collaboration system",
        "Issue tracking system"
      ]
    },
    {
      "Phase": "Plan",
      "ActivityIdentifier": "configurationIdentification",
      "InputsList": [
        "IT infrastructure asset",
        "Software system components (include DevSecOps tools)",
        "code baselines",
        "document baselines"
      ],
      "OutputsList": [
        "Configuration items"
      ],
      "ToolsList": [
        "CMDB",
        "Source code repository",
        "Artifact repository",
        "Team collaboration system"
      ]
    },
    {
      "Phase": "Plan",
      

### Dataframe select, filter, order examples

In [None]:
### Dataframe slicing examples
phaseActivityDataFrame.iloc[3:39]

In [None]:
### Dataframe ordering and filtering examples
phaseActivityDataFrame.sort_values(["PhaseOrder","OrderInPhase"])[phaseActivityDataFrame["Phase"]>="Plan"]

In [None]:
### Dataframe selection examples
phaseActivityDataFrame[["Phase","Baseline"]]

### more...

## TODO : Other sheets / regions

# RDF creation

In [150]:
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import SKOS, RDF, RDFS, XSD, NamespaceManager # DC, DCTERMS, DOAP, FOAF, SKOS, OWL, RDF, RDFS, VOID, XMLNS, XSD

ONT = Namespace("http://nodehenge.org/ont#")
PHASE = Namespace("http://nodehenge.org/inst/phase#")
TOOL = Namespace("http://nodehenge.org/inst/tool#")
ACT = Namespace("http://nodehenge.org/inst/activity#")
ART = Namespace("http://nodehenge.org/inst/artifact#")
g = Graph() ###base="http://nodehenge.org/inst/")
g.bind("rdf", RDF)
g.bind("rdfs", RDFS)
g.bind("skos", SKOS)
g.bind("xsd", XSD)
g.bind("ont", ONT)
g.bind("phase", PHASE)
g.bind("tool", TOOL)
g.bind("act", ACT)
g.bind("art", ART)

In [143]:
# some commented-out tests
%%script false --no-raise-error

g.add((
    URIRef("#nick"),
    SKOS.prefLabel,
    Literal("Nick") #, datatype=XSD.string)
    )
)
g.add((
    URIRef("#bob"),
    SKOS.prefLabel,
    Literal("Bob") #, datatype=XSD.string)
    )
)

print(g.serialize(format="turtle"))

my_query = """
SELECT DISTINCT ?a ?b
WHERE {
    ?a skos:prefLabel "Nick" .
}"""

qres = g.query(my_query)
for row in qres:
    print(f"{row.a} ")

bob=URIRef("#bob")
print(g.value(bob,SKOS.prefLabel))
#```

UsageError: Line magic function `%%script` not found.


In [None]:
%%script false --no-raise-error

g.add( (PHASE.plan, SKOS.prefLabel, Literal("Plan")))
print(  PHASE.plan )
print( g.value( PHASE.plan, SKOS.prefLabel ) )

In [None]:
%%script false --no-raise-error

phaseID = "pha+--=sfgsdfg   \n se13"
phaseID2 = "phase13432"
g.remove( (PHASE[cleanCamel(phaseID)], None, None) )  #dict notation as alternative to explicit value and dot notation
g.add( (PHASE[cleanCamel(phaseID)], SKOS.related, PHASE[phaseID2]))
print( g.value( PHASE[cleanCamel(phaseID)], SKOS.related ) )

### Load

... pre-load from onto.ttl here ?
g.parse('publish/nodehenge.org/ont.ttl')

In [151]:
print(g.serialize(format="turtle"))



