<a href="https://colab.research.google.com/github/mech0s/nodehenge/blob/main/experiment/doc2rdf-gpt-tests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
revision = 7
print ("Revision", revision)

Revision 7


##  Install steps
Remember to re-run for each fresh google colab session

In [2]:
### commented out - rdfpandas not needed?
#  %pip install rdfpandas
#  %pip install openpyxl
%pip install rdflib

Collecting rdflib
  Downloading rdflib-6.3.2-py3-none-any.whl (528 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m528.1/528.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting isodate<0.7.0,>=0.6.0 (from rdflib)
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-6.3.2


##   Imports

In [3]:
import numpy as np
import pandas as pd
import os
import re
import json

### Optional    google  colab   enhancements

In [4]:
#optional
if 'COLAB_JUPYTER_TOKEN' in os.environ:
  from google.colab import data_table
  data_table.enable_dataframe_formatter()

## Source Data Read/Setup

### Read dodcio DevSecOpsActivitesToolsGuidebookTables.xlsx

In [5]:
def cleanCamel(s):
  r = re.sub('\W|^(?=\d)','_', s) ## cleans to make a valid identifier
  r = re.sub(r"(_|-|\n|!)+", " ", r).title().replace(" ", "")  ## turns to camel-case
  return ''.join([s[0].lower(), r[1:]])

In [6]:
sourceURI = "https://dodcio.defense.gov/Portals/0/Documents/Library/DevSecOpsActivitesToolsGuidebookTables.xlsx"
xls = pd.ExcelFile(sourceURI)

### Setup fixed data

In [7]:
phaseNames = [ 'Plan',
 'Develop',
 'Build',
 'Test',
 'Release',
 'Deliver',
 'Deploy',
 'Operate',
 'Monitor',
 'Feedback']

##  Build phaseActivityDataFrame

One named worksheet per phase: Pull these into a list of DataFrames, adjusting column names to create valid identifiers. Turn NaN entries into blank strings.

In [8]:
phaseDataList = []
phaseOrder = 0
for phName in phaseNames:
  df = pd.read_excel(xls,phName)
  df = df.rename(columns={"Activities":"Activity",
                     "Security / Testing / CM": "SecurityTestingCM",
                     "Tool Dependencies": "ToolDependency",
                     "Tool Dependency": "ToolDependency"
                     })
  # add Phase and order columns - alternative avoid tricky MultiIndex when concatening below
  df["Phase"] = phName
  df["PhaseOrder"] = phaseOrder
  df["OrderInPhase"] = df.index
  phaseOrder+=1
  #
  phaseDataList.append(df.replace(np.nan, ""))

phaseDataList : list of dataframes :- concatenate into one dataframe

In [9]:
phaseActivityDataFrame = pd.concat(phaseDataList)
phaseActivityDataFrame.reset_index(drop=True, inplace=True)
# MultiIndex example: phaseActivityDataFrame = pd.concat(phaseDataList,keys=phaseNames, names=["Phase","IDinPhase"])

In [10]:
phaseActivityDataFrame["ActivityIdentifier"] = phaseActivityDataFrame["Activity"].apply( cleanCamel )

In [11]:
phaseActivityDataFrame.to_json()



In [12]:
phaseActivityDataFrame.rename(columns={"ToolDependency":"ToolDependenciesText", "Inputs":"InputsText", "Outputs":"OutputsText"},inplace=True)
## prepare empty lists to accept values parsed from text fields
phaseActivityDataFrame["ToolsList"]=[[] for _ in range(len(phaseActivityDataFrame))]
phaseActivityDataFrame["InputsList"]=[[] for _ in range(len(phaseActivityDataFrame))]
phaseActivityDataFrame["OutputsList"]=[[] for _ in range(len(phaseActivityDataFrame))]
pajson = phaseActivityDataFrame[["ActivityIdentifier","Phase","ToolDependenciesText","InputsText","OutputsText","ToolsList","InputsList","OutputsList"]].to_json(orient='records')


In [13]:
# check cheet names
xls.sheet_names

['Cover',
 'Overview',
 'Continuous Activities',
 'Plan',
 'Develop',
 'Build',
 'Test',
 'Release',
 'Deliver',
 'Deploy',
 'Operate',
 'Monitor',
 'Feedback',
 'Tools']

In [14]:
toolsDataFrame = pd.read_excel(xls, "Tools")
toolsDataFrame.replace(np.nan, "", inplace=True)

In [15]:
toolsDataFrame["ToolIdentifier"]=toolsDataFrame["Tool"].apply( cleanCamel )
toolsDataFrame.rename(columns={ "Inputs":"InputsText", "Outputs":"OutputsText"},inplace=True)
toolsDataFrame["InputsList"]=[[] for _ in range(len(toolsDataFrame))]
toolsDataFrame["OutputsList"]=[[] for _ in range(len(toolsDataFrame))]
tjson = toolsDataFrame[["ToolIdentifier","InputsText","OutputsText","InputsList","OutputsList"]].to_json(orient='records')


In [16]:
allData = {"ActivitiesData" : json.loads(pajson) ,
 "ToolsData" : json.loads(tjson) }
allDataJson = json.dumps(allData)

In [17]:
toolsDataFrame.iloc[0].InputsList.append(123)
toolsDataFrame.iloc[0].InputsList.clear()
toolsDataFrame.iloc[0]

Tool                                      Alerting and notification
Features          Notify security teams and/or administrators ab...
Benefits          Improve visibility of system events\nReduce sy...
InputsText        Aggregated filtered logs from the Log Aggregat...
OutputsText       Alert messages, emails, etc.\nRemediation repo...
ToolIdentifier                              alertingAndNotification
InputsList                                                       []
OutputsList                                                      []
Name: 0, dtype: object

In [18]:
%pip install openai

Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m573.5 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.8


In [19]:
import openai
from getpass import getpass
openai.api_key = getpass("Paste your OpenAI API Key here (https://platform.openai.com/account/api-keys):")

Paste your OpenAI API Key here (https://platform.openai.com/account/api-keys):··········


In [51]:
tmp  = openai.Model.list()
tmp2 = [x.id for x in tmp.data ]
tmp2.sort()
tmp2

['ada',
 'ada-code-search-code',
 'ada-code-search-text',
 'ada-search-document',
 'ada-search-query',
 'ada-similarity',
 'babbage',
 'babbage-code-search-code',
 'babbage-code-search-text',
 'babbage-search-document',
 'babbage-search-query',
 'babbage-similarity',
 'code-davinci-edit-001',
 'code-search-ada-code-001',
 'code-search-ada-text-001',
 'code-search-babbage-code-001',
 'code-search-babbage-text-001',
 'curie',
 'curie-instruct-beta',
 'curie-search-document',
 'curie-search-query',
 'curie-similarity',
 'davinci',
 'davinci-instruct-beta',
 'davinci-search-document',
 'davinci-search-query',
 'davinci-similarity',
 'gpt-3.5-turbo',
 'gpt-3.5-turbo-0301',
 'gpt-3.5-turbo-0613',
 'gpt-3.5-turbo-16k',
 'gpt-3.5-turbo-16k-0613',
 'text-ada-001',
 'text-babbage-001',
 'text-curie-001',
 'text-davinci-001',
 'text-davinci-002',
 'text-davinci-003',
 'text-davinci-edit-001',
 'text-embedding-ada-002',
 'text-search-ada-doc-001',
 'text-search-ada-query-001',
 'text-search-babbag

In [44]:
[x.id for x in tmp.data if x.id.startswith('gpt')]

['gpt-3.5-turbo-16k-0613',
 'gpt-3.5-turbo-0301',
 'gpt-3.5-turbo-16k',
 'gpt-3.5-turbo-0613',
 'gpt-3.5-turbo']

In [42]:
result = openai.ChatCompletion.create(
  model="gpt-3.5-turbo-16k",
  messages=[
        {"role": "system", "content": "You seek clarification when given a task. You subsequently follow instructions to methodically process every element of provided data."},
        {"role": "user", "content": """
I have JSON that describes typical activities, tools, inputs and outputs for DevSecOps phases.
Plain text fields need to be interpreted as lists of nouns or noun phrases. Can you process the data? """ },
        {"role": "assistant" , "content" : "Certainly! Tell me the key identifier fields and those that need parsed into lists and I will fully process the data you attach, returning it in JSON format."""},
        {"role": "user", "content": """
Identifier fields are Phase and ActivityIdentifier.
I need you to process InputsText, OutputsText and ToolDependenciesText into three separate result lists InputsList, OutputsList and ToolDependenciesList. Here is the data:
------------------
        """ + pajson },
    ]
)
result

<OpenAIObject chat.completion id=chatcmpl-7fpPQGuXIFMkZP4hMJCnqspMAkL0X at 0x792d7c7ad760> JSON: {
  "id": "chatcmpl-7fpPQGuXIFMkZP4hMJCnqspMAkL0X",
  "object": "chat.completion",
  "created": 1690203420,
  "model": "gpt-3.5-turbo-16k-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
      },
      "finish_reason": "length"
    }
  ],
  "usage": {
    "prompt_tokens": 8320,
    "completion_tokens": 8065,
    "total_tokens": 16385
  }
}

In [23]:
result1 = result
print(result1.choices[0].message.content)

import json

data = [
    {"ActivityIdentifier":"changeManagementPlanning","Phase":"Plan","ToolDependenciesText":"Team collaboration system;\nIssue tracking system","InputsText":"Organizational policy;\nSoftware development best practices","OutputsText":"Change control procedures;\nReview procedures;\nControl review board;\nChange management plan","ToolsList":[],"InputsList":[],"OutputsList":[]},
    {"ActivityIdentifier":"configurationIdentification","Phase":"Plan","ToolDependenciesText":"CMDB;\nSource code repository;\nArtifact repository;\nTeam collaboration system","InputsText":"IT infrastructure asset;\nSoftware system components (include DevSecOps tools);\ncode baselines;\ndocument baselines","OutputsText":"Configuration items","ToolsList":[],"InputsList":[],"OutputsList":[]},
    {"ActivityIdentifier":"configurationManagementCmPlanning","Phase":"Plan","ToolDependenciesText":"Team collaboration system;\nIssue tracking system","InputsText":"Software development, security and opera

In [24]:
result2 = result
print(result2.choices[0].message.content)

import json

data = [
    {"ActivityIdentifier":"changeManagementPlanning","Phase":"Plan","ToolDependenciesText":"Team collaboration system;\nIssue tracking system","InputsText":"Organizational policy;\nSoftware development best practices","OutputsText":"Change control procedures;\nReview procedures;\nControl review board;\nChange management plan","ToolsList":[],"InputsList":[],"OutputsList":[]},
    {"ActivityIdentifier":"configurationIdentification","Phase":"Plan","ToolDependenciesText":"CMDB;\nSource code repository;\nArtifact repository;\nTeam collaboration system","InputsText":"IT infrastructure asset;\nSoftware system components (include DevSecOps tools);\ncode baselines;\ndocument baselines","OutputsText":"Configuration items","ToolsList":[],"InputsList":[],"OutputsList":[]},
    {"ActivityIdentifier":"configurationManagementCmPlanning","Phase":"Plan","ToolDependenciesText":"Team collaboration system;\nIssue tracking system","InputsText":"Software development, security and opera

In [25]:
result3 = result
print(result3.choices[0].message.content)

import json

data = [
    {"ActivityIdentifier":"changeManagementPlanning","Phase":"Plan","ToolDependenciesText":"Team collaboration system;\nIssue tracking system","InputsText":"Organizational policy;\nSoftware development best practices","OutputsText":"Change control procedures;\nReview procedures;\nControl review board;\nChange management plan","ToolsList":[],"InputsList":[],"OutputsList":[]},
    {"ActivityIdentifier":"configurationIdentification","Phase":"Plan","ToolDependenciesText":"CMDB;\nSource code repository;\nArtifact repository;\nTeam collaboration system","InputsText":"IT infrastructure asset;\nSoftware system components (include DevSecOps tools);\ncode baselines;\ndocument baselines","OutputsText":"Configuration items","ToolsList":[],"InputsList":[],"OutputsList":[]},
    {"ActivityIdentifier":"configurationManagementCmPlanning","Phase":"Plan","ToolDependenciesText":"Team collaboration system;\nIssue tracking system","InputsText":"Software development, security and opera

### Dataframe select, filter, order examples

In [34]:
### Dataframe slicing examples
phaseActivityDataFrame.iloc[3:39]

Unnamed: 0,Activity,Baseline,SSDF,Description,InputsText,OutputsText,ToolDependenciesText,SecurityTestingCM,Phase,PhaseOrder,OrderInPhase,ActivityIdentifier,ToolsList,InputsList,OutputsList
3,Database design,PREFERRED,"PO.1.2, PO.3.1, PO.5.2, PW.1.1, PW.5.1",Data modeling; \nDatabase selection;\nDatabase...,System requirement;\nSystem design,- Database design document\n-,Data modeling tool;\nTeams collaboration system,,Plan,0,3,databaseDesign,[],[],[]
4,Design review,PREFERRED,"PO.1.2, PW.1.2, PW.2.1, PW.8.2, RV.2.2",Review and approve plans and documents,Plans and design documents;,Review comments;\nAction items,Team collaboration system,Configuration Management,Plan,0,4,designReview,[],[],[]
5,DevSecOps process design,REQUIRED,PO.1.1,Design the DevSecOps process workflows that ar...,Change management process;\nSystem design;\nRe...,DevSecOps process flow chart;\nDevSecOps ecosy...,Team collaboration system,,Plan,0,5,devsecopsProcessDesign,[],[],[]
6,Documentation version control,REQUIRED,"PO.1.1, PO.1.2, PO.1.3, PS.1.1",Track design changes,Plans and design documents;,Version controlled documents,Team collaboration system,Configuration Management,Plan,0,6,documentationVersionControl,[],[],[]
7,IaC deployment,REQUIRED,"PO.3.2, PO.3.3",Deploy infrastructure and set up environment u...,Artifacts (Infrastructure as Code)\nInfrastruc...,The environment ready,Configuration automation tool;\nIaC,,Plan,0,7,iacDeployment,[],[],[]
8,Mission-Based Cyber Risk Assessments,REQUIRED,"PW.7.2, RV.1.1, RV.1.2, RV.2.1, RV.3.1, RV.3.2...",An assessment of risks based upon the stated m...,NIST 800-53 RMF Control Implementations\nFIPS ...,Risk assessment,Risk assessment tool,Security,Plan,0,8,missionBasedCyberRiskAssessments,[],[],[]
9,Project/Release planning,REQUIRED,"PS.3.1, PS.3.2",Project task management\nRelease planning,Project charter\nProject constraints,Project Plan\nTask plan & schedule\nRelease pl...,Team collaboration system;\nProject management...,,Plan,0,9,projectReleasePlanning,[],[],[]
10,Project team onboarding planning,REQUIRED,"PO.2.1, PO.2.2, PO.2.3","Plan the project team onboarding process, inte...",Organization policy,Onboarding plan,Team collaboration system,,Plan,0,10,projectTeamOnboardingPlanning,[],[],[]
11,Risk management,REQUIRED,"PO.1.2, PO.3.1, PO.4.1, PW.1.1, PW.1.2, PW.2.1...",Risk assessment,System architecture;\nSupply chain information...,Risk management plan,Team collaboration system;,,Plan,0,11,riskManagement,[],[],[]
12,Software requirement analysis,REQUIRED,"PO.1.1, PO.1.2, PO.1.3",Gather the requirements from all stakeholders,Stakeholder inputs or feedback;\nOperation mon...,Requirements Documents:\n- Feature requirement...,Requirements tool;\nTeam collaboration system;...,,Plan,0,12,softwareRequirementAnalysis,[],[],[]


In [27]:
### Dataframe ordering and filtering examples
phaseActivityDataFrame.sort_values(["PhaseOrder","OrderInPhase"])[phaseActivityDataFrame["Phase"]>="Plan"]

Unnamed: 0,Activity,Baseline,SSDF,Description,InputsText,OutputsText,ToolDependenciesText,SecurityTestingCM,Phase,PhaseOrder,OrderInPhase,ActivityIdentifier,ToolsList,InputsList,OutputsList
0,Change management planning,REQUIRED,"PO.1.1, PS.1.1, PS.3.1, PW.6.1",Plan the change control process,Organizational policy;\nSoftware development b...,Change control procedures;\nReview procedures;...,Team collaboration system;\nIssue tracking system,,Plan,0,0,changeManagementPlanning,[],[],[]
1,Configuration identification,REQUIRED,"PO.2.1, PS.1.1, PW.2.1, PW.4.1, PW.4.2, PW.6.2",Discover or manual input configuration items i...,IT infrastructure asset;\nSoftware system comp...,Configuration items,CMDB;\nSource code repository;\nArtifact repos...,Configuration Management,Plan,0,1,configurationIdentification,[],[],[]
2,Configuration management (CM) planning,REQUIRED,"PO.3.1, PO.3.3, PO.4.1, PO.4.2, PW.2.1",Plan the configuration control process;\nIdent...,"Software development, security and operations ...",CM processes and plan;\nCM tool selection;\nRe...,Team collaboration system;\nIssue tracking system,Configuration Management,Plan,0,2,configurationManagementCmPlanning,[],[],[]
3,Database design,PREFERRED,"PO.1.2, PO.3.1, PO.5.2, PW.1.1, PW.5.1",Data modeling; \nDatabase selection;\nDatabase...,System requirement;\nSystem design,- Database design document\n-,Data modeling tool;\nTeams collaboration system,,Plan,0,3,databaseDesign,[],[],[]
4,Design review,PREFERRED,"PO.1.2, PW.1.2, PW.2.1, PW.8.2, RV.2.2",Review and approve plans and documents,Plans and design documents;,Review comments;\nAction items,Team collaboration system,Configuration Management,Plan,0,4,designReview,[],[],[]
5,DevSecOps process design,REQUIRED,PO.1.1,Design the DevSecOps process workflows that ar...,Change management process;\nSystem design;\nRe...,DevSecOps process flow chart;\nDevSecOps ecosy...,Team collaboration system,,Plan,0,5,devsecopsProcessDesign,[],[],[]
6,Documentation version control,REQUIRED,"PO.1.1, PO.1.2, PO.1.3, PS.1.1",Track design changes,Plans and design documents;,Version controlled documents,Team collaboration system,Configuration Management,Plan,0,6,documentationVersionControl,[],[],[]
7,IaC deployment,REQUIRED,"PO.3.2, PO.3.3",Deploy infrastructure and set up environment u...,Artifacts (Infrastructure as Code)\nInfrastruc...,The environment ready,Configuration automation tool;\nIaC,,Plan,0,7,iacDeployment,[],[],[]
8,Mission-Based Cyber Risk Assessments,REQUIRED,"PW.7.2, RV.1.1, RV.1.2, RV.2.1, RV.3.1, RV.3.2...",An assessment of risks based upon the stated m...,NIST 800-53 RMF Control Implementations\nFIPS ...,Risk assessment,Risk assessment tool,Security,Plan,0,8,missionBasedCyberRiskAssessments,[],[],[]
9,Project/Release planning,REQUIRED,"PS.3.1, PS.3.2",Project task management\nRelease planning,Project charter\nProject constraints,Project Plan\nTask plan & schedule\nRelease pl...,Team collaboration system;\nProject management...,,Plan,0,9,projectReleasePlanning,[],[],[]


In [28]:
### Dataframe selection examples
phaseActivityDataFrame[["Phase","Baseline"]]

Unnamed: 0,Phase,Baseline
0,Plan,REQUIRED
1,Plan,REQUIRED
2,Plan,REQUIRED
3,Plan,PREFERRED
4,Plan,PREFERRED
...,...,...
137,Monitor,REQUIRED
138,Monitor,PREFERRED
139,Monitor,REQUIRED
140,Feedback,REQUIRED


### more...

## TODO : Other sheets / regions

# RDF creation

In [29]:
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import SKOS, RDF, RDFS, XSD, NamespaceManager # DC, DCTERMS, DOAP, FOAF, SKOS, OWL, RDF, RDFS, VOID, XMLNS, XSD

ONT = Namespace("http://nodehenge.org/ont#")
PHASE = Namespace("http://nodehenge.org/inst/phase#")
TOOL = Namespace("http://nodehenge.org/inst/tool#")
ACT = Namespace("http://nodehenge.org/inst/activity#")
ART = Namespace("http://nodehenge.org/inst/artifact#")
g = Graph() ###base="http://nodehenge.org/inst/")
g.bind("rdf", RDF)
g.bind("rdfs", RDFS)
g.bind("skos", SKOS)
g.bind("xsd", XSD)
g.bind("ont", ONT)
g.bind("phase", PHASE)
g.bind("tool", TOOL)
g.bind("act", ACT)
g.bind("art", ART)

In [30]:
# some commented-out tests
%%script false --no-raise-error

g.add((
    URIRef("#nick"),
    SKOS.prefLabel,
    Literal("Nick") #, datatype=XSD.string)
    )
)
g.add((
    URIRef("#bob"),
    SKOS.prefLabel,
    Literal("Bob") #, datatype=XSD.string)
    )
)

print(g.serialize(format="turtle"))

my_query = """
SELECT DISTINCT ?a ?b
WHERE {
    ?a skos:prefLabel "Nick" .
}"""

qres = g.query(my_query)
for row in qres:
    print(f"{row.a} ")

bob=URIRef("#bob")
print(g.value(bob,SKOS.prefLabel))
#```

In [31]:
%%script false --no-raise-error

g.add( (PHASE.plan, SKOS.prefLabel, Literal("Plan")))
print(  PHASE.plan )
print( g.value( PHASE.plan, SKOS.prefLabel ) )

In [32]:
%%script false --no-raise-error

phaseID = "pha+--=sfgsdfg   \n se13"
phaseID2 = "phase13432"
g.remove( (PHASE[cleanCamel(phaseID)], None, None) )  #dict notation as alternative to explicit value and dot notation
g.add( (PHASE[cleanCamel(phaseID)], SKOS.related, PHASE[phaseID2]))
print( g.value( PHASE[cleanCamel(phaseID)], SKOS.related ) )

### Load

... pre-load from onto.ttl here ?
g.parse('publish/nodehenge.org/ont.ttl')

In [33]:
print(g.serialize(format="turtle"))



