# Code for simulating text descriptions of threat attributes for a list of CWE entries

## Preparation

Read Raw CWE list (csv)

In [1]:
import csv

In [2]:
# Specify the file path
#file_path_input = 'Raw Lists/1387.csv'
#file_path_input = 'Raw Lists/699.csv'
file_path_input = 'Raw Lists/1194.csv'

# Open the file
# newline='' leaves line-ending handling to the csv module, so line breaks
# embedded inside quoted fields are parsed as part of the field.
with open(file_path_input, mode='r', newline='', encoding='utf-8') as file:
    # Create a CSV reader
    csv_reader = csv.reader(file)

    # Read the headers (first row) into a list
    headers = next(csv_reader)

    # Every remaining row becomes one list of column values in `data`
    data = list(csv_reader)

Setup for OpenAI GPT API request

In [3]:
import os

import openai

# Provide OpenAI API Key
# Prefer the OPENAI_API_KEY environment variable so that the key never has to
# be saved inside the notebook; the placeholder is only used as a fallback
# when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "fill your API key")

# Specify GPT model
#MODEL = "gpt-4"
MODEL = "gpt-3.5-turbo"

## Setting up functions

Function for creating query text for GPT to answer

In [4]:
def getQueryText(cwe_id, cwe_desc, style="full sentences"):
    """Build the prompt asking GPT to describe the threat attributes of a CWE.

    Parameters:
        cwe_id: identifier of the CWE entry, interpolated into the first line.
        cwe_desc: description of the CWE entry, placed on its own line.
        style: wording style requested for each attribute
            (defaults to "full sentences").

    Returns:
        The prompt as a single string: the header line, the description,
        a blank line, the instruction sentence and the style request.
    """
    header = f"Here is the description of CWE {cwe_id}:"
    description = f"{cwe_desc}"
    instruction = "Use what you know about this CWE and the description provided to describe the following attributes of this threat for me: the vulnerability, method, technical impact, security properties affected, severity, likelihood, relevant assets, the attack vector(s), the attacker type(s), the attacker motive(s), relevant cyber controls/countermeasures, and detection methods."
    style_request = f"Please use {style} for each attribute."

    # The empty element produces the blank line between description and instruction
    return "\n".join([header, description, "", instruction, style_request])

Function for requesting GPT response

In [5]:
def gptChat(MODEL, query_text, TEMP=0):
    """Send one user message to the chat completion endpoint and return the reply.

    Parameters:
        MODEL: model name passed as `model` to the request.
        query_text: text sent as the content of a single "user" message.
        TEMP: value passed as `temperature` to the request (defaults to 0).

    Returns:
        The message content of the first choice in the response.
    """
    user_message = {"role": "user", "content": query_text}

    completion = openai.ChatCompletion.create(
        model=MODEL,
        messages=[user_message],
        temperature=TEMP,
    )

    first_choice = completion['choices'][0]
    return first_choice['message']['content']

## Main Run

Read the existing output file, if there is one, so that a previously interrupted run can resume from where it stopped.

In [6]:
# Specify the file path of output file in halt
file_path_output = 'Outputs/simulated_data_AllHardware.csv'

# Start from an empty set so that IDs collected before any problem are kept
processed_cwe = set()

# Open the file
try:
    # newline='' leaves line-ending handling to the csv module
    with open(file_path_output, mode='r', newline='', encoding='utf-8') as file:
        # Create a CSV reader
        csv_reader = csv.reader(file)

        # Read the headers (first row) into a list (if headers in place)
        #headers = next(csv_reader)

        # Loop through each row in the CSV file
        for row in csv_reader:
            # Blank lines are parsed as empty rows; skip them instead of
            # failing on row[0] and losing the IDs gathered so far
            if row:
                # Add the CWE id (first column) to the set
                processed_cwe.add(row[0])
except FileNotFoundError:
    # No previous output file: nothing has been processed yet.
    # Any other error is left to propagate, because silently starting over
    # would re-query every entry and append duplicate rows to the output.
    pass

Main program for simulating text data by GPT

In [7]:
# Initialize empty list to store simulated data
sim_data = []

# Loop thru each row of original data lists, i.e., each cyber threat
for i, row in enumerate(data):
    print(f"Running {i}/{len(data)}")

    # Skip entries whose CWE id (column 0) is already in the output file
    if row[0] in processed_cwe:
        continue

    # Build the query text from the CWE id (column 0) and the value in column 4,
    # which is passed as the CWE description
    query_text = getQueryText(row[0], row[4]) # default to generate in "full sentences"
    #query_text = getQueryText(row[0], row[4], "short form")

    # Retrieve the response from GPT
    threat_text = gptChat(MODEL, query_text)

    # Split the text into blocks
    lines = threat_text.strip().split('\n\n') # GPT may generate response in one/two line breaks
    #lines = threat_text.strip().split('\n')

    # Convert the blocks into one output row, starting with the CWE id
    threat_row = [row[0]]
    for line in lines:
        # GPT may generate the response in the format of [attribute]: [explanation].
        # Keep only the explanation when the first ': ' separator is present,
        # otherwise keep the block unchanged.
        _, separator, explanation = line.partition(': ')
        threat_row.append(explanation if separator else line)

    sim_data.append(threat_row)
    # each row is expected to contain 1 column of CWE ID and 12 columns of threat
    # attribute descriptions; the actual column count depends on how GPT formats its answer
    # number of rows = number of CWE entries processed in this run

# Use len(data) rather than the loop index so this also works when `data` is empty
print(f"Completed Running {len(data)}/{len(data)}.")

Running 0/104
Running 1/104
Running 2/104
Running 3/104
Running 4/104
Running 5/104
Running 6/104
Running 7/104
Running 8/104
Running 9/104
Running 10/104
Running 11/104
Running 12/104
Running 13/104
Running 14/104
Running 15/104
Running 16/104
Running 17/104
Running 18/104
Running 19/104
Running 20/104
Running 21/104
Running 22/104
Running 23/104
Running 24/104
Running 25/104
Running 26/104
Running 27/104
Running 28/104
Running 29/104
Running 30/104
Running 31/104
Running 32/104
Running 33/104
Running 34/104
Running 35/104
Running 36/104
Running 37/104
Running 38/104
Running 39/104
Running 40/104
Running 41/104
Running 42/104
Running 43/104
Running 44/104
Running 45/104
Running 46/104
Running 47/104
Running 48/104
Running 49/104
Running 50/104
Running 51/104
Running 52/104
Running 53/104
Running 54/104
Running 55/104
Running 56/104
Running 57/104
Running 58/104
Running 59/104
Running 60/104
Running 61/104
Running 62/104
Running 63/104
Running 64/104
Running 65/104
Running 66/104
Runni

Output the Simulated text data

In [9]:
# Destination of the simulated rows; alternative targets are kept below for other runs
#file_path_output = 'Outputs/simulated_data.csv'
#file_path_output = 'Outputs/simulated_data_shortform.csv'
#file_path_output = 'Outputs/simulated_data_shortform_wHeaders.csv'
#file_path_output = 'Outputs/simulated_data_AllSoftware.csv'
file_path_output = 'Outputs/simulated_data_AllHardware.csv'

# Append mode: rows already present in the file are kept and the new rows are added after them
with open(file_path_output, 'a', newline='', encoding='utf-8') as file:
    csv_writer = csv.writer(file)
    # One CSV line per entry of sim_data
    csv_writer.writerows(sim_data)