In [1]:
from src.model.container import AttackPatternsContainer
import json
from src.model.interfaceToMitre.mitreData.utils.FileUtils import write_to_file, read_from_file

training_file_name = "attack_pattern_training.jsonl"
data = []

for at in AttackPatternsContainer().get_data():
    if 'AML' in at.x_mitre_id:
        system_targeted = ['LLM model']
    else:
        system_targeted = at.x_mitre_platforms

    json_data = {
        "messages": [
            {
                "role": "system",
                "content": "You are a chatbot specializing in the context of cybersecurity, you know many attack patterns."
            },
            {
                "role": "user",
                "content": "What is the attack pattern known with the name: \'" + at.name + "\' and with the Id: " + at.x_mitre_id + " ?"
            },
            {
                "role": "assistant",
                "content": "The attack pattern with the name: \'" + at.name + "\' and with the Id: " + at.x_mitre_id +
                           " is described as follows: " + at.description.replace("\n", "") +
                           ". The systems targeted by this attack pattern are: " + ', '.join(system_targeted).replace("\n", "") + ' .'
            }
        ]
    }

    data.append(json_data)

    write_to_file(json.dumps(data, ensure_ascii=False, separators=(',', ':')), training_file_name, './')


validation_file_name = "attack_pattern_validation.jsonl"
data =  []

for at in AttackPatternsContainer().get_data():
    if 'AML' in at.x_mitre_id:
        system_targeted = ['LLM model']
    else:
        system_targeted = at.x_mitre_platforms

    json_data = {
        "messages": [
            {
                "role": "system",
                "content": "You are a chatbot specializing in the context of cybersecurity, you know many attack patterns."
            },
            {
                "role": "user",
                "content": "What is the attack pattern Id " + at.x_mitre_id + " ?"
            },
            {
                "role": "assistant",
                "content": "The attack pattern " + at.x_mitre_id + " is : " + at.description.replace("\n", "") +
                           ". The systems targeted by this attack pattern are: " + ', '.join(system_targeted).replace("\n", "") + ' .'
            }
        ]
    }
    
    data.append(json_data)

    write_to_file(json.dumps(data, ensure_ascii=False, separators=(',', ':')), validation_file_name, './')
        
    
print('Done')

enterprise-attack not change
mobile-attack not change
ics-attack not change
atlas not change
mitre-to-cve not change
Done


In [2]:
# Run preliminary checks

import json

# Load the training set
training_dataset = json.loads(read_from_file('./', training_file_name))

# Training dataset stats
print("Number of examples in training set:", len(training_dataset))
print("First example in training set:")
print(training_dataset[0])
for message in training_dataset[0]["messages"]:
    print(message)

# Load the validation set
validation_dataset = json.loads(read_from_file('./', validation_file_name))

# Validation dataset stats
print("\nNumber of examples in validation set:", len(validation_dataset))
print("First example in validation set:")
for message in validation_dataset[0]["messages"]:
    print(message)

Number of examples in training set: 1144
First example in training set:
{'messages': [{'role': 'system', 'content': 'You are a chatbot specializing in the context of cybersecurity, you know many attack patterns.'}, {'role': 'user', 'content': "What is the attack pattern known with the name: 'Extra Window Memory Injection' and with the Id: T1055.011 ?"}, {'role': 'assistant', 'content': "The attack pattern with the name: 'Extra Window Memory Injection' and with the Id: T1055.011 is described as follows: Adversaries may inject malicious code into process via Extra Window Memory (EWM) in order to evade process-based defenses as well as possibly elevate privileges. EWM injection is a method of executing arbitrary code in the address space of a separate live process. Before creating a window, graphical Windows-based processes must prescribe to or register a windows class, which stipulate appearance and behavior (via windows procedures, which are functions that handle input/output of data).(

In [3]:
# Validate token counts

import json
import tiktoken
import numpy as np
from collections import defaultdict

encoding = tiktoken.get_encoding("cl100k_base") # default encoding used by gpt-4, turbo, and text-embedding-ada-002 models

def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

datasets = [training_dataset, validation_dataset]

for dataset in datasets:
    total_tokens = []
    assistant_tokens = []

    for ex in dataset:
        messages = ex.get("messages", {})
        total_tokens.append(num_tokens_from_messages(messages))
        assistant_tokens.append(num_assistant_tokens_from_messages(messages))

    print_distribution(total_tokens, "total tokens")
    print_distribution(assistant_tokens, "assistant tokens")
    print('*' * 50)


#### Distribution of total tokens:
min / max: 94, 1105
mean / median: 335.2132867132867, 305.0
p5 / p95: 176.0, 529.4000000000001

#### Distribution of assistant tokens:
min / max: 37, 1044
mean / median: 276.4973776223776, 247.0
p5 / p95: 116.30000000000001, 469.70000000000005
**************************************************

#### Distribution of total tokens:
min / max: 63, 1067
mean / median: 302.8863636363636, 272.0
p5 / p95: 142.3, 498.0

#### Distribution of assistant tokens:
min / max: 20, 1024
mean / median: 258.8339160839161, 229.0
p5 / p95: 98.60000000000002, 453.0
**************************************************


In [4]:
# Reformat the file jsonl to right format for the training
import os
training_file_name_formatted = "attack_pattern_training_formatted.jsonl"
validation_file_name_formatted = "attack_pattern_validation_formatted.jsonl"
os.remove(training_file_name_formatted)
os.remove(validation_file_name_formatted)

for filenames in [[training_file_name, training_file_name_formatted], [validation_file_name, validation_file_name_formatted]]:
    file = open(filenames[0], "rb")
    print(file)
    
    data = file.read()
    
    for line in json.loads(data):
        with open(filenames[1], 'a') as f:
            f.write(json.dumps(line))
            f.write('\n')

<_io.BufferedReader name='attack_pattern_training.jsonl'>
<_io.BufferedReader name='attack_pattern_validation.jsonl'>


In [5]:
from dotenv import load_dotenv
# Upload fine-tuning files

import os
from openai import AzureOpenAI

load_dotenv()


client = AzureOpenAI(
    azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key = os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-02-01"
)

# Upload the training and validation dataset files to Azure OpenAI with the SDK.

training_response = client.files.create(
    file = open(training_file_name_formatted, "rb"), purpose="fine-tune"
)
training_file_id = training_response.id

validation_response = client.files.create(
    file = open(validation_file_name_formatted, "rb"), purpose="fine-tune"
)
validation_file_id = validation_response.id

print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)

HTTP Request: POST https://cti-gpt-fine-tuned.openai.azure.com//openai/files?api-version=2024-02-01 "HTTP/1.1 201 Created"
HTTP Request: POST https://cti-gpt-fine-tuned.openai.azure.com//openai/files?api-version=2024-02-01 "HTTP/1.1 201 Created"
Training file ID: file-8ad152d521144a19a9e0b9343aafba1d
Validation file ID: file-c3499464dd0f426a959d844bf67d19c5


In [6]:
# Submit fine-tuning training job

response = client.fine_tuning.jobs.create(
    training_file = training_file_id,
    validation_file = validation_file_id,
    model = "gpt-35-turbo-0125", # The model to fine-tune
)

job_id = response.id

# You can use the job ID to monitor the status of the fine-tuning job.
# The fine-tuning job will take some time to start and complete.

print("Job ID:", response.id)
print("Status:", response.status)
print(response.model_dump_json(indent=2))

HTTP Request: POST https://cti-gpt-fine-tuned.openai.azure.com//openai/fine_tuning/jobs?api-version=2024-02-01 "HTTP/1.1 201 Created"
Job ID: ftjob-9f7d681a3f264f2d8767a3711061ee6c
Status: pending
{
  "id": "ftjob-9f7d681a3f264f2d8767a3711061ee6c",
  "created_at": 1714680030,
  "error": null,
  "fine_tuned_model": null,
  "finished_at": null,
  "hyperparameters": {
    "n_epochs": -1,
    "batch_size": -1,
    "learning_rate_multiplier": 1
  },
  "model": "gpt-35-turbo-0125",
  "object": "fine_tuning.job",
  "organization_id": null,
  "result_files": null,
  "status": "pending",
  "trained_tokens": null,
  "training_file": "file-8ad152d521144a19a9e0b9343aafba1d",
  "validation_file": "file-c3499464dd0f426a959d844bf67d19c5"
}


In [8]:
# Track training status

from IPython.display import clear_output
import time

start_time = time.time()

# Get the status of our fine-tuning job.
response = client.fine_tuning.jobs.retrieve(job_id)

status = response.status

# If the job isn't done yet, poll it every 10 seconds.
while status not in ["succeeded", "failed"]:
    time.sleep(10)

    response = client.fine_tuning.jobs.retrieve(job_id)
    print(response.model_dump_json(indent=2))
    print("Elapsed time: {} minutes {} seconds".format(int((time.time() - start_time) // 60), int((time.time() - start_time) % 60)))
    status = response.status
    print(f'Status: {status}')
    clear_output(wait=True)

print(f'Fine-tuning job {job_id} finished with status: {status}')

# List all fine-tuning jobs for this resource.
print('Checking other fine-tune jobs for this resource.')
response = client.fine_tuning.jobs.list()
print(f'Found {len(response.data)} fine-tune jobs.')

In [None]:
# Retrieve fine_tuned_model name 

response = client.fine_tuning.jobs.retrieve(job_id)

print(response.model_dump_json(indent=2))
fine_tuned_model = response.fine_tuned_model