In [327]:
from openai import OpenAI
import random
import json
import pandas as pd
from tenacity import retry, stop_after_attempt, wait_exponential

In [328]:
API_KEY = "==OPEN-AI-KEY=="

In [329]:
prompt = \
    "You are tasked with analyzing a C++, SQL, Java code language which focus in cloud security identify security vulnerabilities. "\
    "The input of the model will be stricly code text." \
    "Your response / output should be in the following json string format output:\n\n"\
    "title\n-----------\n$title_goes_here\n-----------\n\n "\
    "isFixNecessary\n-----------\n$boolean result (true / false)\n-----------\n\n "\
    "reasoning\n-----------\n#reasoning_goes_here\n-----------\n\n "\
    "ammendedCode\n-----------\n$ammended_code_goes_here\n-----------\n```\n\n "\
    "Do not use ``` ``` in code or response. Treat everything as json key-value pair"\
    "Follow this format strictly and order has to be the same. Do not say anything else. This is the code: "

# Lower values are great for precise tasks, like writing code, whereas larger values are better for creative tasks, like writing stories.
temperature = 0

# Number of self-generated examples (At production, minimal should be 100, min testing is 10)
number_of_examples = 100

In [330]:
N_RETRIES = 3

@retry(stop=stop_after_attempt(N_RETRIES), wait=wait_exponential(multiplier=1, min=4, max=70))
def generate_example(prompt, temperature=.8):
    messages=[
        {
            "role": "system",
            "content": (
                "You are generating data which will be used to train a machine learning model." \
                f"This is the prompt that is given to the machine learning model: {prompt}"\
                "Your task is to generate ONE unique and diverse code snippet in either C++, Java, or SQL (Choose just 1), focusing on cloud security practices, vulnerabilities, or mitigation strategies." \
                "Each submission should include a code and a corresponding response in the following format:\n\n"\
                "code\n-----------\n$code_goes_here\n-----------\n\n "\
                "response\n-----------\n$response_goes_here\n-----------\n\n\n "\
                "Do not use any ``` for response. Just use string"
                "Ensure each example builds in complexity compared to the previous one, maintaining diversity and quality." \
                "Make sure that your ammended code follows industry-standard change. (For example, do not just change password. Make it salt)"
                "Follow this format strictly. Do not say anything else."
            )
        }
    ]

    client = OpenAI(api_key=API_KEY)
    response = client.chat.completions.create(
        model="gpt-4",
        messages=messages,
        temperature=temperature,
        max_tokens=2000,
    )
    return response.choices[0].message.content

# Generate examples
prev_examples = []
for i in range(number_of_examples):
    print(f'Generating example {i}')
    example = generate_example(prompt)
    prev_examples.append(example)

Generating example 0
Generating example 1
Generating example 2
Generating example 3
Generating example 4
Generating example 5
Generating example 6
Generating example 7
Generating example 8
Generating example 9
Generating example 10
Generating example 11
Generating example 12
Generating example 13
Generating example 14
Generating example 15
Generating example 16
Generating example 17
Generating example 18
Generating example 19
Generating example 20
Generating example 21
Generating example 22
Generating example 23
Generating example 24
Generating example 25
Generating example 26
Generating example 27
Generating example 28
Generating example 29
Generating example 30
Generating example 31
Generating example 32
Generating example 33
Generating example 34
Generating example 35
Generating example 36
Generating example 37
Generating example 38
Generating example 39
Generating example 40
Generating example 41
Generating example 42
Generating example 43
Generating example 44
Generating example 4

RetryError: RetryError[<Future at 0x14315de40 state=finished raised RateLimitError>]

In [317]:
system_message = \
  "Given a piece of C++, SQL, or Java code focused on cloud security, analyze the code for security vulnerabilities." \
  "If a fix is necessary, provide reasoning and suggest an amended code in the same programming language."\
  "Use json as output."

In [318]:
# Initialize lists to store prompts and responses
prompts = []
responses = []


# Parse out prompts and detailed responses from examples
for idx, example in enumerate(prev_examples):
    try:
        # Split the example into sections
        split_example = example.split('-----------')

        # Extract and strip the prompt and append to the list
        prompt = split_example[1].strip()
        prompts.append(prompt)

        # Append the detailed response dictionary to the responses list
        responses.append(split_example[3].strip() )

    except IndexError:
        # Skip the current example if it doesn't conform to the expected format
        print("Error parsing example:", idx)

# Create a DataFrame
df = pd.DataFrame({
    'prompt': prompts,
    'response': responses
})

print('There are ' + str(len(df)) + ' successfully-generated examples.')

# Initialize list to store training examples
training_examples = []

# Create training examples in the format required for GPT-4 fine-tuning
for index, row in df.iterrows():
    training_example = {
        "messages": [
            {"role": "system", "content": system_message.strip()},
            {"role": "user", "content": row['prompt']},
            {"role": "assistant", "content": row['response']}
        ]
    }
    training_examples.append(training_example)

# Save training examples to a .jsonl file in a prettier format
with open('training_examples3.jsonl', 'w') as f:
    for example in training_examples:
        f.write(json.dumps(example) + '\n')

There are 20 successfully-generated examples.


In [167]:
# import json
# import pandas as pd

# # Initialize lists to store prompts and responses
# prompts = []
# responses = []

# for example in prev_examples:
#     try:
#         split_example = example.split('-----------')
#         prompts.append(split_example[1].strip())
#         response = '-----------'.join(split_example[3:])
#         responses.append(response.strip())
#     except:
#         pass

# # Create a DataFrame
# df = pd.DataFrame({
   
#     'prompt': prompts,
#     'response': responses
# })

# # Remove duplicates
# df = df.drop_duplicates()

# print('There are ' + str(len(df)) + ' successfully-generated examples.')

# # Initialize list to store training examples
# training_examples = []

# # Create training examples in the format required for GPT-3.5 fine-tuning
# for index, row in df.iterrows():
#     training_example = {
#         "messages": [
#             {"role": "system", "content": system_message.strip()},
#             {"role": "user", "content": row['prompt']},
#             {"role": "assistant", "content": row['response']}
#         ]
#     }
#     training_examples.append(training_example)

# # Save training examples to a .jsonl file
# with open('training.jsonl', 'w') as f:
#     for example in training_examples:
#         f.write(json.dumps(example) + '\n')

There are 10 successfully-generated examples.


In [302]:
client = OpenAI(api_key=API_KEY)

In [324]:
training_file_id = client.files.create(
  file=open("training_examples2.jsonl", "rb"),
  purpose='fine-tune'
).id

testing_file_id = client.files.create(
  file=open("testing_examples1.jsonl", "rb"),
  purpose='fine-tune'
).id


In [325]:
job = client.fine_tuning.jobs.create(
        model="gpt-3.5-turbo",
        training_file=training_file_id,
        validation_file=testing_file_id
    )
job_id = job.id

In [326]:
# Retrieve the state of a fine-tune (It will take 20 mins to complete)
client.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=10)


SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-s7DyQE41RlUEi3etUgVhFHZn', created_at=1713407405, level='info', message='Validating training file: file-3FOZu0aV3wSMh70Ayksfqb7T and validation file: file-3sHSUAv3cC0oY1YsuhXD3HEN', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-BAHqtTGUy6uhIoCdrjn7l1jn', created_at=1713407405, level='info', message='Created fine-tuning job: ftjob-qMclTblrwEEbP1Jr7UoMDFFa', object='fine_tuning.job.event', data={}, type='message')], object='list', has_more=False)

In [171]:
# Get the model name, once fine tuned is complete
model_name_pre_object = client.fine_tuning.jobs.retrieve('job_id')
model_name = model_name_pre_object.fine_tuned_model
print(model_name)

NotFoundError: Error code: 404 - {'error': {'message': 'Could not find fine tune: job_id', 'type': 'invalid_request_error', 'param': 'fine_tune_id', 'code': 'fine_tune_not_found'}}