### Fine-tuning GPT-3 on a synthetic dataset

In [2]:
# Create synthetic data for fine-tuning.
# We generate descriptions of several fictional video-game characters here.

import os
import openai
import pandas as pd

# Set your OpenAI API key (read from the environment so it is never hardcoded)
openai.api_key = os.getenv("OPENAI_API_KEY")

# Define input values
ages = ['30', '40', '50', '60']
genders = ['man', 'woman']
superpowers = ['invisibility', 'read in the thoughts', 'turning lead into gold', 'immortality']

# Completion budget, in tokens. The prompt asks for up to 100 words, and a token is
# on average shorter than an English word, so a 100-token cap can truncate the
# description mid-sentence. Truncated text would then be used as training data.
MAX_COMPLETION_TOKENS = 200

# Every (age, gender, power) combination, in the same order a nested
# ages -> genders -> superpowers loop would visit them.
combinations = [
    (age, gender, power)
    for age in ages
    for gender in genders
    for power in superpowers
]

# One dictionary per generated character; turned into a DataFrame at the end.
results = []

# record_number is 1-based and follows the iteration order above.
for record_number, (age, gender, power) in enumerate(combinations, start=1):
    # Full natural-language prompt sent to the model
    prompt = f"Imagine a complete and detailed description of a {age}-year-old {gender} fictional character who has the superpower of {power}. Write out the entire description in a maximum of 100 words in great detail."

    # Generate the response using OpenAI's GPT-3 model
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0.7,  # Adjust temperature for response creativity
        max_tokens=MAX_COMPLETION_TOKENS,  # Room for ~100 words without truncation
        n=1,  # Generate a single response
        stop=None,  # Let the model determine when to stop
    )

    # Extract the generated text, dropping leading/trailing whitespace
    response_text = response.choices[0].text.strip()

    # Compact form of the inputs; stored alongside the full prompt
    subprompt = f"{age}, {gender}, {power}"

    result = {
        'record number': record_number,
        'age': age,
        'gender': gender,
        'power': power,
        'prompt': prompt,
        'subprompt': subprompt,
        'response': response_text
    }
    results.append(result)

# Create a DataFrame from the results
df = pd.DataFrame(results)

# Save the DataFrame to a CSV file
df.to_csv("character_descriptions.csv", index=False)


In [5]:
import pandas as pd
import openai
import subprocess

# Read the file with the generated data written by the previous step.
df = pd.read_csv("character_descriptions.csv")

# Use the compact prompt parameters (subprompt) as the training prompt and the
# generated description as the completion. The column names 'prompt' and
# 'completion' are the ones written to prepared_data.csv for the CLI step below.
prepared_data = df[['subprompt', 'response']].rename(
    columns={'subprompt': 'prompt', 'response': 'completion'}
)
prepared_data.to_csv('prepared_data.csv', index=False)

# prepared_data.csv --> prepared_data_prepared.jsonl
# Arguments are passed as an explicit list (no shell, no str.split), and
# check=True raises CalledProcessError if the tool exits with a non-zero status,
# so the fine-tune is never started from a failed preparation step.
subprocess.run(
    ['openai', 'tools', 'fine_tunes.prepare_data', '--file', 'prepared_data.csv', '--quiet'],
    check=True,
)

# Start fine-tuning.
# The suffix is passed as its own list element: splitting a command string on
# whitespace would have kept the literal double quotes as part of the value.
subprocess.run(
    [
        'openai', 'api', 'fine_tunes.create',
        '--training_file', 'prepared_data_prepared.jsonl',
        '--model', 'davinci',
        '--suffix', 'SuperHero',
    ],
    check=True,
)