# Generating Synthetic Data 
Using GPT-4 via the OpenAI API
____

In [1]:
import openai
import random
import os
import dotenv
import llm

In [29]:
# Read the OpenAI API key from the environment (None when the variable is unset).
# NOTE(review): `key` is not referenced by any other cell visible here, and
# `dotenv` is imported but never called in the visible cells — presumably the
# openai client picks the key up from the environment on its own; confirm
# before relying on a .env file.
key = os.environ.get('OPENAI_API_KEY')

# High-level description of the model we want training data for. It is passed to
# generate_example(), which interpolates it verbatim into the system message, so
# every character (including blank lines and indentation) reaches the model.
prompt = """A model that receives an instruction from the user to show certain information from a postgres database in colloquial english. 
Followed by information on table definitions under TABLE_DEFINITIONS formatted as the output of this function:

    def get_table_definitions_for_prompt(self):
        table_names = self.get_all_table_names()
        return "\n".join([self.get_table_definition(table) for table in table_names])

With the instruction and the table definitions formatted like so:

<user instruction>

TABLE_DEFINITIONS

<output of function>

It is the model's job to then respond with an SQL query that will be run to fetch the requested data""" 
# Sampling temperature forwarded to generate_example() by the generation loop.
temperature = 0.4
# How many prompt/response samples the generation loop requests.
number_of_examples = 1

In [30]:
def _build_messages(prompt, prev_examples, max_prev_examples):
    """Assemble the chat history for one generation turn: system message plus prior samples."""
    # NOTE: the physical line breaks and the leading spaces on the continuation
    # lines of this triple-quoted f-string are part of the text sent to the
    # model. Keep them exactly as they are when re-indenting this code.
    messages = [
        {
            "role": "system",
            "content": f"""You are generating data which will be used to train a machine learning model.\n\n
            You will be given a high-level description of the model we want to train, and from that, you will generate data samples, each with a prompt/response pair.\n\n
            You will do so in this format:\n```\nprompt\n-----------\n$prompt_goes_here\n-----------\n\nresponse\n-----------\n$response_goes_here\n-----------\n```\n\nOnly one prompt/response pair should be generated per turn.\n\nFor each turn, make the example slightly more complex than the last, while ensuring diversity.\n\nMake sure your samples are unique and diverse, yet high-quality and complex enough to train a well-performing model.\n\nHere is the type of model we want to train:\n`{prompt}`"""
        }
    ]

    # Cap the replayed history so the request stays bounded. Rebinding the local
    # name leaves the caller's list untouched.
    if len(prev_examples) > max_prev_examples:
        prev_examples = random.sample(prev_examples, max_prev_examples)

    # Earlier samples are replayed as assistant turns so the model can see what
    # it already produced.
    messages.extend(
        {"role": "assistant", "content": example} for example in prev_examples
    )
    return messages


def generate_example(prompt, prev_examples, temperature=.5, model="gpt-4",
                     max_tokens=1354, max_prev_examples=10):
    """Ask the chat model for one new prompt/response training sample.

    Args:
        prompt: Description of the model to train; embedded in the system message.
        prev_examples: Previously generated samples (strings). They are replayed
            as assistant messages. The list itself is never modified.
        temperature: Sampling temperature forwarded to the API.
        model: Chat model name forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.
        max_prev_examples: Maximum number of earlier samples to replay. When more
            are available, a random subset of this size is drawn with
            ``random.sample`` (so a negative value raises ``ValueError``).

    Returns:
        The ``content`` of the first choice's message in the API response.

    Note:
        This goes through ``openai.ChatCompletion.create``; that entry point
        appears to belong to the pre-1.0 ``openai`` package — confirm the
        installed version before upgrading.
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=_build_messages(prompt, prev_examples, max_prev_examples),
        temperature=temperature,
        max_tokens=max_tokens,
    )

    return response.choices[0].message['content']

In [31]:
# Start with an empty history. Every new sample is appended to it, so each
# later request sees the samples generated before it.
prev_examples = []
for example_number in range(number_of_examples):
    print(f'Generating example {example_number}')
    prev_examples.append(generate_example(prompt, prev_examples, temperature))

print(prev_examples)

Generating example 0
['prompt\n-----------\nShow me the names of all employees.\n\nTABLE_DEFINITIONS\n\nemployees: id (integer), name (text), position (text), salary (integer), department_id (integer)\n\ndepartments: id (integer), name (text), location (text)\n-----------\n\nresponse\n-----------\nSELECT name FROM employees;\n-----------']


In [32]:
import pandas as pd

# Separator the system prompt tells the model to put around each section.
SECTION_DELIMITER = '-----------'

# Initialize lists to store prompts and responses
prompts = []
responses = []
skipped_examples = 0

# Parse out prompts and responses from examples
for example in prev_examples:
    try:
        sections = example.split(SECTION_DELIMITER)
        # Expected layout after splitting:
        #   [header, <prompt>, header, <response>, trailer]
        # Read BOTH fields before appending anything. Otherwise an example with
        # a prompt but no response would leave `prompts` one item longer than
        # `responses` and break the DataFrame construction below.
        prompt_text = sections[1].strip()
        response_text = sections[3].strip()
    except (AttributeError, IndexError, TypeError):
        # AttributeError/TypeError: the example is not a text string.
        # IndexError: fewer than four sections, i.e. the format was not followed.
        skipped_examples += 1
        continue
    prompts.append(prompt_text)
    responses.append(response_text)

# Report dropped samples instead of discarding them silently.
if skipped_examples:
    print('Skipped ' + str(skipped_examples) + ' malformed example(s).')

# Create a DataFrame and remove duplicates in a single assignment, so that
# re-running this cell always rebuilds `df` from `prev_examples`.
df = pd.DataFrame({
    'prompt': prompts,
    'response': responses
}).drop_duplicates()

print('There are ' + str(len(df)) + ' successfully-generated examples. Here are the first few:')

# Show full cell contents instead of truncating long prompts.
pd.set_option('display.max_colwidth', None)
df.head()

There are 1 successfully-generated examples. Here are the first few:


Unnamed: 0,prompt,response
0,"Show me the names of all employees.\n\nTABLE_DEFINITIONS\n\nemployees: id (integer), name (text), position (text), salary (integer), department_id (integer)\n\ndepartments: id (integer), name (text), location (text)",SELECT name FROM employees;


In [6]:
# Share of rows that go to the training split, and the seed that makes the
# random selection repeatable across runs.
TRAIN_FRACTION = 0.9
SPLIT_SEED = 42

# Split the data into train and test sets, with 90% in the train set.
# NOTE(review): with very few rows the rounded sample can take every row and
# leave the test set empty — confirm that is acceptable.
train_df = df.sample(frac=TRAIN_FRACTION, random_state=SPLIT_SEED)
test_df = df.drop(train_df.index)

# Every row must end up in exactly one of the two splits.
assert len(train_df) + len(test_df) == len(df), 'train/test split lost or duplicated rows'

# Save the dataframes to .csv files (without the index column)
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)