# Synthetic Data Generation
____

In [4]:
import os
import random
import re
import time

import dotenv
import llm
import openai
import pandas as pd

In [None]:
# Load variables from a local .env file (no-op when none exists) so the key is
# available on a fresh kernel, then pass it to the openai client explicitly
# instead of leaving `key` unused.
dotenv.load_dotenv()
key = os.environ.get('OPENAI_API_KEY')
openai.api_key = key

### LLM to create a dataset with the same prompt-response format as app.py
A transition from the GPT-4 API to a local LLM will require fine-tuning to maintain performance with a smaller model. To fine-tune for this specific use case, the training data needs to resemble the prompt-response pairs produced when interacting with app.py.

In [128]:
# Description of the model to train, handed to GPT-4 by generate_example().
# General questions are needed to avoid specific numbers.
# NOTE: the backslash in "\\n" is escaped on purpose. Inside a regular (non-raw)
# triple-quoted string a bare "\n" becomes a real line break, which would split
# the quoted `"\n".join(...)` code snippet across two lines in the prompt.
prompt_definition = """A model that receives an instruction from the user to show certain information from a postgres database in colloquial english. 
Followed by information on table definitions under TABLE_DEFINITIONS formatted as the output of this function:

    def get_table_definitions_for_prompt(self):
        table_names = self.get_all_table_names()
        return "\\n".join([self.get_table_definition(table) for table in table_names])

With the instruction and the table definitions formatted like so:

<user instruction>

TABLE_DEFINITIONS

<output of function> """

# Description of the expected response layout: explanation, separator line, SQL query.
response_definition = """

It is the model's job to then respond with an SQL query that will be run to fetch the requested data with an output that follows this formatting:

$insert an explanation of the sql query as raw text here
            ---------
$insert sql query exclusively as raw text here
"""

# Full model description = input format followed by output format.
prompt = f'{prompt_definition}{response_definition}'

In [137]:
# Sampling settings for the GPT-4 data generator.
# - number_of_examples: keep at 15 or below; more than that reaches the token
#   limit imposed by the OpenAI API.
# - temperature: a moderate value to encourage some creativity in the samples.
number_of_examples = 15
temperature = 0.4

In [138]:
def generate_example(prompt, prev_examples, temperature=.5, model="gpt-4",
                     max_tokens=1354, max_retries=5, retry_delay=1.0):
    """Ask the chat model for one synthetic prompt/response training sample.

    Args:
        prompt: Description of the model to train; embedded in the system message.
        prev_examples: Previously generated samples (strings). Up to 10 of them
            (randomly sampled when there are more) are replayed as assistant
            messages so the model can avoid repeating itself. The caller's list
            is never modified.
        temperature: Sampling temperature forwarded to the API.
        model: Chat model name forwarded to the API.
        max_tokens: Completion token cap forwarded to the API.
        max_retries: How many times to retry after a rate-limit error before
            giving up and re-raising it.
        retry_delay: Base wait in seconds between retries; doubled after each
            failed attempt.

    Returns:
        The text content of the first choice returned by the API.

    Raises:
        openai.error.RateLimitError: if the request is still rate limited after
            ``max_retries`` retries.
    """
    # The system message sets the role of the LLM as a synthetic-data generator.
    messages=[
        {
            "role": "system",
            "content": f"""You are generating data which will be used to train a machine learning model.\n\n
            You will be given a high-level description of the model we want to train, and from that, you will generate data samples, each with a prompt/response pair.\n\n
            You will do so in this format:\n```\nprompt\n-----------\n$prompt_goes_here\n-----------\n\nresponse\n-----------\n$response_goes_here\n-----------\n```\n\nOnly one prompt/response pair should be generated per turn.\n\nFor each turn, make the example slightly more complex than the last, while ensuring diversity.\n\nMake sure your samples are unique and diverse, yet high-quality and complex enough to train a well-performing model.\n\nHere is the type of model we want to train:\n`{prompt}`"""
        }
    ]

    # Replay earlier samples so the model can avoid repetition; cap the context
    # at 10 randomly chosen ones. random.sample returns a new list, so the
    # caller's list is left untouched.
    if len(prev_examples) > 10:
        prev_examples = random.sample(prev_examples, 10)
    for example in prev_examples:
        messages.append({
            "role": "assistant",
            "content": example
        })

    # Rate-limit errors are transient, so back off and retry instead of
    # aborting the whole generation run on the first one.
    for attempt in range(max_retries + 1):
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            break
        except openai.error.RateLimitError:
            if attempt >= max_retries:
                raise
            time.sleep(retry_delay * (2 ** attempt))

    return response.choices[0].message['content']

In [139]:
# Collect the generated prompt/response samples one request at a time; every
# call receives the samples gathered so far as context.
prev_examples = []
for i in range(number_of_examples):
    print(f'Generating example {i}')
    prev_examples.append(generate_example(prompt, prev_examples, temperature))

print(prev_examples)

Generating example 0
Generating example 1
Generating example 2
Generating example 3
Generating example 4
Generating example 5
Generating example 6
Generating example 7
Generating example 8
Generating example 9
Generating example 10
Generating example 11
Generating example 12
Generating example 13
Generating example 14
Generating example 15
Generating example 16
Generating example 17
Generating example 18
Generating example 19
Generating example 20
Generating example 21


RateLimitError: Rate limit reached for gpt-4 in organization org-zTW9wjYDiv2a7Rtw2RP9ANrU on tokens_usage_based per min: Limit 10000, Used 7592, Requested 2451. Please try again in 258ms. Visit https://platform.openai.com/account/rate-limits to learn more.

In [140]:
def _split_example(example, delimiter='-----------'):
    """Return ``(prompt, response)`` parsed from one generated sample.

    A sample is split on ``delimiter``; the prompt is the second piece and the
    response the fourth, each stripped of surrounding whitespace. Returns
    ``None`` when the sample is not a string or has fewer than four pieces.
    """
    if not isinstance(example, str):
        return None
    parts = example.split(delimiter)
    if len(parts) < 4:
        return None
    return parts[1].strip(), parts[3].strip()


# Parse each sample as a whole pair before storing anything, so a sample with a
# prompt but no response can never leave the two columns with different lengths.
parsed_pairs = [_split_example(example) for example in prev_examples]
valid_pairs = [pair for pair in parsed_pairs if pair is not None]

# Malformed samples are still skipped (best effort), but no longer silently.
skipped_count = len(parsed_pairs) - len(valid_pairs)
if skipped_count:
    print(f'Skipped {skipped_count} example(s) that did not match the expected format')

prompts = [prompt_text for prompt_text, response_text in valid_pairs]
responses = [response_text for prompt_text, response_text in valid_pairs]

# Create a DataFrame
df = pd.DataFrame({
    'prompt': prompts,
    'response': responses
})

# Remove duplicates
df = df.drop_duplicates()

# Show full cell contents instead of truncated strings.
pd.set_option('display.max_colwidth', None)
df.head()

Unnamed: 0,prompt,response
0,Can you provide me with the email of a patient named John Doe?,SELECT email FROM patients WHERE first_name = 'John' AND last_name = 'Doe';
1,Can you tell me the date of registration for a patient with the email johndoe@example.com?,SELECT date_of_registration FROM clinic_info WHERE patient_id = (SELECT id FROM patients WHERE email = 'johndoe@example.com');
2,I need to know the last visit date for a patient named Jane Smith.,SELECT last_visit_date FROM clinic_info WHERE patient_id = (SELECT id FROM patients WHERE first_name = 'Jane' AND last_name = 'Smith');
3,How many visits has a patient with the email address janedoe@example.com made?,SELECT number_of_visits FROM clinic_info WHERE patient_id = (SELECT id FROM patients WHERE email = 'janedoe@example.com');
4,Can you provide the medical record number for a patient named Robert Johnson?,SELECT medical_record_number FROM clinic_info WHERE patient_id = (SELECT id FROM patients WHERE first_name = 'Robert' AND last_name = 'Johnson');


In [141]:
# Write into ./data because the notebook later reads './data/df_evaluate4.csv';
# saving to the working directory left that later read without its input file.
os.makedirs('./data', exist_ok=True)
df.to_csv('./data/df_evaluate4.csv', index=False)

In [5]:
# Read the four saved evaluation batches from ./data, in order.
evaluation_csv_paths = (
    './data/df_evaluate1.csv',
    './data/df_evaluate2.csv',
    './data/df_evaluate3.csv',
    './data/df_evaluate4.csv',
)
df1, df2, df3, df4 = map(pd.read_csv, evaluation_csv_paths)

In [6]:
def join_df(*dataframes):
    """Stack the given DataFrames row-wise and renumber the index from 0.

    Args:
        *dataframes: Any number of pandas DataFrames to concatenate, in order.

    Returns:
        A single DataFrame containing all rows, with the original indexes
        discarded (``ignore_index=True``).
    """
    return pd.concat(dataframes, ignore_index=True)

# Merge the four evaluation batches into one frame and export it to Excel.
evaluation_frames = [df1, df2, df3, df4]
df = join_df(*evaluation_frames)

df.to_excel('health_prompts.xlsx', index=False)

In [49]:
# Rebind `df` to the contents of ./data/sql_prompt_pairs.csv.
df = pd.read_csv(filepath_or_buffer='./data/sql_prompt_pairs.csv')

In [64]:
# Store the first sentence of each prompt for the future evaluation step.
# Split on a newline, or on '.' / '?' only when it ends a sentence (followed by
# whitespace or the end of the text). Splitting on every '.' would cut values
# such as 'johndoe@example.com' or '3.5' in half.
pattern = r'\n|[.?](?=\s|$)'

# Iterate over the column values directly so the result does not depend on the
# frame having a default 0..n-1 index.
prompts = [re.split(pattern, prompt_text)[0] for prompt_text in df['prompt']]


In [74]:
# Persist the extracted prompts as a single-column CSV, then read the file back.
prompts_csv_path = './data/prompts.csv'
prompts = pd.Series(prompts)
prompts.to_csv(prompts_csv_path, index=False)
prompts_list = pd.read_csv(prompts_csv_path, index_col=False)

In [75]:
# Display the prompts as read back from ./data/prompts.csv.
prompts_list

Unnamed: 0,0
0,Show me the names of all the employees
1,I want to know the highest salary in the company
2,Can you tell me how many employees work in each department
3,I need to know the average salary of employees in each department
4,Show me the details of the employee with the highest salary
...,...
56,"Can you show me the names and ages of all employees, sorted by age in descending order"
57,Can you show me the names of all employees who are in their 20s
58,Show me the names of all employees who are not in the 'Finance' department
59,Can you show me the names of all departments that have employees younger than 20 years
