# Data simulation

This notebook is used to simulate personality data, going from BFI-2 responses to Mini-Marker responses.

In [None]:
import pandas as pd

In [None]:
# Read the facet-level simulated data from the CSV file
data = pd.read_csv('facet_lvl_simulated_data.csv')

# Cast the first 60 columns to integers; every other column keeps its dtype
data = data.astype(dict.fromkeys(data.columns[:60], 'int'))

In [None]:
# Preview the first 5 rows of the data
data.head(n=5)

In [None]:
# Column names tda1 ... tda40
tda_columns = ["tda" + str(number) for number in range(1, 41)]

# Column names bfi1 ... bfi60
bfi_columns = ["bfi" + str(number) for number in range(1, 61)]

# All selected columns: the tda names first, followed by the bfi names
selected_columns = [*tda_columns, *bfi_columns]

In [None]:
from schema_bfi2 import likert_scale

# Turn the values of one column into "<mapped text> <value>;" strings
def convert_values_to_string(series, mapping):
    """Format every value of ``series`` using the mapping entry for its name.

    Args:
        series: A pandas Series; its ``name`` is the key looked up in
            ``mapping``.
        mapping: Container supporting ``in`` and item access by series name.

    Returns:
        A new Series. When ``series.name`` is in ``mapping``, each value ``x``
        becomes the string ``f"{mapping[series.name]} {x};"``. Otherwise an
        unmodified copy of ``series`` is returned. The input is never altered.
    """
    # No entry for this column: hand back an untouched copy
    if series.name not in mapping:
        return series.copy()

    # Look the entry up once; it is the same for every value in the column
    label = mapping[series.name]
    return series.apply(lambda value: f"{label} {value};")

# Convert each BFI-2 item column to its string form. With the default axis,
# DataFrame.apply hands the helper one column at a time.
mapped_data = data[bfi_columns].apply(
    lambda column: convert_values_to_string(column, likert_scale)
)

# Join the converted bfi1 ... bfi60 strings of every row, separated by spaces
mapped_data['combined_bfi2'] = mapped_data[bfi_columns].apply(
    lambda items: ' '.join(items), axis=1
)

# Copy the combined string onto the original data
data['combined_bfi2'] = mapped_data['combined_bfi2']

data.head()

## Generate prompts

In [None]:
import os
from mini_marker_prompt import get_prompt

# Read the OpenAI API key from the environment (None when the variable is unset)
api_key = os.getenv('OPENAI_API_KEY')

# Show only whether a key was found. Echoing the key itself as the cell's last
# expression would store the secret in the notebook's saved output.
bool(api_key)

## Test with a single participant
Let us first test the framework with a single participant.

In [None]:
# Take the first row of the data as the test participant
person1 = data.iloc[0]

# Pass that participant's combined_bfi2 string to get_prompt
prompt = get_prompt(person1['combined_bfi2'])

# Print the result so it can be inspected before any API call is made
print(prompt)

The prompt is looking good. Let us now create an OpenAI client and test it with the prompt.

In [None]:
from openai import OpenAI

# Create the client and request one completion for the test prompt
client = OpenAI()
completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    temperature=0,
    messages=[
        {
            "role": "system",
            "content": "You are an agent participating in a research study. You will be given a personality profile.",
        },
        {
            "role": "user",
            "content": prompt,
        },
    ],
)
completion

# Generate completions for the entire dataset
Now that we have tested the framework and seen that it works very well, let us generate completions for the entire dataset.

In [None]:
# define a function to get the completion of an individual
def get_completion(person, *, model="gpt-3.5-turbo", temperature=0):
    """Request a chat completion for one participant.

    Relies on the module-level ``client`` and ``get_prompt`` defined in
    earlier cells.

    Args:
        person: Row-like object that can be indexed with ``'combined_bfi2'``
            (for example one row of ``data``).
        model: Model name forwarded to the API. Defaults to the model used
            throughout this notebook.
        temperature: Sampling temperature forwarded to the API. Defaults to 0.

    Returns:
        The object returned by ``client.chat.completions.create``.
    """
    prompt = get_prompt(person['combined_bfi2'])
    return client.chat.completions.create(
        temperature=temperature,
        model=model,
        messages=[
            {"role": "system", "content": "You are an agent participating in a research study. You will be given a personality profile."},
            {"role": "user", "content": prompt},
        ],
    )

In [None]:
# get completions for the entire dataset
# axis=1 passes each row of data to get_completion, so one API request is made per row
completions = data.apply(get_completion, axis=1)

In [None]:
# Print the shape of the data, then the shape of the completions, one per line
print(data.shape, completions.shape, sep="\n")

In [None]:
import json

# Convert nested response objects into plain data that json.dump can write
def _to_jsonable(value):
    """Return ``value`` in a JSON-friendly form.

    Lists are converted element by element. Objects exposing a callable
    ``model_dump`` method (as pydantic-style models do) are replaced by the
    result of calling it. Anything else, including ``None``, is returned
    unchanged.
    """
    if isinstance(value, list):
        return [_to_jsonable(item) for item in value]
    model_dump = getattr(value, "model_dump", None)
    if callable(model_dump):
        return model_dump()
    return value


# Define a custom serializer for Choice objects
def serialize_choice(choice):
    """Serialize one choice of a chat completion into a plain dict.

    Args:
        choice: Object with ``finish_reason``, ``index``, ``logprobs`` and a
            ``message`` carrying ``content``, ``role``, ``function_call`` and
            ``tool_calls``.

    Returns:
        A dict with the keys ``finish_reason``, ``index``, ``logprobs`` and
        ``message``. ``logprobs``, ``function_call`` and ``tool_calls`` are
        passed through ``_to_jsonable`` so that non-None model objects do not
        make ``json.dump`` fail; ``None`` values stay ``None``.
    """
    message = choice.message
    return {
        "finish_reason": choice.finish_reason,
        "index": choice.index,
        "logprobs": _to_jsonable(choice.logprobs),
        "message": {
            "content": message.content,
            "role": message.role,
            "function_call": _to_jsonable(message.function_call),
            "tool_calls": _to_jsonable(message.tool_calls),
        },
    }

# Define a custom serializer for ChatCompletion objects
def serialize_chat_completion(completion):
    """Serialize one chat completion into a plain dict.

    Args:
        completion: Object with ``id``, ``choices``, ``created``, ``model``,
            ``object`` and a ``usage`` carrying the three token counts.

    Returns:
        A dict with the keys ``id``, ``choices``, ``created``, ``model``,
        ``object`` and ``usage``, in that order. Each choice is converted
        with ``serialize_choice``.
    """
    choice_dicts = [serialize_choice(item) for item in completion.choices]

    token_fields = ("completion_tokens", "prompt_tokens", "total_tokens")
    usage = completion.usage
    usage_dict = {field: getattr(usage, field) for field in token_fields}

    return {
        "id": completion.id,
        "choices": choice_dicts,
        "created": completion.created,
        "model": completion.model,
        "object": completion.object,
        "usage": usage_dict,
    }

# Write every serialized ChatCompletion to a JSON file as one list
with open('bfi_to_mini_temp0.json', 'w') as json_file:
    json.dump(
        list(map(serialize_chat_completion, completions)),
        json_file,
        indent=4,
    )