# Data simulation

This notebook simulates Mini-Marker personality ratings from participants' BFI-2 item responses.

In [1]:
import pandas as pd

In [2]:
# Read the 'data' sheet of project/raw_data/Soto_data.xlsx into a DataFrame
data = pd.read_excel(
    io='../../../raw_data/Soto_data.xlsx',
    sheet_name='data',
)

In [3]:
# Preview the first five rows of the loaded data
data.head(n=5)

Unnamed: 0,case_id,age,sex,ethnicity,rel_acquaintance,rel_friend,rel_roommate,rel_boygirlfriend,rel_relative,rel_other,...,tneo_n3_dep,tneo_n4_sel,tneo_n5_imp,tneo_n6_vul,tneo_o1_fan,tneo_o2_aes,tneo_o3_fee,tneo_o4_act,tneo_o5_ide,tneo_o6_val
0,1,27.0,M,2.0,,,,,,,...,51.25,40.181818,64.0,55.102041,46.639344,46.969697,66.7,57.065217,41.984127,58.039216
1,2,26.0,M,3.0,,,,,,,...,69.632353,60.636364,66.272727,65.306122,54.836066,56.439394,51.7,51.630435,51.904762,45.784314
2,3,24.0,F,4.0,,,,,,,...,60.441176,74.272727,54.909091,65.306122,75.327869,56.439394,56.7,40.76087,51.904762,58.039216
3,4,33.0,M,3.0,,1.0,,,,,...,67.794118,58.363636,64.0,52.55102,54.836066,50.757576,36.7,65.217391,63.809524,58.039216
4,5,23.0,F,5.0,,,,,,,...,62.279412,67.454545,41.272727,60.204082,50.737705,48.863636,49.2,46.195652,38.015873,38.431373


In [4]:
# Column names tda1 ... tda40 (presumably the 40 Mini-Marker items -- confirm against the codebook)
tda_columns = ["tda%d" % item for item in range(1, 41)]

# Column names bfi1 ... bfi60 (the BFI-2 item columns used to build the prompt)
bfi_columns = ["bfi%d" % item for item in range(1, 61)]

# All item columns that must be complete for a participant to be kept
selected_columns = [*tda_columns, *bfi_columns]

In [5]:
# Shape before removing incomplete cases
print(data.shape)

# Keep only the rows that have a value in every selected item column
data = data.dropna(axis=0, how='any', subset=selected_columns)

# Shape after removing incomplete cases
print(data.shape)

(470, 704)
(438, 704)


In [6]:
from schema_bfi2 import likert_scale

# Turn the numeric answers of one item column into labelled text snippets
def convert_values_to_string(series, mapping):
    """Return a converted copy of ``series``; the input is never modified.

    Parameters
    ----------
    series : pandas.Series
        One column of answers; ``series.name`` is used as the lookup key.
    mapping : dict
        Maps column names to the text that is placed in front of each value.

    Returns
    -------
    pandas.Series
        If ``series.name`` is a key of ``mapping``, every value ``x`` becomes
        the string ``"<mapping[series.name]> <x>;"``. Otherwise an unchanged
        copy of ``series`` is returned.
    """
    # Columns without a label are passed through as a plain copy
    if series.name not in mapping:
        return series.copy()

    label = mapping[series.name]
    return series.copy().apply(lambda value: f"{label} {value};")

# Convert every BFI-2 item column into "<item text> <score>;" strings.
# DataFrame.apply works column-wise by default, so the helper receives one
# item column (a Series named after the column) per call.
mapped_data = data[bfi_columns].apply(lambda column: convert_values_to_string(column, likert_scale))

# Join the per-item strings of each row into a single profile string.
# bfi_columns is reused so the item list is defined in exactly one place
# instead of being rebuilt here with a second hard-coded item count.
mapped_data['combined_bfi2'] = mapped_data[bfi_columns].apply(lambda row: ' '.join(row), axis=1)

# Attach the profile text to the main frame (assignment aligns on the row index)
data['combined_bfi2'] = mapped_data['combined_bfi2']

data.head()

Unnamed: 0,case_id,age,sex,ethnicity,rel_acquaintance,rel_friend,rel_roommate,rel_boygirlfriend,rel_relative,rel_other,...,tneo_n4_sel,tneo_n5_imp,tneo_n6_vul,tneo_o1_fan,tneo_o2_aes,tneo_o3_fee,tneo_o4_act,tneo_o5_ide,tneo_o6_val,combined_bfi2
0,1,27.0,M,2.0,,,,,,,...,40.181818,64.0,55.102041,46.639344,46.969697,66.7,57.065217,41.984127,58.039216,"Is outgoing, sociable: 5; Is compassionate, ha..."
1,2,26.0,M,3.0,,,,,,,...,60.636364,66.272727,65.306122,54.836066,56.439394,51.7,51.630435,51.904762,45.784314,"Is outgoing, sociable: 4; Is compassionate, ha..."
2,3,24.0,F,4.0,,,,,,,...,74.272727,54.909091,65.306122,75.327869,56.439394,56.7,40.76087,51.904762,58.039216,"Is outgoing, sociable: 1; Is compassionate, ha..."
3,4,33.0,M,3.0,,1.0,,,,,...,58.363636,64.0,52.55102,54.836066,50.757576,36.7,65.217391,63.809524,58.039216,"Is outgoing, sociable: 4; Is compassionate, ha..."
4,5,23.0,F,5.0,,,,,,,...,67.454545,41.272727,60.204082,50.737705,48.863636,49.2,46.195652,38.015873,38.431373,"Is outgoing, sociable: 3; Is compassionate, ha..."


In [7]:
# Write the current data frame to CSV, leaving out the row index
data.to_csv(path_or_buf='study1_data_no_simulation.csv', index=False)

## Generate prompts

In [8]:
import os
from mini_marker_prompt import get_prompt

# Read the OpenAI key from the environment; the result is None when the variable is unset
api_key = os.environ.get('OPENAI_API_KEY')

## Test with a single participant
Let us first test the framework with a single participant.

In [9]:
# Take the first participant (positional row 0) and build the prompt from their profile text
person1 = data.iloc[0]
prompt = get_prompt(person1.loc['combined_bfi2'])

# Show the full prompt text for a manual check
print(prompt)

### Context ###
You are participating in a personality psychology study. You have been assigned with personality traits.

### Your Assigned Personality ### 
The number indicates the extent to which you agree or disagree with that statement. 1 means 'Disagree Strongly', 3 means 'Neural', and 5 means 'Agree Strongly'.

Is outgoing, sociable: 5; Is compassionate, has a soft heart: 5; Tends to be disorganized: 2; Is relaxed, handles stress well: 3; Has few artistic interests: 2; Has an assertive personality: 4; Is respectful, treats others with respect: 5; Tends to be lazy: 4; Stays optimistic after experiencing a setback: 5; Is curious about many different things: 2; Rarely feels excited or eager: 2; Tends to find fault with others: 2; Is dependable, steady: 5; Is moody, has up and down mood swings: 4; Is inventive, finds clever ways to do things: 4; Tends to be quiet: 2; Feels little sympathy for others: 1; Is systematic, likes to keep things in order: 2; Can be tense: 3; Is fascinated b

The prompt looks good. Let us now create an OpenAI client and test it with this prompt.

In [10]:
from openai import OpenAI

# Create the API client and request one completion for the test prompt (temperature 0)
client = OpenAI()
completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    temperature=0,
    messages=[
        dict(role="system", content="You are an agent participating in a research study. You will be given a personality profile."),
        dict(role="user", content=prompt),
    ],
)

# Display the raw response object
completion

ChatCompletion(id='chatcmpl-8yXjjXO6c9pkiOxZIbWicxuSxjnJ8', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{\n  "Bashful": 2,\n  "Bold": 4,\n  "Careless": 4,\n  "Cold": 1,\n  "Complex": 2,\n  "Cooperative": 5,\n  "Creative": 4,\n  "Deep": 2,\n  "Disorganized": 2,\n  "Efficient": 4,\n  "Energetic": 5,\n  "Envious": 1,\n  "Extraverted": 5,\n  "Fretful": 4,\n  "Harsh": 2,\n  "Imaginative": 2,\n  "Inefficient": 2,\n  "Intellectual": 2,\n  "Jealous": 1,\n  "Kind": 5,\n  "Moody": 4,\n  "Organized": 2,\n  "Philosophical": 2,\n  "Practical": 4,\n  "Quiet": 2,\n  "Relaxed": 3,\n  "Rude": 3,\n  "Shy": 4,\n  "Sloppy": 2,\n  "Sympathetic": 5,\n  "Systematic": 2,\n  "Talkative": 5,\n  "Temperamental": 1,\n  "Touchy": 1,\n  "Uncreative": 4,\n  "Unenvious": 5,\n  "Unintellectual": 4,\n  "Unsympathetic": 1,\n  "Warm": 5,\n  "Withdrawn": 4\n}', role='assistant', function_call=None, tool_calls=None))], created=1709440055, model='gpt-3.5-turbo-0125', 

# Generate completions for the entire dataset
Now that we have tested the framework and it works well, let us generate completions for the entire dataset.

In [11]:
# define a function to get the completion of an individual
def get_completion(person, model="gpt-3.5-turbo", temperature=0):
    """Request one chat completion for a single participant.

    Uses the module-level ``client`` and ``get_prompt``.

    Parameters
    ----------
    person : mapping-like (for example one DataFrame row)
        Must provide a ``'combined_bfi2'`` entry; it is passed to ``get_prompt``
        to build the user message.
    model : str, optional
        Model name sent with the request. Defaults to ``"gpt-3.5-turbo"``, the
        value that was previously hard-coded.
    temperature : float, optional
        Sampling temperature sent with the request. Defaults to ``0``, the
        value that was previously hard-coded.

    Returns
    -------
    The response object returned by ``client.chat.completions.create``.
    """
    prompt = get_prompt(person['combined_bfi2'])
    return client.chat.completions.create(
        temperature=temperature,
        model=model,
        messages=[
            {"role": "system", "content": "You are an agent participating in a research study. You will be given a personality profile."},
            {"role": "user", "content": prompt}
        ]
    )

In [12]:
# Call get_completion once per row of the data (one API request per participant)
completions = data.apply(get_completion, axis='columns')

In [13]:
# Print the shape of data and then the shape of completions, one per line
print(data.shape, completions.shape, sep='\n')

(438, 705)
(438,)


In [14]:
import json

# JSON-friendly conversion for a single Choice object
def serialize_choice(choice):
    """Copy the fields of ``choice`` into a plain nested dict.

    The top-level keys are ``finish_reason``, ``index``, ``logprobs`` and
    ``message``; ``message`` is itself a dict with ``content``, ``role``,
    ``function_call`` and ``tool_calls``. Attribute values are copied as they
    are, without any conversion.
    """
    payload = {}
    for field in ("finish_reason", "index", "logprobs"):
        payload[field] = getattr(choice, field)

    msg = choice.message
    payload["message"] = {
        field: getattr(msg, field)
        for field in ("content", "role", "function_call", "tool_calls")
    }
    return payload

# Define a custom serializer for ChatCompletion objects
def serialize_chat_completion(completion):
    """Copy the fields of a ChatCompletion-like object into a plain dict.

    Parameters
    ----------
    completion
        Object exposing ``id``, ``choices``, ``created``, ``model``, ``object``
        and ``usage``. Each element of ``choices`` is converted with
        ``serialize_choice``.

    Returns
    -------
    dict
        Keys ``id``, ``choices``, ``created``, ``model``, ``object`` and
        ``usage``. ``usage`` is a dict with ``completion_tokens``,
        ``prompt_tokens`` and ``total_tokens``, or ``None`` when the
        completion carries no usage information.
    """
    # usage can be absent; emit None instead of failing on attribute access
    usage = completion.usage
    if usage is None:
        usage_payload = None
    else:
        usage_payload = {
            "completion_tokens": usage.completion_tokens,
            "prompt_tokens": usage.prompt_tokens,
            "total_tokens": usage.total_tokens
        }

    return {
        "id": completion.id,
        "choices": [serialize_choice(choice) for choice in completion.choices],
        "created": completion.created,
        "model": completion.model,
        "object": completion.object,
        "usage": usage_payload
    }

# Serialize the list of ChatCompletion objects to a JSON file.
# The conversion runs before the file is opened: opening with 'w' truncates the
# file immediately, so a serialization error would otherwise wipe existing results.
serialized_completions = [serialize_chat_completion(completion) for completion in completions]

with open('bfi_to_mini_temp0.json', 'w', encoding='utf-8') as json_file:
    json.dump(serialized_completions, json_file, indent=4)