In [1]:
import pandas as pd

In [2]:
# Load the raw simulated responses from bfi_to_mini_temp0.json.
# NOTE(review): this comment previously named "bfi_moral_temp0.7.json", which
# does not match the file actually opened below - confirm the intended input.
import json
with open('bfi_to_mini_temp0.json') as f:
    # `data` is iterated later as a sequence of objects that each hold a
    # 'choices' list.
    data = json.load(f)

In [3]:
import json
import re

def clean_json(content):
    """Strip list-style numbering that sits in front of quoted strings.

    Every occurrence of optional whitespace, one or more digits, a period and
    optional whitespace immediately followed by a double-quoted string is
    replaced by the quoted string alone, e.g. ``1. "Bashful": 2`` becomes
    ``"Bashful": 2``. Text that contains no such prefix is returned unchanged.

    Args:
        content: Text of a (possibly malformed) JSON document.

    Returns:
        The text with the numeric prefixes removed.
    """
    numbered_string = r'\s*\d+\.\s*"(.*?)"'
    quoted_string_only = r'"\1"'
    return re.sub(numbered_string, quoted_string_only, content)

# Parsed replies, one dict per successfully decoded choice, in input order.
response = []

for obj in data:
    for choice in obj['choices']:
        content = choice['message']['content']
        if content.startswith("```json") and content.endswith("```"):
            # Remove exactly the opening "```json" and closing "```" markers,
            # then strip surrounding whitespace. The earlier slice cut one
            # character too many when the closing fence was not preceded by a
            # newline, which chopped off the final "}" and made valid replies
            # fail to parse.
            content = content[len("```json"):-len("```")].strip()

        # Clean the JSON content
        cleaned_content = clean_json(content)

        # Attempt to parse the content as JSON
        try:
            response.append(json.loads(cleaned_content))
        except json.JSONDecodeError as e:
            # NOTE(review): a skipped reply leaves no placeholder, so every
            # later entry in `response` moves up one position. Anything that
            # matches these replies to other data by position should verify
            # the counts first.
            print(f"Failed to parse JSON due to: {e}")
            print("Faulty JSON content:", cleaned_content)

print(response)

[{'Bashful': 2, 'Bold': 6, 'Careless': 4, 'Cold': 2, 'Complex': 2, 'Cooperative': 8, 'Creative': 6, 'Deep': 4, 'Disorganized': 4, 'Efficient': 6, 'Energetic': 7, 'Envious': 2, 'Extraverted': 8, 'Fretful': 4, 'Harsh': 3, 'Imaginative': 6, 'Inefficient': 4, 'Intellectual': 7, 'Jealous': 2, 'Kind': 8, 'Moody': 5, 'Organized': 6, 'Philosophical': 7, 'Practical': 6, 'Quiet': 3, 'Relaxed': 5, 'Rude': 2, 'Shy': 3, 'Sloppy': 4, 'Sympathetic': 8, 'Systematic': 5, 'Talkative': 7, 'Temperamental': 4, 'Touchy': 3, 'Uncreative': 3, 'Unenvious': 8, 'Unintellectual': 3, 'Unsympathetic': 2, 'Warm': 8, 'Withdrawn': 3}, {'Bashful': 2, 'Bold': 4, 'Careless': 4, 'Cold': 3, 'Complex': 7, 'Cooperative': 6, 'Creative': 6, 'Deep': 7, 'Disorganized': 5, 'Efficient': 4, 'Energetic': 5, 'Envious': 2, 'Extraverted': 7, 'Fretful': 4, 'Harsh': 3, 'Imaginative': 6, 'Inefficient': 4, 'Intellectual': 5, 'Jealous': 2, 'Kind': 7, 'Moody': 6, 'Organized': 3, 'Philosophical': 4, 'Practical': 5, 'Quiet': 4, 'Relaxed': 6, '

In [4]:
# one row per parsed response; the JSON keys become the columns
decision_df = pd.DataFrame(response)

In [5]:
# preview the first rows of the item-level ratings
decision_df.head()

Unnamed: 0,Bashful,Bold,Careless,Cold,Complex,Cooperative,Creative,Deep,Disorganized,Efficient,...,Systematic,Talkative,Temperamental,Touchy,Uncreative,Unenvious,Unintellectual,Unsympathetic,Warm,Withdrawn
0,2,6,4,2,2,8,6,4,4,6,...,5,7,4,3,3,8,3,2,8,3
1,2,4,4,3,7,6,6,7,5,4,...,4,6,6,4,3,7,4,3,7,3
2,9,2,4,2,8,7,7,8,4,6,...,6,2,4,4,2,8,2,2,8,9
3,2,6,4,2,7,6,6,7,7,4,...,3,6,4,4,3,7,3,2,7,4
4,4,6,2,4,6,7,7,6,2,8,...,8,4,6,4,3,7,4,4,7,4


In [6]:
# count the missing values in each column; a non-zero count means at least one
# response did not supply that item
decision_df.isnull().sum()

Bashful           0
Bold              0
Careless          0
Cold              0
Complex           0
Cooperative       0
Creative          0
Deep              0
Disorganized      0
Efficient         0
Energetic         0
Envious           0
Extraverted       0
Fretful           0
Harsh             0
Imaginative       0
Inefficient       0
Intellectual      0
Jealous           0
Kind              0
Moody             0
Organized         0
Philosophical     0
Practical         0
Quiet             0
Relaxed           0
Rude              0
Shy               0
Sloppy            0
Sympathetic       0
Systematic        0
Talkative         0
Temperamental     0
Touchy            0
Uncreative        0
Unenvious         0
Unintellectual    0
Unsympathetic     0
Warm              0
Withdrawn         0
dtype: int64

Great, the data is complete and ready for analysis.

In [7]:
def reverse_score(score, scale_min=1, scale_max=9):
    """Reverse-key a rating given on a ``scale_min``..``scale_max`` scale.

    The reversed value is ``scale_min + scale_max - score``. With the default
    bounds this is ``10 - score``, which is what this function always
    computed before the bounds became parameters.

    Args:
        score: A single rating or a pandas Series of ratings.
        scale_min: Lowest value of the rating scale.
        scale_max: Highest value of the rating scale.

    Returns:
        The reverse-keyed rating(s), same shape as ``score``.
    """
    return (scale_min + scale_max) - score


def calculate_big_five_scores(df, scale_min=1, scale_max=9):
    """Sum item ratings into five domain scores per row.

    Each domain is the sum of its eight items; items flagged ``True`` in the
    mapping below are reverse-keyed with ``reverse_score`` before summing.

    Args:
        df: DataFrame with one column per item named in the mapping below.
        scale_min: Lowest value of the rating scale (used for reversal).
        scale_max: Highest value of the rating scale (used for reversal).

    Returns:
        DataFrame indexed like ``df`` with one ``miniMarker_simulated_*``
        column per domain.

    Raises:
        KeyError: If any required item column is absent from ``df``.
    """
    # Mapping of dimensions to their items; the flag says whether the item is
    # reverse-scored.
    dimensions = {
        'miniMarker_simulated_E': [
            ('Bashful', True), ('Bold', False), ('Energetic', False), ('Extraverted', False),
            ('Quiet', True), ('Shy', True), ('Talkative', False), ('Withdrawn', True),
        ],
        'miniMarker_simulated_A': [
            ('Cold', True), ('Cooperative', False), ('Harsh', True), ('Kind', False),
            ('Rude', True), ('Sympathetic', False), ('Unsympathetic', True), ('Warm', False),
        ],
        'miniMarker_simulated_C': [
            ('Careless', True), ('Disorganized', True), ('Efficient', False), ('Inefficient', True),
            ('Organized', False), ('Practical', False), ('Sloppy', True), ('Systematic', False),
        ],
        'miniMarker_simulated_N': [
            ('Envious', False), ('Fretful', False), ('Jealous', False), ('Moody', False),
            ('Relaxed', True), ('Temperamental', False), ('Touchy', False), ('Unenvious', True),
        ],
        'miniMarker_simulated_O': [
            ('Complex', False), ('Deep', False), ('Creative', False), ('Imaginative', False),
            ('Intellectual', False), ('Philosophical', False), ('Uncreative', True), ('Unintellectual', True),
        ],
    }

    # Report every absent item at once instead of failing on the first lookup.
    missing = [
        item
        for items in dimensions.values()
        for item, _ in items
        if item not in df.columns
    ]
    if missing:
        raise KeyError(f"Missing item columns: {missing}")

    scores = {}
    for dimension, items in dimensions.items():
        # Whole-column arithmetic: reverse_score works on a Series directly,
        # so no per-element apply is needed.
        keyed_items = [
            reverse_score(df[item], scale_min, scale_max) if reverse else df[item]
            for item, reverse in items
        ]
        scores[dimension] = sum(keyed_items)

    # Convert the scores dictionary to a DataFrame for easier viewing
    return pd.DataFrame(scores, index=df.index)

# compute the five domain sum scores for every row of item ratings
domain_score = calculate_big_five_scores(decision_df)

# preview the first rows of the domain scores
domain_score.head()

Unnamed: 0,miniMarker_simulated_E,miniMarker_simulated_A,miniMarker_simulated_C,miniMarker_simulated_N,miniMarker_simulated_O
0,57,63,47,27,46
1,50,56,39,31,48
2,14,63,47,31,61
3,49,58,30,31,55
4,44,53,61,35,48


In [8]:
# put the item-level ratings and the domain scores side by side
# (axis=1 adds columns and aligns rows on the shared index)
simulation_df = pd.concat([decision_df, domain_score], axis=1)

# preview the first 3 rows
simulation_df.head(3)

Unnamed: 0,Bashful,Bold,Careless,Cold,Complex,Cooperative,Creative,Deep,Disorganized,Efficient,...,Unenvious,Unintellectual,Unsympathetic,Warm,Withdrawn,miniMarker_simulated_E,miniMarker_simulated_A,miniMarker_simulated_C,miniMarker_simulated_N,miniMarker_simulated_O
0,2,6,4,2,2,8,6,4,4,6,...,8,3,2,8,3,57,63,47,27,46
1,2,4,4,3,7,6,6,7,5,4,...,7,4,3,7,3,50,56,39,31,48
2,9,2,4,2,8,7,7,8,4,6,...,8,2,2,8,9,14,63,47,31,61


In [9]:
# load soto's data
soto_df = pd.read_csv('study1_data_no_simulation.csv')

# The concatenation below matches rows purely by their position in the default
# index. If the simulated frame has a different number of rows (for example
# because some replies could not be parsed), rows would be padded with NaN or
# attached to the wrong participant without any error, so check first.
if len(soto_df) != len(simulation_df):
    raise ValueError(
        f"Row count mismatch: {len(soto_df)} participants vs "
        f"{len(simulation_df)} simulated responses; cannot align by position."
    )

# concatenate the two dataframes column-wise
df = pd.concat([soto_df, simulation_df], axis=1)

# preview the first 3 rows
df.head(3)

Unnamed: 0,case_id,age,sex,ethnicity,rel_acquaintance,rel_friend,rel_roommate,rel_boygirlfriend,rel_relative,rel_other,...,Unenvious,Unintellectual,Unsympathetic,Warm,Withdrawn,miniMarker_simulated_E,miniMarker_simulated_A,miniMarker_simulated_C,miniMarker_simulated_N,miniMarker_simulated_O
0,1,27.0,M,2.0,,,,,,,...,8,3,2,8,3,57,63,47,27,46
1,2,26.0,M,3.0,,,,,,,...,7,4,3,7,3,50,56,39,31,48
2,3,24.0,F,4.0,,,,,,,...,8,2,2,8,9,14,63,47,31,61


In [10]:
# save the merged dataframe to a csv file; index=False leaves the row index
# out of the file
df.to_csv('study1_with_simulation_result.csv', index=False)