# Process Mini-Marker Simulation Data

In [1]:
import pandas as pd

In [2]:
# load bfi_to_mini_temp0.json (a list of objects, each with 'choices' -> 'message' -> 'content')
import json
# JSON text is UTF-8; name the encoding so the read does not depend on the platform default.
with open('bfi_to_mini_temp0.json', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
# Markdown code-fence markers that may wrap the JSON payload of a message.
fence_open = "```json\n"
fence_close = "\n```"

response = []

for obj in data:
    # Visit every entry under 'choices' of each loaded object
    for choice in obj['choices']:
        # Surrounding whitespace is irrelevant to JSON but would defeat the fence check below
        content = choice['message']['content'].strip()
        # Slice by the markers' real lengths so the whole fence (including its newline) is removed
        if content.startswith(fence_open) and content.endswith(fence_close):
            content = content[len(fence_open):-len(fence_close)]
        # Parse the remaining text as JSON and keep the parsed object
        response.append(json.loads(content))

In [4]:
# Tabulate the parsed responses: each entry of `response` becomes one row.
decision_df = pd.DataFrame(data=response)

# Write the table to disk without the row index.
decision_df.to_csv(path_or_buf='mini_marker_response.csv', index=False)

In [5]:
# Preview the first five rows of the item-level responses.
decision_df.head(n=5)

Unnamed: 0,Bashful,Bold,Careless,Cold,Complex,Cooperative,Creative,Deep,Disorganized,Efficient,...,Systematic,Talkative,Temperamental,Touchy,Uncreative,Unenvious,Unintellectual,Unsympathetic,Warm,Withdrawn
0,4,2,3,2,4,3,3,4,5,3,...,2,3,4,2,3,4,2,2,4,4
1,2,4,2,3,2,4,2,2,1,3,...,5,3,4,4,3,4,3,3,4,2
2,4,2,2,4,9,7,7,9,5,4,...,5,2,4,4,2,8,2,2,7,4
3,2,8,4,3,8,8,7,8,4,4,...,4,8,4,4,4,7,4,3,7,2
4,4,2,4,1,8,9,6,8,4,4,...,4,3,4,4,2,8,4,1,9,4


In [6]:
# Count missing entries per column; all zeros means every item was answered.
decision_df.isna().sum()

Bashful           0
Bold              0
Careless          0
Cold              0
Complex           0
Cooperative       0
Creative          0
Deep              0
Disorganized      0
Efficient         0
Energetic         0
Envious           0
Extraverted       0
Fretful           0
Harsh             0
Imaginative       0
Inefficient       0
Intellectual      0
Jealous           0
Kind              0
Moody             0
Organized         0
Philosophical     0
Practical         0
Quiet             0
Relaxed           0
Rude              0
Shy               0
Sloppy            0
Sympathetic       0
Systematic        0
Talkative         0
Temperamental     0
Touchy            0
Uncreative        0
Unenvious         0
Unintellectual    0
Unsympathetic     0
Warm              0
Withdrawn         0
dtype: int64

Great, the data is complete and ready for analysis.

In [7]:
def reverse_score(score, scale_min=1, scale_max=9):
    """Reverse-key a rating on a bounded response scale.

    The reversed value is ``(scale_min + scale_max) - score``, so the lowest
    rating maps to the highest and vice versa. With the defaults this is
    ``10 - score``, i.e. a 1-9 scale.
    NOTE(review): presumably the 9-point Mini-Marker rating scale; confirm
    against the prompt that produced the responses.

    Parameters
    ----------
    score : number or array-like supporting subtraction (e.g. a pandas Series)
        Rating(s) to reverse.
    scale_min, scale_max : number, optional
        Endpoints of the response scale.

    Returns
    -------
    Same kind of object as ``score``, holding the reversed rating(s).
    """
    return (scale_min + scale_max) - score

def calculate_big_five_scores(df):
    """Sum keyed Mini-Marker item ratings into five domain scores.

    Parameters
    ----------
    df : pandas.DataFrame
        One numeric column per adjective listed in ``dimensions`` below;
        extra columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``, with columns ``miniMarker_simulated_E``, ``_A``,
        ``_C``, ``_N`` and ``_O``. Each value is the sum of that domain's
        eight items, where items flagged ``True`` are first passed through
        ``reverse_score``.

    Raises
    ------
    KeyError
        If any required adjective column is absent from ``df``; the message
        lists every missing column.
    """
    # Mapping of dimensions to their items (with indication of whether to reverse-score the item)
    dimensions = {
        'miniMarker_simulated_E': [
            ('Bashful', True), ('Bold', False), ('Energetic', False), ('Extraverted', False),
            ('Quiet', True), ('Shy', True), ('Talkative', False), ('Withdrawn', True),
        ],
        'miniMarker_simulated_A': [
            ('Cold', True), ('Cooperative', False), ('Harsh', True), ('Kind', False),
            ('Rude', True), ('Sympathetic', False), ('Unsympathetic', True), ('Warm', False),
        ],
        'miniMarker_simulated_C': [
            ('Careless', True), ('Disorganized', True), ('Efficient', False), ('Inefficient', True),
            ('Organized', False), ('Practical', False), ('Sloppy', True), ('Systematic', False),
        ],
        'miniMarker_simulated_N': [
            ('Envious', False), ('Fretful', False), ('Jealous', False), ('Moody', False),
            ('Relaxed', True), ('Temperamental', False), ('Touchy', False), ('Unenvious', True),
        ],
        'miniMarker_simulated_O': [
            ('Complex', False), ('Deep', False), ('Creative', False), ('Imaginative', False),
            ('Intellectual', False), ('Philosophical', False), ('Uncreative', True), ('Unintellectual', True),
        ],
    }

    # Fail early with the complete list of absent items instead of stopping at the first one.
    missing = [
        item
        for items in dimensions.values()
        for item, _reverse in items
        if item not in df.columns
    ]
    if missing:
        raise KeyError(f"Missing Mini-Marker item columns: {missing}")

    scores = {}
    for dimension, items in dimensions.items():
        # reverse_score is applied to the whole column at once rather than element by element.
        keyed_items = [
            reverse_score(df[item]) if reverse else df[item]
            for item, reverse in items
        ]
        # Built-in sum starts from 0, i.e. 0 + item_1 + ... + item_8 (NaN propagates).
        scores[dimension] = sum(keyed_items)

    # Convert the scores dictionary to a DataFrame for easier viewing
    return pd.DataFrame(scores, index=df.index)

# Score the five domains from the item-level responses.
domain_score = decision_df.pipe(calculate_big_five_scores)

# Preview the first five rows of domain scores.
domain_score.head(n=5)

Unnamed: 0,miniMarker_simulated_E,miniMarker_simulated_A,miniMarker_simulated_C,miniMarker_simulated_N,miniMarker_simulated_O
0,36,48,35,35,33
1,47,45,50,34,29
2,36,56,47,37,62
3,63,57,40,31,56
4,36,71,40,30,50


In [8]:
# Place the domain scores to the right of the item-level responses.
simulation_df = pd.concat(objs=[decision_df, domain_score], axis='columns')

# Show the first 3 rows of the widened table.
simulation_df.head(n=3)

Unnamed: 0,Bashful,Bold,Careless,Cold,Complex,Cooperative,Creative,Deep,Disorganized,Efficient,...,Unenvious,Unintellectual,Unsympathetic,Warm,Withdrawn,miniMarker_simulated_E,miniMarker_simulated_A,miniMarker_simulated_C,miniMarker_simulated_N,miniMarker_simulated_O
0,4,2,3,2,4,3,3,4,5,3,...,4,2,2,4,4,36,48,35,35,33
1,2,4,2,3,2,4,2,2,1,3,...,4,3,3,4,2,47,45,50,34,29
2,4,2,2,4,9,7,7,9,5,4,...,8,2,2,7,4,36,56,47,37,62


# Process BFI2 input data 

In [9]:
# Read facet_lvl_simulated_data.csv into a DataFrame and display it.
bfi2_df = pd.read_csv(filepath_or_buffer='facet_lvl_simulated_data.csv')
bfi2_df

Unnamed: 0,bfi1,bfi16,bfi31,bfi46,bfi6,bfi21,bfi36,bfi51,bfi11,bfi26,...,bfi_n_depression,bfi_n_emotional_volatility,bfi_o_intellectual_curiosity,bfi_o_aesthetic_sensitivity,bfi_o_creative_imagination,bfi_e,bfi_a,bfi_c,bfi_n,bfi_o
0,3.0,2.0,4.0,3.0,2.0,2.0,3.0,5.0,3.0,2.0,...,2.00,4.50,4.00,4.50,3.25,2.666667,3.833333,2.333333,3.750000,3.916667
1,4.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,4.0,...,2.75,4.50,2.50,3.25,3.00,3.250000,2.833333,4.083333,3.500000,2.916667
2,3.0,3.0,4.0,1.0,1.0,2.0,3.0,4.0,2.0,3.0,...,2.00,4.50,3.50,4.75,4.25,2.666667,2.750000,3.833333,3.833333,4.166667
3,5.0,1.0,2.0,5.0,5.0,5.0,1.0,1.0,2.0,2.0,...,4.25,2.25,4.75,4.25,3.75,4.583333,3.166667,2.916667,3.583333,4.250000
4,3.0,4.0,3.0,3.0,1.0,2.0,4.0,3.0,2.0,4.0,...,3.00,1.75,4.50,4.00,4.50,2.750000,4.333333,2.500000,3.000000,4.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1.0,3.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.25,4.00,4.75,3.75,4.50,3.000000,4.666667,3.250000,3.833333,4.333333
196,4.0,1.0,2.0,4.0,5.0,3.0,1.0,2.0,3.0,4.0,...,3.00,2.00,3.50,4.50,3.75,3.750000,2.250000,3.666667,2.250000,3.916667
197,2.0,3.0,5.0,2.0,5.0,4.0,2.0,2.0,1.0,3.0,...,3.50,3.50,4.50,3.50,4.75,3.500000,3.333333,2.583333,3.583333,4.250000
198,3.0,2.0,4.0,3.0,3.0,3.0,1.0,3.0,2.0,2.0,...,2.50,2.75,3.00,2.50,3.00,3.583333,3.666667,3.916667,2.666667,2.833333


In [10]:
# combine bfi2_df and simulation_df side by side; rows are paired by index label.
# pd.concat(axis=1) aligns on the index and pads with NaN where the frames do not
# cover the same rows, so refuse to continue if the two indexes differ.
if not bfi2_df.index.equals(simulation_df.index):
    raise ValueError(
        "bfi2_df and simulation_df do not share the same row index "
        f"(lengths {len(bfi2_df)} vs {len(simulation_df)}); "
        "they must line up row for row before being combined."
    )
combined_df = pd.concat([bfi2_df, simulation_df], axis=1)
combined_df

Unnamed: 0,bfi1,bfi16,bfi31,bfi46,bfi6,bfi21,bfi36,bfi51,bfi11,bfi26,...,Unenvious,Unintellectual,Unsympathetic,Warm,Withdrawn,miniMarker_simulated_E,miniMarker_simulated_A,miniMarker_simulated_C,miniMarker_simulated_N,miniMarker_simulated_O
0,3.0,2.0,4.0,3.0,2.0,2.0,3.0,5.0,3.0,2.0,...,4,2,2,4,4,36,48,35,35,33
1,4.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,4.0,...,4,3,3,4,2,47,45,50,34,29
2,3.0,3.0,4.0,1.0,1.0,2.0,3.0,4.0,2.0,3.0,...,8,2,2,7,4,36,56,47,37,62
3,5.0,1.0,2.0,5.0,5.0,5.0,1.0,1.0,2.0,2.0,...,7,4,3,7,2,63,57,40,31,56
4,3.0,4.0,3.0,3.0,1.0,2.0,4.0,3.0,2.0,4.0,...,8,4,1,9,4,36,71,40,30,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1.0,3.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,...,5,1,1,4,4,37,53,41,33,37
196,4.0,1.0,2.0,4.0,5.0,3.0,1.0,2.0,3.0,4.0,...,6,4,4,6,2,58,44,40,32,50
197,2.0,3.0,5.0,2.0,5.0,4.0,2.0,2.0,1.0,3.0,...,6,4,4,6,8,40,48,40,34,56
198,3.0,2.0,4.0,3.0,3.0,3.0,1.0,3.0,2.0,2.0,...,4,3,4,4,4,39,43,49,30,36


# Analysis of correlation between BFI2 and Mini-Marker scores

In [11]:
from scipy.stats import pearsonr

# Pearson r between each simulated Mini-Marker domain score and the BFI-2 domain
# score of the same trait, in E, A, C, N, O order. pearsonr returns (r, p-value);
# only r (element 0) is kept.
r_bfi_tda_e, r_bfi_tda_a, r_bfi_tda_c, r_bfi_tda_n, r_bfi_tda_o = (
    pearsonr(combined_df[mini_marker_col], combined_df[bfi2_col])[0]
    for mini_marker_col, bfi2_col in [
        ('miniMarker_simulated_E', 'bfi_e'),
        ('miniMarker_simulated_A', 'bfi_a'),
        ('miniMarker_simulated_C', 'bfi_c'),
        ('miniMarker_simulated_N', 'bfi_n'),
        ('miniMarker_simulated_O', 'bfi_o'),
    ]
)

r_bfi_tda_e, r_bfi_tda_a, r_bfi_tda_c, r_bfi_tda_n, r_bfi_tda_o

(0.8871256075892594,
 0.591491760960283,
 0.8435240493692607,
 0.6210621712583648,
 0.45243618778784633)

In [12]:
# Unweighted arithmetic mean of the five domain-level Pearson correlations (E, A, C, N, O).
r_bfi_tda = (r_bfi_tda_e + r_bfi_tda_a + r_bfi_tda_c + r_bfi_tda_n + r_bfi_tda_o) / 5
r_bfi_tda

0.6791279553930029