# Setup

In [22]:
# general python packages
import json
import os
import sys
from itertools import combinations
import pickle
import re

# general data processing/analysis
import pandas as pd
import numpy as np
from functools import reduce
import pingouin as pg # chronbach's alpha
from scipy.stats import ttest_ind_from_stats

# used for BBC data processing
import pyreadstat
import pycountry

# used for API data collection
import requests
sys.path.append('shared')
from api_request_parallel_processor import process_api_requests_from_file
from dotenv import load_dotenv
load_dotenv()

True

In [23]:
# Get all scales item info as df - used throughout during data processing
all_scales_item_info_df = pd.DataFrame()
for scale in ["bfi", "panas", "bpaq", "sscs"]:
    with open(os.path.join("shared", f"{scale}_item_info.json"), 'r') as f:
        scale_item_info_df = pd.DataFrame(json.load(f)).transpose().reset_index().rename(columns={"index": "item_index"})
    scale_item_info_df["scale"] = scale.upper()
    # scale_item_info_df["item_index"] = scale_item_info_df["item_index"].astype(str)

    all_scales_item_info_df = pd.concat([all_scales_item_info_df, scale_item_info_df]).reset_index(drop=True)

all_scales_item_info_df["item_index"] = all_scales_item_info_df["item_index"].astype(int)

# BBC data processing

The full BBC data is not shareable, so we only load the processed files that we need. The original data was held in a bbc_data folder, so every time the original data were used, you will see that check so you can see the code used to process the data. However, that code won't be run as this folder is not shared publicly.

This pattern of checking for the `bbc_data` folder and loading only sharable files repeats throughout this codebook.

In [24]:
if not os.path.exists(os.path.join("data", "bbc_data")):
    # we can only share the bbc_meta file - this shows all variables in the BBC dataset, as well as value labels etc
    # for more info and methods, see https://ofajardo.github.io/pyreadstat_documentation/_build/html/index.html#metadata-object-description
    with open(os.path.join("data", "bbc_data_for_sharing", "bbc_meta.pickle"), "rb") as f:
        bbc_meta = pickle.load(f)
else:
    load_from_picklefile = True

    if load_from_picklefile:
        with open(os.path.join("data", "bbc_data", "raw_bbc_df, bbc_meta, bbc_df.pickle"), "rb") as f:
            raw_bbc_df, bbc_meta, bbc_df = pickle.load(f)
    else:
        raw_bbc_df, bbc_meta = pyreadstat.read_sav(os.path.join("data", "bbc_data", "raw_bbc_data.sav"))

        # a bit confusing here as we lightly process the raw bbc data, but this is needed for the later filtering/sampling
        raw_bbc_df['uid'] = raw_bbc_df['uid'].str.strip()
        # keep unique user id entries - only cases where all BFIs are non-NA and take the earliest entry
        raw_bbc_df = raw_bbc_df.dropna(subset=[e for e in raw_bbc_df.columns if e.startswith('bfi_')]).sort_values(by=['uid', 'time_st']).drop_duplicates(subset='uid')

        bbc_df = raw_bbc_df.copy()
        bbc_df = bbc_df[["uid"] + [e for e in bbc_df.columns if e.startswith("bfi_")]].melt(id_vars='uid', var_name='item_index', value_name='numeric_response')
        bbc_df["item_index"] = bbc_df["item_index"].apply(lambda x: x.split("_")[1]).astype(int)
        bbc_df = pd.merge(bbc_df, all_scales_item_info_df.query("scale=='BFI'").drop(columns=['scale']), on="item_index", how="left")
        # NB: We assume that scores are not to be reversed as per the syntax - reliability and scoring.sps ---
        bbc_df['response_reversed'] = bbc_df['numeric_response']

        # these were files used during development for faster loading
        with open(os.path.join("data", "bbc_data", "raw_bbc_df, bbc_meta, bbc_df.pickle"), "wb") as f:
            pickle.dump([raw_bbc_df, bbc_meta, bbc_df], f)

        # these are files that are shared
        with open(os.path.join("data", "bbc_data_for_sharing", "bbc_meta.pickle"), "wb") as f:
            pickle.dump(bbc_meta, f)

## Creating silicon samples from BBC data

In [25]:
if not os.path.exists(os.path.join("data", "bbc_data")):
    with open(os.path.join("data", "bbc_data_for_sharing", "bbc_silicon_samples_df.pickle"), "rb") as f:
        bbc_silicon_samples_df = pickle.load(f)
else:
    sub_bbc_df = raw_bbc_df[["uid", "age", "country", "ethnic", "m_schl", "f_schl", "n_sib", "sex",
            "st_pub", "occ_sta", "occ_cat", "income", 
            "rstat_1", "chldrn"]].copy()

    sub_bbc_df['country_name'] = sub_bbc_df['country'].apply(lambda x: pycountry.countries.get(alpha_2=x).name if pycountry.countries.get(alpha_2=x) else np.nan)

    sub_bbc_df = sub_bbc_df.query('(age>=18 and age<=99 and age.notna()) and '
                '(ethnic in [1,2,3,4,5,6,8]) and '
                '(country_name.notna()) and '
                '(m_schl in [1,2,3,4,5,6]) and '
                '(f_schl in [1,2,3,4,5,6]) and '
                '(n_sib.notna()) and '
                '(sex.notna()) and '
                '(st_pub.notna()) and '
                '(occ_sta.notna()) and '
                '(occ_cat.notna() and occ_cat!=23) and '
                '(income in [1,2,3,4,5,6,7]) and '
                '(rstat_1 in [0,1]) and '
                '(chldrn.notna())'
    ).reset_index(drop=True)

    # function code courtesy of chatGPT
    # if you give a list of categories, it will do stratified sampling based on those categories
    # an empty list will results in proportional sampling across all categories
    def stratified_sampling(df, categories, n_samples):
        # Create a composite key combining all categories
        df['composite_key'] = df[categories].apply(lambda x: '_'.join(x.astype(str)), axis=1)
        final_sample_composite = (df.groupby('composite_key', group_keys=False)
                                    .apply(lambda x: x.sample(min(len(x), n_samples // len(df['composite_key'].unique())), random_state=0)))
        # If the sample is smaller than required, randomly sample additional rows
        if len(final_sample_composite) < n_samples:
            additional_samples = df.drop(final_sample_composite.index).sample(n_samples - len(final_sample_composite), random_state=0)
            final_sample_composite = pd.concat([final_sample_composite, additional_samples])

        return final_sample_composite.drop(columns=['composite_key']).reset_index(drop=True)
        
    # random sampling, i.e. proportional... reproducible
    bbc_silicon_samples_df = stratified_sampling(sub_bbc_df, [], 1000)

    with open(os.path.join("data", "bbc_data_for_sharing", "bbc_silicon_samples_df.pickle"), "wb") as f:
        pickle.dump(bbc_silicon_samples_df, f)

  .apply(lambda x: x.sample(min(len(x), n_samples // len(df['composite_key'].unique())), random_state=0)))


Adding the sentences to the `bbc_silicon_samples_df`

NB: Originally, `country` should have been used but was inadvertently omitted when creating the personas.

In [26]:
def get_sentence(var_name, value):
    if var_name=="age":
        return f"I am {int(value)} years old."
    elif var_name=="ethnic":
        return f"My ethnic background is {bbc_meta.variable_value_labels['ethnic'][value].strip()}."
    elif var_name=="m_schl" or var_name=="f_schl":
        parent = "mother" if var_name=="f_schl" else "father"
        if value==1:
            sentence = f"My {parent} did not complete GCSE / CSE / O-Levels."
        elif value==2:
            sentence = f"The highest level of formal schooling my {parent} completed was GCSE / CSE / O-levels."
        elif value==3:
            sentence = f"The highest level of formal schooling my {parent} completed was a post-16 vocational course."
        elif value==4:
            sentence = f"The highest level of formal schooling my {parent} completed was A-Levels."
        elif value==5:
            sentence = f"The highest level of formal schooling my {parent} completed was an Undergraduate degree."
        elif value==6:
            sentence = f"The highest level of formal schooling my {parent} completed was a Postgraduate degree."
        return sentence
    elif var_name=="n_sib":
        if value==0:
            return f"I do not have any siblings."
        elif value>0 and value<6:
            return f"I have {int(value)} siblings."
        elif value==6:
            return f"I have more than 5 siblings."
    elif var_name=="sex":
        return f"I am {bbc_meta.variable_value_labels['sex'][value].strip()}."
    elif var_name=="st_pub":
        return f"The majority of my education up to the age of 18 was in a {bbc_meta.variable_value_labels['st_pub'][value].strip()} school."
    elif var_name=="occ_sta":
        return f"My occupational status can be defined as {bbc_meta.variable_value_labels['occ_sta'][value].strip()}."
    elif var_name=="occ_cat":
        return f"I work in {bbc_meta.variable_value_labels['occ_cat'][value].strip()}."
    elif var_name=="income":
        sentence = f"I earn {bbc_meta.variable_value_labels['income'][value].strip()}."
        # replace £ before value with GBP afterwards
        sentence = re.sub(r'£(\d[\d,.]*)', r'\1GBP', sentence)
        return sentence
    elif var_name=="rstat_1":
        if value==0:
            return f"I am currently not in an intimate relationship."
        elif value==1:
            return f"I am currently in an intimate relationship."
    elif var_name=="chldrn":
        if value==0:
            return f"I do not have any children."
        elif value>0 and value<6:
            return f"I have {int(value)} children."
        elif value==6:
            return f"I have more than 5 children."
    return ""

def row_to_persona_description(row):
    person_description = ""
    for col_name, value in row.items():
        person_description += get_sentence(col_name, value) + " "
    return person_description.strip()

bbc_silicon_samples_df['persona_description'] = bbc_silicon_samples_df.apply(row_to_persona_description, axis=1).reset_index(drop=True)
bbc_silicon_samples_df.head()

Unnamed: 0,uid,age,country,ethnic,m_schl,f_schl,n_sib,sex,st_pub,occ_sta,occ_cat,income,rstat_1,chldrn,country_name,persona_description
0,e1df2ed9e02e998d7e3efa037a4926a7abaddb6e,38.0,GB,8.0,1.0,1.0,1.0,1.0,1.0,3.0,8.0,6.0,1.0,2.0,United Kingdom,I am 38 years old. My ethnic background is Wh...
1,21f1b6f7c6ced9e3f4c805d3cccaee6de4a7c657,50.0,GB,8.0,2.0,1.0,2.0,1.0,1.0,3.0,19.0,4.0,1.0,3.0,United Kingdom,I am 50 years old. My ethnic background is Wh...
2,62fdeb795ca7af9f0bf0cbf8498e8501aabc612a,41.0,GB,8.0,4.0,3.0,1.0,1.0,1.0,3.0,8.0,7.0,1.0,1.0,United Kingdom,I am 41 years old. My ethnic background is Wh...
3,fbc77cf36c46b0f79f3afd306748638c936033d3,26.0,GB,8.0,2.0,2.0,0.0,0.0,1.0,3.0,17.0,6.0,1.0,0.0,United Kingdom,I am 26 years old. My ethnic background is Wh...
4,7d2c2587f29b5a7c543c424c7fc3216c0e0ec923,33.0,GB,8.0,5.0,5.0,1.0,1.0,1.0,3.0,4.0,7.0,1.0,0.0,United Kingdom,I am 33 years old. My ethnic background is Wh...


# Data collection

Loading the necesssary files

In [27]:
with open(os.path.join('shared', 'scales_items.json'), 'r', encoding='utf-8') as f:
    scales_items = json.load(f)
    print(f"Total number of items across all scales: {len(scales_items)}")

with open(os.path.join('shared', 'generic_persona_descriptions.json'), 'r', encoding='utf-8') as f:
    generic_persona_descriptions = json.load(f)
    print(f"Number of generic persona descriptions: {len(generic_persona_descriptions)}")

Total number of items across all scales: 104
Number of generic persona descriptions: 150


Setting up parameters for the requests

In [28]:
persona_instruction = "For the following task, respond in a way that matches this description:"
test_instruction = "Evaluating the statement,"

item_postambles = {
    "BFI": 'please indicate the extent to which you agree or disagree on a scale from 1 to 5 (where 1 = "disagree strongly", 2 = "disagree a little", 3 = "neither agree nor disagree", 4 = "agree a little", and 5 = "agree strongly"):',
    "PANAS": 'indicate to what extent you agree on a scale from 1 to 5 (where 1 = "very slightly or not at all agree", 2 = "agree a little", 3 = "agree moderately", 4 = "agree quite a bit", and 5 = "agree extremely"):',
    "SSCS": 'please decide to what extent this describes you on a scale from 1 to 5 (where 1 = "strongly disagree", 2 = "disagree", 3 = "neither agree nor disagree", 4 = "agree", 5 = "strongly agree"):',
    "BPAQ": 'rate how characteristic this is of you on a scale from 1 to 5 (where 1 = "extremely uncharacteristic of me", 2 = "uncharacteristic of me", 3 = "neither characteristic nor uncharacteristic of me", 4 = "characteristic of me", and 5 = "extremely characteristic of me"):',
    }
items = []
for item in scales_items:
    new_item = item.copy()
    new_item["item_postamble"] = item_postambles[item["scale"]]
    items.append(new_item)

## Creating the requests for the generic persona descriptions

These are not meant to be rerun (but could be, outputs are deterministic)

In [29]:
if False:
    generic_gpt35_requests = []
    generic_gpt4_requests = []

    request_id = 0
    for persona_description in generic_persona_descriptions:
        for item in items:
            prompt = f'{persona_instruction} "{persona_description}" {test_instruction} "{item["item"]}", {item["item_postamble"]}'
            
            generic_gpt35_requests.append({
                "model": "gpt-3.5-turbo-1106", 
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0,
                "max_tokens": 50,
                "metadata": {"persona_description": persona_description, "item": item, "request_id": f"{request_id}"}
            })

            generic_gpt4_requests.append({
                "model": "gpt-4-1106-preview", 
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0,
                "max_tokens": 250,
                "metadata": {"persona_description": persona_description, "item": item, "request_id": f"{request_id}"}
            })

            request_id += 1
        
    with open(os.path.join("data", "raw_data", "generic_gpt35_requests.jsonl"), "w") as f:
        for request in generic_gpt35_requests:
            json_string = json.dumps(request)
            f.write(json_string + "\n")

    with open(os.path.join("data", "raw_data", "generic_gpt4_requests.jsonl"), "w") as f:
        for request in generic_gpt4_requests:
            json_string = json.dumps(request)
            f.write(json_string + "\n")

## Creating the requests for the silicon samples

NB: The BFI requests for silicon sampling GPT3.5 were created and run previously, on 23/12/2023. The other requests were run on 20/01/2024.

The files (both the `requests` file and the `requests_results`) are merged manually.

In [30]:
if False:
    silicon_gpt35_requests = []
    silicon_gpt4_requests = []

    gpt35_request_id = 44000
    gpt4_request_id = 0

    for ind, row in bbc_silicon_samples_df.iterrows():
        uid = row['uid']
        persona_description = row['persona_description']
        for item in items:
            prompt = f'{persona_instruction} "{persona_description}" {test_instruction} "{item["item"]}", {item["item_postamble"]}'
            
            if item['scale'] != 'BFI':
                silicon_gpt35_requests.append({
                    "model": "gpt-3.5-turbo-1106", 
                    "messages": [{"role": "user", "content": prompt}],
                    "temperature": 0,
                    "max_tokens": 50,
                    "metadata": {"uid": uid, "item": item, "request_id": f"{gpt35_request_id}"}
                })
                gpt35_request_id += 1
            
            silicon_gpt4_requests.append({
                "model": "gpt-4-1106-preview", 
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0,
                "max_tokens": 250,
                "metadata": {"uid": uid, "item": item, "request_id": f"{gpt4_request_id}"}
            })
            gpt4_request_id += 1

    with open(os.path.join("data", "raw_data", "silicon_gpt35_requests.jsonl"), "w") as f:
        for request in silicon_gpt35_requests:
            json_string = json.dumps(request)
            f.write(json_string + "\n")
        
    with open(os.path.join("data", "raw_data", "silicon_gpt4_requests.jsonl"), "w") as f:
        for request in silicon_gpt4_requests:
            json_string = json.dumps(request)
            f.write(json_string + "\n")

## Splitting the silicon files in order to upload to Github

The size of the original `silicon_gpt35_requests.jsonl`, `silicon_gpt35_requests_results.jsonl`, `silicon_gpt4_requests.jsonl`, and `silicon_gpt4_requests_results.jsonl` is >100MB, so here we split them into 3 files. We merge them back later on during data processing.

In [74]:
if False:
    for filename in ["silicon_gpt35_requests", "silicon_gpt35_requests_results", "silicon_gpt4_requests", "silicon_gpt4_requests_results"]:
        curr_file_all_requests = []
        with open(os.path.join("data", "raw_data", f"{filename}.jsonl"), "r", encoding="utf-8") as f:
            for request in f:
                curr_file_all_requests.append(json.loads(request.strip()))

        num_requests = len(curr_file_all_requests)
        num_files = 3
        # Calculate the number of requests per file considering the remainder
        num_requests_per_file = num_requests // num_files
        remainder = num_requests % num_files

        for i in range(num_files):
            start_index = i * num_requests_per_file + min(i, remainder)
            end_index = start_index + num_requests_per_file + (1 if i < remainder else 0)
            
            with open(os.path.join("data", "raw_data", f"{filename}_part{i}.jsonl"), "w") as f:
                for request in curr_file_all_requests[start_index:end_index]:
                    f.write(json.dumps(request) + "\n")

## Executing the API requests

This uses parallel processing as per OpenAI's recommendation: https://github.com/openai/openai-cookbook/blob/main/examples/api_request_parallel_processor.py

Note that the `api_request_parallel_processor.py` has been modified!

Creating the requests from above replaces the contents of the entire requests file, while executing the requests only appends to the results file (`save_filepath`).

In order to rerun, you will need to create a .env file in the root directory of the project with your OpenAI key:<br>OPENAI_API_KEY=< YOUR API KEY >

Also, uncomment the last line to rerun; commented out for safety.

In [32]:
args = {
    'requests_filepath': os.path.join("data", "raw_data", "generic_gpt35_requests.jsonl"),
    'save_filepath': os.path.join("data", "raw_data", "generic_gpt35_requests_results.jsonl"),

    # 'requests_filepath': os.path.join("data", "raw_data", "generic_gpt4_requests.jsonl"),
    # 'save_filepath': os.path.join("data", "raw_data", "generic_gpt4_requests_results.jsonl"),

    # 'requests_filepath': os.path.join("data", "raw_data", "silicon_gpt35_requests.jsonl"),
    # 'save_filepath': os.path.join("data", "raw_data", "silicon_gpt35_requests_results.jsonl"),

    # 'requests_filepath': os.path.join("data", "raw_data", "silicon_gpt4_requests.jsonl"),
    # 'save_filepath': os.path.join("data", "raw_data", "silicon_gpt4_requests_results.jsonl"),

    'request_url': 'https://api.openai.com/v1/chat/completions',
    'api_key': os.getenv("OPENAI_API_KEY"),
    'max_requests_per_minute': 1,
    'max_tokens_per_minute': 500_000,
    'token_encoding_name': 'cl100k_base',
    'max_attempts': 1,
    'logging_level': 30  # This corresponds to logging.INFO
}

# await process_api_requests_from_file(**args)  # Since this is an async function, use await

# Data processing

In [33]:
# useed to get the first digit from chatGPT's responses
def get_first_digit(response):
    for char in response:
        if char.isdigit():
            return int(char)
    return None
    
# used to process the raw responses from the API into a dataframe
def get_df_from_requests_results(requests_filepath, remove_out_of_bounds_responses=True):
    print(f"\nProcessing: {os.path.split(requests_filepath)[1]}")
    
    requests_results=[]
    with open(requests_filepath, 'r', encoding='utf-8') as f:
        for line in f:
            requests_results.append(json.loads(line.strip()))

    df = pd.DataFrame()

    # transforming persona_descriptions to unique user ids for study 1
    if "generic" in requests_filepath:
        df['uid'] = [e[2]['persona_description'] for e in requests_results]
        df['uid'] = df['uid'].astype('category')
        df['uid'] = df['uid'].cat.codes
    else:
        df['uid'] = [e[2]['uid'] for e in requests_results]

    df['request_id'] = [int(e[2]['request_id']) for e in requests_results]
    df['scale'] = [e[2]['item']['scale'] for e in requests_results]
    df['item_index'] = [e[2]['item']['index'] for e in requests_results]
    df['raw_response'] = [e[1]['choices'][0]['message']['content'] for e in requests_results]
    df['first_token_is_digit'] = df['raw_response'].apply(lambda x: x[0].isdigit())
    df['finish_reason'] = [e[1]['choices'][0]['finish_reason'] for e in requests_results]
    df['numeric_response'] = df['raw_response'].apply(get_first_digit)

    out_of_bound_responses_df = df[(df['numeric_response'] < 1) | (df['numeric_response'] > 5)]
    print(f"----Number of responses outside of 1-5 range: {len(out_of_bound_responses_df)} ({np.round(len(out_of_bound_responses_df)/len(df)*100, 2)}%)")
    if remove_out_of_bounds_responses and len(out_of_bound_responses_df) > 0:
        df.loc[(df['numeric_response'] < 1) | (df['numeric_response'] > 5), 'numeric_response'] = None
        print("--------Responses outside of 1-5 range removed!")

    # add scale item info - reversed & dimension
    df = pd.merge(df, all_scales_item_info_df, on=["scale", "item_index"], how="left")
    df['response_reversed'] = np.where(df['reversed'], 6 - df['numeric_response'], df['numeric_response'])

    df = df.sort_values('request_id').reset_index(drop=True)
    return df

Above we did split the requests_results of silicon personas into 3, so here we put them back together before processing. For completeness sake, we also put together the requests, not just the requests_results.

In [85]:
for filename in ["silicon_gpt35_requests", "silicon_gpt35_requests_results", "silicon_gpt4_requests", "silicon_gpt4_requests_results"]:
    # we will check if the file exists first and delete it, as otherwise the writing may fail
    if os.path.exists(os.path.join("data", "raw_data", f"{filename}.jsonl")):
        os.remove(os.path.join("data", "raw_data", f"{filename}.jsonl"))

    num_files = 3

    curr_file_all_requests = []
    for i in range(num_files):
        with open(os.path.join("data", "raw_data", f"{filename}_part{i}.jsonl"), "r", encoding="utf-8") as f:
            for request in f:
                curr_file_all_requests.append(json.loads(request))
    
    with open(os.path.join("data", "raw_data", f"{filename}.jsonl"), "w") as f:
        for request in curr_file_all_requests:
            f.write(json.dumps(request) + "\n")

In [86]:
generic_gpt35_df = get_df_from_requests_results(os.path.join("data", "raw_data", "generic_gpt35_requests_results.jsonl"))
generic_gpt4_df = get_df_from_requests_results(os.path.join("data", "raw_data", "generic_gpt4_requests_results.jsonl"))
silicon_gpt35_df = get_df_from_requests_results(os.path.join("data", "raw_data", "silicon_gpt35_requests_results.jsonl"))
silicon_gpt4_df = get_df_from_requests_results(os.path.join("data", "raw_data", "silicon_gpt4_requests_results.jsonl"))


Processing: generic_gpt35_requests_results.jsonl
----Number of responses outside of 1-5 range: 0 (0.0%)

Processing: generic_gpt4_requests_results.jsonl
----Number of responses outside of 1-5 range: 86 (0.55%)
--------Responses outside of 1-5 range removed!

Processing: silicon_gpt35_requests_results.jsonl
----Number of responses outside of 1-5 range: 0 (0.0%)

Processing: silicon_gpt4_requests_results.jsonl
----Number of responses outside of 1-5 range: 39 (0.04%)
--------Responses outside of 1-5 range removed!


## Summary scores dataframes

In [35]:
if not os.path.exists(os.path.join("data", "bbc_data")):
    with open(os.path.join("data", "bbc_data_for_sharing", "bbc_summary_scores_df.pickle"), "rb") as f:
        bbc_summary_scores_df = pickle.load(f)
else:
    bbc_summary_scores_df = bbc_df.groupby(['uid', 'dimension'])['response_reversed'].mean().reset_index()

    # we only save the summary scores of the 1000 silicon personas
    with open(os.path.join("data", "bbc_data_for_sharing", "bbc_summary_scores_df.pickle"), "wb") as f:
        pickle.dump(bbc_summary_scores_df[bbc_summary_scores_df['uid'].isin(bbc_silicon_samples_df['uid'])], f)

In [87]:
generic_gpt35_summary_scores_df = generic_gpt35_df.groupby(['uid', 'scale', 'dimension'])['response_reversed'].mean().reset_index()
generic_gpt4_summary_scores_df = generic_gpt4_df.groupby(['uid', 'scale', 'dimension'])['response_reversed'].mean().reset_index()
silicon_gpt35_summary_scores_df = silicon_gpt35_df.groupby(['uid', 'scale', 'dimension'])['response_reversed'].mean().reset_index()
silicon_gpt4_summary_scores_df = silicon_gpt4_df.groupby(['uid', 'scale', 'dimension'])['response_reversed'].mean().reset_index()

# Analyses

## Internal consistency - Cronbach's alpha

In [88]:
def get_cronbach_alpha_df(df):
    res = []
    for scale in df['scale'].unique():
        scale_sub_df = df[df['scale']==scale]
        for dimension in scale_sub_df['dimension'].unique():
            dimension_sub_df = scale_sub_df[scale_sub_df['dimension'] == dimension].reset_index(drop=False)
            alpha = pg.cronbach_alpha(data=dimension_sub_df, items="item_index", scores="response_reversed", subject="uid")
            res.append({
                "scale": scale,
                "dimension": dimension,
                "alpha": alpha[0]
            })

    return pd.DataFrame(res)

In [89]:
generic_gpt35_cronbach_alpha_df = get_cronbach_alpha_df(generic_gpt35_df)
generic_gpt4_cronbach_alpha_df = get_cronbach_alpha_df(generic_gpt4_df)
silicon_gpt35_cronbach_alpha_df = get_cronbach_alpha_df(silicon_gpt35_df)
silicon_gpt4_cronbach_alpha_df = get_cronbach_alpha_df(silicon_gpt4_df)

tmp = [
    generic_gpt35_cronbach_alpha_df.rename(columns={'alpha': 'generic_gpt35_alpha'}),
    generic_gpt4_cronbach_alpha_df.rename(columns={'alpha': 'generic_gpt4_alpha'}),
    silicon_gpt35_cronbach_alpha_df.rename(columns={'alpha': 'silicon_gpt35_alpha'}),
    silicon_gpt4_cronbach_alpha_df.rename(columns={'alpha': 'silicon_gpt4_alpha'})
]
combined_cronbach_alpha_df = reduce(lambda left, right: pd.merge(left, right, on=['scale', 'dimension'], how='left'), tmp)
combined_cronbach_alpha_df

Unnamed: 0,scale,dimension,generic_gpt35_alpha,generic_gpt4_alpha,silicon_gpt35_alpha,silicon_gpt4_alpha
0,BFI,Extraversion,0.582327,0.908532,0.550951,0.694504
1,BFI,Agreeableness,0.852248,0.840415,0.537476,0.716157
2,BFI,Conscientiousness,0.798723,0.873373,0.698126,0.837486
3,BFI,Neuroticism,0.722777,0.754785,0.53082,0.236343
4,BFI,Openness,0.74806,0.891988,0.482531,0.759349
5,PANAS,Positive,0.949114,0.900143,0.794565,0.772256
6,PANAS,Negative,0.902645,0.860867,0.795136,0.595437
7,BPAQ,Physical,0.800601,0.880271,0.367608,0.428287
8,BPAQ,Verbal,0.625797,0.696505,0.332181,0.211077
9,BPAQ,Anger,0.81351,0.862313,0.418108,0.544393


### Data export for R

In [90]:
combined_cronbach_alpha_df.to_csv(os.path.join("data", "data_for_R", "combined_cronbach_alpha_df.csv"), index=False)

## Criterion validity

In [91]:
def get_summary_scores_df_wide(summary_scores_df):
    summary_scores_df_wide = summary_scores_df.copy()
    # this if would only apply to bbc anyways
    if 'scale' not in summary_scores_df_wide.columns:
        summary_scores_df_wide['scale'] = 'BFI'
    summary_scores_df_wide['scale'] = summary_scores_df_wide['scale'] + '_' + summary_scores_df_wide['dimension']
    summary_scores_df_wide = summary_scores_df_wide.pivot(index='uid', columns='scale', values='response_reversed').reset_index()
    return summary_scores_df_wide

def get_all_corrs_df(summary_scores_df):
    summary_scores_df_wide = get_summary_scores_df_wide(summary_scores_df)
    all_corrs_df = summary_scores_df_wide.drop(columns=['uid']).corr()
    return all_corrs_df

def get_validity_corr_df(summary_scores_df):
    all_corrs_df = get_all_corrs_df(summary_scores_df)

    corrs = [
        {"var1": "BFI_Extraversion", "var2": "PANAS_Positive"},
        {"var1": "BFI_Extraversion", "var2": "PANAS_Negative"},
        {"var1": "BFI_Agreeableness", "var2": "BPAQ_Physical"},
        {"var1": "BFI_Agreeableness", "var2": "BPAQ_Verbal"},
        {"var1": "BFI_Agreeableness", "var2": "BPAQ_Anger"},
        {"var1": "BFI_Agreeableness", "var2": "BPAQ_Hostility"},
        {"var1": "BFI_Neuroticism", "var2": "PANAS_Positive"},
        {"var1": "BFI_Neuroticism", "var2": "PANAS_Negative"},
        {"var1": "BFI_Openness", "var2": "SSCS_Self-efficacy"},
        {"var1": "BFI_Openness", "var2": "SSCS_Personal Identity"},
    ]
    corrs_res = []
    for e in corrs:
        new_e = e.copy()
        new_e["r"] = all_corrs_df.loc[e["var1"], e["var2"]]
        corrs_res.append(new_e)

    return pd.DataFrame(corrs_res)

In [92]:
generic_gpt35_validity_df = get_validity_corr_df(generic_gpt35_summary_scores_df)
generic_gpt4_validity_df = get_validity_corr_df(generic_gpt4_summary_scores_df)
silicon_gpt35_validity_df = get_validity_corr_df(silicon_gpt35_summary_scores_df)
silicon_gpt4_validity_df = get_validity_corr_df(silicon_gpt4_summary_scores_df)

tmp = [
    generic_gpt35_validity_df.rename(columns={'r': 'generic_gpt35_r'}),
    generic_gpt4_validity_df.rename(columns={'r': 'generic_gpt4_r'}),
    silicon_gpt35_validity_df.rename(columns={'r': 'silicon_gpt35_r'}),
    silicon_gpt4_validity_df.rename(columns={'r': 'silicon_gpt4_r'})
]
combined_validity_df = reduce(lambda left, right: pd.merge(left, right, on=['var1', 'var2'], how='left'), tmp)

serapiogarcia_validity_corrs = pd.DataFrame([
    {'var1': 'BFI_Extraversion', 'var2': 'PANAS_Positive', 'serapiogarcia_r': 0.83},
    {'var1': 'BFI_Extraversion', 'var2': 'PANAS_Negative', 'serapiogarcia_r': -0.59},
    {'var1': 'BFI_Agreeableness', 'var2': 'BPAQ_Physical', 'serapiogarcia_r': -0.88},
    {'var1': 'BFI_Agreeableness', 'var2': 'BPAQ_Verbal', 'serapiogarcia_r': -0.72},
    {'var1': 'BFI_Agreeableness', 'var2': 'BPAQ_Anger', 'serapiogarcia_r': -0.86},
    {'var1': 'BFI_Agreeableness', 'var2': 'BPAQ_Hostility', 'serapiogarcia_r': -0.73},
    {'var1': 'BFI_Neuroticism', 'var2': 'PANAS_Positive', 'serapiogarcia_r': -0.78},
    {'var1': 'BFI_Neuroticism', 'var2': 'PANAS_Negative', 'serapiogarcia_r': 0.91},
    {'var1': 'BFI_Openness', 'var2': 'SSCS_Self-efficacy', 'serapiogarcia_r': 0.74},
    {'var1': 'BFI_Openness', 'var2': 'SSCS_Personal Identity', 'serapiogarcia_r': 0.84},
])

combined_validity_df = pd.merge(combined_validity_df, serapiogarcia_validity_corrs, on=['var1', 'var2'], how='left')
combined_validity_df

Unnamed: 0,var1,var2,generic_gpt35_r,generic_gpt4_r,silicon_gpt35_r,silicon_gpt4_r,serapiogarcia_r
0,BFI_Extraversion,PANAS_Positive,0.549538,0.639075,0.391287,0.659101,0.83
1,BFI_Extraversion,PANAS_Negative,-0.224826,-0.251039,-0.061914,-0.432405,-0.59
2,BFI_Agreeableness,BPAQ_Physical,-0.422217,-0.623247,-0.311311,-0.463211,-0.88
3,BFI_Agreeableness,BPAQ_Verbal,-0.39146,-0.595906,-0.355661,-0.303203,-0.72
4,BFI_Agreeableness,BPAQ_Anger,-0.620775,-0.653593,-0.085949,-0.26291,-0.86
5,BFI_Agreeableness,BPAQ_Hostility,-0.374475,-0.641641,-0.040137,-0.343323,-0.73
6,BFI_Neuroticism,PANAS_Positive,-0.609162,-0.581027,-0.405434,-0.351222,-0.78
7,BFI_Neuroticism,PANAS_Negative,0.447258,0.596179,0.576591,0.35259,0.91
8,BFI_Openness,SSCS_Self-efficacy,0.602939,0.631863,0.186649,0.688984,0.74
9,BFI_Openness,SSCS_Personal Identity,0.675821,0.816544,0.292986,0.595221,0.84


### Data export for R

In [93]:
combined_validity_df.to_csv(os.path.join("data", "data_for_R", "combined_validity_df.csv"), index=False)

### BFI Intercorrelations

In [94]:
def get_intercorrs_df(summary_scores_df):
    all_corrs_df = get_all_corrs_df(summary_scores_df)

    corrs = [{'var1': pair[0], 'var2': pair[1]} for pair in combinations([e for e in all_corrs_df.columns if e.startswith("BFI_")], 2)]
    
    corrs_res = []
    for e in corrs:
        new_e = e.copy()
        new_e["r"] = all_corrs_df.loc[e["var1"], e["var2"]]
        corrs_res.append(new_e)

    # remove the "BFI_" prefix
    intercorrs_df = pd.DataFrame(corrs_res)
    intercorrs_df['var1'] = intercorrs_df['var1'].apply(lambda x: x[4:])
    intercorrs_df['var2'] = intercorrs_df['var2'].apply(lambda x: x[4:])
    return intercorrs_df

In [95]:
if not os.path.exists(os.path.join("data", "bbc_data")):
    with open(os.path.join("data", "bbc_data_for_sharing", "bbc_intercorrs_df.pickle"), "rb") as f:
        bbc_intercorrs_df = pickle.load(f)
else:
    bbc_intercorrs_df = get_intercorrs_df(bbc_summary_scores_df)

    with open(os.path.join("data", "bbc_data_for_sharing", "bbc_intercorrs_df.pickle"), "wb") as f:
        pickle.dump(bbc_intercorrs_df, f)

In [96]:
generic_gpt35_intercorrs_df = get_intercorrs_df(generic_gpt35_summary_scores_df)
generic_gpt4_intercorrs_df = get_intercorrs_df(generic_gpt4_summary_scores_df)
silicon_gpt35_intercorrs_df = get_intercorrs_df(silicon_gpt35_summary_scores_df)
silicon_gpt4_intercorrs_df = get_intercorrs_df(silicon_gpt4_summary_scores_df)

tmp = [
    bbc_intercorrs_df.rename(columns={'r': 'bbc_r'}),
    generic_gpt35_intercorrs_df.rename(columns={'r': 'generic_gpt35_r'}),
    generic_gpt4_intercorrs_df.rename(columns={'r': 'generic_gpt4_r'}),
    silicon_gpt35_intercorrs_df.rename(columns={'r': 'silicon_gpt35_r'}),
    silicon_gpt4_intercorrs_df.rename(columns={'r': 'silicon_gpt4_r'})
]
combined_intercorrs_df = reduce(lambda left, right: pd.merge(left, right, on=['var1', 'var2'], how='left'), tmp)
combined_intercorrs_df

Unnamed: 0,var1,var2,bbc_r,generic_gpt35_r,generic_gpt4_r,silicon_gpt35_r,silicon_gpt4_r
0,Agreeableness,Conscientiousness,0.249485,0.723276,0.561472,0.515692,0.444196
1,Agreeableness,Extraversion,0.133033,0.29051,0.338529,-0.029134,0.240552
2,Agreeableness,Neuroticism,-0.294152,-0.660254,-0.392606,-0.228921,-0.073698
3,Agreeableness,Openness,0.042664,0.328049,0.29193,-0.076287,0.215494
4,Conscientiousness,Extraversion,0.109946,0.421112,0.398048,0.099804,0.47558
5,Conscientiousness,Neuroticism,-0.209726,-0.671202,-0.486946,-0.496567,-0.331981
6,Conscientiousness,Openness,-0.046006,0.394694,0.389919,0.080015,0.287101
7,Extraversion,Neuroticism,-0.334979,-0.370567,-0.467797,-0.269278,-0.379081
8,Extraversion,Openness,0.194667,0.375182,0.232971,0.478022,0.504078
9,Neuroticism,Openness,-0.074902,-0.333999,-0.114686,-0.186933,-0.197465


#### Data export for R

R will require a slightly different format for the plotting, so we will process this here

In [97]:
unique_values = pd.unique(combined_intercorrs_df[['var1', 'var2']].values.ravel())
res_list = []
# Iterate over each model
for model in ['bbc_r', 'generic_gpt35_r', 'generic_gpt4_r', 'silicon_gpt35_r', 'silicon_gpt4_r']:
    # Expand the data for each model
    for v1 in unique_values:
        for v2 in unique_values:
            r_value = combined_intercorrs_df.loc[(combined_intercorrs_df['var1'] == v1) & (combined_intercorrs_df['var2'] == v2), model]
            if r_value.empty:
                r_value = combined_intercorrs_df.loc[(combined_intercorrs_df['var1'] == v2) & (combined_intercorrs_df['var2'] == v1), model]
            if r_value.empty:
                r = pd.NA
            else:
                r = r_value.iloc[0]
            # :-2 is to remove the "_r" suffix
            res_list.append({'var1': v1, 'var2': v2, 'model': model[:-2], 'r': r})

intercorrs_for_R = pd.DataFrame(res_list)
intercorrs_for_R.head()

Unnamed: 0,var1,var2,model,r
0,Agreeableness,Agreeableness,bbc,
1,Agreeableness,Conscientiousness,bbc,0.249485
2,Agreeableness,Extraversion,bbc,0.133033
3,Agreeableness,Neuroticism,bbc,-0.294152
4,Agreeableness,Openness,bbc,0.042664


In [98]:
intercorrs_for_R.to_csv(os.path.join("data", "data_for_R", "intercorrs.csv"), index=False)

## Factor analysis

Everything is run in R

### Data export for R

In [99]:
if not os.path.exists(os.path.join("data", "bbc_data")):
    # no part of this is shared; only the loadings will be shared (see R_codebook.Rmd)
    pass
else:
    bbc_df[["uid", "item_index", "dimension", "response_reversed"]].to_csv(os.path.join("data", "bbc_data", "bbc_df.csv"), index=False)

In [120]:
generic_gpt35_df[["uid", "scale", "item_index", "dimension", "response_reversed"]].to_csv(os.path.join("data", "data_for_R", "generic_gpt35_df.csv"), index=False)
generic_gpt4_df[["uid", "scale", "item_index", "dimension", "response_reversed"]].to_csv(os.path.join("data", "data_for_R", "generic_gpt4_df.csv"), index=False)
silicon_gpt35_df[["uid", "scale", "item_index", "dimension", "response_reversed"]].to_csv(os.path.join("data", "data_for_R", "silicon_gpt35_df.csv"), index=False)
silicon_gpt4_df[["uid", "scale", "item_index", "dimension", "response_reversed"]].to_csv(os.path.join("data", "data_for_R", "silicon_gpt4_df.csv"), index=False)

## Item analysis (frequencies)

In [101]:
# used to get the relative frequencies of individual-level responses on the BFI-44 items
def get_rel_freq(df, id_col="uid", colname_prefix='', group_cols=["dimension", "response_reversed"]):
    df = df[[id_col] + group_cols]
    
    df_grouped = df.groupby(group_cols).count()
    df_grouped[f'{colname_prefix+"_" if colname_prefix!="" else ""}total_count'] = df_grouped[[id_col]].groupby(level=0)[id_col].transform('sum')
    df_grouped[f'{colname_prefix+"_" if colname_prefix!="" else ""}relative_frequency'] = df_grouped[id_col] / df_grouped[f'{colname_prefix+"_" if colname_prefix!="" else ""}total_count']
    df_grouped = df_grouped.drop(columns=[id_col]).reset_index()

    # this ensures that the dataframe contains all possible combinations of dimension and response
    # e.g. if there are no responses of "1" for the dimension "Extraversion", the dataframe will still contain a row for that combination with a count of 0
    # this is important for the visualization
    all_dimensions = df_grouped[group_cols[0]].unique()
    all_responses = df_grouped[group_cols[1]].unique()
    all_combinations = pd.MultiIndex.from_product([all_dimensions, all_responses], names=group_cols).to_frame(index=False)
    
    df_grouped = pd.merge(all_combinations, df_grouped, on=group_cols, how='left').fillna(0, inplace=False).sort_values(group_cols)
    
    return df_grouped

In [102]:
if not os.path.exists(os.path.join("data", "bbc_data")):
    with open(os.path.join("data", "bbc_data_for_sharing", "bbc_bfi_item_frequencies.pickle"), "rb") as f:
        bbc_bfi_item_frequencies = pickle.load(f)
else:
    bbc_bfi_item_frequencies = get_rel_freq(bbc_df[["uid", "dimension", "response_reversed"]], colname_prefix="bbc")

    with open(os.path.join("data", "bbc_data_for_sharing", "bbc_bfi_item_frequencies.pickle"), "wb") as f:
        pickle.dump(bbc_bfi_item_frequencies, f)

In [103]:
generic_gpt35_bfi_item_frequencies = get_rel_freq(generic_gpt35_df.query("scale=='BFI'")[["uid", "dimension", "response_reversed"]], colname_prefix="generic_gpt35")
generic_gpt4_bfi_item_frequencies = get_rel_freq(generic_gpt4_df.query("scale=='BFI'")[["uid", "dimension", "response_reversed"]], colname_prefix="generic_gpt4")
silicon_gpt35_bfi_item_frequencies = get_rel_freq(silicon_gpt35_df.query("scale=='BFI'")[["uid", "dimension", "response_reversed"]], colname_prefix="silicon_gpt35")
silicon_gpt4_bfi_item_frequencies = get_rel_freq(silicon_gpt4_df.query("scale=='BFI'")[["uid", "dimension", "response_reversed"]], colname_prefix="silicon_gpt4")

# use reduce to combine the item_frequency dataframes without renaming columns
tmp = [bbc_bfi_item_frequencies, #sample_bbc_bfi_item_frequencies, 
        generic_gpt35_bfi_item_frequencies, generic_gpt4_bfi_item_frequencies, 
        silicon_gpt35_bfi_item_frequencies, silicon_gpt4_bfi_item_frequencies]
combined_bfi_item_frequencies = reduce(lambda left, right: pd.merge(left, right, on=["dimension", "response_reversed"], how='inner'), tmp)
combined_bfi_item_frequencies

Unnamed: 0,dimension,response_reversed,bbc_total_count,bbc_relative_frequency,generic_gpt35_total_count,generic_gpt35_relative_frequency,generic_gpt4_total_count,generic_gpt4_relative_frequency,silicon_gpt35_total_count,silicon_gpt35_relative_frequency,silicon_gpt4_total_count,silicon_gpt4_relative_frequency
0,Agreeableness,1.0,2270133,0.03697,1349,0.006672,1311,0.026697,0.0,0.0,8996.0,0.000667
1,Agreeableness,2.0,2270133,0.148625,1349,0.025204,1311,0.033562,0.0,0.0,0.0,0.0
2,Agreeableness,3.0,2270133,0.157875,1349,0.667161,1311,0.244851,8995.0,0.155642,8996.0,0.26345
3,Agreeableness,4.0,2270133,0.368632,1349,0.23573,1311,0.404272,8995.0,0.629016,8996.0,0.581258
4,Agreeableness,5.0,2270133,0.287898,1349,0.065234,1311,0.290618,8995.0,0.215342,8996.0,0.154624
5,Conscientiousness,1.0,2270133,0.053783,1347,0.006682,1335,0.006742,0.0,0.0,8997.0,0.000667
6,Conscientiousness,2.0,2270133,0.168023,1347,0.054937,1335,0.047191,8988.0,0.003115,8997.0,0.003446
7,Conscientiousness,3.0,2270133,0.173533,1347,0.542687,1335,0.238951,8988.0,0.199377,8997.0,0.255085
8,Conscientiousness,4.0,2270133,0.333599,1347,0.273942,1335,0.403745,8988.0,0.558634,8997.0,0.508614
9,Conscientiousness,5.0,2270133,0.271063,1347,0.121752,1335,0.303371,8988.0,0.238874,8997.0,0.232189


### Data export for R

In [104]:
(combined_bfi_item_frequencies
    .melt(id_vars=["dimension", "response_reversed"], 
            value_vars=[e for e in combined_bfi_item_frequencies.columns if e.endswith("relative_frequency")], 
            var_name="model", value_name="relative_frequency")
    .to_csv(os.path.join("data", "data_for_R", "combined_bfi_item_frequencies.csv"), index=False)
)

## Trait bias

In [105]:
# Define a function to preprocess the data frames
def get_bias_df(summary_scores_df, prefix):
    # reusing the wide function from the validity section
    summary_scores_df_wide = get_summary_scores_df_wide(summary_scores_df)
    # Filter columns starting with "BFI_" and include "uid"
    filtered_df = summary_scores_df_wide.filter(regex='^BFI_|^uid$')
    # Rename columns using a dictionary comprehension
    rename_map = {col: f"{prefix}_{col.split('_')[-1]}" for col in filtered_df.columns if col.startswith('BFI')}
    return filtered_df.rename(columns=rename_map)

In [106]:
bbc_summary_scores_df_wide = get_summary_scores_df_wide(bbc_summary_scores_df)

silicon_gpt35_bias_df = get_bias_df(silicon_gpt35_summary_scores_df, 'gpt35')
silicon_gpt4_bias_df = get_bias_df(silicon_gpt4_summary_scores_df, 'gpt4')

bias_df = bbc_summary_scores_df_wide.merge(silicon_gpt35_bias_df, on='uid', how='inner').merge(silicon_gpt4_bias_df, on='uid', how='inner')

# Traits for iteration
traits = ['Agreeableness', 'Neuroticism', 'Conscientiousness', 'Extraversion', 'Openness']

# Calculate biases for GPT3.5 and GPT4
for trait in traits:
    for version in ['gpt35', 'gpt4']:
        bias_df[f'bias_{version}_{trait}'] = np.abs(bias_df[f'{version}_{trait}'] - bias_df[f"BFI_{trait}"])

# Calculate average biases
for version in ['gpt35', 'gpt4']:
    bias_df[f'bias_{version}_avg'] = bias_df[[f'bias_{version}_{trait}' for trait in traits]].mean(axis=1)

bias_df

scale,uid,BFI_Agreeableness,BFI_Conscientiousness,BFI_Extraversion,BFI_Neuroticism,BFI_Openness,gpt35_Agreeableness,gpt35_Conscientiousness,gpt35_Extraversion,gpt35_Neuroticism,...,bias_gpt35_Neuroticism,bias_gpt4_Neuroticism,bias_gpt35_Conscientiousness,bias_gpt4_Conscientiousness,bias_gpt35_Extraversion,bias_gpt4_Extraversion,bias_gpt35_Openness,bias_gpt4_Openness,bias_gpt35_avg,bias_gpt4_avg
0,00370bf8406c470e3d4f5d428ddb2acc1a43476c,3.333333,2.000000,3.875,3.125,3.4,3.888889,4.222222,3.625,2.250000,...,0.875000,0.500000,2.222222,2.333333,0.250,0.375,0.300000,0.0,0.840556,0.708333
1,00971fc6fda4283c98bc4c1d8135972c1229a07e,4.000000,3.222222,1.875,3.500,2.5,4.111111,4.222222,3.375,2.333333,...,1.166667,0.928571,1.000000,1.111111,1.500,1.375,1.388889,1.2,1.033333,0.989603
2,00be86e5dc52a407da67ffb5395ad6ea9381d703,4.111111,4.777778,2.875,2.500,3.6,4.111111,4.444444,3.500,2.500000,...,0.000000,0.071429,0.333333,0.444444,0.625,0.750,0.200000,0.2,0.231667,0.293175
3,00c99d987d8808e711c8108a9889384d01c26db3,2.666667,2.666667,1.250,4.375,4.3,3.555556,3.444444,3.000,2.857143,...,1.517857,0.875000,0.777778,0.666667,1.750,1.625,0.700000,0.8,1.126905,0.926667
4,0104a608dbf05a1471ae8a5c701b3177366d0180,3.222222,3.333333,3.000,4.500,2.9,4.111111,4.111111,3.375,2.857143,...,1.642857,1.750000,0.777778,0.444444,0.375,0.500,0.400000,0.5,0.816905,0.838889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,fe96cb89a11af0a1706aac961f87044d6ede4cd9,4.333333,4.222222,4.375,2.375,4.7,4.000000,4.111111,3.500,2.625000,...,0.250000,0.250000,0.111111,0.222222,0.875,0.750,0.800000,1.1,0.473889,0.531111
996,fedd5c55c53db2c554d986adf4781f36963560b8,4.222222,4.555556,3.750,2.375,3.5,4.222222,4.333333,3.500,2.285714,...,0.089286,0.250000,0.222222,0.000000,0.250,0.250,0.200000,0.1,0.152302,0.142222
997,ff4aa0569bce2c0c40ce7abeb5f0f65b67ef8ffe,2.777778,3.888889,2.500,3.750,3.3,3.888889,4.333333,3.625,2.250000,...,1.500000,1.125000,0.444444,0.777778,1.125,1.125,0.400000,0.5,0.916111,0.950000
998,ff6c27789e3033e7b6b44a8fd4f9370693642867,3.666667,4.000000,3.625,3.625,3.8,4.222222,4.000000,3.250,2.625000,...,1.000000,0.910714,0.000000,0.000000,0.375,0.250,0.200000,0.2,0.426111,0.294365


In [107]:
print(f"Bias for GPT-3.5: {np.round(np.mean(bias_df['bias_gpt35_avg']), 2)} (SD: {np.round(np.std(bias_df['bias_gpt35_avg']), 2)})")
print(f"Bias for GPT-4: {np.round(np.mean(bias_df['bias_gpt4_avg']), 2)} (SD: {np.round(np.std(bias_df['bias_gpt4_avg']), 2)})")

Bias for GPT-3.5: 0.63 (SD: 0.25)
Bias for GPT-4: 0.62 (SD: 0.24)


In [108]:
t_stat, p_val = ttest_ind_from_stats(mean1=np.mean(bias_df['bias_gpt35_avg']), std1=np.std(bias_df['bias_gpt35_avg']), nobs1=1000,
                    mean2=np.mean(bias_df['bias_gpt4_avg']), std2=np.std(bias_df['bias_gpt4_avg']), nobs2=1000,
                    equal_var=True)  # assuming equal variances
cohen_d = pg.compute_effsize(bias_df['bias_gpt35_avg'], bias_df['bias_gpt4_avg'], eftype='cohen')
print(f"t-test results: t={np.round(t_stat, 2)}, p-value={np.round(p_val, 3)}; Cohen's d={np.round(cohen_d, 2)}")

t-test results: t=1.38, p-value=0.169; Cohen's d=0.06


### Data export for R

In [109]:
tmp = bias_df.melt(id_vars=["uid"], value_vars=[e for e in bias_df.columns if e.startswith("bias_") and not e.endswith("avg")], var_name="tmp", value_name="bias")
tmp[['x', 'model', 'trait']] = tmp['tmp'].str.split('_', expand=True)

tmp[["uid", "model", "trait", "bias"]].to_csv(os.path.join("data", "data_for_R", "bias_df.csv"), index=False)

### Bias correlations

In [110]:
bias_corrs_df = pd.merge(bias_df[["uid"] + [e for e in bias_df.columns if e.startswith("BFI_") or e.endswith("_avg")]],
        bbc_silicon_samples_df.drop(columns=["country_name", "country"]), 
        on=['uid'], how='left')

bias_corrs_df = bias_corrs_df.select_dtypes(include=np.number).corr().loc[['bias_gpt35_avg', 'bias_gpt4_avg'],].reset_index(names="model")
bias_corrs_df

Unnamed: 0,model,BFI_Agreeableness,BFI_Conscientiousness,BFI_Extraversion,BFI_Neuroticism,BFI_Openness,bias_gpt35_avg,bias_gpt4_avg,age,ethnic,m_schl,f_schl,n_sib,sex,st_pub,occ_sta,occ_cat,income,rstat_1,chldrn
0,bias_gpt35_avg,-0.336785,-0.369423,-0.291651,0.387875,-0.136477,1.0,0.916592,0.024693,0.017889,0.00347,0.00616,-0.018809,0.004385,0.040039,0.104279,-0.009396,0.022803,0.013411,0.028359
1,bias_gpt4_avg,-0.213615,-0.321367,-0.178874,0.309813,-0.038655,0.916592,1.0,-0.014135,0.015438,0.018758,0.033611,-0.038325,-0.005128,0.037652,0.054913,-0.006492,0.046243,0.014393,-0.000871


#### Data export for R

In [111]:
(bias_corrs_df
    .melt(id_vars=["model"], var_name="var", value_name="r")
    .query("var!='bias_gpt35_avg' and var!='bias_gpt4_avg'")
    .reset_index(drop=True)
    .to_csv(os.path.join("data", "data_for_R", "bias_corrs_df.csv"), index=False)
)

## Supplementary: Analysis of the first token of the responses

In [112]:
generic_gpt35_first_token_digit_item_frequencies = get_rel_freq(generic_gpt35_df, colname_prefix="generic_gpt35", group_cols=["first_token_is_digit", "response_reversed"])
generic_gpt4_first_token_digit_item_frequencies = get_rel_freq(generic_gpt4_df, colname_prefix="generic_gpt4", group_cols=["first_token_is_digit", "response_reversed"])
silicon_gpt35_first_token_digit_item_frequencies = get_rel_freq(silicon_gpt35_df, colname_prefix="silicon_gpt35", group_cols=["first_token_is_digit", "response_reversed"])
silicon_gpt4_first_token_digit_item_frequencies = get_rel_freq(silicon_gpt4_df, colname_prefix="silicon_gpt4", group_cols=["first_token_is_digit", "response_reversed"])

# use reduce to combine the item_frequency dataframes without renaming columns
tmp = [generic_gpt35_first_token_digit_item_frequencies, generic_gpt4_first_token_digit_item_frequencies, 
        silicon_gpt35_first_token_digit_item_frequencies, silicon_gpt4_first_token_digit_item_frequencies]
combined_first_token_digit_item_frequencies = reduce(lambda left, right: pd.merge(left, right, on=["first_token_is_digit", "response_reversed"], how='inner'), tmp)
combined_first_token_digit_item_frequencies

Unnamed: 0,first_token_is_digit,response_reversed,generic_gpt35_total_count,generic_gpt35_relative_frequency,generic_gpt4_total_count,generic_gpt4_relative_frequency,silicon_gpt35_total_count,silicon_gpt35_relative_frequency,silicon_gpt4_total_count,silicon_gpt4_relative_frequency
0,False,1.0,6022,0.044171,14688,0.112064,85658.0,0.031719,84581,0.084523
1,False,2.0,6022,0.177682,14688,0.182734,85658.0,0.159822,84581,0.183221
2,False,3.0,6022,0.183826,14688,0.291326,85658.0,0.215718,84581,0.361606
3,False,4.0,6022,0.531717,14688,0.291803,85658.0,0.536611,84581,0.323359
4,False,5.0,6022,0.062604,14688,0.122072,85658.0,0.05613,84581,0.047292
5,True,1.0,9542,0.066338,440,0.170455,18226.0,0.204269,18267,0.045656
6,True,2.0,9542,0.075875,440,0.045455,18226.0,0.102601,18267,0.06832
7,True,3.0,9542,0.709495,440,0.034091,18226.0,0.684462,18267,0.294794
8,True,4.0,9542,0.125236,440,0.690909,18226.0,0.008669,18267,0.581759
9,True,5.0,9542,0.023056,440,0.059091,0.0,0.0,18267,0.009471


### Data export for R

In [113]:
(combined_first_token_digit_item_frequencies
    .melt(id_vars=["first_token_is_digit", "response_reversed"], 
            value_vars=[e for e in combined_first_token_digit_item_frequencies.columns if e.endswith("relative_frequency")], 
            var_name="model", value_name="relative_frequency")
    .to_csv(os.path.join("data", "data_for_R", "combined_first_token_digit_item_frequencies.csv"), index=False)
)