In [2]:
import pandas as pd

In [3]:
# Load the coded responses; the first CSV column is used as the row index.
df = pd.read_csv("all_responses_coded.csv", index_col=0)

In [53]:
# Overwrite `question_tom_type` with "Action" for every row whose
# `question_common_id` equals 2.
# NOTE(review): presumably corrects a mislabelled value in the CSV — confirm.
# This mutates `df` in place, so it must run before any cell that reads
# `question_tom_type`; verify this is its intended position in the notebook.
df.loc[df["question_common_id"] == 2, "question_tom_type"] = "Action"

In [4]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,initial_order,initial_index,session_id,study_id,study_name,scenario_id,scenario_code,story_id,chat_id,story_common_id,...,content,role,function_call,tool_calls,language_category,is_correct,has_correct_reasoning,is_beyond_scope,has_grammatical_error,needs_debating
0,679,3896,109,1,Ullman Replication,1.0,1-EN/1,1-EN,1.0,1,...,Sam believes that the bag is full of chocolate.,assistant,,,English,True,,False,False,False
1,849,4066,189,1,Ullman Replication,1.0,1-EN/1,1-EN,1.0,1,...,chocolate.,assistant,,,English,True,,False,False,False
2,1019,4327,269,1,Ullman Replication,1.0,1-EN/1,1-EN,1.0,1,...,chocolate.,assistant,,,English,True,,False,False,False
3,1189,4497,349,1,Ullman Replication,1.0,1-EN/1,1-EN,1.0,1,...,Sam believes that the bag is full of chocolate.,assistant,,,English,True,,False,False,False
4,1359,4667,429,1,Ullman Replication,1.0,1-EN/1,1-EN,1.0,1,...,chocolate,assistant,,,English,True,,False,False,False


In [5]:
# Summarise the frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6560 entries, 0 to 6559
Data columns (total 52 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   initial_order            6560 non-null   int64  
 1   initial_index            6560 non-null   int64  
 2   session_id               6560 non-null   int64  
 3   study_id                 6560 non-null   int64  
 4   study_name               6560 non-null   object 
 5   scenario_id              6560 non-null   float64
 6   scenario_code            6560 non-null   object 
 7   story_id                 6560 non-null   object 
 8   chat_id                  6560 non-null   float64
 9   story_common_id          6560 non-null   object 
 10  story_category           6560 non-null   object 
 11  story_name               6560 non-null   object 
 12  story_content            6560 non-null   object 
 13  story_language           6560 non-null   object 
 14  chat_name                6560

In [6]:
# List every column label available in the dataset.
df.columns

Index(['initial_order', 'initial_index', 'session_id', 'study_id',
       'study_name', 'scenario_id', 'scenario_code', 'story_id', 'chat_id',
       'story_common_id', 'story_category', 'story_name', 'story_content',
       'story_language', 'chat_name', 'chat_language', 'chat_has_fbv_zan',
       'chat_has_fbv_san', 'questions', 'system_message_id',
       'system_message_language', 'system_message_content', 'question_index',
       'question_id', 'question_common_id', 'question_content',
       'question_type', 'question_language', 'question_has_fbv_zan',
       'question_has_fbv_san', 'question_tom_order', 'question_tom_type',
       'answer_id', 'answer_correct', 'id', 'created', 'model', 'object',
       'system_fingerprint', 'completion_tokens', 'prompt_tokens',
       'total_tokens', 'content', 'role', 'function_call', 'tool_calls',
       'language_category', 'is_correct', 'has_correct_reasoning',
       'is_beyond_scope', 'has_grammatical_error', 'needs_debating'],
      dtyp

We expect only the following columns to have missing values:

### Coding Related


- `has_correct_reasoning` (since only Open-ended questions and Closed-ended questions to which GPT responded with explanations were coded for reasoning)

### API Response Related

- `function_call` (since no function calls were made using the API)
- `system_fingerprint` (since this feature was introduced with GPT-4 Turbo, and was not made available retroactively for earlier versions, including GPT-3.5 Turbo which was used in this study)
- `tool_calls` (since no tool calls were made using the API)

In [79]:
# Count the missing values of every column in one vectorised pass, then
# report only the columns that actually have gaps, in alphabetical order.
missing_value_counts = df.isna().sum()

for column in sorted(df.columns):
    missing_value_count = missing_value_counts[column]

    if missing_value_count > 0:
        print(f"The column '{column}' has {missing_value_count} missing values.")

The column 'answer_id' has 2 missing values.
The column 'function_call' has 6560 missing values.
The column 'has_correct_reasoning' has 4669 missing values.
The column 'system_fingerprint' has 3280 missing values.
The column 'tool_calls' has 6560 missing values.


In [46]:
# Design parameters of the experiment that apply to every study.
study_characteristics = dict(models=2, languages=2, trials=10)

# Per-study design: number of distinct scenarios and questions per scenario.
study_characteristics.update(
    study_1=dict(scenarios=60, question_per_scenario=1),
    study_2=dict(scenarios=40, question_per_scenario=4),
    # Non-integer value: presumably an average over scenarios that have
    # differing numbers of questions — TODO confirm.
    study_3=dict(scenarios=16, question_per_scenario=6.75),
)

In [23]:
# Show the (rows, columns) dimensions of the frame.
df.shape

(6560, 52)

In [7]:
# The coded dataset must have exactly 6560 rows and 52 columns.
assert df.shape == (6560, 52)

In [19]:
# Number of distinct scenarios observed within each study.
scenario_size_by_study = df.groupby("study_id")["scenario_id"].nunique()

In [21]:
# Display the distinct-scenario count per study.
scenario_size_by_study

study_id
1    60
2    40
3    16
Name: scenario_id, dtype: int64

In [36]:
# Every study must contain exactly the number of scenarios laid out in the
# study design.
assert all(
    scenario_size_by_study[study_id] == study_characteristics[f"study_{study_id}"]["scenarios"]
    for study_id in (1, 2, 3)
)

In [8]:
# Number of response rows recorded for each study.
study_sizes = df.groupby("study_id").size()

In [9]:
# Display the row count per study.
study_sizes

study_id
1    1200
2    3200
3    2160
dtype: int64

In [47]:
# Each study should contribute one row per (scenario, model, trial, question)
# combination. The three studies share the same formula, so check them in a
# loop and report which study is off when the check fails.
# NOTE(review): "languages" is not a factor here — presumably each language
# variant is already counted as its own scenario; confirm.
for study_id in (1, 2, 3):
    questions_per_scenario = study_characteristics[f"study_{study_id}"]["question_per_scenario"]
    expected_size = (
        scenario_size_by_study[study_id]
        * study_characteristics["models"]
        * study_characteristics["trials"]
        * questions_per_scenario
    )

    assert study_sizes[study_id] == expected_size, (
        f"Study {study_id}: expected {expected_size} responses, "
        f"found {study_sizes[study_id]}."
    )