In [2]:
# load nec libs
import pandas as pd
import numpy as np
from scipy.stats import binomtest 
import random

In [3]:
# read in data
df_human = pd.read_csv("/kaggle/input/analysis-llm-data/Shared_data_responses_demographics.csv")
df_falcon = pd.read_csv("/kaggle/input/analysis-llm-data/falcon_results_classified.csv")
df_mistral = pd.read_csv("/kaggle/input/analysis-llm-data/mistral_results_classified.csv")
df_gpt = pd.read_csv("/kaggle/input/analysis-llm-data/gpt_results_classified.csv")

In [4]:
# put all LLM dataframes into a list of dataframes
dfs_llm = [df_falcon, df_mistral, df_gpt]

In [5]:
# first, subset the LLM generated data so that it contains the base cases 
# for Switch, Loop and Footbridge present in Awad 2020 only

filtered_dfs_llm = [df[df['Variation'] == 'base'] for df in dfs_llm]

In [6]:
# NOTE: order is Falcon, Mistral, GPT
switch_data = [df[df['Scenario'] == 'Switch'] for df in dfs_llm]
loop_data = [df[df['Scenario'] == 'Loop'] for df in dfs_llm]
footbridge_data = [df[df['Scenario'] == 'Footbridge'] for df in dfs_llm]

In [7]:
# replace all '1' with 'A' and all '0' with 'B' in the human data
df_human['Outcome'] = df_human['Outcome'].replace([1], 'A')
df_human['Outcome'] = df_human['Outcome'].replace([0], 'B')

In [8]:
# subset human data by Scenario
human_switch = df_human[df_human['Scenario'] == 'Switch']
human_loop = df_human[df_human['Scenario'] == 'Loop']
human_footbridge = df_human[df_human['Scenario'] == 'Footbridge']

In [9]:
# compute shares for A and B
count_human_switch = human_switch['Outcome'].value_counts(normalize=True)
count_human_loop = human_loop['Outcome'].value_counts(normalize=True)
count_human_footbridge = human_footbridge['Outcome'].value_counts(normalize=True)

In [10]:
# store in variables
human_switch_prop_A = count_human_switch.get('A', 0)
human_switch_prop_B = count_human_switch.get('B', 0)

human_loop_prop_A = count_human_loop.get('A', 0)
human_loop_prop_B = count_human_loop.get('B', 0)

human_footbridge_prop_A = count_human_footbridge.get('A', 0)
human_footbridge_prop_B = count_human_footbridge.get('B', 0)

In [11]:
# save human data as baseline for binom test
human_shares_list = [{'Share of A': human_switch_prop_A, 'Share of B': human_switch_prop_B},
 {'Share of A': human_loop_prop_A, 'Share of B': human_loop_prop_B},
 {'Share of A': human_footbridge_prop_A, 'Share of B': human_footbridge_prop_B}]

human_shares_list

[{'Share of A': 0.8385544584231367, 'Share of B': 0.1614455415768633},
 {'Share of A': 0.7573738812042311, 'Share of B': 0.24262611879576892},
 {'Share of A': 0.4672022103970528, 'Share of B': 0.5327977896029472}]

In [12]:
def bootstrap(data, sample_size=100, n_bootstraps=1000):
    bootstrapped_samples = []

    for _ in range(n_bootstraps):
        # randomly sample indices without replacement
        indices = random.choices(range(len(data)), k=sample_size)
        # obtain sampled rows
        sample = data.iloc[indices]
        bootstrapped_samples.append(sample)

    return bootstrapped_samples

In [13]:
# bootstrap LLM data using random seed for reproducability
np.random.seed(42)
switch_bootstrapped = [bootstrap(df) for df in switch_data]
loop_bootstrapped = [bootstrap(df) for df in loop_data]
footbridge_bootstrapped = [bootstrap(df) for df in footbridge_data]

In [14]:
# obtain shares per option for each bootstrapped sample per scenario
switch_average_shares = compute_average_shares_per_scenario(switch_bootstrapped)
loop_average_shares = compute_average_shares_per_scenario(loop_bootstrapped)
footbridge_average_shares = compute_average_shares_per_scenario(footbridge_bootstrapped)

NameError: name 'compute_average_shares_per_scenario' is not defined

In [16]:
main_list = [
    switch_bootstrapped,
    loop_bootstrapped,
    footbridge_bootstrapped
]

In [17]:
def calculate_share(df):
    total_A = 0
    total_B = 0
    total_columns = 0
    for column in df.columns:
        if column.endswith('_classification'):
            counts = df[column].value_counts(normalize=True)
            total_A += counts.get('A', 0)
            total_B += counts.get('B', 0)
            total_columns += 1
    share_A = total_A / total_columns
    share_B = total_B / total_columns
    return share_A, share_B

In [18]:
# create function for binomial test

def perform_binomial_test(df, human_shares):
    share_A, share_B = calculate_share(df)
    p_value_A = binomtest(int(share_A*100), n=100, p=human_shares['Share of A'])
    p_value_B = binomtest(int(share_B*100), n=100, p=human_shares['Share of B'])
    return p_value_A, p_value_B

In [19]:
p_values_list = []

# obtain all p-values by conducting binomial tests 
for i, sublist in enumerate(main_list):
    model_p_values = []
    # select appropriate human base line for switch (0), loop (1), FB (2)
    human_shares = human_shares_list[i]
    for model_samples in sublist:
        sample_p_values = []
        # go through all 1000 samples and perfrom binom test
        for sample_df in model_samples:
            p_value_A, p_value_B = perform_binomial_test(sample_df, human_shares)
            sample_p_values.append(p_value_A)
        model_p_values.append(sample_p_values)
    p_values_list.append(model_p_values)

In [20]:
# inspect first five values for GPT 
p_values_list[0][2][0:5]

[BinomTestResult(k=86, n=100, alternative='two-sided', statistic=0.86, pvalue=0.6832140287445917),
 BinomTestResult(k=85, n=100, alternative='two-sided', statistic=0.85, pvalue=0.8918456817025693),
 BinomTestResult(k=85, n=100, alternative='two-sided', statistic=0.85, pvalue=0.8918456817025693),
 BinomTestResult(k=85, n=100, alternative='two-sided', statistic=0.85, pvalue=0.8918456817025693),
 BinomTestResult(k=85, n=100, alternative='two-sided', statistic=0.85, pvalue=0.8918456817025693)]

In [21]:
# inspect first five values for Mistral
p_values_list[0][1][0:5]

[BinomTestResult(k=91, n=100, alternative='two-sided', statistic=0.91, pvalue=0.05614879584821322),
 BinomTestResult(k=91, n=100, alternative='two-sided', statistic=0.91, pvalue=0.05614879584821322),
 BinomTestResult(k=91, n=100, alternative='two-sided', statistic=0.91, pvalue=0.05614879584821322),
 BinomTestResult(k=91, n=100, alternative='two-sided', statistic=0.91, pvalue=0.05614879584821322),
 BinomTestResult(k=91, n=100, alternative='two-sided', statistic=0.91, pvalue=0.05614879584821322)]

In [22]:
# extract specific p_values switch
p_switch_falcon = [result.pvalue for result in p_values_list[0][0]]
p_switch_mistral = [result.pvalue for result in p_values_list[0][1]]
p_switch_gpt = [result.pvalue for result in p_values_list[0][2]]

# extract specific p_values loop
p_loop_falcon = [result.pvalue for result in p_values_list[1][0]]
p_loop_mistral = [result.pvalue for result in p_values_list[1][1]]
p_loop_gpt = [result.pvalue for result in p_values_list[1][2]]

# extract specific p_values footbridge
p_footbridge_falcon = [result.pvalue for result in p_values_list[2][0]]
p_footbridge_mistral = [result.pvalue for result in p_values_list[2][1]]
p_footbridge_gpt = [result.pvalue for result in p_values_list[2][2]]

In [32]:
# come up with function to compute C.I. for bootstrapped samples

def bootstrap_confidence_interval(data, alpha=0.05):
    """
    # alpha = 5% threshold 
    """
    pvalue_statistics = np.mean(data)
    lower_percentile = (alpha / 2) * 100
    upper_percentile = 100 - lower_percentile
    lower_bound = np.percentile(pvalue_statistics, lower_percentile)
    upper_bound = np.percentile(pvalue_statistics, upper_percentile)
    
    return lower_bound, upper_bound

In [29]:
def p_value_confidence_interval(data, alpha=0.05):
    """
    # alpha = 5% threshold 
    """
    pvalue_statistics = np.mean(data)
    lower_percentile = (alpha / 2) * 100
    upper_percentile = 100 - lower_percentile
    lower_bound = np.percentile(pvalue_statistics, lower_percentile)
    upper_bound = np.percentile(pvalue_statistics, upper_percentile)
    
    return lower_bound, upper_bound

In [31]:
ci_switch_F = p_value_confidence_interval(p_switch_falcon)
ci_switch_M = p_value_confidence_interval(p_switch_mistral)
ci_switch_G = p_value_confidence_interval(p_switch_gpt)
print("Confidence intervals for Switch (Falcon):", ci_switch_F)
print("Confidence intervals for Switch (Mistral):", ci_switch_M)
print("Confidence intervals for Switch (GPT):", ci_switch_G)

Confidence intervals for Switch (Falcon): (5.194285175013e-08, 5.194285175013e-08)
Confidence intervals for Switch (Mistral): (0.054296769513066535, 0.054296769513066535)
Confidence intervals for Switch (GPT): (0.7762205011659995, 0.7762205011659995)


In [33]:
# calculate confidence intervals for switch
ci_switch_falcon = bootstrap_confidence_interval(p_switch_falcon)
ci_switch_mistral = bootstrap_confidence_interval(p_switch_mistral)
ci_switch_gpt = bootstrap_confidence_interval(p_switch_gpt)

print("Confidence intervals for Switch (Falcon):", ci_switch_falcon)
print("Confidence intervals for Switch (Mistral):", ci_switch_mistral)
print("Confidence intervals for Switch (GPT):", ci_switch_gpt)

Confidence intervals for Switch (Falcon): (5.194285175013e-08, 5.194285175013e-08)
Confidence intervals for Switch (Mistral): (0.054296769513066535, 0.054296769513066535)
Confidence intervals for Switch (GPT): (0.7762205011659995, 0.7762205011659995)


In [35]:
# calculate confidence intervals for loop
ci_loop_falcon = bootstrap_confidence_interval(p_loop_falcon)
ci_loop_mistral = bootstrap_confidence_interval(p_loop_mistral)
ci_loop_gpt = bootstrap_confidence_interval(p_loop_gpt)

print("Confidence intervals for Loop (Falcon):", ci_loop_falcon)
print("Confidence intervals for Loop (Mistral):", ci_loop_mistral)
print("Confidence intervals for Loop (GPT):", ci_loop_gpt)

Confidence intervals for Loop (Falcon): (0.05215283293847233, 0.05215283293847233)
Confidence intervals for Loop (Mistral): (0.0011438023976494882, 0.0011438023976494882)
Confidence intervals for Loop (GPT): (0.001476441351761239, 0.001476441351761239)


In [26]:
# calculate confidence intervals for footbridge
ci_footbridge_falcon = bootstrap_confidence_interval(p_footbridge_falcon)
ci_footbridge_mistral = bootstrap_confidence_interval(p_footbridge_mistral)
ci_footbridge_gpt = bootstrap_confidence_interval(p_footbridge_gpt)

print("Confidence intervals for Footbridge (Falcon):", ci_footbridge_falcon)
print("Confidence intervals for Footbridhe (Mistral):", ci_footbridge_mistral)
print("Confidence intervals for Footbridge (GPT):", ci_footbridge_gpt)

Confidence intervals for Footbridge (Falcon): (0.008570955753307026, 0.008912093865198015)
Confidence intervals for Footbridhe (Mistral): (2.067262695850545e-17, 2.4824730866410463e-17)
Confidence intervals for Footbridge (GPT): (0.00013729374334573284, 0.00016778461257825264)
