# Setup & Imports

Note: the ISC-CI model was previously called the "CICO model". Some identifiers in the code (e.g. `cico_model`, `utils.load_cico_model`) still use the old name.

In [32]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import plotly.express as px
from scipy.stats import ttest_ind, ttest_rel

import analysis
import utils

# Parameters for analysis
# Model names; the cells below use these as data column names and plot categories
models = ['ISC-CI','Overlap','SCM','GPT-3.5','GPT-4']

# Load isc-ci model
# NOTE: the loader and variable still carry the old "CICO" name for the ISC-CI model
cico_model, data_loader = utils.load_cico_model(model_type='1and2shot_isc',model_seed=3)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Study 1

## Argument Strength Ratings

In [33]:
# Read the induction arguments and add each model's strength predictions
argument_data = pd.read_csv(
    'data/generalization_experiments/induction_arguments.csv',
    index_col=0,
)
argument_data = analysis.calc_argument_strength(argument_data, cico_model, data_loader)
argument_data = analysis.calc_argument_strength_overlap(argument_data)
argument_data = analysis.calc_argument_strength_scm(argument_data)

# Pearson correlations within each dataset: keep only the correlation of every
# column with the 'Human' ratings and reshape to one row per (Dataset, Model)
plot_data = (
    argument_data.groupby(['Dataset'])
    .corr('pearson')[['Human']]
    .reset_index()
    .melt(id_vars=['Dataset', 'level_1'])
    .rename(columns={'level_1': 'Model', 'value': 'Correlation'})
)
plot_data = plot_data[plot_data['Model'].isin(models)]

# Attach the number of non-null 'Conclusion' entries per dataset as 'n', then
# compute the 95% CI columns (presumably calc_pearson_ci reads 'n' - confirm in utils)
dataset_n = (
    argument_data.groupby(['Dataset'])
    .count()[['Conclusion']]
    .rename(columns={'Conclusion': 'n'})
)
plot_data = plot_data.join(dataset_n, on='Dataset')
plot_data = utils.calc_pearson_ci(plot_data)

# Grouped bar chart of the correlations with asymmetric CI error bars
fig = px.bar(
    plot_data,
    x='Dataset',
    y='Correlation',
    color='Model',
    barmode='group',
    error_y=plot_data['95ci upper'] - plot_data['Correlation'],
    error_y_minus=plot_data['Correlation'] - plot_data['95ci lower'],
    category_orders={
        'Model': models,
        'Dataset': ['Osherson - Exp 4', 'Rips', 'Bhatia - Exp 1', 'Bhatia - Exp 3', 'Han - Exp 2'],
    },
    color_discrete_map=utils.model_colors,
)
fig = utils.format_figure(fig)
fig.update_layout(
    legend=dict(
        title='',
        orientation='h',
        yanchor='top',
        y=0.99,
        xanchor='right',
        x=0.99,
        font=dict(size=15),
    )
)
fig.add_hline(y=0)  # zero-correlation reference line
fig.show()

## Argument Strength Factors

In [34]:
# Read the inductive-phenomena arguments and add each model's strength predictions
argument_data = pd.read_csv(
    'data/generalization_experiments/inductive_phenomena.csv',
    index_col=0,
)
argument_data = analysis.calc_argument_strength(argument_data, cico_model, data_loader)
argument_data = analysis.calc_argument_strength_overlap(argument_data)
argument_data = analysis.calc_argument_strength_scm(argument_data)

# Put all ratings on a comparable scale: ISC-CI goes through a logistic
# sigmoid, the GPT ratings are divided by 100
argument_data['ISC-CI'] = 1 / (1 + np.exp(-argument_data['ISC-CI']))
for llm_column in ('GPT-3.5', 'GPT-4'):
    argument_data[llm_column] = argument_data[llm_column] / 100

# Relabel the monotonicity phenomena with their display names
phenomenon_labels = {
    'Monotonicity': 'In-Category Monotonicity',
    'Non-Monotonicity (In-Category)': 'In-Category Non-Monotonicity',
    'Non-Monotonicity (Cross-Category)': 'Cross-Category Non-Monotonicity',
}
argument_data['Phenomenon'] = argument_data['Phenomenon'].replace(phenomenon_labels)

In [35]:
# Mean rating (with a normal-approximation 95% CI) for each model, per
# phenomenon and argument group
plot_data = (
    argument_data[['Phenomenon','Argument Group']+models]
    .melt(id_vars=['Phenomenon','Argument Group'])
    .rename(columns={'variable':'Model','value':'Rating'})
)
plot_data = (
    plot_data.groupby(['Phenomenon','Argument Group','Model'])
    .agg(['mean','sem'])['Rating']
    .reset_index()
    .rename(columns={'mean':'Rating'})
)
plot_data['95ci'] = plot_data['sem']*1.96
fig = px.bar(plot_data[plot_data.Phenomenon!='Thematic Match'],x='Model',y='Rating',error_y='95ci',color='Argument Group',barmode='group',
             category_orders={'Model':['ISC-CI','Overlap','SCM'],
                              'Phenomenon':['Premise-Conclusion Similarity','Conclusion Typicality','Premise Diversity','In-Category Monotonicity','In-Category Non-Monotonicity','Cross-Category Non-Monotonicity']},
             facet_col='Phenomenon', facet_col_wrap=3, facet_row_spacing=0.15, pattern_shape='Argument Group',range_y=[0,1])

# Format figure style & coloring
fig = utils.format_figure(fig,height=800)
# Facet titles arrive as "Phenomenon=<name>"; keep only the name
fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[1]))
fig.update_annotations(font_size=16)
fig.update_xaxes(showticklabels=True,tickfont=dict(size=16))
# `title_font` replaces the deprecated `titlefont` axis property
fig.update_yaxes(title_font=dict(size=20),tickfont=dict(size=16))
# One color per bar within each trace, in alphabetical model order
# (assumes the bars in each trace follow the groupby's sorted model order - TODO confirm)
colors = [utils.model_colors[x] for x in sorted(models)]
for trace in fig.data:
    trace.marker.color = colors

# Set up statistical significance
# Annotations are placed above the taller of the two bars (mean + CI) per model
plot_data['Bar Height'] = plot_data['Rating']+plot_data['95ci']
# NOTE(review): regression_columns / regression_data are not used in this cell;
# kept in case other cells rely on them
regression_columns = ['coef','se','T','pval','CI[2.5%]','CI[97.5%]']
significance_data = plot_data.groupby(['Phenomenon','Model']).max().reset_index()[['Phenomenon','Model','Bar Height']]
regression_data = []

# Run paired and unpaired t-tests comparing the High vs Low argument groups
ttest_data = []
for phenomenon in significance_data.Phenomenon.unique():
    phenomenon_data = argument_data[(argument_data.Phenomenon==phenomenon)]
    high_data = phenomenon_data[phenomenon_data['Argument Group']=='High']
    low_data = phenomenon_data[phenomenon_data['Argument Group']=='Low']
    # Pair High/Low arguments that share the same first premise and conclusion
    combined_data = high_data.merge(low_data,on=['Premise 1','Conclusion'],suffixes=('_high','_low'))
    for model in models:
        # Any phenomenon whose name contains 'Monotonicity' uses the paired
        # test on the matched arguments; all others use an independent test
        if 'Monotonicity' in phenomenon:
            statistic, p = ttest_rel(combined_data[f'{model}_high'],combined_data[f'{model}_low'])
        else:
            statistic, p = ttest_ind(high_data[model],low_data[model])
        direction = 'correct' if high_data[model].mean()>low_data[model].mean() else 'opposite'
        if p>=.05:
            direction = 'n.s.'
        ttest_data.append({'Model':model,'Phenomenon':phenomenon,'statistic':statistic,'p':p,'p_stars':utils.p_to_stars(p),'direction':direction})

# Add significance to bar plot
significance_data = significance_data.merge(pd.DataFrame(ttest_data),on=['Phenomenon','Model'])

# Axis-reference suffix of each facet's subplot (e.g. 'x4'/'y4'). The values
# are consistent with wrapped facets being numbered from the bottom row -
# re-check against the rendered figure if the facet order changes
phenomenon_to_axis_label = {'Conclusion Typicality':'5','Premise Diversity':'6','Cross-Category Non-Monotonicity':'3',
                            'In-Category Non-Monotonicity':'2','Premise-Conclusion Similarity':'4','In-Category Monotonicity':''}
for phenomenon in plot_data.Phenomenon.unique():
    if phenomenon=='Thematic Match':
        continue
    for model in plot_data.Model.unique():
        sig_data = significance_data[(significance_data.Model==model)&(significance_data.Phenomenon==phenomenon)]
        bar_height = sig_data['Bar Height'].values[0]
        p_stars = sig_data['p_stars'].values[0]
        # Significant effects in the wrong direction are highlighted in red
        color = 'red' if sig_data['direction'].values[0]=='opposite' else 'black'
        axis_label = phenomenon_to_axis_label[phenomenon]
        fig.add_annotation(x=model,xref='x'+axis_label,yref='y'+axis_label,y=bar_height+0.05,text=p_stars,
                           showarrow=False,font=dict(size=16,color=color))
fig.update_layout(showlegend=False)
fig.show()

In [37]:
# Select cross-category non-monotonicity data for detailed analysis
non_monotonicity_data = argument_data[argument_data.Phenomenon=='Cross-Category Non-Monotonicity'].reset_index(drop=True)

# Get indices of arguments with high similarity as rated by ISC-CI model (in the high argument group):
# rows whose ISC-CI rating is at or above the median of the High group
similarities = non_monotonicity_data.loc[non_monotonicity_data['Argument Group']=='High','ISC-CI']
high_sim_idxs = similarities[similarities>=similarities.median()].index

# Augment indices to include arguments in the weak group
# NOTE(review): assumes each High argument is immediately followed by its
# Low counterpart in row order - confirm against the CSV layout
high_sim_idxs = np.concatenate([high_sim_idxs,high_sim_idxs+1])

# Add similarity group to data
non_monotonicity_data['Similarity Group'] = 'Low'
non_monotonicity_data.loc[high_sim_idxs,'Similarity Group'] = 'High'

# Mean rating (with a normal-approximation 95% CI) for each model, per
# similarity group and argument group
plot_data = (
    non_monotonicity_data[['Similarity Group','Argument Group']+models]
    .melt(id_vars=['Similarity Group','Argument Group'])
    .rename(columns={'variable':'Model','value':'Rating'})
)
plot_data = (
    plot_data.groupby(['Similarity Group','Argument Group','Model'])
    .agg(['mean','sem'])['Rating']
    .reset_index()
    .rename(columns={'mean':'Rating'})
)
plot_data['95ci'] = plot_data['sem']*1.96
fig = px.bar(plot_data,x='Model',y='Rating',error_y='95ci',color='Argument Group',barmode='group',
             category_orders={'Model':['ISC-CI','Overlap','SCM'],
                              'Similarity Group':['High','Low']},
             facet_col='Similarity Group', facet_col_wrap=3, facet_row_spacing=0.15, pattern_shape='Argument Group',range_y=[0,1])

# Format figure style & coloring
fig = utils.format_figure(fig,width=1000,height=400)
# Facet titles arrive as "Similarity Group=<value>"; show "<value> Similarity"
fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[1]+' Similarity'))
fig.update_annotations(font_size=16)
fig.update_xaxes(showticklabels=True,tickfont=dict(size=16))
# `title_font` replaces the deprecated `titlefont` axis property
fig.update_yaxes(title_font=dict(size=20),tickfont=dict(size=16))
# One color per bar within each trace, in alphabetical model order
# (assumes the bars in each trace follow the groupby's sorted model order - TODO confirm)
colors = [utils.model_colors[x] for x in sorted(models)]
for trace in fig.data:
    trace.marker.color = colors

# Set up statistical significance
# Annotations are placed above the taller of the two bars (mean + CI) per model
plot_data['Bar Height'] = plot_data['Rating']+plot_data['95ci']
# NOTE(review): regression_columns / regression_data are not used in this cell;
# kept in case other cells rely on them
regression_columns = ['coef','se','T','pval','CI[2.5%]','CI[97.5%]']
significance_data = plot_data.groupby(['Similarity Group','Model']).max().reset_index()[['Similarity Group','Model','Bar Height']]
regression_data = []

# Run paired t-tests (High vs Low argument group) within each similarity group
ttest_data = []
for group in significance_data['Similarity Group'].unique():
    phenomenon_data = non_monotonicity_data[(non_monotonicity_data['Similarity Group']==group)]
    high_data = phenomenon_data[phenomenon_data['Argument Group']=='High']
    low_data = phenomenon_data[phenomenon_data['Argument Group']=='Low']
    # Pair High/Low arguments that share the same first premise and conclusion
    combined_data = high_data.merge(low_data,on=['Premise 1','Conclusion'],suffixes=('_high','_low'))
    for model in models:
        statistic, p = ttest_rel(combined_data[f'{model}_high'],combined_data[f'{model}_low'])
        # Only a significant effect in the wrong direction counts as 'opposite'
        direction = 'correct' if high_data[model].mean()>low_data[model].mean() or p>=.05 else 'opposite'
        ttest_data.append({'Model':model,'Similarity Group':group,'statistic':statistic,'p':p,'p_stars':utils.p_to_stars(p),'direction':direction})

# Add significance to bar plot
significance_data = significance_data.merge(pd.DataFrame(ttest_data),on=['Similarity Group','Model'])

# Axis-reference suffix of each facet's subplot: 'High' is the first facet
# ('x'/'y'), 'Low' the second ('x2'/'y2')
phenomenon_to_axis_label = {'Low':'2','High':''}
for group in significance_data['Similarity Group'].unique():
    for model in plot_data.Model.unique():
        sig_data = significance_data[(significance_data.Model==model)&(significance_data['Similarity Group']==group)]
        bar_height = sig_data['Bar Height'].values[0]
        p_stars = sig_data['p_stars'].values[0]
        # Significant effects in the wrong direction are highlighted in red
        color = 'red' if sig_data['direction'].values[0]=='opposite' else 'black'
        axis_label = phenomenon_to_axis_label[group]
        fig.add_annotation(x=model,xref='x'+axis_label,yref='y'+axis_label,y=bar_height+0.05,text=p_stars,
                           showarrow=False,font=dict(size=16,color=color))
fig.update_layout(showlegend=False)
fig.show()

In [11]:
# Select cross-category non-monotonicity data for detailed analysis.
# The phenomenon is relabelled when the data is loaded, so accept both the
# display label and the raw label; filtering on the raw label alone selects
# nothing once the relabelling has run.
cross_category_labels = ['Cross-Category Non-Monotonicity','Non-Monotonicity (Cross-Category)']
non_monotonicity_data = argument_data[argument_data.Phenomenon.isin(cross_category_labels)].reset_index(drop=True)
# 'Premise 2 Name' holds the second premise of the pair for BOTH rows: High rows
# take the 'Premise 2' value of the Low rows
# NOTE(review): relies on High and Low rows being equal in number and listed
# in matching order - confirm against the CSV layout
non_monotonicity_data['Premise 2 Name'] = non_monotonicity_data['Premise 2']
high_arg_idxs = non_monotonicity_data[non_monotonicity_data['Argument Group']=='High'].index
non_monotonicity_data.loc[high_arg_idxs,'Premise 2 Name'] = non_monotonicity_data[non_monotonicity_data['Argument Group']=='Low']['Premise 2'].values

# Add living vs non-living labels to data
category_df = pd.read_csv('data/leuven_dataset/leuven_categories.csv')[['Name','Category 2']]
# Add a blank entry so that empty premise names still find a match in the merges below.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
blank_category = pd.DataFrame([{'Name':'','Category 2':''}])
category_df = pd.concat([category_df,blank_category],ignore_index=True)
non_monotonicity_data = non_monotonicity_data.replace({np.nan:''}).merge(category_df,left_on='Premise 1',right_on='Name').rename(columns={'Category 2':'Premise 1 Category'}).drop(columns='Name')
non_monotonicity_data = non_monotonicity_data.merge(category_df,left_on='Premise 2 Name',right_on='Name').rename(columns={'Category 2':'Premise 2 Category'}).drop(columns='Name')
# 'Match' when both premises fall in the same 'Category 2' class, else 'Mismatch'
same_category = non_monotonicity_data['Premise 1 Category']==non_monotonicity_data['Premise 2 Category']
non_monotonicity_data['Similarity Group'] = np.where(same_category,'Match','Mismatch')

# Mean rating (with a normal-approximation 95% CI) for each model, per
# category-match group and argument group
plot_data = (
    non_monotonicity_data[['Similarity Group','Argument Group']+models]
    .melt(id_vars=['Similarity Group','Argument Group'])
    .rename(columns={'variable':'Model','value':'Rating'})
)
plot_data = (
    plot_data.groupby(['Similarity Group','Argument Group','Model'])
    .agg(['mean','sem'])['Rating']
    .reset_index()
    .rename(columns={'mean':'Rating'})
)
plot_data['95ci'] = plot_data['sem']*1.96
# 'ISC-CI' (not the outdated 'CICO' name) so the ordering matches the model column
fig = px.bar(plot_data,x='Model',y='Rating',error_y='95ci',color='Argument Group',barmode='group',
             category_orders={'Model':['ISC-CI','Overlap','SCM'],
                              'Similarity Group':['Match','Mismatch']},
             facet_col='Similarity Group', facet_col_wrap=3, facet_row_spacing=0.15, pattern_shape='Argument Group',range_y=[0,1])

# Format figure style & coloring
fig = utils.format_figure(fig,width=1000,height=400)
# Facet titles arrive as "Similarity Group=<value>"; show "Category <value>"
fig.for_each_annotation(lambda a: a.update(text='Category '+a.text.split('=')[1]))
fig.update_annotations(font_size=16)
fig.update_xaxes(showticklabels=True,tickfont=dict(size=16))
# `title_font` replaces the deprecated `titlefont` axis property
fig.update_yaxes(title_font=dict(size=20),tickfont=dict(size=16))
# One color per bar within each trace, in alphabetical model order
# (assumes the bars in each trace follow the groupby's sorted model order - TODO confirm)
colors = [utils.model_colors[x] for x in sorted(models)]
for trace in fig.data:
    trace.marker.color = colors

# Set up statistical significance
# Annotations are placed above the taller of the two bars (mean + CI) per model
plot_data['Bar Height'] = plot_data['Rating']+plot_data['95ci']
# NOTE(review): regression_columns / regression_data are not used in this cell;
# kept in case other cells rely on them
regression_columns = ['coef','se','T','pval','CI[2.5%]','CI[97.5%]']
significance_data = plot_data.groupby(['Similarity Group','Model']).max().reset_index()[['Similarity Group','Model','Bar Height']]
regression_data = []

# Run paired t-tests (High vs Low argument group) within each match group
ttest_data = []
for group in significance_data['Similarity Group'].unique():
    phenomenon_data = non_monotonicity_data[(non_monotonicity_data['Similarity Group']==group)]
    high_data = phenomenon_data[phenomenon_data['Argument Group']=='High']
    low_data = phenomenon_data[phenomenon_data['Argument Group']=='Low']
    # Pair High/Low arguments that share the same first premise and conclusion
    combined_data = high_data.merge(low_data,on=['Premise 1','Conclusion'],suffixes=('_high','_low'))
    for model in models:
        statistic, p = ttest_rel(combined_data[f'{model}_high'],combined_data[f'{model}_low'])
        # Only a significant effect in the wrong direction counts as 'opposite'
        direction = 'correct' if high_data[model].mean()>low_data[model].mean() or p>=.05 else 'opposite'
        ttest_data.append({'Model':model,'Similarity Group':group,'statistic':statistic,'p':p,'p_stars':utils.p_to_stars(p),'direction':direction})

# Add significance to bar plot
significance_data = significance_data.merge(pd.DataFrame(ttest_data),on=['Similarity Group','Model'])

# Axis-reference suffix of each facet's subplot: 'Match' is the first facet
# ('x'/'y'), 'Mismatch' the second ('x2'/'y2')
phenomenon_to_axis_label = {'Mismatch':'2','Match':''}
for group in significance_data['Similarity Group'].unique():
    for model in plot_data.Model.unique():
        sig_data = significance_data[(significance_data.Model==model)&(significance_data['Similarity Group']==group)]
        bar_height = sig_data['Bar Height'].values[0]
        p_stars = sig_data['p_stars'].values[0]
        # Significant effects in the wrong direction are highlighted in red
        color = 'red' if sig_data['direction'].values[0]=='opposite' else 'black'
        axis_label = phenomenon_to_axis_label[group]
        fig.add_annotation(x=model,xref='x'+axis_label,yref='y'+axis_label,y=bar_height+0.05,text=p_stars,
                           showarrow=False,font=dict(size=16,color=color))
fig.update_layout(showlegend=False)
fig.show()

# Study 2: Context Effects in Inductive Reasoning

## Context-Dependent Non-Monotonicity

In [33]:
# Human forced-choice responses
human_data = pd.read_csv(
    'data/generalization_experiments/context_dependent_nonmonotonicity.csv',
    index_col=0,
)

# LLM responses; convert each to True where the stored response equals 2
# (presumably "Argument 2 was chosen" - TODO confirm against the LLM export)
model_data = pd.read_csv(
    'data/generalization_experiments/context_dependent_nonmonotonicity_llm.csv',
    index_col=0,
)
model_data[['GPT-4','GPT-3.5']] = model_data[['GPT-4','GPT-3.5']] == 2

# Reshape the LLM table to long form (one row per argument pair and model),
# treating every model as a single participant with id 0, then stack it on
# top of the human rows
model_data['participant_id'] = 0
model_data = model_data.melt(
    id_vars=[
        'Argument 1-Premise 1', 'Argument 1-Premise 2', 'Argument 1-Conclusion',
        'Argument 2-Premise 1', 'Argument 2-Premise 2', 'Argument 2-Conclusion',
        'participant_id',
    ],
    value_vars=['GPT-3.5', 'GPT-4'],
    var_name='Model',
    value_name='Argument 2 Chosen',
)
human_data['Model'] = 'Human'
argument_data = pd.concat([model_data, human_data], axis=0, ignore_index=True)

# Per-participant mean of every aggregated column, then a box plot of the
# share of trials on which Argument 2 was chosen
# NOTE(review): DataFrame groupby .mean() without numeric_only depends on the
# pandas version when text columns are present - verify on upgrade
plot_data = argument_data.groupby(['Model', 'participant_id']).mean().reset_index()
plot_data = plot_data[plot_data.Model.isin(['Human', 'GPT-3.5', 'GPT-4'])]
plot_data = plot_data.rename(
    columns={'Argument 2 Chosen': 'Agreement with ISC-CI (% of Arguments)'}
)
fig = px.box(
    plot_data,
    y='Model',
    color='Model',
    x='Agreement with ISC-CI (% of Arguments)',
    category_orders={'Model': ['Human', 'GPT-3.5', 'GPT-4']},
    color_discrete_map=utils.model_colors,
    range_x=[-.01, 1.01],
)
fig = utils.format_figure(fig, height=400, width=800)
fig.add_vline(x=0.5, line_dash='dot', line_color="black")  # reference line at 0.5
fig.update_traces(line=dict(width=4))
fig.update_layout(boxgap=0.1, boxgroupgap=0.01, showlegend=False)
fig.show()

In [34]:
# Display the per-model / per-participant summary table used for the box plot
plot_data

Unnamed: 0,Model,participant_id,Argument 2-Premise 2,Agreement with ISC-CI (% of Arguments)
0,GPT-3.5,0,,0.0375
1,GPT-4,0,,0.0125
2,Human,5499-7425-7275,,0.90625
3,Human,5c4f5967aac8be0001716a65,,0.823529
4,Human,5c50efd2cc71f4000125ce0d,,0.454545
5,Human,5cfbf333ac28dd00196e807b,,0.970588
6,Human,5d37ad921c16bb0015281337,,0.6
7,Human,5ee4aab1232e9d1b4d791bae,,0.848485
8,Human,5ef5f74fb4a08f2676fb3aba,,0.909091
9,Human,5ef877bb6b0ed15b3f6ae106,,0.457143


### Unpaired version

In [31]:
# Load human data
human_data = pd.read_csv('data/generalization_experiments/context_dependent_nonmonotonicity.csv', index_col=0)

# Split into two arguments to calculate model scores
# One row per unique argument pair. 'Argument 2-Premise 2' is not a grouping
# key here, yet it is selected from the aggregated result below
# (presumably it is empty for every row in this experiment - TODO confirm)
model_data = human_data.groupby(['Argument 1-Premise 1','Argument 1-Premise 2','Argument 1-Conclusion',
                                    'Argument 2-Premise 1','Argument 2-Conclusion']).mean().reset_index()
# Take explicit copies so the column assignments below act on independent
# frames rather than on slices of model_data (avoids SettingWithCopy issues)
model_argument1 = model_data[['Argument 1-Premise 1','Argument 1-Premise 2','Argument 1-Conclusion']].copy()
model_argument2 = model_data[['Argument 2-Premise 1','Argument 2-Premise 2','Argument 2-Conclusion']].copy()
model_argument1.columns = ['Premise 1','Premise 2','Conclusion']
model_argument2.columns = ['Premise 1','Premise 2','Conclusion']
model_argument1['Premise 3'] = np.nan
# Argument 2 is given an empty-string second premise for the merge below
# NOTE(review): read_csv loads blank cells as NaN by default, so confirm the
# LLM file really matches on '' here
model_argument2['Premise 2'] = ''
model_argument2['Premise 3'] = np.nan

# Add LLM predictions
llm_data = pd.read_csv('data/generalization_experiments/context_dependent_nonmonotonicity_llm_unpaired.csv', index_col=0)
model_argument1 = model_argument1.merge(llm_data,on=['Premise 1','Premise 2','Conclusion'],how='left')
model_argument2 = model_argument2.merge(llm_data,on=['Premise 1','Premise 2','Conclusion'],how='left')

# Generate paired model predictions for which argument is stronger
# The two frames are placed side by side by row index, so this assumes the
# left merges above kept one row per argument in the original order
model_argument1 = model_argument1.add_prefix('Argument 1-')
model_argument2 = model_argument2.add_prefix('Argument 2-')
model_data = pd.concat([model_argument1,model_argument2],axis=1)
# Only GPT-3.5 is evaluated in the unpaired version
for model in ['GPT-3.5']:
    # True when the model rates Argument 2 strictly higher than Argument 1
    model_data[model] = model_data[f'Argument 2-{model}']>model_data[f'Argument 1-{model}']

# Merge human and model data
# The model is treated as a single participant with id 0
model_data['participant_id'] = 0
model_data = model_data.melt(id_vars=['Argument 1-Premise 1','Argument 1-Premise 2','Argument 1-Conclusion',
                                      'Argument 2-Premise 1','Argument 2-Premise 2','Argument 2-Conclusion','participant_id'],
                             value_vars=['GPT-3.5'],var_name='Model',value_name='Argument 2 Chosen')
human_data['Model'] = 'Human'
argument_data = pd.concat([model_data,human_data],axis=0,ignore_index=True)

# Generate box plot of the per-participant share of trials choosing Argument 2
plot_data = argument_data.groupby(['Model','participant_id']).mean().reset_index()
plot_data = plot_data[plot_data.Model.isin(['Human','GPT-3.5','GPT-4'])]
plot_data = plot_data.rename(columns={'Argument 2 Chosen':'Agreement with ISC-CI (% of Arguments)'})
fig = px.box(plot_data,y='Model',color='Model',x='Agreement with ISC-CI (% of Arguments)',
                category_orders={'Model':['Human','GPT-3.5']},
                color_discrete_map=utils.model_colors,
                range_x=[-.01,1.01])
fig = utils.format_figure(fig,height=400,width=800)
fig.add_vline(x=0.5, line_dash='dot', line_color="black")  # reference line at 0.5
fig.update_traces(line=dict(width=4))
fig.update_layout(boxgap=0.1,boxgroupgap=0.01,showlegend=False)
fig.show()

In [20]:
# Display the first-argument table (after the LLM merge) for inspection
model_argument1

Unnamed: 0,Argument 1-Premise 3_x,Argument 1-Premise 1,Argument 1-Premise 2,Argument 1-Premise 3_y,Argument 1-Conclusion,Argument 1-GPT-3.5


## Context-Dependent Categorization

In [6]:
# Load human data
human_data = pd.read_csv('data/generalization_experiments/thematic_arguments.csv', index_col=0)

# Split into two arguments to calculate model scores
# One row per unique argument pair
model_data = human_data.groupby(['Argument 1-Premise 1','Argument 1-Premise 2','Argument 1-Conclusion',
                                    'Argument 2-Premise 1','Argument 2-Premise 2','Argument 2-Conclusion']).mean().reset_index()
# Take explicit copies so the column assignments below act on independent
# frames rather than on slices of model_data (avoids SettingWithCopy issues)
model_argument1 = model_data[['Argument 1-Premise 1','Argument 1-Premise 2','Argument 1-Conclusion']].copy()
model_argument2 = model_data[['Argument 2-Premise 1','Argument 2-Premise 2','Argument 2-Conclusion']].copy()
model_argument1.columns = ['Premise 1','Premise 2','Conclusion']
model_argument2.columns = ['Premise 1','Premise 2','Conclusion']
model_argument1['Premise 3'] = np.nan
model_argument2['Premise 3'] = np.nan

# Add LLM predictions
llm_data = pd.read_csv('data/generalization_experiments/thematic_arguments_llm.csv', index_col=0)
model_argument1 = model_argument1.merge(llm_data,on=['Premise 1','Premise 2','Conclusion'],how='left')
model_argument2 = model_argument2.merge(llm_data,on=['Premise 1','Premise 2','Conclusion'],how='left')

# Generate paired model predictions for which argument is stronger
# The two frames are placed side by side by row index, so this assumes the
# left merges above kept one row per argument in the original order
model_argument1 = model_argument1.add_prefix('Argument 1-')
model_argument2 = model_argument2.add_prefix('Argument 2-')
model_data = pd.concat([model_argument1,model_argument2],axis=1)
for model in ['GPT-3.5','GPT-4']:
    # True when the model rates Argument 2 strictly higher than Argument 1
    model_data[model] = model_data[f'Argument 2-{model}']>model_data[f'Argument 1-{model}']

# Merge human and model data
# Each model is treated as a single participant with id 0
model_data['participant_id'] = 0
model_data = model_data.melt(id_vars=['Argument 1-Premise 1','Argument 1-Premise 2','Argument 1-Conclusion',
                                      'Argument 2-Premise 1','Argument 2-Premise 2','Argument 2-Conclusion','participant_id'],
                             value_vars=['GPT-3.5','GPT-4'],var_name='Model',value_name='Argument 2 Chosen')
human_data['Model'] = 'Human'
argument_data = pd.concat([model_data,human_data],axis=0,ignore_index=True)

# Generate box plot of the per-participant share of trials choosing Argument 2
plot_data = argument_data.groupby(['Model','participant_id']).mean().reset_index()
plot_data = plot_data[plot_data.Model.isin(['Human','GPT-3.5','GPT-4'])]
plot_data = plot_data.rename(columns={'Argument 2 Chosen':'Agreement with ISC-CI (% of Arguments)'})
fig = px.box(plot_data,y='Model',color='Model',x='Agreement with ISC-CI (% of Arguments)',
                category_orders={'Model':['Human','GPT-3.5','GPT-4']},
                color_discrete_map=utils.model_colors,
                range_x=[-.01,1.01])
fig = utils.format_figure(fig,height=400,width=800)
fig.add_vline(x=0.5, line_dash='dot', line_color="black")  # reference line at 0.5
fig.update_traces(line=dict(width=4))
fig.update_layout(boxgap=0.1,boxgroupgap=0.01,showlegend=False)
fig.show()