# Setup & Imports

In [1]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import pingouin as pg
import plotly.express as px
from scipy.stats import binom

import analysis
import utils

# Load the CICO model (the '1and2shot_isc' variant, seed 3) together with the
# data loader returned alongside it; both are reused by every study below.
cico_model, data_loader = utils.load_cico_model(
    model_type='1and2shot_isc',
    model_seed=3,
)

# Study 3: In-Category Similarity

In [4]:
# Models whose predictions are compared against the human similarity ratings
models = ['ISC-CI', 'Overlap', 'Contrast', 'GPT-3.5', 'GPT-4']

# Load the human ratings and average them into one row per (premise, conclusion, domain)
human_data = pd.read_csv('data/similarity_experiments/similarity_in_domain.csv', index_col=0)
rating_keys = ['Premise 1', 'Conclusion', 'Domain']
model_data = human_data.groupby(rating_keys).mean().reset_index()

# The extra premise slots are filled with NaN before the models are run
for empty_premise in ('Premise 2', 'Premise 3'):
    model_data[empty_premise] = np.nan

# Run the CICO, overlap and contrast models in turn; each call returns the updated frame
model_data = analysis.calc_argument_strength(model_data, cico_model, data_loader)
model_data = analysis.calc_argument_strength_overlap(model_data)
model_data = analysis.calc_argument_strength_contrast(model_data)

# Attach the stored LLM predictions, keeping every model row (left join)
llm_data = pd.read_csv('data/similarity_experiments/similarity_in_domain_llm.csv', index_col=0)
llm_keys = ['Premise 1', 'Conclusion']
model_data = model_data.merge(llm_data, left_on=llm_keys, right_on=llm_keys, how='left')

# Score the reversed argument (conclusion -> premise) with the CICO model as well
model_data_flipped = model_data.copy(deep=True).assign(**{
    'Premise 1': model_data['Conclusion'],
    'Conclusion': model_data['Premise 1'],
})
model_data_flipped = analysis.calc_argument_strength(model_data_flipped, cico_model, data_loader)

# Bidirectional similarity: mean of the logistic-transformed CICO scores of both directions
forward_score = 1/(1+np.exp(-model_data['CICO']))
reverse_score = 1/(1+np.exp(-model_data_flipped['CICO']))
model_data['ISC-CI'] = (forward_score + reverse_score)/2

In [8]:
# Pearson correlation between each model's predictions and the mean human rating,
# computed separately within each domain. Only the model columns and the human column
# are correlated: calling corr() on the whole frame would also include any non-numeric
# columns (presumably the premise/conclusion labels), which pandas >= 2.0 no longer
# drops silently. Pairwise correlations do not depend on which other columns are present.
human_column = 'Normalized Similarity'
score_columns = [name for name in models if name in model_data.columns]
model_plot_data = (
    model_data.groupby('Domain')[score_columns + [human_column]]
    .corr('pearson')[[human_column]]
    .reset_index()
    .melt(id_vars=['Domain', 'level_1'])
    .rename(columns={'level_1': 'Model', 'value': 'Correlation'})
)
model_plot_data = model_plot_data[model_plot_data['Model'].isin(models)]

# Correlation between each individual participant and the remaining participants
# within each domain.
pair_keys = ['Premise 1', 'Conclusion']
human_plot_data = []
# Group by the scalar key (not a one-element list) so group_name is the plain domain
# value on every pandas version; a list key yields 1-tuples on pandas >= 2.0.
for group_name, group in human_data.groupby('Domain'):
    # Despite its name this holds the per-pair SUM of ratings; subtracting one
    # participant's ratings below gives the leave-one-out total of everyone else.
    mean_similarity = group.groupby(pair_keys)[human_column].sum().values
    for participant_number in group['participant_id'].unique():
        participant_rows = group[group['participant_id'] == participant_number]
        participant_similarity = participant_rows.groupby(pair_keys)[human_column].mean().values
        corr = utils.nansafe_correlation(participant_similarity, mean_similarity - participant_similarity)[0]
        human_plot_data.append({'Correlation': corr, 'participant_id': participant_number, 'Domain': group_name})
human_plot_data = pd.DataFrame(human_plot_data)

# Merge model & human data (models get a placeholder participant id of 0)
model_plot_data['participant_id'] = 0
human_plot_data['Model'] = 'Human'
plot_data = pd.concat([model_plot_data, human_plot_data], ignore_index=True)

# Plot correlations
plot_data = plot_data.rename(columns={'Domain': 'Category'})
fig = px.box(plot_data, x='Category', color='Model', y='Correlation',
             category_orders={'Model': ['Human'] + models},
             color_discrete_map=utils.model_colors, range_y=[-.12, 1])
# NOTE(review): format_figure is applied twice with different sizes; the later call
# presumably determines the final size - confirm whether this first call is needed.
fig = utils.format_figure(fig, height=600, width=600)
fig.add_hline(y=0, line_dash='dot', line_color="black")
fig.update_traces(line=dict(width=4))
fig = utils.format_figure(fig, width=1400, height=600)
fig.update_layout(legend=dict(title='', orientation='h', yanchor='top', y=0.99, xanchor='right', x=0.99,
                              font=dict(size=15)))
fig.update_layout(boxgap=0.3, boxgroupgap=0.15)
fig.show()

# Study 4: Context Effects in Similarity Judgments

## Asymmetry

In [17]:
# Models evaluated on the asymmetry task, and the columns that identify one
# pair of competing arguments
models = ['ISC-CI', 'Contrast', 'GPT-3.5', 'GPT-4']
argument_columns = ['Argument 1-Premise 1', 'Argument 1-Premise 2', 'Argument 1-Conclusion',
                    'Argument 2-Premise 1', 'Argument 2-Premise 2', 'Argument 2-Conclusion']

# Load human data; missing cells become '' so that rows with an empty argument slot
# are still grouped (groupby drops NaN keys by default)
human_data = pd.read_csv('data/similarity_experiments/similarity_asymmetry.csv', index_col=0).fillna('')

# Find object pairs that have significant asymmetry scores.
# numeric_only=True makes the mean explicit about skipping non-numeric columns,
# which pandas >= 2.0 would otherwise reject instead of silently dropping.
grouped_data = human_data.groupby(argument_columns).mean(numeric_only=True)
counts = human_data.groupby(argument_columns).count()[['participant_id']]
counts.columns = ['Participant Count']
grouped_data = grouped_data.merge(counts, left_index=True, right_index=True).reset_index()

# Effect size = share of participants who picked the majority option (always >= 0.5)
grouped_data['Effect Size'] = np.max((grouped_data['Argument 2 Chosen'], 1-grouped_data['Argument 2 Chosen']), axis=0)

# Recover the integer number of majority votes. share * count is a float that can land
# just below the true integer (e.g. 6.999...), and binom.sf floors its first argument,
# which would add one extra outcome to the tail and inflate the p-value.
majority_votes = np.round(grouped_data['Effect Size']*grouped_data['Participant Count'])
# One-sided binomial test: sf(k-1, n, 0.5) = P(at least k of n agree by chance)
grouped_data['pval'] = binom.sf(majority_votes-1, grouped_data['Participant Count'], 0.5)
significant_data = grouped_data[grouped_data.pval<=0.05][argument_columns]
significant_data['pair'] = significant_data['Argument 1-Premise 1']+'-'+significant_data['Argument 1-Conclusion']

# Select significant pairs.
# Flag, per row, whether the majority of participants for that argument pair chose
# argument 2. Only the 'Argument 2 Chosen' column is averaged: transforming the whole
# frame computes a mean for every column and raises on non-numeric ones in pandas >= 2.0.
human_data['Argument 2 Chosen Majority'] = human_data.groupby(argument_columns)['Argument 2 Chosen'].transform('mean')>.5
# Key each row by its first argument's premise and conclusion, then keep only the
# rows whose key appears among the significant pairs
human_data['pair'] = human_data['Argument 1-Premise 1']+'-'+human_data['Argument 1-Conclusion']
human_data = human_data[human_data.pair.isin(significant_data.pair)]

# Split into two arguments to calculate model scores.
# human_data carries the string 'pair' column at this point, so the mean is restricted
# to numeric/boolean columns; pandas >= 2.0 raises on string columns otherwise.
model_data = human_data.groupby(argument_columns).mean(numeric_only=True).reset_index()

# Take explicit copies so the renames and assignments below act on independent frames
# rather than on slices of model_data.
model_argument1 = model_data[['Argument 1-Premise 1','Argument 1-Premise 2','Argument 1-Conclusion','Argument 2 Chosen Majority']].copy()
model_argument2 = model_data[['Argument 2-Premise 1','Argument 2-Premise 2','Argument 2-Conclusion']].copy()
model_argument1.columns = ['Premise 1','Premise 2','Conclusion','Argument 2 Chosen Majority']
model_argument2.columns = ['Premise 1','Premise 2','Conclusion']

# Overwrite the second premise and add a third one, both as NaN, before running the models
model_argument1['Premise 2'] = np.nan
model_argument1['Premise 3'] = np.nan
model_argument2['Premise 2'] = np.nan
model_argument2['Premise 3'] = np.nan

# Run models to generate strength predictions
model_argument1 = analysis.calc_argument_strength(model_argument1, cico_model, data_loader)
model_argument2 = analysis.calc_argument_strength(model_argument2, cico_model, data_loader)
model_argument1 = analysis.calc_argument_strength_contrast(model_argument1,alpha=1,beta=0)
model_argument2 = analysis.calc_argument_strength_contrast(model_argument2,alpha=1,beta=0)

# Put the two arguments side by side, one row per argument pair; the prefixes keep
# the per-argument score columns apart
model_argument1 = model_argument1.add_prefix('Argument 1-')
model_argument2 = model_argument2.add_prefix('Argument 2-')
model_data = pd.concat([model_argument1, model_argument2], axis=1)
model_data = model_data.rename(columns={'Argument 1-Argument 2 Chosen Majority': 'Argument 2 Chosen Majority'})

# Add LLM predictions: turn each LLM's choice code (1 or 2) into a 0/1 indicator
# per argument so it can be compared like the numeric model scores
llm_data = pd.read_csv('data/similarity_experiments/similarity_asymmetry_llm.csv', index_col=0)
indicator_columns_by_choice = (
    (1, ['Argument 1-GPT-3.5', 'Argument 1-GPT-4']),
    (2, ['Argument 2-GPT-3.5', 'Argument 2-GPT-4']),
)
for choice_code, indicator_columns in indicator_columns_by_choice:
    llm_data[indicator_columns] = (llm_data[['GPT-3.5', 'GPT-4']] == choice_code).astype(float)
pair_keys = ['Argument 1-Premise 1', 'Argument 1-Conclusion', 'Argument 2-Premise 1', 'Argument 2-Conclusion']
model_data = model_data.merge(llm_data, on=pair_keys)

# A model agrees with humans when "argument 2 scores strictly higher" matches the
# human majority choice for that pair
for model in models:
    prefers_argument2 = model_data[f'Argument 2-{model}'] > model_data[f'Argument 1-{model}']
    model_data[model] = prefers_argument2 == model_data['Argument 2 Chosen Majority']

# Long format: one row per (argument pair, model)
model_data = model_data.melt(
    id_vars=argument_columns,
    value_vars=models,
    var_name='Model',
    value_name='Agreement with Humans (%)',
)

# Generate bar plot of the share of argument pairs on which each model agrees with
# the human majority. Only the agreement column is averaged: the melted frame still
# holds the string argument columns, which groupby.mean() rejects on pandas >= 2.0.
plot_data = model_data.groupby('Model')['Agreement with Humans (%)'].mean().reset_index()
fig = px.bar(plot_data,x='Model',color='Model',y='Agreement with Humans (%)',
             category_orders={'Model':['Human']+models},
             color_discrete_map=utils.model_colors,range_y=[0,1])
# NOTE(review): this reference line sits at 0.56, while the binomial test applied to
# these same agreement rates uses a chance level of 0.5 - confirm 0.56 is intended.
fig.add_hline(y=0.56, line_dash='dot', line_color="black")
fig = utils.format_figure(fig,height=600,width=800,showlegend=False)

# Per-model significance of the agreement rate, shown as stars above each bar
agreement_column = 'Agreement with Humans (%)'
rows_by_model = model_data.groupby(['Model'])

# Number of agreeing pairs and total number of pairs for each model
model_effect_sizes = rows_by_model.sum().reset_index()
model_effect_sizes['Count'] = rows_by_model.count().reset_index()[agreement_column]

# One-sided binomial test against chance: sf(k-1, n, 0.5) = P(at least k agreements)
model_effect_sizes['pval'] = binom.sf(
    model_effect_sizes[agreement_column]-1,
    model_effect_sizes['Count'],
    0.5,
)
model_effect_sizes['p_stars'] = model_effect_sizes['pval'].apply(utils.p_to_stars)
model_effect_sizes['Bar Height'] = model_effect_sizes[agreement_column]/model_effect_sizes['Count']

# Place each model's stars slightly above the top of its bar
for model in models:
    model_row = model_effect_sizes[model_effect_sizes['Model']==model]
    p_stars = model_row['p_stars'].values[0]
    bar_height = model_row['Bar Height'].values[0]
    fig.add_annotation(
        x=model,
        y=bar_height+0.05,
        text=p_stars,
        showarrow=False,
        font=dict(size=16),
    )

fig.show()

In [18]:
# Only the models that produce graded scores are used for the regression analysis
models = ['ISC-CI','Contrast']

# Generate numerical asymmetry scores for model and humans
model_data = pd.concat([model_argument1,model_argument2],axis=1).rename(columns={'Argument 1-Argument 2 Chosen Majority':'Argument 2 Chosen Majority'})
model_data = model_data.merge(llm_data,on=['Argument 1-Premise 1','Argument 1-Conclusion','Argument 2-Premise 1','Argument 2-Conclusion'])
# Model asymmetry = score of argument 2 minus score of argument 1
for model in models:
    model_data[model] = model_data[f'Argument 2-{model}']-model_data[f'Argument 1-{model}']
# NaN cells are replaced by '' (the same fill applied to human_data on load) so the
# merge on argument_columns below can match the empty argument slots
model_data = model_data.fillna('').melt(id_vars=argument_columns,value_vars=models,var_name='Model',value_name='Model Asymmetry Score')
# Human asymmetry = share of participants choosing argument 2. The column is selected
# before averaging because human_data also holds the string 'pair' column, which
# groupby.mean() rejects on pandas >= 2.0.
human_data_averaged = (
    human_data.groupby(argument_columns)['Argument 2 Chosen']
    .mean()
    .reset_index()
    .rename(columns={'Argument 2 Chosen':'Human Asymmetry Score'})
)
argument_data = model_data.merge(human_data_averaged,on=argument_columns)

# Run regression for each model's predictions
regression_data = []
for model in models:
    model_arg_data = argument_data[argument_data['Model']==model]
    lr = pg.linear_regression(X=model_arg_data['Human Asymmetry Score'],y=model_arg_data['Model Asymmetry Score'])
    # Keep only the slope row (drops the intercept); copy it so the columns added
    # below are written to an independent frame rather than a filtered view
    lr = lr[lr.names=='Human Asymmetry Score'].copy()
    lr['Model'] = model
    lr['p_stars'] = lr['pval'].apply(utils.p_to_stars)
    regression_data.append(lr)
regression_data = pd.concat(regression_data,ignore_index=True)
regression_data = regression_data.rename(columns={'r2':'Agreement with Humans (R<sup>2</sup>)'})

# Generate bar plot of R^2 per model, with significance stars above each bar
fig = px.bar(regression_data,x='Model',color='Model',y='Agreement with Humans (R<sup>2</sup>)',
             category_orders={'Model':['Human']+models},
             color_discrete_map=utils.model_colors,range_y=[0,1])
fig = utils.format_figure(fig,height=600,width=400,showlegend=False)
for model in models:
    p_stars = regression_data[regression_data['Model']==model]['p_stars'].values[0]
    bar_height = regression_data[regression_data['Model']==model]['Agreement with Humans (R<sup>2</sup>)'].values[0]
    fig.add_annotation(x=model,y=bar_height+0.05,text=p_stars,
                       showarrow=False,font=dict(size=16))
fig.show()

## Multi-Alternative Context Effects

In [19]:
# Models evaluated on the multi-alternative task, and the columns identifying one choice set
models = ['ISC-CI', 'GPT-3.5', 'GPT-4']
argument_columns = ['Premise 1', 'Conclusion 1', 'Conclusion 2', 'Distractor 1', 'Distractor 2']
human_data = pd.read_csv('data/similarity_experiments/similarity_in_context.csv', index_col=0)

# Keep only the choice sets where the human context effect is positive
human_context_effects = analysis.calc_similarity_context_effect_human(human_data)
shows_positive_effect = human_context_effects['Context Effect'] > 0
significant_sets = human_context_effects[shows_positive_effect]

# Calculate CICO predictions
model_data = analysis.calc_similarity_context_effect(significant_sets, cico_model, data_loader)


def _llm_choice_frame(llm_frame, model_column, other_column):
    """Return one LLM's choices in the layout the human context-effect helper is given."""
    choices = llm_frame.drop(columns=[other_column]).rename(columns={model_column: 'Choice'})
    # NOTE(review): choice code 1 is mapped to Conclusion 1 and code 3 to Conclusion 2;
    # presumably this follows the option order shown to the LLM - confirm.
    choices['Conclusion 1 Chosen'] = choices.Choice == 1
    choices['Conclusion 2 Chosen'] = choices.Choice == 3
    return choices


# Add LLM predictions: run each LLM's choices through the same context-effect
# calculation used for humans, naming the resulting column after the model
llm_data = pd.read_csv('data/similarity_experiments/similarity_in_context_llm.csv', index_col=0)
gpt_35_data = _llm_choice_frame(llm_data, 'GPT-3.5', 'GPT-4')
gpt_4_data = _llm_choice_frame(llm_data, 'GPT-4', 'GPT-3.5')
gpt_35_data = analysis.calc_similarity_context_effect_human(gpt_35_data)
gpt_35_data = gpt_35_data.rename(columns={'Context Effect': 'GPT-3.5'})
gpt_4_data = analysis.calc_similarity_context_effect_human(gpt_4_data)
gpt_4_data = gpt_4_data.rename(columns={'Context Effect': 'GPT-4'})

# Join the LLM effects onto the CICO predictions, then reshape to one row per (set, model)
for llm_effects in (gpt_35_data, gpt_4_data):
    model_data = model_data.merge(llm_effects, on=argument_columns)
model_data = model_data[models].melt(var_name='Model', value_name='Agreement with Humans (%)')

In [20]:
# A model agrees with humans on a choice set when its context effect is positive,
# so the effects are reduced to 1.0 (positive) / 0.0 (otherwise)
agreement_column = 'Agreement with Humans (%)'
model_direction_data = model_data.copy()
is_positive_effect = model_direction_data[agreement_column] > 0
model_direction_data[agreement_column] = is_positive_effect.astype(float)
direction_by_model = model_direction_data.groupby(['Model'])

# Generate bar plot of the agreement rate per model, with a dotted line at 0.5
plot_data = direction_by_model.mean().reset_index()
fig = px.bar(
    plot_data,
    x='Model',
    color='Model',
    y='Agreement with Humans (%)',
    category_orders={'Model': ['Human']+models},
    color_discrete_map=utils.model_colors,
    range_y=[0, 1],
)
fig.add_hline(y=0.5, line_dash='dot', line_color="black")
fig = utils.format_figure(fig, height=600, width=800, showlegend=False)

# Calculate significance & add to plot.
# One-sided binomial test against chance: sf(k-1, n, 0.5) = P(at least k agreements)
model_effect_sizes = direction_by_model.sum().reset_index()
model_effect_sizes['Count'] = direction_by_model.count().reset_index()[agreement_column]
model_effect_sizes['pval'] = binom.sf(
    model_effect_sizes[agreement_column]-1,
    model_effect_sizes['Count'],
    0.5,
)
model_effect_sizes['p_stars'] = model_effect_sizes['pval'].apply(utils.p_to_stars)
model_effect_sizes['Bar Height'] = model_effect_sizes[agreement_column]/model_effect_sizes['Count']

# Place each model's stars slightly above the top of its bar
for model in models:
    model_row = model_effect_sizes[model_effect_sizes['Model']==model]
    p_stars = model_row['p_stars'].values[0]
    bar_height = model_row['Bar Height'].values[0]
    fig.add_annotation(
        x=model,
        y=bar_height+0.05,
        text=p_stars,
        showarrow=False,
        font=dict(size=16),
    )

fig.show()