# Setup & Imports

In [2]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import pingouin as pg
import plotly.express as px
from scipy.stats import binom

import analysis
import utils

# Load the CICO model together with its data loader; both objects are handed
# to the analysis.calc_* helpers in the study cells.
cico_model, data_loader = utils.load_cico_model(
    model_type='1and2shot_isc',
    model_seed=3,
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Study 3: In-Category Similarity

In [7]:
# Study 3 data preparation: collapse the human ratings to one row per
# (Premise 1, Conclusion, Domain) item, score every item with the three
# argument-strength functions, and add a direction-averaged ISC-CI score.

# 1) Human ratings -> one row per item holding the mean of each numeric column.
human_data = pd.read_csv('data/similarity_experiments/similarity_in_domain.csv', index_col=0)
model_data = (
    human_data
    .groupby(['Premise 1', 'Conclusion', 'Domain'])
    # numeric_only=True matches the Asymmetry cell and keeps this working on
    # pandas >= 2.0, where averaging a frame that still carries a non-numeric
    # column raises instead of silently dropping that column.
    .mean(numeric_only=True)
    .reset_index()
)
# Empty placeholders for the second and third premise slots
# (presumably because every item here is a single-premise argument -- confirm).
model_data[['Premise 2', 'Premise 3']] = np.nan

# 2) Score every item with the three strength functions.
model_data = analysis.calc_argument_strength(model_data, cico_model, data_loader)
model_data = analysis.calc_argument_strength_overlap(model_data)
model_data = analysis.calc_argument_strength_contrast(model_data)

# 3) LLM data for this experiment (loaded here, not used further in this cell).
llm_data = pd.read_csv('data/similarity_experiments/similarity_in_domain_llm.csv', index_col=0)

# 4) Re-score every item with premise and conclusion swapped. Both swapped
#    columns are read from the un-flipped frame, so neither overwrites the other.
model_data_flipped = model_data.assign(**{
    'Premise 1': model_data['Conclusion'],
    'Conclusion': model_data['Premise 1'],
})
model_data_flipped = analysis.calc_argument_strength(model_data_flipped, cico_model, data_loader)

# 5) Average the ISC-CI score over both directions.
# NOTE(review): the addition aligns the two frames on their row index, so this
# assumes calc_argument_strength returns its rows under the index it was given -- confirm.
model_data['ISC-CI_bidirectional'] = (
    model_data['ISC-CI'] + model_data_flipped['ISC-CI']
) / 2


In [10]:
# Models shown in this study. Defined here so the cell no longer relies on a
# `models` list left behind in the kernel by a different cell.
# NOTE(review): the three names are taken from the columns of this cell's
# recorded output; confirm the intended list and display order.
models = ['ISC-CI', 'Contrast', 'Overlap']

# 1. Per-domain Pearson correlation matrix of the numeric columns; keep only each
#    column's correlation with the human 'Normalized Similarity' ratings.
model_plot_data = (
    model_data
      .groupby('Domain')
      .corr(method='pearson', numeric_only=True)[['Normalized Similarity']]
      .reset_index()
      .melt(
        id_vars=['Domain', 'level_1'],
        value_name='Correlation'
      )
      .rename(columns={'level_1': 'Model'})
)
# Filter to only the models of interest. .copy() gives an independent frame so
# the 'participant_id' column added in step 5 is not written into a slice.
model_plot_data = model_plot_data[model_plot_data['Model'].isin(models)].copy()

# 2. Pivot into a Domain x Model table and print each correlation
model_corr_table = (
    model_plot_data
      .pivot(index='Domain', columns='Model', values='Correlation')
      .reset_index()
)
print("Model × Domain correlations:")
print(model_corr_table)


# 3. Pearson correlation of each human participant with the group mean.
#    The group mean is taken over all participants of the domain, including
#    the participant it is compared with.
human_plot_data = []
for domain, group in human_data.groupby('Domain'):
    # mean across all participants, one value per (Premise 1, Conclusion) item
    mean_sim = group.groupby(['Premise 1', 'Conclusion'])['Normalized Similarity'].mean()
    for pid, sub in group.groupby('participant_id'):
        part_sim = sub.groupby(['Premise 1', 'Conclusion'])['Normalized Similarity'].mean()
        # Pair every participant rating with the group mean of the SAME item.
        # Passing the raw .values of both series pairs them by position, which is
        # only right when the participant rated every item of the domain.
        mean_for_participant = mean_sim.reindex(part_sim.index)
        corr = utils.nansafe_correlation(part_sim.values, mean_for_participant.values)[0]
        human_plot_data.append({
            'Domain':      domain,
            'participant_id': pid,
            'Correlation': corr
        })
human_plot_data = pd.DataFrame(human_plot_data)

# 4. Pivot and print human correlations (NaN where a participant has no row for a domain)
human_corr_table = (
    human_plot_data
      .pivot(index='Domain', columns='participant_id', values='Correlation')
      .reset_index()
)
print("\nHuman participant correlations (each vs. group mean):")
print(human_corr_table)


# 5. Combine model and human correlations and display box plots.
#    Model rows get the placeholder participant_id 0; human rows get Model='Human'.
model_plot_data['participant_id'] = 0
human_plot_data['Model'] = 'Human'
plot_data = pd.concat([model_plot_data, human_plot_data], ignore_index=True)
plot_data = plot_data.rename(columns={'Domain': 'Category'})

fig = px.box(
    plot_data,
    x='Category',
    y='Correlation',
    color='Model',
    category_orders={'Model': ['Human'] + models},
    color_discrete_map=utils.model_colors,
    range_y=[-0.12, 1]
)
fig = utils.format_figure(fig, width=1400, height=600)
fig.add_hline(y=0, line_dash='dot')
fig.update_traces(line=dict(width=4))
fig.update_layout(
    legend=dict(
        title='',
        orientation='h',
        yanchor='top', y=0.99,
        xanchor='right', x=0.99,
        font=dict(size=15)
    ),
    boxgap=0.3,
    boxgroupgap=0.15
)
fig.show()

Model × Domain correlations:
Model    Domain  Contrast    ISC-CI   Overlap
0         Birds  0.723198  0.600609  0.720334
1      Clothing  0.599849  0.265638  0.620582
2          Fish  0.598075  0.616449  0.561472
3       Insects  0.660184  0.317864  0.597342
4       Mammals  0.687843  0.506731  0.697147
5      Reptiles  0.755248  0.602044  0.684916
6         Tools  0.557312  0.381674  0.468311
7      Vehicles  0.590723  0.578338  0.623666
8       Weapons  0.729827  0.616292  0.717406

Human participant correlations (each vs. group mean):
participant_id    Domain         0         1         2         3         4  \
0                  Birds       NaN       NaN       NaN       NaN       NaN   
1               Clothing  0.711667  0.121965  0.079386  0.722884  0.394995   
2                   Fish       NaN       NaN       NaN       NaN       NaN   
3                Insects       NaN       NaN       NaN       NaN       NaN   
4                Mammals  0.774886  0.472335  0.180570  0.336802  

# Study 4: Context Effects in Similarity Judgments

## Asymmetry

In [None]:
# Asymmetry, stage 1: load the human choices, test every argument pair for a
# reliable majority preference, and keep only the raw rows of reliable pairs.

# 1) Setup
models = ['ISC-CI', 'Contrast', 'GPT-3.5', 'GPT-4']
argument_columns = [
    'Argument 1-Premise 2',
    'Argument 1-Conclusion',
    'Argument 2-Premise 1',
    'Argument 2-Premise 2',
    'Argument 2-Conclusion'
]

# 2) Load & clean human data. Missing cells become '' so rows with an empty
#    argument slot are not dropped by the groupby calls below; the choice column
#    is then converted back to numbers (unparseable entries become NaN).
human_data = (
    pd.read_csv('data/similarity_experiments/similarity_asymmetry.csv', index_col=0)
      .fillna('')
)
human_data['Argument 2 Chosen'] = pd.to_numeric(human_data['Argument 2 Chosen'], errors='coerce')

# 3) Per-pair means & response counts on a shared MultiIndex.
#    The count is taken over valid (non-NaN) choices because the mean ignores
#    NaN too; counting participant_id rows would overstate the number of trials
#    whenever a choice was coerced to NaN in step 2.
grouped = human_data.groupby(argument_columns).mean(numeric_only=True)
counts = (
    human_data.groupby(argument_columns)['Argument 2 Chosen']
              .count()
              .rename('Participant Count')
)
grouped_data = grouped.join(counts).reset_index()

# 4) Effect sizes & binomial p-values.
#    Effect Size is the larger of p and 1 - p, where p is the mean of
#    'Argument 2 Chosen' (the share choosing Argument 2, assuming 0/1 coding).
share_arg2 = grouped_data['Argument 2 Chosen']
grouped_data['Effect Size'] = np.maximum(share_arg2, 1 - share_arg2)
# Number of majority votes. share * n is a float product and can come out a hair
# below the true integer (0.58 * 100 == 57.99999999999999), which would move the
# survival-function threshold down by one vote, so round it first.
majority_votes = (grouped_data['Effect Size'] * grouped_data['Participant Count']).round()
# binom.sf(k - 1, n, 0.5) is P(X >= k) under a fair coin.
grouped_data['pval'] = binom.sf(majority_votes - 1, grouped_data['Participant Count'], 0.5)

# 5) Keep only significant pairs
sig = grouped_data[grouped_data['pval'] <= 0.05][argument_columns].copy()
sig['pair'] = sig['Argument 1-Premise 2'] + '-' + sig['Argument 1-Conclusion']

# 6) Flag the majority choice on every raw row, then keep the rows of significant pairs.
# NOTE(review): 'pair' is built from the two Argument 1 columns only, so every
# item sharing those two values with a significant item is kept -- confirm intended.
human_data['Argument 2 Chosen Majority'] = (
    human_data.groupby(argument_columns)['Argument 2 Chosen']
              .transform('mean') > 0.5
)
human_data['pair'] = human_data['Argument 1-Premise 2'] + '-' + human_data['Argument 1-Conclusion']
human_data = human_data[human_data['pair'].isin(sig['pair'])].copy()

# Asymmetry, stage 2: turn every argument pair into two model-input tables
# (one per argument), score both, and put the scores side by side again.
#
# All column names below are written with plain ASCII spaces. The recorded
# failure of this cell, KeyError: 'Premise\xa01', points to a non-breaking space
# inside one of the 'Premise 1' literals of this stage; with ASCII-only literals
# no after-the-fact clean-up of column names is needed.

# 7) One row per argument pair with the human majority choice
#    (mean of a per-pair constant boolean, i.e. 0.0 or 1.0).
pair_table = human_data.groupby(argument_columns, as_index=False)[
    'Argument 2 Chosen Majority'
].mean()

# Keep only pairs in which every text slot of BOTH arguments is filled.
# astype(bool) maps '' to False; NaN would map to True, but human_data was
# loaded with fillna(''), so empty slots arrive here as ''.
# A single shared mask keeps the two tables row-for-row identical; filtering
# them separately and concatenating side by side afterwards leaves half-empty
# rows wherever only one of the two arguments passed its own filter.
has_all_fields = pair_table[argument_columns].astype(bool).all(axis=1)
pair_table = pair_table.loc[has_all_fields]

# Argument 1: its single premise goes into the model's 'Premise 1' slot
arg1 = pair_table[[
    'Argument 1-Premise 2', 'Argument 1-Conclusion', 'Argument 2 Chosen Majority'
]].rename(columns={
    'Argument 1-Premise 2': 'Premise 1',
    'Argument 1-Conclusion': 'Conclusion',
})
arg1['Premise 2'] = np.nan
arg1['Premise 3'] = np.nan

# Argument 2: two premises
arg2 = pair_table[[
    'Argument 2-Premise 1', 'Argument 2-Premise 2', 'Argument 2-Conclusion'
]].rename(columns={
    'Argument 2-Premise 1': 'Premise 1',
    'Argument 2-Premise 2': 'Premise 2',
    'Argument 2-Conclusion': 'Conclusion',
})
arg2['Premise 3'] = np.nan

# 8) Run CICO + contrast models
arg1 = analysis.calc_argument_strength(arg1, cico_model, data_loader)
arg2 = analysis.calc_argument_strength(arg2, cico_model, data_loader)
arg1 = analysis.calc_argument_strength_contrast(arg1, alpha=1, beta=0)
arg2 = analysis.calc_argument_strength_contrast(arg2, alpha=1, beta=0)

# 9) Prefix & combine (concat aligns the two tables on their row index)
arg1 = arg1.add_prefix('Argument 1-')
arg2 = arg2.add_prefix('Argument 2-')
model_data = pd.concat([arg1, arg2], axis=1) \
               .rename(columns={'Argument 1-Argument 2 Chosen Majority': 'Argument 2 Chosen Majority'})

# After prefixing, 'Argument 1-Premise 2' is the all-NaN placeholder slot of the
# model input, but the same column name is one of the identifier columns used as
# a merge key and as a melt id further down. Put the real text back.
# NOTE(review): like the concat above, this assumes the analysis.calc_* helpers
# return their rows under the index they were given -- confirm.
model_data['Argument 1-Premise 2'] = pair_table['Argument 1-Premise 2']

# 10) Attach the LLM judgments. The raw LLM column holds the number of the chosen
#     argument; expand it into one 0/1 indicator per argument so LLMs can be
#     compared exactly like the score-based models.
llm = pd.read_csv('data/similarity_experiments/similarity_asymmetry_llm.csv', index_col=0)
for llm_name in ('GPT-3.5', 'GPT-4'):
    for argument_number in (1, 2):
        llm[f'Argument {argument_number}-{llm_name}'] = (
            llm[llm_name].eq(argument_number).astype(float)
        )

merge_keys = [
    'Argument 1-Premise 2',
    'Argument 1-Conclusion',
    'Argument 2-Premise 1',
    'Argument 2-Premise 2',
    'Argument 2-Conclusion',
]
model_data = model_data.merge(llm, on=merge_keys)

# 11) A model agrees with the human majority when "model scores Argument 2
#     strictly higher than Argument 1" has the same truth value as
#     'Argument 2 Chosen Majority'. The agreement flag (0.0 / 1.0) is stored in a
#     column named after the model.
human_majority = model_data['Argument 2 Chosen Majority']
for model_name in models:
    prefers_argument_2 = (
        model_data[f'Argument 2-{model_name}'] > model_data[f'Argument 1-{model_name}']
    )
    model_data[model_name] = prefers_argument_2.eq(human_majority).astype(float)

# 12) Long format (one row per argument pair and model), then one bar per model
#     showing the mean agreement flag.
agreement_col = 'Agreement with Humans (%)'
melted = model_data.melt(
    id_vars=argument_columns,
    value_vars=models,
    var_name='Model',
    value_name=agreement_col,
)
plot_df = melted.groupby('Model', as_index=False)[agreement_col].mean()

fig = px.bar(
    plot_df,
    x='Model',
    y=agreement_col,
    color='Model',
    category_orders={'Model': models},
    color_discrete_map=utils.model_colors,
    range_y=[0, 1],
)
# NOTE(review): the reference line sits at a hard-coded 0.56; document where
# this value comes from.
fig.add_hline(y=0.56, line_dash='dot', line_color='black')
fig = utils.format_figure(fig, height=600, width=800, showlegend=False)

# 13) Significance stars: per model, number of agreements out of the number of
#     pairs, tested against 0.5 with binom.sf(k - 1, n, 0.5) = P(X >= k).
effects = (
    melted.groupby('Model')[agreement_col]
          .agg(['sum', 'count'])
          .rename(columns={'sum': 'True Count', 'count': 'Total Count'})
)
agreement_pvals = binom.sf(effects['True Count'] - 1, effects['Total Count'], 0.5)
effects['p_stars'] = [utils.p_to_stars(pval) for pval in agreement_pvals]
effects['Bar Height'] = effects['True Count'] / effects['Total Count']

# Place each model's stars slightly above its bar.
for model, bar_height, stars in zip(effects.index, effects['Bar Height'], effects['p_stars']):
    fig.add_annotation(
        x=model,
        y=bar_height + 0.05,
        text=stars,
        showarrow=False,
        font=dict(size=16),
    )

fig.show()

KeyError: 'Premise\xa01'

In [18]:
# Graded asymmetry analysis: regress each model's asymmetry score
# (Argument 2 score minus Argument 1 score) against the share of participants
# choosing Argument 2, and plot the R^2 per model.
#
# NOTE(review): model_argument1 and model_argument2 are not created by any cell
# shown above this one, and the only earlier `llm_data` is the Study 3 frame.
# This cell therefore depends on kernel state; confirm its inputs before relying
# on a Restart & Run All.
models = ['ISC-CI', 'Contrast']

# Generate numerical asymmetry scores for model and humans
model_data = pd.concat([model_argument1, model_argument2], axis=1).rename(
    columns={'Argument 1-Argument 2 Chosen Majority': 'Argument 2 Chosen Majority'}
)
model_data = model_data.merge(
    llm_data,
    on=['Argument 1-Premise 1', 'Argument 1-Conclusion', 'Argument 2-Premise 1', 'Argument 2-Conclusion']
)
for model in models:
    model_data[model] = model_data[f'Argument 2-{model}'] - model_data[f'Argument 1-{model}']
# fillna('') before melting, presumably so empty identifier fields compare equal
# to the ''-filled identifiers of human_data in the merge below.
model_data = model_data.fillna('').melt(
    id_vars=argument_columns,
    value_vars=models,
    var_name='Model',
    value_name='Model Asymmetry Score'
)
# Select the column BEFORE averaging: a frame-wide .mean() also tries to average
# the text columns of human_data and raises on pandas >= 2.0.
human_data_averaged = (
    human_data.groupby(argument_columns)['Argument 2 Chosen']
              .mean()
              .reset_index()
              .rename(columns={'Argument 2 Chosen': 'Human Asymmetry Score'})
)
argument_data = model_data.merge(human_data_averaged, on=argument_columns)

# Run regression for each model's predictions and keep the row of the
# 'Human Asymmetry Score' predictor (its p-value and the model r2).
regression_data = []
for model in models:
    model_arg_data = argument_data[argument_data['Model'] == model]
    lr = pg.linear_regression(
        X=model_arg_data['Human Asymmetry Score'],
        y=model_arg_data['Model Asymmetry Score']
    )
    # .copy() so the two columns added next are written to an independent frame
    lr = lr[lr.names == 'Human Asymmetry Score'].copy()
    lr['Model'] = model
    lr['p_stars'] = lr['pval'].apply(utils.p_to_stars)
    regression_data.append(lr)
regression_data = pd.concat(regression_data, ignore_index=True)
regression_data = regression_data.rename(columns={'r2': 'Agreement with Humans (R<sup>2</sup>)'})

# Mean asymmetry score per model (not used by the plot below). Averaging only the
# score column avoids the same text-column failure as above.
plot_data = model_data.groupby(['Model'])[['Model Asymmetry Score']].mean().reset_index()

# Generate bar plot
fig = px.bar(regression_data, x='Model', color='Model', y='Agreement with Humans (R<sup>2</sup>)',
             category_orders={'Model': ['Human'] + models},
             color_discrete_map=utils.model_colors, range_y=[0, 1])
fig = utils.format_figure(fig, height=600, width=400, showlegend=False)
# Significance stars slightly above each bar
for model in models:
    p_stars = regression_data[regression_data['Model'] == model]['p_stars'].values[0]
    bar_height = regression_data[regression_data['Model'] == model]['Agreement with Humans (R<sup>2</sup>)'].values[0]
    fig.add_annotation(x=model, y=bar_height + 0.05, text=p_stars,
                       showarrow=False, font=dict(size=16))
fig.show()

## Multi-Alternative Context Effects

In [19]:
# Multi-alternative context effects: collect, for every item set with a positive
# human context effect, the context effect of the CICO model and of each LLM.
models = ['ISC-CI', 'GPT-3.5', 'GPT-4']
argument_columns = ['Premise 1', 'Conclusion 1', 'Conclusion 2', 'Distractor 1', 'Distractor 2']
human_data = pd.read_csv('data/similarity_experiments/similarity_in_context.csv', index_col=0)

# Human context effect per item set; keep the sets where it is above zero.
human_context_effects = analysis.calc_similarity_context_effect_human(human_data)
has_positive_effect = human_context_effects['Context Effect'] > 0
significant_sets = human_context_effects[has_positive_effect]

# Calculate CICO predictions
model_data = analysis.calc_similarity_context_effect(significant_sets, cico_model, data_loader)

# Add LLM predictions. Each LLM's choices are reshaped to look like human data
# (a 'Choice' column plus one indicator per conclusion) so the same helper can
# compute its context effect, which is then stored under the LLM's name.
# NOTE(review): choice codes 1 and 3 are treated as Conclusion 1 and
# Conclusion 2; confirm the option numbering of the LLM file.
llm_data = pd.read_csv('data/similarity_experiments/similarity_in_context_llm.csv', index_col=0)
llm_names = ['GPT-3.5', 'GPT-4']
llm_context_effects = {}
for llm_name in llm_names:
    other_llms = [name for name in llm_names if name != llm_name]
    llm_choices = llm_data.drop(columns=other_llms).rename(columns={llm_name: 'Choice'})
    llm_choices['Conclusion 1 Chosen'] = llm_choices['Choice'].eq(1)
    llm_choices['Conclusion 2 Chosen'] = llm_choices['Choice'].eq(3)
    llm_context_effects[llm_name] = (
        analysis.calc_similarity_context_effect_human(llm_choices)
                .rename(columns={'Context Effect': llm_name})
    )
gpt_35_data = llm_context_effects['GPT-3.5']
gpt_4_data = llm_context_effects['GPT-4']

# Join the LLM effects onto the CICO table, then reshape to one row per
# (item set, model) with the effect in 'Agreement with Humans (%)'.
for llm_frame in (gpt_35_data, gpt_4_data):
    model_data = model_data.merge(llm_frame, on=argument_columns)
model_data = model_data[models].melt(
    var_name='Model',
    value_name='Agreement with Humans (%)',
)

In [12]:
# Bar plot of how often each model's context effect is positive, i.e. points in
# the same direction as the human effect of the selected item sets.
agreement_col = 'Agreement with Humans (%)'
model_direction_data = model_data.copy()
model_direction_data[agreement_col] = model_direction_data[agreement_col].gt(0).astype(float)

per_model = model_direction_data.groupby(['Model'])
plot_data = per_model.mean().reset_index()
fig = px.bar(
    plot_data,
    x='Model',
    color='Model',
    y=agreement_col,
    category_orders={'Model': ['Human'] + models},
    color_discrete_map=utils.model_colors,
    range_y=[0, 1],
)
fig.add_hline(y=0.5, line_dash='dot', line_color="black")
fig = utils.format_figure(fig, height=600, width=800, showlegend=False)

# Significance: positive-effect count out of all item sets per model, tested
# against 0.5 with binom.sf(k - 1, n, 0.5) = P(X >= k).
model_effect_sizes = per_model.sum().reset_index()
model_effect_sizes['Count'] = per_model.count().reset_index()[agreement_col]
model_effect_sizes['pval'] = binom.sf(
    model_effect_sizes[agreement_col] - 1,
    model_effect_sizes['Count'],
    0.5,
)
model_effect_sizes['p_stars'] = model_effect_sizes['pval'].map(utils.p_to_stars)
model_effect_sizes['Bar Height'] = model_effect_sizes[agreement_col].div(model_effect_sizes['Count'])

# Stars go slightly above each model's bar.
for model in models:
    is_model = model_effect_sizes['Model'].eq(model)
    p_stars = model_effect_sizes.loc[is_model, 'p_stars'].iloc[0]
    bar_height = model_effect_sizes.loc[is_model, 'Bar Height'].iloc[0]
    fig.add_annotation(
        x=model,
        y=bar_height + 0.05,
        text=p_stars,
        showarrow=False,
        font=dict(size=16),
    )

fig.show()

KeyError: 'Agreement with Humans (%)'