In [None]:
%load_ext autoreload
%autoreload 2
from utils.utils import *

In [None]:

# Root directory of the experiment outputs; later cells build paths under it.
name = 'Final_results'

# Datasets to analyse, in the order they are iterated and plotted.
datasets = ['MDI','CSII','CIQ']
# Prompt variant; used as a path component below each dataset directory.
prompt_type = 'Metric'

In [None]:
# Imported here so this cell does not rely on `pickle` leaking in through the
# star import above or on a later cell having been run first.
import pickle

# Reload a previously pickled `all_dict` from the results directory.
# NOTE: unpickling can execute arbitrary code; only load files you created.
with open(f'{name}/all_dict.pkl', 'rb') as f:
    all_dict = pickle.load(f)


## Analyze

In [None]:
# Start from an empty results mapping (dataset -> {model title -> DataFrame});
# this replaces any `all_dict` that was loaded from the pickle file.
all_dict = dict()

### OpenAI

In [None]:

# For every dataset, build {model title -> stats merged with outcomes} for the
# OpenAI models and store that mapping in `all_dict[dataset]`.
datasets = ['MDI', 'CSII', 'CIQ']
prompt_type = 'Metric'
models = [
    "gpt-3.5-turbo-0125",
    "gpt-4.1-2025-04-14",
    "o4-mini-2025-04-16",
]

for dataset in datasets:
    print(dataset)
    stats = get_stats(dataset)
    model_dict = {}

    for model in models:
        print('#' * 20)
        print(model)
        try:
            batch_dir = f"{name}/{dataset}/{prompt_type}/{model}/"

            # Only the first of the three returned objects is used here.
            outcomes_df, _, _ = get_percentiles_fixed(batch_dir)
            outcomes_df = outcomes_df.set_index('number')
            df = stats.merge(outcomes_df, left_index=True, right_index=True)

            model_title = names[model]
            model_dict[model_title] = df
        except Exception as e:
            # Best effort: report the failure and move on to the next model.
            print("An error occurred:", e)

    all_dict[dataset] = model_dict

### Anthropic

In [None]:
# Add the Anthropic models to `all_dict`; their batches are read from the
# 'Final_results_reversed' tree.
# NOTE(review): `name` stays reassigned for every later cell that builds paths
# from it (pickle dump, concordance) - confirm that this is intended.
name = 'Final_results_reversed'
prompt_type = 'Metric'
models = ['claude-3-haiku-20240307', 'claude-3-7-sonnet-20250219']
datasets = ['MDI', 'CSII', 'CIQ']

for dataset in datasets:
    stats = get_stats(dataset)
    for model in models:
        try:
            batch_dir = f"{name}/{dataset}/{prompt_type}/{model}/"

            # Only the per-number outcomes table is needed for the merge.
            outcomes_df, win_df, _ = get_percentiles_fixed(batch_dir)
            outcomes_df = outcomes_df.set_index('number')
            df = stats.merge(outcomes_df, left_index=True, right_index=True)
            model_title = names[model]
            print(model_title)
            # setdefault keeps the result even when this dataset has no entry
            # yet (e.g. the OpenAI cell was skipped); a plain
            # all_dict[dataset][...] would raise KeyError, which the handler
            # below would swallow, silently dropping the model.
            all_dict.setdefault(dataset, {})[model_title] = df
        except Exception as e:
            # Best effort: report the failure and continue with the next model.
            print("An error occurred:", e)
            print(model)


In [None]:
import pickle

# Persist the collected results as a pickle under the `name` directory.
# NOTE(review): `name` holds whatever the most recently run cell assigned, so
# this may not be the directory the load cell reads from - confirm.
with open(f'{name}/all_dict.pkl', mode='wb') as f:
    pickle.dump(all_dict, f)

# Continue

In [None]:
# Draw the GRI/TIR figure for the listed datasets with Spearman selected as
# the correlation type (helper is not defined in this notebook; presumably it
# comes from the star import).
plot_gri_tir_dual_axis_fixed_scale(
    all_dict,
    datasets,
    correlation_type='spearman',
)

# Regression

In [None]:
# Reference coefficients handed to the regression comparison in the next cell.
true_coeffs = [3, 2.4, 0.8, 1.6]
dataset = 'CIQ'
datasets = ['MDI', 'CSII', 'CIQ']



In [None]:
# Compare regression coefficients against `true_coeffs` for each dataset,
# with scaling switched off.
compare_regression_coefficients_sm(
    datasets,
    all_dict,
    true_coeffs,
    scaled=False,
)

# Concordance

In [None]:
# Raw identifiers of all models compared below; they are used as path
# components and as keys into `names`.
models = [
    'gpt-3.5-turbo-0125',
    'gpt-4.1-2025-04-14',
    'o4-mini-2025-04-16',
    'claude-3-haiku-20240307',
    'claude-3-7-sonnet-20250219',
]


In [None]:
import os
import pickle
import pandas as pd
import numpy as np
from itertools import combinations
import matplotlib.gridspec as gridspec

# `plt`, `sns`, `names`, `get_percentiles_fixed`, `all_dict`, `datasets`,
# `models`, `name` and `prompt_type` are expected to be in scope already; this
# cell does not define them.
# NOTE(review): `name` is whatever the last executed cell set it to, so every
# model (OpenAI included) is read from that one tree - confirm.


def _metric_preferred_ids(pair_df, metrics_df):
    """Return, per pair, the id with the lower GRI and the id with the higher TIR.

    pair_df    -- frame with a 'pair' column of the form '<id1>_<id2>'.
    metrics_df -- frame indexed by id with 'GRI' and 'TIR' columns.

    Returns two Series indexed by pair: (lower_GRI ids, higher_TIR ids).
    Ties, and comparisons involving NaN, resolve to the second id.
    """
    ids = pair_df['pair'].str.split('_', expand=True)
    id1 = ids[0].astype(int).to_numpy()
    id2 = ids[1].astype(int).to_numpy()
    pair_index = pd.Index(pair_df['pair'], name='pair')

    # Label-based lookups: an id missing from metrics_df raises KeyError.
    gri1 = metrics_df.loc[id1, 'GRI'].to_numpy()
    gri2 = metrics_df.loc[id2, 'GRI'].to_numpy()
    tir1 = metrics_df.loc[id1, 'TIR'].to_numpy()
    tir2 = metrics_df.loc[id2, 'TIR'].to_numpy()

    lower_gri = pd.Series(np.where(gri1 < gri2, id1, id2), index=pair_index)
    higher_tir = pd.Series(np.where(tir1 > tir2, id1, id2), index=pair_index)
    return lower_gri, higher_tir


def _concordance_matrix(decisions):
    """Return the symmetric agreement-rate matrix between all decision series.

    decisions -- mapping label -> Series of chosen ids, indexed by pair.

    Each off-diagonal cell is the fraction of shared pairs on which the two
    series chose the same id (NaN when they share no pair); the diagonal is
    1.0. Values are rounded to three decimals.
    """
    labels = list(decisions)
    # Filled as a plain array so the result does not depend on writing
    # through DataFrame.values.
    values = np.full((len(labels), len(labels)), np.nan)
    np.fill_diagonal(values, 1.0)

    for (i, m1), (j, m2) in combinations(enumerate(labels), 2):
        s1 = decisions[m1]
        s2 = decisions[m2]
        common_pairs = s1.index.intersection(s2.index)
        if len(common_pairs) > 0:
            agreement = (s1.loc[common_pairs] == s2.loc[common_pairs]).mean()
            values[i, j] = values[j, i] = agreement

    return pd.DataFrame(values, index=labels, columns=labels).round(3)


# Key in `all_dict[dataset]` whose table supplies the GRI/TIR reference values.
temp_model = 'o4-mini'

concordance_matrices = []

for dataset in datasets:
    model_decisions = {}
    for model in models:
        batch_dir = f"{name}/{dataset}/{prompt_type}/{model}/"
        _, win_df, df = get_percentiles_fixed(batch_dir)

        if not model_decisions:
            # The metric-based reference choices are derived once per dataset,
            # from the pair list of the first model.
            lower_gri, higher_tir = _metric_preferred_ids(
                df, all_dict[dataset][temp_model]
            )
            model_decisions['GRI'] = lower_gri
            model_decisions['TIR'] = higher_tir
        model_decisions[names[model]] = df.set_index('pair')['winner']

    concordance_matrices.append(_concordance_matrix(model_decisions))


# One heatmap per dataset plus a narrow axis for the shared colorbar.
n_panels = len(concordance_matrices)
fig = plt.figure(figsize=(18, 4), dpi=150)
gs = gridspec.GridSpec(1, n_panels + 1, width_ratios=[1] * n_panels + [0.05])
axes = [fig.add_subplot(gs[i]) for i in range(n_panels)]
cbar_ax = fig.add_subplot(gs[n_panels])

# All panels share one colour scale so the single colorbar is valid for each
# of them; the diagonal is always 1.0, which fixes the upper end.
shared_vmin = min(np.nanmin(m.to_numpy(dtype=float)) for m in concordance_matrices)
shared_vmax = 1.0

for ax, dataset_label, matrix in zip(axes, datasets, concordance_matrices):
    # Hide the strictly-lower triangle; the matrix is symmetric.
    mask = np.tril(np.ones(matrix.shape, dtype=bool), k=-1)
    sns.heatmap(
        matrix.astype(float),
        annot=True,
        cmap="Blues",
        fmt=".2f",
        linewidths=0.5,
        cbar=False,
        ax=ax,
        mask=mask,
        vmin=shared_vmin,
        vmax=shared_vmax,
    )
    ax.set_title(f"{dataset_label} Concordance")
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
    ax.grid(False)

    # Cover the trivial diagonal cells with grey squares.
    for idx in range(matrix.shape[0]):
        ax.add_patch(
            plt.Rectangle((idx, idx), 1, 1, fill=True, color='lightgrey', lw=0, zorder=3)
        )

    for spine in ax.spines.values():
        spine.set_visible(False)

fig.colorbar(axes[-1].collections[0], cax=cbar_ax)
cbar_ax.set_ylabel("Concordance", rotation=270, labelpad=15)

plt.subplots_adjust(wspace=0.7)
# Make sure the output directory exists before saving.
os.makedirs("Figures", exist_ok=True)
plt.savefig("Figures/all_datasets_concordance.png", dpi=600, bbox_inches='tight')
plt.show()


# Case Studies


## Case studies

In [None]:


# Load, for each model, the merged stats/outcomes table and the pair table
# used by the case-study cell below.
var = 'both'
datasets = ['CSII']

for dataset in datasets:
    stats = get_stats(dataset)

    model_dfs, model_pairs = {}, {}

    for model in models:
        # NOTE(review): every model is read from 'Final_results' here - confirm
        # this is the intended tree for all of them.
        name = 'Final_results'
        batch_dir = f"{name}/{dataset}/{prompt_type}/{model}/"

        outcomes_df, win_df, pair = get_percentiles_fixed(batch_dir)
        outcomes_df = outcomes_df.set_index('number')
        df = stats.merge(outcomes_df, left_index=True, right_index=True)

        model_dfs[model] = df
        model_pairs[model] = pair

    # `df` is left pointing at the first model's table for the next cell.
    first_model = models[0]
    df = model_dfs[first_model]
    



In [None]:
# For each hand-picked pair of ids, collect every model's winner and plot the
# two cases side by side with those votes.
conflicting_pairs = []
for pr in [(27,97),(58,100)]:
    idx1, idx2 = pr
    row1 = df.loc[idx1]
    row2 = df.loc[idx2]

    model_votes = {}
    for model in models:
        pair_df = model_pairs[model]

        # A pair may be stored in either orientation; try '<idx1>_<idx2>'
        # first, then the reversed key.
        winner = None
        for pair_id in (f"{idx1}_{idx2}", f"{idx2}_{idx1}"):
            hits = pair_df.loc[pair_df['pair'] == pair_id, 'winner']
            if not hits.empty:
                winner = hits.iloc[0]
                break

        if winner is None:
            # Fail with a clear message instead of a TypeError from int(None).
            raise LookupError(
                f"pair {idx1}/{idx2} not found in the pair table of model {model}"
            )

        model_votes[model] = int(winner)

    conflicting_pairs.append(pr)
    print("Model votes:", model_votes)

    plot_two_cases_with_votes(row1, row2, model_votes,  dataset, idx1, idx2,model_dfs)