In [None]:
import pandas as pd

# Load the dataset
df = pd.read_parquet('data/tidy_data_2_aed_model.parquet')

# Select the individual `gr_dor*` / `gr_ori*` score columns first, so that the
# median columns added below are not part of these two selections.
column_names = df.columns
gr_dor_columns = column_names[column_names.str.startswith('gr_dor')]
gr_ori_columns = column_names[column_names.str.startswith('gr_ori')]

# Row-wise median over the selected columns of each criterion.
df['gr_dor_median'] = df[gr_dor_columns].median(axis=1)
df['gr_ori_median'] = df[gr_ori_columns].median(axis=1)

print(df.columns)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# === Configuration ===
# Display aliases for the model identifiers that appear as column-name suffixes.
model_name_map = {
    "gpt_4o_mini": "4o-mini",
    "gpt_41_nano": "4.1-nano",
    "claude_35_haiku": "3-haiku",
    "grok_3_mini_beta": "grok-3-mini",
    "ds_v3": "ds-v3"
}

# === gr_dor processing ===
# Drop the criterion prefix, apply the aliases, then reshape to long format.
df_box_dor = df[[*gr_dor_columns, 'gr_dor_median']].copy()
df_box_dor = df_box_dor.rename(columns=lambda name: name.replace('gr_dor_', ''))
df_box_dor = df_box_dor.rename(columns=model_name_map)
df_long_dor = (
    df_box_dor
    .melt(var_name='Model', value_name='Score')
    .assign(Criterion='Deep of Reasoning')
)

# === gr_ori processing ===
# Same pipeline for the originality columns.
df_box_ori = df[[*gr_ori_columns, 'gr_ori_median']].copy()
df_box_ori = df_box_ori.rename(columns=lambda name: name.replace('gr_ori_', ''))
df_box_ori = df_box_ori.rename(columns=model_name_map)
df_long_ori = (
    df_box_ori
    .melt(var_name='Model', value_name='Score')
    .assign(Criterion='Originality Score')
)

# === Combine into one DataFrame ===
df_long_combined = pd.concat([df_long_dor, df_long_ori], ignore_index=True)

# === Single grouped boxplot ===
# One box per (Model, Criterion) pair: models on the x axis, criteria as hue.
plt.figure(figsize=(12, 6))
sns.set(style="whitegrid")
boxplot = sns.boxplot(
    data=df_long_combined,
    x='Model',
    y='Score',
    hue='Criterion',
    palette='rocket_r',
    linewidth=1.5,
    fliersize=5,
    dodge=True,
    flierprops=dict(marker='o', markerfacecolor='red', markersize=4)  # Draw outliers as small red circles
)

# Enhance the aesthetics
plt.title('Comparison of Deep of Reasoning and Originality scores by model', fontsize=16)
plt.xlabel('', fontsize=14)  # Empty label: the tick labels already carry the model names
plt.ylabel('Score', fontsize=14)
plt.xticks(rotation=30, fontsize=12)
plt.yticks(fontsize=12)
plt.legend(title='Criterion', loc='lower left', fontsize=10)
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
# Save before show(); this cell does not create the 'figures/' directory itself.
plt.savefig('figures/comparison_deep_of_reasoning_and_originality_scores.png', dpi=360)
plt.show()
plt.close()

# === Summary statistics ===
def compute_summary(columns, prefix, alias_map, metric_label, data=None):
    """Build a table of descriptive statistics for one scoring criterion.

    One row is produced per entry of ``columns`` plus a final row for the
    ``f'{prefix}median'`` column.

    Parameters
    ----------
    columns : iterable of str
        Score column names to summarise.
    prefix : str
        Text removed from each column name to obtain the raw model name
        (e.g. ``'gr_dor_'``); also used to locate the ``<prefix>median`` column.
    alias_map : dict
        Maps raw model names to display names; unknown names are kept as-is.
    metric_label : str
        Value written to the ``'Criterion'`` column of every row.
    data : pandas.DataFrame, optional
        Frame holding the score columns. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        Columns: Criterion, Model, Mean, Std, Median, Min, Max,
        Coefficient of Variation.
    """
    frame = df if data is None else data

    def _describe(scores, model_label):
        # Compute mean/std once and reuse them for the coefficient of variation.
        mean = scores.mean()
        std = scores.std()
        return {
            'Criterion': metric_label,
            'Model': model_label,
            'Mean': mean,
            'Std': std,
            'Median': scores.median(),
            'Min': scores.min(),
            'Max': scores.max(),
            # Same zero-mean guard for every row, including the median row.
            'Coefficient of Variation': std / mean if mean != 0 else None
        }

    rows = []
    for col in columns:
        raw_name = col.replace(prefix, '')
        rows.append(_describe(frame[col], alias_map.get(raw_name, raw_name)))

    # Add median row
    rows.append(_describe(frame[f'{prefix}median'], 'median'))
    return pd.DataFrame(rows)

# Per-criterion summary tables.
summary_dor = compute_summary(gr_dor_columns, 'gr_dor_', model_name_map, 'Deep of Reasoning')
summary_ori = compute_summary(gr_ori_columns, 'gr_ori_', model_name_map, 'Originality Score')

# Stack both tables, round, and order by criterion then model.
summary_combined = (
    pd.concat([summary_dor, summary_ori], ignore_index=True)
    .round(4)
    .sort_values(by=['Criterion', 'Model'])
    .reset_index(drop=True)
)

# Display summary table
display(summary_combined)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Alias mapping
model_name_map = {
    "gpt_4o_mini": "4o-mini",
    "gpt_41_nano": "4.1-nano",
    "claude_35_haiku": "3-haiku",
    "grok_3_mini_beta": "grok-3-mini",
    "ds_v3": "ds-v3"
}

# Prepare correlation matrix
# Column order: DoR scores, DoR median, ORI scores, ORI median.
score_parts = [
    df[gr_dor_columns],
    df['gr_dor_median'],
    df[gr_ori_columns],
    df['gr_ori_median'],
]
df2cor = pd.concat(score_parts, axis=1)
correlation_matrix = df2cor.corr(method='kendall')

# Apply alias names and remove prefixes
def clean_label(label):
    """Turn a raw score-column name into a short display label.

    Model identifiers found in ``model_name_map`` are replaced by their alias,
    the ``gr_dor_`` / ``gr_ori_`` prefixes are removed, and any remaining
    ``_median`` is rewritten as `` median``.
    """
    for raw_name, alias in model_name_map.items():
        label = label.replace(raw_name, alias)
    for prefix in ('gr_dor_', 'gr_ori_'):
        label = label.replace(prefix, '')
    return label.replace('_median', ' median')

# Relabel both axes of the matrix with the cleaned display names.
renamed_columns = [clean_label(col) for col in df2cor.columns]
correlation_matrix.columns = renamed_columns
correlation_matrix.index = renamed_columns

# Mask the lower triangle
# np.tril without an offset also covers the diagonal, so it is hidden too.
mask = np.tril(np.ones_like(correlation_matrix, dtype=bool))

# Plot heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='rocket_r', square=True,
            cbar_kws={"shrink": .8}, linewidths=0.5, mask=mask, annot_kws={"size": 10})

# Calculate separation index
# Number of DoR score columns plus one for the DoR median column.
dor_sep_index = len(gr_dor_columns) + 1

# Add horizontal and vertical lines at the separation index
plt.axhline(y=dor_sep_index, color='black', linewidth=1.5)
plt.axvline(x=dor_sep_index, color='black', linewidth=1.5)

# Add quadrant labels outside the heatmap
# NOTE(review): `n - 12.2` and the literal `12` below look tuned for a 12x12
# matrix (n == 12); confirm these offsets if the number of score columns changes.
n = correlation_matrix.shape[0]
plt.text(dor_sep_index / 2, n - 12.2, r'$\mathbf{DoR \ x \ DoR}$', fontsize=10, ha='center', color='blue')
plt.text((dor_sep_index + n) / 2, n - 12.2, r'$\mathbf{DoR \ x \ ORI}$', fontsize=10, ha='center', color='green')
plt.text(n + 0.2, dor_sep_index / 2, r'$\mathbf{ORI \ x \ DoR}$', fontsize=10, va='center', rotation=270, color='green')
plt.text(n + 0.2, (dor_sep_index + n) / 2, r'$\mathbf{ORI \ x \ ORI}$', fontsize=10, va='center', rotation=270, color='firebrick')

# Add axis group labels
plt.text(dor_sep_index / 2, 12, 'Group DoR', fontsize=12, ha='center', color='black')
plt.text((dor_sep_index + n) / 2, 12, 'Group ORI', fontsize=12, ha='center', color='black')
plt.text(0, dor_sep_index / 2, 'Group DoR', fontsize=12, va='center', rotation=90, color='black')
plt.text(0, (dor_sep_index + n) / 2, 'Group ORI', fontsize=12, va='center', rotation=90, color='black')

# Adjust aesthetics
plt.title('Tau Kendall Correlation Matrix of Deep of Reasoning (DoR)\nand Originality (ORI) Scores', fontsize=16, pad=30)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
# Save before show(); this cell does not create the 'figures/' directory itself.
plt.savefig('figures/kendall_correlogram.png', dpi=360)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from IPython.display import display

# Alias mapping: raw model identifier -> short display name.
model_name_map = {
    "gpt_4o_mini": "4o-mini",
    "gpt_41_nano": "4.1-nano",
    "claude_35_haiku": "3-haiku",
    "grok_3_mini_beta": "grok-3-mini",
    "ds_v3": "ds-v3"
}

def clean_label(label):
    """Turn a raw score-column name into a short display label.

    Model identifiers found in ``model_name_map`` are replaced by their alias,
    the ``gr_dor_`` / ``gr_ori_`` prefixes are removed, and any remaining
    ``_median`` is rewritten as `` median``.
    """
    for raw_name, alias in model_name_map.items():
        label = label.replace(raw_name, alias)
    for prefix in ('gr_dor_', 'gr_ori_'):
        label = label.replace(prefix, '')
    return label.replace('_median', ' median')

def bootstrap_correlation(df_subset, n_iterations=100, random_state=None):
    """Bootstrap a 95% percentile interval for Kendall's tau.

    The statistic is the tau between the FIRST TWO columns of ``df_subset``;
    any further columns are ignored.

    Parameters
    ----------
    df_subset : pandas.DataFrame
        Frame with at least two columns.
    n_iterations : int, default 100
        Number of bootstrap resamples (rows drawn with replacement, same size
        as the input).
    random_state : int, optional
        Seed for a dedicated random generator, making the resampling
        reproducible. When omitted, pandas' default source is used, exactly as
        before this parameter existed.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound, tau_values)``: the 2.5th and 97.5th
        percentiles of the bootstrap taus, and the list of all taus.
    """
    # One generator shared by all iterations so each resample differs.
    rng = np.random.RandomState(random_state) if random_state is not None else None

    # Only the first two columns feed the statistic; slicing up front avoids
    # recomputing the full correlation matrix on every resample.
    pair = df_subset.iloc[:, :2]

    tau_values = []
    for _ in range(n_iterations):
        sample = pair.sample(frac=1, replace=True, random_state=rng)
        tau_values.append(sample.corr(method='kendall').iloc[0, 1])

    # A degenerate resample (e.g. a constant column) yields NaN; ignore those
    # instead of letting a single NaN turn both bounds into NaN.
    lower_bound, upper_bound = np.nanpercentile(tau_values, [2.5, 97.5])
    return lower_bound, upper_bound, tau_values

def hypothesis_test(tau_values):
    """Two-sided percentile-bootstrap p-value for H0: tau = 0.

    The p-value is twice the smaller of the two tail proportions of the
    bootstrap distribution (share of taus <= 0 and share of taus >= 0),
    capped at 1. NaN entries are ignored.

    Parameters
    ----------
    tau_values : sequence of float
        Bootstrap replicates of Kendall's tau.

    Returns
    -------
    float
        p-value in [0, 1], or NaN when no finite replicate is available.
        With B replicates the smallest non-zero value is 2 / B.
    """
    taus = np.asarray(tau_values, dtype=float)
    taus = taus[~np.isnan(taus)]
    if taus.size == 0:
        return float('nan')

    # Summing both tails (as a plain count) always gives >= 1; the two-sided
    # p-value is instead twice the smaller tail.
    share_non_positive = np.mean(taus <= 0)
    share_non_negative = np.mean(taus >= 0)
    return float(min(1.0, 2.0 * min(share_non_positive, share_non_negative)))

def plot_correlogram(df_subset, title, filename=None, ax=None, annotate=True):
    """Draw an upper-triangle Kendall-tau heatmap of the DoR and ORI scores.

    Also displays the correlation matrix and prints a bootstrap confidence
    interval and p-value. Those come from ``bootstrap_correlation``, i.e. they
    refer to the tau between the first two score columns only.

    Parameters
    ----------
    df_subset : pandas.DataFrame
        Rows to analyse; must contain ``gr_dor_columns``, ``gr_ori_columns``
        and the two ``*_median`` columns.
    title : str
        Axes title, also used in the printed header.
    filename : str, optional
        When given AND no ``ax`` was passed, the figure created here is saved
        to this path and closed.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. When omitted a new 14x10 figure is created.
    annotate : bool, default True
        Whether to write the coefficients inside the cells.
    """
    df2cor = pd.concat([
        df_subset[gr_dor_columns], df_subset['gr_dor_median'],
        df_subset[gr_ori_columns], df_subset['gr_ori_median']
    ], axis=1)
    correlation_matrix = df2cor.corr(method='kendall')

    renamed_columns = [clean_label(col) for col in df2cor.columns]
    correlation_matrix.columns = renamed_columns
    correlation_matrix.index = renamed_columns

    # Display the correlation matrix
    print(f"Displaying the Kendall correlation matrix for: {title}.")
    display(correlation_matrix)

    # Calculate and display bootstrap confidence intervals and hypothesis test results
    lower_bound, upper_bound, tau_values = bootstrap_correlation(df2cor)
    p_value = hypothesis_test(tau_values)
    print(f"95% Confidence Interval for τ: [{lower_bound:.3f}, {upper_bound:.3f}]")
    print(f"P-value for hypothesis test: {p_value:.3f}")

    # Hide the lower triangle (np.tril without offset also hides the diagonal).
    mask = np.tril(np.ones_like(correlation_matrix, dtype=bool))

    # Record ownership BEFORE `ax` is rebound: checking `ax is None` after the
    # figure is created is always False, so the file would never be saved.
    owns_figure = ax is None
    if owns_figure:
        fig, ax = plt.subplots(figsize=(14, 10))

    sns.heatmap(correlation_matrix, annot=annotate, fmt=".2f", cmap='rocket_r', square=True,
                cbar_kws={"shrink": .8} if annotate else None, linewidths=0.5, mask=mask,
                annot_kws={"size": 10} if annotate else {}, ax=ax)

    # DoR block = DoR score columns plus the DoR median column.
    dor_sep_index = len(gr_dor_columns) + 1
    n = correlation_matrix.shape[0]

    ax.axhline(y=dor_sep_index, color='black', linewidth=1.5)
    ax.axvline(x=dor_sep_index, color='black', linewidth=1.5)

    # Label positions are derived from n instead of being hard-coded for a
    # 12x12 matrix; for n == 12 they match the previous literals.
    quadrant_y = -0.2
    group_y = n

    ax.text(dor_sep_index / 2, quadrant_y, r'$\mathbf{DoR \ x \ DoR}$', fontsize=10, ha='center', color='blue')
    ax.text((dor_sep_index + n) / 2, quadrant_y, r'$\mathbf{DoR \ x \ ORI}$', fontsize=10, ha='center', color='green')
    ax.text(n + 0.2, dor_sep_index / 2, r'$\mathbf{ORI \ x \ DoR}$', fontsize=10, va='center', rotation=270, color='green')
    ax.text(n + 0.2, (dor_sep_index + n) / 2, r'$\mathbf{ORI \ x \ ORI}$', fontsize=10, va='center', rotation=270, color='firebrick')

    ax.text(dor_sep_index / 2, group_y, 'Group DoR', fontsize=12, ha='center', color='black')
    ax.text((dor_sep_index + n) / 2, group_y, 'Group ORI', fontsize=12, ha='center', color='black')
    ax.text(0, dor_sep_index / 2, 'Group DoR', fontsize=12, va='center', rotation=90, color='black')
    ax.text(0, (dor_sep_index + n) / 2, 'Group ORI', fontsize=12, va='center', rotation=90, color='black')

    ax.set_title(title, fontsize=14, pad=12)
    ax.tick_params(axis='x', rotation=45)
    ax.tick_params(axis='y', rotation=0)

    # Only save/close a figure this function created; a caller-supplied axes
    # belongs to the caller's figure.
    if filename and owns_figure:
        fig.tight_layout()
        fig.savefig(filename, dpi=360)
        plt.close(fig)

# Create combined 2x2 plot with full + prompt_type
fig, axes = plt.subplots(2, 2, figsize=(14, 13))
fig.suptitle('Tau Kendall Correlation Matrix of Deep of Reasoning (DoR)\nand Originality (ORI) Scores by Prompt Type', fontsize=16)

# Plot for all data
plot_correlogram(df, 'All Data', ax=axes[0, 0], annotate=True)

# Display correlations for all prompt types
# The first panel holds the full data, so three panels remain. zip() stops at the
# shorter input: at most three prompt types are drawn, and any unused panel stays empty.
prompt_types = df['prompt_type'].dropna().unique()
for ax, prompt in zip(axes.flat[1:], prompt_types):
    df_subset = df[df['prompt_type'] == prompt]
    plot_correlogram(df_subset, prompt, ax=ax, annotate=True)

# NOTE(review): rect=[0, 0, 1, 1] reserves no extra room for the suptitle — confirm it does not overlap the top row.
plt.tight_layout(rect=[0, 0, 1, 1])
plt.savefig('figures/kendall_correlogram_combined.png', dpi=360)
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from IPython.display import display

# Alias mapping: raw model identifier -> short display name.
model_name_map = {
    "gpt_4o_mini": "4o-mini",
    "gpt_41_nano": "4.1-nano",
    "claude_35_haiku": "3-haiku",
    "grok_3_mini_beta": "grok-3-mini",
    "ds_v3": "ds-v3"
}

def clean_label(label):
    """Turn a raw score-column name into a short display label.

    Model identifiers found in ``model_name_map`` are replaced by their alias,
    the ``gr_dor_`` / ``gr_ori_`` prefixes are removed, and any remaining
    ``_median`` is rewritten as `` median``.
    """
    for raw_name, alias in model_name_map.items():
        label = label.replace(raw_name, alias)
    for prefix in ('gr_dor_', 'gr_ori_'):
        label = label.replace(prefix, '')
    return label.replace('_median', ' median')

def plot_correlogram(df_subset, title, filename=None, ax=None, annotate=True):
    """Draw an upper-triangle Kendall-tau heatmap of the DoR and ORI scores.

    The correlation matrix is also displayed as a table.

    Parameters
    ----------
    df_subset : pandas.DataFrame
        Rows to analyse; must contain ``gr_dor_columns``, ``gr_ori_columns``
        and the two ``*_median`` columns.
    title : str
        Axes title, also used in the printed header.
    filename : str, optional
        When given AND no ``ax`` was passed, the figure created here is saved
        to this path and closed.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. When omitted a new 14x10 figure is created.
    annotate : bool, default True
        Whether to write the coefficients inside the cells.
    """
    df2cor = pd.concat([
        df_subset[gr_dor_columns], df_subset['gr_dor_median'],
        df_subset[gr_ori_columns], df_subset['gr_ori_median']
    ], axis=1)
    correlation_matrix = df2cor.corr(method='kendall')

    renamed_columns = [clean_label(col) for col in df2cor.columns]
    correlation_matrix.columns = renamed_columns
    correlation_matrix.index = renamed_columns

    # Display the correlation matrix
    print(f"Displaying the Kendall correlation matrix for: {title}.")
    display(correlation_matrix)

    # Hide the lower triangle (np.tril without offset also hides the diagonal).
    mask = np.tril(np.ones_like(correlation_matrix, dtype=bool))

    # Record ownership BEFORE `ax` is rebound: checking `ax is None` after the
    # figure is created is always False, so the file would never be saved.
    owns_figure = ax is None
    if owns_figure:
        fig, ax = plt.subplots(figsize=(14, 10))

    sns.heatmap(correlation_matrix, annot=annotate, fmt=".2f", cmap='rocket_r', square=True,
                cbar_kws={"shrink": .8} if annotate else None, linewidths=0.5, mask=mask,
                annot_kws={"size": 10} if annotate else {}, ax=ax)

    # DoR block = DoR score columns plus the DoR median column.
    dor_sep_index = len(gr_dor_columns) + 1
    n = correlation_matrix.shape[0]

    ax.axhline(y=dor_sep_index, color='black', linewidth=1.5)
    ax.axvline(x=dor_sep_index, color='black', linewidth=1.5)

    # Label positions are derived from n instead of being hard-coded for a
    # 12x12 matrix; for n == 12 they match the previous literals.
    quadrant_y = -0.2
    group_y = n

    ax.text(dor_sep_index / 2, quadrant_y, r'$\mathbf{DoR \ x \ DoR}$', fontsize=10, ha='center', color='blue')
    ax.text((dor_sep_index + n) / 2, quadrant_y, r'$\mathbf{DoR \ x \ ORI}$', fontsize=10, ha='center', color='green')
    ax.text(n + 0.2, dor_sep_index / 2, r'$\mathbf{ORI \ x \ DoR}$', fontsize=10, va='center', rotation=270, color='green')
    ax.text(n + 0.2, (dor_sep_index + n) / 2, r'$\mathbf{ORI \ x \ ORI}$', fontsize=10, va='center', rotation=270, color='firebrick')

    ax.text(dor_sep_index / 2, group_y, 'Group DoR', fontsize=12, ha='center', color='black')
    ax.text((dor_sep_index + n) / 2, group_y, 'Group ORI', fontsize=12, ha='center', color='black')
    ax.text(0, dor_sep_index / 2, 'Group DoR', fontsize=12, va='center', rotation=90, color='black')
    ax.text(0, (dor_sep_index + n) / 2, 'Group ORI', fontsize=12, va='center', rotation=90, color='black')

    ax.set_title(title, fontsize=16, pad=20)
    ax.tick_params(axis='x', rotation=45)
    ax.tick_params(axis='y', rotation=0)

    # Only save/close a figure this function created; a caller-supplied axes
    # belongs to the caller's figure.
    if filename and owns_figure:
        fig.tight_layout()
        fig.savefig(filename, dpi=360)
        plt.close(fig)

# Create combined 3x3 plot with prompt_type and source
fig, axes = plt.subplots(3, 3, figsize=(20, 20))
fig.suptitle('Tau Kendall Correlation Matrix of Deep of Reasoning (DoR) and\nOriginality (ORI) Scores by Prompt Type and Source', fontsize=20)

# Display correlations for every (prompt type, source) combination.
# Nothing is sliced here: all distinct values are used, and zip() silently drops
# any combination beyond the nine available panels.
prompt_types = df['prompt_type'].dropna().unique()  # All distinct non-null prompt types
sources = df['source'].dropna().unique()  # All distinct non-null sources
for ax, (prompt, source) in zip(axes.flat, [(p, s) for p in prompt_types for s in sources]):
    df_subset = df[(df['prompt_type'] == prompt) & (df['source'] == source)]
    plot_correlogram(df_subset, f'{prompt} - {source}', ax=ax, annotate=True)

# Leave the top 4% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.savefig('figures/kendall_correlogram_combined_source_prompt.png', dpi=360)
plt.show()


# **Step 1: Reasoning depth versus alternative correctness**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# === Configuration ===
# Display aliases for the model identifiers that appear as column-name suffixes.
model_name_map = {
    "gpt_4o_mini": "4o-mini",
    "gpt_41_nano": "4.1-nano",
    "claude_35_haiku": "3-haiku",
    "grok_3_mini_beta": "grok-3-mini",
    "ds_v3": "ds-v3"
}

# === Process gr_dor ===
# Keep `hit_alternative` as the identifier while the score columns are melted.
df_dor = df[[*gr_dor_columns, 'gr_dor_median', 'hit_alternative']].copy()
df_dor = df_dor.rename(columns=lambda name: name.replace('gr_dor_', ''))
df_dor = df_dor.rename(columns=model_name_map)
df_long_dor = (
    df_dor
    .melt(id_vars='hit_alternative', var_name='Model', value_name='Score')
    .assign(Criterion='Deep of Reasoning')
)

# === Process gr_ori ===
df_ori = df[[*gr_ori_columns, 'gr_ori_median', 'hit_alternative']].copy()
df_ori = df_ori.rename(columns=lambda name: name.replace('gr_ori_', ''))
df_ori = df_ori.rename(columns=model_name_map)
df_long_ori = (
    df_ori
    .melt(id_vars='hit_alternative', var_name='Model', value_name='Score')
    .assign(Criterion='Originality Score')
)

# === Combine and relabel hit_alternative ===
# Values other than 0 and 1 become NaN under this mapping.
df_long_combined = pd.concat([df_long_dor, df_long_ori], ignore_index=True)
df_long_combined = df_long_combined.assign(
    hit_alternative=lambda frame: frame['hit_alternative'].map({0: 'Fail', 1: 'Success'})
)

# === Plot with FacetGrid (one panel per Criterion) ===
sns.set(style="whitegrid")
g = sns.FacetGrid(
    df_long_combined,
    col='Criterion',
    height=6,  # Height of each facet
    aspect=1.0,  # Width / height ratio of each facet
    sharey=False  # Each criterion gets its own y scale
)
# Within each facet: one box per model, split by alternative-hit status.
g.map_dataframe(
    sns.boxplot,
    x='Model',
    y='Score',
    hue='hit_alternative',
    palette='rocket_r',
    linewidth=1.5,
    fliersize=4,
    dodge=True,
    flierprops=dict(marker='o', markerfacecolor='red', markersize=4)
)

# === Formatting ===
g.set_titles(col_template="{col_name}")
g.set_axis_labels("", "Score")
for ax in g.axes.flat:
    ax.tick_params(axis='x', rotation=30)
g.add_legend(title='Alternative Hit', loc='lower center', bbox_to_anchor=(0.5, -0.1), ncol=2)
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Evaluation Scores by Model and Alternative Hit Status', fontsize=16)
# NOTE(review): tight_layout() runs after subplots_adjust(top=0.85) and may recompute
# the margins — confirm the suptitle still clears the facet titles.
plt.tight_layout()
# bbox_inches='tight' keeps the legend anchored below the axes inside the saved image.
plt.savefig('figures/faceted_scores_by_hit_status.png', dpi=360, bbox_inches='tight')
plt.show()

# === Compute summary by Model, Criterion, and Alternative Hit ===
def _describe_scores(scores):
    """Return the descriptive statistics reported for one series of scores."""
    mean = scores.mean()
    std = scores.std()
    return {
        'Mean': mean,
        'Std': std,
        'Median': scores.median(),
        'Min': scores.min(),
        'Max': scores.max(),
        # Same zero-mean guard for every row, per-model and median alike.
        'Coefficient of Variation': std / mean if mean != 0 else None
    }

summary_rows = []

for criterion, prefix, cols in [
    ("Deep of Reasoning", "gr_dor_", gr_dor_columns),
    ("Originality Score", "gr_ori_", gr_ori_columns)
]:
    # (column, display name) pairs: each per-model column, then the median column.
    labelled_cols = []
    for col in cols:
        raw_name = col.replace(prefix, '')
        labelled_cols.append((col, model_name_map.get(raw_name, raw_name)))
    median_col = f'{prefix}median'
    labelled_cols.append((median_col, 'median'))

    for col, model in labelled_cols:
        for hit_value, hit_label in zip([0, 1], ['Fail', 'Success']):
            subset = df[df['hit_alternative'] == hit_value][col]
            summary_rows.append({
                'Criterion': criterion,
                'Model': model,
                'Alternative Hit': hit_label,
                **_describe_scores(subset)
            })

# Convert to DataFrame and sort
summary_df = pd.DataFrame(summary_rows)
summary_df = summary_df.round(4).sort_values(by=['Criterion', 'Alternative Hit', 'Model']).reset_index(drop=True)

# Display or export
display(summary_df)



# **Adversarial Compensation Effect (ACE)**

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import kendalltau
from sklearn.metrics import f1_score

# Set seed for reproducibility
# NOTE(review): no call in this cell appears to draw random numbers; the seed
# presumably serves later cells — confirm.
np.random.seed(42)

# Model aliases
model_name_map = {
    "gpt_4o_mini": "4o-mini",
    "gpt_41_nano": "4.1-nano",
    "claude_35_haiku": "3-haiku",
    "grok_3_mini_beta": "grok-3-mini",
    "ds_v3": "ds-v3"
}

# Models to process (raw identifiers) and the prompt types evaluated for each.
model_list = list(model_name_map.keys())
prompt_types = ['naive', 'cot', 'adversarial']

# One dict per (model, prompt type); turned into summary_df at the end.
summary_data = []

for model_substring in model_list:
    # First value of df['model'] that contains the identifier as a substring.
    model_match = next((m for m in df['model'].unique() if model_substring in m), None)
    if model_match is None:
        print(f"[!] Model '{model_substring}' not found.")
        continue

    # First DoR / ORI score column whose name contains this model identifier.
    try:
        dor_col = next(c for c in df.columns if f"gr_dor_{model_substring}" in c)
        ori_col = next(c for c in df.columns if f"gr_ori_{model_substring}" in c)
    except StopIteration:
        print(f"[!] Trait columns not found for '{model_substring}'")
        continue

    df_model = df[df["model"] == model_match]

    # Per-prompt results, appended in the order of prompt_types.
    wf1_means = []
    dor_means = []
    dor_vars = []
    ori_means = []
    ori_vars = []
    kendall_dor_taus = []
    kendall_ori_taus = []

    for prompt in prompt_types:
        subset = df_model[df_model["prompt_type"] == prompt]

        # WF₁: weighted F1 of model_answer against answer (0.0 when there is no data)
        if subset.empty:
            wf1 = 0.0
        else:
            y_true = subset["answer"]
            y_pred = subset["model_answer"]

            # Drop NaNs and normalise to clean strings (A/B/C/D/E)
            mask = y_true.notna() & y_pred.notna()
            y_true = y_true[mask].astype(str).str.strip().str.upper()
            y_pred = y_pred[mask].astype(str).str.strip().str.upper()

            if len(y_true) == 0:
                wf1 = 0.0
            else:
                # IMPORTANT: also include labels that appear only in y_pred (e.g. 'E')
                labels = sorted(set(y_true.unique()) | set(y_pred.unique()))
                wf1 = f1_score(
                    y_true, y_pred,
                    labels=labels,
                    average="weighted",
                    zero_division=0
                )

        wf1_means.append(wf1)


        # Normalize DoR and ORI
        # Min-max scaling within this (model, prompt) subset; when all values are
        # equal the raw values are kept unscaled.
        dor_values = subset[dor_col]
        ori_values = subset[ori_col]
        dor_normalized = (dor_values - dor_values.min()) / (dor_values.max() - dor_values.min()) if dor_values.max() != dor_values.min() else dor_values
        ori_normalized = (ori_values - ori_values.min()) / (ori_values.max() - ori_values.min()) if ori_values.max() != ori_values.min() else ori_values

        # Mean and variance of normalized DoR
        dor_mean = dor_normalized.mean() if not subset.empty else 0
        dor_var = dor_normalized.var() if not subset.empty else 0
        dor_means.append(dor_mean)
        dor_vars.append(dor_var)

        # Mean and variance of normalized ORI
        ori_mean = ori_normalized.mean() if not subset.empty else 0
        ori_var = ori_normalized.var() if not subset.empty else 0
        ori_means.append(ori_mean)
        ori_vars.append(ori_var)

        # Kendall τ for DoR (against the hit column); 0 when it cannot be computed
        if len(subset) > 0 and dor_normalized.nunique() > 1:
            tau_dor, _ = kendalltau(dor_normalized, subset["hit"])
        else:
            tau_dor = 0
        kendall_dor_taus.append(tau_dor)

        # Kendall τ for ORI (against the hit column); 0 when it cannot be computed
        if len(subset) > 0 and ori_normalized.nunique() > 1:
            tau_ori, _ = kendalltau(ori_normalized, subset["hit"])
        else:
            tau_ori = 0
        kendall_ori_taus.append(tau_ori)

    model_display_name = model_name_map[model_substring]

    # Collect summary data
    for prompt, wf1, dor_mean, dor_var, ori_mean, ori_var, tau_dor, tau_ori in zip(prompt_types, wf1_means, dor_means, dor_vars, ori_means, ori_vars, kendall_dor_taus, kendall_ori_taus):
        summary_data.append({
            'Model': model_display_name,
            'Prompt Type': prompt,
            'WF₁ Mean': wf1,
            'Mean DoR': dor_mean,
            'DoR Var.': dor_var,
            'Mean Originality': ori_mean,
            'Originality Var.': ori_var,
            "Kendall's τ (DoR)": tau_dor,
            "Kendall's τ (ORI)": tau_ori
        })

# Convert summary data to DataFrame
summary_df = pd.DataFrame(summary_data)

# Display summary table
display(summary_df)


In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from itertools import combinations
from sklearn.metrics import pairwise_distances
from statsmodels.multivariate.cancorr import CanCorr

# Prepare the data for PCA
# Input: one row per (model, prompt type) from summary_df.
# NOTE: `pd` is used below but not imported in this cell; it relies on an earlier cell.
features = ['WF₁ Mean', 'Mean DoR', 'DoR Var.', 'Mean Originality', 'Originality Var.']
x = summary_df[features]

# Standardize the data
# z-score per feature; a feature with zero spread would turn into NaN here.
x = (x - x.mean()) / x.std()

# Perform PCA
pca = PCA(n_components=2)
principal_components = pca.fit_transform(x)

# Create a DataFrame with the PCA results
# Model / Prompt Type are attached by index alignment with summary_df.
pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])
pca_df['Model'] = summary_df['Model']
pca_df['Prompt Type'] = summary_df['Prompt Type']  # Assuming 'Prompt Type' is a column in summary_df

# Calculate instability index for each model
instability_scores = []
for model in pca_df["Model"].unique():
    points = pca_df[pca_df["Model"] == model][["Principal Component 1", "Principal Component 2"]].values
    # Calculate all distances between pairs of points
    dists = []
    for (i, j) in combinations(range(len(points)), 2):
        dists.append(np.linalg.norm(points[i] - points[j]))
    # NOTE(review): np.mean already divides by the number of pairs, and the result is
    # divided by the number of pairs again — confirm this double normalisation is intended.
    instab_index = np.mean(dists) / (len(points) * (len(points) - 1) / 2) if len(points) > 1 else 0
    instability_scores.append({
        "Model": model,
        "Instability Index": instab_index
    })

# Set color palette
palette = sns.color_palette("viridis", len(pca_df['Prompt Type'].unique()))  # One colour per prompt type

# Create a figure for both PCA and dendrogram
fig, axs = plt.subplots(1, 2, figsize=(20, 6))  # Create a single figure with two subplots

# Plot the PCA results
sns.scatterplot(data=pca_df, x='Principal Component 1', y='Principal Component 2', hue='Prompt Type', palette=palette, alpha=0.8, s=100, edgecolor='w', linewidth=0.5, ax=axs[0])

# Add lines to separate quadrants with a softer color
axs[0].axhline(0, color='gray', linestyle='--', linewidth=1)
axs[0].axvline(0, color='gray', linestyle='--', linewidth=1)

# Annotate points with model names
# `[i]` is a label lookup; it works because pca_df keeps its default 0..n-1 index.
for i, model in enumerate(pca_df['Model']):
    axs[0].annotate(model, (pca_df['Principal Component 1'][i], pca_df['Principal Component 2'][i]), fontsize=10, ha='right')

# Add quadrant interpretations
# NOTE(review): the text coordinates are hard-coded in data units; check them against
# the actual range of the principal components.
axs[0].text(1.5, 0.1, "Compensated Instability Region", fontsize=10, ha='center', fontweight='bold', color='darkorange')
axs[0].text(-2, 0.1, "Expressive Consistency Region", fontsize=10, ha='center', fontweight='bold', color='darkorange')
axs[0].text(-2, -0.2, "Suppressed Reasoning Region", fontsize=10, ha='center', fontweight='bold', color='darkorange')
axs[0].text(1.5, -0.2, "Misaligned Confidence Region", fontsize=10, ha='center', fontweight='bold', color='darkorange')

# Add explained variance for each dimension to the title
explained_variance = pca.explained_variance_ratio_
axs[0].set_title(f'PCA of Models Metrics\nExplained Variance: PC1 = {explained_variance[0]:.2f}, PC2 = {explained_variance[1]:.2f}', fontsize=16)
axs[0].set_xlabel('Principal Component 1', fontsize=12)
axs[0].set_ylabel('Principal Component 2', fontsize=12)
axs[0].grid(True, linestyle='--', alpha=0.7)

# Display loadings for each original criterion
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)  # Component weights scaled by sqrt(eigenvalue)
loadings_df = pd.DataFrame(loadings, index=features, columns=['PC1', 'PC2'])

# Display eigenvalues
eigenvalues = pca.explained_variance_
# Heterotrait-Monotrait (HTMT) ratio calculation
def htmt_ratio(df_in, dor_cols, ori_cols, corr_method="spearman"):
    """Heterotrait-Monotrait (HTMT) ratio between the DoR and ORI indicators.

    HTMT = mean |corr(DoR_i, ORI_j)| / sqrt(mean |corr(DoR_i, DoR_k)| * mean |corr(ORI_i, ORI_k)|)

    The two means in the denominator use only the off-diagonal upper triangle
    of each within-construct correlation block.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame holding the indicator columns; values are coerced to numeric and
        rows with any missing indicator are dropped.
    dor_cols, ori_cols : list of str
        Indicator columns of each construct.
    corr_method : str, default "spearman"
        Correlation method passed to ``DataFrame.corr`` (e.g. "pearson").

    Returns
    -------
    float
        The HTMT ratio, or NaN when the denominator is not strictly positive.
    """
    indicators = list(dor_cols) + list(ori_cols)
    numeric = df_in[indicators].apply(pd.to_numeric, errors="coerce").dropna()
    abs_corr = numeric.corr(method=corr_method).abs()

    def _mean_within(cols):
        # Mean of the strictly-upper-triangular entries of one construct's block.
        block = abs_corr.loc[cols, cols].to_numpy()
        return block[np.triu_indices_from(block, k=1)].mean()

    # Heterotrait-heteromethod: every DoR indicator against every ORI indicator.
    mean_hetero = abs_corr.loc[dor_cols, ori_cols].to_numpy().ravel().mean()

    # Monotrait-heteromethod: geometric mean of the two within-construct averages.
    denom = np.sqrt(_mean_within(dor_cols) * _mean_within(ori_cols))
    if denom > 0:
        return float(mean_hetero / denom)
    return np.nan

# --- HTMT (DoR vs ORI) ---
dor_cols = list(gr_dor_columns)   # indicators: DoR scores from each judge
ori_cols = list(gr_ori_columns)   # indicators: ORI scores from each judge

# To reproduce the "under adversarial stress" setting mentioned in the paper:
df_htmt = df[df["prompt_type"] == "adversarial"]

htmt_score = htmt_ratio(df_htmt, dor_cols, ori_cols, corr_method="spearman")

# Dendrogram plot
from scipy.cluster.hierarchy import dendrogram, linkage

# Perform hierarchical clustering (Ward linkage) on the two principal components
Z = linkage(pca_df[['Principal Component 1', 'Principal Component 2']], method='ward')

# Create a dendrogram to visualize the clustering; leaves are labelled "model (prompt type)"
dendrogram(Z, labels=[f"{model} ({prompt})" for model, prompt in zip(pca_df['Model'].values, pca_df['Prompt Type'].values)], leaf_rotation=90, ax=axs[1])
axs[1].set_title('Hierarchical Clustering Dendrogram', fontsize=16)
axs[1].set_xlabel('', fontsize=12)
axs[1].set_ylabel('Euclidean Distance', fontsize=12)  # Specify the distance used
axs[1].grid(True, linestyle='--', alpha=0.7)

plt.savefig('figures/pca_and_dendrogram_plot.png', dpi=360, bbox_inches='tight')  # Save the combined figure
plt.show()

# Models ordered from lowest to highest instability index.
instability_df = pd.DataFrame(instability_scores).sort_values(by='Instability Index')

# Print HTMT score for validity
print(f"Heterotrait-Monotrait (HTMT) score: {htmt_score:.4f}")
print("Loadings (coefficients) for each criterion:")
print(loadings_df)
print("Eigenvalues:")
print(eigenvalues)
print(instability_df)


In [None]:
from scipy import stats

# Calculate effect sizes for DoR and ORI between hits and misses
def calculate_effect_size(df, score_columns):
    """Effect size r between hits and misses for each score column.

    For every column, Cohen's d is computed from the absolute difference of
    the group means (``hit == 1`` vs ``hit == 0``) over the root of the
    averaged group variances, then converted with r = d / sqrt(d^2 + 4).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``hit`` column and every column in ``score_columns``.
    score_columns : iterable of str
        Columns to evaluate.

    Returns
    -------
    dict
        Maps each column name to its (non-negative) effect size r.
    """
    hits = df[df['hit'] == 1]
    misses = df[df['hit'] == 0]

    effect_sizes = {}
    for col in score_columns:
        mean_gap = np.abs(hits[col].mean() - misses[col].mean())
        pooled_sd = np.sqrt((hits[col].std() ** 2 + misses[col].std() ** 2) / 2)
        cohen_d = mean_gap / pooled_sd
        effect_sizes[col] = cohen_d / np.sqrt(cohen_d**2 + 4)
    return effect_sizes

# Define score columns for DoR and ORI, excluding the aggregated median columns
dor_cols = [col for col in df.columns if col.startswith('gr_dor_') and not col.endswith('median')]
ori_cols = [col for col in df.columns if col.startswith('gr_ori_') and not col.endswith('median')]
score_columns = dor_cols + ori_cols  # Include both DoR and ORI columns

# NOTE: despite its name, df_hit0 here is a copy of the FULL frame (hits and misses);
# calculate_effect_size needs both groups.
df_hit0 = df.copy()  # Unfiltered copy of df

# Calculate effect sizes
effect_sizes = calculate_effect_size(df_hit0, score_columns)

# Print effect sizes
print("Effect Sizes (r) for DoR and ORI between hits and misses:")
for col, size in effect_sizes.items():
    print(f"{col}: {size:.4f}")

# Test ACE on an expanded set of low-capacity models
# NOTE(review): these are display aliases; confirm df['model'] stores aliases rather
# than raw identifiers, otherwise this filter selects nothing. expanded_df is not
# used by any active code below.
low_capacity_models = ['3-haiku', '4.1-nano', '4o-mini']  # Example low-capacity models
expanded_df = df[df['model'].isin(low_capacity_models)]

# Calculate ACE for the expanded set
# Assuming ACE calculation function is defined elsewhere
# ace_results = calculate_ace(expanded_df)

# Print ACE results if applicable
# print("ACE results for low-capacity models:")
# print(ace_results)


In [None]:
# Inspect the column names currently available in df.
df.columns

# **Qualitative outlier analysis for the hit = 0 cases in Deep of Reasoning and Originality**

In [None]:
import pandas as pd
import numpy as np

# Filter: keep only the rows where the model answered incorrectly (hit == 0).
df_hit0 = df.loc[df['hit'].eq(0)].copy()

# Select the score columns of each criterion, leaving out the median columns.
dor_cols, ori_cols = (
    [name for name in df.columns if name.startswith(prefix) and not name.endswith('median')]
    for prefix in ('gr_dor_', 'gr_ori_')
)

# IQR-based outlier detection
def detect_outliers_iqr(series, k=1.5, side="upper"):
    """Flag the values of ``series`` that fall outside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1 and Q3 are
    the 0.25 and 0.75 quantiles of ``series`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to test.
    k : float, default 1.5
        Multiplier applied to the IQR when building the fences.
    side : {"upper", "lower", "both"}, default "upper"
        Which fence(s) to test. The default keeps the behaviour this cell
        relies on: only values above the upper fence are flagged.

    Returns
    -------
    pandas.Series
        Boolean mask aligned with ``series``; True marks an outlier.

    Raises
    ------
    ValueError
        If ``side`` is not one of the accepted values.
    """
    if side not in ("upper", "lower", "both"):
        raise ValueError(f"side must be 'upper', 'lower' or 'both', got {side!r}")

    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    margin = k * (q3 - q1)

    if side == "upper":
        return series > q3 + margin
    if side == "lower":
        return series < q1 - margin
    return (series > q3 + margin) | (series < q1 - margin)

# Flag outliers separately for every DoR and ORI score column
outlier_flags = pd.DataFrame(index=df_hit0.index)
for col in dor_cols + ori_cols:
    outlier_flags[col + '_outlier'] = detect_outliers_iqr(df_hit0[col])

# Keep the rows flagged in at least one score column
df_outliers = df_hit0[outlier_flags.any(axis=1)].copy()
# For each kept row, list the names of the flag columns that fired
df_outliers['outlier_columns'] = outlier_flags.loc[df_outliers.index].apply(
    lambda row: [col for col, val in row.items() if val], axis=1
)

# Columns that are useful for the qualitative analysis, sorted for easier reading
relevant_cols = ['source', 'item', 'model', 'prompt_type', 'CoT'] + dor_cols + ori_cols + ['outlier_columns']
df_outliers_view = df_outliers[relevant_cols].sort_values(by=['model', 'source', 'prompt_type'])

df_outliers_view.to_csv('data/df_outliers_view.csv', index=False)
# A bare expression in the middle of a cell is evaluated and thrown away, so the
# number of outlier rows is printed explicitly.
print(f"Outlier rows among hit = 0 cases: {len(df_outliers_view)}")

# Definition of categories: maps each category label to a one-sentence description.
# The five keys are exactly the label strings that classify_cot_mock can return.
error_categories = {
    "Plausible but incorrect": "Coherent reasoning that ends in a wrong answer.",
    "Concept confusion": "Misunderstanding of a core concept or principle.",
    "Alternative interpretation": "Valid logic applied to an unconventional or unintended interpretation of the question.",
    "Narrative drift": "Response veers into irrelevant or excessively verbose explanation.",
    "Incomplete or truncated": "Reasoning is too brief or lacks a conclusion."
}

# Mock function for classification
def classify_cot_mock(text):
    """Assign a chain-of-thought text to an error category with keyword/length rules.

    This is a placeholder heuristic, not a real classifier. Rules are tried in
    order and the first match wins:

    1. contains "definition" plus the words "but" and "so" -> "Plausible but incorrect"
    2. contains "confuse" or "mistake"                      -> "Concept confusion"
    3. contains "could mean" or "might be interpreted"      -> "Alternative interpretation"
    4. more than 100 whitespace-separated tokens            -> "Narrative drift"
    5. fewer than 10 whitespace-separated tokens            -> "Incomplete or truncated"
    6. otherwise                                            -> "Plausible but incorrect"

    Parameters
    ----------
    text : str or None
        The chain-of-thought text. Anything that is not a ``str`` (None, NaN)
        is treated as an empty text instead of raising.

    Returns
    -------
    str
        One of the category labels listed above.
    """
    # A missing CoT has no ``.lower()``; treat it as empty so it lands in
    # "Incomplete or truncated" like an empty string does.
    if not isinstance(text, str):
        text = ""

    text_lower = text.lower()
    tokens = text_lower.split()
    # "but" and "so" must match as whole words: as raw substrings they also hit
    # "attributes", "also", "reason", ... which makes the first rule meaningless.
    words = {token.strip(".,;:!?\"'()[]") for token in tokens}

    if "definition" in text_lower and "but" in words and "so" in words:
        return "Plausible but incorrect"
    if "confuse" in text_lower or "mistake" in text_lower:
        return "Concept confusion"
    if "could mean" in text_lower or "might be interpreted" in text_lower:
        return "Alternative interpretation"
    if len(tokens) > 100:
        return "Narrative drift"
    if len(tokens) < 10:
        return "Incomplete or truncated"
    return "Plausible but incorrect"

# Label every outlier CoT with the heuristic classifier and save the result
cot_classification_df = df_outliers_view.copy()
cot_classification_df["CoT Category"] = cot_classification_df["CoT"].apply(classify_cot_mock)
cot_classification_df.to_csv('data/cot_classification_df.csv', index=False)

# Tabular result. It is passed to display() because a bare expression in the
# middle of a cell is evaluated and thrown away.
display(cot_classification_df[['model', 'prompt_type', 'CoT', 'CoT Category']])

# Overall distribution of categories
category_counts = cot_classification_df["CoT Category"].value_counts().reset_index()
category_counts.columns = ["CoT Category", "Count"]

# Frequency by model and type of error
model_category = cot_classification_df.groupby(["model", "CoT Category"]).size().unstack(fill_value=0)

# Append a totals row. DataFrame.sum() adds down each column, so the row labelled
# 'Row Sum' holds the per-category total over all models.
model_category.loc['Row Sum'] = model_category.sum()

# Frequency by type of prompt
prompt_category = cot_classification_df.groupby(["prompt_type", "CoT Category"]).size().unstack(fill_value=0)

# Same totals row: per-category total over all prompt types
prompt_category.loc['Row Sum'] = prompt_category.sum()

display(category_counts)
display(model_category)
display(prompt_category)

# Category frequencies; the index order of this Series drives the loop below
category_counts = cot_classification_df["CoT Category"].value_counts()

# One illustrative row per category
examples = []

for category in category_counts.index:
    subset = cot_classification_df.loc[cot_classification_df["CoT Category"].eq(category)]
    # The row with the longest CoT (character count) stands in for the category
    cot_lengths = subset['CoT'].str.len()
    representative = subset.loc[cot_lengths.idxmax()]
    example = {"Category": category}
    example["Model"] = representative["model"]
    example["Prompt Type"] = representative["prompt_type"]
    example["Question"] = representative["item"]
    example["CoT"] = representative["CoT"]
    examples.append(example)

# Collect the picked rows into one table and show it
examples_df = pd.DataFrame(examples)
display(examples_df)

# Filter possible cases of moral misalignment: only these categories are inspected
alignment_relevant_categories = [
    "Plausible but incorrect",
    "Alternative interpretation",
    "Narrative drift"
]

alignment_df = cot_classification_df[cot_classification_df["CoT Category"].isin(alignment_relevant_categories)].copy()

# Sensitive terms that hint at moral dilemmas, social norms or judgements.
# Matching is by substring, so "fair" also covers "unfair", "should" covers
# "should not", and "appropriate" covers "inappropriate".
keywords = [
    "should", "should not", "ethical", "moral", "deserve", "justice", "fair", "unfair",
    "responsible", "dangerous", "appropriate", "inappropriate", "illegal", "respect", "harm"
]

# Flag the CoTs that contain any of the terms. na=False turns a missing CoT into
# False, which keeps the column strictly boolean.
pattern = '|'.join(keywords)
alignment_df["has_sensitive_terms"] = alignment_df["CoT"].str.lower().str.contains(pattern, na=False)

# Keep the rows that show linguistic signs of moral evaluation
morally_relevant = alignment_df[alignment_df["has_sensitive_terms"]]
display(morally_relevant)


# **Qualitative outlier analysis for the hit = 1 cases in Depth of Reasoning and Originality**

In [None]:
import pandas as pd
import numpy as np

# Filter: keep only the cases where the model answered correctly (hit == 1)
df_hit1 = df.loc[df['hit'].eq(1)].copy()

# Score columns: every 'gr_dor_*' / 'gr_ori_*' column except the '*median' aggregates
is_median = df.columns.str.endswith('median')
dor_cols = df.columns[df.columns.str.startswith('gr_dor_') & ~is_median].tolist()
ori_cols = df.columns[df.columns.str.startswith('gr_ori_') & ~is_median].tolist()

# IQR-based outlier detection
def detect_outliers_iqr(series, k=1.5, side="lower"):
    """Flag the values of ``series`` that fall outside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1 and Q3 are
    the 0.25 and 0.75 quantiles of ``series`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to test.
    k : float, default 1.5
        Multiplier applied to the IQR when building the fences.
    side : {"upper", "lower", "both"}, default "lower"
        Which fence(s) to test. The default keeps the behaviour this cell
        relies on: only values below the lower fence are flagged.

    Returns
    -------
    pandas.Series
        Boolean mask aligned with ``series``; True marks an outlier.

    Raises
    ------
    ValueError
        If ``side`` is not one of the accepted values.
    """
    if side not in ("upper", "lower", "both"):
        raise ValueError(f"side must be 'upper', 'lower' or 'both', got {side!r}")

    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    margin = k * (q3 - q1)

    if side == "lower":
        return series < q1 - margin
    if side == "upper":
        return series > q3 + margin
    return (series < q1 - margin) | (series > q3 + margin)

# Flag outliers separately for every DoR and ORI score column
outlier_flags = pd.DataFrame(index=df_hit1.index)
for col in dor_cols + ori_cols:
    outlier_flags[col + '_outlier'] = detect_outliers_iqr(df_hit1[col])

# Keep the rows flagged in at least one score column
df_outliers = df_hit1[outlier_flags.any(axis=1)].copy()
# For each kept row, list the names of the flag columns that fired
df_outliers['outlier_columns'] = outlier_flags.loc[df_outliers.index].apply(
    lambda row: [col for col, val in row.items() if val], axis=1
)

# Columns that are useful for the qualitative analysis, sorted for easier reading
relevant_cols = ['source', 'item', 'model', 'prompt_type', 'CoT'] + dor_cols + ori_cols + ['outlier_columns']
df_outliers_view = df_outliers[relevant_cols].sort_values(by=['model', 'source', 'prompt_type'])

df_outliers_view.to_csv('data/df_outliers_view_hit1.csv', index=False)
# A bare expression in the middle of a cell is evaluated and thrown away, so the
# number of outlier rows is printed explicitly.
print(f"Outlier rows among hit = 1 cases: {len(df_outliers_view)}")

# Definition of categories: maps each category label to a one-sentence description.
# The five keys are exactly the label strings that classify_cot_mock can return.
# NOTE(review): these describe error types, yet this cell analyses hit == 1 rows;
# confirm the taxonomy is meant to be applied to correct answers.
error_categories = {
    "Plausible but incorrect": "Coherent reasoning that ends in a wrong answer.",
    "Concept confusion": "Misunderstanding of a core concept or principle.",
    "Alternative interpretation": "Valid logic applied to an unconventional or unintended interpretation of the question.",
    "Narrative drift": "Response veers into irrelevant or excessively verbose explanation.",
    "Incomplete or truncated": "Reasoning is too brief or lacks a conclusion."
}

# Mock function for classification
def classify_cot_mock(text):
    """Assign a chain-of-thought text to an error category with keyword/length rules.

    This is a placeholder heuristic, not a real classifier. Rules are tried in
    order and the first match wins:

    1. contains "definition" plus the words "but" and "so" -> "Plausible but incorrect"
    2. contains "confuse" or "mistake"                      -> "Concept confusion"
    3. contains "could mean" or "might be interpreted"      -> "Alternative interpretation"
    4. more than 100 whitespace-separated tokens            -> "Narrative drift"
    5. fewer than 10 whitespace-separated tokens            -> "Incomplete or truncated"
    6. otherwise                                            -> "Plausible but incorrect"

    Parameters
    ----------
    text : str or None
        The chain-of-thought text. Anything that is not a ``str`` (None, NaN)
        is treated as an empty text instead of raising.

    Returns
    -------
    str
        One of the category labels listed above.
    """
    # A missing CoT has no ``.lower()``; treat it as empty so it lands in
    # "Incomplete or truncated" like an empty string does.
    if not isinstance(text, str):
        text = ""

    text_lower = text.lower()
    tokens = text_lower.split()
    # "but" and "so" must match as whole words: as raw substrings they also hit
    # "attributes", "also", "reason", ... which makes the first rule meaningless.
    words = {token.strip(".,;:!?\"'()[]") for token in tokens}

    if "definition" in text_lower and "but" in words and "so" in words:
        return "Plausible but incorrect"
    if "confuse" in text_lower or "mistake" in text_lower:
        return "Concept confusion"
    if "could mean" in text_lower or "might be interpreted" in text_lower:
        return "Alternative interpretation"
    if len(tokens) > 100:
        return "Narrative drift"
    if len(tokens) < 10:
        return "Incomplete or truncated"
    return "Plausible but incorrect"

# Label every outlier CoT with the heuristic classifier and save the result
cot_classification_df = df_outliers_view.copy()
cot_classification_df["CoT Category"] = cot_classification_df["CoT"].apply(classify_cot_mock)
cot_classification_df.to_csv('data/cot_classification_df_hit1.csv', index=False)

# Tabular result. It is passed to display() because a bare expression in the
# middle of a cell is evaluated and thrown away.
display(cot_classification_df[['model', 'prompt_type', 'CoT', 'CoT Category']])

# Overall distribution of categories
category_counts = cot_classification_df["CoT Category"].value_counts().reset_index()
category_counts.columns = ["CoT Category", "Count"]

# Frequency by model and category
model_category = cot_classification_df.groupby(["model", "CoT Category"]).size().unstack(fill_value=0)

# Append a totals row. DataFrame.sum() adds down each column, so the row labelled
# 'Row Sum' holds the per-category total over all models.
model_category.loc['Row Sum'] = model_category.sum()

# Frequency by type of prompt
prompt_category = cot_classification_df.groupby(["prompt_type", "CoT Category"]).size().unstack(fill_value=0)

# Same totals row: per-category total over all prompt types
prompt_category.loc['Row Sum'] = prompt_category.sum()

display(category_counts)
display(model_category)
display(prompt_category)

# Category frequencies; the index order of this Series drives the loop below
category_counts = cot_classification_df["CoT Category"].value_counts()

# One illustrative row per category
examples = []

for category in category_counts.index:
    subset = cot_classification_df.loc[cot_classification_df["CoT Category"].eq(category)]
    # The row with the longest CoT (character count) stands in for the category
    cot_lengths = subset['CoT'].str.len()
    representative = subset.loc[cot_lengths.idxmax()]
    example = {"Category": category}
    example["Model"] = representative["model"]
    example["Prompt Type"] = representative["prompt_type"]
    example["Question"] = representative["item"]
    example["CoT"] = representative["CoT"]
    examples.append(example)

# Collect the picked rows into one table and show it
examples_df = pd.DataFrame(examples)
display(examples_df)

# Filter possible cases of moral misalignment: only these categories are inspected
alignment_relevant_categories = [
    "Plausible but incorrect",
    "Alternative interpretation",
    "Narrative drift"
]

alignment_df = cot_classification_df[cot_classification_df["CoT Category"].isin(alignment_relevant_categories)].copy()

# Sensitive terms that hint at moral dilemmas, social norms or judgements.
# Matching is by substring, so "fair" also covers "unfair", "should" covers
# "should not", and "appropriate" covers "inappropriate".
keywords = [
    "should", "should not", "ethical", "moral", "deserve", "justice", "fair", "unfair",
    "responsible", "dangerous", "appropriate", "inappropriate", "illegal", "respect", "harm"
]

# Flag the CoTs that contain any of the terms. na=False turns a missing CoT into
# False, which keeps the column strictly boolean.
pattern = '|'.join(keywords)
alignment_df["has_sensitive_terms"] = alignment_df["CoT"].str.lower().str.contains(pattern, na=False)

# Keep the rows that show linguistic signs of moral evaluation
morally_relevant = alignment_df[alignment_df["has_sensitive_terms"]]
display(morally_relevant)
