In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from itertools import combinations

# --- Table A-1: Descriptive Statistics by Story ---

# Read the evaluation records; the code below uses the
# 'model', 'session', 'story' and 'score' columns.
df = pd.read_csv('official_row_data.csv', encoding='utf-8')

# Hand-entered word count of each story, keyed by story label.
word_counts = {
    'Story A': 2291,
    'Story B': 3832,
    'Story C': 1962,
    'Story D': 1671,
    'Story E': 1998,
    'Story F': 3225,
    'Story G': 3068,
    'Story H': 1279,
    'Story I': 2145,
    'Story J': 1765,
}

# Standardise every score against the mean and sample standard deviation
# (ddof=1) of its own model-session group.
df['z_score'] = df.groupby(['model', 'session'])['score'].transform(
    lambda scores: (scores - scores.mean()) / scores.std(ddof=1)
)

# Per-story descriptive statistics. Each column is a Series indexed by story
# label, so the DataFrame constructor aligns them on that label.
grouped = df.groupby('story')
raw_by_story = grouped['score']
z_by_story = grouped['z_score']

summary_columns = {}
summary_columns['Word Count'] = pd.Series(word_counts)
summary_columns['Mean Score'] = raw_by_story.mean().round(1)
summary_columns['Std Dev'] = raw_by_story.std(ddof=1).round(2)
summary_columns['Raw Score Variance'] = raw_by_story.var(ddof=1).round(1)
summary_columns['Z-Score Variance'] = z_by_story.var(ddof=1).round(2)
# Lowest and highest raw score of the story, joined by an en dash.
summary_columns['Range'] = raw_by_story.apply(
    lambda scores: f"{scores.min()}–{scores.max()}"
)
summary_df = pd.DataFrame(summary_columns)

print("Table A-1")
print(summary_df)


# --- Table A-2: Model-Specific Evaluation Consistency ---

def cronbach_alpha(data):
    """Calculate Cronbach's alpha for a 2-D score matrix.

    Every column of ``data`` is treated as one session and every row as one
    rated item. The coefficient is

        k / (k - 1) * (1 - sum(per-column variance) / variance(row totals))

    where ``k`` is the number of columns and all variances are sample
    variances (``ddof=1``).

    Parameters
    ----------
    data : array-like, shape (n_items, n_sessions)
        Numeric scores; anything ``np.asarray`` can convert to a float array.

    Returns
    -------
    float
        Cronbach's alpha, or ``nan`` when the coefficient is undefined:
        fewer than two sessions, fewer than two items, or row totals with
        zero variance.

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional.
    """
    scores = np.asarray(data, dtype=float)
    if scores.ndim != 2:
        raise ValueError(
            f"cronbach_alpha expects a 2-D array, got {scores.ndim} dimension(s)"
        )

    n_items, n_sessions = scores.shape
    # k / (k - 1) needs k >= 2, and a sample variance needs at least two rows.
    if n_sessions < 2 or n_items < 2:
        return np.nan

    session_var = scores.var(axis=0, ddof=1)
    total_var = scores.sum(axis=1).var(ddof=1)
    # With no spread in the row totals the ratio below is a division by zero.
    if total_var == 0:
        return np.nan

    return (n_sessions / (n_sessions - 1)) * (1 - session_var.sum() / total_var)

def mean_spearman_correlation(data):
    """Average the Spearman rank correlation over every pair of columns.

    ``data`` is a 2-D array indexed as ``data[:, i]``. Each unordered pair of
    columns is passed to ``scipy.stats.spearmanr`` and the resulting
    coefficients are averaged with ``np.mean``.
    """
    column_count = data.shape[1]
    pairwise = [
        # Element 0 of the spearmanr result is the coefficient; the p-value is ignored.
        stats.spearmanr(data[:, left], data[:, right])[0]
        for left, right in combinations(range(column_count), 2)
    ]
    return np.mean(pairwise)

# Models to evaluate, and the fixed story order used for the rows of each matrix.
models = [
    'ChatGPT4.5',
    'Notebook LM',
    'Gemini-2.5',
    'Grok3',
    'Sonnet4',
    'OpenAI-o3',
]
stories = [f'Story {letter}' for letter in 'ABCDEFGHIJ']

results = []

for model in models:
    # Story x session score matrix for this model, rows in `stories` order.
    per_model = df.loc[df['model'] == model]
    score_table = per_model.pivot(index='story', columns='session', values='score')
    score_matrix = score_table.reindex(stories).to_numpy()

    consistency = {
        'Model': model,
        'Cronbach_alpha': round(cronbach_alpha(score_matrix), 2),
        'Mean_Correlation': round(mean_spearman_correlation(score_matrix), 2),
    }
    results.append(consistency)

# Most internally consistent model first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Cronbach_alpha', ascending=False)

print("\nTable A-2")
print(results_df)


# --- Figure A-1: Bar Plot of Cronbach's Alpha and Spearman Correlation ---

fig, ax = plt.subplots(figsize=(10, 6))
bar_width = 0.35
index = range(len(results_df))

# Two bars per model: alpha at the integer position, correlation one bar width
# to the right, and the tick label centred between them.
alpha_positions = list(index)
corr_positions = [pos + bar_width for pos in index]
tick_positions = [pos + bar_width / 2 for pos in index]

bars1 = ax.bar(
    alpha_positions,
    results_df['Cronbach_alpha'],
    bar_width,
    label="Cronbach's α",
    color='dimgray',
)
bars2 = ax.bar(
    corr_positions,
    results_df['Mean_Correlation'],
    bar_width,
    label='Mean Correlation (ρ)',
    color='lightgray',
)

ax.set_xticks(tick_positions)
ax.set_xticklabels(results_df['Model'], fontsize=15)
ax.set_ylim(0, 1.2)
ax.legend(fontsize=15)
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

# Write each bar's value just above its top edge.
for bar in [*bars1, *bars2]:
    yval = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        yval + 0.02,
        f'{yval:.2f}',
        ha='center',
        va='bottom',
        fontsize=15,
    )

ax.set_title("Cronbach's Alpha and Spearman Correlation by Language Model", fontsize=15)
fig.tight_layout()
fig.savefig('Figure A-1.png', dpi=300)
print("\nFigure A-1")
plt.show()


In [1]:
# --- Table A-2 (Right panel): Top/bottom 3 works agreement ---
import pandas as pd

# --- Load official evaluation data ---
df = pd.read_csv('official_row_data.csv', encoding='utf-8')

# Rows are ranked by score inside each model-session pair.
session_keys = ['model', 'session']
ranking_keys = session_keys + ['score']

# --- Extract top 3 highest scores per model-session pair ---
ranked_high_first = df.sort_values(ranking_keys, ascending=[True, True, False])
top3_df = ranked_high_first.groupby(session_keys).head(3)

# --- Extract bottom 3 lowest scores per model-session pair ---
ranked_low_first = df.sort_values(ranking_keys, ascending=[True, True, True])
bottom3_df = ranked_low_first.groupby(session_keys).head(3)

# --- Define aggregation logic for N ≥ 2 appearance count per model-story pair ---
def aggregate_story_counts(df_subset, label, denominator, threshold=2):
    """Total, per model, the appearances of stories that recur in ``df_subset``.

    Rows are counted per (model, story) pair. A pair contributes its count to
    the model's total only when that count is at least ``threshold``. Models
    with no qualifying story are kept with a total of 0 rather than dropped,
    so every model present in ``df_subset`` appears in the result.

    Parameters
    ----------
    df_subset : pandas.DataFrame
        Rows to count; must contain 'model' and 'story' columns.
    label : str
        Prefix of the output columns ``{label}_total`` and
        ``{label}_agreement``.
    denominator : number
        Value each model's total is divided by before it is expressed as a
        percentage.
    threshold : int, default 2
        Minimum number of appearances for a (model, story) pair to count.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns 'model', ``{label}_total`` and
        ``{label}_agreement``; the latter is a string such as '85.7%'.
    """
    total_col = f'{label}_total'
    agreement_col = f'{label}_agreement'

    appearances = df_subset.groupby(['model', 'story']).size()
    # Zero out pairs below the threshold instead of filtering them away, so a
    # model whose stories all fall short still shows up with a total of 0.
    qualifying = appearances.where(appearances >= threshold, 0)
    model_total = (
        qualifying.groupby(level='model')
        .sum()
        .reset_index(name=total_col)
    )

    share = (model_total[total_col] / denominator * 100).round(1)
    model_total[agreement_col] = share.astype(str) + '%'
    return model_total

# --- Aggregate results for Top-3 and Bottom-3 story appearances ---
# Top/Bottom n * Number of sessions
picks_per_session = 3
session_count = 7
denominator = picks_per_session * session_count
top3_summary = aggregate_story_counts(top3_df, 'top3', denominator)
bottom3_summary = aggregate_story_counts(bottom3_df, 'bottom3', denominator)

# --- Merge summaries into a single table ---
# The outer join keeps a model that appears in only one summary; the cells it
# lacks on the other side are filled with a zero total and a '0.0%' agreement.
missing_defaults = {
    'top3_total': 0,
    'top3_agreement': '0.0%',
    'bottom3_total': 0,
    'bottom3_agreement': '0.0%',
}
summary = top3_summary.merge(bottom3_summary, on='model', how='outer')
summary = summary.fillna(missing_defaults)

report_columns = ['model', 'top3_total', 'top3_agreement', 'bottom3_total', 'bottom3_agreement']
print("Table A-2 (Right Panel)")
print(summary[report_columns])

Table A-2 (Right Panel)
         model  top3_total top3_agreement  bottom3_total bottom3_agreement
0   ChatGPT4.5          19          90.5%             16             76.2%
1   Gemini-2.5          18          85.7%             18             85.7%
2        Grok3          20          95.2%             21            100.0%
3  Notebook LM          21         100.0%             21            100.0%
4    OpenAI-o3          20          95.2%             20             95.2%
5      Sonnet4          17          81.0%             20             95.2%
