In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from itertools import combinations

# === Common Utility Functions =================================================================

def fdr_correction(p_values, alpha=0.05):
    """Return Benjamini-Hochberg (FDR) adjusted p-values.

    Parameters
    ----------
    p_values : 1-D sequence of float
        Raw p-values, in any order.
    alpha : float, optional
        Not used by the adjustment itself; accepted so existing callers that
        pass a significance level keep working. Compare the returned values
        against the desired level to decide significance.

    Returns
    -------
    numpy.ndarray
        Adjusted p-values as float64, in the same order as ``p_values``.
        An empty input gives an empty array.

    Notes
    -----
    NaN p-values still count toward the number of tests and receive an
    adjusted value of 1.0 (they can never be declared significant).
    """
    p = np.asarray(p_values, dtype=float)
    n = len(p)
    if n == 0:
        return np.zeros(0)

    # Ascending order of the raw p-values; NaN entries sort to the end.
    order = np.argsort(p)
    ranks = np.arange(1, n + 1)

    # Raw BH value p_(i) * n / i, capped at 1. np.fmin ignores NaN, so a NaN
    # p-value is mapped to 1.0 instead of propagating.
    scaled = np.fmin(1.0, p[order] * n / ranks)

    # Enforce monotonicity: each adjusted value is the minimum of itself and
    # every value at a higher rank (running minimum taken from the right).
    monotone = np.minimum.accumulate(scaled[::-1])[::-1]

    # Scatter back to the original input order.
    adjusted = np.empty(n)
    adjusted[order] = monotone
    return adjusted

def cronbach_alpha(data):
    """Compute Cronbach's alpha for a 2-D score matrix.

    Each column of ``data`` is treated as one item and each row as one
    case: the per-column sample variances are summed and compared with
    the sample variance of the per-row totals.

        alpha = k / (k - 1) * (1 - sum(column variances) / var(row totals))

    where ``k`` is the number of columns. Sample variances use ``ddof=1``.
    """
    # Unpacking requires exactly two dimensions; only the column count is used.
    _, k = data.shape

    column_variances = np.var(data, axis=0, ddof=1)
    row_totals = np.sum(data, axis=1)
    variance_ratio = np.sum(column_variances) / np.var(row_totals, ddof=1)

    return (k / (k - 1)) * (1 - variance_ratio)

def count_significant_pairs(data, alpha=0.05, method='ttest'):
    """Count pairs that remain significant after FDR correction.

    Parameters
    ----------
    data : 2-D array-like
        Score matrix.
    alpha : float, optional
        Significance level applied to the FDR-corrected p-values.
    method : {'ttest', 'spearman'}, optional
        ``'ttest'`` runs a paired t-test on every pair of ROWS;
        ``'spearman'`` runs a Spearman rank correlation on every pair of
        COLUMNS.

    Returns
    -------
    tuple of (int, int)
        ``(number of significant pairs, total number of pairs)``.
        ``(0, 0)`` when there are fewer than two rows/columns to pair up.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    data = np.asarray(data)

    # Validate the method up front: previously an unknown name fell through
    # both branches and failed later on an unbound variable.
    if method == 'ttest':
        n = data.shape[0]

        def pair_p_value(i, j):
            _, p = stats.ttest_rel(data[i], data[j])
            return p
    elif method == 'spearman':
        n = data.shape[1]

        def pair_p_value(i, j):
            _, p = stats.spearmanr(data[:, i], data[:, j])
            return p
    else:
        raise ValueError(
            f"Unknown method {method!r}; expected 'ttest' or 'spearman'"
        )

    p_values = [pair_p_value(i, j) for i, j in combinations(range(n), 2)]
    if not p_values:
        # Nothing to compare, so nothing to correct.
        return 0, 0

    corrected_p = np.asarray(fdr_correction(p_values, alpha))
    # NaN corrected values compare False and are therefore not counted.
    sig_count = int(np.sum(corrected_p < alpha))
    return sig_count, len(p_values)

# === Load Data ================================================================================

# Score table in long format; the analysis below reads its 'model', 'story',
# 'session' and 'score' columns.
df = pd.read_csv('official_row_data.csv', encoding='utf-8')

# Models to analyse, in processing order.
models = [
    'ChatGPT4.5',
    'Notebook LM',
    'Gemini-2.5',
    'Grok3',
    'Sonnet4',
    'OpenAI-o3',
]

# Fixed story order used to align the rows of every per-model matrix.
stories = [
    'Story A', 'Story B', 'Story C', 'Story D', 'Story E',
    'Story F', 'Story G', 'Story H', 'Story I', 'Story J',
]

# === Analysis per Model =======================================================================

results = []

# (label shown in the output table, method name for count_significant_pairs).
# 'ttest' compares every pair of matrix rows (stories); 'spearman' compares
# every pair of matrix columns (sessions).
test_specs = [
    ('Pairwise t-tests', 'ttest'),
    ('Rank correlation tests', 'spearman'),
]

for model in models:
    # Rows = stories (in the fixed `stories` order), columns = sessions.
    # NOTE(review): a story with no rows for this model becomes an all-NaN
    # row after reindex -- confirm the CSV is complete for every model.
    pivot = (
        df[df['model'] == model]
        .pivot(index='story', columns='session', values='score')
        .reindex(stories)
    )
    matrix = pivot.values

    for test_label, method in test_specs:
        sig_count, total = count_significant_pairs(matrix, method=method)

        # With fewer than two rows/columns there are no pairs; report NaN
        # rather than failing on a division by zero.
        if total:
            detection_rate = (sig_count / total) * 100
        else:
            detection_rate = float('nan')

        results.append({
            'Test Type': test_label,
            'Model': model,
            'Significant Pairs': f"{sig_count}/{total}",
            'Detection Rate (%)': detection_rate,
        })

# === Output Table D-3 ========================================================================

# One row per (model, test type) collected by the analysis loop.
results_df = pd.DataFrame(results)

# Group by test type, then list models from highest to lowest detection rate.
sort_columns = ['Test Type', 'Detection Rate (%)']
sort_ascending = [True, False]
sorted_df = results_df.sort_values(by=sort_columns, ascending=sort_ascending)
sorted_df = sorted_df.reset_index(drop=True)

table_title = "\nTable D-3: Pairwise Consistency Metrics by Model"
print(table_title)
# Numeric columns are rounded to one decimal place for display only.
print(sorted_df.round(1))
