In [None]:
from service.json_service import load_evaluation_progress, load_common_evaluation_config
import pandas as pd
import os


# --- Input / output locations -------------------------------------------
evaluation_progress_path = "data/4_expert_evaluation/output_depseudonymized/"
llm_evaluation_progress_path = "data/5_llm_evaluation/evaluation_progress_llm-as-a-judge.json"
evaluation_config_path = "data/4_expert_evaluation/output_depseudonymized/common_evaluation_config.json"
participant_info_path = "data/6_analysis/participant_info.csv"
analysis_output_path = "data/6_analysis/"

# --- Ratings ------------------------------------------------------------
# Evaluation progress is loaded from both the expert output directory and the
# LLM-as-a-judge progress file.
df = load_evaluation_progress(evaluation_progress_path, llm_evaluation_progress_path)

# Attach the common evaluation config to every rating. A left join keeps all
# ratings even when no matching config row exists.
evaluation_config_df = load_common_evaluation_config(evaluation_config_path)
df = df.merge(
    evaluation_config_df,
    how="left",
    on=["exercise_id", "submission_id", "feedback_type"],
)

# --- Participants -------------------------------------------------------
# The LLM judge has no entry in the participant CSV, so a synthetic row is
# appended; it gets its own 'LLM' group and missing values for the
# questionnaire fields.
llm_participant_row = {
    'expert_id': 'llm',
    'evaluation_name': 'LLM as a judge',
    'link': '',
    'name': 'LLM',
    'study_program': '',
    'semester': pd.NA,
    'eist_participation': pd.NA,
    'pse_participation': pd.NA,
    'tutoring_experience': pd.NA,
    'group': 'LLM',
}
participant_info_df = pd.read_csv(participant_info_path, sep=";")
participant_info_df = pd.concat([participant_info_df, pd.DataFrame([llm_participant_row])])

# Left join again: ratings from experts missing in the participant table are kept.
df = df.merge(participant_info_df, how="left", on=["expert_id"])


In [None]:
# Columns kept for the analysis, in the order they are written to disk.
analysis_columns = [
    'expert_id',
    'study_program',
    'semester',
    'eist_participation',
    'pse_participation',
    'tutoring_experience',
    'group',
    'exercise_id',
    'submission_id',
    'feedback_type',
    'metric',
    'score',
    'exercise',
    'submission',
    'feedback',
]
df = df[analysis_columns]

# Write the reduced frame to CSV and read it straight back, so `df` from here
# on is exactly what the exported file contains.
evaluation_data_csv = os.path.join(analysis_output_path, "evaluation_data.csv")
df.to_csv(evaluation_data_csv, index=False)
df = pd.read_csv(evaluation_data_csv)

In [None]:
# Raters removed from every analysis below; the reason is noted next to each id.
excluded_expert_ids = [
    'e9eec3a2-9fe6-4974-a346-45fe43ab0590',  # Low variance and random ratings
]

# Keep only the ratings whose expert is not on the exclusion list.
is_excluded = df['expert_id'].isin(excluded_expert_ids)
df = df.loc[~is_excluded]

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl

# Global matplotlib style, applied once for every figure in the notebook.
# (`plt.rcParams` and `mpl.rcParams` are the same object, so one update suffices.)
mpl.rcParams.update({
    # Colour cycle and font
    'axes.prop_cycle': mpl.cycler(color=['#105391', '#68A0C6', '#9BC6E8', '#999999', '#E07430', '#A1AC23', '#DAD7CC']),
    'font.family': 'Helvetica Neue',

    # Box plot median: thick black line
    'boxplot.medianprops.color': 'black',
    'boxplot.medianprops.linewidth': 2.5,

    # Box plot mean: white triangle with black border
    'boxplot.meanprops.marker': '^',
    'boxplot.meanprops.markerfacecolor': 'white',
    'boxplot.meanprops.markeredgecolor': 'black',

    # Legend in the lower right corner
    'legend.loc': 'lower right',

    # Default figure size (inches)
    'figure.figsize': (6, 4),
})

# Fixed category orders so every plot uses the same arrangement.
metric_order = sorted(df['metric'].dropna().drop_duplicates())
feedback_type_order = ['Cofee', 'Tutor', 'LLM']
group_order = ['Student', 'Instructor']

# Optional: drop ratings with a score of 0 (currently disabled).
# df = df[df['score'] > 0]

In [None]:
from service.plot_service import plot_boxplot

# One box plot per rater-group combination: feedback type on the x axis,
# one box per metric.
rater_group_combinations = [['Student'], ['Instructor'], ['LLM'], ['Student', 'Instructor']]

for group in rater_group_combinations:
    # Human-readable label for the title and file-name stem for the saved figure.
    group_label = "s & ".join(group)
    group_stem = "_".join(group)

    df_group = df.loc[df['group'].isin(group)]

    plot_boxplot(
        df=df_group,
        x='feedback_type',
        x_order=feedback_type_order,
        y='score',
        hue='metric',
        hue_order=metric_order,
        title=f'Assessment Scores by Feedback Type and Metric ({group_label}s):',
        xlabel='',
        ylabel='Score',
        legend_title='Metric',
        plot_path=analysis_output_path,
        filename=f'{group_stem}_feedback_type_metric_boxplot.png',
    )

In [None]:
from service.plot_service import plot_boxplot

# One box plot per rater-group combination: metric on the x axis,
# one box per feedback type.
rater_group_combinations = [['Student'], ['Instructor'], ['LLM'], ['Student', 'Instructor']]

for group in rater_group_combinations:
    # Human-readable label for the title and file-name stem for the saved figure.
    group_label = "s & ".join(group)
    group_stem = "_".join(group)

    df_group = df.loc[df['group'].isin(group)]

    plot_boxplot(
        df=df_group,
        x='metric',
        x_order=metric_order,
        y='score',
        hue='feedback_type',
        hue_order=feedback_type_order,
        title=f'Assessment Scores by Metric & Feedback Type ({group_label}s):',
        xlabel='',
        ylabel='Score',
        legend_title='Feedback Type',
        plot_path=analysis_output_path,
        filename=f'{group_stem}_metric_feedback_type_boxplot.png',
    )

In [None]:
from service.plot_service import plot_boxplot

# One box plot per feedback type: metric on the x axis, one box per human
# rater group (the LLM judge is excluded).
for feedback_type in feedback_type_order:
    df_feedback = df[df['feedback_type'] == feedback_type]
    df_feedback = df_feedback[df_feedback['group'] != 'LLM']
    plt.figure(figsize=(5,4))
    plot_boxplot(
        df = df_feedback,
        x = 'metric',
        y = 'score',
        hue = 'group',
        x_order = metric_order,
        hue_order = group_order,
        title = f'Assessment Scores by Metric & Feedback Source ({feedback_type}):',
        ylabel = 'Score',
        xlabel = '',
        legend_title = 'Group',
        plot_path = analysis_output_path,
        # The file name must depend on the loop variable of THIS loop. It was
        # previously built from `group`, a variable left over from an earlier
        # cell, so every iteration wrote to the same file and overwrote the
        # previous figure (and the cell failed on a fresh kernel if the earlier
        # cell had not been run).
        filename = f'{feedback_type}_metric_group_boxplot.png'
    )

In [None]:
from service.plot_service import plot_boxplot

# One box plot per feedback type: human rater group on the x axis, one box per
# metric (the LLM judge is excluded).
for feedback_type in feedback_type_order:
    df_feedback = df[df['feedback_type'] == feedback_type]
    df_feedback = df_feedback[df_feedback['group'] != 'LLM']
    plot_boxplot(
        df = df_feedback,
        x = 'group',
        y = 'score',
        hue = 'metric',
        x_order = group_order,
        hue_order = metric_order,
        title = f'Assessment Scores by Feedback Source & Metric ({feedback_type}):',
        ylabel = 'Score',
        xlabel = '',
        # The hue dimension is the metric, so the legend lists metrics, not groups.
        legend_title = 'Metric',
        plot_path = analysis_output_path,
        filename = f'{feedback_type}_feedback_source_metric_boxplot.png'
    )

In [None]:
# Snapshot of the cleaned ratings. The analysis cells below each start from
# this frame and rebind `df`, so they do not affect one another.
evaluation_df = df.copy(deep=True)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ---------------------------------------------------------
# 1. SETUP & PRE-PROCESSING
# ---------------------------------------------------------
# Rater reliability screen for student raters: for every rater, compare each
# rating with the median rating of the same item and summarise
#   * avg_deviation - mean absolute distance to that median, and
#   * std_score     - standard deviation of the rater's own scores.

# Independent copy restricted to student raters; the explicit .copy() makes the
# column assignments below operate on a real frame rather than a filtered view.
df = evaluation_df[evaluation_df['group'] == 'Student'].copy()

# Ensure score is numeric
# df['score'] = pd.to_numeric(df['score'], errors='coerce')
# df['score'] = df['score'].replace(0, np.nan)
df = df.dropna(subset=['score'])

# The column selection applied before the CSV export does not include 'name',
# so grouping by it would raise a KeyError. Fall back to the expert id as the
# display label when the column is absent.
if 'name' not in df.columns:
    df['name'] = df['expert_id']

# ---------------------------------------------------------
# 2. CALCULATE CONSENSUS & DEVIATION
# ---------------------------------------------------------

# Columns that identify one rated item. Mock data carries a single
# 'unique_item_id'; otherwise an item is a (submission, feedback type, metric)
# combination.
if 'unique_item_id' in df.columns:
    group_cols = ['unique_item_id']
else:
    group_cols = ['submission_id', 'feedback_type', 'metric']

# A. Peer consensus: median score per item. transform() returns a Series
#    aligned with df, so every rating row gets the median of its item.
df['consensus_score'] = df.groupby(group_cols)['score'].transform('median')

# B. Absolute deviation of each rating from its item's consensus.
df['deviation'] = (df['score'] - df['consensus_score']).abs()

# ---------------------------------------------------------
# 3. AGGREGATE PER EXPERT
# ---------------------------------------------------------
expert_stats = df.groupby('name').agg(
    avg_deviation=('deviation', 'mean'),  # Agreement score (lower is better)
    std_score=('score', 'std'),           # Spread of the rater's own scores
    count=('score', 'count')              # Number of ratings given
).reset_index()

# Drop raters with 5 or fewer ratings; their statistics are too unstable.
expert_stats = expert_stats[expert_stats['count'] > 5]

# ---------------------------------------------------------
# 4. VISUALIZE
# ---------------------------------------------------------
plt.figure(figsize=(10, 8))

# One point per rater; point size encodes the number of ratings.
sns.scatterplot(
    data=expert_stats,
    x='avg_deviation',
    y='std_score',
    size='count',
    sizes=(50, 400),
    alpha=0.7,
    edgecolor='black'
)

# Disagreement threshold: 90th percentile of avg_deviation, i.e. the 10% of
# raters that deviate most from their peers lie to the right of the line.
mean_dev_threshold = expert_stats['avg_deviation'].quantile(0.90)
plt.axvline(mean_dev_threshold, color='red', linestyle='--', alpha=0.5, label='High Disagreement Zone')

# Label raters with high deviation OR a very small score spread.
# NOTE(review): the plot labels raters with std < 0.5, while the printed
# flatliner list below uses std < 0.6 - confirm whether that is intended.
for i, row in expert_stats.iterrows():
    if row['avg_deviation'] > mean_dev_threshold or row['std_score'] < 0.5:
        plt.text(
            row['avg_deviation']+0.02,
            row['std_score'],
            row['name'],
            fontsize=9,
            color='darkred',
            weight='bold'
        )

plt.title('Rater Reliability Analysis: Outlier Detection', fontsize=15)
plt.xlabel('Average Disagreement with Peers (Mean Absolute Deviation)\n(Right = High Disagreement)', fontsize=11)
plt.ylabel('Rating Variance (Standard Deviation)\n(Bottom = Always clicks same button)', fontsize=11)
plt.legend(title='Number of Ratings', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, linestyle='--', alpha=0.4)

plt.tight_layout()
plt.show()

# ---------------------------------------------------------
# 5. PRINT SUSPICIOUS ID LIST
# ---------------------------------------------------------
print("--- SUSPICIOUS RATERS ---")
# 1. The Disagree-ers (High Deviation)
outliers = expert_stats[expert_stats['avg_deviation'] > mean_dev_threshold]
print("\n[High Disagreement] (Consistently deviated from group consensus):")
print(outliers[['name', 'avg_deviation', 'count']].sort_values('avg_deviation', ascending=False))

# 2. The Flatliners (Low Variance)
flatliners = expert_stats[expert_stats['std_score'] < 0.6]
print("\n[Flatliners] (Variance < 0.6, likely clicked same button repeatedly):")
print(flatliners[['name', 'std_score', 'count']])

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ---------------------------------------------------------
# 1. SETUP & PRE-PROCESSING
# ---------------------------------------------------------
# Rater reliability screen for student raters with one "rater vs. consensus"
# scatter plot per suspicious rater.

# Raters whose score standard deviation is below this value are treated as
# low-variance ("flatliner") candidates. Used for labelling, for selecting the
# suspicious raters AND for the flatliner interpretation, so the three stay in
# sync (the interpretation previously still used the old value 0.5).
LOW_STD_THRESHOLD = 0.8

# Seeded generator so the jitter in the per-rater plots is reproducible.
jitter_rng = np.random.default_rng(42)

# Independent copy restricted to student raters; the explicit .copy() makes the
# column assignments below operate on a real frame rather than a filtered view.
df = evaluation_df[evaluation_df['group'] == 'Student'].copy()

# Ensure score is numeric
# df['score'] = pd.to_numeric(df['score'], errors='coerce')
# df['score'] = df['score'].replace(0, np.nan)
df = df.dropna(subset=['score'])

# Fall back to the expert id as display label when no 'name' column exists.
if 'name' not in df.columns:
    df['name'] = df['expert_id']

# ---------------------------------------------------------
# 2. CALCULATE CONSENSUS & DEVIATION
# ---------------------------------------------------------

# Columns that identify one rated item. Mock data carries a single
# 'unique_item_id'; otherwise an item is a (submission, feedback type, metric)
# combination.
if 'unique_item_id' in df.columns:
    group_cols = ['unique_item_id']
else:
    group_cols = ['submission_id', 'feedback_type', 'metric']

# A. Peer consensus: median score per item, aligned with the rating rows.
df['consensus_score'] = df.groupby(group_cols)['score'].transform('median')

# B. Absolute deviation of each rating from its item's consensus.
df['deviation'] = (df['score'] - df['consensus_score']).abs()

# ---------------------------------------------------------
# 3. AGGREGATE PER EXPERT
# ---------------------------------------------------------
# Group by id and name so both stay available as columns.
expert_stats = df.groupby(['expert_id', 'name']).agg(
    avg_deviation=('deviation', 'mean'),  # Agreement score (lower is better)
    std_score=('score', 'std'),           # Spread of the rater's own scores
    count=('score', 'count')              # Number of ratings given
).reset_index()

# Drop raters with 5 or fewer ratings; their statistics are too unstable.
expert_stats = expert_stats[expert_stats['count'] > 5]

# ---------------------------------------------------------
# 4. VISUALIZE: GLOBAL OUTLIER DETECTION
# ---------------------------------------------------------
plt.figure(figsize=(10, 8))

# One point per rater; point size encodes the number of ratings.
sns.scatterplot(
    data=expert_stats,
    x='avg_deviation',
    y='std_score',
    size='count',
    sizes=(50, 400),
    alpha=0.7,
    edgecolor='black'
)

# Disagreement threshold: the median of avg_deviation, i.e. the half of the
# raters that deviates most from their peers lies to the right of the line.
mean_dev_threshold = expert_stats['avg_deviation'].quantile(0.50)
plt.axvline(mean_dev_threshold, color='red', linestyle='--', alpha=0.5, label='High Disagreement Zone')

# Label raters with high deviation OR a small score spread.
for i, row in expert_stats.iterrows():
    if row['avg_deviation'] > mean_dev_threshold or row['std_score'] < LOW_STD_THRESHOLD:
        plt.text(
            row['avg_deviation']+0.02,
            row['std_score'],
            row['name'],
            fontsize=9,
            color='darkred',
            weight='bold'
        )

plt.title('Rater Reliability Analysis: Outlier Detection', fontsize=15)
plt.xlabel('Average Disagreement with Peers (Mean Absolute Deviation)', fontsize=11)
plt.ylabel('Rating Variance (Standard Deviation)', fontsize=11)
plt.legend(title='Number of Ratings', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, linestyle='--', alpha=0.4)
plt.tight_layout()
plt.show()

# ---------------------------------------------------------
# 5. DETAILED ANALYSIS: INDIVIDUAL PLOTS FOR OUTLIERS
# ---------------------------------------------------------
print("\n--- GENERATING INDIVIDUAL BIAS PLOTS FOR OUTLIERS ---")

# Suspicious raters: high disagreement OR low score spread.
suspicious_mask = (expert_stats['avg_deviation'] > mean_dev_threshold) | (expert_stats['std_score'] < LOW_STD_THRESHOLD)
suspicious_raters = expert_stats[suspicious_mask]

if suspicious_raters.empty:
    print("No significant outliers found to analyze.")
else:
    for idx, rater in suspicious_raters.iterrows():
        rater_id = rater['expert_id']
        rater_name = rater['name']

        # All ratings given by this rater.
        rater_data = df[df['expert_id'] == rater_id].copy()

        # Scores are discrete, so identical (consensus, score) pairs would
        # overlap exactly. Add uniform noise in [-0.15, 0.15) on both axes.
        rater_data['x_jitter'] = rater_data['consensus_score'] + jitter_rng.uniform(-0.15, 0.15, len(rater_data))
        rater_data['y_jitter'] = rater_data['score'] + jitter_rng.uniform(-0.15, 0.15, len(rater_data))

        plt.figure(figsize=(6, 6))

        # Rater's score (y) against peer consensus (x).
        plt.scatter(
            rater_data['x_jitter'],
            rater_data['y_jitter'],
            alpha=0.6,
            c='teal',
            edgecolor='k'
        )

        # Diagonal = rating identical to the consensus.
        plt.plot([1, 5], [1, 5], color='red', linestyle='--', linewidth=2, label='Perfect Agreement')

        plt.title(f"Bias Check: {rater_name}", fontsize=14)
        plt.xlabel("Peer Consensus (Median)", fontsize=12)
        plt.ylabel(f"{rater_name}'s Rating", fontsize=12)
        plt.xlim(0.5, 5.5)
        plt.ylim(0.5, 5.5)
        plt.grid(True, linestyle='--', alpha=0.3)
        plt.legend()

        # Heuristic interpretation. `slope_check` is the mean signed difference
        # between the rater's score and the consensus (negative = rates lower
        # than peers, positive = rates higher).
        slope_check = np.mean(rater_data['score'] - rater_data['consensus_score'])
        if abs(slope_check) < 0.3 and rater['std_score'] < LOW_STD_THRESHOLD:
            interp = "Likely FLATLINER (Central Tendency)"
        elif slope_check < -0.5:
            interp = "Likely HARSH/CRITICAL Bias"
        elif slope_check > 0.5:
            interp = "Likely LENIENT Bias"
        elif rater['avg_deviation'] > 1.2:
            interp = "Likely RANDOM/NOISY Inputs"
        else:
            interp = "Moderate Disagreement"

        plt.figtext(0.5, 0.01, f"Pattern: {interp}", ha="center", fontsize=10, bbox={"facecolor":"orange", "alpha":0.2, "pad":5})

        plt.tight_layout()
        plt.show()

# ---------------------------------------------------------
# 6. PRINT SUMMARY LIST
# ---------------------------------------------------------
print("\n--- SUSPICIOUS RATERS SUMMARY ---")
print(suspicious_raters[['name', 'avg_deviation', 'std_score', 'count']].sort_values('avg_deviation', ascending=False))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats # Added for Z-score calculation

# ---------------------------------------------------------
# 1. SETUP & PRE-PROCESSING
# ---------------------------------------------------------
# Rater reliability screen for student raters using z-scores of the per-rater
# statistics instead of hand-picked thresholds.

# Seeded generator so the jitter in the per-rater plots is reproducible.
jitter_rng = np.random.default_rng(42)

# Independent copy restricted to student raters; the explicit .copy() makes the
# column assignments below operate on a real frame rather than a filtered view.
df = evaluation_df[evaluation_df['group'] == 'Student'].copy()

# Ensure score is numeric
# df['score'] = pd.to_numeric(df['score'], errors='coerce')
# df['score'] = df['score'].replace(0, np.nan)
df = df.dropna(subset=['score'])

# Fall back to the expert id as display label when no 'name' column exists.
if 'name' not in df.columns:
    df['name'] = df['expert_id']

# ---------------------------------------------------------
# 2. CALCULATE CONSENSUS & DEVIATION
# ---------------------------------------------------------

# Columns that identify one rated item. Mock data carries a single
# 'unique_item_id'; otherwise an item is a (submission, feedback type, metric)
# combination.
if 'unique_item_id' in df.columns:
    group_cols = ['unique_item_id']
else:
    group_cols = ['submission_id', 'feedback_type', 'metric']

# A. Peer consensus: median score per item, aligned with the rating rows.
df['consensus_score'] = df.groupby(group_cols)['score'].transform('median')

# B. Absolute deviation of each rating from its item's consensus.
df['deviation'] = (df['score'] - df['consensus_score']).abs()

# ---------------------------------------------------------
# 3. AGGREGATE PER EXPERT
# ---------------------------------------------------------
# Group by id and name so both stay available as columns.
expert_stats = df.groupby(['expert_id', 'name']).agg(
    avg_deviation=('deviation', 'mean'),  # Agreement score (lower is better)
    std_score=('score', 'std'),           # Spread of the rater's own scores
    count=('score', 'count')              # Number of ratings given
).reset_index()

# Drop raters with 5 or fewer ratings; their statistics are too unstable.
expert_stats = expert_stats[expert_stats['count'] > 5]

# ---------------------------------------------------------
# 4. STATISTICAL OUTLIER TEST (Z-SCORES)
# ---------------------------------------------------------
# Z = (value - mean) / std, computed across raters.
# |Z| > 1.96 corresponds to the outer 2.5% on either side of a normal distribution.

# Test 1: disagreement - is the rater's avg_deviation unusually high?
expert_stats['z_deviation'] = stats.zscore(expert_stats['avg_deviation'])

# Test 2: flatlining - is the rater's score spread unusually low?
expert_stats['z_variance'] = stats.zscore(expert_stats['std_score'])

Z_THRESHOLD = 1.96

# Outlier = high disagreement (z_deviation > 1.96) OR significantly less
# spread than the other raters (z_variance < -1.96).
outlier_mask = (expert_stats['z_deviation'] > Z_THRESHOLD) | (expert_stats['z_variance'] < -Z_THRESHOLD)

# ---------------------------------------------------------
# 5. VISUALIZE: GLOBAL OUTLIER DETECTION
# ---------------------------------------------------------
plt.figure(figsize=(12, 9))

# One point per rater; size encodes the number of ratings, colour marks outliers.
sns.scatterplot(
    data=expert_stats,
    x='avg_deviation',
    y='std_score',
    size='count',
    sizes=(50, 400),
    alpha=0.7,
    edgecolor='black',
    hue=outlier_mask,
    palette={True: 'red', False: 'teal'}
)

# Raw values that correspond to the z-score thresholds. scipy.stats.zscore
# uses the population standard deviation (ddof=0) by default while pandas'
# .std() defaults to ddof=1, so ddof=0 is passed explicitly; otherwise the
# drawn lines would not coincide with the boundary used by outlier_mask.
mean_dev = expert_stats['avg_deviation'].mean()
std_dev = expert_stats['avg_deviation'].std(ddof=0)
plot_dev_threshold = mean_dev + (Z_THRESHOLD * std_dev)

mean_var = expert_stats['std_score'].mean()
std_var = expert_stats['std_score'].std(ddof=0)
plot_var_threshold = mean_var - (Z_THRESHOLD * std_var)

# Threshold lines
plt.axvline(plot_dev_threshold, color='darkred', linestyle='--', alpha=0.5, label=f'High Disagreement (Z > {Z_THRESHOLD})')
plt.axhline(plot_var_threshold, color='orange', linestyle='--', alpha=0.5, label=f'Suspected Flatliner (Z < -{Z_THRESHOLD})')

# Label the outliers
for i, row in expert_stats[outlier_mask].iterrows():
    plt.text(
        row['avg_deviation']+0.02,
        row['std_score'],
        f"{row['name']}",
        fontsize=9,
        color='darkred',
        weight='bold'
    )

plt.title('Rater Reliability: Statistical Outlier Detection (Z-Score Method)', fontsize=15)
plt.xlabel('Average Disagreement with Peers (Mean Absolute Deviation)', fontsize=11)
plt.ylabel('Rating Variance (Standard Deviation)', fontsize=11)
plt.legend(title='Statistical Outlier', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, linestyle='--', alpha=0.4)
plt.tight_layout()
plt.show()

# ---------------------------------------------------------
# 6. DETAILED ANALYSIS: INDIVIDUAL PLOTS FOR OUTLIERS
# ---------------------------------------------------------
print("\n--- GENERATING INDIVIDUAL BIAS PLOTS FOR STATISTICAL OUTLIERS ---")

suspicious_raters = expert_stats[outlier_mask]

if suspicious_raters.empty:
    print("No statistically significant outliers found (p < 0.05). Your data is clean!")
else:
    for idx, rater in suspicious_raters.iterrows():
        rater_id = rater['expert_id']
        rater_name = rater['name']

        # All ratings given by this rater.
        rater_data = df[df['expert_id'] == rater_id].copy()

        # Uniform noise in [-0.15, 0.15) on both axes so identical
        # (consensus, score) pairs do not overlap exactly.
        rater_data['x_jitter'] = rater_data['consensus_score'] + jitter_rng.uniform(-0.15, 0.15, len(rater_data))
        rater_data['y_jitter'] = rater_data['score'] + jitter_rng.uniform(-0.15, 0.15, len(rater_data))

        plt.figure(figsize=(6, 6))

        # Rater's score (y) against peer consensus (x).
        plt.scatter(
            rater_data['x_jitter'],
            rater_data['y_jitter'],
            alpha=0.6,
            c='red',
            edgecolor='k'
        )

        # Diagonal = rating identical to the consensus.
        plt.plot([0, 5], [0, 5], color='grey', linestyle='--', linewidth=2, label='Perfect Agreement')

        plt.title(f"Bias Check: {rater_name} (Z-Dev: {rater['z_deviation']:.2f})", fontsize=14)
        plt.xlabel("Peer Consensus (Median)", fontsize=12)
        plt.ylabel(f"{rater_name}'s Rating", fontsize=12)
        plt.xlim(-0.5, 5.5)
        plt.ylim(-0.5, 5.5)
        plt.grid(True, linestyle='--', alpha=0.3)
        plt.legend()

        # Interpretation: low spread takes precedence; otherwise classify the
        # disagreement by the mean signed difference to the consensus.
        interp = "Unknown Pattern"
        if rater['z_variance'] < -Z_THRESHOLD:
            interp = "FLATLINER (Statistically Low Variance)"
        elif rater['z_deviation'] > Z_THRESHOLD:
            slope_check = np.mean(rater_data['score'] - rater_data['consensus_score'])
            if slope_check < -0.5:
                interp = "High Disagreement (Harsh Bias)"
            elif slope_check > 0.5:
                interp = "High Disagreement (Lenient Bias)"
            else:
                interp = "High Disagreement (Random/Noisy)"

        plt.figtext(0.5, -0.02, f"Diagnosis: {interp}", ha="center", fontsize=10, bbox={"facecolor":"salmon", "alpha":0.2, "pad":5})

        plt.tight_layout()
        plt.show()

# ---------------------------------------------------------
# 7. PRINT SUMMARY LIST
# ---------------------------------------------------------
print("\n--- STATISTICAL OUTLIERS (Z > 1.96 or Z < -1.96) ---")
summary_cols = ['name', 'avg_deviation', 'z_deviation', 'std_score', 'z_variance']
print(suspicious_raters[summary_cols].sort_values('z_deviation', ascending=False))

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import scikit_posthocs as sp

# ---------------------------------------------------------
# 1. SETUP
# ---------------------------------------------------------
# Per metric: compare the feedback types with a Friedman test on
# submission-level median ranks, followed by a Nemenyi post-hoc test when the
# Friedman test is significant.

# Human raters only: the LLM judge is excluded. The explicit .copy() makes the
# column assignments below operate on a real frame rather than a filtered view.
df = evaluation_df[evaluation_df['group'] != "LLM"].copy()

# Pre-processing: coerce scores to numbers and treat a score of 0 as missing.
df['score'] = pd.to_numeric(df['score'], errors='coerce')
df['score'] = df['score'].replace(0, np.nan)

# Drop missing metric labels; a NaN entry would break metric.upper() below.
unique_metrics = df['metric'].dropna().unique()

print(f"Total raw ratings: {len(df)}")

# ---------------------------------------------------------
# 2. ROBUST ANALYSIS LOOP (RANK-THEN-AGGREGATE)
# ---------------------------------------------------------

for metric in unique_metrics:
    print(f"\n\n================ {metric.upper()} (SUBMISSION-LEVEL / RANK-BASED) ================")

    # Ratings for this metric only
    df_metric = df[df['metric'] == metric]

    # --- STEP 1: PIVOT TO RATER LEVEL ---
    # One row per (rater, submission), one column per feedback type, so the
    # feedback types can be ranked against each other within a row.
    # pivot_table averages duplicate entries (default aggfunc).
    student_level = df_metric.pivot_table(
        index=['expert_id', 'submission_id'],
        columns='feedback_type',
        values='score'
    )

    # Keep only rows where the rater scored every feedback type; ranks are not
    # comparable otherwise.
    student_level = student_level.dropna()

    print(f"Valid Student Ratings (Raw Count): {len(student_level)}")

    if len(student_level) < 5:
        print("Not enough data to proceed.")
        continue

    # --- STEP 2: CALCULATE RANKS (per rater and submission) ---
    # Ranking within a row removes each rater's overall harshness/leniency:
    # only the relative order of the feedback types is kept. Ties share the
    # average rank; a higher score gives a higher rank.
    student_ranks = student_level.rank(axis=1, method='average', ascending=True)

    # --- STEP 3: AGGREGATE BY SUBMISSION (median of ranks) ---
    submission_level = student_ranks.groupby(level='submission_id').median()

    # Safety net: drop submissions with a missing rank after aggregation.
    clean_data = submission_level.dropna()

    print(f"Valid Submissions for Analysis: {len(clean_data)}")

    if len(clean_data) < 5:
        print("Not enough submissions.")
        continue

    # scipy's friedmanchisquare raises a ValueError for fewer than three
    # samples, so skip metrics where fewer than three feedback types remain.
    if clean_data.shape[1] < 3:
        print("Friedman test requires at least 3 feedback types; skipping.")
        continue

    # --- STEP 4: FRIEDMAN TEST (on median ranks) ---
    # Mean of the median ranks per feedback type, for display and for deciding
    # the direction of significant pairwise differences.
    final_mean_ranks = clean_data.mean().sort_values(ascending=False)

    stat, p_value = stats.friedmanchisquare(*[clean_data[col] for col in clean_data.columns])

    print("\n--- Results ---")
    print(f"Friedman p-value: {p_value:.5f}")
    print("Mean of Median Ranks (Higher is better):")
    print(final_mean_ranks)

    if p_value < 0.05:
        print("\n--- Post-Hoc: Nemenyi Test ---")

        nemenyi = sp.posthoc_nemenyi_friedman(clean_data.reset_index(drop=True))

        # Report every pair once, highlighting the significant ones.
        print("Significant Differences (p < 0.05):")
        cols = clean_data.columns
        any_sig = False
        for i in range(len(cols)):
            for j in range(i+1, len(cols)):
                t1, t2 = cols[i], cols[j]
                p_val = nemenyi.loc[t1, t2]
                if p_val < 0.05:
                    any_sig = True
                    # The feedback type with the higher mean rank "wins".
                    winner = t1 if final_mean_ranks[t1] > final_mean_ranks[t2] else t2
                    loser = t2 if winner == t1 else t1
                    print(f"  * {winner} beats {loser} (p={p_val:.4f})")
                else:
                    print(f"  - No significant difference between {t1} and {t2} (p={p_val:.4f})")

        if not any_sig:
            print("  (None found despite global significance)")

    else:
        print("Result: No significant difference found.")