In [None]:
import json
import os

# --- Input locations (relative to the notebook's working directory) ---
evaluation_progress_path = "data/4_expert_evaluation/output_depseudonymized/"
llm_evaluation_progress_path = "data/5_llm_evaluation/evaluation_progress_llm-as-a-judge.json"

evaluation_config_path = "data/4_expert_evaluation/output_depseudonymized/common_evaluation_config.json"

participant_info_path = "data/6_analysis/participant_info.csv"

# Naming convention of the per-evaluator progress files: <prefix><expert_id><suffix>
progress_file_prefix = "evaluation_progress_"
progress_file_suffix = ".json"

common_evaluation_config = {}
evaluation_progresses = {}

# Load common evaluation config
with open(evaluation_config_path, "r", encoding="utf-8") as config_file:
    common_evaluation_config = json.load(config_file)

# Load expert evaluation progress files.
# The listing is sorted so the evaluator order (and therefore the row order of
# every frame built from it) does not depend on the file system.
files = sorted(os.listdir(evaluation_progress_path))
for file in files:
    # Only real progress JSON files: skips backups, temp files or other formats
    # that merely share the prefix and would otherwise break json.load.
    if not (file.startswith(progress_file_prefix) and file.endswith(progress_file_suffix)):
        continue
    # Strip exactly the leading prefix and trailing suffix; the id itself may
    # legitimately contain either substring.
    expert_id = file[len(progress_file_prefix):-len(progress_file_suffix)]
    with open(os.path.join(evaluation_progress_path, file), "r", encoding="utf-8") as progress_file:
        evaluation_progresses[expert_id] = json.load(progress_file)

# Load optional LLM evaluation progress file; it is stored under the fixed id "llm"
if os.path.exists(llm_evaluation_progress_path):
    with open(llm_evaluation_progress_path, "r", encoding="utf-8") as llm_progress_file:
        llm_evaluation_progress = json.load(llm_progress_file)
        evaluation_progresses["llm"] = llm_evaluation_progress

In [None]:
import pandas as pd

# Dataframe from progress data.
# The nested structure
#   evaluator -> "selected_values" -> exercise -> submission -> feedback type -> metric
# is flattened into one row per individual score. The "meta" entry that sits next
# to the metrics is not a score and is skipped.
records = [
    {
        "expert_id": expert_id,
        "exercise_id": exercise_id,
        "submission_id": submission_id,
        "feedback_type": feedback_type,
        "metric": metric,
        "score": score,
    }
    for expert_id, evaluation_progress in evaluation_progresses.items()
    for exercise_id, exercise_data in evaluation_progress.get("selected_values", {}).items()
    for submission_id, submission_data in exercise_data.items()
    for feedback_type, feedback_data in submission_data.items()
    for metric, score in feedback_data.items()
    if metric != "meta"
]

# Ids arrive as JSON object keys (strings) and are cast to integers here.
rating_dtypes = {
    'expert_id': 'string',
    'exercise_id': 'int64',
    'submission_id': 'int64',
    'feedback_type': 'string',
    'metric': 'string',
    'score': 'float64',
}
evaluation_df = pd.DataFrame.from_records(records).astype(rating_dtypes)

# Dataframe from participant info (optional: skipped when the CSV is absent, in
# which case evaluation_df keeps only the rating columns).
if os.path.exists(participant_info_path):
    # Declared once so the CSV rows and the synthetic LLM row share identical dtypes.
    participant_dtypes = {
        'expert_id': 'string',
        'evaluation_name': 'string',
        'link': 'string',
        'name': 'string',
        'study_program': 'string',
        'semester': 'Int64',
        'eist_participation': 'boolean',
        'pse_participation': 'boolean',
        'tutoring_experience': 'boolean',
        'group': 'string',
    }
    participant_info_df = pd.read_csv(participant_info_path, delimiter=";").astype(participant_dtypes)

    # Synthetic participant entry for the LLM judge (id "llm"). It is cast to the
    # same nullable dtypes BEFORE concatenation: concatenating an untyped (object)
    # row would silently turn the string/Int64/boolean columns into object and
    # concatenate all-NA object columns, which recent pandas versions warn about.
    llm_participant_df = pd.DataFrame([{
        'expert_id': 'llm',
        'evaluation_name': 'LLM as a judge',
        'link': '',
        'name': 'LLM',
        'study_program': '',
        'semester': pd.NA,
        'eist_participation': pd.NA,
        'pse_participation': pd.NA,
        'tutoring_experience': pd.NA,
        'group': 'LLM',
    }]).astype(participant_dtypes)

    participant_info_df = pd.concat([participant_info_df, llm_participant_df], ignore_index=True)

    # Left join keeps every rating, including evaluators missing from the CSV.
    evaluation_df = evaluation_df.join(participant_info_df.set_index("expert_id"), on="expert_id", how="left")

# Dataframe from common evaluation config
def _iter_config_records(config):
    """Yield one record per (exercise, submission, feedback type) found in `config`.

    Each record carries the three join keys plus the exercise (without its
    "submissions"), the submission (without its "feedbacks") and the raw feedback.
    """
    for raw_exercise in config.get("exercises", []):
        # Exercise payload without the nested submissions
        exercise = {key: value for key, value in raw_exercise.items() if key != "submissions"}
        for raw_submission in raw_exercise.get("submissions", []):
            # Submission payload without the nested feedbacks
            submission = {key: value for key, value in raw_submission.items() if key != "feedbacks"}
            for feedback_type, raw_feedback in raw_submission.get("feedbacks", {}).items():
                yield {
                    'exercise_id': exercise['id'],
                    'submission_id': submission['id'],
                    'feedback_type': feedback_type,
                    'exercise': exercise,
                    'submission': submission,
                    'feedback': raw_feedback,
                }


records = list(_iter_config_records(common_evaluation_config))

config_key_dtypes = {
    'exercise_id': 'int64',
    'submission_id': 'int64',
    'feedback_type': 'string',
}
exercise_config_df = pd.DataFrame.from_records(records).astype(config_key_dtypes)

# Attach exercise / submission / feedback payloads to every rating row.
config_key_columns = ["exercise_id", "submission_id", "feedback_type"]
evaluation_df = evaluation_df.merge(exercise_config_df, on=config_key_columns, how="left")

evaluation_df.head()

In [None]:
# Only required columns: evaluator attributes, rating keys, the score itself and
# the exercise / submission / feedback payloads.
required_columns = [
    # evaluator
    'expert_id', 'study_program', 'semester',
    'eist_participation', 'pse_participation', 'tutoring_experience', 'group',
    # rating
    'exercise_id', 'submission_id', 'feedback_type', 'metric', 'score',
    # payloads from the common evaluation config
    'exercise', 'submission', 'feedback',
]
df = evaluation_df[required_columns]

In [None]:
# Display the column-restricted frame built in the previous cell.
df

In [None]:
# NOTE(review): this entire cell is commented out. It looks like an earlier,
# per-rating variant of the Friedman / Nemenyi analysis that the following cell
# performs at submission level; confirm whether it is still needed.
# import pandas as pd
# import numpy as np
# import scipy.stats as stats
# import scikit_posthocs as sp

# # ---------------------------------------------------------
# # 1. SETUP
# # ---------------------------------------------------------
# # Assuming 'evaluation_df' is already loaded in your Jupyter Notebook environment.
# # We create a copy to avoid modifying the original dataframe during processing.
# try:
#     df = evaluation_df.copy()
#     # Experts only where Expert is true
#     # df = df[df['Expert'] == True]
#     # Students only where Expert is false
#     # df = df[df['Expert'] == False]
# except NameError:
#     print("Error: 'evaluation_df' is not defined. Please ensure your dataframe is loaded.")
#     # For testing purposes only, you might uncomment the line below if loading from CSV
#     # df = pd.read_csv('your_data.csv')
#     df = pd.DataFrame() # Empty placeholder to prevent further errors if run standalone

# print("--- Data Loaded ---")
# if not df.empty:
#     print(f"Total Rows: {len(df)}")
#     if 'metric' in df.columns:
#         print(f"Metrics found: {df['metric'].unique()}")


# # ---------------------------------------------------------
# # 2. PRE-PROCESSING
# # ---------------------------------------------------------

# if not df.empty:
#     # Convert 0 to NaN (as 0 means "Not Ratable", not "Terrible")
#     # Ensure 'score' is numeric just in case
#     df['score'] = pd.to_numeric(df['score'], errors='coerce')
#     df['score'] = df['score'].replace(0, np.nan)


#     # ---------------------------------------------------------
#     # 3. STATISTICAL ANALYSIS LOOP (Per Metric)
#     # ---------------------------------------------------------

#     unique_metrics = df['metric'].unique()

#     for metric in unique_metrics:
#         print(f"\n\n================ ANALYSIS FOR: {metric.upper()} ================")
        
#         # 3.1 Filter
#         df_metric = df[df['metric'] == metric]
        
#         # 3.2 Pivot
#         pivoted = df_metric.pivot_table(
#             index=['expert_id', 'exercise_id', 'submission_id'], 
#             columns='feedback_type', 
#             values='score'
#         )
        
#         # 3.3 DROP MISSING (Strict Matched Pairs)
#         clean_data = pivoted.dropna()
        
#         print(f"Valid comparison tuples (Expert + Submission): {len(clean_data)}")
        
#         if len(clean_data) < 5:
#             print("Not enough data points for this metric.")
#             continue

#         # ---------------------------------------------------------
#         # 3.4 ASSUMPTION TESTING (The Pre-conditions)
#         # ---------------------------------------------------------
#         print("\n--- Precondition Check: Normality (Shapiro-Wilk) ---")
#         print("Rationale: If p < 0.05, data is NON-NORMAL. This justifies using Friedman instead of ANOVA.")
        
#         violated_normality = False
#         for tech in clean_data.columns:
#             # Shapiro-Wilk test for normality
#             # We need at least 3 data points for Shapiro-Wilk
#             if len(clean_data[tech]) >= 3:
#                 stat, p_shapiro = stats.shapiro(clean_data[tech])
                
#                 # Helper text for interpretation
#                 normality_status = "Normal" if p_shapiro > 0.05 else "Non-Normal (Violated)"
#                 if p_shapiro < 0.05: violated_normality = True
                    
#                 print(f"  {tech}: p={p_shapiro:.5f} -> {normality_status}")
#             else:
#                 print(f"  {tech}: Not enough data for Normality test")
        
#         if violated_normality:
#             print("-> CONCLUSION: Normality assumption violated. Friedman Test is the CORRECT choice.")
#         else:
#             print("-> CONCLUSION: Data looks Normal. You *could* use Repeated Measures ANOVA, but Friedman is still safe.")


#         # ---------------------------------------------------------
#         # 3.5 HYPOTHESIS TESTING
#         # ---------------------------------------------------------

#         # Rank data (handling the ordinal/bias nature)
#         ranked_data = clean_data.rank(axis=1, method='average', ascending=True)
#         mean_ranks = ranked_data.mean().sort_values(ascending=False)
        
#         print("\n--- Friedman Test ---")
#         stat, p_value = stats.friedmanchisquare(*[clean_data[col] for col in clean_data.columns])
#         print(f"p-value: {p_value:.5f}")
#         print("Mean Ranks (Higher is better):")
#         print(mean_ranks)
        
#         if p_value < 0.05:
#             print("\n--- Post-Hoc: Nemenyi Test ---")
            
#             nemenyi = sp.posthoc_nemenyi_friedman(clean_data.reset_index(drop=True))
            
#             # Filter for significant pairs to make reading easier
#             print("Significant Differences (p < 0.05):")
#             cols = clean_data.columns
#             any_sig = False
#             for i in range(len(cols)):
#                 for j in range(i+1, len(cols)):
#                     t1, t2 = cols[i], cols[j]
#                     p_val = nemenyi.loc[t1, t2]
#                     if p_val < 0.05:
#                         any_sig = True
#                         # Determine winner based on mean rank
#                         winner = t1 if mean_ranks[t1] > mean_ranks[t2] else t2
#                         loser = t2 if winner == t1 else t1
#                         print(f"  * {winner} beats {loser} (p={p_val:.4f})")
            
#             if not any_sig:
#                 print("  (None found despite global significance)")
                
#         else:
#             print("Result: No significant difference found.")
# else:
#     print("DataFrame is empty. Please check your data source.")

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import scikit_posthocs as sp

# ---------------------------------------------------------
# 1. SETUP
# ---------------------------------------------------------
# Choose which evaluators enter the analysis by (un)commenting one filter below.
df = evaluation_df

# df = df[df['Expert'] == True]  # Experts only
# df = df[df['Expert'] == False]  # Students only
# df = df[df['expert_id'] != "llm"] # Exclude LLM judge for this analysis
df = df[df['expert_id'] == "llm"] # LLM judge only

# Detach the selection from evaluation_df: assigning a column on a filtered slice
# would otherwise raise SettingWithCopyWarning, and evaluation_df must stay untouched.
df = df.copy()

# Pre-processing: scores of 0 are turned into missing values so they are not
# ranked as the lowest rating.
df['score'] = pd.to_numeric(df['score'], errors='coerce').replace(0, np.nan)

unique_metrics = df['metric'].unique()

# scipy's Friedman test rejects fewer than three related samples.
MIN_FRIEDMAN_GROUPS = 3

print(f"Total raw ratings: {len(df)}")

# ---------------------------------------------------------
# 2. ROBUST ANALYSIS LOOP (RANK-THEN-AGGREGATE)
# ---------------------------------------------------------

for metric in unique_metrics:
    print(f"\n\n================ {metric.upper()} (SUBMISSION-LEVEL / RANK-BASED) ================")

    # Filter by metric
    df_metric = df[df['metric'] == metric]

    # --- STEP 1: PIVOT TO STUDENT LEVEL ---
    # One row per (evaluator, submission), one column per feedback type, so the
    # feedback types can be ranked against each other within a row.
    # pivot_table averages scores that share the same index/column combination.
    student_level = df_metric.pivot_table(
        index=['expert_id', 'submission_id'], # Unique session
        columns='feedback_type',
        values='score'
    )

    # Remove incomplete sessions (not every feedback type has a score);
    # a row with gaps cannot be ranked consistently.
    student_level = student_level.dropna()

    print(f"Valid Student Ratings (Raw Count): {len(student_level)}")

    if len(student_level) < 5:
        print("Not enough data to proceed.")
        continue

    # --- STEP 2: CALCULATE RANKS (Per Student) ---
    # Ranking within each row removes differences in how strictly an evaluator
    # uses the scale: only the ordering of the feedback types is kept.
    # Ties share the average rank; a higher score yields a higher rank.
    student_ranks = student_level.rank(axis=1, method='average', ascending=True)

    # --- STEP 3: AGGREGATE BY SUBMISSION (Median of Ranks) ---
    # Combine the ranks of all evaluators who rated the same submission.
    submission_level = student_ranks.groupby(level='submission_id').median()

    # Safety net: drop submissions that are incomplete after aggregation.
    clean_data = submission_level.dropna()

    print(f"Valid Submissions for Analysis: {len(clean_data)}")

    if len(clean_data) < 5:
        print("Not enough submissions.")
        continue

    # friedmanchisquare raises when given fewer than three samples; skip the
    # metric instead of aborting the whole loop.
    if clean_data.shape[1] < MIN_FRIEDMAN_GROUPS:
        print(f"Friedman test needs at least {MIN_FRIEDMAN_GROUPS} feedback types, found {clean_data.shape[1]}. Skipping.")
        continue

    # --- STEP 4: FRIEDMAN TEST (On Median Ranks) ---
    # Compares the per-submission consensus ranks of the feedback types.

    # Mean of the median ranks, for display and to name the "winner" below
    final_mean_ranks = clean_data.mean().sort_values(ascending=False)

    stat, p_value = stats.friedmanchisquare(*[clean_data[col] for col in clean_data.columns])

    print("\n--- Results ---")
    print(f"Friedman p-value: {p_value:.5f}")
    print("Mean of Median Ranks (Higher is better):")
    print(final_mean_ranks)

    if p_value < 0.05:
        print("\n--- Post-Hoc: Nemenyi Test ---")

        # Pairwise p-value matrix; rows and columns are labelled by feedback type
        nemenyi = sp.posthoc_nemenyi_friedman(clean_data.reset_index(drop=True))

        # Report every unordered pair once
        print("Significant Differences (p < 0.05):")
        cols = clean_data.columns
        any_sig = False
        for i in range(len(cols)):
            for j in range(i+1, len(cols)):
                t1, t2 = cols[i], cols[j]
                p_val = nemenyi.loc[t1, t2]
                if p_val < 0.05:
                    any_sig = True
                    # Determine winner based on mean rank
                    winner = t1 if final_mean_ranks[t1] > final_mean_ranks[t2] else t2
                    loser = t2 if winner == t1 else t1
                    print(f"  * {winner} beats {loser} (p={p_val:.4f})")
                else:
                    print(f"  - No significant difference between {t1} and {t2} (p={p_val:.4f})")

        if not any_sig:
            print("  (None found despite global significance)")

    else:
        print("Result: No significant difference found.")

In [None]:
# Descriptive statistics of the raw scores per feedback type and metric.
# NOTE(review): scores of 0 are included here, while the rank analysis cell
# converts 0 to NaN first; confirm that this is intended.
scores_by_feedback_and_metric = evaluation_df.groupby(["feedback_type", "metric"])["score"]
scores_by_feedback_and_metric.describe()

In [None]:
import numpy as np

# Classify every rating by who produced it: "llm" for the LLM judge, "expert"
# when the Expert flag is True, "student" otherwise.
# NOTE(review): the "Expert" column is not created by the loading code visible in
# this notebook; presumably it comes from the participant CSV. Confirm.
is_llm_judge = evaluation_df["expert_id"] == "llm"
is_expert = evaluation_df["Expert"] == True
human_evaluator_type = np.where(is_expert, "expert", "student")
evaluation_df["evaluator_type"] = np.where(is_llm_judge, "llm", human_evaluator_type)
evaluation_df

In [None]:
# Median score per metric and feedback type, across all evaluators.
# The evaluator type (llm / expert / student) derived in the previous cell is
# not part of this grouping.
scores_by_metric_and_feedback = evaluation_df.groupby(["metric", "feedback_type"])["score"]
scores_by_metric_and_feedback.median()

In [None]:
# Descriptive statistics of the raw scores per feedback type (all metrics pooled).
scores_by_feedback_type = evaluation_df.groupby(["feedback_type"])["score"]
scores_by_feedback_type.describe()

In [None]:
# Descriptive statistics of the raw scores per metric (all feedback types pooled).
scores_by_metric = evaluation_df.groupby(["metric"])["score"]
scores_by_metric.describe()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Scatter plot of every individual score: one x position per submission, one
# colour/marker per metric.
# Column names follow the frame built at the top of the notebook ('metric' and
# 'score'); the former 'metric_id' / 'value' columns are not present in it.

# Set up the plot style
plt.style.use('default')

# Create figure and axis
fig, ax = plt.subplots(figsize=(14, 8))

# Map the submission ids to consecutive x positions 1..N
unique_submissions = sorted(evaluation_df['submission_id'].unique())
submission_mapping = {sub_id: i+1 for i, sub_id in enumerate(unique_submissions)}
n_submissions = len(unique_submissions)

# Add submission index to a copy of the dataframe for plotting
evaluation_df_plot = evaluation_df.copy()
evaluation_df_plot['submission_index'] = evaluation_df_plot['submission_id'].map(submission_mapping)

# Get unique metrics to assign different symbols and colors
unique_metrics = evaluation_df['metric'].unique()
colors = plt.cm.tab10(np.linspace(0, 1, len(unique_metrics)))
markers = ['o', 's', '^', 'D', 'v', '<', '>', 'p', '*', 'h']

# Create scatter plot for each metric
for i, metric in enumerate(unique_metrics):
    metric_data = evaluation_df_plot[evaluation_df_plot['metric'] == metric]

    ax.scatter(
        metric_data['submission_index'],
        metric_data['score'],
        c=[colors[i]],
        marker=markers[i % len(markers)],  # cycle if there are more metrics than markers
        label=f'Metric {i+1}',
        alpha=0.7,
        s=50
    )

# Customize the plot (German labels: "Punkte" = points, "Metriken" = metrics)
ax.set_xlabel(f'Submissions (1-{n_submissions})', fontsize=12, fontweight='bold')
ax.set_ylabel('Punkte (Value)', fontsize=12, fontweight='bold')
ax.set_title('Punktverteilung nach Submission und Metrik', fontsize=14, fontweight='bold')

# The x-axis spans all submissions actually present, so no point is clipped when
# the data set does not contain exactly 100 submissions.
ax.set_xlim(0, n_submissions + 1)
ax.set_xticks(range(0, n_submissions + 1, 10))

# Set y-axis to show all possible point values
ax.set_ylim(-0.5, 5.5)
ax.set_yticks(range(0, 6))

# Add grid for better readability
ax.grid(True, alpha=0.3)

# Add legend outside the axes
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Metriken')

# Adjust layout to prevent legend cutoff
plt.tight_layout()

# Show basic statistics
print(f"Anzahl Submissions: {len(unique_submissions)}")
print(f"Anzahl Metriken: {len(unique_metrics)}")
print(f"Punktebereich: {evaluation_df['score'].min()} - {evaluation_df['score'].max()}")
print(f"Durchschnittliche Punkte: {evaluation_df['score'].mean():.2f}")

plt.show()

In [None]:
# Explore the unique values for metrics and feedback types.
# Uses the 'metric' and 'score' columns of the frame built at the top of the
# notebook; 'metric_id' / 'value' do not exist in it.
print("Unique metric IDs:")
print(evaluation_df['metric'].unique())
print(f"\nNumber of unique metrics: {evaluation_df['metric'].nunique()}")

print("\nUnique feedback types:")
print(evaluation_df['feedback_type'].unique())
print(f"\nNumber of unique feedback types: {evaluation_df['feedback_type'].nunique()}")

# Sample rows showing metric, feedback type and score side by side
print("\nSample data with metric and feedback type info:")
print(evaluation_df[['metric', 'feedback_type', 'score']].head(10))

In [None]:
# Summary statistics per metric and feedback type, printed and written to CSV.
# The metric name is read from the 'metric' column and the score from 'score'
# (the frame built at the top of the notebook has no 'title' / 'value' columns).

# Check what metric names we have available
print("Unique titles (metric names):")
print(evaluation_df['metric'].unique())
print(f"\nNumber of unique titles: {evaluation_df['metric'].nunique()}")

# Calculate statistics for each metric and feedback type combination
stats_list = []

for title in evaluation_df['metric'].unique():
    for feedback_type in evaluation_df['feedback_type'].unique():
        # Scores for this combination
        subset = evaluation_df[
            (evaluation_df['metric'] == title) &
            (evaluation_df['feedback_type'] == feedback_type)
        ]['score']

        if len(subset) > 0:  # Only calculate if we have data
            # Named `summary_row` on purpose: binding the name `stats` here would
            # replace the `scipy.stats` module alias imported by the analysis cell.
            summary_row = {
                'metric_title': title,
                'feedback_type': feedback_type,
                'count': len(subset),
                'mean': subset.mean(),
                'median': subset.median(),
                'std': subset.std(),
                'min': subset.min(),
                'max': subset.max(),
                'q25': subset.quantile(0.25),
                'q75': subset.quantile(0.75),
                'iqr': subset.quantile(0.75) - subset.quantile(0.25)
            }
            stats_list.append(summary_row)

# Create the statistics dataframe
stats_df = pd.DataFrame(stats_list)

# Round numerical values for better readability
numerical_columns = ['mean', 'median', 'std', 'q25', 'q75', 'iqr']
stats_df[numerical_columns] = stats_df[numerical_columns].round(3)

print("\nStatistics by Metric Title and Feedback Type:")
print("=" * 80)
print(stats_df.to_string(index=False))

# Also create a pivot table for easier comparison
print("\n\nMean values pivot table:")
print("=" * 50)
mean_pivot = stats_df.pivot(index='metric_title', columns='feedback_type', values='mean')
print(mean_pivot.round(3))

print("\n\nStandard deviation pivot table:")
print("=" * 50)
std_pivot = stats_df.pivot(index='metric_title', columns='feedback_type', values='std')
print(std_pivot.round(3))

# Report what was produced
print(f"\nStatistics dataframe shape: {stats_df.shape}")
print("Dataframe saved as 'stats_df' variable")

# Save the dataframe to a CSV file
stats_df.to_csv("data/4_expert_evaluation/stats_df.csv", index=False)

In [None]:
# Create a second dataframe that also distinguishes between experts and non-experts.
# The metric name is read from 'metric' and the score from 'score' (the frame built
# at the top of the notebook has no 'title' / 'value' columns).
# NOTE(review): the 'Expert' column is not created by the loading code visible in
# this notebook; presumably it comes from the participant CSV. Confirm.
print("Creating statistics dataframe with expert/non-expert distinction:")
print("=" * 70)

# Check the Expert column
print("Expert status distribution:")
print(evaluation_df['Expert'].value_counts())
print(f"\nExpert column type: {evaluation_df['Expert'].dtype}")

# Calculate statistics for each metric, feedback type, and expert status combination
detailed_stats_list = []

for title in evaluation_df['metric'].unique():
    for feedback_type in evaluation_df['feedback_type'].unique():
        for expert_status in evaluation_df['Expert'].unique():
            # Scores for this combination. A missing expert status never equals
            # itself, so it yields an empty subset and is skipped below.
            subset = evaluation_df[
                (evaluation_df['metric'] == title) &
                (evaluation_df['feedback_type'] == feedback_type) &
                (evaluation_df['Expert'] == expert_status)
            ]['score']

            if len(subset) > 0:  # Only calculate if we have data
                # Named `summary_row` on purpose: binding the name `stats` here would
                # replace the `scipy.stats` module alias imported by the analysis cell.
                summary_row = {
                    'metric_title': title,
                    'feedback_type': feedback_type,
                    'is_expert': expert_status,
                    'count': len(subset),
                    'mean': subset.mean(),
                    'median': subset.median(),
                    'std': subset.std(),
                    'min': subset.min(),
                    'max': subset.max(),
                    'q25': subset.quantile(0.25),
                    'q75': subset.quantile(0.75),
                    'iqr': subset.quantile(0.75) - subset.quantile(0.25)
                }
                detailed_stats_list.append(summary_row)

# Create the detailed statistics dataframe
detailed_stats_df = pd.DataFrame(detailed_stats_list)

# Round numerical values for better readability
numerical_columns = ['mean', 'median', 'std', 'q25', 'q75', 'iqr']
detailed_stats_df[numerical_columns] = detailed_stats_df[numerical_columns].round(3)

print("\nDetailed Statistics by Metric, Feedback Type, and Expert Status:")
print("=" * 80)
print(detailed_stats_df.to_string(index=False))

# Create pivot tables for easier comparison, one per expert status
print("\n\nMean values by Expert Status:")
print("=" * 50)
for expert_status in [False, True]:
    expert_label = "Expert" if expert_status else "Non-Expert"
    print(f"\n{expert_label}:")
    expert_data = detailed_stats_df[detailed_stats_df['is_expert'] == expert_status]
    if len(expert_data) > 0:
        expert_pivot = expert_data.pivot(index='metric_title', columns='feedback_type', values='mean')
        print(expert_pivot.round(3))
    else:
        print("No data available")

# Report what was produced
print(f"\nDetailed statistics dataframe shape: {detailed_stats_df.shape}")
print("Detailed dataframe saved as 'detailed_stats_df' variable")

# Save to CSV
detailed_stats_df.to_csv("data/4_expert_evaluation/detailed_stats_df.csv", index=False)
print("Saved to: data/4_expert_evaluation/detailed_stats_df.csv")

In [None]:
# Create box plots (whiskers diagrams) for the detailed statistics
import matplotlib.pyplot as plt
import numpy as np

# 2x2 overview of the score distributions: by feedback type, by expert status,
# by both combined, and by metric.
# Scores are read from 'score' and metric names from 'metric' (the frame built at
# the top of the notebook has no 'value' / 'title' columns). Box labels are passed
# as `tick_labels`, matching the other box plot cells of this notebook.
# NOTE(review): the 'Expert' column is not created by the loading code visible in
# this notebook; presumably it comes from the participant CSV. Confirm.

# Create subplots for different comparisons
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Distribution of Evaluation Scores - Box Plots', fontsize=16, fontweight='bold')

# Plot 1: Overall comparison by feedback type
ax1 = axes[0, 0]
feedback_types = evaluation_df['feedback_type'].unique()
data_by_feedback = [evaluation_df[evaluation_df['feedback_type'] == ft]['score'].values for ft in feedback_types]
box1 = ax1.boxplot(data_by_feedback, tick_labels=feedback_types, patch_artist=True)
ax1.set_title('Score Distribution by Feedback Type', fontweight='bold')
ax1.set_ylabel('Score')
ax1.grid(True, alpha=0.3)

# Color the boxes (zip stops at the shorter sequence)
colors = ['lightblue', 'lightgreen', 'lightcoral']
for patch, color in zip(box1['boxes'], colors):
    patch.set_facecolor(color)

# Plot 2: Comparison by expert status
ax2 = axes[0, 1]
expert_labels = ['Non-Expert', 'Expert']
data_by_expert = [
    evaluation_df[evaluation_df['Expert'] == False]['score'].values,
    evaluation_df[evaluation_df['Expert'] == True]['score'].values
]
box2 = ax2.boxplot(data_by_expert, tick_labels=expert_labels, patch_artist=True)
ax2.set_title('Score Distribution by Expert Status', fontweight='bold')
ax2.set_ylabel('Score')
ax2.grid(True, alpha=0.3)

# Color the boxes
expert_colors = ['lightyellow', 'lightpink']
for patch, color in zip(box2['boxes'], expert_colors):
    patch.set_facecolor(color)

# Plot 3: Detailed comparison by feedback type and expert status
ax3 = axes[1, 0]
combined_labels = []
combined_data = []

for ft in feedback_types:
    for expert in [False, True]:
        expert_label = 'Expert' if expert else 'Non-Expert'
        label = f"{ft}\n{expert_label}"
        data = evaluation_df[
            (evaluation_df['feedback_type'] == ft) &
            (evaluation_df['Expert'] == expert)
        ]['score'].values

        # Combinations without any rating get no box
        if len(data) > 0:
            combined_labels.append(label)
            combined_data.append(data)

box3 = ax3.boxplot(combined_data, tick_labels=combined_labels, patch_artist=True)
ax3.set_title('Score Distribution by Feedback Type and Expert Status', fontweight='bold')
ax3.set_ylabel('Score')
ax3.tick_params(axis='x', rotation=45)
ax3.grid(True, alpha=0.3)

# Color alternating boxes
alt_colors = ['lightsteelblue', 'mistyrose'] * len(feedback_types)
for patch, color in zip(box3['boxes'], alt_colors):
    patch.set_facecolor(color)

# Plot 4: Comparison by metrics
ax4 = axes[1, 1]
metric_titles = evaluation_df['metric'].unique()
data_by_metric = [evaluation_df[evaluation_df['metric'] == title]['score'].values for title in metric_titles]

# Shorten labels for better display: second word of a multi-word name, else the name itself
short_labels = [title.split()[1] if len(title.split()) > 1 else title for title in metric_titles]
box4 = ax4.boxplot(data_by_metric, tick_labels=short_labels, patch_artist=True)
ax4.set_title('Score Distribution by Metric', fontweight='bold')
ax4.set_ylabel('Score')
ax4.grid(True, alpha=0.3)

# Color the boxes
metric_colors = ['lavender', 'lightcyan', 'wheat', 'honeydew']
for patch, color in zip(box4['boxes'], metric_colors):
    patch.set_facecolor(color)

plt.tight_layout()
plt.show()

# Print summary statistics for the box plots
print("Summary Statistics for Box Plots:")
print("=" * 50)

print("\n1. By Feedback Type:")
for ft in feedback_types:
    data = evaluation_df[evaluation_df['feedback_type'] == ft]['score']
    print(f"{ft:>10}: Mean={data.mean():.2f}, Median={data.median():.2f}, Std={data.std():.2f}")

print("\n2. By Expert Status:")
for expert, label in [(False, 'Non-Expert'), (True, 'Expert')]:
    data = evaluation_df[evaluation_df['Expert'] == expert]['score']
    print(f"{label:>10}: Mean={data.mean():.2f}, Median={data.median():.2f}, Std={data.std():.2f}")

print("\n3. By Metric:")
for title in metric_titles:
    data = evaluation_df[evaluation_df['metric'] == title]['score']
    short_title = title.split()[1] if len(title.split()) > 1 else title
    print(f"{short_title:>12}: Mean={data.mean():.2f}, Median={data.median():.2f}, Std={data.std():.2f}")

In [None]:
# Create box plots with one box for each row of the detailed stats
import matplotlib.pyplot as plt
import numpy as np

# One box per row of detailed_stats_df (metric x feedback type x expert status).
# Scores are read from 'score' and metric names from 'metric' (the frame built at
# the top of the notebook has no 'value' / 'title' columns).

# Prepare data for each combination in detailed_stats_df
box_data = []
box_labels = []

for idx, row in detailed_stats_df.iterrows():
    # Raw scores behind this statistics row
    subset = evaluation_df[
        (evaluation_df['metric'] == row['metric_title']) &
        (evaluation_df['feedback_type'] == row['feedback_type']) &
        (evaluation_df['Expert'] == row['is_expert'])
    ]['score'].values

    if len(subset) > 0:
        box_data.append(subset)

        # Three-line label: short metric name / feedback type / expert status.
        # The colouring loop below parses lines 2 and 3 back out of it.
        expert_label = 'Expert' if row['is_expert'] else 'Non-Expert'
        metric_short = row['metric_title'].split()[1] if len(row['metric_title'].split()) > 1 else row['metric_title']
        label = f"{metric_short}\n{row['feedback_type']}\n{expert_label}"
        box_labels.append(label)

# Create the plot
fig, ax = plt.subplots(figsize=(20, 8))

# Create box plot
boxes = ax.boxplot(box_data, tick_labels=box_labels, patch_artist=True)

# Base colour per feedback type, opacity per expert status
colors = {'Cofee': 'lightblue', 'Tutor': 'lightgreen', 'LLM': 'lightcoral'}
expert_colors = {'Expert': 0.8, 'Non-Expert': 0.5}  # Alpha values

for i, (box, label) in enumerate(zip(boxes['boxes'], box_labels)):
    # Extract feedback type and expert status from label
    lines = label.split('\n')
    feedback_type = lines[1]
    expert_status = lines[2]

    # Unknown feedback types fall back to grey, unknown statuses to alpha 0.7
    base_color = colors.get(feedback_type, 'lightgray')
    alpha = expert_colors.get(expert_status, 0.7)

    box.set_facecolor(base_color)
    box.set_alpha(alpha)

# Customize the plot
ax.set_title('Score Distribution for Each Detailed Statistics Combination\n(24 combinations: 4 metrics × 3 feedback types × 2 expert levels)',
             fontsize=14, fontweight='bold', pad=20)
ax.set_ylabel('Score', fontsize=12, fontweight='bold')
ax.set_xlabel('Metric - Feedback Type - Expert Status', fontsize=12, fontweight='bold')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

# Add grid for better readability
ax.grid(True, alpha=0.3, axis='y')

# Create custom legend for feedback types and expert status
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor='lightblue', alpha=0.8, label='Cofee - Expert'),
    Patch(facecolor='lightblue', alpha=0.5, label='Cofee - Non-Expert'),
    Patch(facecolor='lightgreen', alpha=0.8, label='Tutor - Expert'),
    Patch(facecolor='lightgreen', alpha=0.5, label='Tutor - Non-Expert'),
    Patch(facecolor='lightcoral', alpha=0.8, label='LLM - Expert'),
    Patch(facecolor='lightcoral', alpha=0.5, label='LLM - Non-Expert')
]

ax.legend(handles=legend_elements, loc='upper left', bbox_to_anchor=(1.02, 1))

plt.tight_layout()
plt.show()

# Print the corresponding statistics for verification
print("Detailed Statistics for Each Box:")
print("=" * 60)
print(detailed_stats_df[['metric_title', 'feedback_type', 'is_expert', 'count', 'mean', 'median', 'std']].to_string(index=False))

In [None]:
# Create separate box plots for experts and non-experts
import matplotlib.pyplot as plt
import numpy as np

# Side-by-side axes: ax1 is used for non-experts, ax2 for experts
fig, axes_pair = plt.subplots(1, 2, figsize=(20, 8))
ax1, ax2 = axes_pair
fig.suptitle('Score Distribution Comparison: Experts vs Non-Experts', fontsize=16, fontweight='bold')

# Box fill colour per feedback type
colors = dict(Cofee='lightblue', Tutor='lightgreen', LLM='lightcoral')

# Function to create box plot for a specific expert status
def create_expert_plot(ax, is_expert, title_suffix):
    """Draw one box per (metric, feedback type) for a single expert status.

    Reads the notebook-level `detailed_stats_df`, `evaluation_df` and `colors`.
    Scores are taken from the 'score' column and metric names from the 'metric'
    column of `evaluation_df` (it has no 'value' / 'title' columns).

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    is_expert : value compared against `detailed_stats_df['is_expert']` and
        `evaluation_df['Expert']` to select the group.
    title_suffix : text appended to the axes title.

    Returns
    -------
    The rows of `detailed_stats_df` belonging to the selected expert status.
    """
    box_data = []
    box_labels = []
    box_colors = []

    # Filter detailed stats for the specific expert status
    expert_stats = detailed_stats_df[detailed_stats_df['is_expert'] == is_expert]

    for idx, row in expert_stats.iterrows():
        # Raw scores behind this statistics row
        subset = evaluation_df[
            (evaluation_df['metric'] == row['metric_title']) &
            (evaluation_df['feedback_type'] == row['feedback_type']) &
            (evaluation_df['Expert'] == row['is_expert'])
        ]['score'].values

        if len(subset) > 0:
            box_data.append(subset)

            # Two-line label: short metric name (second word of a multi-word
            # name, else the name itself) over the feedback type
            metric_short = row['metric_title'].split()[1] if len(row['metric_title'].split()) > 1 else row['metric_title']
            label = f"{metric_short}\n{row['feedback_type']}"
            box_labels.append(label)
            # Feedback types missing from `colors` are drawn in grey
            box_colors.append(colors.get(row['feedback_type'], 'lightgray'))

    # Create box plot
    boxes = ax.boxplot(box_data, tick_labels=box_labels, patch_artist=True)

    # Color the boxes
    for box, color in zip(boxes['boxes'], box_colors):
        box.set_facecolor(color)
        box.set_alpha(0.7)

    # Customize the plot
    ax.set_title(f'Score Distribution - {title_suffix}\n(12 combinations: 4 metrics × 3 feedback types)',
                 fontweight='bold', pad=15)
    ax.set_ylabel('Score', fontweight='bold')
    ax.set_xlabel('Metric - Feedback Type', fontweight='bold')

    # Rotate x-axis labels
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3, axis='y')

    return expert_stats

# Fill both panels and keep the statistics rows each call returns
non_expert_stats = create_expert_plot(ax1, False, 'Non-Experts')
expert_stats = create_expert_plot(ax2, True, 'Experts')

# A single legend for the whole figure, one patch per feedback type
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor=face, alpha=0.7, label=name)
    for name, face in (('Cofee', 'lightblue'), ('Tutor', 'lightgreen'), ('LLM', 'lightcoral'))
]

# Centre the legend along the bottom edge of the figure
fig.legend(handles=legend_elements, loc='center', bbox_to_anchor=(0.5, 0.02), ncol=3)

plt.tight_layout()
plt.subplots_adjust(bottom=0.15)  # keep a strip free at the bottom for the legend
plt.show()

# Print the per-group statistics tables, non-experts first
summary_columns = ['metric_title', 'feedback_type', 'count', 'mean', 'median', 'std']
print("Comparison Statistics - Experts vs Non-Experts:")
print("=" * 70)

for heading, group_stats in (
    ("\nNon-Expert Statistics:", non_expert_stats),
    ("\nExpert Statistics:", expert_stats),
):
    print(heading)
    print(group_stats[summary_columns].to_string(index=False))

# Calculate and show differences (expert mean minus non-expert mean) for every
# metric / feedback type pair present in evaluation_df
print("\nMean Score Differences (Expert - Non-Expert):")
print("=" * 50)
for metric in evaluation_df['title'].unique():
    # The second word of the metric title serves as a compact row label
    metric_short = metric.split()[1] if len(metric.split()) > 1 else metric

    for feedback in evaluation_df['feedback_type'].unique():
        expert_means = expert_stats.loc[
            (expert_stats['metric_title'] == metric)
            & (expert_stats['feedback_type'] == feedback),
            'mean',
        ]
        non_expert_means = non_expert_stats.loc[
            (non_expert_stats['metric_title'] == metric)
            & (non_expert_stats['feedback_type'] == feedback),
            'mean',
        ]

        # A pair may be missing from one group's stats; .iloc[0] on the empty
        # selection would raise IndexError, so report the gap instead.
        if expert_means.empty or non_expert_means.empty:
            print(f"{metric_short:>12} - {feedback:>6}: n/a")
            continue

        difference = expert_means.iloc[0] - non_expert_means.iloc[0]
        print(f"{metric_short:>12} - {feedback:>6}: {difference:+.3f}")

In [None]:
# Calculate the number of complete evaluations for each submission
print("Calculating complete evaluations per submission:")
print("=" * 60)

# The target per submission is one score for every metric under every feedback
# type, i.e. (distinct metrics) x (distinct feedback types) found in evaluation_df.
total_metrics, total_feedback_types = (
    evaluation_df[column].nunique() for column in ('title', 'feedback_type')
)
expected_evaluations_per_submission = total_metrics * total_feedback_types

print(f"Expected evaluations per submission: {expected_evaluations_per_submission}")
print(f"({total_metrics} metrics × {total_feedback_types} feedback types)")

# Group by submission and count unique combinations of metric and feedback type
submission_completeness = []

# A single groupby pass yields each submission's rows directly instead of
# re-scanning the whole frame with a boolean mask per submission. sort=False
# keeps first-appearance order; rows without a submission_id are skipped.
for submission_id, submission_data in evaluation_df.groupby('submission_id', sort=False):
    # Distinct (metric, feedback type) pairs that received at least one score
    unique_combinations = submission_data.groupby(['title', 'feedback_type']).size()
    num_complete_combinations = len(unique_combinations)

    # Raw number of score rows (several evaluators may rate the same pair)
    total_evaluations = len(submission_data)

    # Count unique evaluators (expert_id) for this submission
    unique_evaluators = submission_data['expert_id'].nunique()

    # Share of the expected pairs that are covered; guard against a zero target
    if expected_evaluations_per_submission:
        completeness_ratio = num_complete_combinations / expected_evaluations_per_submission
    else:
        completeness_ratio = 0.0
    is_complete = num_complete_combinations == expected_evaluations_per_submission

    submission_completeness.append({
        'submission_id': submission_id,
        'total_evaluations': total_evaluations,
        'unique_combinations': num_complete_combinations,
        'unique_evaluators': unique_evaluators,
        'expected_combinations': expected_evaluations_per_submission,
        'completeness_ratio': completeness_ratio,
        'is_complete': is_complete,
        'avg_evaluations_per_combination': (
            total_evaluations / num_complete_combinations if num_complete_combinations > 0 else 0
        ),
    })

# Create the completeness dataframe
completeness_df = pd.DataFrame(submission_completeness)

# Round numerical values
completeness_df['completeness_ratio'] = completeness_df['completeness_ratio'].round(3)
completeness_df['avg_evaluations_per_combination'] = completeness_df['avg_evaluations_per_combination'].round(2)

# Sort by submission_id for better readability
completeness_df = completeness_df.sort_values('submission_id').reset_index(drop=True)

# Headline counts. The "all N combinations" figure uses the computed expected
# count rather than a fixed 12, so it stays correct if metrics or feedback
# types change.
print("\nCompleteness Analysis Results:")
print("=" * 50)
print(f"Total submissions analyzed: {len(completeness_df)}")
print(f"Complete submissions (all {expected_evaluations_per_submission} combinations): {completeness_df['is_complete'].sum()}")
print(f"Incomplete submissions: {(~completeness_df['is_complete']).sum()}")

# Spread of the raw number of score rows per submission
print("\nEvaluation Count Statistics:")
print(f"Minimum evaluations per submission: {completeness_df['total_evaluations'].min()}")
print(f"Maximum evaluations per submission: {completeness_df['total_evaluations'].max()}")
print(f"Average evaluations per submission: {completeness_df['total_evaluations'].mean():.1f}")
print(f"Median evaluations per submission: {completeness_df['total_evaluations'].median():.1f}")

# Spread of the number of distinct metric / feedback type pairs covered
print("\nUnique Combinations Statistics:")
print(f"Minimum unique combinations: {completeness_df['unique_combinations'].min()}")
print(f"Maximum unique combinations: {completeness_df['unique_combinations'].max()}")
print(f"Average unique combinations: {completeness_df['unique_combinations'].mean():.1f}")

# Spread of the number of distinct evaluators per submission
print("\nEvaluator Statistics:")
print(f"Minimum evaluators per submission: {completeness_df['unique_evaluators'].min()}")
print(f"Maximum evaluators per submission: {completeness_df['unique_evaluators'].max()}")
print(f"Average evaluators per submission: {completeness_df['unique_evaluators'].mean():.1f}")

# Show the first ten rows of the table
print("\nSample of Completeness Data:")
print("=" * 50)
print(completeness_df.head(10).to_string(index=False))

# List every submission that is not flagged as complete, if there are any
incomplete_submissions = completeness_df.loc[~completeness_df['is_complete']]
if not incomplete_submissions.empty:
    incomplete_columns = ['submission_id', 'unique_combinations', 'completeness_ratio']
    print(f"\nIncomplete Submissions ({len(incomplete_submissions)} found):")
    print("=" * 50)
    print(incomplete_submissions[incomplete_columns].to_string(index=False))

# Write the per-submission table to disk (without the index) and confirm
csv_path = "data/4_expert_evaluation/submission_completeness.csv"
completeness_df.to_csv(csv_path, index=False)
print(
    f"\nCompleteness dataframe saved to: {csv_path}",
    f"Dataframe shape: {completeness_df.shape}",
    "Variables: 'completeness_df' contains the analysis results",
    sep="\n",
)