### Install Required Packages (Run this first if jiwer is not installed)

In [None]:
# Uncomment and run this cell if jiwer is not installed
# !pip install jiwer

### Imports for Jiwer and Pandas

In [1]:
import jiwer
from jiwer import wer
import pandas as pd
import os
import ast
import numpy as np

### Load the CSV file

In [2]:
# Load the CSV file; index_col=0 makes the first CSV column the DataFrame index
# instead of a regular data column.
df = pd.read_csv('third_wer_oishani.csv', index_col=0)
print(f"Loaded {len(df)} rows")
# Last expression of the cell: renders a preview of the first five rows.
df.head()

Loaded 51197 rows


Unnamed: 0,file_name,participant_id,sentence_id,Iteration,probability,reference,expected_transcription,acceptable_transcription,large,base,medium,small,tiny,large_preproc,base_preproc,medium_preproc,small_preproc,tiny_preproc,large_wer_list,large_wer_min
0,t45_M_EN_SAUCE_block0_sauce_25_3_L2_large-v3.txt,t45,25_3,L2,low prob,When I get a cold I have a runny pumpkin.,when i get a cold i have a runny pumpkin,['when i get a cold i have a runny pumpkin'],When I get cold I have a running pumpkin.,When I get cold I have a running pumpkin.,When I get cold I have a runny pumpkin.,When I get cold I have a running pumpkin.,When I get cold I have a running pumpkin.,when i get cold i have a running pumpkin,when i get cold i have a running pumpkin,when i get cold i have a runny pumpkin,when i get cold i have a running pumpkin,when i get cold i have a running pumpkin,[0.2],0.2
1,t45_M_EN_SAUCE_block0_sauce_79_2_L1_large-v3.txt,t45,79_2,L1,nonword,There are 3 tedrooms in their carrot.,there are 3 tedrooms in their carrot,"['there are three tedrooms in their carrot', '...",There are three tat rooms in the carrot.,There are three tent rooms in their carrot.,There are three tat rooms in their carriage.,There are three Ted rooms in their carrot.,There are three tat rooms in their carrots.,there are three tat rooms in the carrot,there are three tent rooms in their carrot,there are three tat rooms in their carriage,there are three ted rooms in their carrot,there are three tat rooms in their carrots,"[0.42857142857142855, 0.5714285714285714, 0.57...",0.428571
2,t45_M_EN_SAUCE_block0_sauce_34_1_L2_large-v3.txt,t45,34_1,L2,high prob,"At the zoo, I saw a tall giraffe.",at the zoo i saw a tall giraffe,['at the zoo i saw a tall giraffe'],At the zoo I saw a tall giraffe.,At the zoo I saw a tall giraffe.,At the zoo I saw a tall giraffe.,At the zoo I saw a tall giraffe.,At the zoo I saw a tall giraffe.,at the zoo i saw a tall giraffe,at the zoo i saw a tall giraffe,at the zoo i saw a tall giraffe,at the zoo i saw a tall giraffe,at the zoo i saw a tall giraffe,[0.0],0.0
3,t45_M_EN_SAUCE_block0_sauce_3_2_L1_large-v3.txt,t45,3_2,L1,high prob,"The king wore a shiny, gold crown",the king wore a shiny gold crown,['the king wore a shiny gold crown'],The king wore a shiny gold crown.,The King wore a shiny gold crown.,The king wore a shiny gold crown.,The king wore a shiny gold crown.,The King wore a shiny gold crown.,the king wore a shiny gold crown,the king wore a shiny gold crown,the king wore a shiny gold crown,the king wore a shiny gold crown,the king wore a shiny gold crown,[0.0],0.0
4,t45_M_EN_SAUCE_block0_sauce_37_2_L1_large-v3.txt,t45,37_2,L1,low prob,Grandma sits on her porch in a bat.,grandma sits on her porch in a bat,['grandma sits on her porch in a bat'],Grandma sits on her porch in a bed.,Grandma sits on her porch in a bed.,Grandma sits on her porch in the bed.,Grandma sits on her porch in the bed.,Grandma sits on her porch in a bed.,grandma sits on her porch in a bed,grandma sits on her porch in a bed,grandma sits on her porch in the bed,grandma sits on her porch in the bed,grandma sits on her porch in a bed,[0.125],0.125


### Helper Function to Calculate WER

In [3]:
def _parse_acceptable_transcriptions(acceptable):
    """Normalise one `acceptable_transcription` cell to a list of references.

    Accepts an already-parsed list/tuple, or the string form of a Python
    literal (what a CSV round-trip produces). Missing values, unparsable
    strings and any other type yield an empty list.
    """
    # Container check must come before any pd.isna() call: pd.isna() on a list
    # returns an array, and using that array in an `if` raises for len > 1.
    if isinstance(acceptable, (list, tuple)):
        return list(acceptable)

    if isinstance(acceptable, str):
        try:
            parsed = ast.literal_eval(acceptable)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: treated as "no acceptable alternatives".
            return []
        if isinstance(parsed, (list, tuple)):
            return list(parsed)
        if isinstance(parsed, str):
            # A single quoted transcription rather than a list of them.
            return [parsed]
        return []

    # NaN / None / anything else: no acceptable alternatives.
    return []


def calculate_wer_for_model(df, model_name, wer_func=None):
    """
    Calculate WER list and min WER for a given model.

    For every row, the hypothesis in `{model_name}_preproc` is scored against
    `expected_transcription` followed by each entry of
    `acceptable_transcription`, in that order. When the expected transcription
    also appears in the acceptable list, its WER therefore appears twice in
    the resulting list; the minimum is unaffected.

    Parameters:
    - df: DataFrame with preprocessed transcriptions. Must contain the columns
      `{model_name}_preproc`, `expected_transcription` and
      `acceptable_transcription`.
    - model_name: Name of the model (e.g., 'base', 'medium', 'small', 'tiny')
    - wer_func: Optional callable `(reference, hypothesis) -> float` used to
      score each pair. Defaults to `jiwer.wer`.

    Returns:
    - The same DataFrame object, modified in place, with added columns:
      {model_name}_wer_list and {model_name}_wer_min. Rows with a missing
      hypothesis (or no usable reference) get an empty list and NaN.
    """
    if wer_func is None:
        wer_func = jiwer.wer

    preproc_col = f'{model_name}_preproc'
    wer_list_col = f'{model_name}_wer_list'
    wer_min_col = f'{model_name}_wer_min'

    # Initialize lists to store WER values
    wer_lists = []
    wer_mins = []

    # Iterate over just the three needed columns; much cheaper than building a
    # Series per row with iterrows().
    rows = zip(
        df[preproc_col],
        df['expected_transcription'],
        df['acceptable_transcription'],
    )
    for hypothesis, expected, acceptable in rows:
        # Combine expected with acceptable transcriptions
        all_references = [expected] + _parse_acceptable_transcriptions(acceptable)

        # Calculate WER for each reference (skipping missing values)
        wer_values = []
        if pd.notna(hypothesis):
            for reference in all_references:
                if pd.notna(reference):
                    wer_values.append(wer_func(reference, hypothesis))

        # Store WER list and minimum
        wer_lists.append(wer_values)
        wer_mins.append(min(wer_values) if wer_values else np.nan)

    # Add columns to dataframe
    df[wer_list_col] = wer_lists
    df[wer_min_col] = wer_mins

    return df

### Step 1: Add WER columns for all models (base, medium, small, tiny)

In [4]:
# List of models to process (excluding 'large' since it's already done:
# the loaded CSV already has large_wer_list / large_wer_min columns)
models = ['base', 'medium', 'small', 'tiny']

# Calculate WER for each model
# calculate_wer_for_model adds the two new columns to the frame it is given and
# returns that same frame, so `df` keeps accumulating columns across iterations.
for model in models:
    print(f"Calculating WER for {model}...")
    df = calculate_wer_for_model(df, model)
    print(f"Completed {model}")

print("\nAll model WERs calculated!")
# Last expression of the cell: preview including the new *_wer_list / *_wer_min columns.
df.head()

Calculating WER for base...
Completed base
Calculating WER for medium...
Completed medium
Calculating WER for small...
Completed small
Calculating WER for tiny...
Completed tiny

All model WERs calculated!


Unnamed: 0,file_name,participant_id,sentence_id,Iteration,probability,reference,expected_transcription,acceptable_transcription,large,base,...,large_wer_list,large_wer_min,base_wer_list,base_wer_min,medium_wer_list,medium_wer_min,small_wer_list,small_wer_min,tiny_wer_list,tiny_wer_min
0,t45_M_EN_SAUCE_block0_sauce_25_3_L2_large-v3.txt,t45,25_3,L2,low prob,When I get a cold I have a runny pumpkin.,when i get a cold i have a runny pumpkin,['when i get a cold i have a runny pumpkin'],When I get cold I have a running pumpkin.,When I get cold I have a running pumpkin.,...,[0.2],0.2,"[0.2, 0.2]",0.2,"[0.1, 0.1]",0.1,"[0.2, 0.2]",0.2,"[0.2, 0.2]",0.2
1,t45_M_EN_SAUCE_block0_sauce_79_2_L1_large-v3.txt,t45,79_2,L1,nonword,There are 3 tedrooms in their carrot.,there are 3 tedrooms in their carrot,"['there are three tedrooms in their carrot', '...",There are three tat rooms in the carrot.,There are three tent rooms in their carrot.,...,"[0.42857142857142855, 0.5714285714285714, 0.57...",0.428571,"[0.42857142857142855, 0.2857142857142857, 0.42...",0.285714,"[0.5714285714285714, 0.42857142857142855, 0.42...",0.428571,"[0.42857142857142855, 0.2857142857142857, 0.42...",0.285714,"[0.5714285714285714, 0.42857142857142855, 0.42...",0.428571
2,t45_M_EN_SAUCE_block0_sauce_34_1_L2_large-v3.txt,t45,34_1,L2,high prob,"At the zoo, I saw a tall giraffe.",at the zoo i saw a tall giraffe,['at the zoo i saw a tall giraffe'],At the zoo I saw a tall giraffe.,At the zoo I saw a tall giraffe.,...,[0.0],0.0,"[0.0, 0.0]",0.0,"[0.0, 0.0]",0.0,"[0.0, 0.0]",0.0,"[0.0, 0.0]",0.0
3,t45_M_EN_SAUCE_block0_sauce_3_2_L1_large-v3.txt,t45,3_2,L1,high prob,"The king wore a shiny, gold crown",the king wore a shiny gold crown,['the king wore a shiny gold crown'],The king wore a shiny gold crown.,The King wore a shiny gold crown.,...,[0.0],0.0,"[0.0, 0.0]",0.0,"[0.0, 0.0]",0.0,"[0.0, 0.0]",0.0,"[0.0, 0.0]",0.0
4,t45_M_EN_SAUCE_block0_sauce_37_2_L1_large-v3.txt,t45,37_2,L1,low prob,Grandma sits on her porch in a bat.,grandma sits on her porch in a bat,['grandma sits on her porch in a bat'],Grandma sits on her porch in a bed.,Grandma sits on her porch in a bed.,...,[0.125],0.125,"[0.125, 0.125]",0.125,"[0.25, 0.25]",0.25,"[0.25, 0.25]",0.25,"[0.125, 0.125]",0.125


### Step 2: Add column with minimum WER across all models

In [5]:
# Models to compare; unlike the Step 1 list, this one also contains 'large'.
all_models = ['large', 'base', 'medium', 'small', 'tiny']

# One `{model}_wer_min` column name per model, in the same order as all_models.
min_wer_cols = [f'{name}_wer_min' for name in all_models]

# Row-wise minimum over the per-model minima (NaN entries are skipped by .min).
df['min_wer_across_models'] = df.loc[:, min_wer_cols].min(axis='columns')

print("Added 'min_wer_across_models' column")
df[['participant_id', 'sentence_id', 'Iteration', *min_wer_cols, 'min_wer_across_models']].head(10)

Added 'min_wer_across_models' column


Unnamed: 0,participant_id,sentence_id,Iteration,large_wer_min,base_wer_min,medium_wer_min,small_wer_min,tiny_wer_min,min_wer_across_models
0,t45,25_3,L2,0.2,0.2,0.1,0.2,0.2,0.1
1,t45,79_2,L1,0.428571,0.285714,0.428571,0.285714,0.428571,0.285714
2,t45,34_1,L2,0.0,0.0,0.0,0.0,0.0,0.0
3,t45,3_2,L1,0.0,0.0,0.0,0.0,0.0,0.0
4,t45,37_2,L1,0.125,0.125,0.25,0.25,0.125,0.125
5,t45,90_2,L2,0.0,0.0,0.0,0.0,0.0,0.0
6,t45,93_1,L1,0.166667,0.333333,0.333333,0.333333,0.166667,0.166667
7,t45,82_3,L1,0.0,0.0,0.0,0.0,0.0,0.0
8,t45,43_1,L1,0.0,0.0,0.0,0.0,0.0,0.0
9,t45,52_3,L1,0.1,0.2,0.2,0.1,0.1,0.1


### Step 3: Add 'winning_model' column (model with lowest WER)

In [6]:
# Pick, for one row, the model whose best-reference WER is lowest.
def get_winning_model(row):
    """Return the name of the model with the smallest `{model}_wer_min` in `row`.

    Models are scanned in `all_models` order and only a strictly smaller WER
    replaces the current best, so ties go to the model listed first.
    Models whose WER is NaN are ignored; returns None when every model is NaN.
    """
    best_model = None
    best_wer = None
    for model in all_models:
        candidate = row[f'{model}_wer_min']
        if pd.isna(candidate):
            continue
        if best_wer is None or candidate < best_wer:
            best_model = model
            best_wer = candidate
    return best_model

# Evaluate get_winning_model once per row (axis=1 passes each row as a Series).
df['winning_model'] = df.apply(get_winning_model, axis=1)

print("Added 'winning_model' column")
# Last expression of the cell: preview of the identifying columns plus the two cross-model columns.
df[['participant_id', 'sentence_id', 'Iteration', 'min_wer_across_models', 'winning_model']].head(10)

Added 'winning_model' column


Unnamed: 0,participant_id,sentence_id,Iteration,min_wer_across_models,winning_model
0,t45,25_3,L2,0.1,medium
1,t45,79_2,L1,0.285714,base
2,t45,34_1,L2,0.0,large
3,t45,3_2,L1,0.0,large
4,t45,37_2,L1,0.125,large
5,t45,90_2,L2,0.0,large
6,t45,93_1,L1,0.166667,large
7,t45,82_3,L1,0.0,large
8,t45,43_1,L1,0.0,large
9,t45,52_3,L1,0.1,large


### Step 4: Filter out 'nonword' and find lowest WER across all iterations and models

Filter out 'nonword' probability entries before grouping, so that only 'high prob' and 'low prob' sentences are analyzed.

When multiple iterations have the same lowest WER, we select the first iteration in the order: L1, L2, R1P1, R1P2, R2P1, R2P2, R3P1, R3P2 (any other iteration label is ranked after all of these). For example, if both L1 and L2 have WER=0, we select L1.

In [7]:
# Filter out 'nonword' entries - we only want 'high prob' and 'low prob'
# (.copy() gives an independent frame, so later changes to df_no_nonword do not touch df)
df_no_nonword = df[df['probability'] != 'nonword'].copy()

print(f"Original dataset: {len(df)} rows")
print(f"After filtering out 'nonword': {len(df_no_nonword)} rows")
print(f"\nProbability distribution after filtering:")
print(df_no_nonword['probability'].value_counts())

# Group by participant_id and sentence_id (without nonwords)
# Each group holds every remaining row (one per recorded iteration) of one
# participant/sentence pair.
grouped = df_no_nonword.groupby(['participant_id', 'sentence_id'])

# For each group, find:
# 1. List of unique iterations
# 2. Minimum WER across all iterations and models
# 3. Which iteration and model achieved this minimum
# 4. Probability type

def get_best_iteration_info(group):
    """
    Summarise one (participant_id, sentence_id) group of rows.

    Parameters:
    - group: DataFrame with the columns 'Iteration', 'min_wer_across_models',
      'probability' and 'winning_model'.

    Returns:
    - pd.Series with:
        probability                 probability label of the selected row
        unique_iterations           distinct 'Iteration' values, in order of appearance
        best_wer_across_iterations  lowest 'min_wer_across_models' in the group
        best_iteration              'Iteration' of the selected row
        best_model                  'winning_model' of the selected row

    When several rows share the lowest WER, the row whose iteration comes
    first in L1, L2, R1P1, R1P2, R2P1, R2P2, R3P1, R3P2 is selected; labels not
    in that list rank after all of them. Rows with equal rank keep their
    original order, so the earliest such row in `group` wins.

    When the group has no non-NaN WER, best_wer_across_iterations is NaN and
    best_iteration / best_model are None (probability is taken from the first row).
    """
    # Define iteration order: L1, L2, R1P1, R1P2, R2P1, R2P2, etc.
    iteration_order = ['L1', 'L2', 'R1P1', 'R1P2', 'R2P1', 'R2P2', 'R3P1', 'R3P2']
    iteration_rank = {it: i for i, it in enumerate(iteration_order)}

    # Get unique iterations
    iterations = group['Iteration'].unique().tolist()

    # Find the minimum WER value (NaN when the group is empty or all-NaN)
    min_wer = group['min_wer_across_models'].min()

    if pd.isna(min_wer):
        # Nothing compares equal to NaN, so the selection below would be empty
        # and `.iloc[0]` would raise; report "no best row" explicitly instead.
        return pd.Series({
            'probability': group['probability'].iloc[0] if len(group) else None,
            'unique_iterations': iterations,
            'best_wer_across_iterations': float('nan'),
            'best_iteration': None,
            'best_model': None
        })

    # Get all rows with the minimum WER
    min_wer_rows = group[group['min_wer_across_models'] == min_wer]

    # Rank each candidate by its iteration (unknown iterations get high rank)
    ranks = min_wer_rows['Iteration'].map(lambda x: iteration_rank.get(x, 999))

    # argmin returns the first position holding the smallest rank, which gives
    # a deterministic choice even when several candidates share a rank.
    min_row = min_wer_rows.iloc[int(ranks.to_numpy().argmin())]

    return pd.Series({
        'probability': min_row['probability'],  # Add probability to summary
        'unique_iterations': iterations,
        'best_wer_across_iterations': min_row['min_wer_across_models'],
        'best_iteration': min_row['Iteration'],
        'best_model': min_row['winning_model']
    })

# Pass only the columns get_best_iteration_info reads. Selecting columns
# explicitly keeps the grouping keys out of the frame handed to the function,
# which is what newer pandas versions warn about when apply() is called on the
# whole grouped frame. The group keys still become the result's index and are
# restored as columns by reset_index().
summary_input_cols = ['Iteration', 'min_wer_across_models', 'probability', 'winning_model']
best_iteration_info = grouped[summary_input_cols].apply(get_best_iteration_info).reset_index()

print("\nSummary by participant and sentence (excluding nonword):")
print(best_iteration_info.head(20))
print(f"\nTotal rows in best_iteration_info: {len(best_iteration_info)}")
print(f"Probability distribution in best_iteration_info:")
print(best_iteration_info['probability'].value_counts())

# Merge this info back to the main dataframe (all rows including nonword)
# Drop the probability column from best_iteration_info before merging to avoid conflicts
best_iteration_info_merge = best_iteration_info.drop(columns=['probability'])

# Remove summary columns left over from a previous run of this cell first;
# otherwise a re-run would produce duplicated *_x / *_y columns. On a first
# run nothing is dropped (errors='ignore').
summary_cols = [
    col for col in best_iteration_info_merge.columns
    if col not in ('participant_id', 'sentence_id')
]
df = df.drop(columns=summary_cols, errors='ignore').merge(
    best_iteration_info_merge, on=['participant_id', 'sentence_id'], how='left'
)

print("\nAdded columns:")
print("- unique_iterations: List of all iterations for this participant-sentence combination")
print("- best_wer_across_iterations: Lowest WER achieved across all iterations and models")
print("- best_iteration: Which iteration achieved the best WER")
print("- best_model: Which model achieved the best WER")
print("\nNote: These values are calculated only from 'high prob' and 'low prob' entries (nonword excluded)")

Original dataset: 51197 rows
After filtering out 'nonword': 34131 rows

Probability distribution after filtering:
probability
high prob    17066
low prob     17065
Name: count, dtype: int64

Summary by participant and sentence (excluding nonword):
   participant_id sentence_id probability unique_iterations  \
0              t1       100_2    low prob          [L2, L1]   
1              t1       100_3   high prob          [L2, L1]   
2              t1        10_1    low prob          [L1, L2]   
3              t1        10_3   high prob          [L2, L1]   
4              t1        11_2   high prob          [L2, L1]   
5              t1        11_3    low prob          [L2, L1]   
6              t1        12_1   high prob          [L1, L2]   
7              t1        12_2    low prob          [L2, L1]   
8              t1        13_1    low prob          [L1, L2]   
9              t1        13_3   high prob          [L2, L1]   
10             t1        14_1    low prob          [L1, L2]

  best_iteration_info = grouped.apply(get_best_iteration_info).reset_index()


Examples where multiple iterations have same WER

In [8]:
# Find cases where multiple iterations have the same best WER
print("=" * 70)
print("TIE-BREAKING EXAMPLES")
print("=" * 70)
print("Looking for cases where multiple iterations achieved the same lowest WER...\n")

# Check in the filtered data
# Re-derives, per participant/sentence group, the set of iterations that reach
# the group's lowest WER; more than one such iteration counts as a tie.
ties_found = 0
for (participant, sentence), group in df_no_nonword.groupby(['participant_id', 'sentence_id']):
    min_wer = group['min_wer_across_models'].min()
    iterations_with_min = group[group['min_wer_across_models'] == min_wer]['Iteration'].unique()
    
    if len(iterations_with_min) > 1:
        ties_found += 1
        if ties_found <= 5:  # Show first 5 examples
            # Look up which iteration the Step 4 summary selected for this pair.
            best_iter = best_iteration_info[
                (best_iteration_info['participant_id'] == participant) & 
                (best_iteration_info['sentence_id'] == sentence)
            ]['best_iteration'].values[0]
            
            print(f"Example {ties_found}:")
            print(f"  Participant: {participant}, Sentence: {sentence}")
            print(f"  Iterations with WER={min_wer:.4f}: {sorted(iterations_with_min)}")
            print(f"  Selected iteration: {best_iter}")
            print()

print(f"Total cases with ties: {ties_found}")
if ties_found == 0:
    print("No ties found - each participant-sentence combination has a unique best iteration.")
else:
    # NOTE(review): this message states the outcome for all ties, but the loop
    # only looks up the selected iteration for the first 5; nothing here checks
    # the remaining cases.
    print(f"In all {ties_found} cases, the earliest iteration (L1 < L2 < R1P1 < ...) was selected.")
print("=" * 70)

TIE-BREAKING EXAMPLES
Looking for cases where multiple iterations achieved the same lowest WER...

Example 1:
  Participant: t1, Sentence: 100_2
  Iterations with WER=0.0000: ['L1', 'L2']
  Selected iteration: L1

Example 2:
  Participant: t1, Sentence: 100_3
  Iterations with WER=0.0000: ['L1', 'L2']
  Selected iteration: L1

Example 3:
  Participant: t1, Sentence: 10_1
  Iterations with WER=0.1667: ['L1', 'L2']
  Selected iteration: L1

Example 4:
  Participant: t1, Sentence: 10_3
  Iterations with WER=0.0000: ['L1', 'L2']
  Selected iteration: L1

Example 5:
  Participant: t1, Sentence: 11_2
  Iterations with WER=0.0000: ['L1', 'L2']
  Selected iteration: L1

Total cases with ties: 11989
In all 11989 cases, the earliest iteration (L1 < L2 < R1P1 < ...) was selected.


Check the new columns

In [9]:
# Show a sample with all the new columns
# Row identifiers, then per-model minima, then the cross-model columns, then
# the per-participant/sentence summary columns merged in Step 4.
cols_to_show = [
    'participant_id', 'sentence_id', 'Iteration', 'probability',
    'large_wer_min', 'base_wer_min', 'medium_wer_min', 'small_wer_min', 'tiny_wer_min',
    'min_wer_across_models', 'winning_model',
    'unique_iterations', 'best_wer_across_iterations', 'best_iteration', 'best_model'
]

print("Sample of data with all new columns:")
# Last expression of the cell: rendered preview of the first 15 rows.
df[cols_to_show].head(15)

Sample of data with all new columns:


Unnamed: 0,participant_id,sentence_id,Iteration,probability,large_wer_min,base_wer_min,medium_wer_min,small_wer_min,tiny_wer_min,min_wer_across_models,winning_model,unique_iterations,best_wer_across_iterations,best_iteration,best_model
0,t45,25_3,L2,low prob,0.2,0.2,0.1,0.2,0.2,0.1,medium,"[L2, R1P2, L1, R1P1]",0.0,L1,large
1,t45,79_2,L1,nonword,0.428571,0.285714,0.428571,0.285714,0.428571,0.285714,base,,,,
2,t45,34_1,L2,high prob,0.0,0.0,0.0,0.0,0.0,0.0,large,"[L2, L1]",0.0,L1,large
3,t45,3_2,L1,high prob,0.0,0.0,0.0,0.0,0.0,0.0,large,"[L1, L2]",0.0,L1,large
4,t45,37_2,L1,low prob,0.125,0.125,0.25,0.25,0.125,0.125,large,"[L1, L2]",0.125,L1,large
5,t45,90_2,L2,high prob,0.0,0.0,0.0,0.0,0.0,0.0,large,"[L2, L1]",0.0,L1,large
6,t45,93_1,L1,nonword,0.166667,0.333333,0.333333,0.333333,0.166667,0.166667,large,,,,
7,t45,82_3,L1,low prob,0.0,0.0,0.0,0.0,0.0,0.0,large,"[L1, L2]",0.0,L1,large
8,t45,43_1,L1,low prob,0.0,0.0,0.0,0.0,0.0,0.0,large,"[L1, L2]",0.0,L1,large
9,t45,52_3,L1,nonword,0.1,0.2,0.2,0.1,0.1,0.1,large,,,,


### Filter for only 'high prob' and 'low prob' (as mentioned in description)

In [10]:
# Create filtered dataframe
# Built from the merged `df`, so unlike df_no_nonword it also carries the
# Step 4 summary columns; .copy() makes it independent of df.
df_filtered = df[df['probability'].isin(['high prob', 'low prob'])].copy()

print(f"Original dataset: {len(df)} rows")
print(f"Filtered dataset (high prob + low prob only): {len(df_filtered)} rows")
print(f"\nProbability distribution in filtered data:")
print(df_filtered['probability'].value_counts())

Original dataset: 51197 rows
Filtered dataset (high prob + low prob only): 34131 rows

Probability distribution in filtered data:
probability
high prob    17066
low prob     17065
Name: count, dtype: int64


### Summary Statistics

In [11]:
# Summary statistics for each model
# These means are taken over every row of `df`, i.e. 'nonword' rows are included.
print("Mean WER by model:")
for model in all_models:
    mean_wer = df[f'{model}_wer_min'].mean()
    print(f"{model}: {mean_wer:.4f}")

# NOTE(review): get_winning_model gives ties to the first model in all_models
# ('large' first), so rows where several models share the lowest WER all count
# towards the earlier-listed model in the distributions below.
print("\nWinning model distribution (all data):")
print(df['winning_model'].value_counts())

print("\nWinning model distribution (filtered - high prob + low prob only):")
print(df_filtered['winning_model'].value_counts())

Mean WER by model:
large: 0.1649
base: 0.2071
medium: 0.1302
small: 0.1563
tiny: 0.2461

Winning model distribution (all data):
winning_model
large     41661
medium     3749
base       3072
small      1762
tiny        953
Name: count, dtype: int64

Winning model distribution (filtered - high prob + low prob only):
winning_model
large     29406
medium     1912
base       1530
small       819
tiny        464
Name: count, dtype: int64


### Verify Summary Counts - Check for 72 participants with 100 high prob + 100 low prob each

In [None]:
# Check the structure of best_iteration_info
# Sanity-checks the Step 4 summary against the expected design of 72
# participants with 100 'high prob' + 100 'low prob' sentences each.
# NOTE(review): the numbers 72, 100, 200 and 14400 are repeated as literals
# throughout this cell.
# NOTE(review): the saved output of this cell contains a "Confirmed: No
# 'nonword' entries" line that no statement below prints, so the stored output
# appears to come from an earlier version of the cell.
print("=" * 70)
print("VERIFICATION: Best Iteration Summary (HIGH PROB + LOW PROB ONLY)")
print("=" * 70)

print(f"\nTotal rows in best_iteration_info: {len(best_iteration_info)}")
print(f"This represents unique participant × sentence combinations (excluding nonword)")

# Count unique participants
unique_participants = best_iteration_info['participant_id'].nunique()
print(f"\nUnique participants: {unique_participants}")

# Check probability distribution in best_iteration_info
print("\n" + "=" * 70)
print("Probability Distribution in Summary")
print("=" * 70)
prob_counts = best_iteration_info['probability'].value_counts()
print(prob_counts)


print(f"\nTotal sentences: {len(best_iteration_info)}")
# NOTE(review): this prints the product and then the same number again as
# literal text ("14400 = 14,400").
print(f"Expected if 72 participants × 200 sentences: {72 * 200} = 14,400")

# Detailed per-participant breakdown
print("\n" + "=" * 70)
print("Per Participant Breakdown")
print("=" * 70)
# Rows: participant_id; columns: one per probability label; cells: number of
# summary rows (0 where a participant has none for a label).
# This table is also read by the "Save Results" cell.
participant_prob_counts = best_iteration_info.groupby(['participant_id', 'probability']).size().unstack(fill_value=0)

# Add total column
participant_prob_counts['total'] = participant_prob_counts.sum(axis=1)

print(f"\nFirst 10 participants:")
print(participant_prob_counts.head(10))

# Summary statistics
print("\n" + "=" * 70)
print("Statistics Across All Participants")
print("=" * 70)

# The column checks guard against a label being entirely absent from the summary.
if 'high prob' in participant_prob_counts.columns:
    high_prob_stats = participant_prob_counts['high prob'].describe()
    print(f"\nHigh prob sentences per participant:")
    print(f"  Mean: {high_prob_stats['mean']:.2f}")
    print(f"  Std: {high_prob_stats['std']:.2f}")
    print(f"  Min: {high_prob_stats['min']:.0f}")
    print(f"  Max: {high_prob_stats['max']:.0f}")
    participants_with_100_high = (participant_prob_counts['high prob'] == 100).sum()
    print(f"  Participants with exactly 100 high prob: {participants_with_100_high}/{unique_participants}")

if 'low prob' in participant_prob_counts.columns:
    low_prob_stats = participant_prob_counts['low prob'].describe()
    print(f"\nLow prob sentences per participant:")
    print(f"  Mean: {low_prob_stats['mean']:.2f}")
    print(f"  Std: {low_prob_stats['std']:.2f}")
    print(f"  Min: {low_prob_stats['min']:.0f}")
    print(f"  Max: {low_prob_stats['max']:.0f}")
    participants_with_100_low = (participant_prob_counts['low prob'] == 100).sum()
    print(f"  Participants with exactly 100 low prob: {participants_with_100_low}/{unique_participants}")

# Check for participants with exactly 100 + 100
print("\n" + "=" * 70)
print("IDEAL PARTICIPANT CHECK: 100 high prob + 100 low prob = 200 total")
print("=" * 70)

if 'high prob' in participant_prob_counts.columns and 'low prob' in participant_prob_counts.columns:
    # Boolean Series, one entry per participant.
    ideal_participants = (
        (participant_prob_counts['high prob'] == 100) & 
        (participant_prob_counts['low prob'] == 100) &
        (participant_prob_counts['total'] == 200)
    )
    num_ideal = ideal_participants.sum()
    print(f"\nParticipants with exactly 100 high prob + 100 low prob: {num_ideal}/{unique_participants}")
    
    if num_ideal == 72:
        print("✓ Perfect! All 72 participants have the expected distribution!")
    elif num_ideal == unique_participants:
        print(f"✓ All {unique_participants} participants have the expected distribution!")
    else:
        print(f"⚠ {unique_participants - num_ideal} participants do NOT have 100+100 distribution")

# Identify problematic participants
print("\n" + "=" * 70)
print("PARTICIPANTS WITHOUT 100 HIGH PROB + 100 LOW PROB")
print("=" * 70)

if 'high prob' in participant_prob_counts.columns and 'low prob' in participant_prob_counts.columns:
    non_ideal_participants = participant_prob_counts[
        (participant_prob_counts['high prob'] != 100) | 
        (participant_prob_counts['low prob'] != 100)
    ]
    
    if len(non_ideal_participants) > 0:
        print(f"\nFound {len(non_ideal_participants)} participants with non-standard counts:\n")
        print(non_ideal_participants)
        
        # Save list of problematic participants
        # (kept in memory only; writing to disk happens in the "Save Results" cell)
        problematic_list = non_ideal_participants.reset_index()[['participant_id', 'high prob', 'low prob', 'total']]
        print("\n" + "=" * 70)
        print("Problematic Participants Summary:")
        for idx, row in problematic_list.iterrows():
            print(f"  {row['participant_id']}: {row['high prob']} high prob, {row['low prob']} low prob (total: {row['total']})")
    else:
        print("\n✓ All participants have exactly 100 high prob + 100 low prob!")

# Final summary
print("\n" + "=" * 70)
print("FINAL SUMMARY")
print("=" * 70)
print(f"Total participants: {unique_participants}")
print(f"Total sentences in summary: {len(best_iteration_info)}")
print(f"Expected (72 × 200): 14,400")
if unique_participants == 72 and len(best_iteration_info) == 14400:
    print("✓ Counts match expectation perfectly!")
elif len(best_iteration_info) == 14400:
    print(f"✓ Sentence count matches! ({unique_participants} participants found)")
else:
    # Signed difference: negative means fewer summary rows than expected.
    diff = len(best_iteration_info) - 14400
    print(f"⚠ Difference from expected: {diff:+d} sentences")
print("=" * 70)

VERIFICATION: Best Iteration Summary (HIGH PROB + LOW PROB ONLY)

Total rows in best_iteration_info: 14382
This represents unique participant × sentence combinations (excluding nonword)

Unique participants: 72

Probability Distribution in Summary
probability
low prob     7191
high prob    7191
Name: count, dtype: int64

✓ Confirmed: No 'nonword' entries in summary (as expected)

Total sentences: 14382
Expected if 72 participants × 200 sentences: 14400 = 14,400

Per Participant Breakdown

First 10 participants:
probability     high prob  low prob  total
participant_id                            
t1                    100       100    200
t10                   100       100    200
t12                   100       100    200
t13                   100       100    200
t15                   100       100    200
t17                   100       100    200
t18                   100       100    200
t19                   100       100    200
t2                    100       100    200
t20       

### Save Results

In [13]:
# Save the full dataset (all rows, including nonword) with every added column
df.to_csv('final_wer_all.csv', index=False)

# Save the filtered dataset (high prob + low prob only)
df_filtered.to_csv('final_wer_high_low_only.csv', index=False)

# Save the summary by participant and sentence (high prob + low prob only)
best_iteration_info.to_csv('final_wer_summary_by_participant_sentence.csv', index=False)
# The message now names the file that is actually written above (it previously
# omitted the 'final_' prefix).
print("Saved summary to: final_wer_summary_by_participant_sentence.csv")
print(f"  (Contains {len(best_iteration_info)} rows = participant × sentence combinations)")
print(f"  (Only includes 'high prob' and 'low prob' sentences - nonword excluded)")

# Save problematic participants list if any exist
# Relies on `participant_prob_counts` built in the verification cell, so that
# cell must have been run before this one.
if 'high prob' in participant_prob_counts.columns and 'low prob' in participant_prob_counts.columns:
    non_ideal = participant_prob_counts[
        (participant_prob_counts['high prob'] != 100) | 
        (participant_prob_counts['low prob'] != 100)
    ]
    if len(non_ideal) > 0:
        # reset_index() turns participant_id back into a regular column so it is written to the CSV
        problematic_df = non_ideal.reset_index()
        problematic_df.to_csv('participants_with_non_standard_counts.csv', index=False)
        print(f"\nSaved problematic participants to: participants_with_non_standard_counts.csv")
        print(f"  ({len(non_ideal)} participants do not have exactly 100 high + 100 low)")
    else:
        print("\n✓ All participants have exactly 100 high prob + 100 low prob!")

print("\nAll files saved successfully!")

Saved summary to: wer_summary_by_participant_sentence.csv
  (Contains 14382 rows = participant × sentence combinations)
  (Only includes 'high prob' and 'low prob' sentences - nonword excluded)

Saved problematic participants to: participants_with_non_standard_counts.csv
  (1 participants do not have exactly 100 high + 100 low)

All files saved successfully!


### New Column Descriptions

**For each model (base, medium, small, tiny):**
- `{model}_wer_list`: List of WER values comparing the model's transcription to the expected transcription followed by each acceptable transcription (so a reference that appears in both is scored twice)
- `{model}_wer_min`: Minimum WER from the list (best match)

**Cross-model comparisons:**
- `min_wer_across_models`: The lowest WER achieved by any model for this row
- `winning_model`: Name of the model that achieved the lowest WER (when several models tie, the first in the order large, base, medium, small, tiny is reported)

**Iteration-level analysis:**
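- `unique_iterations`: List of all iterations for this participant-sentence combination
- `best_wer_across_iterations`: The lowest WER across all iterations and all models
- `best_iteration`: Which iteration achieved the best WER (ties go to the earliest iteration in the order L1, L2, R1P1, R1P2, ...)
- `best_model`: Which model achieved the best WER (in the best iteration)

These four columns are computed from 'high prob' and 'low prob' rows only; they are empty (NaN) on 'nonword' rows.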