# Evaluation Results Analysis

Analyze model performance across classes to identify problematic patterns, random guessing, and bias issues.


In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

warnings.filterwarnings("ignore")

plt.style.use("default")
sns.set_palette("husl")

In [2]:
def load_evaluation_results(jsonl_path):
    """Load evaluation results from JSONL file"""
    results = []
    with open(jsonl_path, "r") as f:
        for line in f:
            if line.strip():
                results.append(json.loads(line))
    return results


def analyze_predictions(results):
    """Analyze prediction patterns and performance"""
    df = pd.DataFrame(
        [
            {
                "target": r["target"],
                "prediction": r["filtered_resps"][0] if r["filtered_resps"] else "",
                "correct": r["exact_match"],
            }
            for r in results
        ]
    )

    return df


def find_problematic_classes(df):
    """Identify classes with poor performance"""
    class_performance = (
        df.groupby("target").agg({"correct": ["count", "sum", "mean"]}).round(3)
    )
    class_performance.columns = ["total", "correct_count", "accuracy"]
    class_performance = class_performance.sort_values("accuracy")

    return class_performance


def find_bias_patterns(df):
    """Find if model is biased towards specific predictions"""
    pred_counts = df["prediction"].value_counts()
    total_predictions = len(df)

    biased_classes = pred_counts[pred_counts > total_predictions * 0.2]

    top_predictions = pred_counts.head(5).index
    bias_analysis = {}

    for pred in top_predictions:
        pred_mask = df["prediction"] == pred
        accuracy = df[pred_mask]["correct"].mean()
        frequency = pred_counts[pred] / total_predictions

        bias_analysis[pred] = {
            "frequency": frequency,
            "accuracy": accuracy,
            "count": pred_counts[pred],
        }

    return bias_analysis


def find_potential_mislabels(df, threshold=0.4):
    """Find classes that are consistently predicted as another class (potential mislabels)"""

    confusion_matrix = pd.crosstab(df['target'], df['prediction'], dropna=False)

    confusion_pct = confusion_matrix.div(confusion_matrix.sum(axis=1), axis=0)

    potential_mislabels = []

    for true_class in confusion_pct.index:
        class_predictions = confusion_pct.loc[true_class]

        wrong_predictions = class_predictions[class_predictions.index != true_class]

        if len(wrong_predictions) > 0:
            max_wrong_pred = wrong_predictions.max()
            max_wrong_class = wrong_predictions.idxmax()

            if max_wrong_pred >= threshold:
                total_samples = confusion_matrix.loc[true_class].sum()
                wrong_count = confusion_matrix.loc[true_class, max_wrong_class]
                correct_count = confusion_matrix.loc[true_class, true_class] if true_class in confusion_matrix.columns else 0

                potential_mislabels.append({
                    'true_label': true_class,
                    'predicted_as': max_wrong_class,
                    'mislabel_rate': max_wrong_pred,
                    'wrong_count': wrong_count,
                    'total_samples': total_samples,
                    'accuracy': (correct_count / total_samples if total_samples > 0 else 0),
                    'risk_level': (
                        '🔴 CRITICAL' if max_wrong_pred > 0.7
                        else '🟠 HIGH RISK' if max_wrong_pred > 0.5
                        else '🟡 MODERATE'
                    )
                })

    mislabel_df = pd.DataFrame(potential_mislabels).sort_values('mislabel_rate', ascending=False)
    return mislabel_df



In [3]:
# Configuration
RESULTS_PATH = (
    "/home/euler/Projects/Forks/lmms-eval/logs/rock_classification_openrouter/gpt-4o/"
)
JSONL_FILE = "20250903_200023_samples_rock_classification.jsonl"

In [4]:
results_file = Path(RESULTS_PATH) / JSONL_FILE
results = load_evaluation_results(results_file)
df = analyze_predictions(results)

print(f"📊 Loaded {len(results)} evaluation samples")
print(f"🎯 Overall accuracy: {df['correct'].mean():.3f}")
print(f"🏷️ Unique targets: {df['target'].nunique()}")
print(f"🔮 Unique predictions: {df['prediction'].nunique()}")

📊 Loaded 3620 evaluation samples
🎯 Overall accuracy: 0.212
🏷️ Unique targets: 362
🔮 Unique predictions: 311


In [5]:
class_perf = find_problematic_classes(df)

print("🔴 Most Problematic Classes (Bottom 10):")
print(class_perf.head(10))

print("\n🟢 Best Performing Classes (Top 10):")
print(class_perf.tail(10))

🔴 Most Problematic Classes (Bottom 10):
                   total  correct_count  accuracy
target                                           
ludlamite             10            0.0       0.0
migmatite             10            0.0       0.0
millerite             10            0.0       0.0
monazite              10            0.0       0.0
mordenite             10            0.0       0.0
mudstone              10            0.0       0.0
nepheline             10            0.0       0.0
nepheline-syenite     10            0.0       0.0
nephelite             10            0.0       0.0
neptunite             10            0.0       0.0

🟢 Best Performing Classes (Top 10):
              total  correct_count  accuracy
target                                      
labradorite      10            9.0       0.9
amazonite        10           10.0       1.0
charoite         10           10.0       1.0
chalk            10           10.0       1.0
emerald          10           10.0       1.0
larimar 

In [6]:
print("🔍 DATASET MISLABELING ANALYSIS")
print("="*50)

mislabels_df = find_potential_mislabels(df, threshold=0.4)

print(f"🚨 Found {len(mislabels_df)} classes with >40% consistent wrong predictions\n")

if len(mislabels_df) > 0:
    print("📋 POTENTIAL DATASET MISLABELS (Scrollable Table):")
    print("="*80)

    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', 30)

    display(mislabels_df)

    pd.reset_option('display.max_rows')
    pd.reset_option('display.max_columns')
    pd.reset_option('display.width')
    pd.reset_option('display.max_colwidth')

    critical_count = (mislabels_df['risk_level'] == '🔴 CRITICAL').sum()
    high_risk_count = (mislabels_df['risk_level'] == '🟠 HIGH RISK').sum()
    moderate_count = (mislabels_df['risk_level'] == '🟡 MODERATE').sum()

    print(f"\n📊 MISLABEL SUMMARY:")
    print(f"• Critical mislabels (>70%): {critical_count}")
    print(f"• High risk mislabels (50-70%): {high_risk_count}")
    print(f"• Moderate mislabels (40-50%): {moderate_count}")
    print(f"• Total suspicious classes: {len(mislabels_df)}")

    if len(mislabels_df) <= 20:
        mislabel_rates = mislabels_df['mislabel_rate'].tolist()

        plt.figure(figsize=(14, max(8, len(mislabels_df) * 0.4)))

        labels = [f"{row['true_label'][:20]}\n→ {row['predicted_as'][:20]}"
                 for _, row in mislabels_df.iterrows()]

        colors = ['red' if 'CRITICAL' in row['risk_level']
                 else 'orange' if 'HIGH RISK' in row['risk_level']
                 else 'yellow' for _, row in mislabels_df.iterrows()]

        plt.barh(range(len(mislabels_df)), mislabel_rates, color=colors, alpha=0.8)
        plt.yticks(range(len(mislabels_df)), labels, fontsize=9)
        plt.xlabel('Mislabel Rate')
        plt.title('Potential Dataset Mislabels (>40% threshold)')
        plt.axvline(x=0.5, color='red', linestyle='--', alpha=0.7, label='50% threshold')
        plt.axvline(x=0.7, color='darkred', linestyle='--', alpha=0.7, label='70% threshold')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

else:
    print("✅ No classes found with >40% consistent wrong predictions")
    print("Dataset appears to have good label quality!")

print("\n" + "="*50)
print("⚖️ PREDICTION BIAS ANALYSIS:")
bias_analysis = find_bias_patterns(df)

for pred_class, stats in bias_analysis.items():
    print(f"\n{pred_class}:")
    print(f"  Frequency: {stats['frequency']:.3f} ({stats['count']} times)")
    print(f"  Accuracy: {stats['accuracy']:.3f}")

    if stats["frequency"] > 0.2 and stats["accuracy"] < 0.5:
        print("  🚨 HIGH BIAS - Low accuracy, high frequency")
    elif stats["frequency"] > 0.3:
        print("  ⚠️ POTENTIAL BIAS - Very high frequency")

🔍 DATASET MISLABELING ANALYSIS
🚨 Found 88 classes with >40% consistent wrong predictions

📋 POTENTIAL DATASET MISLABELS (Scrollable Table):


Unnamed: 0,true_label,predicted_as,mislabel_rate,wrong_count,total_samples,accuracy,risk_level
84,unakite,anyolite,0.9,9,10,0.1,🔴 CRITICAL
41,linarite,azurite,0.9,9,10,0.0,🔴 CRITICAL
45,migmatite,gneiss,0.9,9,10,0.0,🔴 CRITICAL
42,luxullianite,rhodonite,0.8,8,10,0.0,🔴 CRITICAL
13,ceruleite,turquoise,0.8,8,10,0.0,🔴 CRITICAL
74,spessartite-garnet,spessartite-garn,0.7,7,10,0.0,🟠 HIGH RISK
10,brimstone-sulfur,sulfur,0.7,7,10,0.2,🟠 HIGH RISK
65,sapphirine,chalcedony,0.7,7,10,0.0,🟠 HIGH RISK
48,neptunite,benitoite,0.6,6,10,0.0,🟠 HIGH RISK
36,kämmererite,charoite,0.6,6,10,0.0,🟠 HIGH RISK



📊 MISLABEL SUMMARY:
• Critical mislabels (>70%): 5
• High risk mislabels (50-70%): 16
• Moderate mislabels (40-50%): 67
• Total suspicious classes: 88

⚖️ PREDICTION BIAS ANALYSIS:

gneiss:
  Frequency: 0.030 (108 times)
  Accuracy: 0.046

granite-2:
  Frequency: 0.023 (82 times)
  Accuracy: 0.037

anyolite:
  Frequency: 0.022 (81 times)
  Accuracy: 0.099

analcime:
  Frequency: 0.021 (76 times)
  Accuracy: 0.053

azurite:
  Frequency: 0.017 (62 times)
  Accuracy: 0.161


In [7]:
total_classes = df["target"].nunique()
zero_accuracy_classes = (class_perf["accuracy"] == 0).sum()
perfect_accuracy_classes = (class_perf["accuracy"] == 1.0).sum()

print("📋 SUMMARY INSIGHTS:")
print(f"• Overall accuracy: {df['correct'].mean():.3f}")
print(f"• Classes with 0% accuracy: {zero_accuracy_classes}/{total_classes}")
print(f"• Classes with 100% accuracy: {perfect_accuracy_classes}/{total_classes}")
print(
    f"• Most problematic class: {class_perf.index[0]} ({class_perf.iloc[0]['accuracy']:.3f})"
)
print(
    f"• Best performing class: {class_perf.index[-1]} ({class_perf.iloc[-1]['accuracy']:.3f})"
)

if "mislabels_df" in locals() and len(mislabels_df) > 0:
    critical_mislabels = (mislabels_df["mislabel_rate"] > 0.7).sum()
    print(
        f"• Potential mislabeled classes: {len(mislabels_df)} (critical: {critical_mislabels})"
    )
else:
    print("• Potential mislabeled classes: 0 ✅")

print("\n💡 RECOMMENDATIONS:")

if "mislabels_df" in locals() and len(mislabels_df) > 0:
    critical_count = (mislabels_df["mislabel_rate"] > 0.7).sum()
    if critical_count > 0:
        print(
            f"🚨 URGENT: {critical_count} classes show >70% wrong predictions - check dataset labels!"
        )
    print("• Review and correct mislabeled samples in dataset")
    print("• Consider relabeling or removing problematic classes")

if zero_accuracy_classes > total_classes * 0.1:
    print(
        "• High number of zero-accuracy classes - consider data quality or model training"
    )

if any(stats["frequency"] > 0.3 for stats in bias_analysis.values()):
    print("• Strong prediction bias detected - model may need rebalancing")

if df["correct"].mean() < 0.3:
    print("• Low overall accuracy - consider prompt engineering or model selection")

📋 SUMMARY INSIGHTS:
• Overall accuracy: 0.212
• Classes with 0% accuracy: 180/362
• Classes with 100% accuracy: 9/362
• Most problematic class: ludlamite (0.000)
• Best performing class: crocoite (1.000)
• Potential mislabeled classes: 88 (critical: 5)

💡 RECOMMENDATIONS:
🚨 URGENT: 5 classes show >70% wrong predictions - check dataset labels!
• Review and correct mislabeled samples in dataset
• Consider relabeling or removing problematic classes
• High number of zero-accuracy classes - consider data quality or model training
• Low overall accuracy - consider prompt engineering or model selection


## Reusable Configuration

To analyze other datasets:

1. **Update paths in cell 3:**

   ```python
   RESULTS_PATH = "path/to/your/results"
   JSONL_FILE = "your_results.jsonl"
   ```

2. **Adjust thresholds in analysis functions:**

   - `find_bias_patterns()` - bias threshold (default 20%)

3. **Modify visualization parameters:**
   - Number of classes shown in plots
   - Color schemes and figure sizes
