From 1728cd9a97adfc62050de2b506b7be4c2717dd61 Mon Sep 17 00:00:00 2001
From: Nick Galluzzo <n.galluzzo@gmail.com>
Date: Mon, 4 Aug 2025 19:26:04 +0700
Subject: [PATCH 1/5] feat(evaluation): add benchmark stability testing for
 commit message evaluation

- Add new CLI command `benchmark_stability` to evaluate LLM consistency
- Implement `EvaluationBenchmarks` class with statistical analysis
- Add  test suite for benchmark functionality
- Enable variance-based stability assessment with configurable thresholds
---
 src/diffmage/cli/reports.py           |  49 ++++
 src/diffmage/evaluation/benchmarks.py | 161 +++++++++++
 tests/evaluation/test_benchmarks.py   | 399 ++++++++++++++++++++++++++
 3 files changed, 609 insertions(+)
 create mode 100644 src/diffmage/evaluation/benchmarks.py
 create mode 100644 tests/evaluation/test_benchmarks.py

diff --git a/src/diffmage/cli/reports.py b/src/diffmage/cli/reports.py
index 9ff6696..ea24ff4 100644
--- a/src/diffmage/cli/reports.py
+++ b/src/diffmage/cli/reports.py
@@ -1,7 +1,10 @@
 import typer
 from diffmage.cli.shared import app, console
+from diffmage.ai.models import get_default_model
 from diffmage.evaluation.evaluation_report import EvaluationReport
 from diffmage.evaluation.service import EvaluationService
+from diffmage.evaluation.benchmarks import EvaluationBenchmarks
+from diffmage.evaluation.commit_message_evaluator import CommitMessageEvaluator
 
 
 @app.command()
@@ -49,3 +52,49 @@ def batch_report(
     except Exception as e:
         console.print(f"[red]Error generating report: {e}[/red]")
         raise typer.Exit(1)
+
+
+@app.command()
+def benchmark_stability(
+    message: str = typer.Argument(..., help="Commit message to evaluate"),
+    commit_hash: str = typer.Option(None, "--commit", "-c", help="Use commit's diff"),
+    runs: int = typer.Option(2, "--runs", "-n", help="Number of runs to perform"),
+    model_name: str = typer.Option(
+        None, "--model", "-m", help="AI model to use for evaluation"
+    ),
+    repo_path: str = typer.Option(
+        ".", "--repo-path", "-r", help="Path to git repository"
+    ),
+) -> None:
+    """Evaluate the stability of a commit message"""
+
+    try:
+        from diffmage.git.diff_parser import GitDiffParser
+
+        parser = GitDiffParser(repo_path)
+        if commit_hash:
+            analysis, diff = parser.parse_specific_commit(commit_hash)
+        else:
+            analysis = parser.parse_staged_changes()
+            diff = analysis.get_combined_diff()
+
+        if not model_name:
+            model_name = get_default_model().name
+
+        evaluator = CommitMessageEvaluator(model_name)
+        benchmarks = EvaluationBenchmarks(evaluator)
+
+        result = benchmarks.stability_test(message, diff, runs, variance_threshold=0.2)
+
+        if result["is_stable"]:
+            console.print(
+                f"\n[green]✅ Evaluator is STABLE (max variance: {result['max_variance']:.2f})[/green]"
+            )
+        else:
+            console.print(
+                f"\n[red]⚠️ Evaluator is UNSTABLE (max variance: {result['max_variance']:.2f})[/red]"
+            )
+
+    except Exception as e:
+        console.print(f"[red]Error evaluating stability: {e}[/red]")
+        raise typer.Exit(1)
diff --git a/src/diffmage/evaluation/benchmarks.py b/src/diffmage/evaluation/benchmarks.py
new file mode 100644
index 0000000..be569de
--- /dev/null
+++ b/src/diffmage/evaluation/benchmarks.py
@@ -0,0 +1,161 @@
+import time
+import statistics
+from datetime import datetime
+from rich.table import Table
+from rich.console import Console
+from diffmage.evaluation.commit_message_evaluator import CommitMessageEvaluator
+from rich.progress import Progress
+
+
+class EvaluationBenchmarks:
+    """
+    Benchmarking and validation tools for LLM based commit message evaluation.
+    """
+
+    def __init__(self, evaluator: CommitMessageEvaluator):
+        self.console = Console()
+        self.evaluator = evaluator
+
+    def stability_test(
+        self, message: str, diff: str, runs: int = 3, variance_threshold: float = 0.2
+    ) -> dict:
+        """
+        Run a stability test on the evaluator.
+        """
+        if not message or not diff:
+            raise ValueError("Message and diff are required for stability test")
+
+        results = []
+        execution_times = []
+        with Progress(console=self.console) as progress:
+            task = progress.add_task("Evaluating...", total=runs)
+            for run in range(runs):
+                start_time = time.time()
+                result = self.evaluator.evaluate_commit_message(message, diff)
+
+                execution_time = time.time() - start_time
+                execution_times.append(execution_time)
+
+                run_data = {
+                    "run": run + 1,
+                    "what_score": result.what_score,
+                    "why_score": result.why_score,
+                    "overall_score": result.overall_score,
+                    "confidence": result.confidence,
+                    "execution_time": execution_time,
+                }
+                results.append(run_data)
+
+                progress.update(task, advance=1)
+                self.console.print(
+                    f"   Run {run + 1}: WHAT={result.what_score:.1f}, WHY={result.why_score:.1f}, OVERALL={result.overall_score:.1f} completed in {execution_time}s"
+                )
+
+        stats = self._calculate_statistics(results, execution_times)
+        max_variance = self._determine_stability(stats)
+        is_stable = max_variance <= variance_threshold
+
+        self._display_stability_results(stats, is_stable, variance_threshold)
+
+        return {
+            "message": message,
+            "runs": runs,
+            "results": results,
+            "statistics": stats,
+            "is_stable": is_stable,
+            "max_variance": max_variance,
+            "variance_threshold": variance_threshold,
+            "timestamp": datetime.now().isoformat(),
+        }
+
+    def _calculate_statistics(
+        self, results: list[dict], execution_times: list[float]
+    ) -> dict:
+        """
+        Calculate statistics for the results and execution times.
+        """
+        what_scores = [r["what_score"] for r in results]
+        why_scores = [r["why_score"] for r in results]
+        overall_scores = [r["overall_score"] for r in results]
+
+        return {
+            "what": self._calculate_score_variance(what_scores),
+            "why": self._calculate_score_variance(why_scores),
+            "overall": self._calculate_score_variance(overall_scores),
+            "execution_time": {
+                "mean": statistics.mean(execution_times),
+                "std": statistics.stdev(execution_times)
+                if len(execution_times) > 1
+                else 0,
+                "min": min(execution_times),
+                "max": max(execution_times),
+            },
+        }
+
+    def _determine_stability(self, stats: dict) -> float:
+        """
+        Determine stability based on the statistics.
+        """
+        return max(
+            stats["what"]["range"], stats["why"]["range"], stats["overall"]["range"]
+        )
+
+    def _calculate_score_variance(self, scores: list[float]) -> dict[str, float]:
+        """
+        Calculate variance and other statistics for a list of scores.
+        """
+        if not scores:
+            return {}
+
+        return {
+            "mean": statistics.mean(scores),
+            "median": statistics.median(scores),
+            "std": statistics.stdev(scores) if len(scores) > 1 else 0,
+            "min": min(scores),
+            "max": max(scores),
+            "range": max(scores) - min(scores),
+        }
+
+    def _display_stability_results(
+        self, stats: dict, is_stable: bool, threshold: float
+    ) -> None:
+        """
+        Display the stability test results.
+        """
+
+        table = Table(title="Stability Test Results", show_header=True)
+        table.add_column("Dimension", style="cyan")
+        table.add_column("Mean", justify="center")
+        table.add_column("Std Dev", justify="center")
+        table.add_column("Range", justify="center")
+        table.add_column("Status", justify="center")
+
+        for dimension in ["what", "why", "overall"]:
+            stat = stats[dimension]
+            range_val = stat["range"]
+            status = "✅ Stable" if range_val <= threshold else "⚠️ Unstable"
+            status_color = "green" if range_val <= threshold else "red"
+
+            table.add_row(
+                dimension.upper(),
+                f"{stat['mean']:.2f}",
+                f"{stat['std']:.2f}",
+                f"{range_val:.2f}",
+                f"[{status_color}]{status}[/{status_color}]",
+            )
+
+        self.console.print(table)
+
+        # Overall assessment
+        overall_color = "green" if is_stable else "red"
+        overall_status = "STABLE" if is_stable else "UNSTABLE"
+
+        self.console.print(
+            f"\n[{overall_color}]Overall Assessment: {overall_status} (threshold: ±{threshold})[/{overall_color}]"
+        )
+
+        # Performance info
+        exec_stats = stats["execution_time"]
+        self.console.print(
+            f"\n[dim]Average execution time: {exec_stats['mean']:.2f}s (±{exec_stats['std']:.2f}s)[/dim]"
+        )
diff --git a/tests/evaluation/test_benchmarks.py b/tests/evaluation/test_benchmarks.py
new file mode 100644
index 0000000..65c7e12
--- /dev/null
+++ b/tests/evaluation/test_benchmarks.py
@@ -0,0 +1,399 @@
+import pytest
+from rich.progress import Progress
+from diffmage.evaluation.benchmarks import EvaluationBenchmarks
+from diffmage.evaluation.commit_message_evaluator import CommitMessageEvaluator
+from rich.console import Console
+from unittest.mock import Mock, patch
+
+
+@pytest.fixture
+def evaluator():
+    """Fixture providing CommitMessageEvaluator instance"""
+    return CommitMessageEvaluator()
+
+
+@pytest.fixture
+def benchmarks(evaluator):
+    """Fixture providing EvaluationBenchmarks instance"""
+    return EvaluationBenchmarks(evaluator)
+
+
+class TestEvaluationBenchmarks:
+    """Test cases for EvaluationBenchmarks model."""
+
+    def test_evaluation_benchmarks_initialization(self, benchmarks):
+        """Test successful creation of EvaluationBenchmarks."""
+
+        assert benchmarks is not None
+        assert benchmarks.evaluator is not None
+        assert isinstance(benchmarks.evaluator, CommitMessageEvaluator)
+        assert isinstance(benchmarks.console, Console)
+
+    def test_stability_test_with_empty_message_raises_error(self, benchmarks):
+        """Test stability test with empty message raises error"""
+        with pytest.raises(
+            ValueError, match="Message and diff are required for stability test"
+        ):
+            benchmarks.stability_test(message="", diff="some diff")
+
+    def test_stability_test_with_none_raises_error(self, benchmarks):
+        """Test stability test with none raises error"""
+        with pytest.raises(
+            ValueError, match="Message and diff are required for stability test"
+        ):
+            benchmarks.stability_test(message=None, diff=None)
+
+    def test_stability_test_with_empty_diff_raises_error(self, benchmarks):
+        """Test stability test with empty diff raises error"""
+        with pytest.raises(
+            ValueError, match="Message and diff are required for stability test"
+        ):
+            benchmarks.stability_test(message="some message", diff="")
+
+    def test_stability_test_calls_evaluator_correct_number_of_times(self, benchmarks):
+        """Test stability test calls evaluator correct number of times"""
+        message = "some message"
+        diff = "some diff"
+        runs = 3
+
+        mock_result = Mock()
+        mock_result.what_score = 4.0
+        mock_result.why_score = 3.5
+        mock_result.overall_score = 3.8
+        mock_result.confidence = 0.8
+
+        with patch.object(
+            benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result
+        ) as mock_evaluate:
+            result = benchmarks.stability_test(message=message, diff=diff, runs=runs)
+
+            assert mock_evaluate.call_count == runs
+            assert result["runs"] == runs
+            assert len(result["results"]) == runs
+
+    def test_stability_test_calculates_statistics_correctly(self, benchmarks):
+        """Test stability test calculates statistics correctly"""
+        message = "some message"
+        diff = "some diff"
+
+        mock_result = Mock()
+        mock_result.what_score = 4.0
+        mock_result.why_score = 3.5
+        mock_result.overall_score = 3.8
+        mock_result.confidence = 0.8
+
+        with patch.object(
+            benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result
+        ):
+            result = benchmarks.stability_test(message=message, diff=diff, runs=2)
+
+            stats = result["statistics"]
+
+            assert "what" in stats
+            assert "why" in stats
+            assert "overall" in stats
+            assert "execution_time" in stats
+
+    @patch("time.time")
+    @patch.object(Progress, "__enter__")
+    @patch.object(Progress, "__exit__")
+    def test_stability_test_calculates_execution_time_correctly(
+        self, mock_exit, mock_enter, mock_time, benchmarks
+    ):
+        """Test stability test calculates execution time correctly"""
+
+        mock_time.side_effect = [1000.0, 1000.5, 1001.0, 1001.3]
+
+        mock_progress = Mock()
+        mock_task = Mock()
+        mock_progress.add_task.return_value = mock_task
+        mock_enter.return_value = mock_progress
+
+        mock_result1 = Mock(
+            what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8
+        )
+        mock_result2 = Mock(
+            what_score=4.5, why_score=3.5, overall_score=4.0, confidence=0.9
+        )
+
+        with patch.object(
+            benchmarks.evaluator,
+            "evaluate_commit_message",
+            side_effect=[mock_result1, mock_result2],
+        ):
+            result = benchmarks.stability_test("test message", "test diff", runs=2)
+
+            assert len(result["results"]) == 2
+            assert result["results"][0]["execution_time"] == pytest.approx(0.5)
+            assert result["results"][1]["execution_time"] == pytest.approx(0.3)
+
+            assert "execution_time" in result["statistics"]
+            assert result["statistics"]["execution_time"]["mean"] == pytest.approx(0.4)
+
+    def test_stability_test_is_stable_returns_true_if_variance_is_less_than_threshold(
+        self, benchmarks
+    ):
+        """Test stability test is stable if variance is less than threshold"""
+
+        mock_result1 = Mock(
+            what_score=4.1, why_score=3.0, overall_score=3.5, confidence=0.8
+        )
+        mock_result2 = Mock(
+            what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8
+        )
+
+        with patch.object(
+            benchmarks.evaluator,
+            "evaluate_commit_message",
+            side_effect=[mock_result1, mock_result2],
+        ):
+            result = benchmarks.stability_test(
+                "test message", "test diff", runs=2, variance_threshold=0.2
+            )
+            assert result["is_stable"] is True
+            assert result["max_variance"] == pytest.approx(0.1)
+            assert result["variance_threshold"] == 0.2
+
+    def test_stability_test_is_stable_returns_false_if_variance_is_greater_than_threshold(
+        self, benchmarks
+    ):
+        """Test stability test is stable if variance is less than threshold"""
+
+        mock_result1 = Mock(
+            what_score=2.5, why_score=2.0, overall_score=2.5, confidence=0.8
+        )
+        mock_result2 = Mock(
+            what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8
+        )
+
+        with patch.object(
+            benchmarks.evaluator,
+            "evaluate_commit_message",
+            side_effect=[mock_result1, mock_result2],
+        ):
+            result = benchmarks.stability_test(
+                "test message", "test diff", runs=2, variance_threshold=0.2
+            )
+            assert result["is_stable"] is False
+            assert result["max_variance"] == pytest.approx(1.5)
+            assert result["variance_threshold"] == 0.2
+
+    def test_stability_test_returns_variance_threshold_as_is(self, benchmarks):
+        """Test stability test returns variance threshold as is"""
+        with patch.object(
+            benchmarks.evaluator,
+            "evaluate_commit_message",
+            side_effect=[
+                Mock(what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8),
+                Mock(what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8),
+            ],
+        ):
+            result = benchmarks.stability_test(
+                "test message", "test diff", runs=2, variance_threshold=0.2
+            )
+            assert result["variance_threshold"] == 0.2
+
+    def test_stability_test_returns_expected_structure_and_values(self, benchmarks):
+        """Test stability test returns expected structure with correct values."""
+        mock_result = Mock(
+            what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8
+        )
+
+        with (
+            patch.object(
+                benchmarks.evaluator,
+                "evaluate_commit_message",
+                return_value=mock_result,
+            ),
+            patch("diffmage.evaluation.benchmarks.Progress"),
+        ):
+            result = benchmarks.stability_test("test message", "test diff", runs=3)
+
+            # Test structure exists
+            assert result is not None
+            assert "message" in result
+            assert "runs" in result
+            assert "results" in result
+            assert "statistics" in result
+            assert "is_stable" in result
+            assert "max_variance" in result
+            assert "variance_threshold" in result
+            assert "timestamp" in result
+
+            # Test basic values are correct
+            assert result["message"] == "test message"
+            assert result["runs"] == 3
+            assert len(result["results"]) == 3
+            assert isinstance(result["is_stable"], bool)
+            assert result["variance_threshold"] == 0.2  # default value
+            assert isinstance(result["timestamp"], str)
+
+    def test_stability_test_processes_results_correctly(self, benchmarks):
+        """Test that individual run results are processed correctly."""
+        mock_result = Mock(
+            what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8
+        )
+
+        with (
+            patch.object(
+                benchmarks.evaluator,
+                "evaluate_commit_message",
+                return_value=mock_result,
+            ),
+            patch("diffmage.evaluation.benchmarks.Progress"),
+        ):
+            result = benchmarks.stability_test("test", "diff", runs=2)
+
+            assert len(result["results"]) == 2
+
+            first_run = result["results"][0]
+            assert first_run["run"] == 1
+            assert first_run["what_score"] == 4.0
+            assert first_run["why_score"] == 3.0
+            assert first_run["overall_score"] == 3.5
+            assert first_run["confidence"] == 0.8
+            assert "execution_time" in first_run
+
+    def test_stability_test_statistics_structure(self, benchmarks):
+        """Test that statistics are calculated and structured correctly."""
+        mock_result = Mock(
+            what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8
+        )
+
+        with (
+            patch.object(
+                benchmarks.evaluator,
+                "evaluate_commit_message",
+                return_value=mock_result,
+            ),
+            patch("diffmage.evaluation.benchmarks.Progress"),
+        ):
+            result = benchmarks.stability_test("test", "diff", runs=2)
+
+            stats = result["statistics"]
+
+            assert "what" in stats
+            assert "why" in stats
+            assert "overall" in stats
+            assert "execution_time" in stats
+
+            for score_type in ["what", "why", "overall"]:
+                assert "mean" in stats[score_type]
+                assert "std" in stats[score_type]
+                assert "min" in stats[score_type]
+                assert "max" in stats[score_type]
+
+    ### Calculate Statistics ####
+
+    def test_calculate_statistics_returns_expected_structure(self, benchmarks):
+        """Test that _calculate_statistics returns expected structure."""
+        results = [
+            {"what_score": 4.0, "why_score": 3.0, "overall_score": 3.5},
+            {"what_score": 4.0, "why_score": 3.0, "overall_score": 3.5},
+        ]
+        execution_times = [0.5, 0.3]
+
+        with patch.object(benchmarks, "_calculate_score_variance", return_value={}):
+            result = benchmarks._calculate_statistics(results, execution_times)
+
+            assert result is not None
+            assert "what" in result
+            assert "why" in result
+            assert "overall" in result
+            assert "execution_time" in result
+
+            assert "mean" in result["execution_time"]
+            assert "std" in result["execution_time"]
+            assert "min" in result["execution_time"]
+            assert "max" in result["execution_time"]
+
+    def test_calculate_statistics_calls_score_variance_for_each_score_type(
+        self, benchmarks
+    ):
+        """Test that _calculate_score_variance is called for each score type."""
+        results = [
+            {"what_score": 4.0, "why_score": 3.0, "overall_score": 3.5},
+            {"what_score": 5.0, "why_score": 4.0, "overall_score": 4.5},
+        ]
+        execution_times = [0.5, 0.3]
+
+        with patch.object(
+            benchmarks, "_calculate_score_variance", return_value={}
+        ) as mock_calc:
+            benchmarks._calculate_statistics(results, execution_times)
+
+            assert mock_calc.call_count == 3
+
+            calls = mock_calc.call_args_list
+            assert calls[0][0][0] == [4.0, 5.0]
+            assert calls[1][0][0] == [3.0, 4.0]
+            assert calls[2][0][0] == [3.5, 4.5]
+
+    def test_calculate_statistics_calculates_execution_time_stats_correctly(
+        self, benchmarks
+    ):
+        """Test execution time statistics are calculated correctly."""
+        results = [{"what_score": 4.0, "why_score": 3.0, "overall_score": 3.5}]
+        execution_times = [0.5, 0.3]
+
+        with patch.object(benchmarks, "_calculate_score_variance", return_value={}):
+            result = benchmarks._calculate_statistics(results, execution_times)
+
+            assert result["execution_time"]["mean"] == pytest.approx(0.4)
+            assert result["execution_time"]["std"] == pytest.approx(0.14142135623)
+            assert result["execution_time"]["min"] == 0.3
+            assert result["execution_time"]["max"] == 0.5
+
+    def test_calculate_statistics_uses_score_variance_results(self, benchmarks):
+        """Test that score variance results are properly integrated."""
+        results = [{"what_score": 4.0, "why_score": 3.0, "overall_score": 3.5}]
+        execution_times = [0.5]
+
+        mock_variance = {
+            "mean": 4.2,
+            "median": 4.0,
+            "std": 0.1,
+            "min": 4.0,
+            "max": 4.4,
+            "range": 0.4,
+        }
+
+        with patch.object(
+            benchmarks, "_calculate_score_variance", return_value=mock_variance
+        ):
+            result = benchmarks._calculate_statistics(results, execution_times)
+
+            assert result["what"] == mock_variance
+            assert result["why"] == mock_variance
+            assert result["overall"] == mock_variance
+
+    #### Score Variance #####
+
+    def test_calculate_score_if_no_scores_returns_empty_dict(self, benchmarks):
+        """Test calculate_score_variance returns empty dict if no scores"""
+        scores = []
+        result = benchmarks._calculate_score_variance(scores)
+        assert result == {}
+
+    def test_calculate_score_variance_returns_expected_structure(self, benchmarks):
+        """Test calculate_score_variance returns expected structure"""
+        scores = [1.0, 2.0, 3.0, 4.0, 5.0]
+        result = benchmarks._calculate_score_variance(scores)
+        assert result is not None
+        assert "mean" in result
+        assert "median" in result
+        assert "std" in result
+        assert "min" in result
+        assert "max" in result
+        assert "range" in result
+
+    def test_calculate_score_variance_returns_correct_values(self, benchmarks):
+        """Test calculate_score_variance returns correct values"""
+        scores = [1.0, 2.0, 3.0, 4.0, 5.0]
+        result = benchmarks._calculate_score_variance(scores)
+        assert result["mean"] == pytest.approx(3.0)
+        assert result["median"] == pytest.approx(3.0)
+        assert result["std"] == pytest.approx(1.58113883008)
+        assert result["min"] == 1.0
+        assert result["max"] == 5.0
+        assert result["range"] == 4.0

From 7bd6a67b51bbfb043996cb7b368594e2e433c1b7 Mon Sep 17 00:00:00 2001
From: Nick Galluzzo <n.galluzzo@gmail.com>
Date: Mon, 4 Aug 2025 19:26:52 +0700
Subject: [PATCH 2/5] refactor(cli): remove redundant model name default
 assignment

---
 src/diffmage/cli/reports.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/diffmage/cli/reports.py b/src/diffmage/cli/reports.py
index ea24ff4..7de9342 100644
--- a/src/diffmage/cli/reports.py
+++ b/src/diffmage/cli/reports.py
@@ -1,6 +1,5 @@
 import typer
 from diffmage.cli.shared import app, console
-from diffmage.ai.models import get_default_model
 from diffmage.evaluation.evaluation_report import EvaluationReport
 from diffmage.evaluation.service import EvaluationService
 from diffmage.evaluation.benchmarks import EvaluationBenchmarks
@@ -78,9 +77,6 @@ def benchmark_stability(
             analysis = parser.parse_staged_changes()
             diff = analysis.get_combined_diff()
 
-        if not model_name:
-            model_name = get_default_model().name
-
         evaluator = CommitMessageEvaluator(model_name)
         benchmarks = EvaluationBenchmarks(evaluator)
 

From f1c96f8e5d351153016a1d107ec2cd4685121c00 Mon Sep 17 00:00:00 2001
From: Nick Galluzzo <n.galluzzo@gmail.com>
Date: Tue, 5 Aug 2025 10:26:48 +0700
Subject: [PATCH 3/5] test(evaluation): add comprehensive stability test cases
 for benchmark evaluation

---
 tests/evaluation/test_benchmarks.py | 240 ++++++++++++++++++++++++++++
 1 file changed, 240 insertions(+)

diff --git a/tests/evaluation/test_benchmarks.py b/tests/evaluation/test_benchmarks.py
index 65c7e12..942d5fd 100644
--- a/tests/evaluation/test_benchmarks.py
+++ b/tests/evaluation/test_benchmarks.py
@@ -1,4 +1,5 @@
 import pytest
+import time
 from rich.progress import Progress
 from diffmage.evaluation.benchmarks import EvaluationBenchmarks
 from diffmage.evaluation.commit_message_evaluator import CommitMessageEvaluator
@@ -397,3 +398,242 @@ def test_calculate_score_variance_returns_correct_values(self, benchmarks):
         assert result["min"] == 1.0
         assert result["max"] == 5.0
         assert result["range"] == 4.0
+
+    def test_stability_test_single_run_handles_stdev_gracefully(self, benchmarks):
+        """Test single run doesn't crash when calculating standard deviation"""
+        mock_result = Mock(
+            what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8
+        )
+
+        with patch.object(
+            benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result
+        ):
+            result = benchmarks.stability_test("test", "diff", runs=1)
+
+            assert result["statistics"]["execution_time"]["std"] == 0
+            assert result["statistics"]["what"]["std"] == 0
+            assert result["statistics"]["why"]["std"] == 0
+            assert result["statistics"]["overall"]["std"] == 0
+
+    def test_stability_test_zero_variance_threshold_requires_perfect_stability(
+        self, benchmarks
+    ):
+        """Test zero variance threshold requires perfect stability"""
+        mock_result1 = Mock(
+            what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8
+        )
+        mock_result2 = Mock(
+            what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8
+        )
+
+        with patch.object(
+            benchmarks.evaluator,
+            "evaluate_commit_message",
+            side_effect=[mock_result1, mock_result2],
+        ):
+            result = benchmarks.stability_test(
+                "test", "diff", runs=2, variance_threshold=0.0
+            )
+            assert result["is_stable"] is True
+            assert result["max_variance"] == 0.0
+
+        mock_result3 = Mock(
+            what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8
+        )
+        mock_result4 = Mock(
+            what_score=4.1, why_score=3.0, overall_score=3.5, confidence=0.8
+        )
+
+        with patch.object(
+            benchmarks.evaluator,
+            "evaluate_commit_message",
+            side_effect=[mock_result3, mock_result4],
+        ):
+            result = benchmarks.stability_test(
+                "test", "diff", runs=2, variance_threshold=0.0
+            )
+            assert result["is_stable"] is False
+            assert result["max_variance"] == pytest.approx(0.1)
+
+    def test_stability_test_negative_variance_threshold_allowed(self, benchmarks):
+        """Test negative variance threshold is allowed (though not practical)"""
+        mock_result = Mock(
+            what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8
+        )
+
+        with patch.object(
+            benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result
+        ):
+            result = benchmarks.stability_test(
+                "test", "diff", runs=2, variance_threshold=-0.1
+            )
+            assert result["variance_threshold"] == -0.1
+            assert result["is_stable"] is False
+
+    def test_stability_test_evaluator_exception_propagates(self, benchmarks):
+        """Test evaluator exceptions are handled appropriately"""
+        with patch.object(
+            benchmarks.evaluator,
+            "evaluate_commit_message",
+            side_effect=RuntimeError("LLM API error"),
+        ):
+            with pytest.raises(RuntimeError, match="LLM API error"):
+                benchmarks.stability_test("test", "diff", runs=2)
+
+    def test_stability_test_evaluator_exception_on_second_run_propagates(
+        self, benchmarks
+    ):
+        """Test evaluator exception on second run still propagates"""
+        mock_result = Mock(
+            what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8
+        )
+
+        with patch.object(
+            benchmarks.evaluator,
+            "evaluate_commit_message",
+            side_effect=[mock_result, RuntimeError("Network timeout")],
+        ):
+            with pytest.raises(RuntimeError, match="Network timeout"):
+                benchmarks.stability_test("test", "diff", runs=2)
+
+    def test_calculate_score_variance_single_score_returns_zero_std(self, benchmarks):
+        """Test single score returns std=0 without crashing"""
+        scores = [4.0]
+        result = benchmarks._calculate_score_variance(scores)
+
+        assert result["mean"] == 4.0
+        assert result["median"] == 4.0
+        assert result["std"] == 0
+        assert result["min"] == 4.0
+        assert result["max"] == 4.0
+        assert result["range"] == 0.0
+
+    def test_stability_test_large_runs_parameter_performance(self, benchmarks):
+        """Test large runs parameter doesn't cause performance issues"""
+        mock_result = Mock(
+            what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8
+        )
+
+        with patch.object(
+            benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result
+        ):
+            start_time = time.time()
+            result = benchmarks.stability_test("test", "diff", runs=100)
+            execution_time = time.time() - start_time
+
+            assert len(result["results"]) == 100
+            assert result["runs"] == 100
+            assert execution_time < 5.0
+
+    def test_stability_test_extreme_score_values_at_boundaries(self, benchmarks):
+        """Test stability test with extreme score values at boundaries"""
+        mock_result1 = Mock(
+            what_score=0.0, why_score=0.0, overall_score=0.0, confidence=0.0
+        )
+        mock_result2 = Mock(
+            what_score=5.0, why_score=5.0, overall_score=5.0, confidence=1.0
+        )
+
+        with patch.object(
+            benchmarks.evaluator,
+            "evaluate_commit_message",
+            side_effect=[mock_result1, mock_result2],
+        ):
+            result = benchmarks.stability_test("test", "diff", runs=2)
+
+            assert result["statistics"]["what"]["min"] == 0.0
+            assert result["statistics"]["what"]["max"] == 5.0
+            assert result["statistics"]["what"]["range"] == 5.0
+            assert result["is_stable"] is False  # High variance
+
+    def test_stability_test_negative_score_values(self, benchmarks):
+        """Test stability test with negative score values (outside expected range)"""
+        mock_result = Mock(
+            what_score=-1.0, why_score=-2.0, overall_score=-1.5, confidence=0.8
+        )
+
+        with patch.object(
+            benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result
+        ):
+            result = benchmarks.stability_test("test", "diff", runs=2)
+
+            assert result["statistics"]["what"]["mean"] == -1.0
+            assert result["statistics"]["why"]["mean"] == -2.0
+
+    def test_stability_test_very_high_variance_threshold_always_stable(
+        self, benchmarks
+    ):
+        """Test very high variance threshold makes any result stable"""
+        mock_result1 = Mock(
+            what_score=1.0, why_score=1.0, overall_score=1.0, confidence=0.1
+        )
+        mock_result2 = Mock(
+            what_score=5.0, why_score=5.0, overall_score=5.0, confidence=1.0
+        )
+
+        with patch.object(
+            benchmarks.evaluator,
+            "evaluate_commit_message",
+            side_effect=[mock_result1, mock_result2],
+        ):
+            result = benchmarks.stability_test(
+                "test", "diff", runs=2, variance_threshold=10.0
+            )
+
+            assert result["is_stable"] is True
+            assert result["variance_threshold"] == 10.0
+            assert result["max_variance"] < 10.0
+
+    def test_stability_test_very_long_message_and_diff_strings(self, benchmarks):
+        """Test stability test with very long message and diff strings"""
+        long_message = "A" * 10000  # 10KB message
+        long_diff = "B" * 50000  # 50KB diff
+        mock_result = Mock(
+            what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8
+        )
+
+        with patch.object(
+            benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result
+        ):
+            result = benchmarks.stability_test(long_message, long_diff, runs=2)
+
+            assert result["message"] == long_message
+            assert len(result["results"]) == 2
+            assert result["runs"] == 2
+
+    def test_stability_test_unicode_and_special_characters(self, benchmarks):
+        """Test stability test with unicode and special characters"""
+        unicode_message = "feat: Add 支持中文 and émojis 🚀 with ñiño characters"
+        special_diff = """
+        + print("Hello 世界! 🌍")
+        - console.log('Goodbye 👋')
+        + # TODO: Fix unicode handling ñoño café
+        """
+        mock_result = Mock(
+            what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8
+        )
+
+        with patch.object(
+            benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result
+        ):
+            result = benchmarks.stability_test(unicode_message, special_diff, runs=2)
+
+            assert result["message"] == unicode_message
+            assert len(result["results"]) == 2
+            assert result["runs"] == 2
+
+    def test_stability_test_special_control_characters(self, benchmarks):
+        """Test stability test with control characters and escape sequences"""
+        control_message = "fix: Handle \\n\\t\\r and \\x00 characters"
+        control_diff = "- old\\nline\\twith\\ttabs\n+ new\\nline\\twith\\tspaces"
+        mock_result = Mock(
+            what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8
+        )
+
+        with patch.object(
+            benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result
+        ):
+            result = benchmarks.stability_test(control_message, control_diff, runs=2)
+
+            assert result["message"] == control_message
+            assert len(result["results"]) == 2

From e233a76fa9fb1bab89247108a57030a6c03e29af Mon Sep 17 00:00:00 2001
From: Nick Galluzzo <n.galluzzo@gmail.com>
Date: Tue, 5 Aug 2025 11:02:08 +0700
Subject: [PATCH 4/5] feat(evaluation): Add type hints and improve stability
 test results structure

- Introduce TypedDict classes for better type safety in benchmarking
- Update stability test to return structured TypedDict results
- Improve score variance calculation to handle empty inputs
- Enhance display of stability results with clearer dimension names
---
 src/diffmage/evaluation/benchmarks.py | 82 ++++++++++++++++++++++-----
 tests/evaluation/test_benchmarks.py   | 16 ++++--
 uv.lock                               |  2 +-
 3 files changed, 82 insertions(+), 18 deletions(-)

diff --git a/src/diffmage/evaluation/benchmarks.py b/src/diffmage/evaluation/benchmarks.py
index be569de..622cb22 100644
--- a/src/diffmage/evaluation/benchmarks.py
+++ b/src/diffmage/evaluation/benchmarks.py
@@ -1,3 +1,4 @@
+from typing import TypedDict
 import time
 import statistics
 from datetime import datetime
@@ -7,6 +8,49 @@
 from rich.progress import Progress
 
 
+class ScoreStats(TypedDict):
+    mean: float
+    median: float
+    std: float
+    min: float
+    max: float
+    range: float
+
+
+class ExecutionTimeStats(TypedDict):
+    mean: float
+    std: float
+    min: float
+    max: float
+
+
+class BenchmarkStats(TypedDict):
+    what: ScoreStats
+    why: ScoreStats
+    overall: ScoreStats
+    execution_time: ExecutionTimeStats
+
+
+class RunResult(TypedDict):
+    run: int
+    what_score: float
+    why_score: float
+    overall_score: float
+    confidence: float
+    execution_time: float
+
+
+class StabilityTestResult(TypedDict):
+    message: str
+    runs: int
+    results: list[RunResult]
+    statistics: BenchmarkStats
+    is_stable: bool
+    max_variance: float
+    variance_threshold: float
+    timestamp: str
+
+
 class EvaluationBenchmarks:
     """
     Benchmarking and validation tools for LLM based commit message evaluation.
@@ -18,15 +62,15 @@ def __init__(self, evaluator: CommitMessageEvaluator):
 
     def stability_test(
         self, message: str, diff: str, runs: int = 3, variance_threshold: float = 0.2
-    ) -> dict:
+    ) -> StabilityTestResult:
         """
         Run a stability test on the evaluator.
         """
         if not message or not diff:
             raise ValueError("Message and diff are required for stability test")
 
-        results = []
-        execution_times = []
+        results: list[RunResult] = []
+        execution_times: list[float] = []
         with Progress(console=self.console) as progress:
             task = progress.add_task("Evaluating...", total=runs)
             for run in range(runs):
@@ -36,7 +80,7 @@ def stability_test(
                 execution_time = time.time() - start_time
                 execution_times.append(execution_time)
 
-                run_data = {
+                run_data: RunResult = {
                     "run": run + 1,
                     "what_score": result.what_score,
                     "why_score": result.why_score,
@@ -69,8 +113,8 @@ def stability_test(
         }
 
     def _calculate_statistics(
-        self, results: list[dict], execution_times: list[float]
-    ) -> dict:
+        self, results: list[RunResult], execution_times: list[float]
+    ) -> BenchmarkStats:
         """
         Calculate statistics for the results and execution times.
         """
@@ -92,7 +136,7 @@ def _calculate_statistics(
             },
         }
 
-    def _determine_stability(self, stats: dict) -> float:
+    def _determine_stability(self, stats: BenchmarkStats) -> float:
         """
         Determine stability based on the statistics.
         """
@@ -100,12 +144,19 @@ def _determine_stability(self, stats: dict) -> float:
             stats["what"]["range"], stats["why"]["range"], stats["overall"]["range"]
         )
 
-    def _calculate_score_variance(self, scores: list[float]) -> dict[str, float]:
+    def _calculate_score_variance(self, scores: list[float]) -> ScoreStats:
         """
         Calculate variance and other statistics for a list of scores.
         """
         if not scores:
-            return {}
+            return {
+                "mean": 0.0,
+                "median": 0.0,
+                "std": 0.0,
+                "min": 0.0,
+                "max": 0.0,
+                "range": 0.0,
+            }
 
         return {
             "mean": statistics.mean(scores),
@@ -117,7 +168,7 @@ def _calculate_score_variance(self, scores: list[float]) -> dict[str, float]:
         }
 
     def _display_stability_results(
-        self, stats: dict, is_stable: bool, threshold: float
+        self, stats: BenchmarkStats, is_stable: bool, threshold: float
     ) -> None:
         """
         Display the stability test results.
@@ -130,14 +181,19 @@ def _display_stability_results(
         table.add_column("Range", justify="center")
         table.add_column("Status", justify="center")
 
-        for dimension in ["what", "why", "overall"]:
-            stat = stats[dimension]
+        dimensions_data = [
+            ("WHAT", stats["what"]),
+            ("WHY", stats["why"]),
+            ("OVERALL", stats["overall"]),
+        ]
+
+        for dimension_name, stat in dimensions_data:
             range_val = stat["range"]
             status = "✅ Stable" if range_val <= threshold else "⚠️ Unstable"
             status_color = "green" if range_val <= threshold else "red"
 
             table.add_row(
-                dimension.upper(),
+                dimension_name,
                 f"{stat['mean']:.2f}",
                 f"{stat['std']:.2f}",
                 f"{range_val:.2f}",
diff --git a/tests/evaluation/test_benchmarks.py b/tests/evaluation/test_benchmarks.py
index 942d5fd..419d5c3 100644
--- a/tests/evaluation/test_benchmarks.py
+++ b/tests/evaluation/test_benchmarks.py
@@ -370,11 +370,19 @@ def test_calculate_statistics_uses_score_variance_results(self, benchmarks):
 
     #### Score Variance #####
 
-    def test_calculate_score_if_no_scores_returns_empty_dict(self, benchmarks):
-        """Test calculate_score_variance returns empty dict if no scores"""
-        scores = []
+    def test_calculate_score_if_no_scores_returns_zero_stats(self, benchmarks):
+        """Test calculate_score_variance returns zero stats if no scores"""
+        scores: list[float] = []
         result = benchmarks._calculate_score_variance(scores)
-        assert result == {}
+        expected = {
+            "mean": 0.0,
+            "median": 0.0,
+            "std": 0.0,
+            "min": 0.0,
+            "max": 0.0,
+            "range": 0.0,
+        }
+        assert result == expected
 
     def test_calculate_score_variance_returns_expected_structure(self, benchmarks):
         """Test calculate_score_variance returns expected structure"""
diff --git a/uv.lock b/uv.lock
index 018681c..829e8b0 100644
--- a/uv.lock
+++ b/uv.lock
@@ -314,7 +314,7 @@ wheels = [
 
 [[package]]
 name = "diffmage"
-version = "0.3.2"
+version = "0.5.0"
 source = { editable = "." }
 dependencies = [
     { name = "gitpython" },

From b160807d2ce8cd67dc98842b8d166e7724a2cc00 Mon Sep 17 00:00:00 2001
From: Nick Galluzzo <n.galluzzo@gmail.com>
Date: Tue, 5 Aug 2025 11:02:08 +0700
Subject: [PATCH 5/5] refactor(evaluation): Add type hints and improve
 stability test results structure

- Introduce TypedDict classes for better type safety in benchmarking
- Update stability test to return structured TypedDict results
- Improve score variance calculation to handle empty inputs
- Enhance display of stability results with clearer dimension names
---
 src/diffmage/evaluation/benchmarks.py | 82 ++++++++++++++++++++++-----
 tests/evaluation/test_benchmarks.py   | 16 ++++--
 uv.lock                               |  2 +-
 3 files changed, 82 insertions(+), 18 deletions(-)

diff --git a/src/diffmage/evaluation/benchmarks.py b/src/diffmage/evaluation/benchmarks.py
index be569de..622cb22 100644
--- a/src/diffmage/evaluation/benchmarks.py
+++ b/src/diffmage/evaluation/benchmarks.py
@@ -1,3 +1,4 @@
+from typing import TypedDict
 import time
 import statistics
 from datetime import datetime
@@ -7,6 +8,49 @@
 from rich.progress import Progress
 
 
+class ScoreStats(TypedDict):
+    mean: float
+    median: float
+    std: float
+    min: float
+    max: float
+    range: float
+
+
+class ExecutionTimeStats(TypedDict):
+    mean: float
+    std: float
+    min: float
+    max: float
+
+
+class BenchmarkStats(TypedDict):
+    what: ScoreStats
+    why: ScoreStats
+    overall: ScoreStats
+    execution_time: ExecutionTimeStats
+
+
+class RunResult(TypedDict):
+    run: int
+    what_score: float
+    why_score: float
+    overall_score: float
+    confidence: float
+    execution_time: float
+
+
+class StabilityTestResult(TypedDict):
+    message: str
+    runs: int
+    results: list[RunResult]
+    statistics: BenchmarkStats
+    is_stable: bool
+    max_variance: float
+    variance_threshold: float
+    timestamp: str
+
+
 class EvaluationBenchmarks:
     """
     Benchmarking and validation tools for LLM based commit message evaluation.
@@ -18,15 +62,15 @@ def __init__(self, evaluator: CommitMessageEvaluator):
 
     def stability_test(
         self, message: str, diff: str, runs: int = 3, variance_threshold: float = 0.2
-    ) -> dict:
+    ) -> StabilityTestResult:
         """
         Run a stability test on the evaluator.
         """
         if not message or not diff:
             raise ValueError("Message and diff are required for stability test")
 
-        results = []
-        execution_times = []
+        results: list[RunResult] = []
+        execution_times: list[float] = []
         with Progress(console=self.console) as progress:
             task = progress.add_task("Evaluating...", total=runs)
             for run in range(runs):
@@ -36,7 +80,7 @@ def stability_test(
                 execution_time = time.time() - start_time
                 execution_times.append(execution_time)
 
-                run_data = {
+                run_data: RunResult = {
                     "run": run + 1,
                     "what_score": result.what_score,
                     "why_score": result.why_score,
@@ -69,8 +113,8 @@ def stability_test(
         }
 
     def _calculate_statistics(
-        self, results: list[dict], execution_times: list[float]
-    ) -> dict:
+        self, results: list[RunResult], execution_times: list[float]
+    ) -> BenchmarkStats:
         """
         Calculate statistics for the results and execution times.
         """
@@ -92,7 +136,7 @@ def _calculate_statistics(
             },
         }
 
-    def _determine_stability(self, stats: dict) -> float:
+    def _determine_stability(self, stats: BenchmarkStats) -> float:
         """
         Determine stability based on the statistics.
         """
@@ -100,12 +144,19 @@ def _determine_stability(self, stats: dict) -> float:
             stats["what"]["range"], stats["why"]["range"], stats["overall"]["range"]
         )
 
-    def _calculate_score_variance(self, scores: list[float]) -> dict[str, float]:
+    def _calculate_score_variance(self, scores: list[float]) -> ScoreStats:
         """
         Calculate variance and other statistics for a list of scores.
         """
         if not scores:
-            return {}
+            return {
+                "mean": 0.0,
+                "median": 0.0,
+                "std": 0.0,
+                "min": 0.0,
+                "max": 0.0,
+                "range": 0.0,
+            }
 
         return {
             "mean": statistics.mean(scores),
@@ -117,7 +168,7 @@ def _calculate_score_variance(self, scores: list[float]) -> dict[str, float]:
         }
 
     def _display_stability_results(
-        self, stats: dict, is_stable: bool, threshold: float
+        self, stats: BenchmarkStats, is_stable: bool, threshold: float
     ) -> None:
         """
         Display the stability test results.
@@ -130,14 +181,19 @@ def _display_stability_results(
         table.add_column("Range", justify="center")
         table.add_column("Status", justify="center")
 
-        for dimension in ["what", "why", "overall"]:
-            stat = stats[dimension]
+        dimensions_data = [
+            ("WHAT", stats["what"]),
+            ("WHY", stats["why"]),
+            ("OVERALL", stats["overall"]),
+        ]
+
+        for dimension_name, stat in dimensions_data:
             range_val = stat["range"]
             status = "✅ Stable" if range_val <= threshold else "⚠️ Unstable"
             status_color = "green" if range_val <= threshold else "red"
 
             table.add_row(
-                dimension.upper(),
+                dimension_name,
                 f"{stat['mean']:.2f}",
                 f"{stat['std']:.2f}",
                 f"{range_val:.2f}",
diff --git a/tests/evaluation/test_benchmarks.py b/tests/evaluation/test_benchmarks.py
index 942d5fd..419d5c3 100644
--- a/tests/evaluation/test_benchmarks.py
+++ b/tests/evaluation/test_benchmarks.py
@@ -370,11 +370,19 @@ def test_calculate_statistics_uses_score_variance_results(self, benchmarks):
 
     #### Score Variance #####
 
-    def test_calculate_score_if_no_scores_returns_empty_dict(self, benchmarks):
-        """Test calculate_score_variance returns empty dict if no scores"""
-        scores = []
+    def test_calculate_score_if_no_scores_returns_zero_stats(self, benchmarks):
+        """Test calculate_score_variance returns zero stats if no scores"""
+        scores: list[float] = []
         result = benchmarks._calculate_score_variance(scores)
-        assert result == {}
+        expected = {
+            "mean": 0.0,
+            "median": 0.0,
+            "std": 0.0,
+            "min": 0.0,
+            "max": 0.0,
+            "range": 0.0,
+        }
+        assert result == expected
 
     def test_calculate_score_variance_returns_expected_structure(self, benchmarks):
         """Test calculate_score_variance returns expected structure"""
diff --git a/uv.lock b/uv.lock
index 018681c..829e8b0 100644
--- a/uv.lock
+++ b/uv.lock
@@ -314,7 +314,7 @@ wheels = [
 
 [[package]]
 name = "diffmage"
-version = "0.3.2"
+version = "0.5.0"
 source = { editable = "." }
 dependencies = [
     { name = "gitpython" },