From 1728cd9a97adfc62050de2b506b7be4c2717dd61 Mon Sep 17 00:00:00 2001 From: Nick Galluzzo Date: Mon, 4 Aug 2025 19:26:04 +0700 Subject: [PATCH 1/5] feat(evaluation): add benchmark stability testing for commit message evaluation - Add new CLI command `benchmark_stability` to evaluate LLM consistency - Implement `EvaluationBenchmarks` class with statistical analysis - Add test suite for benchmark functionality - Enable variance-based stability assessment with configurable thresholds --- src/diffmage/cli/reports.py | 49 ++++ src/diffmage/evaluation/benchmarks.py | 161 +++++++++++ tests/evaluation/test_benchmarks.py | 399 ++++++++++++++++++++++++++ 3 files changed, 609 insertions(+) create mode 100644 src/diffmage/evaluation/benchmarks.py create mode 100644 tests/evaluation/test_benchmarks.py diff --git a/src/diffmage/cli/reports.py b/src/diffmage/cli/reports.py index 9ff6696..ea24ff4 100644 --- a/src/diffmage/cli/reports.py +++ b/src/diffmage/cli/reports.py @@ -1,7 +1,10 @@ import typer from diffmage.cli.shared import app, console +from diffmage.ai.models import get_default_model from diffmage.evaluation.evaluation_report import EvaluationReport from diffmage.evaluation.service import EvaluationService +from diffmage.evaluation.benchmarks import EvaluationBenchmarks +from diffmage.evaluation.commit_message_evaluator import CommitMessageEvaluator @app.command() @@ -49,3 +52,49 @@ def batch_report( except Exception as e: console.print(f"[red]Error generating report: {e}[/red]") raise typer.Exit(1) + + +@app.command() +def benchmark_stability( + message: str = typer.Argument(..., help="Commit message to evaluate"), + commit_hash: str = typer.Option(None, "--commit", "-c", help="Use commit's diff"), + runs: int = typer.Option(2, "--runs", "-n", help="Number of runs to perform"), + model_name: str = typer.Option( + None, "--model", "-m", help="AI model to use for evaluation" + ), + repo_path: str = typer.Option( + ".", "--repo-path", "-r", help="Path to git repository" + ), +) -> None: + """Evaluate the stability of a commit message""" + + try: + from diffmage.git.diff_parser import GitDiffParser + + parser = GitDiffParser(repo_path) + if commit_hash: + analysis, diff = parser.parse_specific_commit(commit_hash) + else: + analysis = parser.parse_staged_changes() + diff = analysis.get_combined_diff() + + if not model_name: + model_name = get_default_model().name + + evaluator = CommitMessageEvaluator(model_name) + benchmarks = EvaluationBenchmarks(evaluator) + + result = benchmarks.stability_test(message, diff, runs, variance_threshold=0.2) + + if result["is_stable"]: + console.print( + f"\n[green]✅ Evaluator is STABLE (max variance: {result['max_variance']:.2f})[/green]" + ) + else: + console.print( + f"\n[red]⚠️ Evaluator is UNSTABLE (max variance: {result['max_variance']:.2f})[/red]" + ) + + except Exception as e: + console.print(f"[red]Error evaluating stability: {e}[/red]") + raise typer.Exit(1) diff --git a/src/diffmage/evaluation/benchmarks.py b/src/diffmage/evaluation/benchmarks.py new file mode 100644 index 0000000..be569de --- /dev/null +++ b/src/diffmage/evaluation/benchmarks.py @@ -0,0 +1,161 @@ +import time +import statistics +from datetime import datetime +from rich.table import Table +from rich.console import Console +from diffmage.evaluation.commit_message_evaluator import CommitMessageEvaluator +from rich.progress import Progress + + +class EvaluationBenchmarks: + """ + Benchmarking and validation tools for LLM based commit message evaluation. + """ + + def __init__(self, evaluator: CommitMessageEvaluator): + self.console = Console() + self.evaluator = evaluator + + def stability_test( + self, message: str, diff: str, runs: int = 3, variance_threshold: float = 0.2 + ) -> dict: + """ + Run a stability test on the evaluator. + """ + if not message or not diff: + raise ValueError("Message and diff are required for stability test") + + results = [] + execution_times = [] + with Progress(console=self.console) as progress: + task = progress.add_task("Evaluating...", total=runs) + for run in range(runs): + start_time = time.time() + result = self.evaluator.evaluate_commit_message(message, diff) + + execution_time = time.time() - start_time + execution_times.append(execution_time) + + run_data = { + "run": run + 1, + "what_score": result.what_score, + "why_score": result.why_score, + "overall_score": result.overall_score, + "confidence": result.confidence, + "execution_time": execution_time, + } + results.append(run_data) + + progress.update(task, advance=1) + self.console.print( + f" Run {run + 1}: WHAT={result.what_score:.1f}, WHY={result.why_score:.1f}, OVERALL={result.overall_score:.1f} completed in {execution_time}s" + ) + + stats = self._calculate_statistics(results, execution_times) + max_variance = self._determine_stability(stats) + is_stable = max_variance <= variance_threshold + + self._display_stability_results(stats, is_stable, variance_threshold) + + return { + "message": message, + "runs": runs, + "results": results, + "statistics": stats, + "is_stable": is_stable, + "max_variance": max_variance, + "variance_threshold": variance_threshold, + "timestamp": datetime.now().isoformat(), + } + + def _calculate_statistics( + self, results: list[dict], execution_times: list[float] + ) -> dict: + """ + Calculate statistics for the results and execution times. + """ + what_scores = [r["what_score"] for r in results] + why_scores = [r["why_score"] for r in results] + overall_scores = [r["overall_score"] for r in results] + + return { + "what": self._calculate_score_variance(what_scores), + "why": self._calculate_score_variance(why_scores), + "overall": self._calculate_score_variance(overall_scores), + "execution_time": { + "mean": statistics.mean(execution_times), + "std": statistics.stdev(execution_times) + if len(execution_times) > 1 + else 0, + "min": min(execution_times), + "max": max(execution_times), + }, + } + + def _determine_stability(self, stats: dict) -> float: + """ + Determine stability based on the statistics. + """ + return max( + stats["what"]["range"], stats["why"]["range"], stats["overall"]["range"] + ) + + def _calculate_score_variance(self, scores: list[float]) -> dict[str, float]: + """ + Calculate variance and other statistics for a list of scores. + """ + if not scores: + return {} + + return { + "mean": statistics.mean(scores), + "median": statistics.median(scores), + "std": statistics.stdev(scores) if len(scores) > 1 else 0, + "min": min(scores), + "max": max(scores), + "range": max(scores) - min(scores), + } + + def _display_stability_results( + self, stats: dict, is_stable: bool, threshold: float + ) -> None: + """ + Display the stability test results. + """ + + table = Table(title="Stability Test Results", show_header=True) + table.add_column("Dimension", style="cyan") + table.add_column("Mean", justify="center") + table.add_column("Std Dev", justify="center") + table.add_column("Range", justify="center") + table.add_column("Status", justify="center") + + for dimension in ["what", "why", "overall"]: + stat = stats[dimension] + range_val = stat["range"] + status = "✅ Stable" if range_val <= threshold else "⚠️ Unstable" + status_color = "green" if range_val <= threshold else "red" + + table.add_row( + dimension.upper(), + f"{stat['mean']:.2f}", + f"{stat['std']:.2f}", + f"{range_val:.2f}", + f"[{status_color}]{status}[/{status_color}]", + ) + + self.console.print(table) + + # Overall assessment + overall_color = "green" if is_stable else "red" + overall_status = "STABLE" if is_stable else "UNSTABLE" + + self.console.print( + f"\n[{overall_color}]Overall Assessment: {overall_status} (threshold: ±{threshold})[/{overall_color}]" + ) + + # Performance info + exec_stats = stats["execution_time"] + self.console.print( + f"\n[dim]Average execution time: {exec_stats['mean']:.2f}s (±{exec_stats['std']:.2f}s)[/dim]" + ) diff --git a/tests/evaluation/test_benchmarks.py b/tests/evaluation/test_benchmarks.py new file mode 100644 index 0000000..65c7e12 --- /dev/null +++ b/tests/evaluation/test_benchmarks.py @@ -0,0 +1,399 @@ +import pytest +from rich.progress import Progress +from diffmage.evaluation.benchmarks import EvaluationBenchmarks +from diffmage.evaluation.commit_message_evaluator import CommitMessageEvaluator +from rich.console import Console +from unittest.mock import Mock, patch + + +@pytest.fixture +def evaluator(): + """Fixture providing CommitMessageEvaluator instance""" + return CommitMessageEvaluator() + + +@pytest.fixture +def benchmarks(evaluator): + """Fixture providing EvaluationBenchmarks instance""" + return EvaluationBenchmarks(evaluator) + + +class TestEvaluationBenchmarks: + """Test cases for EvaluationBenchmarks model.""" + + def test_evaluation_benchmarks_initialization(self, benchmarks): + """Test successful creation of EvaluationBenchmarks.""" + + assert benchmarks is not None + assert benchmarks.evaluator is not None + assert isinstance(benchmarks.evaluator, CommitMessageEvaluator) + assert isinstance(benchmarks.console, Console) + + def test_stability_test_with_empty_message_raises_error(self, benchmarks): + """Test stability test with empty message raises error""" + with pytest.raises( + ValueError, match="Message and diff are required for stability test" + ): + benchmarks.stability_test(message="", diff="some diff") + + def test_stability_test_with_none_raises_error(self, benchmarks): + """Test stability test with none raises error""" + with pytest.raises( + ValueError, match="Message and diff are required for stability test" + ): + benchmarks.stability_test(message=None, diff=None) + + def test_stability_test_with_empty_diff_raises_error(self, benchmarks): + """Test stability test with empty diff raises error""" + with pytest.raises( + ValueError, match="Message and diff are required for stability test" + ): + benchmarks.stability_test(message="some message", diff="") + + def test_stability_test_calls_evaluator_correct_number_of_times(self, benchmarks): + """Test stability test calls evaluator correct number of times""" + message = "some message" + diff = "some diff" + runs = 3 + + mock_result = Mock() + mock_result.what_score = 4.0 + mock_result.why_score = 3.5 + mock_result.overall_score = 3.8 + mock_result.confidence = 0.8 + + with patch.object( + benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result + ) as mock_evaluate: + result = benchmarks.stability_test(message=message, diff=diff, runs=runs) + + assert mock_evaluate.call_count == runs + assert result["runs"] == runs + assert len(result["results"]) == runs + + def test_stability_test_calculates_statistics_correctly(self, benchmarks): + """Test stability test calculates statistics correctly""" + message = "some message" + diff = "some diff" + + mock_result = Mock() + mock_result.what_score = 4.0 + mock_result.why_score = 3.5 + mock_result.overall_score = 3.8 + mock_result.confidence = 0.8 + + with patch.object( + benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result + ): + result = benchmarks.stability_test(message=message, diff=diff, runs=2) + + stats = result["statistics"] + + assert "what" in stats + assert "why" in stats + assert "overall" in stats + assert "execution_time" in stats + + @patch("time.time") + @patch.object(Progress, "__enter__") + @patch.object(Progress, "__exit__") + def test_stability_test_calculates_execution_time_correctly( + self, mock_exit, mock_enter, mock_time, benchmarks + ): + """Test stability test calculates execution time correctly""" + + mock_time.side_effect = [1000.0, 1000.5, 1001.0, 1001.3] + + mock_progress = Mock() + mock_task = Mock() + mock_progress.add_task.return_value = mock_task + mock_enter.return_value = mock_progress + + mock_result1 = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + mock_result2 = Mock( + what_score=4.5, why_score=3.5, overall_score=4.0, confidence=0.9 + ) + + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=[mock_result1, mock_result2], + ): + result = benchmarks.stability_test("test message", "test diff", runs=2) + + assert len(result["results"]) == 2 + assert result["results"][0]["execution_time"] == pytest.approx(0.5) + assert result["results"][1]["execution_time"] == pytest.approx(0.3) + + assert "execution_time" in result["statistics"] + assert result["statistics"]["execution_time"]["mean"] == pytest.approx(0.4) + + def test_stability_test_is_stable_returns_true_if_variance_is_less_than_threshold( + self, benchmarks + ): + """Test stability test is stable if variance is less than threshold""" + + mock_result1 = Mock( + what_score=4.1, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + mock_result2 = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=[mock_result1, mock_result2], + ): + result = benchmarks.stability_test( + "test message", "test diff", runs=2, variance_threshold=0.2 + ) + assert result["is_stable"] is True + assert result["max_variance"] == pytest.approx(0.1) + assert result["variance_threshold"] == 0.2 + + def test_stability_test_is_stable_returns_false_if_variance_is_greater_than_threshold( + self, benchmarks + ): + """Test stability test is stable if variance is less than threshold""" + + mock_result1 = Mock( + what_score=2.5, why_score=2.0, overall_score=2.5, confidence=0.8 + ) + mock_result2 = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=[mock_result1, mock_result2], + ): + result = benchmarks.stability_test( + "test message", "test diff", runs=2, variance_threshold=0.2 + ) + assert result["is_stable"] is False + assert result["max_variance"] == pytest.approx(1.5) + assert result["variance_threshold"] == 0.2 + + def test_stability_test_returns_variance_threshold_as_is(self, benchmarks): + """Test stability test returns variance threshold as is""" + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=[ + Mock(what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8), + Mock(what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8), + ], + ): + result = benchmarks.stability_test( + "test message", "test diff", runs=2, variance_threshold=0.2 + ) + assert result["variance_threshold"] == 0.2 + + def test_stability_test_returns_expected_structure_and_values(self, benchmarks): + """Test stability test returns expected structure with correct values.""" + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with ( + patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + return_value=mock_result, + ), + patch("diffmage.evaluation.benchmarks.Progress"), + ): + result = benchmarks.stability_test("test message", "test diff", runs=3) + + # Test structure exists + assert result is not None + assert "message" in result + assert "runs" in result + assert "results" in result + assert "statistics" in result + assert "is_stable" in result + assert "max_variance" in result + assert "variance_threshold" in result + assert "timestamp" in result + + # Test basic values are correct + assert result["message"] == "test message" + assert result["runs"] == 3 + assert len(result["results"]) == 3 + assert isinstance(result["is_stable"], bool) + assert result["variance_threshold"] == 0.2 # default value + assert isinstance(result["timestamp"], str) + + def test_stability_test_processes_results_correctly(self, benchmarks): + """Test that individual run results are processed correctly.""" + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with ( + patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + return_value=mock_result, + ), + patch("diffmage.evaluation.benchmarks.Progress"), + ): + result = benchmarks.stability_test("test", "diff", runs=2) + + assert len(result["results"]) == 2 + + first_run = result["results"][0] + assert first_run["run"] == 1 + assert first_run["what_score"] == 4.0 + assert first_run["why_score"] == 3.0 + assert first_run["overall_score"] == 3.5 + assert first_run["confidence"] == 0.8 + assert "execution_time" in first_run + + def test_stability_test_statistics_structure(self, benchmarks): + """Test that statistics are calculated and structured correctly.""" + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with ( + patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + return_value=mock_result, + ), + patch("diffmage.evaluation.benchmarks.Progress"), + ): + result = benchmarks.stability_test("test", "diff", runs=2) + + stats = result["statistics"] + + assert "what" in stats + assert "why" in stats + assert "overall" in stats + assert "execution_time" in stats + + for score_type in ["what", "why", "overall"]: + assert "mean" in stats[score_type] + assert "std" in stats[score_type] + assert "min" in stats[score_type] + assert "max" in stats[score_type] + + ### Calculate Statistics #### + + def test_calculate_statistics_returns_expected_structure(self, benchmarks): + """Test that _calculate_statistics returns expected structure.""" + results = [ + {"what_score": 4.0, "why_score": 3.0, "overall_score": 3.5}, + {"what_score": 4.0, "why_score": 3.0, "overall_score": 3.5}, + ] + execution_times = [0.5, 0.3] + + with patch.object(benchmarks, "_calculate_score_variance", return_value={}): + result = benchmarks._calculate_statistics(results, execution_times) + + assert result is not None + assert "what" in result + assert "why" in result + assert "overall" in result + assert "execution_time" in result + + assert "mean" in result["execution_time"] + assert "std" in result["execution_time"] + assert "min" in result["execution_time"] + assert "max" in result["execution_time"] + + def test_calculate_statistics_calls_score_variance_for_each_score_type( + self, benchmarks + ): + """Test that _calculate_score_variance is called for each score type.""" + results = [ + {"what_score": 4.0, "why_score": 3.0, "overall_score": 3.5}, + {"what_score": 5.0, "why_score": 4.0, "overall_score": 4.5}, + ] + execution_times = [0.5, 0.3] + + with patch.object( + benchmarks, "_calculate_score_variance", return_value={} + ) as mock_calc: + benchmarks._calculate_statistics(results, execution_times) + + assert mock_calc.call_count == 3 + + calls = mock_calc.call_args_list + assert calls[0][0][0] == [4.0, 5.0] + assert calls[1][0][0] == [3.0, 4.0] + assert calls[2][0][0] == [3.5, 4.5] + + def test_calculate_statistics_calculates_execution_time_stats_correctly( + self, benchmarks + ): + """Test execution time statistics are calculated correctly.""" + results = [{"what_score": 4.0, "why_score": 3.0, "overall_score": 3.5}] + execution_times = [0.5, 0.3] + + with patch.object(benchmarks, "_calculate_score_variance", return_value={}): + result = benchmarks._calculate_statistics(results, execution_times) + + assert result["execution_time"]["mean"] == pytest.approx(0.4) + assert result["execution_time"]["std"] == pytest.approx(0.14142135623) + assert result["execution_time"]["min"] == 0.3 + assert result["execution_time"]["max"] == 0.5 + + def test_calculate_statistics_uses_score_variance_results(self, benchmarks): + """Test that score variance results are properly integrated.""" + results = [{"what_score": 4.0, "why_score": 3.0, "overall_score": 3.5}] + execution_times = [0.5] + + mock_variance = { + "mean": 4.2, + "median": 4.0, + "std": 0.1, + "min": 4.0, + "max": 4.4, + "range": 0.4, + } + + with patch.object( + benchmarks, "_calculate_score_variance", return_value=mock_variance + ): + result = benchmarks._calculate_statistics(results, execution_times) + + assert result["what"] == mock_variance + assert result["why"] == mock_variance + assert result["overall"] == mock_variance + + #### Score Variance ##### + + def test_calculate_score_if_no_scores_returns_empty_dict(self, benchmarks): + """Test calculate_score_variance returns empty dict if no scores""" + scores = [] + result = benchmarks._calculate_score_variance(scores) + assert result == {} + + def test_calculate_score_variance_returns_expected_structure(self, benchmarks): + """Test calculate_score_variance returns expected structure""" + scores = [1.0, 2.0, 3.0, 4.0, 5.0] + result = benchmarks._calculate_score_variance(scores) + assert result is not None + assert "mean" in result + assert "median" in result + assert "std" in result + assert "min" in result + assert "max" in result + assert "range" in result + + def test_calculate_score_variance_returns_correct_values(self, benchmarks): + """Test calculate_score_variance returns correct values""" + scores = [1.0, 2.0, 3.0, 4.0, 5.0] + result = benchmarks._calculate_score_variance(scores) + assert result["mean"] == pytest.approx(3.0) + assert result["median"] == pytest.approx(3.0) + assert result["std"] == pytest.approx(1.58113883008) + assert result["min"] == 1.0 + assert result["max"] == 5.0 + assert result["range"] == 4.0 From 7bd6a67b51bbfb043996cb7b368594e2e433c1b7 Mon Sep 17 00:00:00 2001 From: Nick Galluzzo Date: Mon, 4 Aug 2025 19:26:52 +0700 Subject: [PATCH 2/5] refactor(cli): remove redundant model name default assignment --- src/diffmage/cli/reports.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/diffmage/cli/reports.py b/src/diffmage/cli/reports.py index ea24ff4..7de9342 100644 --- a/src/diffmage/cli/reports.py +++ b/src/diffmage/cli/reports.py @@ -1,6 +1,5 @@ import typer from diffmage.cli.shared import app, console -from diffmage.ai.models import get_default_model from diffmage.evaluation.evaluation_report import EvaluationReport from diffmage.evaluation.service import EvaluationService from diffmage.evaluation.benchmarks import EvaluationBenchmarks @@ -78,9 +77,6 @@ def benchmark_stability( analysis = parser.parse_staged_changes() diff = analysis.get_combined_diff() - if not model_name: - model_name = get_default_model().name - evaluator = CommitMessageEvaluator(model_name) benchmarks = EvaluationBenchmarks(evaluator) From f1c96f8e5d351153016a1d107ec2cd4685121c00 Mon Sep 17 00:00:00 2001 From: Nick Galluzzo Date: Tue, 5 Aug 2025 10:26:48 +0700 Subject: [PATCH 3/5] test(evaluation): add comprehensive stability test cases for benchmark evaluation --- tests/evaluation/test_benchmarks.py | 240 ++++++++++++++++++++++++++++ 1 file changed, 240 insertions(+) diff --git a/tests/evaluation/test_benchmarks.py b/tests/evaluation/test_benchmarks.py index 65c7e12..942d5fd 100644 --- a/tests/evaluation/test_benchmarks.py +++ b/tests/evaluation/test_benchmarks.py @@ -1,4 +1,5 @@ import pytest +import time from rich.progress import Progress from diffmage.evaluation.benchmarks import EvaluationBenchmarks from diffmage.evaluation.commit_message_evaluator import CommitMessageEvaluator @@ -397,3 +398,242 @@ def test_calculate_score_variance_returns_correct_values(self, benchmarks): assert result["min"] == 1.0 assert result["max"] == 5.0 assert result["range"] == 4.0 + + def test_stability_test_single_run_handles_stdev_gracefully(self, benchmarks): + """Test single run doesn't crash when calculating standard deviation""" + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result + ): + result = benchmarks.stability_test("test", "diff", runs=1) + + assert result["statistics"]["execution_time"]["std"] == 0 + assert result["statistics"]["what"]["std"] == 0 + assert result["statistics"]["why"]["std"] == 0 + assert result["statistics"]["overall"]["std"] == 0 + + def test_stability_test_zero_variance_threshold_requires_perfect_stability( + self, benchmarks + ): + """Test zero variance threshold requires perfect stability""" + mock_result1 = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + mock_result2 = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=[mock_result1, mock_result2], + ): + result = benchmarks.stability_test( + "test", "diff", runs=2, variance_threshold=0.0 + ) + assert result["is_stable"] is True + assert result["max_variance"] == 0.0 + + mock_result3 = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + mock_result4 = Mock( + what_score=4.1, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=[mock_result3, mock_result4], + ): + result = benchmarks.stability_test( + "test", "diff", runs=2, variance_threshold=0.0 + ) + assert result["is_stable"] is False + assert result["max_variance"] == pytest.approx(0.1) + + def test_stability_test_negative_variance_threshold_allowed(self, benchmarks): + """Test negative variance threshold is allowed (though not practical)""" + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result + ): + result = benchmarks.stability_test( + "test", "diff", runs=2, variance_threshold=-0.1 + ) + assert result["variance_threshold"] == -0.1 + assert result["is_stable"] is False + + def test_stability_test_evaluator_exception_propagates(self, benchmarks): + """Test evaluator exceptions are handled appropriately""" + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=RuntimeError("LLM API error"), + ): + with pytest.raises(RuntimeError, match="LLM API error"): + benchmarks.stability_test("test", "diff", runs=2) + + def test_stability_test_evaluator_exception_on_second_run_propagates( + self, benchmarks + ): + """Test evaluator exception on second run still propagates""" + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=[mock_result, RuntimeError("Network timeout")], + ): + with pytest.raises(RuntimeError, match="Network timeout"): + benchmarks.stability_test("test", "diff", runs=2) + + def test_calculate_score_variance_single_score_returns_zero_std(self, benchmarks): + """Test single score returns std=0 without crashing""" + scores = [4.0] + result = benchmarks._calculate_score_variance(scores) + + assert result["mean"] == 4.0 + assert result["median"] == 4.0 + assert result["std"] == 0 + assert result["min"] == 4.0 + assert result["max"] == 4.0 + assert result["range"] == 0.0 + + def test_stability_test_large_runs_parameter_performance(self, benchmarks): + """Test large runs parameter doesn't cause performance issues""" + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result + ): + start_time = time.time() + result = benchmarks.stability_test("test", "diff", runs=100) + execution_time = time.time() - start_time + + assert len(result["results"]) == 100 + assert result["runs"] == 100 + assert execution_time < 5.0 + + def test_stability_test_extreme_score_values_at_boundaries(self, benchmarks): + """Test stability test with extreme score values at boundaries""" + mock_result1 = Mock( + what_score=0.0, why_score=0.0, overall_score=0.0, confidence=0.0 + ) + mock_result2 = Mock( + what_score=5.0, why_score=5.0, overall_score=5.0, confidence=1.0 + ) + + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=[mock_result1, mock_result2], + ): + result = benchmarks.stability_test("test", "diff", runs=2) + + assert result["statistics"]["what"]["min"] == 0.0 + assert result["statistics"]["what"]["max"] == 5.0 + assert result["statistics"]["what"]["range"] == 5.0 + assert result["is_stable"] is False # High variance + + def test_stability_test_negative_score_values(self, benchmarks): + """Test stability test with negative score values (outside expected range)""" + mock_result = Mock( + what_score=-1.0, why_score=-2.0, overall_score=-1.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result + ): + result = benchmarks.stability_test("test", "diff", runs=2) + + assert result["statistics"]["what"]["mean"] == -1.0 + assert result["statistics"]["why"]["mean"] == -2.0 + + def test_stability_test_very_high_variance_threshold_always_stable( + self, benchmarks + ): + """Test very high variance threshold makes any result stable""" + mock_result1 = Mock( + what_score=1.0, why_score=1.0, overall_score=1.0, confidence=0.1 + ) + mock_result2 = Mock( + what_score=5.0, why_score=5.0, overall_score=5.0, confidence=1.0 + ) + + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=[mock_result1, mock_result2], + ): + result = benchmarks.stability_test( + "test", "diff", runs=2, variance_threshold=10.0 + ) + + assert result["is_stable"] is True + assert result["variance_threshold"] == 10.0 + assert result["max_variance"] < 10.0 + + def test_stability_test_very_long_message_and_diff_strings(self, benchmarks): + """Test stability test with very long message and diff strings""" + long_message = "A" * 10000 # 10KB message + long_diff = "B" * 50000 # 50KB diff + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result + ): + result = benchmarks.stability_test(long_message, long_diff, runs=2) + + assert result["message"] == long_message + assert len(result["results"]) == 2 + assert result["runs"] == 2 + + def test_stability_test_unicode_and_special_characters(self, benchmarks): + """Test stability test with unicode and special characters""" + unicode_message = "feat: Add 支持中文 and émojis 🚀 with ñiño characters" + special_diff = """ + + print("Hello 世界! 🌍") + - console.log('Goodbye 👋') + + # TODO: Fix unicode handling ñoño café + """ + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result + ): + result = benchmarks.stability_test(unicode_message, special_diff, runs=2) + + assert result["message"] == unicode_message + assert len(result["results"]) == 2 + assert result["runs"] == 2 + + def test_stability_test_special_control_characters(self, benchmarks): + """Test stability test with control characters and escape sequences""" + control_message = "fix: Handle \\n\\t\\r and \\x00 characters" + control_diff = "- old\\nline\\twith\\ttabs\n+ new\\nline\\twith\\tspaces" + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result + ): + result = benchmarks.stability_test(control_message, control_diff, runs=2) + + assert result["message"] == control_message + assert len(result["results"]) == 2 From e233a76fa9fb1bab89247108a57030a6c03e29af Mon Sep 17 00:00:00 2001 From: Nick Galluzzo Date: Tue, 5 Aug 2025 11:02:08 +0700 Subject: [PATCH 4/5] feat(evaluation): Add type hints and improve stability test results structure - Introduce TypedDict classes for better type safety in benchmarking - Update stability test to return structured TypedDict results - Improve score variance calculation to handle empty inputs - Enhance display of stability results with clearer dimension names --- src/diffmage/evaluation/benchmarks.py | 82 ++++++++++++++++++++++----- tests/evaluation/test_benchmarks.py | 16 ++++-- uv.lock | 2 +- 3 files changed, 82 insertions(+), 18 deletions(-) diff --git a/src/diffmage/evaluation/benchmarks.py b/src/diffmage/evaluation/benchmarks.py index be569de..622cb22 100644 --- a/src/diffmage/evaluation/benchmarks.py +++ b/src/diffmage/evaluation/benchmarks.py @@ -1,3 +1,4 @@ +from typing import TypedDict import time import statistics from datetime import datetime @@ -7,6 +8,49 @@ from rich.progress import Progress +class ScoreStats(TypedDict): + mean: float + median: float + std: float + min: float + max: float + range: float + + +class ExecutionTimeStats(TypedDict): + mean: float + std: float + min: float + max: float + + +class BenchmarkStats(TypedDict): + what: ScoreStats + why: ScoreStats + overall: ScoreStats + execution_time: ExecutionTimeStats + + +class RunResult(TypedDict): + run: int + what_score: float + why_score: float + overall_score: float + confidence: float + execution_time: float + + +class StabilityTestResult(TypedDict): + message: str + runs: int + results: list[RunResult] + statistics: BenchmarkStats + is_stable: bool + max_variance: float + variance_threshold: float + timestamp: str + + class EvaluationBenchmarks: """ Benchmarking and validation tools for LLM based commit message evaluation. @@ -18,15 +62,15 @@ def __init__(self, evaluator: CommitMessageEvaluator): def stability_test( self, message: str, diff: str, runs: int = 3, variance_threshold: float = 0.2 - ) -> dict: + ) -> StabilityTestResult: """ Run a stability test on the evaluator. """ if not message or not diff: raise ValueError("Message and diff are required for stability test") - results = [] - execution_times = [] + results: list[RunResult] = [] + execution_times: list[float] = [] with Progress(console=self.console) as progress: task = progress.add_task("Evaluating...", total=runs) for run in range(runs): @@ -36,7 +80,7 @@ def stability_test( execution_time = time.time() - start_time execution_times.append(execution_time) - run_data = { + run_data: RunResult = { "run": run + 1, "what_score": result.what_score, "why_score": result.why_score, @@ -69,8 +113,8 @@ def stability_test( } def _calculate_statistics( - self, results: list[dict], execution_times: list[float] - ) -> dict: + self, results: list[RunResult], execution_times: list[float] + ) -> BenchmarkStats: """ Calculate statistics for the results and execution times. """ @@ -92,7 +136,7 @@ def _calculate_statistics( }, } - def _determine_stability(self, stats: dict) -> float: + def _determine_stability(self, stats: BenchmarkStats) -> float: """ Determine stability based on the statistics. """ @@ -100,12 +144,19 @@ def _determine_stability(self, stats: dict) -> float: stats["what"]["range"], stats["why"]["range"], stats["overall"]["range"] ) - def _calculate_score_variance(self, scores: list[float]) -> dict[str, float]: + def _calculate_score_variance(self, scores: list[float]) -> ScoreStats: """ Calculate variance and other statistics for a list of scores. """ if not scores: - return {} + return { + "mean": 0.0, + "median": 0.0, + "std": 0.0, + "min": 0.0, + "max": 0.0, + "range": 0.0, + } return { "mean": statistics.mean(scores), @@ -117,7 +168,7 @@ def _calculate_score_variance(self, scores: list[float]) -> dict[str, float]: } def _display_stability_results( - self, stats: dict, is_stable: bool, threshold: float + self, stats: BenchmarkStats, is_stable: bool, threshold: float ) -> None: """ Display the stability test results. @@ -130,14 +181,19 @@ def _display_stability_results( table.add_column("Range", justify="center") table.add_column("Status", justify="center") - for dimension in ["what", "why", "overall"]: - stat = stats[dimension] + dimensions_data = [ + ("WHAT", stats["what"]), + ("WHY", stats["why"]), + ("OVERALL", stats["overall"]), + ] + + for dimension_name, stat in dimensions_data: range_val = stat["range"] status = "✅ Stable" if range_val <= threshold else "⚠️ Unstable" status_color = "green" if range_val <= threshold else "red" table.add_row( - dimension.upper(), + dimension_name, f"{stat['mean']:.2f}", f"{stat['std']:.2f}", f"{range_val:.2f}", diff --git a/tests/evaluation/test_benchmarks.py b/tests/evaluation/test_benchmarks.py index 942d5fd..419d5c3 100644 --- a/tests/evaluation/test_benchmarks.py +++ b/tests/evaluation/test_benchmarks.py @@ -370,11 +370,19 @@ def test_calculate_statistics_uses_score_variance_results(self, benchmarks): #### Score Variance ##### - def test_calculate_score_if_no_scores_returns_empty_dict(self, benchmarks): - """Test calculate_score_variance returns empty dict if no scores""" - scores = [] + def test_calculate_score_if_no_scores_returns_zero_stats(self, benchmarks): + """Test calculate_score_variance returns zero stats if no scores""" + scores: list[float] = [] result = benchmarks._calculate_score_variance(scores) - assert result == {} + expected = { + "mean": 0.0, + "median": 0.0, + "std": 0.0, + "min": 0.0, + "max": 0.0, + "range": 0.0, + } + assert result == expected def test_calculate_score_variance_returns_expected_structure(self, benchmarks): """Test calculate_score_variance returns expected structure""" diff --git a/uv.lock b/uv.lock index 018681c..829e8b0 100644 --- a/uv.lock +++ b/uv.lock @@ -314,7 +314,7 @@ wheels = [ [[package]] name = "diffmage" -version = "0.3.2" +version = "0.5.0" source = { editable = "." } dependencies = [ { name = "gitpython" }, From b160807d2ce8cd67dc98842b8d166e7724a2cc00 Mon Sep 17 00:00:00 2001 From: Nick Galluzzo Date: Tue, 5 Aug 2025 11:02:08 +0700 Subject: [PATCH 5/5] refactor(evaluation): Add type hints and improve stability test results structure - Introduce TypedDict classes for better type safety in benchmarking - Update stability test to return structured TypedDict results - Improve score variance calculation to handle empty inputs - Enhance display of stability results with clearer dimension names --- src/diffmage/evaluation/benchmarks.py | 82 ++++++++++++++++++++++----- tests/evaluation/test_benchmarks.py | 16 ++++-- uv.lock | 2 +- 3 files changed, 82 insertions(+), 18 deletions(-) diff --git a/src/diffmage/evaluation/benchmarks.py b/src/diffmage/evaluation/benchmarks.py index be569de..622cb22 100644 --- a/src/diffmage/evaluation/benchmarks.py +++ b/src/diffmage/evaluation/benchmarks.py @@ -1,3 +1,4 @@ +from typing import TypedDict import time import statistics from datetime import datetime @@ -7,6 +8,49 @@ from rich.progress import Progress +class ScoreStats(TypedDict): + mean: float + median: float + std: float + min: float + max: float + range: float + + +class ExecutionTimeStats(TypedDict): + mean: float + std: float + min: float + max: float + + +class BenchmarkStats(TypedDict): + what: ScoreStats + why: ScoreStats + overall: ScoreStats + execution_time: ExecutionTimeStats + + +class RunResult(TypedDict): + run: int + what_score: float + why_score: float + overall_score: float + confidence: float + execution_time: float + + +class StabilityTestResult(TypedDict): + message: str + runs: int + results: list[RunResult] + statistics: BenchmarkStats + is_stable: bool + max_variance: float + variance_threshold: float + timestamp: str + + class EvaluationBenchmarks: """ Benchmarking and validation tools for LLM based commit message evaluation. @@ -18,15 +62,15 @@ def __init__(self, evaluator: CommitMessageEvaluator): def stability_test( self, message: str, diff: str, runs: int = 3, variance_threshold: float = 0.2 - ) -> dict: + ) -> StabilityTestResult: """ Run a stability test on the evaluator. """ if not message or not diff: raise ValueError("Message and diff are required for stability test") - results = [] - execution_times = [] + results: list[RunResult] = [] + execution_times: list[float] = [] with Progress(console=self.console) as progress: task = progress.add_task("Evaluating...", total=runs) for run in range(runs): @@ -36,7 +80,7 @@ def stability_test( execution_time = time.time() - start_time execution_times.append(execution_time) - run_data = { + run_data: RunResult = { "run": run + 1, "what_score": result.what_score, "why_score": result.why_score, @@ -69,8 +113,8 @@ def stability_test( } def _calculate_statistics( - self, results: list[dict], execution_times: list[float] - ) -> dict: + self, results: list[RunResult], execution_times: list[float] + ) -> BenchmarkStats: """ Calculate statistics for the results and execution times. """ @@ -92,7 +136,7 @@ def _calculate_statistics( }, } - def _determine_stability(self, stats: dict) -> float: + def _determine_stability(self, stats: BenchmarkStats) -> float: """ Determine stability based on the statistics. """ @@ -100,12 +144,19 @@ def _determine_stability(self, stats: dict) -> float: stats["what"]["range"], stats["why"]["range"], stats["overall"]["range"] ) - def _calculate_score_variance(self, scores: list[float]) -> dict[str, float]: + def _calculate_score_variance(self, scores: list[float]) -> ScoreStats: """ Calculate variance and other statistics for a list of scores. """ if not scores: - return {} + return { + "mean": 0.0, + "median": 0.0, + "std": 0.0, + "min": 0.0, + "max": 0.0, + "range": 0.0, + } return { "mean": statistics.mean(scores), @@ -117,7 +168,7 @@ def _calculate_score_variance(self, scores: list[float]) -> dict[str, float]: } def _display_stability_results( - self, stats: dict, is_stable: bool, threshold: float + self, stats: BenchmarkStats, is_stable: bool, threshold: float ) -> None: """ Display the stability test results. @@ -130,14 +181,19 @@ def _display_stability_results( table.add_column("Range", justify="center") table.add_column("Status", justify="center") - for dimension in ["what", "why", "overall"]: - stat = stats[dimension] + dimensions_data = [ + ("WHAT", stats["what"]), + ("WHY", stats["why"]), + ("OVERALL", stats["overall"]), + ] + + for dimension_name, stat in dimensions_data: range_val = stat["range"] status = "✅ Stable" if range_val <= threshold else "⚠️ Unstable" status_color = "green" if range_val <= threshold else "red" table.add_row( - dimension.upper(), + dimension_name, f"{stat['mean']:.2f}", f"{stat['std']:.2f}", f"{range_val:.2f}", diff --git a/tests/evaluation/test_benchmarks.py b/tests/evaluation/test_benchmarks.py index 942d5fd..419d5c3 100644 --- a/tests/evaluation/test_benchmarks.py +++ b/tests/evaluation/test_benchmarks.py @@ -370,11 +370,19 @@ def test_calculate_statistics_uses_score_variance_results(self, benchmarks): #### Score Variance ##### - def test_calculate_score_if_no_scores_returns_empty_dict(self, benchmarks): - """Test calculate_score_variance returns empty dict if no scores""" - scores = [] + def test_calculate_score_if_no_scores_returns_zero_stats(self, benchmarks): + """Test calculate_score_variance returns zero stats if no scores""" + scores: list[float] = [] result = benchmarks._calculate_score_variance(scores) - assert result == {} + expected = { + "mean": 0.0, + "median": 0.0, + "std": 0.0, + "min": 0.0, + "max": 0.0, + "range": 0.0, + } + assert result == expected def test_calculate_score_variance_returns_expected_structure(self, benchmarks): """Test calculate_score_variance returns expected structure""" diff --git a/uv.lock b/uv.lock index 018681c..829e8b0 100644 --- a/uv.lock +++ b/uv.lock @@ -314,7 +314,7 @@ wheels = [ [[package]] name = "diffmage" -version = "0.3.2" +version = "0.5.0" source = { editable = "." } dependencies = [ { name = "gitpython" },