diff --git a/src/diffmage/cli/reports.py b/src/diffmage/cli/reports.py
index 9ff6696..7de9342 100644
--- a/src/diffmage/cli/reports.py
+++ b/src/diffmage/cli/reports.py
@@ -2,6 +2,8 @@
 from diffmage.cli.shared import app, console
 from diffmage.evaluation.evaluation_report import EvaluationReport
 from diffmage.evaluation.service import EvaluationService
+from diffmage.evaluation.benchmarks import EvaluationBenchmarks
+from diffmage.evaluation.commit_message_evaluator import CommitMessageEvaluator
 
 
 @app.command()
@@ -49,3 +51,46 @@ def batch_report(
     except Exception as e:
         console.print(f"[red]Error generating report: {e}[/red]")
         raise typer.Exit(1)
+
+
+@app.command()
+def benchmark_stability(
+    message: str = typer.Argument(..., help="Commit message to evaluate"),
+    commit_hash: str = typer.Option(None, "--commit", "-c", help="Use commit's diff"),
+    runs: int = typer.Option(2, "--runs", "-n", help="Number of runs to perform"),
+    model_name: str = typer.Option(
+        None, "--model", "-m", help="AI model to use for evaluation"
+    ),
+    repo_path: str = typer.Option(
+        ".", "--repo-path", "-r", help="Path to git repository"
+    ),
+) -> None:
+    """Benchmark the stability of commit message evaluation across multiple runs"""
+
+    try:
+        from diffmage.git.diff_parser import GitDiffParser
+
+        parser = GitDiffParser(repo_path)
+        if commit_hash:
+            analysis, diff = parser.parse_specific_commit(commit_hash)
+        else:
+            analysis = parser.parse_staged_changes()
+            diff = analysis.get_combined_diff()
+
+        evaluator = CommitMessageEvaluator(model_name)
+        benchmarks = EvaluationBenchmarks(evaluator)
+
+        result = benchmarks.stability_test(message, diff, runs, variance_threshold=0.2)
+
+        if result["is_stable"]:
+            console.print(
+                f"\n[green]✅ Evaluator is STABLE (max variance: {result['max_variance']:.2f})[/green]"
+            )
+        else:
+            console.print(
+                f"\n[red]⚠️ Evaluator is UNSTABLE (max variance: {result['max_variance']:.2f})[/red]"
+            )
+
+    except Exception as e:
+        console.print(f"[red]Error evaluating stability: {e}[/red]")
+        raise typer.Exit(1)
diff --git a/src/diffmage/evaluation/benchmarks.py b/src/diffmage/evaluation/benchmarks.py
new file mode 100644
index 0000000..622cb22
--- /dev/null
+++ b/src/diffmage/evaluation/benchmarks.py
@@ -0,0 +1,217 @@
+from typing import TypedDict
+import time
+import statistics
+from datetime import datetime
+from rich.table import Table
+from rich.console import Console
+from diffmage.evaluation.commit_message_evaluator import CommitMessageEvaluator
+from rich.progress import Progress
+
+
+class ScoreStats(TypedDict):
+    mean: float
+    median: float
+    std: float
+    min: float
+    max: float
+    range: float
+
+
+class ExecutionTimeStats(TypedDict):
+    mean: float
+    std: float
+    min: float
+    max: float
+
+
+class BenchmarkStats(TypedDict):
+    what: ScoreStats
+    why: ScoreStats
+    overall: ScoreStats
+    execution_time: ExecutionTimeStats
+
+
+class RunResult(TypedDict):
+    run: int
+    what_score: float
+    why_score: float
+    overall_score: float
+    confidence: float
+    execution_time: float
+
+
+class StabilityTestResult(TypedDict):
+    message: str
+    runs: int
+    results: list[RunResult]
+    statistics: BenchmarkStats
+    is_stable: bool
+    max_variance: float
+    variance_threshold: float
+    timestamp: str
+
+
+class EvaluationBenchmarks:
+    """
+    Benchmarking and validation tools for LLM-based commit message evaluation.
+    """
+
+    def __init__(self, evaluator: CommitMessageEvaluator):
+        self.console = Console()
+        self.evaluator = evaluator
+
+    def stability_test(
+        self, message: str, diff: str, runs: int = 3, variance_threshold: float = 0.2
+    ) -> StabilityTestResult:
+        """
+        Evaluate the same message/diff pair multiple times and measure score variance across runs.
+        """
+        if not message or not diff:
+            raise ValueError("Message and diff are required for stability test")
+
+        results: list[RunResult] = []
+        execution_times: list[float] = []
+        with Progress(console=self.console) as progress:
+            task = progress.add_task("Evaluating...", total=runs)
+            for run in range(runs):
+                start_time = time.time()
+                result = self.evaluator.evaluate_commit_message(message, diff)
+
+                execution_time = time.time() - start_time
+                execution_times.append(execution_time)
+
+                run_data: RunResult = {
+                    "run": run + 1,
+                    "what_score": result.what_score,
+                    "why_score": result.why_score,
+                    "overall_score": result.overall_score,
+                    "confidence": result.confidence,
+                    "execution_time": execution_time,
+                }
+                results.append(run_data)
+
+                progress.update(task, advance=1)
+                self.console.print(
+                    f" Run {run + 1}: WHAT={result.what_score:.1f}, WHY={result.why_score:.1f}, OVERALL={result.overall_score:.1f} completed in {execution_time:.2f}s"
+                )
+
+        stats = self._calculate_statistics(results, execution_times)
+        max_variance = self._determine_stability(stats)
+        is_stable = max_variance <= variance_threshold
+
+        self._display_stability_results(stats, is_stable, variance_threshold)
+
+        return {
+            "message": message,
+            "runs": runs,
+            "results": results,
+            "statistics": stats,
+            "is_stable": is_stable,
+            "max_variance": max_variance,
+            "variance_threshold": variance_threshold,
+            "timestamp": datetime.now().isoformat(),
+        }
+
+    def _calculate_statistics(
+        self, results: list[RunResult], execution_times: list[float]
+    ) -> BenchmarkStats:
+        """
+        Calculate per-dimension score statistics and execution time statistics across runs.
+        """
+        what_scores = [r["what_score"] for r in results]
+        why_scores = [r["why_score"] for r in results]
+        overall_scores = [r["overall_score"] for r in results]
+
+        return {
+            "what": self._calculate_score_variance(what_scores),
+            "why": self._calculate_score_variance(why_scores),
+            "overall": self._calculate_score_variance(overall_scores),
+            "execution_time": {
+                "mean": statistics.mean(execution_times),
+                "std": statistics.stdev(execution_times)
+                if len(execution_times) > 1
+                else 0.0,
+                "min": min(execution_times),
+                "max": max(execution_times),
+            },
+        }
+
+    def _determine_stability(self, stats: BenchmarkStats) -> float:
+        """
+        Return the maximum score range across the WHAT, WHY, and overall dimensions.
+        """
+        return max(
+            stats["what"]["range"], stats["why"]["range"], stats["overall"]["range"]
+        )
+
+    def _calculate_score_variance(self, scores: list[float]) -> ScoreStats:
+        """
+        Calculate spread statistics (mean, median, std, min, max, range) for a list of scores.
+        """
+        if not scores:
+            return {
+                "mean": 0.0,
+                "median": 0.0,
+                "std": 0.0,
+                "min": 0.0,
+                "max": 0.0,
+                "range": 0.0,
+            }
+
+        return {
+            "mean": statistics.mean(scores),
+            "median": statistics.median(scores),
+            "std": statistics.stdev(scores) if len(scores) > 1 else 0.0,
+            "min": min(scores),
+            "max": max(scores),
+            "range": max(scores) - min(scores),
+        }
+
+    def _display_stability_results(
+        self, stats: BenchmarkStats, is_stable: bool, threshold: float
+    ) -> None:
+        """
+        Display the stability test results.
+ """ + + table = Table(title="Stability Test Results", show_header=True) + table.add_column("Dimension", style="cyan") + table.add_column("Mean", justify="center") + table.add_column("Std Dev", justify="center") + table.add_column("Range", justify="center") + table.add_column("Status", justify="center") + + dimensions_data = [ + ("WHAT", stats["what"]), + ("WHY", stats["why"]), + ("OVERALL", stats["overall"]), + ] + + for dimension_name, stat in dimensions_data: + range_val = stat["range"] + status = "✅ Stable" if range_val <= threshold else "⚠️ Unstable" + status_color = "green" if range_val <= threshold else "red" + + table.add_row( + dimension_name, + f"{stat['mean']:.2f}", + f"{stat['std']:.2f}", + f"{range_val:.2f}", + f"[{status_color}]{status}[/{status_color}]", + ) + + self.console.print(table) + + # Overall assessment + overall_color = "green" if is_stable else "red" + overall_status = "STABLE" if is_stable else "UNSTABLE" + + self.console.print( + f"\n[{overall_color}]Overall Assessment: {overall_status} (threshold: ±{threshold})[/{overall_color}]" + ) + + # Performance info + exec_stats = stats["execution_time"] + self.console.print( + f"\n[dim]Average execution time: {exec_stats['mean']:.2f}s (±{exec_stats['std']:.2f}s)[/dim]" + ) diff --git a/tests/evaluation/test_benchmarks.py b/tests/evaluation/test_benchmarks.py new file mode 100644 index 0000000..419d5c3 --- /dev/null +++ b/tests/evaluation/test_benchmarks.py @@ -0,0 +1,647 @@ +import pytest +import time +from rich.progress import Progress +from diffmage.evaluation.benchmarks import EvaluationBenchmarks +from diffmage.evaluation.commit_message_evaluator import CommitMessageEvaluator +from rich.console import Console +from unittest.mock import Mock, patch + + +@pytest.fixture +def evaluator(): + """Fixture providing CommitMessageEvaluator instance""" + return CommitMessageEvaluator() + + +@pytest.fixture +def benchmarks(evaluator): + """Fixture providing EvaluationBenchmarks instance""" + return EvaluationBenchmarks(evaluator) + + +class TestEvaluationBenchmarks: + """Test cases for EvaluationBenchmarks model.""" + + def test_evaluation_benchmarks_initialization(self, benchmarks): + """Test successful creation of EvaluationBenchmarks.""" + + assert benchmarks is not None + assert benchmarks.evaluator is not None + assert isinstance(benchmarks.evaluator, CommitMessageEvaluator) + assert isinstance(benchmarks.console, Console) + + def test_stability_test_with_empty_message_raises_error(self, benchmarks): + """Test stability test with empty message raises error""" + with pytest.raises( + ValueError, match="Message and diff are required for stability test" + ): + benchmarks.stability_test(message="", diff="some diff") + + def test_stability_test_with_none_raises_error(self, benchmarks): + """Test stability test with none raises error""" + with pytest.raises( + ValueError, match="Message and diff are required for stability test" + ): + benchmarks.stability_test(message=None, diff=None) + + def test_stability_test_with_empty_diff_raises_error(self, benchmarks): + """Test stability test with empty diff raises error""" + with pytest.raises( + ValueError, match="Message and diff are required for stability test" + ): + benchmarks.stability_test(message="some message", diff="") + + def test_stability_test_calls_evaluator_correct_number_of_times(self, benchmarks): + """Test stability test calls evaluator correct number of times""" + message = "some message" + diff = "some diff" + runs = 3 + + mock_result = Mock() + 
mock_result.what_score = 4.0 + mock_result.why_score = 3.5 + mock_result.overall_score = 3.8 + mock_result.confidence = 0.8 + + with patch.object( + benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result + ) as mock_evaluate: + result = benchmarks.stability_test(message=message, diff=diff, runs=runs) + + assert mock_evaluate.call_count == runs + assert result["runs"] == runs + assert len(result["results"]) == runs + + def test_stability_test_calculates_statistics_correctly(self, benchmarks): + """Test stability test calculates statistics correctly""" + message = "some message" + diff = "some diff" + + mock_result = Mock() + mock_result.what_score = 4.0 + mock_result.why_score = 3.5 + mock_result.overall_score = 3.8 + mock_result.confidence = 0.8 + + with patch.object( + benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result + ): + result = benchmarks.stability_test(message=message, diff=diff, runs=2) + + stats = result["statistics"] + + assert "what" in stats + assert "why" in stats + assert "overall" in stats + assert "execution_time" in stats + + @patch("time.time") + @patch.object(Progress, "__enter__") + @patch.object(Progress, "__exit__") + def test_stability_test_calculates_execution_time_correctly( + self, mock_exit, mock_enter, mock_time, benchmarks + ): + """Test stability test calculates execution time correctly""" + + mock_time.side_effect = [1000.0, 1000.5, 1001.0, 1001.3] + + mock_progress = Mock() + mock_task = Mock() + mock_progress.add_task.return_value = mock_task + mock_enter.return_value = mock_progress + + mock_result1 = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + mock_result2 = Mock( + what_score=4.5, why_score=3.5, overall_score=4.0, confidence=0.9 + ) + + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=[mock_result1, mock_result2], + ): + result = benchmarks.stability_test("test message", "test diff", runs=2) + + assert len(result["results"]) == 2 + assert result["results"][0]["execution_time"] == pytest.approx(0.5) + assert result["results"][1]["execution_time"] == pytest.approx(0.3) + + assert "execution_time" in result["statistics"] + assert result["statistics"]["execution_time"]["mean"] == pytest.approx(0.4) + + def test_stability_test_is_stable_returns_true_if_variance_is_less_than_threshold( + self, benchmarks + ): + """Test stability test is stable if variance is less than threshold""" + + mock_result1 = Mock( + what_score=4.1, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + mock_result2 = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=[mock_result1, mock_result2], + ): + result = benchmarks.stability_test( + "test message", "test diff", runs=2, variance_threshold=0.2 + ) + assert result["is_stable"] is True + assert result["max_variance"] == pytest.approx(0.1) + assert result["variance_threshold"] == 0.2 + + def test_stability_test_is_stable_returns_false_if_variance_is_greater_than_threshold( + self, benchmarks + ): + """Test stability test is stable if variance is less than threshold""" + + mock_result1 = Mock( + what_score=2.5, why_score=2.0, overall_score=2.5, confidence=0.8 + ) + mock_result2 = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=[mock_result1, mock_result2], + ): + result = 
benchmarks.stability_test( + "test message", "test diff", runs=2, variance_threshold=0.2 + ) + assert result["is_stable"] is False + assert result["max_variance"] == pytest.approx(1.5) + assert result["variance_threshold"] == 0.2 + + def test_stability_test_returns_variance_threshold_as_is(self, benchmarks): + """Test stability test returns variance threshold as is""" + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=[ + Mock(what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8), + Mock(what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8), + ], + ): + result = benchmarks.stability_test( + "test message", "test diff", runs=2, variance_threshold=0.2 + ) + assert result["variance_threshold"] == 0.2 + + def test_stability_test_returns_expected_structure_and_values(self, benchmarks): + """Test stability test returns expected structure with correct values.""" + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with ( + patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + return_value=mock_result, + ), + patch("diffmage.evaluation.benchmarks.Progress"), + ): + result = benchmarks.stability_test("test message", "test diff", runs=3) + + # Test structure exists + assert result is not None + assert "message" in result + assert "runs" in result + assert "results" in result + assert "statistics" in result + assert "is_stable" in result + assert "max_variance" in result + assert "variance_threshold" in result + assert "timestamp" in result + + # Test basic values are correct + assert result["message"] == "test message" + assert result["runs"] == 3 + assert len(result["results"]) == 3 + assert isinstance(result["is_stable"], bool) + assert result["variance_threshold"] == 0.2 # default value + assert isinstance(result["timestamp"], str) + + def test_stability_test_processes_results_correctly(self, benchmarks): + """Test that individual run results are processed correctly.""" + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with ( + patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + return_value=mock_result, + ), + patch("diffmage.evaluation.benchmarks.Progress"), + ): + result = benchmarks.stability_test("test", "diff", runs=2) + + assert len(result["results"]) == 2 + + first_run = result["results"][0] + assert first_run["run"] == 1 + assert first_run["what_score"] == 4.0 + assert first_run["why_score"] == 3.0 + assert first_run["overall_score"] == 3.5 + assert first_run["confidence"] == 0.8 + assert "execution_time" in first_run + + def test_stability_test_statistics_structure(self, benchmarks): + """Test that statistics are calculated and structured correctly.""" + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with ( + patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + return_value=mock_result, + ), + patch("diffmage.evaluation.benchmarks.Progress"), + ): + result = benchmarks.stability_test("test", "diff", runs=2) + + stats = result["statistics"] + + assert "what" in stats + assert "why" in stats + assert "overall" in stats + assert "execution_time" in stats + + for score_type in ["what", "why", "overall"]: + assert "mean" in stats[score_type] + assert "std" in stats[score_type] + assert "min" in stats[score_type] + assert "max" in stats[score_type] + + ### Calculate Statistics #### + + def test_calculate_statistics_returns_expected_structure(self, 
benchmarks): + """Test that _calculate_statistics returns expected structure.""" + results = [ + {"what_score": 4.0, "why_score": 3.0, "overall_score": 3.5}, + {"what_score": 4.0, "why_score": 3.0, "overall_score": 3.5}, + ] + execution_times = [0.5, 0.3] + + with patch.object(benchmarks, "_calculate_score_variance", return_value={}): + result = benchmarks._calculate_statistics(results, execution_times) + + assert result is not None + assert "what" in result + assert "why" in result + assert "overall" in result + assert "execution_time" in result + + assert "mean" in result["execution_time"] + assert "std" in result["execution_time"] + assert "min" in result["execution_time"] + assert "max" in result["execution_time"] + + def test_calculate_statistics_calls_score_variance_for_each_score_type( + self, benchmarks + ): + """Test that _calculate_score_variance is called for each score type.""" + results = [ + {"what_score": 4.0, "why_score": 3.0, "overall_score": 3.5}, + {"what_score": 5.0, "why_score": 4.0, "overall_score": 4.5}, + ] + execution_times = [0.5, 0.3] + + with patch.object( + benchmarks, "_calculate_score_variance", return_value={} + ) as mock_calc: + benchmarks._calculate_statistics(results, execution_times) + + assert mock_calc.call_count == 3 + + calls = mock_calc.call_args_list + assert calls[0][0][0] == [4.0, 5.0] + assert calls[1][0][0] == [3.0, 4.0] + assert calls[2][0][0] == [3.5, 4.5] + + def test_calculate_statistics_calculates_execution_time_stats_correctly( + self, benchmarks + ): + """Test execution time statistics are calculated correctly.""" + results = [{"what_score": 4.0, "why_score": 3.0, "overall_score": 3.5}] + execution_times = [0.5, 0.3] + + with patch.object(benchmarks, "_calculate_score_variance", return_value={}): + result = benchmarks._calculate_statistics(results, execution_times) + + assert result["execution_time"]["mean"] == pytest.approx(0.4) + assert result["execution_time"]["std"] == pytest.approx(0.14142135623) + assert result["execution_time"]["min"] == 0.3 + assert result["execution_time"]["max"] == 0.5 + + def test_calculate_statistics_uses_score_variance_results(self, benchmarks): + """Test that score variance results are properly integrated.""" + results = [{"what_score": 4.0, "why_score": 3.0, "overall_score": 3.5}] + execution_times = [0.5] + + mock_variance = { + "mean": 4.2, + "median": 4.0, + "std": 0.1, + "min": 4.0, + "max": 4.4, + "range": 0.4, + } + + with patch.object( + benchmarks, "_calculate_score_variance", return_value=mock_variance + ): + result = benchmarks._calculate_statistics(results, execution_times) + + assert result["what"] == mock_variance + assert result["why"] == mock_variance + assert result["overall"] == mock_variance + + #### Score Variance ##### + + def test_calculate_score_if_no_scores_returns_zero_stats(self, benchmarks): + """Test calculate_score_variance returns zero stats if no scores""" + scores: list[float] = [] + result = benchmarks._calculate_score_variance(scores) + expected = { + "mean": 0.0, + "median": 0.0, + "std": 0.0, + "min": 0.0, + "max": 0.0, + "range": 0.0, + } + assert result == expected + + def test_calculate_score_variance_returns_expected_structure(self, benchmarks): + """Test calculate_score_variance returns expected structure""" + scores = [1.0, 2.0, 3.0, 4.0, 5.0] + result = benchmarks._calculate_score_variance(scores) + assert result is not None + assert "mean" in result + assert "median" in result + assert "std" in result + assert "min" in result + assert "max" in result + assert 
"range" in result + + def test_calculate_score_variance_returns_correct_values(self, benchmarks): + """Test calculate_score_variance returns correct values""" + scores = [1.0, 2.0, 3.0, 4.0, 5.0] + result = benchmarks._calculate_score_variance(scores) + assert result["mean"] == pytest.approx(3.0) + assert result["median"] == pytest.approx(3.0) + assert result["std"] == pytest.approx(1.58113883008) + assert result["min"] == 1.0 + assert result["max"] == 5.0 + assert result["range"] == 4.0 + + def test_stability_test_single_run_handles_stdev_gracefully(self, benchmarks): + """Test single run doesn't crash when calculating standard deviation""" + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result + ): + result = benchmarks.stability_test("test", "diff", runs=1) + + assert result["statistics"]["execution_time"]["std"] == 0 + assert result["statistics"]["what"]["std"] == 0 + assert result["statistics"]["why"]["std"] == 0 + assert result["statistics"]["overall"]["std"] == 0 + + def test_stability_test_zero_variance_threshold_requires_perfect_stability( + self, benchmarks + ): + """Test zero variance threshold requires perfect stability""" + mock_result1 = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + mock_result2 = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=[mock_result1, mock_result2], + ): + result = benchmarks.stability_test( + "test", "diff", runs=2, variance_threshold=0.0 + ) + assert result["is_stable"] is True + assert result["max_variance"] == 0.0 + + mock_result3 = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + mock_result4 = Mock( + what_score=4.1, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=[mock_result3, mock_result4], + ): + result = benchmarks.stability_test( + "test", "diff", runs=2, variance_threshold=0.0 + ) + assert result["is_stable"] is False + assert result["max_variance"] == pytest.approx(0.1) + + def test_stability_test_negative_variance_threshold_allowed(self, benchmarks): + """Test negative variance threshold is allowed (though not practical)""" + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result + ): + result = benchmarks.stability_test( + "test", "diff", runs=2, variance_threshold=-0.1 + ) + assert result["variance_threshold"] == -0.1 + assert result["is_stable"] is False + + def test_stability_test_evaluator_exception_propagates(self, benchmarks): + """Test evaluator exceptions are handled appropriately""" + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=RuntimeError("LLM API error"), + ): + with pytest.raises(RuntimeError, match="LLM API error"): + benchmarks.stability_test("test", "diff", runs=2) + + def test_stability_test_evaluator_exception_on_second_run_propagates( + self, benchmarks + ): + """Test evaluator exception on second run still propagates""" + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=[mock_result, RuntimeError("Network 
timeout")], + ): + with pytest.raises(RuntimeError, match="Network timeout"): + benchmarks.stability_test("test", "diff", runs=2) + + def test_calculate_score_variance_single_score_returns_zero_std(self, benchmarks): + """Test single score returns std=0 without crashing""" + scores = [4.0] + result = benchmarks._calculate_score_variance(scores) + + assert result["mean"] == 4.0 + assert result["median"] == 4.0 + assert result["std"] == 0 + assert result["min"] == 4.0 + assert result["max"] == 4.0 + assert result["range"] == 0.0 + + def test_stability_test_large_runs_parameter_performance(self, benchmarks): + """Test large runs parameter doesn't cause performance issues""" + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result + ): + start_time = time.time() + result = benchmarks.stability_test("test", "diff", runs=100) + execution_time = time.time() - start_time + + assert len(result["results"]) == 100 + assert result["runs"] == 100 + assert execution_time < 5.0 + + def test_stability_test_extreme_score_values_at_boundaries(self, benchmarks): + """Test stability test with extreme score values at boundaries""" + mock_result1 = Mock( + what_score=0.0, why_score=0.0, overall_score=0.0, confidence=0.0 + ) + mock_result2 = Mock( + what_score=5.0, why_score=5.0, overall_score=5.0, confidence=1.0 + ) + + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=[mock_result1, mock_result2], + ): + result = benchmarks.stability_test("test", "diff", runs=2) + + assert result["statistics"]["what"]["min"] == 0.0 + assert result["statistics"]["what"]["max"] == 5.0 + assert result["statistics"]["what"]["range"] == 5.0 + assert result["is_stable"] is False # High variance + + def test_stability_test_negative_score_values(self, benchmarks): + """Test stability test with negative score values (outside expected range)""" + mock_result = Mock( + what_score=-1.0, why_score=-2.0, overall_score=-1.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result + ): + result = benchmarks.stability_test("test", "diff", runs=2) + + assert result["statistics"]["what"]["mean"] == -1.0 + assert result["statistics"]["why"]["mean"] == -2.0 + + def test_stability_test_very_high_variance_threshold_always_stable( + self, benchmarks + ): + """Test very high variance threshold makes any result stable""" + mock_result1 = Mock( + what_score=1.0, why_score=1.0, overall_score=1.0, confidence=0.1 + ) + mock_result2 = Mock( + what_score=5.0, why_score=5.0, overall_score=5.0, confidence=1.0 + ) + + with patch.object( + benchmarks.evaluator, + "evaluate_commit_message", + side_effect=[mock_result1, mock_result2], + ): + result = benchmarks.stability_test( + "test", "diff", runs=2, variance_threshold=10.0 + ) + + assert result["is_stable"] is True + assert result["variance_threshold"] == 10.0 + assert result["max_variance"] < 10.0 + + def test_stability_test_very_long_message_and_diff_strings(self, benchmarks): + """Test stability test with very long message and diff strings""" + long_message = "A" * 10000 # 10KB message + long_diff = "B" * 50000 # 50KB diff + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result + ): + result = benchmarks.stability_test(long_message, long_diff, 
runs=2) + + assert result["message"] == long_message + assert len(result["results"]) == 2 + assert result["runs"] == 2 + + def test_stability_test_unicode_and_special_characters(self, benchmarks): + """Test stability test with unicode and special characters""" + unicode_message = "feat: Add 支持中文 and émojis 🚀 with ñiño characters" + special_diff = """ + + print("Hello 世界! 🌍") + - console.log('Goodbye 👋') + + # TODO: Fix unicode handling ñoño café + """ + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result + ): + result = benchmarks.stability_test(unicode_message, special_diff, runs=2) + + assert result["message"] == unicode_message + assert len(result["results"]) == 2 + assert result["runs"] == 2 + + def test_stability_test_special_control_characters(self, benchmarks): + """Test stability test with control characters and escape sequences""" + control_message = "fix: Handle \\n\\t\\r and \\x00 characters" + control_diff = "- old\\nline\\twith\\ttabs\n+ new\\nline\\twith\\tspaces" + mock_result = Mock( + what_score=4.0, why_score=3.0, overall_score=3.5, confidence=0.8 + ) + + with patch.object( + benchmarks.evaluator, "evaluate_commit_message", return_value=mock_result + ): + result = benchmarks.stability_test(control_message, control_diff, runs=2) + + assert result["message"] == control_message + assert len(result["results"]) == 2 diff --git a/uv.lock b/uv.lock index 018681c..829e8b0 100644 --- a/uv.lock +++ b/uv.lock @@ -314,7 +314,7 @@ wheels = [ [[package]] name = "diffmage" -version = "0.3.2" +version = "0.5.0" source = { editable = "." } dependencies = [ { name = "gitpython" },
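
Reviewer note (illustrative, not part of the diff): the sketch below mirrors the calls the new benchmark_stability command makes, so the EvaluationBenchmarks API can be exercised directly. It assumes an importable diffmage environment with staged changes and LLM credentials configured; the repository path, commit message, run count, and threshold are placeholder values.

    # Hedged usage sketch of the new stability-benchmark API.
    from diffmage.evaluation.benchmarks import EvaluationBenchmarks
    from diffmage.evaluation.commit_message_evaluator import CommitMessageEvaluator
    from diffmage.git.diff_parser import GitDiffParser

    # Collect the staged diff the same way the CLI command does ("." is a placeholder repo path).
    parser = GitDiffParser(".")
    analysis = parser.parse_staged_changes()
    diff = analysis.get_combined_diff()

    # Score the same message several times and compare per-dimension score ranges to the threshold.
    benchmarks = EvaluationBenchmarks(CommitMessageEvaluator())
    result = benchmarks.stability_test(
        "fix: handle empty diffs in parser",  # placeholder commit message
        diff,
        runs=3,
        variance_threshold=0.2,
    )

    print(result["is_stable"], result["max_variance"])   # stability verdict and max score range
    print(result["statistics"]["overall"]["std"])         # per-dimension spread is also available

Whether the verdict comes out stable depends on the configured model; the 0.2 threshold matches the value hard-coded in the benchmark_stability command above.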