From 801bdfa3a19aba355292393535bf19e9d7b28719 Mon Sep 17 00:00:00 2001 From: oshtz Date: Wed, 31 Dec 2025 13:09:10 +0200 Subject: [PATCH] feat: Enhance scoring and analysis features - Added AggregateScore and MultiRunStats interfaces to improve statistical tracking of scores. - Introduced a comprehensive Testing Methodology document detailing scoring methods, benchmark modes, and statistical analysis. - Implemented MultiRunAnalysis component for visual representation of multi-run statistics and model comparisons. - Created judge calibration functionality to assess and improve LLM judge accuracy with reference samples. - Updated RunResult interface to include error tracking for better UI feedback. --- README.md | 2 +- TESTING_METHODOLOGY.md | 499 ++++++++++++++++++ package-lock.json | 4 +- package.json | 2 +- src-tauri/Cargo.lock | 2 +- src-tauri/Cargo.toml | 2 +- src-tauri/tauri.conf.json | 2 +- src/components/arena/ExecutionControls.tsx | 122 +++-- src/components/arena/ParameterPanel.tsx | 39 +- src/components/code-arena/CodeArena.tsx | 19 +- .../code-arena/CodeArenaModelPanel.tsx | 2 +- src/components/code-arena/CodeEditorView.tsx | 20 +- src/components/results/MultiRunAnalysis.tsx | 184 +++++++ src/components/results/ReportSummary.tsx | 9 +- src/components/results/Results.tsx | 4 + src/components/ui/scroll-area.tsx | 2 +- src/scoring/code-arena-judge.ts | 19 +- src/scoring/exact-match.ts | 50 +- src/scoring/judge-calibration.ts | 274 ++++++++++ src/scoring/numeric-tolerance.ts | 32 +- src/services/codeArenaExecution.ts | 34 +- src/services/execution.ts | 64 ++- src/stores/modelStore.ts | 50 ++ src/stores/runStore.ts | 262 ++++++++- src/types/index.ts | 27 + 25 files changed, 1616 insertions(+), 110 deletions(-) create mode 100644 TESTING_METHODOLOGY.md create mode 100644 src/components/results/MultiRunAnalysis.tsx create mode 100644 src/scoring/judge-calibration.ts diff --git a/README.md b/README.md index d1ae6ea..c7ecd88 100644 --- a/README.md +++ b/README.md @@ -186,7 +186,7 @@ Enter your OpenRouter API key in the Settings (gear icon in header). ### Updates - The app checks for updates on startup -- Click the version button in the header (e.g. `v0.1.1`) to view update status, release notes, or manually re-check +- Click the version button in the header (e.g. `v0.1.2`) to view update status, release notes, or manually re-check - Updates are pulled from GitHub Releases and expect a `Benchmaker-Portable.exe` asset on the latest tag ## Development diff --git a/TESTING_METHODOLOGY.md b/TESTING_METHODOLOGY.md new file mode 100644 index 0000000..8c14b1a --- /dev/null +++ b/TESTING_METHODOLOGY.md @@ -0,0 +1,499 @@ +# Testing Methodology + +Internal reference for Benchmaker's evaluation system and how to achieve accurate, reproducible 0-100 scoring. + +--- + +## Quick Reference + +| Scoring Method | Scale | Output Range | Reproducibility | Best For | +|----------------|-------|--------------|-----------------|----------| +| Exact Match | 0-1 → 0-100 | 0-100 continuous | Deterministic | Short, precise answers | +| Regex Match | 0-1 → 0-100 | 0 or 100 | Deterministic | Pattern validation | +| Numeric Tolerance | 0-1 → 0-100 | 0-100 continuous | Deterministic | Math/calculations | +| Boolean | 0-1 → 0-100 | 0 or 100 | Deterministic | Contains check | +| LLM Judge | 0-10 → 0-100 | 0-100 continuous | ±5-10% variance | Complex/subjective | +| Code Arena Judge | 0-100 native | 0-100 continuous | ±5-15% variance | Frontend code | + +--- + +## 1. 
Overview + +Benchmaker evaluates LLM responses to produce a **0-100 rating per task per model**. The system supports: + +- **Two benchmark modes**: Standard Arena and Code Arena +- **Five scoring methods**: From deterministic (exact match) to AI-powered (LLM judge) +- **Weighted test cases**: Prioritize important tests in aggregate scores +- **Statistical analysis**: Multi-run variance, confidence intervals, model comparison +- **Benchmark mode**: One-click setup for reproducible results (temp=0) + +### Core Principles + +1. All scores are **normalized to 0-1 internally**, displayed as **0-100** +2. Each test case can use a different scoring method +3. Test case weights affect aggregate scores +4. Full execution context is persisted for reproducibility + +--- + +## 2. Benchmark Modes + +### Standard Arena + +- **Purpose**: Evaluate models across multiple test cases in a test suite +- **Flow**: Test Suite → Test Cases → Models → Scored Results +- **Scoring**: Any of the 5 scoring methods per test case +- **Aggregation**: Weighted average per model + +### Code Arena + +- **Purpose**: Compare frontend code generation with live preview +- **Flow**: Prompt → Models → Code Extraction → LLM Judge +- **Scoring**: Code Arena Judge (0-100 with weighted criteria) +- **Output**: Side-by-side comparison with rendered previews + +--- + +## 3. Score Calculation + +### Internal Representation + +```typescript +interface ScoringResult { + score: number // 0-1 normalized (primary score) + confidence?: number // 0-1, scorer confidence + notes?: string // Human-readable explanation + rawScore?: number // Original scale (e.g., 0-10) + maxScore?: number // Maximum on original scale +} +``` + +### Display Conversion + +``` +Display Score = score × 100 +``` + +Example: `score: 0.85` → **85/100** + +### Aggregate Scoring + +Three functions for different levels of detail: + +**Simple aggregate** - weighted average per model: +```typescript +getAggregateScores(runId, testCases?) +// Returns Map +``` + +**Detailed aggregate** - full statistics: +```typescript +getDetailedAggregateScores(runId, testCases?) +// Returns Map + +interface AggregateScore { + mean: number // Weighted mean (0-1) + stdDev: number // Standard deviation + min: number + max: number + count: number // Number of scored results + totalWeight: number // Sum of weights + confidence95: [number, number] // 95% CI +} +``` + +**Multi-run statistics** - variance across runs: +```typescript +getMultiRunStats(runIds, testCases?) +// Returns Map + +interface MultiRunStats { + runIds: string[] + modelId: string + scores: number[] // Score from each run + mean: number + stdDev: number + min: number + max: number + confidence95: [number, number] +} +``` + +### Test Case Weights + +Each test case has a `weight` field (default: 1). Higher weight = more impact on aggregate score. + +```typescript +interface TestCase { + weight: number // Default 1, increase for important tests + // ... +} + +// Calculation: +weightedAverage = Σ(score × weight) / Σ(weight) +``` + +--- + +## 4. 
Scoring Methods + +### 4.1 Exact Match + +**Location**: `src/scoring/exact-match.ts` + +| Condition | Score | Display | +|-----------|-------|---------| +| Exact string match | 1.0 | 100 | +| Case-insensitive match | 0.95 | 95 | +| Expected within response | 0.60-0.95 | 60-95 (varies by extra content) | +| Case-insensitive contains | 0.55-0.90 | 55-90 (varies by extra content) | +| High similarity (>50%) | similarity × 0.7 | 35-70 (continuous) | +| Low similarity (20-50%) | similarity × 0.4 | 8-20 (continuous) | +| No match (<20% similar) | 0 | 0 | + +Uses Levenshtein distance for continuous partial matching. Scores adjust based on extra content ratio when expected output is found within a longer response. Best for short, precise answers. + +### 4.2 Regex Match + +**Location**: `src/scoring/regex-match.ts` + +| Condition | Score | Display | +|-----------|-------|---------| +| Pattern matches | 1.0 | 100 | +| No match | 0 | 0 | + +Supports `/pattern/flags` or plain patterns. Best for format validation. + +### 4.3 Numeric Tolerance + +**Location**: `src/scoring/numeric-tolerance.ts` + +| Condition | Score | Display | +|-----------|-------|---------| +| Within 1% tolerance | 1.0 | 100 | +| Within 25% | Continuous decay | 0-100 (smooth curve) | +| Outside 25% | 0 | 0 | + +Uses continuous scoring with smooth exponential decay: `score = 1 - (error/0.25)^0.5`. This provides scores like: 0% error = 100, 1% = 80, 6.25% = 50, 25% = 0. + +Extracts all numbers from response (including scientific notation). Best for math problems. + +### 4.4 Boolean Match + +**Location**: `src/scoring/index.ts` + +| Condition | Score | Display | +|-----------|-------|---------| +| Expected substring found | 1.0 | 100 | +| Not found | 0 | 0 | +| No expected output | 1.0 | 100 (auto-pass) | + +Case-insensitive substring check. Best for simple contains/doesn't contain. + +### 4.5 LLM Judge + +**Location**: `src/scoring/llm-judge.ts` +**Temperature**: 0.1 + +| Score | Meaning | +|-------|---------| +| 10 | Perfect, fully correct and complete | +| 8-9 | Excellent with minor issues | +| 6-7 | Good but missing elements | +| 4-5 | Partially correct, significant issues | +| 2-3 | Mostly incorrect | +| 0-1 | Completely wrong | + +Conversion: `displayScore = (rawScore / 10) × 100` + +Best for complex, subjective, or open-ended tasks. + +### 4.6 Code Arena Judge + +**Location**: `src/scoring/code-arena-judge.ts` +**Temperature**: 0.3 + +| Criterion | Weight | +|-----------|--------| +| Visual Accuracy | 40% | +| Code Quality | 30% | +| Functionality | 20% | +| Responsiveness | 10% | + +Scores directly on 0-100 scale. Best for frontend code generation. + +--- + +## 5. 
Reproducibility + +### Benchmark Mode + +Toggle in the Parameter Panel to enable reproducible benchmarking: + +- Temperature locked to **0** +- Frequency penalty locked to **0** +- Presence penalty locked to **0** + +```typescript +// Programmatically +modelStore.toggleBenchmarkMode() + +// Get effective parameters (respects benchmark mode) +const params = modelStore.getEffectiveParameters() +``` + +### Default Parameters + +```typescript +{ + temperature: 0.7, + topP: 1, + maxTokens: 2048, + frequencyPenalty: 0, + presencePenalty: 0, + benchmarkMode: false, +} +``` + +### Persisted Per Run + +| Variable | Storage | +|----------|---------| +| Model IDs | `RunResult.models` | +| Parameters | `RunResult.parameters` | +| System Prompt | `TestSuite.systemPrompt` | +| Judge Prompt | `TestSuite.judgeSystemPrompt` | +| Judge Model | `RunResult.judgeModel` | +| Timestamps | `RunResult.startedAt/completedAt` | + +### Sources of Variance + +| Source | Impact | Mitigation | +|--------|--------|------------| +| Model temperature > 0 | High | Enable Benchmark Mode | +| LLM judge | Medium (±5-10%) | Run multiple times | +| API-side sampling | Low | Cannot control | +| Code Arena judge | Medium (±5-15%) | Run multiple times | + +--- + +## 6. Statistical Comparison + +Compare two models across multiple runs: + +```typescript +compareModels(runIds, modelA, modelB, testCases?) +// Returns ModelComparison | null + +interface ModelComparison { + modelA: string + modelB: string + meanA: number + meanB: number + scoreDiff: number // meanA - meanB + pooledStdErr: number + tStatistic: number + pValue: number // Two-tailed + isSignificant: boolean // p < 0.05 + effectSize: number // Cohen's d +} +``` + +### Interpretation + +- **pValue < 0.05**: Statistically significant difference +- **Effect size (Cohen's d)**: + - |d| < 0.2: Negligible + - |d| 0.2-0.5: Small + - |d| 0.5-0.8: Medium + - |d| > 0.8: Large + +Requires at least 2 runs per model. + +--- + +## 7. Judge Calibration + +Test LLM judges against known reference samples: + +```typescript +import { calibrateJudge, interpretCalibrationResult } from '@/scoring/judge-calibration' + +const result = await calibrateJudge(client, judgeModelId) +const interpretation = interpretCalibrationResult(result) +``` + +### Calibration Result + +```typescript +interface CalibrationResult { + judgeModelId: string + timestamp: number + samples: CalibrationSampleResult[] + summary: { + totalSamples: number + passedSamples: number + passRate: number // % within tolerance + meanAbsoluteError: number + maxError: number + bias: number // Positive = overscoring + correlation: number // Pearson correlation + } +} +``` + +### Quality Ratings + +| Rating | Pass Rate | MAE | Correlation | +|--------|-----------|-----|-------------| +| Excellent | ≥90% | ≤10% | ≥0.9 | +| Good | ≥75% | ≤15% | ≥0.8 | +| Fair | ≥50% | ≤25% | ≥0.6 | +| Poor | <50% | >25% | <0.6 | + +Default calibration includes 8 reference samples covering factual, empty, irrelevant, and explanation-type responses. + +--- + +## 8. Execution Pipeline + +### Standard Arena + +``` +1. Create result entries for all (test case × model) combinations +2. Execute in parallel (concurrency limit: 5) +3. For each task: + a. Build messages (system prompt + user prompt) + b. Stream response with retry logic + c. Record latency, tokens, cost + d. Score using configured method +4. 
Aggregate scores per model +``` + +### Retry Logic + +- Max retries: 2 for empty responses +- Backoff: 400ms × (attempt + 1) +- Fallback: Non-streaming request if streaming fails +- **Request timeout**: 2 minutes per request (prevents hung API calls from blocking execution) + +### Cost Calculation + +``` +Cost = (prompt_tokens × prompt_price) + (completion_tokens × completion_price) +``` + +--- + +## 9. Best Practices + +### For Accurate Results + +1. **Enable Benchmark Mode** for reproducible model responses +2. **Use deterministic scoring** (exact, regex, numeric, boolean) when possible +3. **Set test case weights** for important tests +4. **Run 3-5 times** for LLM-judged tests +5. **Calibrate your judge** before relying on LLM judge scores +6. **Use statistical comparison** to verify differences are significant + +### Multi-Run Protocol + +**Using the UI:** +1. In the Arena tab, click the dropdown arrow next to "Run Benchmark" +2. Select "Run 3 times", "Run 5 times", or "Run 10 times" +3. The system will execute the benchmark sequentially, showing progress +4. In the Results tab, the "Multi-Run Analysis" panel will automatically appear +5. View mean scores, standard deviations, confidence intervals, and statistical comparisons + +**Programmatic API:** +```typescript +// Run benchmark 3-5 times, collect run IDs +const runIds = [run1.id, run2.id, run3.id] + +// Get multi-run statistics +const stats = runStore.getMultiRunStats(runIds, testSuite.testCases) + +// Report results +const modelStats = stats.get('gpt-4') +console.log(`gpt-4: ${(modelStats.mean * 100).toFixed(1)} ± ${(modelStats.stdDev * 100).toFixed(1)}`) +// Output: "gpt-4: 85.2 ± 3.1" +``` + +### Score Reporting Format + +``` +Model X: 85.2 ± 3.1 (95% CI: [82.1, 88.3]) + ↑ ↑ ↑ + mean stdDev confidence interval +``` + +### Confidence Guidelines + +| Scoring Method | Confidence | Runs Needed | +|----------------|------------|-------------| +| Exact Match | High | 1 | +| Regex Match | High | 1 | +| Numeric Tolerance | High | 1 | +| Boolean | High | 1 | +| LLM Judge | Medium | 3-5 | +| Code Arena Judge | Medium | 3-5 | + +--- + +## 10. Limitations + +### Inherent Constraints + +1. **LLM Judge Variance**: Even at temp=0.1, ~±5-10% variance exists +2. **API Non-Determinism**: Some providers vary even at temp=0 +3. 
**Code Truncation in Quick Judge**: Codes >5000 chars are truncated (warning shown in results) + +### Future Improvements + +- Inter-rater reliability (multiple judges) +- Elo-style rankings +- Export/reporting + +--- + +## Code Reference + +| Component | Location | +|-----------|----------| +| **Scoring** | +| Exact match | `src/scoring/exact-match.ts` | +| Regex match | `src/scoring/regex-match.ts` | +| Numeric tolerance | `src/scoring/numeric-tolerance.ts` | +| Boolean match | `src/scoring/index.ts` | +| LLM judge | `src/scoring/llm-judge.ts` | +| Code arena judge | `src/scoring/code-arena-judge.ts` | +| Judge calibration | `src/scoring/judge-calibration.ts` | +| **State** | +| Aggregate scores | `src/stores/runStore.ts` → `getAggregateScores()` | +| Detailed stats | `src/stores/runStore.ts` → `getDetailedAggregateScores()` | +| Multi-run stats | `src/stores/runStore.ts` → `getMultiRunStats()` | +| Model comparison | `src/stores/runStore.ts` → `compareModels()` | +| Benchmark mode | `src/stores/modelStore.ts` → `toggleBenchmarkMode()` | +| **Execution** | +| Standard arena | `src/services/execution.ts` | +| Code arena | `src/services/codeArenaExecution.ts` | +| **UI** | +| Parameter panel | `src/components/arena/ParameterPanel.tsx` | + +--- + +## Summary + +Benchmaker produces accurate 0-100 scores per task per model with: + +| Capability | Usage | +|------------|-------| +| Deterministic scoring | Use exact/regex/numeric/boolean | +| Weighted aggregation | Pass `testCases` to aggregate functions | +| Benchmark mode | Toggle in Parameter Panel | +| Multi-run statistics | `getMultiRunStats(runIds, testCases)` | +| Confidence intervals | `getDetailedAggregateScores(runId, testCases)` | +| Statistical comparison | `compareModels(runIds, modelA, modelB)` | +| Judge calibration | `calibrateJudge(client, judgeModelId)` | + +**For reliable results**: Enable Benchmark Mode, use deterministic scoring where possible, run LLM-judged tests 3-5 times, and verify differences with `compareModels()`. 
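## Appendix: Worked Statistical Comparison

The two-sample comparison in section 6 can be hard to picture from the store interface alone. The sketch below is a minimal, standalone TypeScript example of the same arithmetic used by `compareModels()` — pooled variance, t-statistic, a normal-approximation two-tailed p-value, and Cohen's d — applied to two arrays of per-run aggregate scores (0-1). The function name `compareScoreSets` and the inline sample scores are illustrative only, not part of the codebase, and at least two runs per model are assumed.

```typescript
// Abramowitz–Stegun approximation of the standard normal CDF,
// mirroring the helper in src/stores/runStore.ts.
function normalCDF(x: number): number {
  const a1 = 0.254829592, a2 = -0.284496736, a3 = 1.421413741
  const a4 = -1.453152027, a5 = 1.061405429, p = 0.3275911
  const sign = x < 0 ? -1 : 1
  const z = Math.abs(x) / Math.sqrt(2)
  const t = 1 / (1 + p * z)
  const y = 1 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * Math.exp(-z * z)
  return 0.5 * (1 + sign * y)
}

// Illustrative sketch: pooled two-sample t-test plus Cohen's d on per-run
// aggregate scores for two models. Requires at least 2 scores per model.
function compareScoreSets(scoresA: number[], scoresB: number[]) {
  const nA = scoresA.length
  const nB = scoresB.length
  const meanA = scoresA.reduce((a, b) => a + b, 0) / nA
  const meanB = scoresB.reduce((a, b) => a + b, 0) / nB

  // Sample variances (n - 1 denominator)
  const varA = scoresA.reduce((s, x) => s + (x - meanA) ** 2, 0) / (nA - 1)
  const varB = scoresB.reduce((s, x) => s + (x - meanB) ** 2, 0) / (nB - 1)

  // Pooled variance and standard error of the mean difference
  const pooledVar = ((nA - 1) * varA + (nB - 1) * varB) / (nA + nB - 2)
  const pooledStdErr = Math.sqrt(pooledVar * (1 / nA + 1 / nB))

  // t-statistic; p-value approximated with the normal CDF (two-tailed)
  const tStatistic = pooledStdErr > 0 ? (meanA - meanB) / pooledStdErr : 0
  const pValue = 2 * (1 - normalCDF(Math.abs(tStatistic)))

  // Cohen's d effect size
  const pooledStdDev = Math.sqrt(pooledVar)
  const effectSize = pooledStdDev > 0 ? (meanA - meanB) / pooledStdDev : 0

  return {
    meanA,
    meanB,
    scoreDiff: meanA - meanB,
    tStatistic,
    pValue,
    isSignificant: pValue < 0.05,
    effectSize,
  }
}

// Example with hypothetical aggregate scores from 3 runs per model:
// a ~7-point gap with low variance comes out significant with a large effect size.
console.log(compareScoreSets([0.85, 0.88, 0.83], [0.78, 0.80, 0.76]))
```

In practice `compareModels(runIds, modelA, modelB, testCases)` does the same computation, but pulls the per-run scores from `getMultiRunStats()` instead of taking raw arrays; note the p-value uses a normal approximation, so treat significance from very small run counts as indicative rather than exact.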
diff --git a/package-lock.json b/package-lock.json index e8864e3..352f079 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "benchmaker", - "version": "0.1.1", + "version": "0.1.2", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "benchmaker", - "version": "0.1.1", + "version": "0.1.2", "dependencies": { "@monaco-editor/react": "^4.7.0", "@radix-ui/react-alert-dialog": "^1.1.15", diff --git a/package.json b/package.json index 4a41a16..98d3da0 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "benchmaker", - "version": "0.1.1", + "version": "0.1.2", "type": "module", "scripts": { "dev": "vite", diff --git a/src-tauri/Cargo.lock b/src-tauri/Cargo.lock index 454abf6..bc097e0 100644 --- a/src-tauri/Cargo.lock +++ b/src-tauri/Cargo.lock @@ -109,7 +109,7 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "benchmaker" -version = "0.1.1" +version = "0.1.2" dependencies = [ "rusqlite", "serde", diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index 7a7aa31..7e3eb0b 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmaker" -version = "0.1.1" +version = "0.1.2" description = "Benchmaker" authors = ["you"] edition = "2021" diff --git a/src-tauri/tauri.conf.json b/src-tauri/tauri.conf.json index 9b7b292..fa12ce5 100644 --- a/src-tauri/tauri.conf.json +++ b/src-tauri/tauri.conf.json @@ -7,7 +7,7 @@ }, "package": { "productName": "Benchmaker", - "version": "0.1.1" + "version": "0.1.2" }, "tauri": { "allowlist": { diff --git a/src/components/arena/ExecutionControls.tsx b/src/components/arena/ExecutionControls.tsx index 16010f1..9bf601a 100644 --- a/src/components/arena/ExecutionControls.tsx +++ b/src/components/arena/ExecutionControls.tsx @@ -1,6 +1,12 @@ import { useState } from 'react' -import { Play, Square, Loader2 } from 'lucide-react' +import { Play, Square, Repeat, ChevronDown } from 'lucide-react' import { Button } from '@/components/ui/button' +import { + DropdownMenu, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuTrigger, +} from '@/components/ui/dropdown-menu' import { useToast } from '@/components/ui/use-toast' import { useSettingsStore } from '@/stores/settingsStore' import { useModelStore } from '@/stores/modelStore' @@ -15,15 +21,33 @@ interface ExecutionControlsProps { export function ExecutionControls({ testSuite }: ExecutionControlsProps) { const { apiKey } = useSettingsStore() const { selectedModelIds, parameters, judgeModelId } = useModelStore() - const { createRun, updateRunStatus } = useRunStore() + const { createRun } = useRunStore() const { toast } = useToast() const [isRunning, setIsRunning] = useState(false) const [abortController, setAbortController] = useState(null) + const [currentRunIndex, setCurrentRunIndex] = useState(0) + const [totalRuns, setTotalRuns] = useState(1) const canRun = selectedModelIds.length > 0 && testSuite.testCases.length > 0 - const handleRun = async () => { + const executeSingleRun = async (controller: AbortController): Promise => { + const run = createRun({ + testSuiteId: testSuite.id, + testSuiteName: testSuite.name, + models: selectedModelIds, + parameters, + results: [], + status: 'running', + startedAt: Date.now(), + judgeModel: judgeModelId || undefined, + }) + + await executeRun(run.id, testSuite, apiKey!, controller.signal) + return run.id + } + + const handleRun = async (numRuns: number = 1) => { if (!canRun || !apiKey) { toast({ title: 'Cannot start run', 
@@ -36,33 +60,44 @@ export function ExecutionControls({ testSuite }: ExecutionControlsProps) { const controller = new AbortController() setAbortController(controller) setIsRunning(true) + setTotalRuns(numRuns) + setCurrentRunIndex(0) - const run = createRun({ - testSuiteId: testSuite.id, - testSuiteName: testSuite.name, - models: selectedModelIds, - parameters, - results: [], - status: 'running', - startedAt: Date.now(), - judgeModel: judgeModelId || undefined, - }) + const completedRunIds: string[] = [] + let cancelled = false try { - await executeRun(run.id, testSuite, apiKey, controller.signal) - toast({ - title: 'Run completed', - description: `Benchmarked ${selectedModelIds.length} models on ${testSuite.testCases.length} test cases`, - }) + for (let i = 0; i < numRuns; i++) { + if (controller.signal.aborted) { + cancelled = true + break + } + + setCurrentRunIndex(i + 1) + const runId = await executeSingleRun(controller) + completedRunIds.push(runId) + } + + if (!cancelled) { + if (numRuns === 1) { + toast({ + title: 'Run completed', + description: `Benchmarked ${selectedModelIds.length} models on ${testSuite.testCases.length} test cases`, + }) + } else { + toast({ + title: `${numRuns} runs completed`, + description: `Completed ${numRuns} benchmark runs. Use Results tab to analyze multi-run statistics.`, + }) + } + } } catch (error) { - if (error instanceof Error && error.name === 'AbortError') { - updateRunStatus(run.id, 'cancelled') + if (error instanceof DOMException && error.name === 'AbortError') { toast({ - title: 'Run cancelled', - description: 'The benchmark run was stopped', + title: 'Runs cancelled', + description: `Stopped after ${completedRunIds.length} of ${numRuns} runs`, }) } else { - updateRunStatus(run.id, 'failed') toast({ title: 'Run failed', description: error instanceof Error ? error.message : 'Unknown error', @@ -72,6 +107,8 @@ export function ExecutionControls({ testSuite }: ExecutionControlsProps) { } finally { setIsRunning(false) setAbortController(null) + setCurrentRunIndex(0) + setTotalRuns(1) } } @@ -81,22 +118,45 @@ export function ExecutionControls({ testSuite }: ExecutionControlsProps) { } } + const runOptions = [3, 5, 10] + return (
{isRunning ? ( ) : ( - + Run Benchmark + + + + + + + {runOptions.map((n) => ( + handleRun(n)}> + + Run {n} times + + ))} + + +
)} {selectedModelIds.length === 0 && ( diff --git a/src/components/arena/ParameterPanel.tsx b/src/components/arena/ParameterPanel.tsx index ffde367..d275178 100644 --- a/src/components/arena/ParameterPanel.tsx +++ b/src/components/arena/ParameterPanel.tsx @@ -1,13 +1,14 @@ -import { RotateCcw } from 'lucide-react' +import { RotateCcw, FlaskConical } from 'lucide-react' import { Button } from '@/components/ui/button' import { Input } from '@/components/ui/input' import { Label } from '@/components/ui/label' import { Slider } from '@/components/ui/slider' +import { Switch } from '@/components/ui/switch' import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/card' import { useModelStore } from '@/stores/modelStore' export function ParameterPanel() { - const { parameters, setParameters, resetParameters } = useModelStore() + const { parameters, setParameters, resetParameters, toggleBenchmarkMode } = useModelStore() return ( @@ -24,19 +25,37 @@ export function ParameterPanel() { + {/* Benchmark Mode Toggle */} +
+
+ +
+ +

Uses temp=0 for reproducible results

+
+
+ +
+
- {parameters.temperature.toFixed(2)} + {parameters.benchmarkMode ? '0.00 (locked)' : parameters.temperature.toFixed(2)}
setParameters({ temperature: v })} min={0} max={2} step={0.01} + disabled={parameters.benchmarkMode} + className={parameters.benchmarkMode ? 'opacity-50' : ''} />
@@ -74,15 +93,17 @@ export function ParameterPanel() {
- {parameters.frequencyPenalty.toFixed(2)} + {parameters.benchmarkMode ? '0.00 (locked)' : parameters.frequencyPenalty.toFixed(2)}
setParameters({ frequencyPenalty: v })} min={-2} max={2} step={0.01} + disabled={parameters.benchmarkMode} + className={parameters.benchmarkMode ? 'opacity-50' : ''} /> @@ -90,15 +111,17 @@ export function ParameterPanel() {
- {parameters.presencePenalty.toFixed(2)} + {parameters.benchmarkMode ? '0.00 (locked)' : parameters.presencePenalty.toFixed(2)}
setParameters({ presencePenalty: v })} min={-2} max={2} step={0.01} + disabled={parameters.benchmarkMode} + className={parameters.benchmarkMode ? 'opacity-50' : ''} />
diff --git a/src/components/code-arena/CodeArena.tsx b/src/components/code-arena/CodeArena.tsx index bef3bbb..4db25bb 100644 --- a/src/components/code-arena/CodeArena.tsx +++ b/src/components/code-arena/CodeArena.tsx @@ -2,6 +2,7 @@ import { useEffect } from 'react' import { Key, Code2 } from 'lucide-react' import { EmptyState } from '@/components/ui/empty-state' import { ResizablePanelGroup, ResizablePanel, ResizableHandle } from '@/components/ui/resizable' +import { ScrollArea } from '@/components/ui/scroll-area' import { useSettingsStore } from '@/stores/settingsStore' import { useModelStore } from '@/stores/modelStore' import { getOpenRouterClient } from '@/services/openrouter' @@ -97,15 +98,17 @@ export function CodeArena() { {/* Left panel - Configuration */} -
- -
- + +
+ +
+ +
+
+ +
-
- -
-
+ diff --git a/src/components/code-arena/CodeArenaModelPanel.tsx b/src/components/code-arena/CodeArenaModelPanel.tsx index d2b4de9..e7cd377 100644 --- a/src/components/code-arena/CodeArenaModelPanel.tsx +++ b/src/components/code-arena/CodeArenaModelPanel.tsx @@ -143,7 +143,7 @@ export function CodeArenaModelPanel({ ) : isPreviewMode ? ( ) : ( - + )} diff --git a/src/components/code-arena/CodeEditorView.tsx b/src/components/code-arena/CodeEditorView.tsx index 2a2f704..7408561 100644 --- a/src/components/code-arena/CodeEditorView.tsx +++ b/src/components/code-arena/CodeEditorView.tsx @@ -1,22 +1,34 @@ -import { useRef } from 'react' +import { useRef, useEffect, useState } from 'react' import { Copy, Check } from 'lucide-react' import { Button } from '@/components/ui/button' import { ScrollArea } from '@/components/ui/scroll-area' -import { useState } from 'react' interface CodeEditorViewProps { code: string className?: string showLineNumbers?: boolean + isStreaming?: boolean } export function CodeEditorView({ code, className = '', - showLineNumbers = true + showLineNumbers = true, + isStreaming = false }: CodeEditorViewProps) { const [copied, setCopied] = useState(false) const codeRef = useRef(null) + const scrollAreaRef = useRef(null) + + // Auto-scroll to bottom when streaming + useEffect(() => { + if (isStreaming && scrollAreaRef.current) { + const viewport = scrollAreaRef.current.querySelector('[data-radix-scroll-area-viewport]') + if (viewport) { + viewport.scrollTop = viewport.scrollHeight + } + } + }, [code, isStreaming]) const handleCopy = async () => { try { @@ -115,7 +127,7 @@ export function CodeEditorView({ )} - +
{/* Line numbers */} {showLineNumbers && ( diff --git a/src/components/results/MultiRunAnalysis.tsx b/src/components/results/MultiRunAnalysis.tsx new file mode 100644 index 0000000..911500c --- /dev/null +++ b/src/components/results/MultiRunAnalysis.tsx @@ -0,0 +1,184 @@ +import { useState, useMemo } from 'react' +import { BarChart3, TrendingUp, AlertCircle } from 'lucide-react' +import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card' +import { Button } from '@/components/ui/button' +import { Badge } from '@/components/ui/badge' +import { + Collapsible, + CollapsibleContent, + CollapsibleTrigger, +} from '@/components/ui/collapsible' +import { useRunStore, type ModelComparison } from '@/stores/runStore' +import { useTestSuiteStore } from '@/stores/testSuiteStore' +import type { RunResult } from '@/types' + +interface MultiRunAnalysisProps { + currentRun: RunResult +} + +export function MultiRunAnalysis({ currentRun }: MultiRunAnalysisProps) { + const { getRunsForTestSuite, getMultiRunStats, compareModels } = useRunStore() + const { testSuites } = useTestSuiteStore() + const [isOpen, setIsOpen] = useState(false) + + const testSuite = testSuites.find((s) => s.id === currentRun.testSuiteId) + const relatedRuns = getRunsForTestSuite(currentRun.testSuiteId) + + // Only show if there are multiple completed runs + if (relatedRuns.length < 2) { + return null + } + + const runIds = relatedRuns.map((r) => r.id) + const multiRunStats = getMultiRunStats(runIds, testSuite?.testCases) + + // Get all unique models across runs + const allModels = Array.from(multiRunStats.keys()) + + // Sort by mean score descending + const sortedModels = allModels.sort((a, b) => { + const statsA = multiRunStats.get(a) + const statsB = multiRunStats.get(b) + return (statsB?.mean || 0) - (statsA?.mean || 0) + }) + + // Calculate model comparisons for top 2 models + const topComparison: ModelComparison | null = useMemo(() => { + if (sortedModels.length < 2) return null + return compareModels(runIds, sortedModels[0], sortedModels[1], testSuite?.testCases) || null + }, [runIds, sortedModels, testSuite?.testCases, compareModels]) + + const formatScore = (score: number) => `${(score * 100).toFixed(1)}%` + const formatCI = (ci: [number, number]) => + `[${(ci[0] * 100).toFixed(1)}, ${(ci[1] * 100).toFixed(1)}]` + + return ( + + + + +
+ + + Multi-Run Analysis + + {relatedRuns.length} runs + + + +
+
+
+ + + +
+ {/* Model Statistics Table */} +
+ + + + + + + + + + + + {sortedModels.map((modelId, index) => { + const stats = multiRunStats.get(modelId) + if (!stats) return null + + return ( + + + + + + + + ) + })} + +
Model Mean Std Dev 95% CI Range
+ {index === 0 && } + {modelId.split('/').pop()} + + {formatScore(stats.mean)} + + ±{(stats.stdDev * 100).toFixed(1)}% + + {formatCI(stats.confidence95)} + + {formatScore(stats.min)} - {formatScore(stats.max)} +
+
+ + {/* Statistical Comparison */} + {topComparison && ( +
+

+ + Statistical Comparison: Top 2 Models +

+
+
+ Score Difference: + + {(topComparison.scoreDiff * 100).toFixed(2)}% + +
+
+ p-value: + + {topComparison.pValue.toFixed(4)} + +
+
+ Effect Size (Cohen's d): + + {topComparison.effectSize.toFixed(3)} + +
+
+ Significant: + + {topComparison.isSignificant ? 'Yes (p < 0.05)' : 'No'} + +
+
+ {topComparison.isSignificant && ( +

+ {sortedModels[0].split('/').pop()} significantly outperforms{' '} + {sortedModels[1].split('/').pop()} with a{' '} + {Math.abs(topComparison.effectSize) > 0.8 + ? 'large' + : Math.abs(topComparison.effectSize) > 0.5 + ? 'medium' + : 'small'}{' '} + effect size. +

+ )} +
+ )} + +

+ Based on {relatedRuns.length} completed runs of "{currentRun.testSuiteName}". + Run more benchmarks for higher statistical confidence. +

+
+
+
+
+
+ ) +} diff --git a/src/components/results/ReportSummary.tsx b/src/components/results/ReportSummary.tsx index e85272f..341808c 100644 --- a/src/components/results/ReportSummary.tsx +++ b/src/components/results/ReportSummary.tsx @@ -124,15 +124,20 @@ export function ReportSummary({ run }: ReportSummaryProps) { {completedCount}
{failedCount > 0 && ( -
+
{failedCount}
)}

- {run.models.length} models × {totalCount} tests + {run.models.length} models × {Math.round(totalCount / run.models.length)} tests

+ {run.errorSummary && ( +

+ {run.errorSummary} +

+ )} diff --git a/src/components/results/Results.tsx b/src/components/results/Results.tsx index b9dbb98..eaa6037 100644 --- a/src/components/results/Results.tsx +++ b/src/components/results/Results.tsx @@ -15,6 +15,7 @@ import { useTestSuiteStore } from '@/stores/testSuiteStore' import { useSettingsStore } from '@/stores/settingsStore' import { ComparisonGrid } from './ComparisonGrid' import { ReportSummary } from './ReportSummary' +import { MultiRunAnalysis } from './MultiRunAnalysis' export function Results() { const { runs, currentRunId, setCurrentRun, deleteRun } = useRunStore() @@ -113,6 +114,9 @@ export function Results() {
+
+ +
diff --git a/src/components/ui/scroll-area.tsx b/src/components/ui/scroll-area.tsx index b96134c..f5ceed9 100644 --- a/src/components/ui/scroll-area.tsx +++ b/src/components/ui/scroll-area.tsx @@ -11,7 +11,7 @@ const ScrollArea = React.forwardRef< className={cn("relative overflow-hidden", className)} {...props} > - + {children} diff --git a/src/scoring/code-arena-judge.ts b/src/scoring/code-arena-judge.ts index 5feed72..b310b41 100644 --- a/src/scoring/code-arena-judge.ts +++ b/src/scoring/code-arena-judge.ts @@ -82,7 +82,8 @@ export async function scoreCodeArenaOutput( } return { - score: 0.5, // Default to 50% if we can't parse + score: 0, // Default to 0 if we can't parse - don't inflate scores + confidence: 0, // Low confidence since we couldn't parse notes: 'Could not parse judge response: ' + content.slice(0, 200), } } @@ -119,6 +120,8 @@ Code: Reply with just a number from 0-100.` +const QUICK_SCORE_CODE_LIMIT = 5000 + export async function quickScoreCodeArenaOutput( prompt: string, code: string, @@ -132,10 +135,13 @@ export async function quickScoreCodeArenaOutput( } } + const wasTruncated = code.length > QUICK_SCORE_CODE_LIMIT + const truncatedCode = wasTruncated ? code.slice(0, QUICK_SCORE_CODE_LIMIT) : code + try { const judgePrompt = SIMPLE_JUDGE_PROMPT .replace('{prompt}', prompt) - .replace('{code}', code.slice(0, 5000)) // Limit code length for quick scoring + .replace('{code}', truncatedCode) const response = await client.createChatCompletion({ model: judgeModelId, @@ -154,18 +160,23 @@ export async function quickScoreCodeArenaOutput( if (!scoreMatch) { return { - score: 0.5, + score: 0, // Default to 0 if we can't parse - don't inflate scores + confidence: 0, notes: 'Could not parse quick score', } } const rawScore = Math.min(100, Math.max(0, parseInt(scoreMatch[1], 10))) + const truncationWarning = wasTruncated + ? ` (Warning: code truncated from ${code.length} to ${QUICK_SCORE_CODE_LIMIT} chars)` + : '' return { score: rawScore / 100, rawScore, maxScore: 100, - notes: 'Quick evaluation', + confidence: wasTruncated ? 
0.8 : 1, // Lower confidence when truncated + notes: `Quick evaluation${truncationWarning}`, } } catch (error) { console.error('Failed quick score:', error) diff --git a/src/scoring/exact-match.ts b/src/scoring/exact-match.ts index 17ba2e6..de2b0cc 100644 --- a/src/scoring/exact-match.ts +++ b/src/scoring/exact-match.ts @@ -23,29 +23,59 @@ export function scoreExactMatch(response: string, expected: string): ScoringResu // Case-insensitive match if (normalizedResponse.toLowerCase() === normalizedExpected.toLowerCase()) { return { - score: 0.9, + score: 0.95, // Very high but not perfect (case matters slightly) confidence: 1, notes: 'Case-insensitive match', } } + // Calculate similarity using Levenshtein distance + const similarity = calculateSimilarity(normalizedResponse, normalizedExpected) + // Check if response contains the expected (for longer responses) + // Score based on how much extra content surrounds the expected output if (normalizedResponse.includes(normalizedExpected)) { + // Penalize based on how much extra content there is + const extraContentRatio = 1 - (normalizedExpected.length / normalizedResponse.length) + // Score from 0.95 (exact length match) down to 0.6 (lots of extra content) + const containsScore = Math.max(0.6, 0.95 - (extraContentRatio * 0.35)) return { - score: 0.7, - confidence: 0.8, - notes: 'Expected output found within response', + score: containsScore, + confidence: 0.9, + notes: `Expected output found within response (${(extraContentRatio * 100).toFixed(0)}% extra content)`, } } - // Calculate similarity for partial matches - const similarity = calculateSimilarity(normalizedResponse, normalizedExpected) + // Case-insensitive contains check + if (normalizedResponse.toLowerCase().includes(normalizedExpected.toLowerCase())) { + const extraContentRatio = 1 - (normalizedExpected.length / normalizedResponse.length) + const containsScore = Math.max(0.55, 0.90 - (extraContentRatio * 0.35)) + return { + score: containsScore, + confidence: 0.85, + notes: `Expected output found (case-insensitive, ${(extraContentRatio * 100).toFixed(0)}% extra content)`, + } + } + + // Use similarity for partial matches - continuous scoring + // Similarity of 1.0 = perfect match, 0.0 = completely different + if (similarity > 0.5) { + // Scale similarity to 0-0.7 range for partial matches + // This ensures partial matches never score higher than contains matches + const score = similarity * 0.7 + return { + score, + confidence: Math.max(0.4, similarity * 0.8), + notes: `Partial similarity: ${(similarity * 100).toFixed(1)}%`, + } + } - if (similarity > 0.8) { + // Very low similarity - use raw similarity scaled down + if (similarity > 0.2) { return { - score: similarity * 0.8, - confidence: 0.6, - notes: `High similarity (${(similarity * 100).toFixed(1)}%)`, + score: similarity * 0.4, // Max 0.2 score for low similarity + confidence: 0.3, + notes: `Low similarity: ${(similarity * 100).toFixed(1)}%`, } } diff --git a/src/scoring/judge-calibration.ts b/src/scoring/judge-calibration.ts new file mode 100644 index 0000000..28b50a6 --- /dev/null +++ b/src/scoring/judge-calibration.ts @@ -0,0 +1,274 @@ +import { scoreLLMJudge } from './llm-judge' +import type { OpenRouterClient } from '@/services/openrouter' + +/** + * Reference sample for judge calibration + * Contains known-good evaluations to measure judge accuracy + */ +export interface CalibrationSample { + id: string + prompt: string + response: string + expectedOutput?: string + expectedScore: number // Known correct score (0-1) + 
tolerance: number // Acceptable deviation (e.g., 0.1 = ±10%) + category: string // e.g., 'factual', 'creative', 'code' +} + +/** + * Result of calibrating a judge against reference samples + */ +export interface CalibrationResult { + judgeModelId: string + timestamp: number + samples: CalibrationSampleResult[] + summary: { + totalSamples: number + passedSamples: number + passRate: number // Percentage within tolerance + meanAbsoluteError: number // Average |expected - actual| + maxError: number + bias: number // Average (actual - expected), positive = overscoring + correlation: number // Pearson correlation coefficient + } +} + +export interface CalibrationSampleResult { + sampleId: string + expectedScore: number + actualScore: number + error: number // actual - expected + absoluteError: number + withinTolerance: boolean + notes?: string +} + +/** + * Default calibration samples covering different response types + */ +export const DEFAULT_CALIBRATION_SAMPLES: CalibrationSample[] = [ + { + id: 'perfect-factual', + prompt: 'What is 2 + 2?', + response: '4', + expectedOutput: '4', + expectedScore: 1.0, + tolerance: 0.1, + category: 'factual', + }, + { + id: 'wrong-factual', + prompt: 'What is 2 + 2?', + response: '5', + expectedOutput: '4', + expectedScore: 0.0, + tolerance: 0.15, + category: 'factual', + }, + { + id: 'partial-factual', + prompt: 'What is the capital of France?', + response: 'Paris is a major city in France known for the Eiffel Tower.', + expectedOutput: 'Paris', + expectedScore: 0.8, + tolerance: 0.15, + category: 'factual', + }, + { + id: 'verbose-correct', + prompt: 'What is 10 * 5?', + response: 'To calculate 10 multiplied by 5, we need to add 10 five times: 10 + 10 + 10 + 10 + 10 = 50. Therefore, 10 * 5 = 50.', + expectedOutput: '50', + expectedScore: 0.9, + tolerance: 0.1, + category: 'factual', + }, + { + id: 'empty-response', + prompt: 'What is the meaning of life?', + response: '', + expectedScore: 0.0, + tolerance: 0.05, + category: 'empty', + }, + { + id: 'irrelevant-response', + prompt: 'What is the speed of light?', + response: 'I like pizza.', + expectedOutput: '299,792,458 meters per second', + expectedScore: 0.0, + tolerance: 0.1, + category: 'irrelevant', + }, + { + id: 'good-explanation', + prompt: 'Explain why the sky is blue in simple terms.', + response: 'The sky appears blue because of a phenomenon called Rayleigh scattering. When sunlight enters Earth\'s atmosphere, it collides with gas molecules. 
Blue light has a shorter wavelength, so it gets scattered more than other colors, making the sky look blue to us.', + expectedScore: 0.9, + tolerance: 0.1, + category: 'explanation', + }, + { + id: 'mediocre-explanation', + prompt: 'Explain why the sky is blue in simple terms.', + response: 'The sky is blue because of the sun and the air.', + expectedScore: 0.4, + tolerance: 0.2, + category: 'explanation', + }, +] + +/** + * Run calibration tests against a judge model + */ +export async function calibrateJudge( + client: OpenRouterClient, + judgeModelId: string, + samples: CalibrationSample[] = DEFAULT_CALIBRATION_SAMPLES, + judgeSystemPrompt?: string +): Promise { + const results: CalibrationSampleResult[] = [] + + for (const sample of samples) { + try { + const scoringResult = await scoreLLMJudge( + sample.prompt, + sample.response, + sample.expectedOutput, + client, + judgeModelId, + judgeSystemPrompt + ) + + const actualScore = scoringResult.score + const error = actualScore - sample.expectedScore + const absoluteError = Math.abs(error) + const withinTolerance = absoluteError <= sample.tolerance + + results.push({ + sampleId: sample.id, + expectedScore: sample.expectedScore, + actualScore, + error, + absoluteError, + withinTolerance, + notes: scoringResult.notes, + }) + } catch (error) { + results.push({ + sampleId: sample.id, + expectedScore: sample.expectedScore, + actualScore: 0, + error: -sample.expectedScore, + absoluteError: sample.expectedScore, + withinTolerance: false, + notes: `Error: ${error instanceof Error ? error.message : 'Unknown error'}`, + }) + } + } + + // Calculate summary statistics + const passedSamples = results.filter(r => r.withinTolerance).length + const meanAbsoluteError = results.reduce((sum, r) => sum + r.absoluteError, 0) / results.length + const maxError = Math.max(...results.map(r => r.absoluteError)) + const bias = results.reduce((sum, r) => sum + r.error, 0) / results.length + + // Calculate Pearson correlation + const correlation = calculateCorrelation( + results.map(r => r.expectedScore), + results.map(r => r.actualScore) + ) + + return { + judgeModelId, + timestamp: Date.now(), + samples: results, + summary: { + totalSamples: results.length, + passedSamples, + passRate: passedSamples / results.length, + meanAbsoluteError, + maxError, + bias, + correlation, + }, + } +} + +/** + * Calculate Pearson correlation coefficient + */ +function calculateCorrelation(x: number[], y: number[]): number { + const n = x.length + if (n === 0) return 0 + + const sumX = x.reduce((a, b) => a + b, 0) + const sumY = y.reduce((a, b) => a + b, 0) + const sumXY = x.reduce((sum, xi, i) => sum + xi * y[i], 0) + const sumX2 = x.reduce((sum, xi) => sum + xi * xi, 0) + const sumY2 = y.reduce((sum, yi) => sum + yi * yi, 0) + + const numerator = n * sumXY - sumX * sumY + const denominator = Math.sqrt((n * sumX2 - sumX * sumX) * (n * sumY2 - sumY * sumY)) + + if (denominator === 0) return 0 + return numerator / denominator +} + +/** + * Interpret calibration results + */ +export function interpretCalibrationResult(result: CalibrationResult): { + quality: 'excellent' | 'good' | 'fair' | 'poor' + recommendation: string + details: string[] +} { + const { summary } = result + const details: string[] = [] + + // Assess overall quality + let quality: 'excellent' | 'good' | 'fair' | 'poor' + + if (summary.passRate >= 0.9 && summary.meanAbsoluteError <= 0.1 && summary.correlation >= 0.9) { + quality = 'excellent' + } else if (summary.passRate >= 0.75 && summary.meanAbsoluteError <= 0.15 && 
summary.correlation >= 0.8) { + quality = 'good' + } else if (summary.passRate >= 0.5 && summary.meanAbsoluteError <= 0.25 && summary.correlation >= 0.6) { + quality = 'fair' + } else { + quality = 'poor' + } + + // Generate details + details.push(`Pass rate: ${(summary.passRate * 100).toFixed(1)}% (${summary.passedSamples}/${summary.totalSamples} within tolerance)`) + details.push(`Mean absolute error: ${(summary.meanAbsoluteError * 100).toFixed(1)}%`) + details.push(`Max error: ${(summary.maxError * 100).toFixed(1)}%`) + details.push(`Correlation: ${summary.correlation.toFixed(3)}`) + + if (summary.bias > 0.05) { + details.push(`Bias: +${(summary.bias * 100).toFixed(1)}% (tends to overscore)`) + } else if (summary.bias < -0.05) { + details.push(`Bias: ${(summary.bias * 100).toFixed(1)}% (tends to underscore)`) + } else { + details.push(`Bias: ${(summary.bias * 100).toFixed(1)}% (minimal)`) + } + + // Generate recommendation + let recommendation: string + switch (quality) { + case 'excellent': + recommendation = 'This judge model is highly reliable for scoring. Results can be trusted with minimal variance.' + break + case 'good': + recommendation = 'This judge model is reliable for most use cases. Consider running 2-3 times for LLM-judged critical tasks.' + break + case 'fair': + recommendation = 'This judge model shows moderate reliability. Run 3-5 times and average scores for better accuracy.' + break + case 'poor': + recommendation = 'This judge model is unreliable. Consider using a different judge model or deterministic scoring methods.' + break + } + + return { quality, recommendation, details } +} diff --git a/src/scoring/numeric-tolerance.ts b/src/scoring/numeric-tolerance.ts index 279c530..aae0690 100644 --- a/src/scoring/numeric-tolerance.ts +++ b/src/scoring/numeric-tolerance.ts @@ -54,27 +54,29 @@ export function scoreNumericTolerance( const diff = Math.abs(closest - expectedNumber) const relativeDiff = expectedNumber !== 0 ? diff / Math.abs(expectedNumber) : diff - // Partial credit based on how close the answer is - if (relativeDiff < 0.1) { + // Continuous scoring based on relative difference + // Score decreases smoothly from 1.0 to 0 as error increases + // At 25% error, score is 0. Uses exponential decay for smooth curve. + if (relativeDiff >= 0.25) { return { - score: 0.8, - confidence: 0.9, - notes: `Close match: ${closest} (expected ${expectedNumber}, diff: ${(relativeDiff * 100).toFixed(1)}%)`, + score: 0, + confidence: 1, + notes: `No match: closest was ${closest} (expected ${expectedNumber}, diff: ${(relativeDiff * 100).toFixed(1)}%)`, } } - if (relativeDiff < 0.25) { - return { - score: 0.5, - confidence: 0.8, - notes: `Partial match: ${closest} (expected ${expectedNumber}, diff: ${(relativeDiff * 100).toFixed(1)}%)`, - } - } + // Continuous score: 1.0 at 0% error, decays to 0 at 25% error + // Using formula: score = 1 - (relativeDiff / 0.25)^0.5 for smooth decay + // This gives: 0% error = 1.0, 1% = 0.8, 6.25% = 0.5, 25% = 0 + const score = Math.max(0, 1 - Math.pow(relativeDiff / 0.25, 0.5)) + + // Confidence decreases as the error increases + const confidence = Math.max(0.5, 1 - relativeDiff * 2) return { - score: 0, - confidence: 1, - notes: `No match: closest was ${closest} (expected ${expectedNumber})`, + score, + confidence, + notes: `${score >= 0.99 ? 'Exact' : score >= 0.8 ? 
'Close' : 'Partial'} match: ${closest} (expected ${expectedNumber}, diff: ${(relativeDiff * 100).toFixed(1)}%)`, } } diff --git a/src/services/codeArenaExecution.ts b/src/services/codeArenaExecution.ts index 1e65de1..4e10cc2 100644 --- a/src/services/codeArenaExecution.ts +++ b/src/services/codeArenaExecution.ts @@ -6,6 +6,30 @@ import { extractCodeFromStreamingContent, extractCodeFromResponse } from './code import { scoreCodeArenaOutput } from '@/scoring/code-arena-judge' import type { ChatMessage, ModelParameters, OpenRouterModel, CodeArenaOutput } from '@/types' +// Per-request timeout in milliseconds (2 minutes) +const REQUEST_TIMEOUT_MS = 120000 + +/** + * Wraps a promise with a timeout. If the timeout expires, throws an error. + */ +function withTimeout(promise: Promise, timeoutMs: number, errorMessage: string): Promise { + return new Promise((resolve, reject) => { + const timeoutId = setTimeout(() => { + reject(new Error(errorMessage)) + }, timeoutMs) + + promise + .then((result) => { + clearTimeout(timeoutId) + resolve(result) + }) + .catch((error) => { + clearTimeout(timeoutId) + reject(error) + }) + }) +} + function calculateCost( usage: StreamResult['usage'], model: OpenRouterModel | undefined @@ -87,9 +111,9 @@ export async function executeCodeArenaRun( content: prompt, }) - // Stream the response + // Stream the response with timeout let streamedContent = '' - const result = await client.createChatCompletionStreamWithUsage( + const streamPromise = client.createChatCompletionStreamWithUsage( { model: modelId, messages, @@ -115,6 +139,12 @@ export async function executeCodeArenaRun( }) } ) + + const result = await withTimeout( + streamPromise, + REQUEST_TIMEOUT_MS, + `Request timed out after ${REQUEST_TIMEOUT_MS / 1000}s` + ) const latencyMs = Date.now() - startTime const model = modelMap.get(modelId) diff --git a/src/services/execution.ts b/src/services/execution.ts index eb7fc44..958db71 100644 --- a/src/services/execution.ts +++ b/src/services/execution.ts @@ -27,7 +27,10 @@ export async function executeRun( ): Promise { const client = getOpenRouterClient(apiKey) const { addResult, updateResult, setResultScore, completeRun } = useRunStore.getState() - const { selectedModelIds, parameters, judgeModelId, availableModels } = useModelStore.getState() + const { selectedModelIds, judgeModelId, availableModels, getEffectiveParameters } = useModelStore.getState() + + // Use effective parameters (respects benchmark mode) + const parameters = getEffectiveParameters() // Create a map for quick model lookup const modelMap = new Map(availableModels.map(m => [m.id, m])) @@ -127,19 +130,46 @@ export async function executeRun( } // Execute with concurrency limit - await executeWithConcurrency(tasks, concurrencyLimit, signal) + const executionErrors = await executeWithConcurrency(tasks, concurrencyLimit, signal) - completeRun(runId) + // Complete run with error summary if any errors occurred + completeRun(runId, executionErrors.count > 0 ? { + errorCount: executionErrors.count, + errorSummary: executionErrors.summary, + } : undefined) } const MAX_EMPTY_RESPONSE_RETRIES = 2 const EMPTY_RESPONSE_BACKOFF_MS = 400 +// Per-request timeout in milliseconds (2 minutes) +const REQUEST_TIMEOUT_MS = 120000 interface ResponseWithUsage { content: string usage?: StreamResult['usage'] } +/** + * Wraps a promise with a timeout. If the timeout expires, throws an error. 
+ */ +function withTimeout(promise: Promise, timeoutMs: number, errorMessage: string): Promise { + return new Promise((resolve, reject) => { + const timeoutId = setTimeout(() => { + reject(new Error(errorMessage)) + }, timeoutMs) + + promise + .then((result) => { + clearTimeout(timeoutId) + resolve(result) + }) + .catch((error) => { + clearTimeout(timeoutId) + reject(error) + }) + }) +} + async function generateResponseWithRetries( client: ReturnType, modelId: string, @@ -158,7 +188,7 @@ async function generateResponseWithRetries( } let streamedContent = '' - const result = await client.createChatCompletionStreamWithUsage( + const streamPromise = client.createChatCompletionStreamWithUsage( { model: modelId, messages, @@ -175,6 +205,13 @@ async function generateResponseWithRetries( }) } ) + + // Apply timeout to the streaming request + const result = await withTimeout( + streamPromise, + REQUEST_TIMEOUT_MS, + `Request timed out after ${REQUEST_TIMEOUT_MS / 1000}s` + ) if (result.content.trim().length > 0) { return result @@ -243,11 +280,17 @@ function delay(ms: number): Promise { }) } +export interface ExecutionErrors { + count: number + summary: string + details: Error[] +} + async function executeWithConcurrency( tasks: Array<() => Promise>, limit: number, signal: AbortSignal -): Promise { +): Promise { const executing = new Set>() const errors: Error[] = [] @@ -279,7 +322,18 @@ async function executeWithConcurrency( await Promise.all(executing) + // Build error summary + const errorSummary = errors.length > 0 + ? `${errors.length} task(s) failed: ${[...new Set(errors.map(e => e.message))].join('; ')}` + : '' + if (errors.length > 0) { console.error('Some tasks failed:', errors) } + + return { + count: errors.length, + summary: errorSummary, + details: errors, + } } diff --git a/src/stores/modelStore.ts b/src/stores/modelStore.ts index b35aeec..65f6a46 100644 --- a/src/stores/modelStore.ts +++ b/src/stores/modelStore.ts @@ -26,6 +26,8 @@ interface ModelState { // Parameter Actions setParameters: (params: Partial) => void resetParameters: () => void + toggleBenchmarkMode: () => void + getEffectiveParameters: () => ModelParameters // Getters getSelectedModels: () => OpenRouterModel[] @@ -38,6 +40,14 @@ const defaultParameters: ModelParameters = { maxTokens: 2048, frequencyPenalty: 0, presencePenalty: 0, + benchmarkMode: false, +} + +// Benchmark mode uses temp=0 for reproducibility +const benchmarkModeParameters: Partial = { + temperature: 0, + frequencyPenalty: 0, + presencePenalty: 0, } export const useModelStore = create()( @@ -88,6 +98,46 @@ export const useModelStore = create()( const state = get() return state.availableModels.find((m) => m.id === state.judgeModelId) || null }, + + toggleBenchmarkMode: () => { + set((state) => { + const newBenchmarkMode = !state.parameters.benchmarkMode + if (newBenchmarkMode) { + // Enable benchmark mode: apply benchmark parameters + return { + parameters: { + ...state.parameters, + ...benchmarkModeParameters, + benchmarkMode: true, + }, + } + } else { + // Disable benchmark mode: restore defaults but keep maxTokens and topP + return { + parameters: { + ...state.parameters, + temperature: defaultParameters.temperature, + frequencyPenalty: defaultParameters.frequencyPenalty, + presencePenalty: defaultParameters.presencePenalty, + benchmarkMode: false, + }, + } + } + }) + }, + + getEffectiveParameters: () => { + const state = get() + if (state.parameters.benchmarkMode) { + return { + ...state.parameters, + temperature: 0, // Always 0 in benchmark mode + 
frequencyPenalty: 0,
+          presencePenalty: 0,
+        }
+      }
+      return state.parameters
+    },
   }),
   {
     name: 'benchmaker-models',
diff --git a/src/stores/runStore.ts b/src/stores/runStore.ts
index 1e37ef1..346e099 100644
--- a/src/stores/runStore.ts
+++ b/src/stores/runStore.ts
@@ -1,5 +1,23 @@
 import { create } from 'zustand'
-import type { RunResult, TestCaseResult, ExecutionStatus, ScoringResult } from '@/types'
+import type { RunResult, TestCaseResult, ExecutionStatus, ScoringResult, TestCase, AggregateScore, MultiRunStats } from '@/types'
+
+export interface ModelComparison {
+  modelA: string
+  modelB: string
+  meanA: number
+  meanB: number
+  scoreDiff: number // meanA - meanB
+  pooledStdErr: number // Standard error of the difference
+  tStatistic: number // t-statistic for significance test
+  pValue: number // Two-tailed p-value
+  isSignificant: boolean // p < 0.05
+  effectSize: number // Cohen's d
+}
+
+interface ErrorInfo {
+  errorCount: number
+  errorSummary: string
+}
 
 interface RunState {
   runs: RunResult[]
@@ -8,7 +26,7 @@ interface RunState {
   // Run Actions
   createRun: (run: Omit) => RunResult
   updateRunStatus: (runId: string, status: ExecutionStatus) => void
-  completeRun: (runId: string) => void
+  completeRun: (runId: string, errorInfo?: ErrorInfo) => void
   deleteRun: (runId: string) => void
   clearAllRuns: () => void
   setCurrentRun: (runId: string | null) => void
@@ -24,9 +42,15 @@ interface RunState {
   getRunById: (runId: string) => RunResult | null
   getResultsForTestCase: (runId: string, testCaseId: string) => TestCaseResult[]
   getResultsForModel: (runId: string, modelId: string) => TestCaseResult[]
-  getAggregateScores: (runId: string) => Map<string, number>
+  getAggregateScores: (runId: string, testCases?: TestCase[]) => Map<string, number>
+  getDetailedAggregateScores: (runId: string, testCases?: TestCase[]) => Map<string, AggregateScore>
   getAggregateCosts: (runId: string) => Map<string, number>
   getTotalCost: (runId: string) => number
+
+  // Multi-run analysis
+  getMultiRunStats: (runIds: string[], testCases?: TestCase[]) => Map<string, MultiRunStats>
+  compareModels: (runIds: string[], modelA: string, modelB: string, testCases?: TestCase[]) => ModelComparison | null
+  getRunsForTestSuite: (testSuiteId: string) => RunResult[]
 }
 
 function generateId(): string {
@@ -55,11 +79,16 @@ export const useRunStore = create<RunState>()((set, get) => ({
     }))
   },
 
-  completeRun: (runId) => {
+  completeRun: (runId, errorInfo) => {
     set((state) => ({
       runs: state.runs.map((run) =>
        run.id === runId
-          ? { ...run, status: 'completed' as ExecutionStatus, completedAt: Date.now() }
+          ? {
+              ...run,
+              status: 'completed' as ExecutionStatus,
+              completedAt: Date.now(),
+              ...(errorInfo && { errorCount: errorInfo.errorCount, errorSummary: errorInfo.errorSummary })
+            }
          : run
      ),
    }))
@@ -156,26 +185,100 @@ export const useRunStore = create<RunState>()((set, get) => ({
     return run?.results.filter((r) => r.modelId === modelId) || []
   },
 
-  getAggregateScores: (runId) => {
+  getAggregateScores: (runId, testCases) => {
     const run = get().runs.find((r) => r.id === runId)
     const scores = new Map<string, number>()
 
     if (!run) return scores
 
-    const modelScores = new Map<string, { total: number; count: number }>()
+    // Build weight map from test cases
+    const weightMap = new Map<string, number>()
+    if (testCases) {
+      for (const tc of testCases) {
+        weightMap.set(tc.id, tc.weight || 1)
+      }
+    }
+
+    const modelScores = new Map<string, { weightedTotal: number; totalWeight: number }>()
 
     for (const result of run.results) {
       if (result.score) {
-        const existing = modelScores.get(result.modelId) || { total: 0, count: 0 }
+        const weight = weightMap.get(result.testCaseId) || 1
+        const existing = modelScores.get(result.modelId) || { weightedTotal: 0, totalWeight: 0 }
         modelScores.set(result.modelId, {
-          total: existing.total + result.score.score,
-          count: existing.count + 1,
+          weightedTotal: existing.weightedTotal + (result.score.score * weight),
+          totalWeight: existing.totalWeight + weight,
         })
       }
     }
 
-    for (const [modelId, { total, count }] of modelScores) {
-      scores.set(modelId, count > 0 ? total / count : 0)
+    for (const [modelId, { weightedTotal, totalWeight }] of modelScores) {
+      scores.set(modelId, totalWeight > 0 ? weightedTotal / totalWeight : 0)
+    }
+
+    return scores
+  },
+
+  getDetailedAggregateScores: (runId, testCases) => {
+    const run = get().runs.find((r) => r.id === runId)
+    const scores = new Map<string, AggregateScore>()
+
+    if (!run) return scores
+
+    // Build weight map from test cases
+    const weightMap = new Map<string, number>()
+    if (testCases) {
+      for (const tc of testCases) {
+        weightMap.set(tc.id, tc.weight || 1)
+      }
+    }
+
+    // Group results by model with weights
+    const modelResults = new Map<string, Array<{ score: number; weight: number }>>()
+
+    for (const result of run.results) {
+      if (result.score) {
+        const weight = weightMap.get(result.testCaseId) || 1
+        const existing = modelResults.get(result.modelId) || []
+        existing.push({ score: result.score.score, weight })
+        modelResults.set(result.modelId, existing)
+      }
+    }
+
+    // Calculate detailed statistics for each model
+    for (const [modelId, results] of modelResults) {
+      if (results.length === 0) continue
+
+      const totalWeight = results.reduce((sum, r) => sum + r.weight, 0)
+      const weightedMean = results.reduce((sum, r) => sum + r.score * r.weight, 0) / totalWeight
+
+      // Calculate weighted standard deviation
+      const weightedVariance = results.reduce((sum, r) => {
+        return sum + r.weight * Math.pow(r.score - weightedMean, 2)
+      }, 0) / totalWeight
+      const stdDev = Math.sqrt(weightedVariance)
+
+      const scoreValues = results.map(r => r.score)
+      const min = Math.min(...scoreValues)
+      const max = Math.max(...scoreValues)
+
+      // 95% confidence interval (assuming normal distribution)
+      // CI = mean ± (1.96 * stdDev / sqrt(n))
+      const marginOfError = results.length > 1 ? (1.96 * stdDev) / Math.sqrt(results.length) : 0
+      const confidence95: [number, number] = [
+        Math.max(0, weightedMean - marginOfError),
+        Math.min(1, weightedMean + marginOfError)
+      ]
+
+      scores.set(modelId, {
+        mean: weightedMean,
+        stdDev,
+        min,
+        max,
+        count: results.length,
+        totalWeight,
+        confidence95,
+      })
     }
 
     return scores
@@ -205,4 +308,139 @@ export const useRunStore = create<RunState>()((set, get) => ({
       return total + (result.cost || 0)
     }, 0)
   },
+
+  getRunsForTestSuite: (testSuiteId) => {
+    return get().runs.filter((r) => r.testSuiteId === testSuiteId && r.status === 'completed')
+  },
+
+  getMultiRunStats: (runIds, testCases) => {
+    const stats = new Map<string, MultiRunStats>()
+    const state = get()
+
+    // Get all runs
+    const runs = runIds
+      .map((id) => state.runs.find((r) => r.id === id))
+      .filter((r): r is RunResult => r !== undefined && r.status === 'completed')
+
+    if (runs.length === 0) return stats
+
+    // Collect all models across runs
+    const allModels = new Set<string>()
+    for (const run of runs) {
+      for (const modelId of run.models) {
+        allModels.add(modelId)
+      }
+    }
+
+    // Calculate stats for each model
+    for (const modelId of allModels) {
+      const scores: number[] = []
+      const validRunIds: string[] = []
+
+      for (const run of runs) {
+        // Get aggregate score for this model in this run
+        const aggregateScores = state.getAggregateScores(run.id, testCases)
+        const score = aggregateScores.get(modelId)
+        if (score !== undefined) {
+          scores.push(score)
+          validRunIds.push(run.id)
+        }
+      }
+
+      if (scores.length === 0) continue
+
+      const mean = scores.reduce((a, b) => a + b, 0) / scores.length
+      const variance = scores.reduce((sum, s) => sum + Math.pow(s - mean, 2), 0) / scores.length
+      const stdDev = Math.sqrt(variance)
+      const min = Math.min(...scores)
+      const max = Math.max(...scores)
+
+      // 95% CI = mean ± (1.96 * stdDev / sqrt(n))
+      const marginOfError = scores.length > 1 ? (1.96 * stdDev) / Math.sqrt(scores.length) : 0
+      const confidence95: [number, number] = [
+        Math.max(0, mean - marginOfError),
+        Math.min(1, mean + marginOfError)
+      ]
+
+      stats.set(modelId, {
+        runIds: validRunIds,
+        modelId,
+        scores,
+        mean,
+        stdDev,
+        min,
+        max,
+        confidence95,
+      })
    }
+
+    return stats
+  },
+
+  compareModels: (runIds, modelA, modelB, testCases) => {
+    const state = get()
+    const multiRunStats = state.getMultiRunStats(runIds, testCases)
+
+    const statsA = multiRunStats.get(modelA)
+    const statsB = multiRunStats.get(modelB)
+
+    if (!statsA || !statsB) return null
+    if (statsA.scores.length < 2 || statsB.scores.length < 2) return null
+
+    const nA = statsA.scores.length
+    const nB = statsB.scores.length
+    const meanA = statsA.mean
+    const meanB = statsB.mean
+
+    // Calculate pooled standard error
+    const varA = statsA.scores.reduce((sum, s) => sum + Math.pow(s - meanA, 2), 0) / (nA - 1)
+    const varB = statsB.scores.reduce((sum, s) => sum + Math.pow(s - meanB, 2), 0) / (nB - 1)
+
+    // Pooled variance for two-sample t-test
+    const pooledVar = ((nA - 1) * varA + (nB - 1) * varB) / (nA + nB - 2)
+    const pooledStdErr = Math.sqrt(pooledVar * (1/nA + 1/nB))
+
+    // t-statistic
+    const tStatistic = pooledStdErr > 0 ? (meanA - meanB) / pooledStdErr : 0
+
+    // Approximate p-value using normal distribution (good for n > 30, acceptable for smaller)
+    // For a more accurate p-value, we'd need a t-distribution table or library
+    const pValue = 2 * (1 - normalCDF(Math.abs(tStatistic)))
+
+    // Cohen's d effect size
+    const pooledStdDev = Math.sqrt(pooledVar)
+    const effectSize = pooledStdDev > 0 ? (meanA - meanB) / pooledStdDev : 0
+
+    return {
+      modelA,
+      modelB,
+      meanA,
+      meanB,
+      scoreDiff: meanA - meanB,
+      pooledStdErr,
+      tStatistic,
+      pValue,
+      isSignificant: pValue < 0.05,
+      effectSize,
+    }
+  },
 }))
+
+// Helper function for normal CDF approximation
+function normalCDF(x: number): number {
+  // Approximation of the cumulative distribution function for standard normal
+  const a1 = 0.254829592
+  const a2 = -0.284496736
+  const a3 = 1.421413741
+  const a4 = -1.453152027
+  const a5 = 1.061405429
+  const p = 0.3275911
+
+  const sign = x < 0 ? -1 : 1
+  x = Math.abs(x) / Math.sqrt(2)
+
+  const t = 1.0 / (1.0 + p * x)
+  const y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * Math.exp(-x * x)
+
+  return 0.5 * (1.0 + sign * y)
+}
diff --git a/src/types/index.ts b/src/types/index.ts
index 2beb5f6..75873dc 100644
--- a/src/types/index.ts
+++ b/src/types/index.ts
@@ -51,6 +51,29 @@ export interface ScoringConfig {
   rubric?: string
 }
 
+// Aggregate scoring with statistics
+export interface AggregateScore {
+  mean: number // Weighted mean score (0-1)
+  stdDev: number // Standard deviation
+  min: number // Minimum score
+  max: number // Maximum score
+  count: number // Number of scored results
+  totalWeight: number // Sum of weights used
+  confidence95?: [number, number] // 95% confidence interval
+}
+
+// Multi-run statistics
+export interface MultiRunStats {
+  runIds: string[]
+  modelId: string
+  scores: number[] // Individual run scores
+  mean: number
+  stdDev: number
+  min: number
+  max: number
+  confidence95: [number, number]
+}
+
 // Model Types
 export interface OpenRouterModel {
   id: string
@@ -73,6 +96,7 @@
   maxTokens: number
   frequencyPenalty: number
   presencePenalty: number
+  benchmarkMode?: boolean // When true, uses temp=0 for reproducibility
 }
 
 // Execution Types
@@ -104,6 +128,9 @@ export interface RunResult {
   startedAt: number
   completedAt?: number
   judgeModel?: string
+  // Error tracking for surfacing in UI
+  errorCount?: number
+  errorSummary?: string
 }
 
 // Code Arena Types
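
For reviewers, a minimal usage sketch of the multi-run analysis API added above. The suite ID and model IDs are hypothetical placeholders; the accessors and return shapes are taken from the runStore diff. This snippet is illustrative only and is not part of the patch.

```typescript
// Illustrative sketch — hypothetical IDs; API shapes follow the runStore changes above.
import { useRunStore } from '@/stores/runStore'

const store = useRunStore.getState()

// Completed runs for one test suite (suite ID is a placeholder).
const runIds = store.getRunsForTestSuite('suite-123').map((run) => run.id)

// Per-model mean and 95% CI across those runs (internal scores are 0-1).
const stats = store.getMultiRunStats(runIds)
for (const [modelId, s] of stats) {
  console.log(modelId, (s.mean * 100).toFixed(1), s.confidence95)
}

// Pairwise comparison; returns null unless both models have at least two run-level scores.
const cmp = store.compareModels(runIds, 'model-a', 'model-b')
if (cmp) {
  console.log(cmp.scoreDiff, cmp.pValue.toFixed(3), cmp.isSignificant, cmp.effectSize)
}
```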