From 801bdfa3a19aba355292393535bf19e9d7b28719 Mon Sep 17 00:00:00 2001 From: oshtz Date: Wed, 31 Dec 2025 13:09:10 +0200 Subject: [PATCH] feat: Enhance scoring and analysis features - Added AggregateScore and MultiRunStats interfaces to improve statistical tracking of scores. - Introduced a comprehensive Testing Methodology document detailing scoring methods, benchmark modes, and statistical analysis. - Implemented MultiRunAnalysis component for visual representation of multi-run statistics and model comparisons. - Created judge calibration functionality to assess and improve LLM judge accuracy with reference samples. - Updated RunResult interface to include error tracking for better UI feedback. --- README.md | 2 +- TESTING_METHODOLOGY.md | 499 ++++++++++++++++++ package-lock.json | 4 +- package.json | 2 +- src-tauri/Cargo.lock | 2 +- src-tauri/Cargo.toml | 2 +- src-tauri/tauri.conf.json | 2 +- src/components/arena/ExecutionControls.tsx | 122 +++-- src/components/arena/ParameterPanel.tsx | 39 +- src/components/code-arena/CodeArena.tsx | 19 +- .../code-arena/CodeArenaModelPanel.tsx | 2 +- src/components/code-arena/CodeEditorView.tsx | 20 +- src/components/results/MultiRunAnalysis.tsx | 184 +++++++ src/components/results/ReportSummary.tsx | 9 +- src/components/results/Results.tsx | 4 + src/components/ui/scroll-area.tsx | 2 +- src/scoring/code-arena-judge.ts | 19 +- src/scoring/exact-match.ts | 50 +- src/scoring/judge-calibration.ts | 274 ++++++++++ src/scoring/numeric-tolerance.ts | 32 +- src/services/codeArenaExecution.ts | 34 +- src/services/execution.ts | 64 ++- src/stores/modelStore.ts | 50 ++ src/stores/runStore.ts | 262 ++++++++- src/types/index.ts | 27 + 25 files changed, 1616 insertions(+), 110 deletions(-) create mode 100644 TESTING_METHODOLOGY.md create mode 100644 src/components/results/MultiRunAnalysis.tsx create mode 100644 src/scoring/judge-calibration.ts diff --git a/README.md b/README.md index d1ae6ea..c7ecd88 100644 --- a/README.md +++ b/README.md @@ -186,7 +186,7 @@ Enter your OpenRouter API key in the Settings (gear icon in header). ### Updates - The app checks for updates on startup -- Click the version button in the header (e.g. `v0.1.1`) to view update status, release notes, or manually re-check +- Click the version button in the header (e.g. `v0.1.2`) to view update status, release notes, or manually re-check - Updates are pulled from GitHub Releases and expect a `Benchmaker-Portable.exe` asset on the latest tag ## Development diff --git a/TESTING_METHODOLOGY.md b/TESTING_METHODOLOGY.md new file mode 100644 index 0000000..8c14b1a --- /dev/null +++ b/TESTING_METHODOLOGY.md @@ -0,0 +1,499 @@ +# Testing Methodology + +Internal reference for Benchmaker's evaluation system and how to achieve accurate, reproducible 0-100 scoring. + +--- + +## Quick Reference + +| Scoring Method | Scale | Output Range | Reproducibility | Best For | +|----------------|-------|--------------|-----------------|----------| +| Exact Match | 0-1 → 0-100 | 0-100 continuous | Deterministic | Short, precise answers | +| Regex Match | 0-1 → 0-100 | 0 or 100 | Deterministic | Pattern validation | +| Numeric Tolerance | 0-1 → 0-100 | 0-100 continuous | Deterministic | Math/calculations | +| Boolean | 0-1 → 0-100 | 0 or 100 | Deterministic | Contains check | +| LLM Judge | 0-10 → 0-100 | 0-100 continuous | ±5-10% variance | Complex/subjective | +| Code Arena Judge | 0-100 native | 0-100 continuous | ±5-15% variance | Frontend code | + +--- + +## 1. 
Overview + +Benchmaker evaluates LLM responses to produce a **0-100 rating per task per model**. The system supports: + +- **Two benchmark modes**: Standard Arena and Code Arena +- **Five scoring methods**: From deterministic (exact match) to AI-powered (LLM judge) +- **Weighted test cases**: Prioritize important tests in aggregate scores +- **Statistical analysis**: Multi-run variance, confidence intervals, model comparison +- **Benchmark mode**: One-click setup for reproducible results (temp=0) + +### Core Principles + +1. All scores are **normalized to 0-1 internally**, displayed as **0-100** +2. Each test case can use a different scoring method +3. Test case weights affect aggregate scores +4. Full execution context is persisted for reproducibility + +--- + +## 2. Benchmark Modes + +### Standard Arena + +- **Purpose**: Evaluate models across multiple test cases in a test suite +- **Flow**: Test Suite → Test Cases → Models → Scored Results +- **Scoring**: Any of the 5 scoring methods per test case +- **Aggregation**: Weighted average per model + +### Code Arena + +- **Purpose**: Compare frontend code generation with live preview +- **Flow**: Prompt → Models → Code Extraction → LLM Judge +- **Scoring**: Code Arena Judge (0-100 with weighted criteria) +- **Output**: Side-by-side comparison with rendered previews + +--- + +## 3. Score Calculation + +### Internal Representation + +```typescript +interface ScoringResult { + score: number // 0-1 normalized (primary score) + confidence?: number // 0-1, scorer confidence + notes?: string // Human-readable explanation + rawScore?: number // Original scale (e.g., 0-10) + maxScore?: number // Maximum on original scale +} +``` + +### Display Conversion + +``` +Display Score = score × 100 +``` + +Example: `score: 0.85` → **85/100** + +### Aggregate Scoring + +Three functions for different levels of detail: + +**Simple aggregate** - weighted average per model: +```typescript +getAggregateScores(runId, testCases?) +// Returns Map +``` + +**Detailed aggregate** - full statistics: +```typescript +getDetailedAggregateScores(runId, testCases?) +// Returns Map + +interface AggregateScore { + mean: number // Weighted mean (0-1) + stdDev: number // Standard deviation + min: number + max: number + count: number // Number of scored results + totalWeight: number // Sum of weights + confidence95: [number, number] // 95% CI +} +``` + +**Multi-run statistics** - variance across runs: +```typescript +getMultiRunStats(runIds, testCases?) +// Returns Map + +interface MultiRunStats { + runIds: string[] + modelId: string + scores: number[] // Score from each run + mean: number + stdDev: number + min: number + max: number + confidence95: [number, number] +} +``` + +### Test Case Weights + +Each test case has a `weight` field (default: 1). Higher weight = more impact on aggregate score. + +```typescript +interface TestCase { + weight: number // Default 1, increase for important tests + // ... +} + +// Calculation: +weightedAverage = Σ(score × weight) / Σ(weight) +``` + +--- + +## 4. 
Scoring Methods + +### 4.1 Exact Match + +**Location**: `src/scoring/exact-match.ts` + +| Condition | Score | Display | +|-----------|-------|---------| +| Exact string match | 1.0 | 100 | +| Case-insensitive match | 0.95 | 95 | +| Expected within response | 0.60-0.95 | 60-95 (varies by extra content) | +| Case-insensitive contains | 0.55-0.90 | 55-90 (varies by extra content) | +| High similarity (>50%) | similarity × 0.7 | 35-70 (continuous) | +| Low similarity (20-50%) | similarity × 0.4 | 8-20 (continuous) | +| No match (<20% similar) | 0 | 0 | + +Uses Levenshtein distance for continuous partial matching. Scores adjust based on extra content ratio when expected output is found within a longer response. Best for short, precise answers. + +### 4.2 Regex Match + +**Location**: `src/scoring/regex-match.ts` + +| Condition | Score | Display | +|-----------|-------|---------| +| Pattern matches | 1.0 | 100 | +| No match | 0 | 0 | + +Supports `/pattern/flags` or plain patterns. Best for format validation. + +### 4.3 Numeric Tolerance + +**Location**: `src/scoring/numeric-tolerance.ts` + +| Condition | Score | Display | +|-----------|-------|---------| +| Within 1% tolerance | 1.0 | 100 | +| Within 25% | Continuous decay | 0-100 (smooth curve) | +| Outside 25% | 0 | 0 | + +Uses continuous scoring with smooth exponential decay: `score = 1 - (error/0.25)^0.5`. This provides scores like: 0% error = 100, 1% = 80, 6.25% = 50, 25% = 0. + +Extracts all numbers from response (including scientific notation). Best for math problems. + +### 4.4 Boolean Match + +**Location**: `src/scoring/index.ts` + +| Condition | Score | Display | +|-----------|-------|---------| +| Expected substring found | 1.0 | 100 | +| Not found | 0 | 0 | +| No expected output | 1.0 | 100 (auto-pass) | + +Case-insensitive substring check. Best for simple contains/doesn't contain. + +### 4.5 LLM Judge + +**Location**: `src/scoring/llm-judge.ts` +**Temperature**: 0.1 + +| Score | Meaning | +|-------|---------| +| 10 | Perfect, fully correct and complete | +| 8-9 | Excellent with minor issues | +| 6-7 | Good but missing elements | +| 4-5 | Partially correct, significant issues | +| 2-3 | Mostly incorrect | +| 0-1 | Completely wrong | + +Conversion: `displayScore = (rawScore / 10) × 100` + +Best for complex, subjective, or open-ended tasks. + +### 4.6 Code Arena Judge + +**Location**: `src/scoring/code-arena-judge.ts` +**Temperature**: 0.3 + +| Criterion | Weight | +|-----------|--------| +| Visual Accuracy | 40% | +| Code Quality | 30% | +| Functionality | 20% | +| Responsiveness | 10% | + +Scores directly on 0-100 scale. Best for frontend code generation. + +--- + +## 5. 
Reproducibility + +### Benchmark Mode + +Toggle in the Parameter Panel to enable reproducible benchmarking: + +- Temperature locked to **0** +- Frequency penalty locked to **0** +- Presence penalty locked to **0** + +```typescript +// Programmatically +modelStore.toggleBenchmarkMode() + +// Get effective parameters (respects benchmark mode) +const params = modelStore.getEffectiveParameters() +``` + +### Default Parameters + +```typescript +{ + temperature: 0.7, + topP: 1, + maxTokens: 2048, + frequencyPenalty: 0, + presencePenalty: 0, + benchmarkMode: false, +} +``` + +### Persisted Per Run + +| Variable | Storage | +|----------|---------| +| Model IDs | `RunResult.models` | +| Parameters | `RunResult.parameters` | +| System Prompt | `TestSuite.systemPrompt` | +| Judge Prompt | `TestSuite.judgeSystemPrompt` | +| Judge Model | `RunResult.judgeModel` | +| Timestamps | `RunResult.startedAt/completedAt` | + +### Sources of Variance + +| Source | Impact | Mitigation | +|--------|--------|------------| +| Model temperature > 0 | High | Enable Benchmark Mode | +| LLM judge | Medium (±5-10%) | Run multiple times | +| API-side sampling | Low | Cannot control | +| Code Arena judge | Medium (±5-15%) | Run multiple times | + +--- + +## 6. Statistical Comparison + +Compare two models across multiple runs: + +```typescript +compareModels(runIds, modelA, modelB, testCases?) +// Returns ModelComparison | null + +interface ModelComparison { + modelA: string + modelB: string + meanA: number + meanB: number + scoreDiff: number // meanA - meanB + pooledStdErr: number + tStatistic: number + pValue: number // Two-tailed + isSignificant: boolean // p < 0.05 + effectSize: number // Cohen's d +} +``` + +### Interpretation + +- **pValue < 0.05**: Statistically significant difference +- **Effect size (Cohen's d)**: + - |d| < 0.2: Negligible + - |d| 0.2-0.5: Small + - |d| 0.5-0.8: Medium + - |d| > 0.8: Large + +Requires at least 2 runs per model. + +--- + +## 7. Judge Calibration + +Test LLM judges against known reference samples: + +```typescript +import { calibrateJudge, interpretCalibrationResult } from '@/scoring/judge-calibration' + +const result = await calibrateJudge(client, judgeModelId) +const interpretation = interpretCalibrationResult(result) +``` + +### Calibration Result + +```typescript +interface CalibrationResult { + judgeModelId: string + timestamp: number + samples: CalibrationSampleResult[] + summary: { + totalSamples: number + passedSamples: number + passRate: number // % within tolerance + meanAbsoluteError: number + maxError: number + bias: number // Positive = overscoring + correlation: number // Pearson correlation + } +} +``` + +### Quality Ratings + +| Rating | Pass Rate | MAE | Correlation | +|--------|-----------|-----|-------------| +| Excellent | ≥90% | ≤10% | ≥0.9 | +| Good | ≥75% | ≤15% | ≥0.8 | +| Fair | ≥50% | ≤25% | ≥0.6 | +| Poor | <50% | >25% | <0.6 | + +Default calibration includes 8 reference samples covering factual, empty, irrelevant, and explanation-type responses. + +--- + +## 8. Execution Pipeline + +### Standard Arena + +``` +1. Create result entries for all (test case × model) combinations +2. Execute in parallel (concurrency limit: 5) +3. For each task: + a. Build messages (system prompt + user prompt) + b. Stream response with retry logic + c. Record latency, tokens, cost + d. Score using configured method +4. 
Aggregate scores per model +``` + +### Retry Logic + +- Max retries: 2 for empty responses +- Backoff: 400ms × (attempt + 1) +- Fallback: Non-streaming request if streaming fails +- **Request timeout**: 2 minutes per request (prevents hung API calls from blocking execution) + +### Cost Calculation + +``` +Cost = (prompt_tokens × prompt_price) + (completion_tokens × completion_price) +``` + +--- + +## 9. Best Practices + +### For Accurate Results + +1. **Enable Benchmark Mode** for reproducible model responses +2. **Use deterministic scoring** (exact, regex, numeric, boolean) when possible +3. **Set test case weights** for important tests +4. **Run 3-5 times** for LLM-judged tests +5. **Calibrate your judge** before relying on LLM judge scores +6. **Use statistical comparison** to verify differences are significant + +### Multi-Run Protocol + +**Using the UI:** +1. In the Arena tab, click the dropdown arrow next to "Run Benchmark" +2. Select "Run 3 times", "Run 5 times", or "Run 10 times" +3. The system will execute the benchmark sequentially, showing progress +4. In the Results tab, the "Multi-Run Analysis" panel will automatically appear +5. View mean scores, standard deviations, confidence intervals, and statistical comparisons + +**Programmatic API:** +```typescript +// Run benchmark 3-5 times, collect run IDs +const runIds = [run1.id, run2.id, run3.id] + +// Get multi-run statistics +const stats = runStore.getMultiRunStats(runIds, testSuite.testCases) + +// Report results +const modelStats = stats.get('gpt-4') +console.log(`gpt-4: ${(modelStats.mean * 100).toFixed(1)} ± ${(modelStats.stdDev * 100).toFixed(1)}`) +// Output: "gpt-4: 85.2 ± 3.1" +``` + +### Score Reporting Format + +``` +Model X: 85.2 ± 3.1 (95% CI: [82.1, 88.3]) + ↑ ↑ ↑ + mean stdDev confidence interval +``` + +### Confidence Guidelines + +| Scoring Method | Confidence | Runs Needed | +|----------------|------------|-------------| +| Exact Match | High | 1 | +| Regex Match | High | 1 | +| Numeric Tolerance | High | 1 | +| Boolean | High | 1 | +| LLM Judge | Medium | 3-5 | +| Code Arena Judge | Medium | 3-5 | + +--- + +## 10. Limitations + +### Inherent Constraints + +1. **LLM Judge Variance**: Even at temp=0.1, ~±5-10% variance exists +2. **API Non-Determinism**: Some providers vary even at temp=0 +3. 
**Code Truncation in Quick Judge**: Codes >5000 chars are truncated (warning shown in results) + +### Future Improvements + +- Inter-rater reliability (multiple judges) +- Elo-style rankings +- Export/reporting + +--- + +## Code Reference + +| Component | Location | +|-----------|----------| +| **Scoring** | +| Exact match | `src/scoring/exact-match.ts` | +| Regex match | `src/scoring/regex-match.ts` | +| Numeric tolerance | `src/scoring/numeric-tolerance.ts` | +| Boolean match | `src/scoring/index.ts` | +| LLM judge | `src/scoring/llm-judge.ts` | +| Code arena judge | `src/scoring/code-arena-judge.ts` | +| Judge calibration | `src/scoring/judge-calibration.ts` | +| **State** | +| Aggregate scores | `src/stores/runStore.ts` → `getAggregateScores()` | +| Detailed stats | `src/stores/runStore.ts` → `getDetailedAggregateScores()` | +| Multi-run stats | `src/stores/runStore.ts` → `getMultiRunStats()` | +| Model comparison | `src/stores/runStore.ts` → `compareModels()` | +| Benchmark mode | `src/stores/modelStore.ts` → `toggleBenchmarkMode()` | +| **Execution** | +| Standard arena | `src/services/execution.ts` | +| Code arena | `src/services/codeArenaExecution.ts` | +| **UI** | +| Parameter panel | `src/components/arena/ParameterPanel.tsx` | + +--- + +## Summary + +Benchmaker produces accurate 0-100 scores per task per model with: + +| Capability | Usage | +|------------|-------| +| Deterministic scoring | Use exact/regex/numeric/boolean | +| Weighted aggregation | Pass `testCases` to aggregate functions | +| Benchmark mode | Toggle in Parameter Panel | +| Multi-run statistics | `getMultiRunStats(runIds, testCases)` | +| Confidence intervals | `getDetailedAggregateScores(runId, testCases)` | +| Statistical comparison | `compareModels(runIds, modelA, modelB)` | +| Judge calibration | `calibrateJudge(client, judgeModelId)` | + +**For reliable results**: Enable Benchmark Mode, use deterministic scoring where possible, run LLM-judged tests 3-5 times, and verify differences with `compareModels()`. 
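## Appendix: Worked Statistical Comparison

The two-sample comparison in section 6 can be hard to picture from the store interface alone. The sketch below is a minimal, standalone TypeScript example of the same arithmetic used by `compareModels()` — pooled variance, t-statistic, a normal-approximation two-tailed p-value, and Cohen's d — applied to two arrays of per-run aggregate scores (0-1). The function name `compareScoreSets` and the inline sample scores are illustrative only, not part of the codebase, and at least two runs per model are assumed.

```typescript
// Abramowitz–Stegun approximation of the standard normal CDF,
// mirroring the helper in src/stores/runStore.ts.
function normalCDF(x: number): number {
  const a1 = 0.254829592, a2 = -0.284496736, a3 = 1.421413741
  const a4 = -1.453152027, a5 = 1.061405429, p = 0.3275911
  const sign = x < 0 ? -1 : 1
  const z = Math.abs(x) / Math.sqrt(2)
  const t = 1 / (1 + p * z)
  const y = 1 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * Math.exp(-z * z)
  return 0.5 * (1 + sign * y)
}

// Illustrative sketch: pooled two-sample t-test plus Cohen's d on per-run
// aggregate scores for two models. Requires at least 2 scores per model.
function compareScoreSets(scoresA: number[], scoresB: number[]) {
  const nA = scoresA.length
  const nB = scoresB.length
  const meanA = scoresA.reduce((a, b) => a + b, 0) / nA
  const meanB = scoresB.reduce((a, b) => a + b, 0) / nB

  // Sample variances (n - 1 denominator)
  const varA = scoresA.reduce((s, x) => s + (x - meanA) ** 2, 0) / (nA - 1)
  const varB = scoresB.reduce((s, x) => s + (x - meanB) ** 2, 0) / (nB - 1)

  // Pooled variance and standard error of the mean difference
  const pooledVar = ((nA - 1) * varA + (nB - 1) * varB) / (nA + nB - 2)
  const pooledStdErr = Math.sqrt(pooledVar * (1 / nA + 1 / nB))

  // t-statistic; p-value approximated with the normal CDF (two-tailed)
  const tStatistic = pooledStdErr > 0 ? (meanA - meanB) / pooledStdErr : 0
  const pValue = 2 * (1 - normalCDF(Math.abs(tStatistic)))

  // Cohen's d effect size
  const pooledStdDev = Math.sqrt(pooledVar)
  const effectSize = pooledStdDev > 0 ? (meanA - meanB) / pooledStdDev : 0

  return {
    meanA,
    meanB,
    scoreDiff: meanA - meanB,
    tStatistic,
    pValue,
    isSignificant: pValue < 0.05,
    effectSize,
  }
}

// Example with hypothetical aggregate scores from 3 runs per model:
// a ~7-point gap with low variance comes out significant with a large effect size.
console.log(compareScoreSets([0.85, 0.88, 0.83], [0.78, 0.80, 0.76]))
```

In practice `compareModels(runIds, modelA, modelB, testCases)` does the same computation, but pulls the per-run scores from `getMultiRunStats()` instead of taking raw arrays; note the p-value uses a normal approximation, so treat significance from very small run counts as indicative rather than exact.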
diff --git a/package-lock.json b/package-lock.json index e8864e3..352f079 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "benchmaker", - "version": "0.1.1", + "version": "0.1.2", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "benchmaker", - "version": "0.1.1", + "version": "0.1.2", "dependencies": { "@monaco-editor/react": "^4.7.0", "@radix-ui/react-alert-dialog": "^1.1.15", diff --git a/package.json b/package.json index 4a41a16..98d3da0 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "benchmaker", - "version": "0.1.1", + "version": "0.1.2", "type": "module", "scripts": { "dev": "vite", diff --git a/src-tauri/Cargo.lock b/src-tauri/Cargo.lock index 454abf6..bc097e0 100644 --- a/src-tauri/Cargo.lock +++ b/src-tauri/Cargo.lock @@ -109,7 +109,7 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "benchmaker" -version = "0.1.1" +version = "0.1.2" dependencies = [ "rusqlite", "serde", diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index 7a7aa31..7e3eb0b 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmaker" -version = "0.1.1" +version = "0.1.2" description = "Benchmaker" authors = ["you"] edition = "2021" diff --git a/src-tauri/tauri.conf.json b/src-tauri/tauri.conf.json index 9b7b292..fa12ce5 100644 --- a/src-tauri/tauri.conf.json +++ b/src-tauri/tauri.conf.json @@ -7,7 +7,7 @@ }, "package": { "productName": "Benchmaker", - "version": "0.1.1" + "version": "0.1.2" }, "tauri": { "allowlist": { diff --git a/src/components/arena/ExecutionControls.tsx b/src/components/arena/ExecutionControls.tsx index 16010f1..9bf601a 100644 --- a/src/components/arena/ExecutionControls.tsx +++ b/src/components/arena/ExecutionControls.tsx @@ -1,6 +1,12 @@ import { useState } from 'react' -import { Play, Square, Loader2 } from 'lucide-react' +import { Play, Square, Repeat, ChevronDown } from 'lucide-react' import { Button } from '@/components/ui/button' +import { + DropdownMenu, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuTrigger, +} from '@/components/ui/dropdown-menu' import { useToast } from '@/components/ui/use-toast' import { useSettingsStore } from '@/stores/settingsStore' import { useModelStore } from '@/stores/modelStore' @@ -15,15 +21,33 @@ interface ExecutionControlsProps { export function ExecutionControls({ testSuite }: ExecutionControlsProps) { const { apiKey } = useSettingsStore() const { selectedModelIds, parameters, judgeModelId } = useModelStore() - const { createRun, updateRunStatus } = useRunStore() + const { createRun } = useRunStore() const { toast } = useToast() const [isRunning, setIsRunning] = useState(false) const [abortController, setAbortController] = useState(null) + const [currentRunIndex, setCurrentRunIndex] = useState(0) + const [totalRuns, setTotalRuns] = useState(1) const canRun = selectedModelIds.length > 0 && testSuite.testCases.length > 0 - const handleRun = async () => { + const executeSingleRun = async (controller: AbortController): Promise => { + const run = createRun({ + testSuiteId: testSuite.id, + testSuiteName: testSuite.name, + models: selectedModelIds, + parameters, + results: [], + status: 'running', + startedAt: Date.now(), + judgeModel: judgeModelId || undefined, + }) + + await executeRun(run.id, testSuite, apiKey!, controller.signal) + return run.id + } + + const handleRun = async (numRuns: number = 1) => { if (!canRun || !apiKey) { toast({ title: 'Cannot start run', 
@@ -36,33 +60,44 @@ export function ExecutionControls({ testSuite }: ExecutionControlsProps) { const controller = new AbortController() setAbortController(controller) setIsRunning(true) + setTotalRuns(numRuns) + setCurrentRunIndex(0) - const run = createRun({ - testSuiteId: testSuite.id, - testSuiteName: testSuite.name, - models: selectedModelIds, - parameters, - results: [], - status: 'running', - startedAt: Date.now(), - judgeModel: judgeModelId || undefined, - }) + const completedRunIds: string[] = [] + let cancelled = false try { - await executeRun(run.id, testSuite, apiKey, controller.signal) - toast({ - title: 'Run completed', - description: `Benchmarked ${selectedModelIds.length} models on ${testSuite.testCases.length} test cases`, - }) + for (let i = 0; i < numRuns; i++) { + if (controller.signal.aborted) { + cancelled = true + break + } + + setCurrentRunIndex(i + 1) + const runId = await executeSingleRun(controller) + completedRunIds.push(runId) + } + + if (!cancelled) { + if (numRuns === 1) { + toast({ + title: 'Run completed', + description: `Benchmarked ${selectedModelIds.length} models on ${testSuite.testCases.length} test cases`, + }) + } else { + toast({ + title: `${numRuns} runs completed`, + description: `Completed ${numRuns} benchmark runs. Use Results tab to analyze multi-run statistics.`, + }) + } + } } catch (error) { - if (error instanceof Error && error.name === 'AbortError') { - updateRunStatus(run.id, 'cancelled') + if (error instanceof DOMException && error.name === 'AbortError') { toast({ - title: 'Run cancelled', - description: 'The benchmark run was stopped', + title: 'Runs cancelled', + description: `Stopped after ${completedRunIds.length} of ${numRuns} runs`, }) } else { - updateRunStatus(run.id, 'failed') toast({ title: 'Run failed', description: error instanceof Error ? error.message : 'Unknown error', @@ -72,6 +107,8 @@ export function ExecutionControls({ testSuite }: ExecutionControlsProps) { } finally { setIsRunning(false) setAbortController(null) + setCurrentRunIndex(0) + setTotalRuns(1) } } @@ -81,22 +118,45 @@ export function ExecutionControls({ testSuite }: ExecutionControlsProps) { } } + const runOptions = [3, 5, 10] + return (
{isRunning ? ( ) : ( - + Run Benchmark + + + + + + + {runOptions.map((n) => ( + handleRun(n)}> + + Run {n} times + + ))} + + +
)} {selectedModelIds.length === 0 && ( diff --git a/src/components/arena/ParameterPanel.tsx b/src/components/arena/ParameterPanel.tsx index ffde367..d275178 100644 --- a/src/components/arena/ParameterPanel.tsx +++ b/src/components/arena/ParameterPanel.tsx @@ -1,13 +1,14 @@ -import { RotateCcw } from 'lucide-react' +import { RotateCcw, FlaskConical } from 'lucide-react' import { Button } from '@/components/ui/button' import { Input } from '@/components/ui/input' import { Label } from '@/components/ui/label' import { Slider } from '@/components/ui/slider' +import { Switch } from '@/components/ui/switch' import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/card' import { useModelStore } from '@/stores/modelStore' export function ParameterPanel() { - const { parameters, setParameters, resetParameters } = useModelStore() + const { parameters, setParameters, resetParameters, toggleBenchmarkMode } = useModelStore() return ( @@ -24,19 +25,37 @@ export function ParameterPanel() { + {/* Benchmark Mode Toggle */} +
+
+ +
+ +

Uses temp=0 for reproducible results

+
+
+ +
+
- {parameters.temperature.toFixed(2)} + {parameters.benchmarkMode ? '0.00 (locked)' : parameters.temperature.toFixed(2)}
setParameters({ temperature: v })} min={0} max={2} step={0.01} + disabled={parameters.benchmarkMode} + className={parameters.benchmarkMode ? 'opacity-50' : ''} />
@@ -74,15 +93,17 @@ export function ParameterPanel() {
- {parameters.frequencyPenalty.toFixed(2)} + {parameters.benchmarkMode ? '0.00 (locked)' : parameters.frequencyPenalty.toFixed(2)}
setParameters({ frequencyPenalty: v })} min={-2} max={2} step={0.01} + disabled={parameters.benchmarkMode} + className={parameters.benchmarkMode ? 'opacity-50' : ''} /> @@ -90,15 +111,17 @@ export function ParameterPanel() {
- {parameters.presencePenalty.toFixed(2)} + {parameters.benchmarkMode ? '0.00 (locked)' : parameters.presencePenalty.toFixed(2)}
setParameters({ presencePenalty: v })} min={-2} max={2} step={0.01} + disabled={parameters.benchmarkMode} + className={parameters.benchmarkMode ? 'opacity-50' : ''} />
diff --git a/src/components/code-arena/CodeArena.tsx b/src/components/code-arena/CodeArena.tsx index bef3bbb..4db25bb 100644 --- a/src/components/code-arena/CodeArena.tsx +++ b/src/components/code-arena/CodeArena.tsx @@ -2,6 +2,7 @@ import { useEffect } from 'react' import { Key, Code2 } from 'lucide-react' import { EmptyState } from '@/components/ui/empty-state' import { ResizablePanelGroup, ResizablePanel, ResizableHandle } from '@/components/ui/resizable' +import { ScrollArea } from '@/components/ui/scroll-area' import { useSettingsStore } from '@/stores/settingsStore' import { useModelStore } from '@/stores/modelStore' import { getOpenRouterClient } from '@/services/openrouter' @@ -97,15 +98,17 @@ export function CodeArena() { {/* Left panel - Configuration */} -
- -
- + +
+ +
+ +
+
+ +
-
- -
-
+ diff --git a/src/components/code-arena/CodeArenaModelPanel.tsx b/src/components/code-arena/CodeArenaModelPanel.tsx index d2b4de9..e7cd377 100644 --- a/src/components/code-arena/CodeArenaModelPanel.tsx +++ b/src/components/code-arena/CodeArenaModelPanel.tsx @@ -143,7 +143,7 @@ export function CodeArenaModelPanel({ ) : isPreviewMode ? ( ) : ( - + )} diff --git a/src/components/code-arena/CodeEditorView.tsx b/src/components/code-arena/CodeEditorView.tsx index 2a2f704..7408561 100644 --- a/src/components/code-arena/CodeEditorView.tsx +++ b/src/components/code-arena/CodeEditorView.tsx @@ -1,22 +1,34 @@ -import { useRef } from 'react' +import { useRef, useEffect, useState } from 'react' import { Copy, Check } from 'lucide-react' import { Button } from '@/components/ui/button' import { ScrollArea } from '@/components/ui/scroll-area' -import { useState } from 'react' interface CodeEditorViewProps { code: string className?: string showLineNumbers?: boolean + isStreaming?: boolean } export function CodeEditorView({ code, className = '', - showLineNumbers = true + showLineNumbers = true, + isStreaming = false }: CodeEditorViewProps) { const [copied, setCopied] = useState(false) const codeRef = useRef(null) + const scrollAreaRef = useRef(null) + + // Auto-scroll to bottom when streaming + useEffect(() => { + if (isStreaming && scrollAreaRef.current) { + const viewport = scrollAreaRef.current.querySelector('[data-radix-scroll-area-viewport]') + if (viewport) { + viewport.scrollTop = viewport.scrollHeight + } + } + }, [code, isStreaming]) const handleCopy = async () => { try { @@ -115,7 +127,7 @@ export function CodeEditorView({ )} - +
{/* Line numbers */} {showLineNumbers && ( diff --git a/src/components/results/MultiRunAnalysis.tsx b/src/components/results/MultiRunAnalysis.tsx new file mode 100644 index 0000000..911500c --- /dev/null +++ b/src/components/results/MultiRunAnalysis.tsx @@ -0,0 +1,184 @@ +import { useState, useMemo } from 'react' +import { BarChart3, TrendingUp, AlertCircle } from 'lucide-react' +import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card' +import { Button } from '@/components/ui/button' +import { Badge } from '@/components/ui/badge' +import { + Collapsible, + CollapsibleContent, + CollapsibleTrigger, +} from '@/components/ui/collapsible' +import { useRunStore, type ModelComparison } from '@/stores/runStore' +import { useTestSuiteStore } from '@/stores/testSuiteStore' +import type { RunResult } from '@/types' + +interface MultiRunAnalysisProps { + currentRun: RunResult +} + +export function MultiRunAnalysis({ currentRun }: MultiRunAnalysisProps) { + const { getRunsForTestSuite, getMultiRunStats, compareModels } = useRunStore() + const { testSuites } = useTestSuiteStore() + const [isOpen, setIsOpen] = useState(false) + + const testSuite = testSuites.find((s) => s.id === currentRun.testSuiteId) + const relatedRuns = getRunsForTestSuite(currentRun.testSuiteId) + + // Only show if there are multiple completed runs + if (relatedRuns.length < 2) { + return null + } + + const runIds = relatedRuns.map((r) => r.id) + const multiRunStats = getMultiRunStats(runIds, testSuite?.testCases) + + // Get all unique models across runs + const allModels = Array.from(multiRunStats.keys()) + + // Sort by mean score descending + const sortedModels = allModels.sort((a, b) => { + const statsA = multiRunStats.get(a) + const statsB = multiRunStats.get(b) + return (statsB?.mean || 0) - (statsA?.mean || 0) + }) + + // Calculate model comparisons for top 2 models + const topComparison: ModelComparison | null = useMemo(() => { + if (sortedModels.length < 2) return null + return compareModels(runIds, sortedModels[0], sortedModels[1], testSuite?.testCases) || null + }, [runIds, sortedModels, testSuite?.testCases, compareModels]) + + const formatScore = (score: number) => `${(score * 100).toFixed(1)}%` + const formatCI = (ci: [number, number]) => + `[${(ci[0] * 100).toFixed(1)}, ${(ci[1] * 100).toFixed(1)}]` + + return ( + + + + +
+ + + Multi-Run Analysis + + {relatedRuns.length} runs + + + +
+
+
+ + + +
+ {/* Model Statistics Table */} +
+ + + + + + + + + + + + {sortedModels.map((modelId, index) => { + const stats = multiRunStats.get(modelId) + if (!stats) return null + + return ( + + + + + + + + ) + })} + +
Model Mean Std Dev 95% CI Range
+ {index === 0 && } + {modelId.split('/').pop()} + + {formatScore(stats.mean)} + + ±{(stats.stdDev * 100).toFixed(1)}% + + {formatCI(stats.confidence95)} + + {formatScore(stats.min)} - {formatScore(stats.max)} +
+
+ + {/* Statistical Comparison */} + {topComparison && ( +
+

+ + Statistical Comparison: Top 2 Models +

+
+
+ Score Difference: + + {(topComparison.scoreDiff * 100).toFixed(2)}% + +
+
+ p-value: + + {topComparison.pValue.toFixed(4)} + +
+
+ Effect Size (Cohen's d): + + {topComparison.effectSize.toFixed(3)} + +
+
+ Significant: + + {topComparison.isSignificant ? 'Yes (p < 0.05)' : 'No'} + +
+
+ {topComparison.isSignificant && ( +

+ {sortedModels[0].split('/').pop()} significantly outperforms{' '} + {sortedModels[1].split('/').pop()} with a{' '} + {Math.abs(topComparison.effectSize) > 0.8 + ? 'large' + : Math.abs(topComparison.effectSize) > 0.5 + ? 'medium' + : 'small'}{' '} + effect size. +

+ )} +
+ )} + +

+ Based on {relatedRuns.length} completed runs of "{currentRun.testSuiteName}". + Run more benchmarks for higher statistical confidence. +

+
+
+
+
+
+ ) +} diff --git a/src/components/results/ReportSummary.tsx b/src/components/results/ReportSummary.tsx index e85272f..341808c 100644 --- a/src/components/results/ReportSummary.tsx +++ b/src/components/results/ReportSummary.tsx @@ -124,15 +124,20 @@ export function ReportSummary({ run }: ReportSummaryProps) { {completedCount}
{failedCount > 0 && ( -
+
{failedCount}
)}

- {run.models.length} models × {totalCount} tests + {run.models.length} models × {Math.round(totalCount / run.models.length)} tests

+ {run.errorSummary && ( +

+ {run.errorSummary} +

+ )} diff --git a/src/components/results/Results.tsx b/src/components/results/Results.tsx index b9dbb98..eaa6037 100644 --- a/src/components/results/Results.tsx +++ b/src/components/results/Results.tsx @@ -15,6 +15,7 @@ import { useTestSuiteStore } from '@/stores/testSuiteStore' import { useSettingsStore } from '@/stores/settingsStore' import { ComparisonGrid } from './ComparisonGrid' import { ReportSummary } from './ReportSummary' +import { MultiRunAnalysis } from './MultiRunAnalysis' export function Results() { const { runs, currentRunId, setCurrentRun, deleteRun } = useRunStore() @@ -113,6 +114,9 @@ export function Results() {
+
+ +
diff --git a/src/components/ui/scroll-area.tsx b/src/components/ui/scroll-area.tsx index b96134c..f5ceed9 100644 --- a/src/components/ui/scroll-area.tsx +++ b/src/components/ui/scroll-area.tsx @@ -11,7 +11,7 @@ const ScrollArea = React.forwardRef< className={cn("relative overflow-hidden", className)} {...props} > - + {children} diff --git a/src/scoring/code-arena-judge.ts b/src/scoring/code-arena-judge.ts index 5feed72..b310b41 100644 --- a/src/scoring/code-arena-judge.ts +++ b/src/scoring/code-arena-judge.ts @@ -82,7 +82,8 @@ export async function scoreCodeArenaOutput( } return { - score: 0.5, // Default to 50% if we can't parse + score: 0, // Default to 0 if we can't parse - don't inflate scores + confidence: 0, // Low confidence since we couldn't parse notes: 'Could not parse judge response: ' + content.slice(0, 200), } } @@ -119,6 +120,8 @@ Code: Reply with just a number from 0-100.` +const QUICK_SCORE_CODE_LIMIT = 5000 + export async function quickScoreCodeArenaOutput( prompt: string, code: string, @@ -132,10 +135,13 @@ export async function quickScoreCodeArenaOutput( } } + const wasTruncated = code.length > QUICK_SCORE_CODE_LIMIT + const truncatedCode = wasTruncated ? code.slice(0, QUICK_SCORE_CODE_LIMIT) : code + try { const judgePrompt = SIMPLE_JUDGE_PROMPT .replace('{prompt}', prompt) - .replace('{code}', code.slice(0, 5000)) // Limit code length for quick scoring + .replace('{code}', truncatedCode) const response = await client.createChatCompletion({ model: judgeModelId, @@ -154,18 +160,23 @@ export async function quickScoreCodeArenaOutput( if (!scoreMatch) { return { - score: 0.5, + score: 0, // Default to 0 if we can't parse - don't inflate scores + confidence: 0, notes: 'Could not parse quick score', } } const rawScore = Math.min(100, Math.max(0, parseInt(scoreMatch[1], 10))) + const truncationWarning = wasTruncated + ? ` (Warning: code truncated from ${code.length} to ${QUICK_SCORE_CODE_LIMIT} chars)` + : '' return { score: rawScore / 100, rawScore, maxScore: 100, - notes: 'Quick evaluation', + confidence: wasTruncated ? 
0.8 : 1, // Lower confidence when truncated + notes: `Quick evaluation${truncationWarning}`, } } catch (error) { console.error('Failed quick score:', error) diff --git a/src/scoring/exact-match.ts b/src/scoring/exact-match.ts index 17ba2e6..de2b0cc 100644 --- a/src/scoring/exact-match.ts +++ b/src/scoring/exact-match.ts @@ -23,29 +23,59 @@ export function scoreExactMatch(response: string, expected: string): ScoringResu // Case-insensitive match if (normalizedResponse.toLowerCase() === normalizedExpected.toLowerCase()) { return { - score: 0.9, + score: 0.95, // Very high but not perfect (case matters slightly) confidence: 1, notes: 'Case-insensitive match', } } + // Calculate similarity using Levenshtein distance + const similarity = calculateSimilarity(normalizedResponse, normalizedExpected) + // Check if response contains the expected (for longer responses) + // Score based on how much extra content surrounds the expected output if (normalizedResponse.includes(normalizedExpected)) { + // Penalize based on how much extra content there is + const extraContentRatio = 1 - (normalizedExpected.length / normalizedResponse.length) + // Score from 0.95 (exact length match) down to 0.6 (lots of extra content) + const containsScore = Math.max(0.6, 0.95 - (extraContentRatio * 0.35)) return { - score: 0.7, - confidence: 0.8, - notes: 'Expected output found within response', + score: containsScore, + confidence: 0.9, + notes: `Expected output found within response (${(extraContentRatio * 100).toFixed(0)}% extra content)`, } } - // Calculate similarity for partial matches - const similarity = calculateSimilarity(normalizedResponse, normalizedExpected) + // Case-insensitive contains check + if (normalizedResponse.toLowerCase().includes(normalizedExpected.toLowerCase())) { + const extraContentRatio = 1 - (normalizedExpected.length / normalizedResponse.length) + const containsScore = Math.max(0.55, 0.90 - (extraContentRatio * 0.35)) + return { + score: containsScore, + confidence: 0.85, + notes: `Expected output found (case-insensitive, ${(extraContentRatio * 100).toFixed(0)}% extra content)`, + } + } + + // Use similarity for partial matches - continuous scoring + // Similarity of 1.0 = perfect match, 0.0 = completely different + if (similarity > 0.5) { + // Scale similarity to 0-0.7 range for partial matches + // This ensures partial matches never score higher than contains matches + const score = similarity * 0.7 + return { + score, + confidence: Math.max(0.4, similarity * 0.8), + notes: `Partial similarity: ${(similarity * 100).toFixed(1)}%`, + } + } - if (similarity > 0.8) { + // Very low similarity - use raw similarity scaled down + if (similarity > 0.2) { return { - score: similarity * 0.8, - confidence: 0.6, - notes: `High similarity (${(similarity * 100).toFixed(1)}%)`, + score: similarity * 0.4, // Max 0.2 score for low similarity + confidence: 0.3, + notes: `Low similarity: ${(similarity * 100).toFixed(1)}%`, } } diff --git a/src/scoring/judge-calibration.ts b/src/scoring/judge-calibration.ts new file mode 100644 index 0000000..28b50a6 --- /dev/null +++ b/src/scoring/judge-calibration.ts @@ -0,0 +1,274 @@ +import { scoreLLMJudge } from './llm-judge' +import type { OpenRouterClient } from '@/services/openrouter' + +/** + * Reference sample for judge calibration + * Contains known-good evaluations to measure judge accuracy + */ +export interface CalibrationSample { + id: string + prompt: string + response: string + expectedOutput?: string + expectedScore: number // Known correct score (0-1) + 
tolerance: number // Acceptable deviation (e.g., 0.1 = ±10%) + category: string // e.g., 'factual', 'creative', 'code' +} + +/** + * Result of calibrating a judge against reference samples + */ +export interface CalibrationResult { + judgeModelId: string + timestamp: number + samples: CalibrationSampleResult[] + summary: { + totalSamples: number + passedSamples: number + passRate: number // Percentage within tolerance + meanAbsoluteError: number // Average |expected - actual| + maxError: number + bias: number // Average (actual - expected), positive = overscoring + correlation: number // Pearson correlation coefficient + } +} + +export interface CalibrationSampleResult { + sampleId: string + expectedScore: number + actualScore: number + error: number // actual - expected + absoluteError: number + withinTolerance: boolean + notes?: string +} + +/** + * Default calibration samples covering different response types + */ +export const DEFAULT_CALIBRATION_SAMPLES: CalibrationSample[] = [ + { + id: 'perfect-factual', + prompt: 'What is 2 + 2?', + response: '4', + expectedOutput: '4', + expectedScore: 1.0, + tolerance: 0.1, + category: 'factual', + }, + { + id: 'wrong-factual', + prompt: 'What is 2 + 2?', + response: '5', + expectedOutput: '4', + expectedScore: 0.0, + tolerance: 0.15, + category: 'factual', + }, + { + id: 'partial-factual', + prompt: 'What is the capital of France?', + response: 'Paris is a major city in France known for the Eiffel Tower.', + expectedOutput: 'Paris', + expectedScore: 0.8, + tolerance: 0.15, + category: 'factual', + }, + { + id: 'verbose-correct', + prompt: 'What is 10 * 5?', + response: 'To calculate 10 multiplied by 5, we need to add 10 five times: 10 + 10 + 10 + 10 + 10 = 50. Therefore, 10 * 5 = 50.', + expectedOutput: '50', + expectedScore: 0.9, + tolerance: 0.1, + category: 'factual', + }, + { + id: 'empty-response', + prompt: 'What is the meaning of life?', + response: '', + expectedScore: 0.0, + tolerance: 0.05, + category: 'empty', + }, + { + id: 'irrelevant-response', + prompt: 'What is the speed of light?', + response: 'I like pizza.', + expectedOutput: '299,792,458 meters per second', + expectedScore: 0.0, + tolerance: 0.1, + category: 'irrelevant', + }, + { + id: 'good-explanation', + prompt: 'Explain why the sky is blue in simple terms.', + response: 'The sky appears blue because of a phenomenon called Rayleigh scattering. When sunlight enters Earth\'s atmosphere, it collides with gas molecules. 
Blue light has a shorter wavelength, so it gets scattered more than other colors, making the sky look blue to us.', + expectedScore: 0.9, + tolerance: 0.1, + category: 'explanation', + }, + { + id: 'mediocre-explanation', + prompt: 'Explain why the sky is blue in simple terms.', + response: 'The sky is blue because of the sun and the air.', + expectedScore: 0.4, + tolerance: 0.2, + category: 'explanation', + }, +] + +/** + * Run calibration tests against a judge model + */ +export async function calibrateJudge( + client: OpenRouterClient, + judgeModelId: string, + samples: CalibrationSample[] = DEFAULT_CALIBRATION_SAMPLES, + judgeSystemPrompt?: string +): Promise { + const results: CalibrationSampleResult[] = [] + + for (const sample of samples) { + try { + const scoringResult = await scoreLLMJudge( + sample.prompt, + sample.response, + sample.expectedOutput, + client, + judgeModelId, + judgeSystemPrompt + ) + + const actualScore = scoringResult.score + const error = actualScore - sample.expectedScore + const absoluteError = Math.abs(error) + const withinTolerance = absoluteError <= sample.tolerance + + results.push({ + sampleId: sample.id, + expectedScore: sample.expectedScore, + actualScore, + error, + absoluteError, + withinTolerance, + notes: scoringResult.notes, + }) + } catch (error) { + results.push({ + sampleId: sample.id, + expectedScore: sample.expectedScore, + actualScore: 0, + error: -sample.expectedScore, + absoluteError: sample.expectedScore, + withinTolerance: false, + notes: `Error: ${error instanceof Error ? error.message : 'Unknown error'}`, + }) + } + } + + // Calculate summary statistics + const passedSamples = results.filter(r => r.withinTolerance).length + const meanAbsoluteError = results.reduce((sum, r) => sum + r.absoluteError, 0) / results.length + const maxError = Math.max(...results.map(r => r.absoluteError)) + const bias = results.reduce((sum, r) => sum + r.error, 0) / results.length + + // Calculate Pearson correlation + const correlation = calculateCorrelation( + results.map(r => r.expectedScore), + results.map(r => r.actualScore) + ) + + return { + judgeModelId, + timestamp: Date.now(), + samples: results, + summary: { + totalSamples: results.length, + passedSamples, + passRate: passedSamples / results.length, + meanAbsoluteError, + maxError, + bias, + correlation, + }, + } +} + +/** + * Calculate Pearson correlation coefficient + */ +function calculateCorrelation(x: number[], y: number[]): number { + const n = x.length + if (n === 0) return 0 + + const sumX = x.reduce((a, b) => a + b, 0) + const sumY = y.reduce((a, b) => a + b, 0) + const sumXY = x.reduce((sum, xi, i) => sum + xi * y[i], 0) + const sumX2 = x.reduce((sum, xi) => sum + xi * xi, 0) + const sumY2 = y.reduce((sum, yi) => sum + yi * yi, 0) + + const numerator = n * sumXY - sumX * sumY + const denominator = Math.sqrt((n * sumX2 - sumX * sumX) * (n * sumY2 - sumY * sumY)) + + if (denominator === 0) return 0 + return numerator / denominator +} + +/** + * Interpret calibration results + */ +export function interpretCalibrationResult(result: CalibrationResult): { + quality: 'excellent' | 'good' | 'fair' | 'poor' + recommendation: string + details: string[] +} { + const { summary } = result + const details: string[] = [] + + // Assess overall quality + let quality: 'excellent' | 'good' | 'fair' | 'poor' + + if (summary.passRate >= 0.9 && summary.meanAbsoluteError <= 0.1 && summary.correlation >= 0.9) { + quality = 'excellent' + } else if (summary.passRate >= 0.75 && summary.meanAbsoluteError <= 0.15 && 
summary.correlation >= 0.8) { + quality = 'good' + } else if (summary.passRate >= 0.5 && summary.meanAbsoluteError <= 0.25 && summary.correlation >= 0.6) { + quality = 'fair' + } else { + quality = 'poor' + } + + // Generate details + details.push(`Pass rate: ${(summary.passRate * 100).toFixed(1)}% (${summary.passedSamples}/${summary.totalSamples} within tolerance)`) + details.push(`Mean absolute error: ${(summary.meanAbsoluteError * 100).toFixed(1)}%`) + details.push(`Max error: ${(summary.maxError * 100).toFixed(1)}%`) + details.push(`Correlation: ${summary.correlation.toFixed(3)}`) + + if (summary.bias > 0.05) { + details.push(`Bias: +${(summary.bias * 100).toFixed(1)}% (tends to overscore)`) + } else if (summary.bias < -0.05) { + details.push(`Bias: ${(summary.bias * 100).toFixed(1)}% (tends to underscore)`) + } else { + details.push(`Bias: ${(summary.bias * 100).toFixed(1)}% (minimal)`) + } + + // Generate recommendation + let recommendation: string + switch (quality) { + case 'excellent': + recommendation = 'This judge model is highly reliable for scoring. Results can be trusted with minimal variance.' + break + case 'good': + recommendation = 'This judge model is reliable for most use cases. Consider running 2-3 times for LLM-judged critical tasks.' + break + case 'fair': + recommendation = 'This judge model shows moderate reliability. Run 3-5 times and average scores for better accuracy.' + break + case 'poor': + recommendation = 'This judge model is unreliable. Consider using a different judge model or deterministic scoring methods.' + break + } + + return { quality, recommendation, details } +} diff --git a/src/scoring/numeric-tolerance.ts b/src/scoring/numeric-tolerance.ts index 279c530..aae0690 100644 --- a/src/scoring/numeric-tolerance.ts +++ b/src/scoring/numeric-tolerance.ts @@ -54,27 +54,29 @@ export function scoreNumericTolerance( const diff = Math.abs(closest - expectedNumber) const relativeDiff = expectedNumber !== 0 ? diff / Math.abs(expectedNumber) : diff - // Partial credit based on how close the answer is - if (relativeDiff < 0.1) { + // Continuous scoring based on relative difference + // Score decreases smoothly from 1.0 to 0 as error increases + // At 25% error, score is 0. Uses exponential decay for smooth curve. + if (relativeDiff >= 0.25) { return { - score: 0.8, - confidence: 0.9, - notes: `Close match: ${closest} (expected ${expectedNumber}, diff: ${(relativeDiff * 100).toFixed(1)}%)`, + score: 0, + confidence: 1, + notes: `No match: closest was ${closest} (expected ${expectedNumber}, diff: ${(relativeDiff * 100).toFixed(1)}%)`, } } - if (relativeDiff < 0.25) { - return { - score: 0.5, - confidence: 0.8, - notes: `Partial match: ${closest} (expected ${expectedNumber}, diff: ${(relativeDiff * 100).toFixed(1)}%)`, - } - } + // Continuous score: 1.0 at 0% error, decays to 0 at 25% error + // Using formula: score = 1 - (relativeDiff / 0.25)^0.5 for smooth decay + // This gives: 0% error = 1.0, 1% = 0.8, 6.25% = 0.5, 25% = 0 + const score = Math.max(0, 1 - Math.pow(relativeDiff / 0.25, 0.5)) + + // Confidence decreases as the error increases + const confidence = Math.max(0.5, 1 - relativeDiff * 2) return { - score: 0, - confidence: 1, - notes: `No match: closest was ${closest} (expected ${expectedNumber})`, + score, + confidence, + notes: `${score >= 0.99 ? 'Exact' : score >= 0.8 ? 
'Close' : 'Partial'} match: ${closest} (expected ${expectedNumber}, diff: ${(relativeDiff * 100).toFixed(1)}%)`, } } diff --git a/src/services/codeArenaExecution.ts b/src/services/codeArenaExecution.ts index 1e65de1..4e10cc2 100644 --- a/src/services/codeArenaExecution.ts +++ b/src/services/codeArenaExecution.ts @@ -6,6 +6,30 @@ import { extractCodeFromStreamingContent, extractCodeFromResponse } from './code import { scoreCodeArenaOutput } from '@/scoring/code-arena-judge' import type { ChatMessage, ModelParameters, OpenRouterModel, CodeArenaOutput } from '@/types' +// Per-request timeout in milliseconds (2 minutes) +const REQUEST_TIMEOUT_MS = 120000 + +/** + * Wraps a promise with a timeout. If the timeout expires, throws an error. + */ +function withTimeout(promise: Promise, timeoutMs: number, errorMessage: string): Promise { + return new Promise((resolve, reject) => { + const timeoutId = setTimeout(() => { + reject(new Error(errorMessage)) + }, timeoutMs) + + promise + .then((result) => { + clearTimeout(timeoutId) + resolve(result) + }) + .catch((error) => { + clearTimeout(timeoutId) + reject(error) + }) + }) +} + function calculateCost( usage: StreamResult['usage'], model: OpenRouterModel | undefined @@ -87,9 +111,9 @@ export async function executeCodeArenaRun( content: prompt, }) - // Stream the response + // Stream the response with timeout let streamedContent = '' - const result = await client.createChatCompletionStreamWithUsage( + const streamPromise = client.createChatCompletionStreamWithUsage( { model: modelId, messages, @@ -115,6 +139,12 @@ export async function executeCodeArenaRun( }) } ) + + const result = await withTimeout( + streamPromise, + REQUEST_TIMEOUT_MS, + `Request timed out after ${REQUEST_TIMEOUT_MS / 1000}s` + ) const latencyMs = Date.now() - startTime const model = modelMap.get(modelId) diff --git a/src/services/execution.ts b/src/services/execution.ts index eb7fc44..958db71 100644 --- a/src/services/execution.ts +++ b/src/services/execution.ts @@ -27,7 +27,10 @@ export async function executeRun( ): Promise { const client = getOpenRouterClient(apiKey) const { addResult, updateResult, setResultScore, completeRun } = useRunStore.getState() - const { selectedModelIds, parameters, judgeModelId, availableModels } = useModelStore.getState() + const { selectedModelIds, judgeModelId, availableModels, getEffectiveParameters } = useModelStore.getState() + + // Use effective parameters (respects benchmark mode) + const parameters = getEffectiveParameters() // Create a map for quick model lookup const modelMap = new Map(availableModels.map(m => [m.id, m])) @@ -127,19 +130,46 @@ export async function executeRun( } // Execute with concurrency limit - await executeWithConcurrency(tasks, concurrencyLimit, signal) + const executionErrors = await executeWithConcurrency(tasks, concurrencyLimit, signal) - completeRun(runId) + // Complete run with error summary if any errors occurred + completeRun(runId, executionErrors.count > 0 ? { + errorCount: executionErrors.count, + errorSummary: executionErrors.summary, + } : undefined) } const MAX_EMPTY_RESPONSE_RETRIES = 2 const EMPTY_RESPONSE_BACKOFF_MS = 400 +// Per-request timeout in milliseconds (2 minutes) +const REQUEST_TIMEOUT_MS = 120000 interface ResponseWithUsage { content: string usage?: StreamResult['usage'] } +/** + * Wraps a promise with a timeout. If the timeout expires, throws an error. 
+ */ +function withTimeout(promise: Promise, timeoutMs: number, errorMessage: string): Promise { + return new Promise((resolve, reject) => { + const timeoutId = setTimeout(() => { + reject(new Error(errorMessage)) + }, timeoutMs) + + promise + .then((result) => { + clearTimeout(timeoutId) + resolve(result) + }) + .catch((error) => { + clearTimeout(timeoutId) + reject(error) + }) + }) +} + async function generateResponseWithRetries( client: ReturnType, modelId: string, @@ -158,7 +188,7 @@ async function generateResponseWithRetries( } let streamedContent = '' - const result = await client.createChatCompletionStreamWithUsage( + const streamPromise = client.createChatCompletionStreamWithUsage( { model: modelId, messages, @@ -175,6 +205,13 @@ async function generateResponseWithRetries( }) } ) + + // Apply timeout to the streaming request + const result = await withTimeout( + streamPromise, + REQUEST_TIMEOUT_MS, + `Request timed out after ${REQUEST_TIMEOUT_MS / 1000}s` + ) if (result.content.trim().length > 0) { return result @@ -243,11 +280,17 @@ function delay(ms: number): Promise { }) } +export interface ExecutionErrors { + count: number + summary: string + details: Error[] +} + async function executeWithConcurrency( tasks: Array<() => Promise>, limit: number, signal: AbortSignal -): Promise { +): Promise { const executing = new Set>() const errors: Error[] = [] @@ -279,7 +322,18 @@ async function executeWithConcurrency( await Promise.all(executing) + // Build error summary + const errorSummary = errors.length > 0 + ? `${errors.length} task(s) failed: ${[...new Set(errors.map(e => e.message))].join('; ')}` + : '' + if (errors.length > 0) { console.error('Some tasks failed:', errors) } + + return { + count: errors.length, + summary: errorSummary, + details: errors, + } } diff --git a/src/stores/modelStore.ts b/src/stores/modelStore.ts index b35aeec..65f6a46 100644 --- a/src/stores/modelStore.ts +++ b/src/stores/modelStore.ts @@ -26,6 +26,8 @@ interface ModelState { // Parameter Actions setParameters: (params: Partial) => void resetParameters: () => void + toggleBenchmarkMode: () => void + getEffectiveParameters: () => ModelParameters // Getters getSelectedModels: () => OpenRouterModel[] @@ -38,6 +40,14 @@ const defaultParameters: ModelParameters = { maxTokens: 2048, frequencyPenalty: 0, presencePenalty: 0, + benchmarkMode: false, +} + +// Benchmark mode uses temp=0 for reproducibility +const benchmarkModeParameters: Partial = { + temperature: 0, + frequencyPenalty: 0, + presencePenalty: 0, } export const useModelStore = create()( @@ -88,6 +98,46 @@ export const useModelStore = create()( const state = get() return state.availableModels.find((m) => m.id === state.judgeModelId) || null }, + + toggleBenchmarkMode: () => { + set((state) => { + const newBenchmarkMode = !state.parameters.benchmarkMode + if (newBenchmarkMode) { + // Enable benchmark mode: apply benchmark parameters + return { + parameters: { + ...state.parameters, + ...benchmarkModeParameters, + benchmarkMode: true, + }, + } + } else { + // Disable benchmark mode: restore defaults but keep maxTokens and topP + return { + parameters: { + ...state.parameters, + temperature: defaultParameters.temperature, + frequencyPenalty: defaultParameters.frequencyPenalty, + presencePenalty: defaultParameters.presencePenalty, + benchmarkMode: false, + }, + } + } + }) + }, + + getEffectiveParameters: () => { + const state = get() + if (state.parameters.benchmarkMode) { + return { + ...state.parameters, + temperature: 0, // Always 0 in benchmark mode + 
frequencyPenalty: 0,
+          presencePenalty: 0,
+        }
+      }
+      return state.parameters
+    },
   }),
   {
     name: 'benchmaker-models',
diff --git a/src/stores/runStore.ts b/src/stores/runStore.ts
index 1e37ef1..346e099 100644
--- a/src/stores/runStore.ts
+++ b/src/stores/runStore.ts
@@ -1,5 +1,23 @@
 import { create } from 'zustand'
-import type { RunResult, TestCaseResult, ExecutionStatus, ScoringResult } from '@/types'
+import type { RunResult, TestCaseResult, ExecutionStatus, ScoringResult, TestCase, AggregateScore, MultiRunStats } from '@/types'
+
+export interface ModelComparison {
+  modelA: string
+  modelB: string
+  meanA: number
+  meanB: number
+  scoreDiff: number // meanA - meanB
+  pooledStdErr: number // Standard error of the difference
+  tStatistic: number // t-statistic for significance test
+  pValue: number // Two-tailed p-value
+  isSignificant: boolean // p < 0.05
+  effectSize: number // Cohen's d
+}
+
+interface ErrorInfo {
+  errorCount: number
+  errorSummary: string
+}
 
 interface RunState {
   runs: RunResult[]
@@ -8,7 +26,7 @@ interface RunState {
   // Run Actions
   createRun: (run: Omit) => RunResult
   updateRunStatus: (runId: string, status: ExecutionStatus) => void
-  completeRun: (runId: string) => void
+  completeRun: (runId: string, errorInfo?: ErrorInfo) => void
   deleteRun: (runId: string) => void
   clearAllRuns: () => void
   setCurrentRun: (runId: string | null) => void
@@ -24,9 +42,15 @@ interface RunState {
   getRunById: (runId: string) => RunResult | null
   getResultsForTestCase: (runId: string, testCaseId: string) => TestCaseResult[]
   getResultsForModel: (runId: string, modelId: string) => TestCaseResult[]
-  getAggregateScores: (runId: string) => Map<string, number>
+  getAggregateScores: (runId: string, testCases?: TestCase[]) => Map<string, number>
+  getDetailedAggregateScores: (runId: string, testCases?: TestCase[]) => Map<string, AggregateScore>
   getAggregateCosts: (runId: string) => Map<string, number>
   getTotalCost: (runId: string) => number
+
+  // Multi-run analysis
+  getMultiRunStats: (runIds: string[], testCases?: TestCase[]) => Map<string, MultiRunStats>
+  compareModels: (runIds: string[], modelA: string, modelB: string, testCases?: TestCase[]) => ModelComparison | null
+  getRunsForTestSuite: (testSuiteId: string) => RunResult[]
 }
 
 function generateId(): string {
@@ -55,11 +79,16 @@ export const useRunStore = create<RunState>()((set, get) => ({
     }))
   },
 
-  completeRun: (runId) => {
+  completeRun: (runId, errorInfo) => {
     set((state) => ({
       runs: state.runs.map((run) =>
        run.id === runId
-          ? { ...run, status: 'completed' as ExecutionStatus, completedAt: Date.now() }
+          ? {
+              ...run,
+              status: 'completed' as ExecutionStatus,
+              completedAt: Date.now(),
+              ...(errorInfo && { errorCount: errorInfo.errorCount, errorSummary: errorInfo.errorSummary })
+            }
          : run
      ),
    }))
@@ -156,26 +185,100 @@ export const useRunStore = create<RunState>()((set, get) => ({
     return run?.results.filter((r) => r.modelId === modelId) || []
   },
 
-  getAggregateScores: (runId) => {
+  getAggregateScores: (runId, testCases) => {
     const run = get().runs.find((r) => r.id === runId)
     const scores = new Map<string, number>()
 
     if (!run) return scores
 
-    const modelScores = new Map<string, { total: number; count: number }>()
+    // Build weight map from test cases
+    const weightMap = new Map<string, number>()
+    if (testCases) {
+      for (const tc of testCases) {
+        weightMap.set(tc.id, tc.weight || 1)
+      }
+    }
+
+    const modelScores = new Map<string, { weightedTotal: number; totalWeight: number }>()
 
     for (const result of run.results) {
       if (result.score) {
-        const existing = modelScores.get(result.modelId) || { total: 0, count: 0 }
+        const weight = weightMap.get(result.testCaseId) || 1
+        const existing = modelScores.get(result.modelId) || { weightedTotal: 0, totalWeight: 0 }
         modelScores.set(result.modelId, {
-          total: existing.total + result.score.score,
-          count: existing.count + 1,
+          weightedTotal: existing.weightedTotal + (result.score.score * weight),
+          totalWeight: existing.totalWeight + weight,
         })
       }
     }
 
-    for (const [modelId, { total, count }] of modelScores) {
-      scores.set(modelId, count > 0 ? total / count : 0)
+    for (const [modelId, { weightedTotal, totalWeight }] of modelScores) {
+      scores.set(modelId, totalWeight > 0 ? weightedTotal / totalWeight : 0)
+    }
+
+    return scores
+  },
+
+  getDetailedAggregateScores: (runId, testCases) => {
+    const run = get().runs.find((r) => r.id === runId)
+    const scores = new Map<string, AggregateScore>()
+
+    if (!run) return scores
+
+    // Build weight map from test cases
+    const weightMap = new Map<string, number>()
+    if (testCases) {
+      for (const tc of testCases) {
+        weightMap.set(tc.id, tc.weight || 1)
+      }
+    }
+
+    // Group results by model with weights
+    const modelResults = new Map<string, Array<{ score: number; weight: number }>>()
+
+    for (const result of run.results) {
+      if (result.score) {
+        const weight = weightMap.get(result.testCaseId) || 1
+        const existing = modelResults.get(result.modelId) || []
+        existing.push({ score: result.score.score, weight })
+        modelResults.set(result.modelId, existing)
+      }
+    }
+
+    // Calculate detailed statistics for each model
+    for (const [modelId, results] of modelResults) {
+      if (results.length === 0) continue
+
+      const totalWeight = results.reduce((sum, r) => sum + r.weight, 0)
+      const weightedMean = results.reduce((sum, r) => sum + r.score * r.weight, 0) / totalWeight
+
+      // Calculate weighted standard deviation
+      const weightedVariance = results.reduce((sum, r) => {
+        return sum + r.weight * Math.pow(r.score - weightedMean, 2)
+      }, 0) / totalWeight
+      const stdDev = Math.sqrt(weightedVariance)
+
+      const scoreValues = results.map(r => r.score)
+      const min = Math.min(...scoreValues)
+      const max = Math.max(...scoreValues)
+
+      // 95% confidence interval (assuming normal distribution)
+      // CI = mean ± (1.96 * stdDev / sqrt(n))
+      const marginOfError = results.length > 1 ? (1.96 * stdDev) / Math.sqrt(results.length) : 0
+      const confidence95: [number, number] = [
+        Math.max(0, weightedMean - marginOfError),
+        Math.min(1, weightedMean + marginOfError)
+      ]
+
+      scores.set(modelId, {
+        mean: weightedMean,
+        stdDev,
+        min,
+        max,
+        count: results.length,
+        totalWeight,
+        confidence95,
+      })
     }
 
     return scores
@@ -205,4 +308,139 @@ export const useRunStore = create<RunState>()((set, get) => ({
       return total + (result.cost || 0)
     }, 0)
   },
+
+  getRunsForTestSuite: (testSuiteId) => {
+    return get().runs.filter((r) => r.testSuiteId === testSuiteId && r.status === 'completed')
+  },
+
+  getMultiRunStats: (runIds, testCases) => {
+    const stats = new Map<string, MultiRunStats>()
+    const state = get()
+
+    // Get all runs
+    const runs = runIds
+      .map((id) => state.runs.find((r) => r.id === id))
+      .filter((r): r is RunResult => r !== undefined && r.status === 'completed')
+
+    if (runs.length === 0) return stats
+
+    // Collect all models across runs
+    const allModels = new Set<string>()
+    for (const run of runs) {
+      for (const modelId of run.models) {
+        allModels.add(modelId)
+      }
+    }
+
+    // Calculate stats for each model
+    for (const modelId of allModels) {
+      const scores: number[] = []
+      const validRunIds: string[] = []
+
+      for (const run of runs) {
+        // Get aggregate score for this model in this run
+        const aggregateScores = state.getAggregateScores(run.id, testCases)
+        const score = aggregateScores.get(modelId)
+        if (score !== undefined) {
+          scores.push(score)
+          validRunIds.push(run.id)
+        }
+      }
+
+      if (scores.length === 0) continue
+
+      const mean = scores.reduce((a, b) => a + b, 0) / scores.length
+      const variance = scores.reduce((sum, s) => sum + Math.pow(s - mean, 2), 0) / scores.length
+      const stdDev = Math.sqrt(variance)
+      const min = Math.min(...scores)
+      const max = Math.max(...scores)
+
+      // 95% CI = mean ± (1.96 * stdDev / sqrt(n))
+      const marginOfError = scores.length > 1 ? (1.96 * stdDev) / Math.sqrt(scores.length) : 0
+      const confidence95: [number, number] = [
+        Math.max(0, mean - marginOfError),
+        Math.min(1, mean + marginOfError)
+      ]
+
+      stats.set(modelId, {
+        runIds: validRunIds,
+        modelId,
+        scores,
+        mean,
+        stdDev,
+        min,
+        max,
+        confidence95,
+      })
    }
+
+    return stats
+  },
+
+  compareModels: (runIds, modelA, modelB, testCases) => {
+    const state = get()
+    const multiRunStats = state.getMultiRunStats(runIds, testCases)
+
+    const statsA = multiRunStats.get(modelA)
+    const statsB = multiRunStats.get(modelB)
+
+    if (!statsA || !statsB) return null
+    if (statsA.scores.length < 2 || statsB.scores.length < 2) return null
+
+    const nA = statsA.scores.length
+    const nB = statsB.scores.length
+    const meanA = statsA.mean
+    const meanB = statsB.mean
+
+    // Calculate pooled standard error
+    const varA = statsA.scores.reduce((sum, s) => sum + Math.pow(s - meanA, 2), 0) / (nA - 1)
+    const varB = statsB.scores.reduce((sum, s) => sum + Math.pow(s - meanB, 2), 0) / (nB - 1)
+
+    // Pooled variance for two-sample t-test
+    const pooledVar = ((nA - 1) * varA + (nB - 1) * varB) / (nA + nB - 2)
+    const pooledStdErr = Math.sqrt(pooledVar * (1/nA + 1/nB))
+
+    // t-statistic
+    const tStatistic = pooledStdErr > 0 ? (meanA - meanB) / pooledStdErr : 0
+
+    // Approximate p-value using normal distribution (good for n > 30, acceptable for smaller)
+    // For a more accurate p-value, we'd need a t-distribution table or library
+    const pValue = 2 * (1 - normalCDF(Math.abs(tStatistic)))
+
+    // Cohen's d effect size
+    const pooledStdDev = Math.sqrt(pooledVar)
+    const effectSize = pooledStdDev > 0 ? (meanA - meanB) / pooledStdDev : 0
+
+    return {
+      modelA,
+      modelB,
+      meanA,
+      meanB,
+      scoreDiff: meanA - meanB,
+      pooledStdErr,
+      tStatistic,
+      pValue,
+      isSignificant: pValue < 0.05,
+      effectSize,
+    }
+  },
 }))
+
+// Helper function for normal CDF approximation
+function normalCDF(x: number): number {
+  // Approximation of the cumulative distribution function for standard normal
+  const a1 = 0.254829592
+  const a2 = -0.284496736
+  const a3 = 1.421413741
+  const a4 = -1.453152027
+  const a5 = 1.061405429
+  const p = 0.3275911
+
+  const sign = x < 0 ? -1 : 1
+  x = Math.abs(x) / Math.sqrt(2)
+
+  const t = 1.0 / (1.0 + p * x)
+  const y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * Math.exp(-x * x)
+
+  return 0.5 * (1.0 + sign * y)
+}
diff --git a/src/types/index.ts b/src/types/index.ts
index 2beb5f6..75873dc 100644
--- a/src/types/index.ts
+++ b/src/types/index.ts
@@ -51,6 +51,29 @@ export interface ScoringConfig {
   rubric?: string
 }
 
+// Aggregate scoring with statistics
+export interface AggregateScore {
+  mean: number // Weighted mean score (0-1)
+  stdDev: number // Standard deviation
+  min: number // Minimum score
+  max: number // Maximum score
+  count: number // Number of scored results
+  totalWeight: number // Sum of weights used
+  confidence95?: [number, number] // 95% confidence interval
+}
+
+// Multi-run statistics
+export interface MultiRunStats {
+  runIds: string[]
+  modelId: string
+  scores: number[] // Individual run scores
+  mean: number
+  stdDev: number
+  min: number
+  max: number
+  confidence95: [number, number]
+}
+
 // Model Types
 export interface OpenRouterModel {
   id: string
@@ -73,6 +96,7 @@
   maxTokens: number
   frequencyPenalty: number
   presencePenalty: number
+  benchmarkMode?: boolean // When true, uses temp=0 for reproducibility
 }
 
 // Execution Types
@@ -104,6 +128,9 @@ export interface RunResult {
   startedAt: number
   completedAt?: number
   judgeModel?: string
+  // Error tracking for surfacing in UI
+  errorCount?: number
+  errorSummary?: string
 }
 
 // Code Arena Types
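
For reviewers, a minimal usage sketch of the multi-run analysis API added above. The suite ID and model IDs are hypothetical placeholders; the accessors and return shapes are taken from the runStore diff. This snippet is illustrative only and is not part of the patch.

```typescript
// Illustrative sketch — hypothetical IDs; API shapes follow the runStore changes above.
import { useRunStore } from '@/stores/runStore'

const store = useRunStore.getState()

// Completed runs for one test suite (suite ID is a placeholder).
const runIds = store.getRunsForTestSuite('suite-123').map((run) => run.id)

// Per-model mean and 95% CI across those runs (internal scores are 0-1).
const stats = store.getMultiRunStats(runIds)
for (const [modelId, s] of stats) {
  console.log(modelId, (s.mean * 100).toFixed(1), s.confidence95)
}

// Pairwise comparison; returns null unless both models have at least two run-level scores.
const cmp = store.compareModels(runIds, 'model-a', 'model-b')
if (cmp) {
  console.log(cmp.scoreDiff, cmp.pValue.toFixed(3), cmp.isSignificant, cmp.effectSize)
}
```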