From 93eacbf0f2abb70733cfee4b00f8c3bdaa2851e3 Mon Sep 17 00:00:00 2001 From: oshtz Date: Tue, 23 Dec 2025 22:37:59 +0200 Subject: [PATCH] chore: update version to 0.0.9 and add new dependencies feat: enhance ModelSelector with price and context filters feat: implement import/export functionality for test cases in TestCaseList feat: add cost calculation and display in ReportSummary refactor: improve execution service to handle usage data and costs feat: add dropdown menu component for better UI interactions fix: update runStore and testSuiteStore to manage costs and batch test case additions --- README.md | 2 +- package-lock.json | 92 ++++++- package.json | 7 +- src-tauri/Cargo.lock | 2 +- src-tauri/Cargo.toml | 2 +- src-tauri/tauri.conf.json | 2 +- src/components/arena/ModelSelector.tsx | 138 +++++++++- .../prompt-manager/TestCaseList.tsx | 251 +++++++++++++++++- src/components/results/ReportSummary.tsx | 39 ++- src/components/ui/dropdown-menu.tsx | 189 +++++++++++++ src/services/execution.ts | 99 ++++--- src/services/openrouter.ts | 95 +++++++ src/stores/runStore.ts | 27 ++ src/stores/testSuiteStore.ts | 20 ++ src/types/index.ts | 3 + 15 files changed, 904 insertions(+), 64 deletions(-) create mode 100644 src/components/ui/dropdown-menu.tsx diff --git a/README.md b/README.md index f2b2a71..437fc39 100644 --- a/README.md +++ b/README.md @@ -129,7 +129,7 @@ Enter your OpenRouter API key in the Settings tab. ### Updates - The app checks for updates on startup. -- Click the version button in the header (e.g. `v0.0.8`) to view update status, release notes, or manually re-check. +- Click the version button in the header (e.g. `v0.0.9`) to view update status, release notes, or manually re-check. - Updates are pulled from GitHub Releases and expect a `Benchmaker-Portable.exe` asset on the latest tag. ## Development diff --git a/package-lock.json b/package-lock.json index 064bbaf..c6e4f3b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,17 +1,18 @@ { "name": "benchmaker", - "version": "0.0.8", + "version": "0.0.9", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "benchmaker", - "version": "0.0.8", + "version": "0.0.9", "dependencies": { "@monaco-editor/react": "^4.7.0", "@radix-ui/react-alert-dialog": "^1.1.15", "@radix-ui/react-checkbox": "^1.3.3", "@radix-ui/react-dialog": "^1.1.15", + "@radix-ui/react-dropdown-menu": "^2.1.16", "@radix-ui/react-label": "^2.1.8", "@radix-ui/react-progress": "^1.1.8", "@radix-ui/react-scroll-area": "^1.2.10", @@ -1193,6 +1194,35 @@ } } }, + "node_modules/@radix-ui/react-dropdown-menu": { + "version": "2.1.16", + "resolved": "https://registry.npmjs.org/@radix-ui/react-dropdown-menu/-/react-dropdown-menu-2.1.16.tgz", + "integrity": "sha512-1PLGQEynI/3OX/ftV54COn+3Sud/Mn8vALg2rWnBLnRaGtJDduNW/22XjlGgPdpcIbiQxjKtb7BkcjP00nqfJw==", + "license": "MIT", + "dependencies": { + "@radix-ui/primitive": "1.1.3", + "@radix-ui/react-compose-refs": "1.1.2", + "@radix-ui/react-context": "1.1.2", + "@radix-ui/react-id": "1.1.1", + "@radix-ui/react-menu": "2.1.16", + "@radix-ui/react-primitive": "2.1.3", + "@radix-ui/react-use-controllable-state": "1.2.2" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, "node_modules/@radix-ui/react-focus-guards": { "version": "1.1.3", "resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.1.3.tgz", @@ -1297,6 +1327,64 @@ } } }, + "node_modules/@radix-ui/react-menu": { + "version": "2.1.16", + "resolved": "https://registry.npmjs.org/@radix-ui/react-menu/-/react-menu-2.1.16.tgz", + "integrity": "sha512-72F2T+PLlphrqLcAotYPp0uJMr5SjP5SL01wfEspJbru5Zs5vQaSHb4VB3ZMJPimgHHCHG7gMOeOB9H3Hdmtxg==", + "license": "MIT", + "dependencies": { + "@radix-ui/primitive": "1.1.3", + "@radix-ui/react-collection": "1.1.7", + "@radix-ui/react-compose-refs": "1.1.2", + "@radix-ui/react-context": "1.1.2", + "@radix-ui/react-direction": "1.1.1", + "@radix-ui/react-dismissable-layer": "1.1.11", + "@radix-ui/react-focus-guards": "1.1.3", + "@radix-ui/react-focus-scope": "1.1.7", + "@radix-ui/react-id": "1.1.1", + "@radix-ui/react-popper": "1.2.8", + "@radix-ui/react-portal": "1.1.9", + "@radix-ui/react-presence": "1.1.5", + "@radix-ui/react-primitive": "2.1.3", + "@radix-ui/react-roving-focus": "1.1.11", + "@radix-ui/react-slot": "1.2.3", + "@radix-ui/react-use-callback-ref": "1.1.1", + "aria-hidden": "^1.2.4", + "react-remove-scroll": "^2.6.3" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, + "node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-slot": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.2.3.tgz", + "integrity": "sha512-aeNmHnBxbi2St0au6VBVC7JXFlhLlOnvIIlePNniyUNAClzmtAUEY8/pBiK3iHjufOlwA+c20/8jngo7xcrg8A==", + "license": "MIT", + "dependencies": { + "@radix-ui/react-compose-refs": "1.1.2" + }, + "peerDependencies": { + "@types/react": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + } + } + }, "node_modules/@radix-ui/react-popper": { "version": "1.2.8", "resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.8.tgz", diff --git a/package.json b/package.json index 64ddc0b..cb7cc0a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "benchmaker", - "version": "0.0.8", + "version": "0.0.9", "type": "module", "scripts": { "dev": "vite", @@ -11,11 +11,11 @@ "tauri:build": "tauri build" }, "dependencies": { - "@tauri-apps/api": "^1.6.0", "@monaco-editor/react": "^4.7.0", "@radix-ui/react-alert-dialog": "^1.1.15", "@radix-ui/react-checkbox": "^1.3.3", "@radix-ui/react-dialog": "^1.1.15", + "@radix-ui/react-dropdown-menu": "^2.1.16", "@radix-ui/react-label": "^2.1.8", "@radix-ui/react-progress": "^1.1.8", "@radix-ui/react-scroll-area": "^1.2.10", @@ -25,6 +25,7 @@ "@radix-ui/react-switch": "^1.2.6", "@radix-ui/react-tabs": "^1.1.13", "@radix-ui/react-toast": "^1.2.15", + "@tauri-apps/api": "^1.6.0", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", "lucide-react": "^0.562.0", @@ -34,9 +35,9 @@ "zustand": "^5.0.9" }, "devDependencies": { - "@tauri-apps/cli": "^1.6.0", "@tailwindcss/postcss": "^4.1.18", "@tailwindcss/vite": "^4.1.18", + "@tauri-apps/cli": "^1.6.0", "@types/react": "^19.2.7", "@types/react-dom": "^19.2.3", "@vitejs/plugin-react": "^5.1.2", diff --git a/src-tauri/Cargo.lock b/src-tauri/Cargo.lock index e49a491..7686c1d 100644 --- a/src-tauri/Cargo.lock +++ b/src-tauri/Cargo.lock @@ -109,7 +109,7 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "benchmaker" -version = "0.0.8" +version = "0.0.9" dependencies = [ "rusqlite", "serde", diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index f1349f8..37e851a 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmaker" -version = "0.0.8" +version = "0.0.9" description = "Benchmaker" authors = ["you"] edition = "2021" diff --git a/src-tauri/tauri.conf.json b/src-tauri/tauri.conf.json index 1cd768b..2ee9d22 100644 --- a/src-tauri/tauri.conf.json +++ b/src-tauri/tauri.conf.json @@ -7,7 +7,7 @@ }, "package": { "productName": "Benchmaker", - "version": "0.0.8" + "version": "0.0.9" }, "tauri": { "allowlist": { diff --git a/src/components/arena/ModelSelector.tsx b/src/components/arena/ModelSelector.tsx index 0b723dc..bf2bb47 100644 --- a/src/components/arena/ModelSelector.tsx +++ b/src/components/arena/ModelSelector.tsx @@ -1,5 +1,5 @@ import { useState, useMemo } from 'react' -import { Search, Loader2, X } from 'lucide-react' +import { Search, Loader2, X, SlidersHorizontal, Filter } from 'lucide-react' import { Input } from '@/components/ui/input' import { Button } from '@/components/ui/button' import { Badge } from '@/components/ui/badge' @@ -8,6 +8,25 @@ import { ScrollArea } from '@/components/ui/scroll-area' import { Checkbox } from '@/components/ui/checkbox' import { useModelStore } from '@/stores/modelStore' +type PriceRange = 'all' | 'free' | 'cheap' | 'medium' | 'expensive' +type ContextRange = 'all' | '8k+' | '32k+' | '128k+' | '200k+' + +const PRICE_RANGES: { value: PriceRange; label: string; max?: number; min?: number }[] = [ + { value: 'all', label: 'All Prices' }, + { value: 'free', label: 'Free', max: 0 }, + { value: 'cheap', label: '<$1/M', max: 0.000001 }, + { value: 'medium', label: '$1-10/M', min: 0.000001, max: 0.00001 }, + { value: 'expensive', label: '>$10/M', min: 0.00001 }, +] + +const CONTEXT_RANGES: { value: ContextRange; label: string; min?: number }[] = [ + { value: 'all', label: 'Any Context' }, + { value: '8k+', label: '8K+', min: 8000 }, + { value: '32k+', label: '32K+', min: 32000 }, + { value: '128k+', label: '128K+', min: 128000 }, + { value: '200k+', label: '200K+', min: 200000 }, +] + export function ModelSelector() { const { availableModels, @@ -20,6 +39,9 @@ export function ModelSelector() { const [searchQuery, setSearchQuery] = useState('') const [providerFilter, setProviderFilter] = useState(null) + const [priceFilter, setPriceFilter] = useState('all') + const [contextFilter, setContextFilter] = useState('all') + const [showFilters, setShowFilters] = useState(false) const searchTerm = searchQuery.trim().toLowerCase() // Extract unique providers @@ -49,9 +71,34 @@ export function ModelSelector() { const matchesProvider = !providerFilter || model.id.startsWith(`${providerFilter}/`) - return matchesSearch && matchesProvider + // Price filter + const promptPrice = parseFloat(model.pricing.prompt) || 0 + const priceRange = PRICE_RANGES.find((r) => r.value === priceFilter) + let matchesPrice = true + if (priceRange && priceFilter !== 'all') { + if (priceRange.max !== undefined && priceRange.min !== undefined) { + matchesPrice = promptPrice >= priceRange.min && promptPrice <= priceRange.max + } else if (priceRange.max !== undefined) { + matchesPrice = promptPrice <= priceRange.max + } else if (priceRange.min !== undefined) { + matchesPrice = promptPrice >= priceRange.min + } + } + + // Context length filter + const contextRange = CONTEXT_RANGES.find((r) => r.value === contextFilter) + const matchesContext = + contextFilter === 'all' || (contextRange?.min !== undefined && model.context_length >= contextRange.min) + + return matchesSearch && matchesProvider && matchesPrice && matchesContext }) - }, [availableModels, searchTerm, providerFilter]) + }, [availableModels, searchTerm, providerFilter, priceFilter, contextFilter]) + + const activeFilterCount = [ + priceFilter !== 'all', + contextFilter !== 'all', + providerFilter !== null, + ].filter(Boolean).length const providersForTags = useMemo(() => { if (providerFilter && !providers.includes(providerFilter)) { @@ -96,6 +143,13 @@ export function ModelSelector() { ) } + const clearAllFilters = () => { + setProviderFilter(null) + setPriceFilter('all') + setContextFilter('all') + setSearchQuery('') + } + return ( @@ -103,16 +157,32 @@ export function ModelSelector() {
Model Selection - {selectedModelIds.length} model{selectedModelIds.length !== 1 ? 's' : ''}{' '} - queued + {filteredModels.length} of {availableModels.length} models + {selectedModelIds.length > 0 && ` • ${selectedModelIds.length} selected`}
- {selectedModelIds.length > 0 && ( - - )} + {selectedModelIds.length > 0 && ( + + )} +
@@ -129,6 +199,54 @@ export function ModelSelector() { + {showFilters && ( +
+
+ + + Filters + + {activeFilterCount > 0 && ( + + )} +
+ +
+ Price (per 1M tokens) +
+ {PRICE_RANGES.map((range) => ( + setPriceFilter(range.value)} + > + {range.label} + + ))} +
+
+ +
+ Context Length +
+ {CONTEXT_RANGES.map((range) => ( + setContextFilter(range.value)} + > + {range.label} + + ))} +
+
+
+ )} +
({ + prompt: tc.prompt, + expectedOutput: tc.expectedOutput, + scoringMethod: tc.scoringMethod, + weight: tc.weight, + category: tc.metadata.category, + difficulty: tc.metadata.difficulty, + tags: tc.metadata.tags, + })) + return JSON.stringify(exportData, null, 2) +} + +function exportToCSV(testCases: TestCase[]): string { + const headers = ['prompt', 'expectedOutput', 'scoringMethod', 'weight', 'category', 'difficulty', 'tags'] + const escapeCSV = (value: string | undefined | null): string => { + if (value === undefined || value === null) return '' + const str = String(value) + if (str.includes(',') || str.includes('"') || str.includes('\n')) { + return `"${str.replace(/"/g, '""')}"` + } + return str + } + + const rows = testCases.map((tc) => [ + escapeCSV(tc.prompt), + escapeCSV(tc.expectedOutput), + tc.scoringMethod, + String(tc.weight), + escapeCSV(tc.metadata.category), + tc.metadata.difficulty || '', + tc.metadata.tags.join(';'), + ]) + + return [headers.join(','), ...rows.map((row) => row.join(','))].join('\n') +} + +function parseJSON(content: string): Omit[] { + const data = JSON.parse(content) + const items = Array.isArray(data) ? data : [data] + + return items.map((item) => ({ + prompt: item.prompt || '', + expectedOutput: item.expectedOutput, + scoringMethod: item.scoringMethod || 'exact-match', + weight: item.weight ?? 1, + metadata: { + category: item.category, + difficulty: item.difficulty, + tags: Array.isArray(item.tags) ? item.tags : [], + }, + })) +} + +function parseCSV(content: string): Omit[] { + const lines = content.split('\n').filter((line) => line.trim()) + if (lines.length < 2) return [] + + const parseCSVLine = (line: string): string[] => { + const result: string[] = [] + let current = '' + let inQuotes = false + + for (let i = 0; i < line.length; i++) { + const char = line[i] + if (char === '"') { + if (inQuotes && line[i + 1] === '"') { + current += '"' + i++ + } else { + inQuotes = !inQuotes + } + } else if (char === ',' && !inQuotes) { + result.push(current) + current = '' + } else { + current += char + } + } + result.push(current) + return result + } + + const headers = parseCSVLine(lines[0]).map((h) => h.trim().toLowerCase()) + const promptIdx = headers.indexOf('prompt') + const expectedIdx = headers.indexOf('expectedoutput') + const scoringIdx = headers.indexOf('scoringmethod') + const weightIdx = headers.indexOf('weight') + const categoryIdx = headers.indexOf('category') + const difficultyIdx = headers.indexOf('difficulty') + const tagsIdx = headers.indexOf('tags') + + return lines.slice(1).map((line) => { + const values = parseCSVLine(line) + return { + prompt: values[promptIdx] || '', + expectedOutput: values[expectedIdx] || undefined, + scoringMethod: (values[scoringIdx] as ScoringMethod) || 'exact-match', + weight: parseFloat(values[weightIdx]) || 1, + metadata: { + category: values[categoryIdx] || undefined, + difficulty: (['easy', 'medium', 'hard'].includes(values[difficultyIdx]) + ? values[difficultyIdx] + : undefined) as 'easy' | 'medium' | 'hard' | undefined, + tags: values[tagsIdx] ? values[tagsIdx].split(';').filter(Boolean) : [], + }, + } + }) +} + export function TestCaseList({ testSuite }: TestCaseListProps) { - const { deleteTestCase } = useTestSuiteStore() + const { deleteTestCase, addTestCases } = useTestSuiteStore() + const { toast } = useToast() + const fileInputRef = useRef(null) const [expandedIds, setExpandedIds] = useState>(new Set()) const [editingTestCase, setEditingTestCase] = useState(null) const [isCreating, setIsCreating] = useState(false) @@ -48,6 +177,77 @@ export function TestCaseList({ testSuite }: TestCaseListProps) { deleteTestCase(testSuite.id, testCaseId) } + const handleExportJSON = () => { + const content = exportToJSON(testSuite.testCases) + const blob = new Blob([content], { type: 'application/json' }) + const url = URL.createObjectURL(blob) + const a = document.createElement('a') + a.href = url + a.download = `${testSuite.name.replace(/[^a-z0-9]/gi, '_')}_test_cases.json` + document.body.appendChild(a) + a.click() + document.body.removeChild(a) + URL.revokeObjectURL(url) + toast({ title: 'Exported', description: `${testSuite.testCases.length} test cases exported to JSON` }) + } + + const handleExportCSV = () => { + const content = exportToCSV(testSuite.testCases) + const blob = new Blob([content], { type: 'text/csv' }) + const url = URL.createObjectURL(blob) + const a = document.createElement('a') + a.href = url + a.download = `${testSuite.name.replace(/[^a-z0-9]/gi, '_')}_test_cases.csv` + document.body.appendChild(a) + a.click() + document.body.removeChild(a) + URL.revokeObjectURL(url) + toast({ title: 'Exported', description: `${testSuite.testCases.length} test cases exported to CSV` }) + } + + const handleImport = (event: React.ChangeEvent) => { + const file = event.target.files?.[0] + if (!file) return + + const reader = new FileReader() + reader.onload = (e) => { + try { + const content = e.target?.result as string + let testCases: Omit[] + + if (file.name.endsWith('.json')) { + testCases = parseJSON(content) + } else if (file.name.endsWith('.csv')) { + testCases = parseCSV(content) + } else { + throw new Error('Unsupported file format. Use JSON or CSV.') + } + + if (testCases.length === 0) { + throw new Error('No valid test cases found in file') + } + + addTestCases(testSuite.id, testCases) + toast({ + title: 'Imported', + description: `${testCases.length} test case${testCases.length !== 1 ? 's' : ''} imported successfully`, + }) + } catch (error) { + toast({ + title: 'Import failed', + description: error instanceof Error ? error.message : 'Failed to parse file', + variant: 'destructive', + }) + } + } + reader.readAsText(file) + + // Reset the input so the same file can be selected again + if (fileInputRef.current) { + fileInputRef.current.value = '' + } + } + const getScoringBadgeVariant = (method: string) => { switch (method) { case 'exact-match': @@ -80,6 +280,13 @@ export function TestCaseList({ testSuite }: TestCaseListProps) { return ( <> +
@@ -90,10 +297,38 @@ export function TestCaseList({ testSuite }: TestCaseListProps) { {testSuite.testCases.length !== 1 ? 's' : ''}
- +
+ + + + + + + + Export as JSON + + + + Export as CSV + + + + + +
diff --git a/src/components/results/ReportSummary.tsx b/src/components/results/ReportSummary.tsx index 8ccb985..e85272f 100644 --- a/src/components/results/ReportSummary.tsx +++ b/src/components/results/ReportSummary.tsx @@ -1,4 +1,4 @@ -import { Clock, CheckCircle, XCircle, AlertCircle } from 'lucide-react' +import { Clock, CheckCircle, XCircle, AlertCircle, DollarSign } from 'lucide-react' import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card' import { Badge } from '@/components/ui/badge' import { Progress } from '@/components/ui/progress' @@ -9,9 +9,18 @@ interface ReportSummaryProps { run: RunResult } +function formatCost(cost: number): string { + if (cost === 0) return '$0.00' + if (cost < 0.0001) return '<$0.0001' + if (cost < 0.01) return `$${cost.toFixed(4)}` + return `$${cost.toFixed(2)}` +} + export function ReportSummary({ run }: ReportSummaryProps) { - const { getAggregateScores } = useRunStore() + const { getAggregateScores, getTotalCost, getAggregateCosts } = useRunStore() const scores = getAggregateScores(run.id) + const totalCost = getTotalCost(run.id) + const modelCosts = getAggregateCosts(run.id) const duration = run.completedAt ? ((run.completedAt - run.startedAt) / 1000).toFixed(1) @@ -59,8 +68,11 @@ export function ReportSummary({ run }: ReportSummaryProps) { const sortedModels = Array.from(scores.entries()).sort((a, b) => b[1] - a[1]) + // Sort models by cost (cheapest first) for display + const sortedByCost = Array.from(modelCosts.entries()).sort((a, b) => a[1] - b[1]) + return ( -
+
@@ -124,7 +136,7 @@ export function ReportSummary({ run }: ReportSummaryProps) { - + Top Model @@ -145,6 +157,25 @@ export function ReportSummary({ run }: ReportSummaryProps) { )} + + + + + + Total Cost + + + +
+ {formatCost(totalCost)} +
+ {sortedByCost.length > 0 && ( +

+ Cheapest: {sortedByCost[0][0].split('/').pop()} ({formatCost(sortedByCost[0][1])}) +

+ )} +
+
) } diff --git a/src/components/ui/dropdown-menu.tsx b/src/components/ui/dropdown-menu.tsx new file mode 100644 index 0000000..2bb7fa4 --- /dev/null +++ b/src/components/ui/dropdown-menu.tsx @@ -0,0 +1,189 @@ +import * as React from "react" +import * as DropdownMenuPrimitive from "@radix-ui/react-dropdown-menu" +import { Check, ChevronRight, Circle } from "lucide-react" +import { cn } from "@/lib/utils" + +const DropdownMenu = DropdownMenuPrimitive.Root +const DropdownMenuTrigger = DropdownMenuPrimitive.Trigger +const DropdownMenuGroup = DropdownMenuPrimitive.Group +const DropdownMenuPortal = DropdownMenuPrimitive.Portal +const DropdownMenuSub = DropdownMenuPrimitive.Sub +const DropdownMenuRadioGroup = DropdownMenuPrimitive.RadioGroup + +const DropdownMenuSubTrigger = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef & { + inset?: boolean + } +>(({ className, inset, children, ...props }, ref) => ( + + {children} + + +)) +DropdownMenuSubTrigger.displayName = DropdownMenuPrimitive.SubTrigger.displayName + +const DropdownMenuSubContent = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => ( + +)) +DropdownMenuSubContent.displayName = DropdownMenuPrimitive.SubContent.displayName + +const DropdownMenuContent = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, sideOffset = 4, ...props }, ref) => ( + + + +)) +DropdownMenuContent.displayName = DropdownMenuPrimitive.Content.displayName + +const DropdownMenuItem = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef & { + inset?: boolean + } +>(({ className, inset, ...props }, ref) => ( + +)) +DropdownMenuItem.displayName = DropdownMenuPrimitive.Item.displayName + +const DropdownMenuCheckboxItem = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, children, checked, ...props }, ref) => ( + + + + + + + {children} + +)) +DropdownMenuCheckboxItem.displayName = DropdownMenuPrimitive.CheckboxItem.displayName + +const DropdownMenuRadioItem = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, children, ...props }, ref) => ( + + + + + + + {children} + +)) +DropdownMenuRadioItem.displayName = DropdownMenuPrimitive.RadioItem.displayName + +const DropdownMenuLabel = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef & { + inset?: boolean + } +>(({ className, inset, ...props }, ref) => ( + +)) +DropdownMenuLabel.displayName = DropdownMenuPrimitive.Label.displayName + +const DropdownMenuSeparator = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => ( + +)) +DropdownMenuSeparator.displayName = DropdownMenuPrimitive.Separator.displayName + +const DropdownMenuShortcut = ({ + className, + ...props +}: React.HTMLAttributes) => { + return ( + + ) +} +DropdownMenuShortcut.displayName = "DropdownMenuShortcut" + +export { + DropdownMenu, + DropdownMenuTrigger, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuCheckboxItem, + DropdownMenuRadioItem, + DropdownMenuLabel, + DropdownMenuSeparator, + DropdownMenuShortcut, + DropdownMenuGroup, + DropdownMenuPortal, + DropdownMenuSub, + DropdownMenuSubContent, + DropdownMenuSubTrigger, + DropdownMenuRadioGroup, +} diff --git a/src/services/execution.ts b/src/services/execution.ts index ee778aa..eb7fc44 100644 --- a/src/services/execution.ts +++ b/src/services/execution.ts @@ -1,8 +1,23 @@ -import { getOpenRouterClient } from './openrouter' +import { getOpenRouterClient, type StreamResult } from './openrouter' import { useRunStore } from '@/stores/runStore' import { useModelStore } from '@/stores/modelStore' import { scoreResponse } from '@/scoring' -import type { TestSuite, TestCaseResult, ChatMessage, ModelParameters } from '@/types' +import type { TestSuite, TestCaseResult, ChatMessage, ModelParameters, OpenRouterModel } from '@/types' + +function calculateCost( + usage: StreamResult['usage'], + model: OpenRouterModel | undefined +): number | undefined { + if (!usage || !model) return undefined + + const promptPrice = parseFloat(model.pricing.prompt) || 0 + const completionPrice = parseFloat(model.pricing.completion) || 0 + + const promptCost = usage.prompt_tokens * promptPrice + const completionCost = usage.completion_tokens * completionPrice + + return promptCost + completionCost +} export async function executeRun( runId: string, @@ -12,7 +27,10 @@ export async function executeRun( ): Promise { const client = getOpenRouterClient(apiKey) const { addResult, updateResult, setResultScore, completeRun } = useRunStore.getState() - const { selectedModelIds, parameters, judgeModelId } = useModelStore.getState() + const { selectedModelIds, parameters, judgeModelId, availableModels } = useModelStore.getState() + + // Create a map for quick model lookup + const modelMap = new Map(availableModels.map(m => [m.id, m])) // Create initial result entries for all test case + model combinations for (const testCase of testSuite.testCases) { @@ -59,7 +77,7 @@ export async function executeRun( content: testCase.prompt, }) - const fullResponse = await generateResponseWithRetries( + const { content: fullResponse, usage } = await generateResponseWithRetries( client, modelId, messages, @@ -71,11 +89,16 @@ export async function executeRun( ) const latencyMs = Date.now() - startTime + const model = modelMap.get(modelId) + const cost = calculateCost(usage, model) updateResult(runId, testCase.id, modelId, { response: fullResponse, status: 'completed', latencyMs, + promptTokens: usage?.prompt_tokens, + completionTokens: usage?.completion_tokens, + cost, }) // Score the response @@ -112,6 +135,11 @@ export async function executeRun( const MAX_EMPTY_RESPONSE_RETRIES = 2 const EMPTY_RESPONSE_BACKOFF_MS = 400 +interface ResponseWithUsage { + content: string + usage?: StreamResult['usage'] +} + async function generateResponseWithRetries( client: ReturnType, modelId: string, @@ -121,47 +149,49 @@ async function generateResponseWithRetries( testCaseId: string, signal: AbortSignal, updateResult: (runId: string, testCaseId: string, modelId: string, updates: Partial) => void -): Promise { +): Promise { for (let attempt = 0; attempt <= MAX_EMPTY_RESPONSE_RETRIES; attempt++) { - let fullResponse = '' updateResult(runId, testCaseId, modelId, { streamedContent: '' }) - const stream = client.createChatCompletionStream({ - model: modelId, - messages, - temperature: parameters.temperature, - top_p: parameters.topP, - max_tokens: parameters.maxTokens, - frequency_penalty: parameters.frequencyPenalty, - presence_penalty: parameters.presencePenalty, - }) + if (signal.aborted) { + throw new DOMException('Aborted', 'AbortError') + } - for await (const chunk of stream) { - if (signal.aborted) { - throw new DOMException('Aborted', 'AbortError') + let streamedContent = '' + const result = await client.createChatCompletionStreamWithUsage( + { + model: modelId, + messages, + temperature: parameters.temperature, + top_p: parameters.topP, + max_tokens: parameters.maxTokens, + frequency_penalty: parameters.frequencyPenalty, + presence_penalty: parameters.presencePenalty, + }, + (chunk) => { + streamedContent += chunk + updateResult(runId, testCaseId, modelId, { + streamedContent, + }) } - fullResponse += chunk - updateResult(runId, testCaseId, modelId, { - streamedContent: fullResponse, - }) - } + ) - if (fullResponse.trim().length > 0) { - return fullResponse + if (result.content.trim().length > 0) { + return result } - const fallbackResponse = await fetchNonStreamingResponse( + const fallbackResult = await fetchNonStreamingResponse( client, modelId, messages, parameters, signal ) - if (fallbackResponse.trim().length > 0) { + if (fallbackResult.content.trim().length > 0) { updateResult(runId, testCaseId, modelId, { - streamedContent: fallbackResponse, + streamedContent: fallbackResult.content, }) - return fallbackResponse + return fallbackResult } if (attempt < MAX_EMPTY_RESPONSE_RETRIES) { @@ -172,7 +202,7 @@ async function generateResponseWithRetries( } } - return '' + return { content: '' } } async function fetchNonStreamingResponse( @@ -181,7 +211,7 @@ async function fetchNonStreamingResponse( messages: ChatMessage[], parameters: ModelParameters, signal: AbortSignal -): Promise { +): Promise { if (signal.aborted) { throw new DOMException('Aborted', 'AbortError') } @@ -198,9 +228,12 @@ async function fetchNonStreamingResponse( }) const content = completion.choices?.[0]?.message?.content - return typeof content === 'string' ? content : '' + return { + content: typeof content === 'string' ? content : '', + usage: completion.usage, + } } catch { - return '' + return { content: '' } } } diff --git a/src/services/openrouter.ts b/src/services/openrouter.ts index 6da9a8d..02d83e0 100644 --- a/src/services/openrouter.ts +++ b/src/services/openrouter.ts @@ -4,6 +4,15 @@ import type { ChatCompletionResponse, } from '@/types' +export interface StreamResult { + content: string + usage?: { + prompt_tokens: number + completion_tokens: number + total_tokens: number + } +} + const OPENROUTER_API_URL = 'https://openrouter.ai/api/v1' export class OpenRouterClient { @@ -152,6 +161,92 @@ export class OpenRouterClient { return false } } + + async createChatCompletionStreamWithUsage( + request: ChatCompletionRequest, + onChunk?: (content: string) => void + ): Promise { + const response = await fetch(`${OPENROUTER_API_URL}/chat/completions`, { + method: 'POST', + headers: this.getHeaders(), + body: JSON.stringify({ ...request, stream: true }), + }) + + if (!response.ok) { + const error = await response.json().catch(() => ({})) + throw new Error(error.error?.message || `API request failed: ${response.status}`) + } + + const reader = response.body?.getReader() + if (!reader) { + throw new Error('Failed to get response reader') + } + + const decoder = new TextDecoder() + let buffer = '' + let fullContent = '' + let usage: StreamResult['usage'] | undefined + + try { + while (true) { + const { done, value } = await reader.read() + if (done) break + + const chunk = decoder.decode(value, { stream: true }) + buffer += chunk + const lines = buffer.split('\n') + buffer = lines.pop() || '' + + for (const line of lines) { + const trimmed = line.trim() + if (!trimmed || !trimmed.startsWith('data:')) continue + if (trimmed === 'data: [DONE]') continue + if (!trimmed.startsWith('data: ')) continue + + try { + const json = JSON.parse(trimmed.slice(6)) + const content = + json.choices?.[0]?.delta?.content ?? + json.choices?.[0]?.message?.content + if (content) { + fullContent += content + onChunk?.(content) + } + // Capture usage if present (usually in final chunk) + if (json.usage) { + usage = json.usage + } + } catch { + // Skip invalid JSON + } + } + } + + // Process remaining buffer + const tail = buffer.trim() + if (tail.startsWith('data: ') && tail !== 'data: [DONE]') { + try { + const json = JSON.parse(tail.slice(6)) + const content = + json.choices?.[0]?.delta?.content ?? + json.choices?.[0]?.message?.content + if (content) { + fullContent += content + onChunk?.(content) + } + if (json.usage) { + usage = json.usage + } + } catch { + // Skip invalid JSON + } + } + + return { content: fullContent, usage } + } finally { + reader.releaseLock() + } + } } // Singleton instance management diff --git a/src/stores/runStore.ts b/src/stores/runStore.ts index 6f9c062..1e37ef1 100644 --- a/src/stores/runStore.ts +++ b/src/stores/runStore.ts @@ -25,6 +25,8 @@ interface RunState { getResultsForTestCase: (runId: string, testCaseId: string) => TestCaseResult[] getResultsForModel: (runId: string, modelId: string) => TestCaseResult[] getAggregateScores: (runId: string) => Map + getAggregateCosts: (runId: string) => Map + getTotalCost: (runId: string) => number } function generateId(): string { @@ -178,4 +180,29 @@ export const useRunStore = create()((set, get) => ({ return scores }, + + getAggregateCosts: (runId) => { + const run = get().runs.find((r) => r.id === runId) + const costs = new Map() + + if (!run) return costs + + for (const result of run.results) { + if (result.cost !== undefined) { + const existing = costs.get(result.modelId) || 0 + costs.set(result.modelId, existing + result.cost) + } + } + + return costs + }, + + getTotalCost: (runId) => { + const run = get().runs.find((r) => r.id === runId) + if (!run) return 0 + + return run.results.reduce((total, result) => { + return total + (result.cost || 0) + }, 0) + }, })) diff --git a/src/stores/testSuiteStore.ts b/src/stores/testSuiteStore.ts index e1b7c13..a332eb6 100644 --- a/src/stores/testSuiteStore.ts +++ b/src/stores/testSuiteStore.ts @@ -18,6 +18,7 @@ interface TestSuiteState { // Test Case Actions addTestCase: (testSuiteId: string, testCase: Omit) => TestCase + addTestCases: (testSuiteId: string, testCases: Omit[]) => TestCase[] updateTestCase: (testSuiteId: string, testCaseId: string, updates: Partial>) => void deleteTestCase: (testSuiteId: string, testCaseId: string) => void reorderTestCases: (testSuiteId: string, testCaseIds: string[]) => void @@ -113,6 +114,25 @@ export const useTestSuiteStore = create()((set, get) => ({ return newTestCase }, + addTestCases: (testSuiteId, testCasesData) => { + const newTestCases: TestCase[] = testCasesData.map((tc) => ({ + id: generateId(), + ...tc, + })) + set((state) => ({ + testSuites: state.testSuites.map((suite) => + suite.id === testSuiteId + ? { + ...suite, + testCases: [...suite.testCases, ...newTestCases], + updatedAt: Date.now(), + } + : suite + ), + })) + return newTestCases + }, + updateTestCase: (testSuiteId, testCaseId, updates) => { set((state) => ({ testSuites: state.testSuites.map((suite) => diff --git a/src/types/index.ts b/src/types/index.ts index 862ca8f..22aeb25 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -83,6 +83,9 @@ export interface TestCaseResult { modelId: string response: string tokenCount?: number + promptTokens?: number + completionTokens?: number + cost?: number // Cost in USD latencyMs?: number status: ExecutionStatus error?: string