From bf2daf73d72839e823a5256fc33e10048e79d96d Mon Sep 17 00:00:00 2001 From: oshtz Date: Mon, 22 Dec 2025 17:00:33 +0200 Subject: [PATCH 1/3] feat(tabs): add Analytics tab and update layout for improved UI Added a new "Analytics" tab to the MainTabs component, including its corresponding icon and content. Adjusted the layout to accommodate the new tab by increasing the grid column count from 4 to 5, enhancing the overall user interface and functionality. --- src/components/analytics/Analytics.tsx | 561 ++++++++++++++++++++++ src/components/layout/MainTabs.tsx | 12 +- src/services/analytics.ts | 638 +++++++++++++++++++++++++ 3 files changed, 1209 insertions(+), 2 deletions(-) create mode 100644 src/components/analytics/Analytics.tsx create mode 100644 src/services/analytics.ts diff --git a/src/components/analytics/Analytics.tsx b/src/components/analytics/Analytics.tsx new file mode 100644 index 0000000..92c3b43 --- /dev/null +++ b/src/components/analytics/Analytics.tsx @@ -0,0 +1,561 @@ +import { useMemo, useState } from 'react' +import { + Trophy, + TrendingUp, + Zap, + Target, + BarChart3, + Clock, + Award, + Flame, + ChevronDown, + ChevronUp, + Sparkles, + Medal, + Activity, +} from 'lucide-react' +import { Card, CardContent, CardHeader, CardTitle, CardDescription } from '@/components/ui/card' +import { Badge } from '@/components/ui/badge' +import { Progress } from '@/components/ui/progress' +import { Button } from '@/components/ui/button' +import { EmptyState } from '@/components/ui/empty-state' +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from '@/components/ui/select' +import { useRunStore } from '@/stores/runStore' +import { useTestSuiteStore } from '@/stores/testSuiteStore' +import { computeAnalytics, type AnalyticsData, type InterestingFact } from '@/services/analytics' + +function FactIcon({ type }: { type: InterestingFact['type'] }) { + switch (type) { + case 'improvement': + return + case 'streak': + return + case 'record': + return + case 'comparison': + return + case 'insight': + return + default: + return + } +} + +function getRankBadge(rank: number) { + switch (rank) { + case 1: + return + case 2: + return + case 3: + return + default: + return #{rank} + } +} + +function OverallStats({ analytics }: { analytics: AnalyticsData }) { + return ( +
+ + + + + Total Runs + + + +
{analytics.totalRuns}
+

+ Completed benchmarks +

+
+
+ + + + + + Total Tests + + + +
{analytics.totalTests}
+

+ Individual evaluations +

+
+
+ + + + + + Models Tested + + + +
{analytics.totalModels}
+

+ Unique models evaluated +

+
+
+ + + + + + Average Score + + + +
+ {(analytics.avgScoreOverall * 100).toFixed(1)}% +
+

+ Across all models +

+
+
+
+ ) +} + +function InterestingFacts({ facts }: { facts: InterestingFact[] }) { + const [showAll, setShowAll] = useState(false) + const displayedFacts = showAll ? facts : facts.slice(0, 4) + + if (facts.length === 0) return null + + return ( + + + + + Interesting Facts + + + Insights and highlights from your benchmark data + + + +
+ {displayedFacts.map((fact, index) => ( +
+
+ +
+
+
+ {fact.title} + {fact.value && ( + + {fact.value} + + )} +
+

+ {fact.description} +

+
+
+ ))} +
+ {facts.length > 4 && ( + + )} +
+
+ ) +} + +function Leaderboard({ analytics }: { analytics: AnalyticsData }) { + const [selectedCategory, setSelectedCategory] = useState('overall') + + const categories = useMemo(() => { + const cats = ['overall', ...Array.from(analytics.categoryLeaderboards.keys())] + return cats + }, [analytics.categoryLeaderboards]) + + const leaderboard = useMemo(() => { + if (selectedCategory === 'overall') { + return analytics.overallLeaderboard + } + return analytics.categoryLeaderboards.get(selectedCategory) || [] + }, [selectedCategory, analytics]) + + if (analytics.overallLeaderboard.length === 0) { + return null + } + + return ( + + +
+
+ + + Model Leaderboard + + + Rankings based on average benchmark scores + +
+ {categories.length > 1 && ( + + )} +
+
+ +
+ {leaderboard.map((entry) => ( +
+
+ {getRankBadge(entry.rank)} +
+
+
{entry.modelName}
+
+ {entry.totalTests} tests + {entry.winRate > 0 && ( + + {(entry.winRate * 100).toFixed(0)}% win rate + + )} + {entry.consistency > 0 && ( + ±{(entry.consistency * 100).toFixed(1)}% variance + )} +
+
+
+
+ {(entry.avgScore * 100).toFixed(1)}% +
+ +
+
+ ))} +
+
+
+ ) +} + +function DifficultyBreakdown({ analytics }: { analytics: AnalyticsData }) { + if (analytics.difficultyStats.length === 0) return null + + const difficultyColors = { + easy: 'text-emerald-500 bg-emerald-500/10', + medium: 'text-yellow-500 bg-yellow-500/10', + hard: 'text-rose-500 bg-rose-500/10', + } + + return ( + + + + + Performance by Difficulty + + + How models perform across different difficulty levels + + + +
+ {analytics.difficultyStats.map((stat) => ( +
+
+ + {stat.difficulty} + + + {stat.totalTests} tests + +
+
+ {(stat.avgScore * 100).toFixed(1)}% +
+
+ Top: + {stat.topModel} + ({(stat.topModelScore * 100).toFixed(0)}%) +
+
+ ))} +
+
+
+ ) +} + +function ModelPerformanceDetails({ analytics }: { analytics: AnalyticsData }) { + const [expanded, setExpanded] = useState(false) + const modelStats = Array.from(analytics.modelStats.values()) + .sort((a, b) => b.avgScore - a.avgScore) + + if (modelStats.length === 0) return null + + const displayedModels = expanded ? modelStats : modelStats.slice(0, 5) + + return ( + + + + + Detailed Model Statistics + + + Comprehensive performance metrics for each model + + + +
+ + + + + + + + + + + + + + {displayedModels.map((stat) => ( + + + + + + + + + + ))} + +
ModelAvg ScoreTestsRunsAvg LatencySuccess RateWins
+ {stat.modelName} + + + {(stat.avgScore * 100).toFixed(1)}% + + + {stat.totalTests} + + {stat.totalRuns} + + {stat.avgLatency > 0 ? `${stat.avgLatency.toFixed(0)}ms` : '-'} + + = 0.9 ? 'text-emerald-600' : 'text-yellow-600'}> + {(stat.successRate * 100).toFixed(0)}% + + + {stat.winCount > 0 && ( + + {stat.winCount} + + )} +
+
+ {modelStats.length > 5 && ( + + )} +
+
+ ) +} + +function TimelineChart({ analytics }: { analytics: AnalyticsData }) { + if (analytics.timeSeriesData.length < 2) return null + + const maxScore = Math.max(...analytics.timeSeriesData.map(d => d.avgScore)) + const minScore = Math.min(...analytics.timeSeriesData.map(d => d.avgScore)) + const range = maxScore - minScore || 0.1 + + return ( + + + + + Performance Over Time + + + Average scores across benchmark runs + + + +
+ {analytics.timeSeriesData.map((point, index) => { + const height = ((point.avgScore - minScore) / range) * 100 + return ( +
+
+
+
+
{point.date}
+
+ Avg: {(point.avgScore * 100).toFixed(1)}% +
+
+
+
+ ) + })} +
+
+ {analytics.timeSeriesData[0]?.date} + {analytics.timeSeriesData[analytics.timeSeriesData.length - 1]?.date} +
+ + + ) +} + +export function Analytics() { + const { runs } = useRunStore() + const { testSuites } = useTestSuiteStore() + + const analytics = useMemo(() => { + return computeAnalytics(runs, testSuites) + }, [runs, testSuites]) + + if (runs.length === 0 || analytics.totalRuns === 0) { + return ( + s.testCases.length > 0), + }, + { + number: 2, + title: 'Run benchmarks', + description: 'Execute benchmarks with multiple models in the Arena', + completed: false, + }, + { + number: 3, + title: 'View insights', + description: 'Analytics will appear here after completing runs', + completed: false, + }, + ]} + /> + ) + } + + return ( +
+
+

Analytics

+

+ Insights and leaderboards from all your benchmark data + {analytics.dateRange && ( + + ({new Date(analytics.dateRange.start).toLocaleDateString()} - {new Date(analytics.dateRange.end).toLocaleDateString()}) + + )} +

+
+ + + + + +
+ + +
+ + + + +
+ ) +} diff --git a/src/components/layout/MainTabs.tsx b/src/components/layout/MainTabs.tsx index 39cf9bb..30294fb 100644 --- a/src/components/layout/MainTabs.tsx +++ b/src/components/layout/MainTabs.tsx @@ -1,21 +1,23 @@ import { TabsContent, TabsList, TabsTrigger } from '@/components/ui/tabs' import { cn } from '@/lib/utils' -import { BarChart3, Database, FileText, Play } from 'lucide-react' +import { BarChart3, Database, FileText, Play, PieChart } from 'lucide-react' import { PromptManager } from '@/components/prompt-manager/PromptManager' import { Arena } from '@/components/arena/Arena' import { Results } from '@/components/results/Results' +import { Analytics } from '@/components/analytics/Analytics' import { DataManager } from '@/components/data/DataManager' const tabs = [ { value: 'prompts', label: 'Prompts', Icon: FileText }, { value: 'arena', label: 'Arena', Icon: Play }, { value: 'results', label: 'Results', Icon: BarChart3 }, + { value: 'analytics', label: 'Analytics', Icon: PieChart }, { value: 'data', label: 'Data', Icon: Database }, ] export function MainTabsList({ className }: { className?: string }) { return ( - + {tabs.map(({ value, label, Icon }) => ( @@ -47,6 +49,12 @@ export function MainTabs() {
+ +
+ +
+
+
diff --git a/src/services/analytics.ts b/src/services/analytics.ts new file mode 100644 index 0000000..d755792 --- /dev/null +++ b/src/services/analytics.ts @@ -0,0 +1,638 @@ +import type { RunResult, TestCaseResult, TestSuite, TestCase } from '@/types' + +// Types for analytics data +export interface ModelStats { + modelId: string + modelName: string + totalRuns: number + totalTests: number + avgScore: number + avgLatency: number + avgTokens: number + successRate: number + winCount: number + scores: number[] +} + +export interface CategoryStats { + category: string + totalTests: number + avgScore: number + topModel: string + topModelScore: number + modelScores: Map +} + +export interface DifficultyStats { + difficulty: 'easy' | 'medium' | 'hard' + totalTests: number + avgScore: number + topModel: string + topModelScore: number +} + +export interface TimeSeriesDataPoint { + date: string + timestamp: number + modelScores: Map + avgScore: number +} + +export interface OverallLeaderboard { + rank: number + modelId: string + modelName: string + avgScore: number + totalTests: number + winRate: number + consistency: number // standard deviation of scores (lower is better) +} + +export interface InterestingFact { + type: 'improvement' | 'streak' | 'record' | 'comparison' | 'insight' + title: string + description: string + value?: string | number + icon?: string +} + +export interface AnalyticsData { + overallLeaderboard: OverallLeaderboard[] + categoryLeaderboards: Map + difficultyStats: DifficultyStats[] + modelStats: Map + timeSeriesData: TimeSeriesDataPoint[] + interestingFacts: InterestingFact[] + totalRuns: number + totalTests: number + totalModels: number + avgScoreOverall: number + dateRange: { start: number; end: number } | null +} + +// Helper to extract model name from full ID +function getModelName(modelId: string): string { + const parts = modelId.split('/') + return parts[parts.length - 1] || modelId +} + +// Calculate standard deviation +function calculateStdDev(values: number[]): number { + if (values.length === 0) return 0 + const mean = values.reduce((a, b) => a + b, 0) / values.length + const squaredDiffs = values.map(v => Math.pow(v - mean, 2)) + return Math.sqrt(squaredDiffs.reduce((a, b) => a + b, 0) / values.length) +} + +// Build model statistics from all runs +function buildModelStats(runs: RunResult[]): Map { + const stats = new Map() + const modelWins = new Map() + + // Count wins per run + for (const run of runs) { + if (run.status !== 'completed') continue + + const runScores = new Map() + + for (const result of run.results) { + if (result.score) { + const existing = runScores.get(result.modelId) || { total: 0, count: 0 } + runScores.set(result.modelId, { + total: existing.total + result.score.score, + count: existing.count + 1, + }) + } + } + + // Find winner of this run + let maxScore = -1 + let winner = '' + for (const [modelId, { total, count }] of runScores) { + const avg = count > 0 ? total / count : 0 + if (avg > maxScore) { + maxScore = avg + winner = modelId + } + } + if (winner) { + modelWins.set(winner, (modelWins.get(winner) || 0) + 1) + } + } + + // Build comprehensive stats + for (const run of runs) { + if (run.status !== 'completed') continue + + for (const result of run.results) { + let stat = stats.get(result.modelId) + if (!stat) { + stat = { + modelId: result.modelId, + modelName: getModelName(result.modelId), + totalRuns: 0, + totalTests: 0, + avgScore: 0, + avgLatency: 0, + avgTokens: 0, + successRate: 0, + winCount: modelWins.get(result.modelId) || 0, + scores: [], + } + stats.set(result.modelId, stat) + } + + stat.totalTests++ + if (result.score) { + stat.scores.push(result.score.score) + } + if (result.latencyMs) { + stat.avgLatency = (stat.avgLatency * (stat.totalTests - 1) + result.latencyMs) / stat.totalTests + } + if (result.tokenCount) { + stat.avgTokens = (stat.avgTokens * (stat.totalTests - 1) + result.tokenCount) / stat.totalTests + } + if (result.status === 'completed') { + stat.successRate = (stat.successRate * (stat.totalTests - 1) + 1) / stat.totalTests + } + } + } + + // Calculate average scores + for (const stat of stats.values()) { + if (stat.scores.length > 0) { + stat.avgScore = stat.scores.reduce((a, b) => a + b, 0) / stat.scores.length + } + // Count unique runs + const runsWithModel = new Set( + runs.filter(r => r.models.includes(stat.modelId)).map(r => r.id) + ) + stat.totalRuns = runsWithModel.size + } + + return stats +} + +// Build category-based statistics +function buildCategoryStats( + runs: RunResult[], + testSuites: TestSuite[] +): Map { + const categoryStats = new Map() + + // Create a map of test case IDs to their categories + const testCaseCategories = new Map() + for (const suite of testSuites) { + for (const testCase of suite.testCases) { + if (testCase.metadata.category) { + testCaseCategories.set(testCase.id, testCase.metadata.category) + } + } + } + + // Aggregate scores by category and model + const categoryModelScores = new Map>() + + for (const run of runs) { + if (run.status !== 'completed') continue + + for (const result of run.results) { + const category = testCaseCategories.get(result.testCaseId) + if (!category || !result.score) continue + + if (!categoryModelScores.has(category)) { + categoryModelScores.set(category, new Map()) + } + const modelScores = categoryModelScores.get(category)! + const existing = modelScores.get(result.modelId) || { total: 0, count: 0 } + modelScores.set(result.modelId, { + total: existing.total + result.score.score, + count: existing.count + 1, + }) + } + } + + // Build category stats + for (const [category, modelScores] of categoryModelScores) { + let totalTests = 0 + let totalScore = 0 + let topModel = '' + let topModelScore = 0 + const avgModelScores = new Map() + + for (const [modelId, { total, count }] of modelScores) { + totalTests += count + totalScore += total + const avg = count > 0 ? total / count : 0 + avgModelScores.set(modelId, avg) + if (avg > topModelScore) { + topModelScore = avg + topModel = modelId + } + } + + categoryStats.set(category, { + category, + totalTests, + avgScore: totalTests > 0 ? totalScore / totalTests : 0, + topModel: getModelName(topModel), + topModelScore, + modelScores: avgModelScores, + }) + } + + return categoryStats +} + +// Build difficulty-based statistics +function buildDifficultyStats( + runs: RunResult[], + testSuites: TestSuite[] +): DifficultyStats[] { + const difficultyScores = new Map>() + + // Create a map of test case IDs to their difficulty + const testCaseDifficulty = new Map() + for (const suite of testSuites) { + for (const testCase of suite.testCases) { + if (testCase.metadata.difficulty) { + testCaseDifficulty.set(testCase.id, testCase.metadata.difficulty) + } + } + } + + for (const run of runs) { + if (run.status !== 'completed') continue + + for (const result of run.results) { + const difficulty = testCaseDifficulty.get(result.testCaseId) + if (!difficulty || !result.score) continue + + if (!difficultyScores.has(difficulty)) { + difficultyScores.set(difficulty, new Map()) + } + const modelScores = difficultyScores.get(difficulty)! + const existing = modelScores.get(result.modelId) || { total: 0, count: 0 } + modelScores.set(result.modelId, { + total: existing.total + result.score.score, + count: existing.count + 1, + }) + } + } + + const stats: DifficultyStats[] = [] + for (const difficulty of ['easy', 'medium', 'hard'] as const) { + const modelScores = difficultyScores.get(difficulty) + if (!modelScores) continue + + let totalTests = 0 + let totalScore = 0 + let topModel = '' + let topModelScore = 0 + + for (const [modelId, { total, count }] of modelScores) { + totalTests += count + totalScore += total + const avg = count > 0 ? total / count : 0 + if (avg > topModelScore) { + topModelScore = avg + topModel = modelId + } + } + + stats.push({ + difficulty, + totalTests, + avgScore: totalTests > 0 ? totalScore / totalTests : 0, + topModel: getModelName(topModel), + topModelScore, + }) + } + + return stats +} + +// Build time series data for trend analysis +function buildTimeSeriesData(runs: RunResult[]): TimeSeriesDataPoint[] { + const completedRuns = runs + .filter(r => r.status === 'completed' && r.completedAt) + .sort((a, b) => a.startedAt - b.startedAt) + + const dataPoints: TimeSeriesDataPoint[] = [] + + for (const run of completedRuns) { + const modelScores = new Map() + const scoresByModel = new Map() + + for (const result of run.results) { + if (result.score) { + const existing = scoresByModel.get(result.modelId) || { total: 0, count: 0 } + scoresByModel.set(result.modelId, { + total: existing.total + result.score.score, + count: existing.count + 1, + }) + } + } + + let totalScore = 0 + let totalCount = 0 + for (const [modelId, { total, count }] of scoresByModel) { + modelScores.set(modelId, count > 0 ? total / count : 0) + totalScore += total + totalCount += count + } + + dataPoints.push({ + date: new Date(run.startedAt).toLocaleDateString(), + timestamp: run.startedAt, + modelScores, + avgScore: totalCount > 0 ? totalScore / totalCount : 0, + }) + } + + return dataPoints +} + +// Build overall leaderboard +function buildOverallLeaderboard(modelStats: Map): OverallLeaderboard[] { + const leaderboard: OverallLeaderboard[] = [] + const totalWins = Array.from(modelStats.values()).reduce((sum, s) => sum + s.winCount, 0) + + for (const stat of modelStats.values()) { + const consistency = calculateStdDev(stat.scores) + leaderboard.push({ + rank: 0, + modelId: stat.modelId, + modelName: stat.modelName, + avgScore: stat.avgScore, + totalTests: stat.totalTests, + winRate: totalWins > 0 ? stat.winCount / totalWins : 0, + consistency, + }) + } + + // Sort by average score descending + leaderboard.sort((a, b) => b.avgScore - a.avgScore) + + // Assign ranks + leaderboard.forEach((entry, index) => { + entry.rank = index + 1 + }) + + return leaderboard +} + +// Build category leaderboards +function buildCategoryLeaderboards( + categoryStats: Map, + modelStats: Map +): Map { + const leaderboards = new Map() + + for (const [category, stats] of categoryStats) { + const leaderboard: OverallLeaderboard[] = [] + + for (const [modelId, score] of stats.modelScores) { + const modelStat = modelStats.get(modelId) + leaderboard.push({ + rank: 0, + modelId, + modelName: getModelName(modelId), + avgScore: score, + totalTests: modelStat?.totalTests || 0, + winRate: 0, + consistency: modelStat ? calculateStdDev(modelStat.scores) : 0, + }) + } + + leaderboard.sort((a, b) => b.avgScore - a.avgScore) + leaderboard.forEach((entry, index) => { + entry.rank = index + 1 + }) + + leaderboards.set(category, leaderboard) + } + + return leaderboards +} + +// Generate interesting facts from the data +function generateInterestingFacts( + runs: RunResult[], + modelStats: Map, + timeSeriesData: TimeSeriesDataPoint[], + overallLeaderboard: OverallLeaderboard[] +): InterestingFact[] { + const facts: InterestingFact[] = [] + + // Fact: Most consistent model + if (overallLeaderboard.length > 0) { + const mostConsistent = [...overallLeaderboard].sort((a, b) => a.consistency - b.consistency)[0] + if (mostConsistent.consistency < 0.2) { + facts.push({ + type: 'insight', + title: 'Most Consistent Performer', + description: `${mostConsistent.modelName} shows the most consistent performance with minimal score variation.`, + value: `±${(mostConsistent.consistency * 100).toFixed(1)}%`, + icon: '🎯', + }) + } + } + + // Fact: Biggest improver over time + if (timeSeriesData.length >= 3) { + const modelImprovements = new Map() + const firstHalf = timeSeriesData.slice(0, Math.floor(timeSeriesData.length / 2)) + const secondHalf = timeSeriesData.slice(Math.floor(timeSeriesData.length / 2)) + + const allModels = new Set() + timeSeriesData.forEach(dp => dp.modelScores.forEach((_, m) => allModels.add(m))) + + for (const modelId of allModels) { + const firstScores = firstHalf.filter(dp => dp.modelScores.has(modelId)).map(dp => dp.modelScores.get(modelId)!) + const secondScores = secondHalf.filter(dp => dp.modelScores.has(modelId)).map(dp => dp.modelScores.get(modelId)!) + + if (firstScores.length > 0 && secondScores.length > 0) { + const firstAvg = firstScores.reduce((a, b) => a + b, 0) / firstScores.length + const secondAvg = secondScores.reduce((a, b) => a + b, 0) / secondScores.length + modelImprovements.set(modelId, secondAvg - firstAvg) + } + } + + const biggestImprover = [...modelImprovements.entries()].sort((a, b) => b[1] - a[1])[0] + if (biggestImprover && biggestImprover[1] > 0.05) { + facts.push({ + type: 'improvement', + title: 'Rising Star', + description: `${getModelName(biggestImprover[0])} has shown significant improvement over recent benchmarks.`, + value: `+${(biggestImprover[1] * 100).toFixed(1)}%`, + icon: '📈', + }) + } + } + + // Fact: Speed champion + const speedStats = Array.from(modelStats.values()).filter(s => s.avgLatency > 0) + if (speedStats.length > 0) { + const fastest = speedStats.sort((a, b) => a.avgLatency - b.avgLatency)[0] + facts.push({ + type: 'record', + title: 'Speed Champion', + description: `${fastest.modelName} has the fastest average response time.`, + value: `${fastest.avgLatency.toFixed(0)}ms`, + icon: '⚡', + }) + } + + // Fact: Most tested model + const mostTested = Array.from(modelStats.values()).sort((a, b) => b.totalTests - a.totalTests)[0] + if (mostTested && mostTested.totalTests > 10) { + facts.push({ + type: 'insight', + title: 'Most Tested', + description: `${mostTested.modelName} has been evaluated the most across all benchmarks.`, + value: `${mostTested.totalTests} tests`, + icon: '🔬', + }) + } + + // Fact: Perfect scores + const perfectScoreModels = Array.from(modelStats.values()).filter(s => + s.scores.some(score => score === 1) + ) + if (perfectScoreModels.length > 0) { + const perfectCounts = perfectScoreModels.map(s => ({ + model: s.modelName, + count: s.scores.filter(score => score === 1).length, + })).sort((a, b) => b.count - a.count) + + if (perfectCounts[0].count > 0) { + facts.push({ + type: 'record', + title: 'Perfect Score Leader', + description: `${perfectCounts[0].model} has achieved the most perfect scores.`, + value: `${perfectCounts[0].count} perfect`, + icon: '🏆', + }) + } + } + + // Fact: Head-to-head comparison + if (overallLeaderboard.length >= 2) { + const top2 = overallLeaderboard.slice(0, 2) + const scoreDiff = top2[0].avgScore - top2[1].avgScore + if (scoreDiff < 0.05) { + facts.push({ + type: 'comparison', + title: 'Close Competition', + description: `${top2[0].modelName} and ${top2[1].modelName} are in a tight race for the top spot.`, + value: `${(scoreDiff * 100).toFixed(1)}% gap`, + icon: '🏁', + }) + } + } + + // Fact: Total benchmarking stats + const completedRuns = runs.filter(r => r.status === 'completed') + if (completedRuns.length > 0) { + const totalDuration = completedRuns.reduce((sum, r) => + sum + (r.completedAt ? r.completedAt - r.startedAt : 0), 0 + ) + if (totalDuration > 60000) { + facts.push({ + type: 'insight', + title: 'Benchmarking Time', + description: 'Total time spent running benchmarks across all sessions.', + value: `${(totalDuration / 60000).toFixed(1)} min`, + icon: '⏱️', + }) + } + } + + // Fact: Win streak + if (timeSeriesData.length >= 3) { + const modelWinStreaks = new Map() + let currentStreak = new Map() + + for (const dp of timeSeriesData) { + let maxScore = -1 + let winner = '' + for (const [modelId, score] of dp.modelScores) { + if (score > maxScore) { + maxScore = score + winner = modelId + } + } + if (winner) { + const streak = (currentStreak.get(winner) || 0) + 1 + currentStreak.set(winner, streak) + // Reset other streaks + for (const modelId of currentStreak.keys()) { + if (modelId !== winner) { + currentStreak.set(modelId, 0) + } + } + // Track max streak + const maxStreak = modelWinStreaks.get(winner) || 0 + if (streak > maxStreak) { + modelWinStreaks.set(winner, streak) + } + } + } + + const longestStreak = [...modelWinStreaks.entries()].sort((a, b) => b[1] - a[1])[0] + if (longestStreak && longestStreak[1] >= 3) { + facts.push({ + type: 'streak', + title: 'Winning Streak', + description: `${getModelName(longestStreak[0])} achieved the longest consecutive win streak.`, + value: `${longestStreak[1]} wins`, + icon: '🔥', + }) + } + } + + return facts +} + +// Main analytics computation function +export function computeAnalytics( + runs: RunResult[], + testSuites: TestSuite[] +): AnalyticsData { + const completedRuns = runs.filter(r => r.status === 'completed') + + // Build all statistics + const modelStats = buildModelStats(runs) + const categoryStats = buildCategoryStats(runs, testSuites) + const difficultyStats = buildDifficultyStats(runs, testSuites) + const timeSeriesData = buildTimeSeriesData(runs) + const overallLeaderboard = buildOverallLeaderboard(modelStats) + const categoryLeaderboards = buildCategoryLeaderboards(categoryStats, modelStats) + const interestingFacts = generateInterestingFacts(runs, modelStats, timeSeriesData, overallLeaderboard) + + // Calculate overall stats + const allScores = Array.from(modelStats.values()).flatMap(s => s.scores) + const avgScoreOverall = allScores.length > 0 + ? allScores.reduce((a, b) => a + b, 0) / allScores.length + : 0 + + const timestamps = completedRuns.map(r => r.startedAt) + const dateRange = timestamps.length > 0 + ? { start: Math.min(...timestamps), end: Math.max(...timestamps) } + : null + + return { + overallLeaderboard, + categoryLeaderboards, + difficultyStats, + modelStats, + timeSeriesData, + interestingFacts, + totalRuns: completedRuns.length, + totalTests: allScores.length, + totalModels: modelStats.size, + avgScoreOverall, + dateRange, + } +} From 38986db7650cbba070a5db2a7ab40fe630ba6c64 Mon Sep 17 00:00:00 2001 From: oshtz Date: Mon, 22 Dec 2025 17:10:53 +0200 Subject: [PATCH 2/3] feat: update component styles and icons for improved UI consistency - Changed width of SelectTrigger in Leaderboard component for better layout. - Adjusted min and max width of div in TimelineChart for responsive design. - Replaced BarChart3 with BarChart and PieChart with LineChart in MainTabs for accurate representation of data. - Increased icon size in MainTabs for better visibility. --- src/components/analytics/Analytics.tsx | 4 ++-- src/components/layout/MainTabs.tsx | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/components/analytics/Analytics.tsx b/src/components/analytics/Analytics.tsx index 92c3b43..c1eea58 100644 --- a/src/components/analytics/Analytics.tsx +++ b/src/components/analytics/Analytics.tsx @@ -230,7 +230,7 @@ function Leaderboard({ analytics }: { analytics: AnalyticsData }) {
{categories.length > 1 && (