diff --git a/clawbench/dynamics.py b/clawbench/dynamics.py index 7086c1c..30c91d1 100644 --- a/clawbench/dynamics.py +++ b/clawbench/dynamics.py @@ -60,6 +60,7 @@ class Dynamics: pca_trajectory: np.ndarray | None = None # (n_steps, 2) bigram_transitions: dict[str, dict[str, float]] = field(default_factory=dict) memory_depth: float = 0.0 # I(X_t; X_{t-2} | X_{t-1}) + renyi_d2: float = 0.0 @dataclass @@ -287,6 +288,7 @@ def compute_dynamics(transcript: Transcript) -> Dynamics: } ci = 0.5 + renyi_d2 = 0.0 if n > 2: cov = np.cov(X.T) eigvals = np.maximum(np.linalg.eigvalsh(cov), 0) @@ -295,6 +297,8 @@ def compute_dynamics(transcript: Transcript) -> Dynamics: p = eigvals / tv pr = 1.0 / np.sum(p**2) ci = 1.0 - (pr - 1) / (X.shape[1] - 1) + sum_p2 = np.sum(p**2) + renyi_d2 = float(-np.log2(sum_p2)) if sum_p2 > 0 else 0.0 h = _entropy(dict(fam_acc)) er = err_count / n if n else 0 @@ -320,6 +324,7 @@ def compute_dynamics(transcript: Transcript) -> Dynamics: constraint_index=ci, bigram_transitions=_compute_bigram_transitions(families), memory_depth=_conditional_mi(families), + renyi_d2=renyi_d2, ) diff --git a/clawbench/dynamics_archive.py b/clawbench/dynamics_archive.py index 929e3e6..4602550 100644 --- a/clawbench/dynamics_archive.py +++ b/clawbench/dynamics_archive.py @@ -102,11 +102,16 @@ def discover_model_roots(archive_dir: Path) -> dict[str, Path]: if _is_task_collection_root(archive_dir): return {archive_dir.name: archive_dir} - roots = { - child.name: child - for child in sorted(archive_dir.iterdir()) - if child.is_dir() and _is_task_collection_root(child) - } + roots = {} + for child in sorted(archive_dir.iterdir()): + if not child.is_dir(): + continue + if _is_task_collection_root(child): + roots[child.name] = child + else: + for subchild in sorted(child.iterdir()): + if subchild.is_dir() and _is_task_collection_root(subchild): + roots[f"{child.name}/{subchild.name}"] = subchild return roots diff --git a/docs/long_term_dynamics.md b/docs/long_term_dynamics.md new 
file mode 100644 index 0000000..62ffbbf --- /dev/null +++ b/docs/long_term_dynamics.md @@ -0,0 +1,182 @@ +# When Large Language Models Are Dreaming, Where Do They Go? +## Investigating the Long-Term Dynamics of Long-Running LLM Reasoning Systems + +Long-running LLM-based agents are increasingly used for autonomous planning and reasoning, yet their behavior is typically studied only over short horizons. When an LLM repeatedly conditions on its own outputs, it forms an iterative stochastic process whose long-term dynamics remain poorly characterized. This document outlines an empirical framework that treats LLM reasoning/agent loops as dynamical systems and studies their asymptotic behavior under varying degrees of prompt constraint. + +--- + +## 1. Introduction: The Need for Dynamical Diagnostics + +**Key question: what happens if we keep an LLM agent running?** + +Large language models (LLMs) are increasingly deployed within long-running reasoning and agentic systems that iteratively plan, reflect, and revise in natural language. In these settings, a model repeatedly conditions on its own outputs, forming an iterative stochastic process whose behavior extends far beyond single-step inference. Despite extensive work on short-horizon accuracy and capability, we lack a principled understanding of the **long-term dynamics** of such systems: whether they converge to stable behaviors, enter cycles, drift semantically, or exhibit sensitivity to small perturbations when constraints weaken. + +This gap is especially important for **reliability and safety**. Long-horizon instability may manifest as goal drift, runaway loops, incoherence, or brittle behavior under minor prompt changes. Conversely, stable attractor-like behavior may explain why some agentic systems remain controllable over long durations. We therefore treat long-running LLM reasoning not merely as next-token prediction, but as a **dynamical system evolving in semantic space**. + +--- + +## 2. 
Methodology: Experiment & Formulation + +### 2.1 System Definition (Rollouts) +Fix a model $M$, a loop template $\mathcal{T}$, sampling parameters $\theta$ (e.g., temperature/top-$p$), a horizon $H$, and a random seed $r$. Starting from a query $q$, generate a trajectory $\tau=(x_t)_{t=0}^{H}$ by repeated self-conditioning. Conceptually, this defines an observed stochastic dynamical system: + +$$ x_{t+1} \sim \mathcal{K}_{M,\mathcal{T},\theta}(\,\cdot \mid x_t, q\,) $$ + +where $\mathcal{K}$ is the transition kernel induced by the model, template, and decoding. + +### 2.2 Query Design and the Constraint Index $C(q)$ +We construct a controlled prompt set spanning general-purpose vs. domain-specific, open-ended vs. closed objective, and self-referential vs. task-oriented instructions. For each query $q$, we compute a **Constraint Index** $C(q)$ using three measurable components: + +1. **Topic Coverage (Participation Ratio / PCA Dimension)** + Embed an initial batch of responses to $q$ (or short rollouts), compute covariance $\Sigma_q$, and define effective dimension: + $$ \mathrm{PR}(q) = \frac{\bigl(\mathrm{tr}(\Sigma_q)\bigr)^2}{\mathrm{tr}(\Sigma_q^2)} $$ +2. **Ambiguity / Diversity (Entropy Proxy)** + We measure action-space diversity using **Shannon Entropy ($H$) over tool-family categorical distributions** across the transcript steps, acting as a proxy for the ambiguity of the prompt. +3. **Repetition / Predictability (Bayesian Optimal Prediction Score - BOPS)** + Quantify predictability via a BOPS computed from an optimal predictor over the observed history. Higher values indicate stronger repetitive structure. + +We combine these components (e.g., z-scored weighted sum) into $C(q)$ and retain each component for ablations. + +> **Implementation:** Computed in `scripts/posterior/2_compute_constraint_index.py` and powered by `clawbench.dynamics.compute_dynamics`. 
+ +### 2.3 State Representations (Behavioral Action-Space Embeddings) +At each step, we map text $x_t$ to a semantic state. Rather than relying on dense pre-trained textual NLU embeddings (which can dilute intent), we use a structured **10-dimensional Behavioral Feature Matrix**. +* **Embedding space:** Extracted directly from the agent's actions, features include: `[0:6]` proportions of tool-family usage (e.g., `browser`, `execute`, `search`), `[6]` success/error flags, `[7]` normalized token consumption, `[8]` normalized text length, and `[9]` temporal trajectory progress. + +We compute uncertainty (logit entropy/self-consistency), drift and step size ($\|e_t-e_1\|$, $\|e_t-e_{t-1}\|$), recurrence (kNN revisits), and distance to an early-step centroid. + +> **Implementation:** Computed in `clawbench.dynamics.Dynamics` representations. + +### 2.4 Effective Volume and Manifold-Aware Support +For a window $E=\{e_t\}_{t=1}^T$, we treat "volume" as a proxy for support size/coverage. With empirical covariance $\Sigma$: +$$ \mathrm{Vol}_{\log}(E) = \log\det(\Sigma + \varepsilon I) $$ +We also estimate intrinsic dimension $\widehat{m}$ and a robust radius $r$ (median kNN distance), yielding $V_{\mathrm{eff}} \propto r^{\widehat{m}}$. + +> **Implementation:** Computed via covariance matrices within `clawbench.dynamics.compute_dynamics`. + +### 2.5 Clustering Tasks via PCA Participation Ratio +We use the Participation Ratio ($PR$) to mathematically cluster tasks based on the size of their dynamic attractors: +* **High $PR$ Clusters (Diffusive/Wandering)**: Tasks with ambiguous instructions. The variance is distributed across many principal components, implying isotropic diffusion across a wide semantic space. +* **Low $PR$ Clusters (Trapped/Convergent)**: Highly constrained tasks with clear checks. The variance is dominated by a few components, showing rapid collapse to a specific path or limit-cycle. 
+By calculating the distance between centroids of these clusters in PCA space, we determine if similar tasks converge to the same dynamical basin, and observe how perturbations shift trajectories within or across these clusters. + +> **Implementation:** PR values are extracted via `clawbench.dynamics.compute_dynamics` and aggregated in `scripts/posterior/2_compute_constraint_index.py`. + +--- + +## 3. Perturbation Sensitivity ($\widehat{\lambda}(t)$) + +For each query $q$, we create perturbed variants $q'$ (lexical/syntactic paraphrases and controlled semantic nudges). We run matched rollouts and compare trajectories via $D_t=d(e_t,e'_t)$ and a Lyapunov-like divergence-rate proxy: + +$$ \widehat{\lambda}(t) = \frac{1}{t}\log\frac{D_t+\epsilon}{D_0+\epsilon} $$ + +A positive $\widehat{\lambda}(t)$ indicates extreme sensitivity, where tiny changes in prompt conditions lead to exponentially diverging behavior sequences over the horizon, often resulting in regime switching. + +> **Implementation:** Computed directly via `clawbench.dynamics.compute_sensitivity`. + +--- + +## 4. Theory-Guided Signatures and Expected Regimes + +We expect distinct empirical dynamical regimes across the landscape of tasks and models: +1. **Trapped/Attractor-like:** low support size (small $\mathrm{Vol}_{\log}$), high recurrence, high predictability (high BOPS). +2. **Limit-cycle-like:** high recurrence with bounded drift and quasi-periodic revisits. +3. **Diffusive/Wandering:** increasing support size and drift with low recurrence. +4. **High Sensitivity:** small perturbation $\delta(q,q')$ yields large long-horizon divergence (large $\widehat{\lambda}(t)$). + +Empirically, weaker constraints (lower $C(q)$) increase long-run sensitivity and diffusion, while stronger constraints induce bounded behavior. 
The trajectory $S_t = \phi(x_t)$ induces an approximate time-homogeneous Markov kernel $P(S_t, \cdot)$, yielding testable hypotheses: + +### Ergodicity and Convergence Rates +If $P$ is ergodic with stationary distribution $\pi$: +$$ \frac{1}{T}\sum_{t=1}^T f(S_t) \;\xrightarrow[T\to\infty]{}\; \mathbb{E}_{\pi}[f] $$ +When a contraction-like bound holds (e.g., Dobrushin coefficient $<1$), windowed metrics rapidly stabilize. *Diagnostic:* Windowed averages flatten and seed-to-seed dispersion shrinks. + +> **Implementation:** Bound approximations are verified via variance reductions in `clawbench.dynamics.StratifiedAssessment.reweight`. + +### Mixing Diagnostics via Dependence Coefficients +Decay of dependence reveals mixing vs. periodicity: +$$ I(S_t;S_{t+k}) \;\to\; 0 \quad (k\to\infty) $$ +*Diagnostic:* Autocorrelation curves and return-time plots. + +> **Implementation:** Autocovariance logic forms the core of `clawbench.dynamics._classify_regime`. + +### Information-Theoretic Structure & Guidance +The entropy rate limits predictability: +$$ h = \lim_{t\to\infty} H(S_{t+1}\mid S_{1:t}) \le H(S_{t+1}) $$ +Innovation is separated from memory via $I(S_{t+1};S_{1:t})$. Lower decoding temperatures generally reduce entropy proxies but empirically we must verify if this yields "healthy stabilization" or collapses into repetitive traps. + +> **Implementation:** Entropy calculation relies on `clawbench.dynamics.compute_dynamics` (`tool_entropy`). + +### Rényi and Correlation Dimensions +For the correlation integral $C_T(r)$, the correlation dimension is: +$$ D_2 = \lim_{r\downarrow 0}\frac{d\log C_T(r)}{d\log r} $$ +More generally, Rényi dimensions $D_q$ reveal attractor complexity. *Diagnostic:* Saturation of $PR$ and $D_q$ implies attraction to a low-dimensional set. + +> **Implementation:** PCA eigenvalue saturation evaluated in `clawbench.dynamics.compute_dynamics`.
+ +### Bayesian Optimal Prediction Score (BOPS) +The expected one-step log-loss equals conditional entropy: +$$ \inf_{\hat p_t}\;\mathbb{E}\bigl[-\log \hat p_t(S_{t+1})\bigr] = H(S_{t+1}\mid S_{1:t}) $$ +Normalized into a predictive probability score (BOPS), it reveals when a process becomes algorithmically predictable. Furthermore, for each step, measuring the entropy of the next action predicted by the model alongside its argmax allows us to bound (via a Lagrangian relaxation) how much information is lost by taking the Bayesian optimal or greedy action. + +> **Implementation:** Integrated into the $C(q)$ calculation within `scripts/posterior/2_compute_constraint_index.py`. + +### Survival Analysis & Latent-State Markov Models +Treating failure (e.g., incoherence/runaway) as an absorbing event $T_F$, survival statistics quantify long-term resilience: +$$ \mathsf{S}(t) = \mathbb{P}(T_F > t), \qquad h(t) = \mathbb{P}(T_F = t \mid T_F \ge t) $$ + +> **Implementation:** Extracted and plotted via `clawbench.dynamics.kaplan_meier` and aggregated in `scripts/survival_analysis.py`. + +### Queueing-Style Stability (Foster-Lyapunov Drift) +If the loop maintains a backlog $Q_t$ of unresolved subgoals: +$$ \mathbb{E}[V(Q_{t+1})-V(Q_t)\mid Q_t] \le b - \epsilon\,\mathbf{1}\{Q_t>0\} $$ +Negative drift ensures stability, while positive drift mathematically aligns with runaway "hallucination" narratives. + +> **Implementation:** Evaluated analytically as drift metrics in `clawbench.dynamics._classify_regime`. + +--- + +## 5. Pipeline Implementation: Posterior Computation + +The theoretical framework is operationalized through the `run_posterior_dynamics_pipeline.py` script. This pipeline sequentially calls several specialized analysis scripts on the cached execution traces to map the raw behavior onto the dynamical concepts: + +* **`scripts/posterior/2_compute_constraint_index.py`**: Computes the task-level Constraint Index $C(q)$. 
It calculates the PCA Participation Ratio ($PR$), tool-family entropy ($H$), and Bayesian Optimal Prediction Score (BOPS) to quantify how tightly the prompt constraints bind the model's exploration. +* **`classify_regimes.py`**: Operationalizes the regime signatures. It classifies each individual run into one of the theoretical regimes (`trapped`, `convergent`, `diffusive`, `chaotic`, `limit_cycle`, or `unknown`) using thresholds on entropy, drift variance, and step-size autocovariance. +* **`variance_decomp.py`**: Separates performance variance into *seed noise* versus actual *capability signal*. This quantifies the Signal-to-Noise Ratio (SNR) of the task, isolating the dynamical sensitivity to stochasticity from true deterministic performance. +* **`survival_analysis.py`**: Implements the latent-state failure modeling. It computes Kaplan-Meier survival curves $S(t)$ and hazard functions $h(t)$, defining "failure" $T_F$ as an absorbing event (like a runaway loop or an unrecoverable `tool_misuse`), plotting model resilience over the turn horizon. +* **`snr_weighted_ranking.py`**: Computes an alternative task-weighted ranking. Instead of a flat mean, it weights tasks based on their signal density: $w_q = \max(0, \text{SNR}(q)) \times |C(q)|$. This penalizes models specifically for failing on highly-constrained, low-noise tasks. +* **`generate_dynamical_report.py`**: Handles **Visualization and Reporting**. It aggregates the mathematical diagnostics across all scripts into a comprehensive markdown summary report (`EVAL_REPORT_DYNAMICAL.md`). This renders comparative tables for Kaplan-Meier survival curves, SNR-weighted rankings, and regime distributions, setting up the visualizations needed to compare the geometry of the dynamical basins. + +--- + +## 6. Interpretation and Impact + +Framing long-running LLM agents as dynamical systems yields practical diagnostics for reliability. 
By triangulating results across embedding geometry, uncertainty signals, and survival curves, this framework exposes why some agentic architectures succeed while others wander off-task. + +For LLM Agent Researchers and End-Users, these metrics translate directly to operational guarantees: + +* **Lyapunov Sensitivity and Attractor Dimensions (The Kaplan-Yorke connection)**: If an agent's behavioral dimension (Rényi $D_2$) and maximal Lyapunov proxy ($\widehat{\lambda}$) are high, the agent lacks a robust "point attractor" (a definitive solution). For researchers, this means the agent is exploring chaotically and is highly fragile to prompt wording. For users, it means the agent's behavior is fundamentally unpredictable and shouldn't be trusted for deterministic workflows. +* **Ergodicity and Markovian Traps**: Because LLMs have absorbing states (e.g., max-turn limits, task completion), they are generally non-ergodic. However, when an agent falls into a "trapped" limit cycle (repeating a failed tool call), it suffers from context blindness, collapsing into a destructive Markovian state. For researchers, detecting non-ergodic trapping is the key to designing better early-stopping or self-reflection triggers. +* **Task-Sensitivity Mutual Information $I(q; \lambda)$**: There is massive mutual information between the initial task's constraint index $C(q)$ and the resulting perturbation sensitivity $\widehat{\lambda}$. Tightly constrained tasks (high $C(q)$, e.g., "fix a specific syntax error") yield deep attractor basins with near-zero sensitivity. Open-ended tasks (low $C(q)$, e.g., "refactor this module") yield flat basins where tiny prompt changes cause exponential divergence. For users, this proves that *prompt engineering is most critical on loosely constrained tasks*, whereas highly constrained tasks are structurally robust to variations. + + +--- + +## 7. Space-Time Decomposition + +Our raw time-series metrics treat all tasks in the benchmark equally. 
However, benchmarks rarely reflect true user workloads. To correct this, we integrate the temporal dynamics computed here with the spatial Task Distribution Reweighting framework. + +By taking the Radon-Nikodym derivatives (Importance Weights $\rho_i$) representing the true user distribution, we compute the Hajek estimators for all dynamic properties. This **Space-Time Decomposition** yields the expected real-world probability of an agent entering a specific dynamical regime (like a chaotic wandering state) and the debiased expected Constraint Index $C(q)$ under operational conditions. + +> **Implementation:** Computed by `scripts/compute_debiased_dynamics.py`, which fuses the NLU-based importance weights with the raw posterior dynamics artifacts generated by this pipeline. + +--- + +## 8. Inspired By + +The theoretical framework and diagnostics outlined in this document draw inspiration from the following works: + +* [Understanding Chain-of-Thought in LLMs through Information Theory](https://arxiv.org/html/2411.11984v2) (arXiv:2411.11984) +* [Is Chain-of-Thought Reasoning of LLMs a Mirage? A Data Distribution Lens](https://arxiv.org/html/2508.01191v3) (arXiv:2508.01191) +* [Uncovering Meanings of Embeddings via Partial Orthogonality](https://arxiv.org/abs/2310.17611) (arXiv:2310.17611) +* [Skewed Memorization in Large Language Models: Quantification and Decomposition](https://arxiv.org/abs/2502.01187) (arXiv:2502.01187) diff --git a/docs/semantic_spatiotemporal_dynamics.md b/docs/semantic_spatiotemporal_dynamics.md new file mode 100644 index 0000000..32333bf --- /dev/null +++ b/docs/semantic_spatiotemporal_dynamics.md @@ -0,0 +1,100 @@ +# Semantic Spatio-Temporal Dynamics Analysis + +## 1. Introduction: Bridging Space and Time + +Evaluating iterative, long-running Large Language Model (LLM) agents requires understanding two fundamentally different axes of their behavior: +1. 
**The Semantic Space (What the agent is doing)**: The distribution of tasks, intents, and prompts the agent interacts with. +2. **The Temporal Dynamics (How the agent evolves)**: The trajectory of the agent over time, characterized by its ability to converge on solutions versus drifting into unrecoverable hallucination loops. + +Historically, evaluating these dimensions in isolation has created a blind spot. **Raw temporal dynamics metrics treat all tasks in an arbitrary benchmark equally.** If a benchmark dataset over-represents simple, tightly constrained tasks, the agent's overall dynamic stability will look artificially robust. Conversely, if it over-indexes on open-ended creative tasks, the agent might look chaotic. + +The **Semantic Spatio-Temporal Dynamics** framework solves this by fusing these two methodologies. It maps the geometry of the agent's time-series trajectories directly onto a debiased, user-aligned semantic manifold, projecting abstract mathematical stability metrics onto concrete operational realities. + +--- + +## 2. The Spatial Dimension: Task Distribution Reweighting + +Evaluation datasets ($Q$) inherently suffer from distribution shifts compared to true real-world usage ($P$). To correct this, we stratify and reweight the semantic space of tasks. + +### 2.1 NLU/NLI Semantic Clustering +We embed the natural language instructions of each task $q_i$ using Dense NLU models to capture semantic intent, and employ Natural Language Inference (NLI) to confirm entailment and redundancy. +Using clustering algorithms (e.g., HDBSCAN), we partition the dataset into $K$ distinct functional strata: $\mathcal{C} = \{C_1, C_2, \dots, C_K\}$. + +### 2.2 Importance Weighting (Radon-Nikodym Derivatives) +Let $Q(C_k)$ be the empirical fraction of the evaluation dataset belonging to cluster $C_k$, and $P(C_k)$ be the target real-world probability of that cluster.
We compute the importance weight (Radon-Nikodym derivative) for any task $i$ in stratum $k_i$ as: +$$ \rho_{k_i} = \frac{P(C_{k_i})}{Q(C_{k_i})} $$ +This scaling factor ensures that over-represented tasks are suppressed, and under-represented but critical real-world tasks are amplified. + +--- + +## 3. The Temporal Dimension: Long-Term Trajectory Dynamics + +As an agent iteratively reasons and invokes tools, its transcript generates a sequence of discrete actions $x_t$. We project this sequence into a continuous $d$-dimensional behavioral feature space to analyze its geometry. + +### 3.1 Attractor Geometry and The Constraint Index $C(q)$ +For a given task $q$, we measure how tightly the agent's trajectory is bound to an attractor basin using three core metrics: +* **Participation Ratio (PR) & Rényi Dimension ($D_2$)**: We extract the eigenspectrum of the trajectory's covariance matrix and normalize the eigenvalues to fractions $p_i$ of the total variance. As a spectral proxy for the Rényi correlation dimension, we use the order-2 Rényi entropy $D_2 = -\log_2 \sum_i p_i^2 = \log_2 \mathrm{PR}$, which measures the structural volume/complexity of the phase space explored by the agent. +* **Response Entropy ($H$)**: The Shannon entropy over the eigenspectrum (or discrete action distribution) measuring the intrinsic uncertainty and diffusion of the agent. +* **Bayesian Optimal Prediction Score (BOPS)**: A measure of inter-run predictability, proxying how consistently the agent targets the maximum a posteriori (MAP) trajectory. + +These are standardized and fused into the **Constraint Index $C(q)$**, where a high $C(q)$ implies tightly bounded behavior (a strong point attractor). + +### 3.2 Perturbation Sensitivity (Lyapunov Proxy) +To test robustness, we generate semantically identical but lexically perturbed prompts $q'$.
We track the divergence between the original trajectory $e_t$ and the perturbed trajectory $e'_t$ over time, extracting a Lyapunov-like proxy: +$$ \widehat{\lambda}(t) = \frac{1}{t}\log\frac{D_t+\epsilon}{D_0+\epsilon} $$ +A positive $\widehat{\lambda}(t)$ indicates chaotic sensitivity, where tiny prompt variations cause exponentially diverging behavior. + +### 3.3 Dynamical Regimes +Trajectories are ultimately classified into distinct kinetic states: +* **Trapped**: Collapsing into a highly recurrent, localized subset of actions. +* **Limit Cycle**: Bounded drift with quasi-periodic revisits to states. +* **Wandering/Diffusive**: Unbounded expansion with low predictability and high entropy. + +--- + +## 4. Spatio-Temporal Fusion: The Hajek Estimator + +The core theoretical leap is applying the Spatial weights ($\rho_i$) to the Temporal properties ($D_i$) to estimate the *true expected real-world dynamics*. + +For any dynamic property $D$, the debiased expectation under the real-world user distribution $P$ is given by the asymptotically efficient Hajek estimator: +$$ \mathbb{E}_{P}[D] \approx \frac{\sum_{i=1}^N \rho_{k_i} D_i}{\sum_{i=1}^N \rho_{k_i}} $$ + +### Key Fused Metrics +1. **Expected Regime Probability ($E_P[\text{Regime} = r]$)**: Instead of stating "20% of benchmark trajectories hit a chaotic wandering regime," this calculates the exact probability that a *deployed user* will experience that failure mode. +2. **Debiased Survival Curves ($S_{debiased}(t)$)**: A weighted Kaplan-Meier estimation. If simple, high-survival tasks are overrepresented in the benchmark, the raw curve is falsely optimistic. The debiased curve corrects this, providing a true expected time-to-failure. +3. **Expected Chaos ($E_P[\widehat{\lambda}]$) & Predictability ($E_P[C(q)]$)**: The true weighted average of prompt fragility and system volatility. + +--- + +## 5. 
Expanding the Spatial Definition: State, Action, and Conditioned Survival + +While the standard formulation defines "Space" via the NLU embedding of the *initial prompt*, this framework is naturally extensible to other spatial dimensions of the trajectory: + +* **Action Space (Tools Called)**: Stratifying trajectories based on the specific tools invoked (e.g., isolating all runs where `edit_file` or `bash` was called). +* **Intermediate State Space**: Stratifying based on the environment state or agent memory (e.g., isolating runs where a `SyntaxError` was encountered). + +This is where **Time-to-Event (Survival Analysis)** becomes especially useful. Because ClawBench logs the full trajectory state, we can compute dynamically conditioned expected properties. Rather than just asking "What is the expected survival time of this task?", we can condition on any arbitrary combination of parameters: +* $\mathbb{E}[\text{Time-to-Failure} \mid \text{Tool} = \text{bash}]$ +* $\mathbb{P}[\text{Limit Cycle} \mid \text{State} = \text{SyntaxError}]$ + +By using Stratified Kaplan-Meier curves or Cox Proportional Hazards models with time-dependent covariates, researchers can isolate the exact state-action transitions that induce catastrophic drift. + +--- + +## 6. Interpretation and Impact for Researchers + +Merging these dimensions unlocks powerful theoretical and practical insights: + +* **Kaplan-Yorke and Hidden Fragility**: If the Spatio-Temporal fusion reveals a high expected Rényi dimension $D_2$ and high Lyapunov sensitivity $\widehat{\lambda}$, the deployed agent lacks a definitive "point attractor" for real-world tasks. An agent might appear stable on a benchmark, but if its chaotic trajectories align heavily with the most frequent user tasks, its operational stability is critically low. +* **Ergodicity and Markovian Traps**: LLMs are generally non-ergodic due to absorbing states (completing a task or hitting turn limits).
However, when trapped in a limit cycle, they suffer from context blindness, collapsing into a destructive Markovian loop. The Spatio-Temporal framework identifies exactly *which semantic regions* trigger these non-ergodic traps, allowing researchers to surgically apply early-stopping heuristics rather than blanket constraints. +* **Task-Sensitivity Mutual Information $I(q; \lambda)$**: There is massive mutual information between a task's Constraint Index $C(q)$ and its perturbation sensitivity. Tightly constrained tasks yield deep attractor basins with near-zero sensitivity. The Spatio-Temporal framework proves mathematically where *prompt engineering matters most*—specifically on the loosely constrained tasks that dominate a user's target distribution. + +--- + +## 7. Implementation Pipeline + +The Spatio-Temporal decomposition is fully operationalized through a bridging script that ingests the outputs of both upstream modules: + +1. **Spatial Baseline**: `scripts/compute_posterior_weights.py` computes the weights $\rho_i$ based on NLU clusters and user schemas. +2. **Temporal Baseline**: `scripts/run_posterior_dynamics_pipeline.py` computes the unweighted survival, regimes, and constraint indices. +3. **Spatio-Temporal Fusion**: `scripts/compute_debiased_dynamics.py` applies the Hajek estimators to produce the final `debiased_regimes_probability` and `debiased_expected_C_q`. diff --git a/docs/task_distribution_reweighting.md b/docs/task_distribution_reweighting.md new file mode 100644 index 0000000..f5e41d9 --- /dev/null +++ b/docs/task_distribution_reweighting.md @@ -0,0 +1,93 @@ +# Aligning LLM Evaluations with Reality: Debiasing via Task Distribution Reweighting +## Investigating Semantic Task Clustering and Stratified Reweighting for Real-World Accuracy + +Evaluation benchmarks often suffer from severe distribution shifts compared to real-world usage. 
A dataset might consist of 80% mathematics tasks and 20% coding tasks, whereas an actual user's interaction distribution might be exactly the opposite (20% math, 80% code). Evaluating an LLM on the raw dataset yields a biased performance estimate that over-indexes on specific capabilities while under-representing others. This document outlines an empirical framework to debias evaluation scores by clustering tasks using Natural Language Understanding (NLU) and Natural Language Inference (NLI) models, and reweighting these task strata to match true usage distributions. + +--- + +## 1. Introduction: The Need for Distribution Alignment + +**Key question: Does our benchmark score actually reflect the user's experience?** + +Standard evaluation paradigms treat every task in a dataset equally, computing an unweighted mean over all instances. However, evaluation datasets are typically constructed via programmatic generation or scraping, leading to arbitrary internal distributions that do not reflect operational reality. + +If a system is deployed where coding represents the vast majority of user queries, a math-heavy benchmark will misjudge the model's practical utility. We therefore treat the evaluation dataset as a biased sample from a broader semantic space, and apply **stratified reweighting** to correct this bias, moving from a static dataset score to a dynamic, user-aligned capability metric. + +--- + +## 2. Methodology: Clustering and Stratification + +### 2.1 Task Representation and NLU Clustering +To reweight a dataset, we first need to map its internal composition. We map each task/prompt $q_i$ into a semantic space using pre-trained NLU models to identify latent capabilities. + +* **Dense NLU Embeddings:** We extract representations for each task instruction using modern embedding models to capture semantic intent. +* **NLI for Semantic Equivalence:** We employ Natural Language Inference (NLI) models to evaluate pairs of tasks. 
If task $A$ entails the capabilities required by task $B$, we can aggressively group similar prompts to prevent over-counting highly redundant queries. +* **Stratification:** We apply clustering algorithms (e.g., HDBSCAN) on the semantic representations to partition the dataset into $K$ distinct functional clusters (strata), $\mathcal{C} = \{C_1, C_2, \dots, C_K\}$, representing distinct capability areas (e.g., "Math Word Problems", "Code Refactoring", "Information Retrieval"). + +> **Implementation:** Computed in `scripts/cluster_tasks_nlu.py` using embedding and NLI models to output a cluster assignment mapping for all benchmark tasks. + +### 2.2 Estimating True Usage Distributions +Let $P_{eval}$ be the empirical distribution of tasks in the evaluation dataset, and $P_{user}$ be the target real-world usage distribution. We determine the proportion of each cluster $k$ in both: +* $w_{eval}^{(k)}$: The fraction of tasks in the evaluation set that belong to cluster $C_k$. +* $w_{user}^{(k)}$: The fraction of tasks in the expected user distribution that belong to cluster $C_k$. + +If a cluster makes up 80% of the benchmark but only 20% of user interactions, it is heavily over-represented. + +> **Implementation:** Computed in `scripts/compute_distribution_weights.py` by comparing the empirical cluster sizes against a provided user telemetry schema. + +### 2.3 Stratified Importance Reweighting +We compute a debiased performance metric by applying Inverse Probability Weighting (IPW) to the task strata. If a model achieves an average success rate $S_k$ on cluster $C_k$, the naive unweighted dataset score is simply $\sum_k w_{eval}^{(k)} S_k$.
+ +The debiased, user-aligned score corrects for this by scaling by the true usage rates: + +$$ S_{debiased} = \sum_{k=1}^K w_{user}^{(k)} S_k $$ + +Alternatively, we can assign an importance weight $\rho_i$ to each individual task $i$ belonging to cluster $C_k$: + +$$ \rho_i = \frac{w_{user}^{(k)}}{w_{eval}^{(k)}} $$ + +This yields the weighted expected score: $\mathbb{E}_{q \sim P_{user}} [ \text{Score}(q) ] \approx \frac{1}{N} \sum_{i=1}^N \rho_i \text{Score}(q_i)$. + +> **Implementation:** Weights are integrated during metric aggregation in `clawbench.evaluation.debiased_metrics`. + +--- + +## 3. Advanced Capabilities: Inter-Task Similarity and Overlap + +Beyond simple clustering, NLU and NLI models allow us to construct a full **Task Similarity Graph**. + +1. **Redundancy Penalties:** If a cluster contains near-identical tasks (as measured by bidirectional NLI entailment), we can down-weight individual tasks within that cluster to avoid "capability farming" where a model succeeds only because the same question is asked 50 times in slightly different ways. +2. **Cross-Cluster Leakage:** Tasks may not neatly fit into orthogonal clusters. By computing soft-assignments or probabilities $P(C_k \mid q_i)$, we can allocate fractional weights, allowing complex multi-step reasoning tasks to contribute to the scores of multiple capabilities (e.g., a prompt requiring both Python coding and mathematical proofs). + +> **Implementation:** Computed via graph-based adjacency matrices in `clawbench.evaluation.task_graph`. + +--- + +## 4. Pipeline Implementation: Debiasing Computation + +The theoretical framework is operationalized through a series of analysis scripts designed to run sequentially after the core evaluation rollouts are complete: + +* **`cluster_tasks_nlu.py`**: Embeds task instructions and clusters them into distinct semantic strata. Uses NLI models to verify similarity within clusters and builds the Task Similarity Graph. 
+* **`compute_distribution_weights.py`**: Compares the cluster assignments against a reference user distribution profile to compute the importance weights $\rho_i$ for each task. +* **`debiased_evaluation.py`**: Aggregates the raw execution traces and applies the computed importance weights to produce the final, debiased performance metrics. +* **`generate_reweighting_report.py`**: Renders the comparative diagnostics into a markdown summary (`EVAL_REPORT_DEBIASED.md`), highlighting which capabilities were inflated by dataset bias and presenting the true expected performance under user conditions. + +--- + +## 5. Interpretation and Impact + +Framing dataset evaluation through the lens of usage distributions prevents capability over-fitting to skewed benchmarks. By triangulating NLU-based task clusters with stratified IPW reweighting, we ensure that our metrics accurately reflect the expected real-world performance of the agentic system. + +This approach highlights a critical distinction: a model might be "State of the Art" on an arbitrary academic dataset, but severely underperform when re-weighted to match the exact operational footprint of an end-user. + +--- + +## 6. Space-Time Decomposition + +While the techniques described above debias single-step task success, they can also be combined with long-term dynamic metrics (the "Time" axis) to compute the expected real-world dynamical behavior of the agent. By applying the Radon-Nikodym derivatives ($\rho_i$) to temporal characteristics like Kaplan-Meier survival curves, Constraint Index $C(q)$, and regime clustering probabilities (e.g., trapped vs. chaotic limit cycles), we generate a **Space-Time Decomposition**. 
def _match_topic(task_id, task_topics):
    """Return the topic for ``task_id``: exact id first, else the longest id that prefixes it."""
    if task_id in task_topics:
        return task_topics[task_id]
    # Several known ids may prefix the same task id (e.g. "t1" and "t1-code");
    # the longest one is the most specific, so prefer it over dict order.
    prefixes = [t_id for t_id in task_topics if task_id.startswith(t_id)]
    if not prefixes:
        return "unknown"
    return task_topics[max(prefixes, key=len)]


def compute_debiased_dynamics(regimes_path, constraint_path, weights_path, topics_path, output_path):
    """
    Reweight temporal-dynamics summaries with per-topic importance weights.

    Computes self-normalised (Hajek-style) weighted averages of the regime
    labels and of the constraint index ``C_q``, then writes them to
    ``output_path`` as JSON.

    Args:
        regimes_path: JSON object keyed by ``"<model>/<task_id>"`` (a key with
            no ``/`` is used as both model and task id); each value is read
            for its ``"regime"`` entry (default ``"unknown"``).
        constraint_path: JSON object keyed by task id; each value is read for
            its ``"C_q"`` entry (default ``0.0``).
        weights_path: JSON object mapping topic -> importance weight. Topics
            missing from it fall back to a weight of ``1.0``.
        topics_path: JSON object mapping task id -> topic, where the value is
            either a dict with a ``"topic"`` entry or a scalar that is
            converted with ``str``.
        output_path: Destination for the resulting JSON document.

    Returns:
        The dict that was written: ``debiased_expected_C_q`` (float) and
        ``debiased_regimes_probability`` (model -> regime -> probability).
    """
    with open(weights_path, 'r', encoding='utf-8') as f:
        weights = json.load(f)

    with open(topics_path, 'r', encoding='utf-8') as f:
        topics_data = json.load(f)

    # Normalise the topic mapping to task id -> topic name.
    task_topics = {}
    for task_id, data in topics_data.items():
        if isinstance(data, dict):
            task_topics[task_id] = data.get("topic", "unknown")
        else:
            task_topics[task_id] = str(data)

    # 1. Debiased regimes: accumulate importance weight per (model, regime).
    with open(regimes_path, 'r', encoding='utf-8') as f:
        regimes = json.load(f)

    model_regimes_weighted = defaultdict(lambda: defaultdict(float))
    model_regimes_weight_sum = defaultdict(float)

    for key, data in regimes.items():
        parts = key.split("/")
        model = parts[0]
        task_id = parts[1] if len(parts) > 1 else parts[0]

        rho = weights.get(_match_topic(task_id, task_topics), 1.0)
        regime = data.get("regime", "unknown")

        model_regimes_weighted[model][regime] += rho
        model_regimes_weight_sum[model] += rho

    debiased_regimes = {}
    for model, r_counts in model_regimes_weighted.items():
        total_w = model_regimes_weight_sum[model]
        if total_w > 0:
            debiased_regimes[model] = {r: float(w / total_w) for r, w in r_counts.items()}
        else:
            # Every matched weight was zero: no probability mass to distribute.
            debiased_regimes[model] = {}

    # 2. Debiased Constraint Index (Expected Predictability)
    with open(constraint_path, 'r', encoding='utf-8') as f:
        constraints = json.load(f)

    weighted_cq_sum = 0.0
    cq_weight_sum = 0.0
    for task_id, data in constraints.items():
        rho = weights.get(_match_topic(task_id, task_topics), 1.0)
        cq = data.get("C_q", 0.0)
        weighted_cq_sum += rho * cq
        cq_weight_sum += rho

    debiased_cq = float(weighted_cq_sum / cq_weight_sum) if cq_weight_sum > 0 else 0.0

    output = {
        "debiased_expected_C_q": debiased_cq,
        "debiased_regimes_probability": debiased_regimes
    }

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=4)
    logging.info(f"Wrote debiased Space-Time dynamics to {output_path}")
    return output
def compute_horvitz_thompson_estimator(results_path, weights_path):
    """
    Compute the self-normalised (Hajek) importance-weighted mean score.

    Let X_i be the score of task i and rho_{k_i} the weight of its stratum.
    The estimate is sum_i(rho_{k_i} * X_i) / sum_i(rho_{k_i}); normalising by
    the weight sum (rather than by N) keeps it well behaved in finite samples.
    The unadjusted mean and the estimate are logged at INFO level.

    Args:
        results_path: JSON object keyed by task id; each value is read for
            its ``"topic"`` (stratum) and ``"score"`` (default ``0.0``).
        weights_path: JSON object mapping stratum -> importance weight.
            Strata missing from it fall back to a weight of ``1.0``.

    Returns:
        The weighted estimate as a float, or ``None`` when it is undefined
        (no results, or the importance weights sum to zero).
    """
    with open(results_path, 'r', encoding='utf-8') as f:
        results = json.load(f)

    with open(weights_path, 'r', encoding='utf-8') as f:
        weights = json.load(f)

    n = len(results)
    if n == 0:
        logging.info("Empty sample. Estimator undefined.")
        return None

    weighted_sum = 0.0
    sum_weights = 0.0
    for data in results.values():
        score = data.get("score", 0.0)
        rho = weights.get(data.get("topic"), 1.0)
        weighted_sum += rho * score
        sum_weights += rho

    if sum_weights == 0:
        logging.error("Sum of importance weights is zero. Target measure P may be singular w.r.t Q.")
        return None

    # Hajek estimator: weighted sum normalised by the total weight.
    theta_hat = weighted_sum / sum_weights
    unadjusted_mean = sum(d.get("score", 0) for d in results.values()) / n

    logging.info(f"Sample Size (n) = {n}")
    logging.info(f"Unadjusted Empirical Mean (Q-measure) = {unadjusted_mean:.4f}")
    logging.info(f"Adjusted Posterior Mean (P-measure, Hajek Estimator) = {theta_hat:.4f}")
    return theta_hat
" + "Keep the exact same semantic meaning and intent, but change the wording slightly. " + "Output ONLY the paraphrased text, nothing else.\n\n" + f"Original: {text}" + ) + + cmd = ["ollama", "run", model, prompt] + try: + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + paraphrase = _clean_paraphrase(result.stdout) + return paraphrase or text + except (FileNotFoundError, subprocess.CalledProcessError) as e: + print(f"Error running ollama: {e}") + return text + + +def main(): + parser = argparse.ArgumentParser(description="Generate deterministic perturbed task variants.") + parser.add_argument("--base-dir", type=Path, default=Path("tasks-public")) + parser.add_argument("--model", default="qwen3.5:27b") + parser.add_argument( + "--task", + action="append", + dest="task_ids", + help="Task id to perturb. May be passed multiple times.", + ) + args = parser.parse_args() + + task_ids = args.task_ids or DEFAULT_TASK_IDS + selected_tasks = [_find_task_file(args.base_dir, task_id) for task_id in task_ids] + + for file_path in selected_tasks: + print(f"Processing {file_path}...") + with open(file_path, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) + + # Modify ID and Name + original_id = data["id"] + data["id"] = f"{original_id}-perturbed" + data["name"] = data["name"] + " (Perturbed)" + rubric = data.get("judge", {}).get("rubric") + if isinstance(rubric, str): + data["judge"]["rubric"] = rubric.replace(original_id, data["id"]) + + # Paraphrase the user prompt + if "user" in data and "turns" in data["user"]: + for turn in data["user"]["turns"]: + original_text = turn["message"] + print(f" Original: {original_text}") + paraphrased_text = generate_paraphrase(original_text, model=args.model) + print(f" Paraphrased: {paraphrased_text}") + turn["message"] = paraphrased_text + + # Write to new file + new_path = file_path.with_name(f"{file_path.stem}-perturbed.yaml") + with open(new_path, "w", encoding="utf-8") as f: + yaml.dump(data, f, 
def compute_radon_nikodym_derivatives(empirical_path, target_path, output_path):
    """
    Compute per-stratum importance weights rho_k = P(k) / Q(k).

    P is the target (user) distribution and Q the empirical (benchmark)
    distribution, each given as a JSON object mapping stratum -> probability.

    A weight is emitted for every stratum that appears in either file. A
    stratum that exists only in Q gets weight ``0.0``; leaving it out would
    let consumers that default missing strata to ``1.0`` count tasks the
    target distribution never produces. A stratum with Q(k) == 0 also gets
    ``0.0``, with a warning if P(k) > 0, because the benchmark has no samples
    from which to estimate it.

    Args:
        empirical_path: Path to the JSON file describing Q.
        target_path: Path to the JSON file describing P.
        output_path: Path the weights are written to as JSON.

    Returns:
        The dict of weights that was written.
    """
    with open(empirical_path, 'r', encoding='utf-8') as f:
        q_dist = json.load(f)  # Q: empirical measure

    with open(target_path, 'r', encoding='utf-8') as f:
        p_dist = json.load(f)  # P: target measure

    # Target strata first (original order), then strata seen only in Q.
    strata = list(p_dist) + [s for s in q_dist if s not in p_dist]

    weights = {}
    for stratum in strata:
        q_k = q_dist.get(stratum, 0.0)
        p_k = p_dist.get(stratum, 0.0)

        if q_k == 0:
            if p_k > 0:
                logging.warning(f"Strata '{stratum}' has P-measure > 0 but Q-measure = 0. Estimator lacks support!")
            weights[stratum] = 0.0
        else:
            weights[stratum] = p_k / q_k

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(weights, f, indent=4)
    logging.info(f"Computed Radon-Nikodym derivatives (weights) saved to {output_path}")
    return weights
""" from __future__ import annotations @@ -30,10 +30,15 @@ import numpy as np -sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) from clawbench.dynamics_archive import load_task_runs_by_model +# torch and sentence_transformers are imported lazily inside main() +# only when --embedding-model is not "bag-of-words", so that the default +# production path has zero GPU/torch dependencies. + + WORD_RE = re.compile(r"[a-z]{3,}") STOPWORDS = set( "the and that with this have from what your will can but not " @@ -42,6 +47,26 @@ "much more most other then here such does like just make many want need take".split() ) +def tokenize(text: str) -> list[str]: + return [w for w in WORD_RE.findall((text or "").lower()) if w not in STOPWORDS] + +def build_vocab(texts: list[str], top_k: int = 500) -> dict[str, int]: + counts = Counter() + for text in texts: + counts.update(set(tokenize(text))) + return {word: idx for idx, (word, _) in enumerate(counts.most_common(top_k))} + +def vectorize(text: str, vocab: dict[str, int]) -> np.ndarray: + vec = np.zeros(len(vocab), dtype=np.float32) + toks = tokenize(text) + if not toks: + return vec + counts = Counter(toks) + for word, cnt in counts.items(): + if word in vocab: + vec[vocab[word]] = cnt + norm = np.linalg.norm(vec) + return vec / norm if norm > 0 else vec def _assistant_trajectory_text(run, max_chars: int = 4000) -> str: parts = [] @@ -69,30 +94,6 @@ def _fallback_text_from_any_message(run) -> str: return "" -def tokenize(text: str) -> list[str]: - return [w for w in WORD_RE.findall((text or "").lower()) if w not in STOPWORDS] - - -def build_vocab(texts: list[str], top_k: int = 500) -> dict[str, int]: - counts = Counter() - for text in texts: - counts.update(set(tokenize(text))) - return {word: idx for idx, (word, _) in enumerate(counts.most_common(top_k))} - - -def vectorize(text: str, vocab: dict[str, int]) -> np.ndarray: - vec = 
np.zeros(len(vocab), dtype=np.float32) - toks = tokenize(text) - if not toks: - return vec - counts = Counter(toks) - for word, cnt in counts.items(): - if word in vocab: - vec[vocab[word]] = cnt - norm = np.linalg.norm(vec) - return vec / norm if norm > 0 else vec - - def participation_ratio(X: np.ndarray) -> float: """PR(X) = (tr Sigma)^2 / tr(Sigma^2), an effective dimensionality proxy.""" if X.shape[0] < 2: @@ -108,14 +109,40 @@ def participation_ratio(X: np.ndarray) -> float: def response_entropy(X: np.ndarray) -> float: - """Entropy over normalized covariance eigenvalues, in bits.""" - if X.shape[0] < 2: + """Kernelized continuous entropy (von Neumann entropy of the regularized RBF kernel matrix). + + This is highly robust for dense semantic embeddings where N_samples << D_dimensions, + unlike standard PCA covariance eigenspectrums which collapse. + """ + n_samples = X.shape[0] + if n_samples < 2: return 0.0 - sigma = np.cov(X.T) - eigs = np.linalg.eigvalsh(sigma) + + # Pairwise squared distances + diffs = X[:, np.newaxis, :] - X[np.newaxis, :, :] + sq_dists = np.sum(diffs ** 2, axis=-1) + + # Bandwidth heuristic (sigma) using median distance + median_sq_dist = np.median(sq_dists) + if median_sq_dist < 1e-12: + # Trajectories are perfectly identical (zero variance) + return 0.0 + + # RBF Kernel matrix construction + K = np.exp(-sq_dists / (2.0 * median_sq_dist)) + + # Tikhonov regularization for numerical stability + K = K + np.eye(n_samples) * 1e-6 + + # Normalize trace to 1 to form a valid density matrix + A = K / np.trace(K) + + # Eigendecomposition of the symmetric kernel density matrix + eigs = np.linalg.eigvalsh(A) eigs = np.clip(eigs, 1e-12, None) - probs = eigs / eigs.sum() - return float(-np.sum(probs * np.log2(probs))) + + # Von Neumann entropy in bits + return float(-np.sum(eigs * np.log2(eigs))) def bops_inter_run_predictability(run_vecs: dict[str, list[np.ndarray]]) -> float: @@ -146,6 +173,7 @@ def main() -> None: 
parser.add_argument("--archive-dir", type=Path, default=Path(".clawbench/run_cache")) parser.add_argument("--reports-dir", type=Path, default=Path("reports")) parser.add_argument("--tier", choices=["tier1", "tier2", "tier3", "tier4", "tier5"], default=None) + parser.add_argument("--embedding-model", type=str, default="bag-of-words") args = parser.parse_args() grouped = load_task_runs_by_model(args.archive_dir, tier=args.tier) @@ -179,24 +207,79 @@ def main() -> None: if not all_texts: raise SystemExit("No usable text found in cached transcripts.") - vocab = build_vocab(all_texts, top_k=500) per_task: dict[str, dict[str, float | str]] = {} - for task_id, texts in sorted(per_task_texts.items()): - X = np.stack([vectorize(text, vocab) for text in texts]) - pr = participation_ratio(X) - ent = response_entropy(X) - model_vecs = { - model_name: [vectorize(text, vocab) for text in model_texts] - for model_name, model_texts in per_task_model_texts[task_id].items() - } - bops = bops_inter_run_predictability(model_vecs) - per_task[task_id] = { - "n_responses": len(texts), - "PR": pr, - "entropy": ent, - "BOPS": bops, - "data_source": "fallback_any_message" if use_fallback_messages else "assistant_final", - } + + if args.embedding_model.lower() == "bag-of-words": + vocab = build_vocab(all_texts, top_k=500) + for task_id, texts in sorted(per_task_texts.items()): + X = np.stack([vectorize(text, vocab) for text in texts]) + pr = participation_ratio(X) + ent = response_entropy(X) + + lagrangian_bound = 0.0 + if X.shape[0] > 0: + p_discrete = np.mean(X, axis=0) + p_sum = np.sum(p_discrete) + if p_sum > 1e-12: + p_discrete = p_discrete / p_sum + p_max = float(np.max(p_discrete)) + v_len = len(p_discrete) + discrete_ent = float(-np.sum(p_discrete[p_discrete > 0] * np.log2(p_discrete[p_discrete > 0]))) + if p_max < 1.0 and v_len > 1: + h_bound = -p_max * np.log2(p_max) - (1 - p_max) * np.log2((1 - p_max) / (v_len - 1)) + lagrangian_bound = max(0.0, float(h_bound - discrete_ent)) + + 
model_vecs = { + model_name: [vectorize(text, vocab) for text in model_texts] + for model_name, model_texts in per_task_model_texts[task_id].items() + } + bops = bops_inter_run_predictability(model_vecs) + per_task[task_id] = { + "n_responses": len(texts), + "PR": pr, + "entropy": ent, + "lagrangian_info_loss_bound": lagrangian_bound, + "BOPS": bops, + "data_source": "fallback_any_message" if use_fallback_messages else "assistant_final", + } + else: + # Use Sentence Transformers for proper semantic embedding + import torch + from sentence_transformers import SentenceTransformer + + print(f"Loading sentence-transformers embedding model: {args.embedding_model}...") + device = "cuda" if torch.cuda.is_available() else "cpu" + embedder = SentenceTransformer(args.embedding_model, device=device) + + for task_id, texts in sorted(per_task_texts.items()): + X = embedder.encode(texts, show_progress_bar=False) + # Normalize embeddings to unit length for cosine similarity calculations downstream + norms = np.linalg.norm(X, axis=1, keepdims=True) + X = np.divide(X, norms, out=np.zeros_like(X), where=norms!=0) + + pr = participation_ratio(X) + ent = response_entropy(X) + + # Sentence embeddings are dense, so discrete info-loss is not strictly valid in the same way. + # We set Lagrangian bound to 0.0 for dense semantic spaces. 
+ lagrangian_bound = 0.0 + + model_vecs = {} + for model_name, model_texts in per_task_model_texts[task_id].items(): + vecs = embedder.encode(model_texts, show_progress_bar=False) + norms = np.linalg.norm(vecs, axis=1, keepdims=True) + vecs = np.divide(vecs, norms, out=np.zeros_like(vecs), where=norms!=0) + model_vecs[model_name] = [v for v in vecs] + + bops = bops_inter_run_predictability(model_vecs) + per_task[task_id] = { + "n_responses": len(texts), + "PR": pr, + "entropy": ent, + "lagrangian_info_loss_bound": lagrangian_bound, + "BOPS": bops, + "data_source": "fallback_any_message" if use_fallback_messages else "assistant_final", + } if not per_task: raise SystemExit("Not enough data to compute C(q).") diff --git a/scripts/posterior/3_generate_space_time_report.py b/scripts/posterior/3_generate_space_time_report.py new file mode 100644 index 0000000..e2549c4 --- /dev/null +++ b/scripts/posterior/3_generate_space_time_report.py @@ -0,0 +1,177 @@ +import json +import argparse +import shutil +from pathlib import Path + +TEMPLATE = """# Semantic Space-Time Dynamics Report + +## 1. Environment & Run Identity +- **Evaluated Model(s)**: {models} +- **Benchmark Version**: `{benchmark_version}` +- **Environment Checksum**: `{environment_checksum}` +- **Trajectory Representation**: `{embedding_model}` + +## 2. Semantic-Temporal Metrics Summary + +This table fuses the spatial reweighting metrics (Score) with long-term temporal trajectory bounds (Constraint Index & Information Loss). + +| Task ID | Performance Score | Constraint Index ($C_q$) | Lagrangian Bound ($H_b$) | Participation Ratio ($PR$) | +|---|---|---|---|---| +{metrics_table} + +## 3. Dynamics Insights +- **Constraint Index ($C_q$)**: Higher values indicate that the environment topology naturally restricts the agent's action manifold, making the trajectory more predictable over time. +- **Lagrangian Information Loss Bound**: Quantifies the upper bound on structural state-loss due to discrete token actions. 
def main():
    """Render the Space-Time markdown report from eval and constraint-index JSON.

    Reads the eval JSON and the constraint-index JSON (each falls back to an
    empty placeholder when the file does not exist), builds the per-task
    metrics table, copies any known plots found in sibling ``*_dynamics``
    directories into ``<output-dir>/plots``, appends a validity note when a
    ``dynamics.json`` reports only single-step runs, and writes
    ``EVAL_REPORT_SPACE_TIME.md`` into the output directory.
    """
    parser = argparse.ArgumentParser(description="Generate Space-Time Report")
    parser.add_argument("--eval-json", type=Path, default=Path("results/gpt_oss_eval.json"))
    parser.add_argument("--constraint-json", type=Path, default=Path("results/posterior_reports/constraint_index.json"))
    parser.add_argument("--output-dir", type=Path, default=Path("results/space_time_report"))
    parser.add_argument("--embedding-model", type=str, default="bag-of-words", help="The embedding model used for spatial trajectory representation")
    args = parser.parse_args()

    # Read base eval JSON
    if args.eval_json.exists():
        with open(args.eval_json, "r", encoding="utf-8") as f:
            eval_data = json.load(f)
    else:
        eval_data = {"model": "Unknown", "benchmark_version": "N/A", "environment_checksum": "N/A", "task_results": []}

    # Read Constraint Index JSON
    if args.constraint_json.exists():
        with open(args.constraint_json, "r", encoding="utf-8") as f:
            constraint_data = json.load(f)
    else:
        constraint_data = {}

    # Build Table
    table_rows = []
    task_scores = {t["task_id"]: t["mean_task_score"] for t in eval_data.get("task_results", [])}

    # Merge tasks from both sources; a metric missing on one side renders as 0.000.
    all_tasks = set(task_scores.keys()).union(set(constraint_data.keys()))

    for task_id in sorted(all_tasks):
        score = task_scores.get(task_id, 0.0)
        task_metrics = constraint_data.get(task_id, {})
        c_q = task_metrics.get("C_q", 0.0)
        lagrangian = task_metrics.get("lagrangian_info_loss_bound", 0.0)
        pr = task_metrics.get("PR", 0.0)

        row = f"| `{task_id}` | {score:.3f} | {c_q:.3f} | {lagrangian:.3f} | {pr:.3f} |"
        table_rows.append(row)

    metrics_table = "\n".join(table_rows)

    report_content = TEMPLATE.format(
        models=eval_data.get("model", "Unknown"),
        benchmark_version=eval_data.get("benchmark_version", "N/A"),
        environment_checksum=eval_data.get("environment_checksum", "N/A"),
        embedding_model=args.embedding_model,
        metrics_table=metrics_table
    )

    # Automatically link visualizations from dynamics output directories
    # and copy them cleanly into a plots/ subfolder so everything is self-contained.
    results_dir = args.output_dir.parent
    plots_dir = args.output_dir / "plots"
    plots_dir.mkdir(parents=True, exist_ok=True)

    vis_content = "\n## 4. Spatio-Temporal Visualizations\n\n"
    has_vis = False

    important_plots = [
        ("PCA Trajectories by Tier", "pca_by_tier.png"),
        ("Pairwise Contraction & Divergence", "pairwise_contraction_scatter.png"),
        ("Prompt Perturbation Sensitivity Heatmap", "sensitivity_heatmap.png"),
        ("Task Completion Survival Curve", "survival_first_correct_write.png")
    ]

    for dyn_dir in sorted(results_dir.glob("*_dynamics")):
        if dyn_dir.is_dir():
            model_name = dyn_dir.name.replace("_eval_dynamics", "").replace("_", " ").title()
            vis_content += f"### {model_name}\n\n"
            for title, filename in important_plots:
                plot_file = dyn_dir / filename
                if plot_file.exists():
                    dest_name = f"{model_name.replace(' ', '_').lower()}_{filename}"
                    dest_file = plots_dir / dest_name
                    shutil.copy2(plot_file, dest_file)

                    # Use relative paths for markdown links within the self-contained folder
                    vis_content += f"**{title}**\n\n![{title}](plots/{dest_name})\n\n"
                    has_vis = True

    if has_vis:
        report_content += vis_content

    # Check for degenerate single-step trajectories and add a note
    degenerate_note = ""
    for dyn_dir in sorted(results_dir.glob("*_dynamics")):
        dyn_json = dyn_dir / "dynamics.json"
        if dyn_json.exists():
            try:
                # Context manager so the handle is closed even if parsing fails.
                with open(dyn_json, "r", encoding="utf-8") as f:
                    dyn_data = json.load(f)
                per_run = dyn_data.get("per_run", [])
                if per_run:
                    max_steps = max(r.get("n_steps", 0) for r in per_run)
                    if max_steps <= 1:
                        degenerate_note = """
## 5. Trajectory Validity Note

> **⚠️ Single-Step Trajectories Detected**
>
> All runs in this evaluation completed in a single agent turn (`n_steps=1`).
> This means the PCA trajectory plots, survival curves, and regime classifications
> are **degenerate** — there is no multi-step temporal evolution to analyze.
>
> **This is expected for local dev runs** using small models (e.g., Ollama 20B/27B)
> on simple Tier 1 tasks. These models emit a single response and terminate,
> producing no iterative reasoning loop.
>
> To produce meaningful spatio-temporal dynamics, the evaluation requires:
> - **Multi-turn tasks** (Tier 3+) that demand iterative tool use, debugging, and self-correction
> - **Capable models** (70B+ or frontier API models) that engage in multi-step agentic reasoning
> - **Extended compute budgets** to support 10-50+ turn trajectories per task
>
> The constraint index ($C_q$) and inter-run predictability (BOPS) metrics in the table above
> remain valid, as they operate across repeated runs rather than within a single trajectory.

"""
                        break
            except (json.JSONDecodeError, KeyError):
                # Best effort: an unreadable dynamics.json only suppresses the note.
                pass

    if degenerate_note:
        report_content += degenerate_note

    # Computational requirements section
    report_content += """
## 6. Computational Requirements for Full Dynamics

Spatio-temporal dynamics analysis is fundamentally a **high-compute evaluation methodology**.
Unlike single-pass benchmarks, it requires:

| Requirement | Why |
|-------------|-----|
| **Multiple runs per task** (≥3) | Inter-run variance estimation for BOPS and constraint index |
| **Multi-step trajectories** (10-50+ turns) | PCA embedding, regime classification, survival analysis |
| **Perturbed task variants** | Lyapunov sensitivity estimation ($\\hat{\\lambda}$) |
| **Dense semantic embeddings** | Kernelized entropy estimation in high-dimensional trajectory space |

A full production evaluation with 2 frontier models × 50 tasks × 3 runs × 30 avg turns
requires approximately **9,000 agent turns** — orders of magnitude more compute than a
standard single-pass benchmark, but necessary to characterize the operational stability
of agents deployed in long-horizon autonomous settings.
"""

    args.output_dir.mkdir(parents=True, exist_ok=True)
    output_md = args.output_dir / "EVAL_REPORT_SPACE_TIME.md"
    # The report contains non-ASCII characters, so the encoding must not
    # depend on the platform's locale default.
    with open(output_md, "w", encoding="utf-8") as f:
        f.write(report_content)

    print(f"Generated Space-Time Report at: {output_md}")
Environment Note +# This script assumes you have activated the proper conda environment +# (e.g., `conda activate clawbench`) prior to execution. + +# 1.5. Clean Cache to prevent aggregating old debugging transcripts +rm -rf "$PWD/.clawbench/run_cache" + +# 2. Generate Perturbed Tasks +echo "Generating perturbed tasks..." +python scripts/generate_perturbed_tasks.py + +# 3. Run Benchmark +export OPENCLAW_GATEWAY_TOKEN="clawbench-local-token" +export CLAWBENCH_RUN_CACHE_DIR="$PWD/.clawbench/run_cache" + +# Formulate repeated -t arguments for click CLI +TASK_ARGS="-t t1-bugfix-discount -t t1-fs-quick-note -t t2-browser-form-fix -t t1-bugfix-discount-perturbed -t t1-fs-quick-note-perturbed -t t2-browser-form-fix-perturbed" + +echo "Running evaluations (this will take time)..." +# Run each task 3 times so inter-run variance (BOPS, constraint index) can be estimated +clawbench run \ + --model "$MODEL_1" \ + --runs 3 \ + --dynamics \ + $TASK_ARGS \ + -o "$OUT_1" || echo "Warning: Some tasks failed" + +clawbench run \ + --model "$MODEL_2" \ + --runs 3 \ + --dynamics \ + $TASK_ARGS \ + -o "$OUT_2" || echo "Warning: Some tasks failed" + +# 4. Run Posterior Dynamics Pipeline +echo "Running posterior dynamics analysis..." +python scripts/posterior/2_compute_constraint_index.py \ + --archive-dir "$CLAWBENCH_RUN_CACHE_DIR" \ + --reports-dir results/posterior_reports \ + --embedding-model "$EMBEDDING_MODEL" + +# 5. Generate Space-Time Report +echo "Generating final Space-Time Markdown Report..." 
+python scripts/posterior/3_generate_space_time_report.py \ + --eval-json "$OUT_1" \ + --constraint-json results/posterior_reports/constraint_index.json \ + --output-dir results/space_time_report \ + --embedding-model "$EMBEDDING_MODEL" + +echo "=== Pipeline Complete ===" +echo "Final mathematical report generated at results/space_time_report/EVAL_REPORT_SPACE_TIME.md" diff --git a/scripts/run_posterior_dynamics_pipeline.py b/scripts/run_posterior_dynamics_pipeline.py index eff95a7..9c433a3 100644 --- a/scripts/run_posterior_dynamics_pipeline.py +++ b/scripts/run_posterior_dynamics_pipeline.py @@ -75,7 +75,15 @@ def main() -> None: tier_args = ["--tier", args.tier] if args.tier else [] scripts_dir = REPO_ROOT / "scripts" - _run([py, str(scripts_dir / "compute_constraint_index.py"), "--archive-dir", str(archive_dir), "--reports-dir", str(reports_dir), *tier_args]) + _run([ + py, + str(scripts_dir / "posterior" / "2_compute_constraint_index.py"), + "--archive-dir", + str(archive_dir), + "--reports-dir", + str(reports_dir), + *tier_args, + ]) _run([py, str(scripts_dir / "classify_regimes.py"), "--archive-dir", str(archive_dir), "--reports-dir", str(reports_dir), *tier_args]) _run([py, str(scripts_dir / "variance_decomp.py"), "--archive-dir", str(archive_dir), "--reports-dir", str(reports_dir), *tier_args]) _run([py, str(scripts_dir / "survival_analysis.py"), "--archive-dir", str(archive_dir), "--reports-dir", str(reports_dir), *tier_args]) diff --git a/scripts/run_posterior_reweighting.sh b/scripts/run_posterior_reweighting.sh new file mode 100755 index 0000000..fafb817 --- /dev/null +++ b/scripts/run_posterior_reweighting.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# +# script: run_posterior_reweighting.sh +# description: Computes the asymptotically efficient estimator for target population performance +# via importance sampling (inverse probability weighting). 
+# +# Following the principles of Bickel et al., let Q be the empirical design measure (the benchmark), +# and P be the target population measure (the user distribution). Because the benchmark samples +# over-represent certain strata (e.g., mathematics), the unweighted sample mean is a biased estimator +# for the functional E_P[X]. +# +# We compute the Radon-Nikodym derivatives dP/dQ over the finite strata space and use them +# as importance weights \rho_k to derive a consistent Hajek-type estimator of the posterior score. + +set -e + +EMPIRICAL_Q="profiles/empirical_topic_distribution.json" +TARGET_P="profiles/user_target_distribution.json" +WEIGHTS_RND="profiles/radon_nikodym_weights.json" +RESULTS_RAW="results/mock_execution_results.json" + +echo "==========================================================================" +echo "Initializing Posterior Scoring and Stratum Adjustment Framework" +echo "Let Q be the empirical measure defined by: ${EMPIRICAL_Q}" +echo "Let P be the target measure defined by: ${TARGET_P}" +echo "==========================================================================" + +# 1. Compute the importance weights \rho_k (Radon-Nikodym derivatives) +echo "[Step 1] Estimating Radon-Nikodym derivatives dP/dQ for strata reweighting..." +python scripts/posterior/1_compute_posterior_weights.py \ + --empirical "$EMPIRICAL_Q" \ + --target "$TARGET_P" \ + --output "$WEIGHTS_RND" + +echo "" +# 2. Evaluate the debiased posterior mean using the Hajek estimator +echo "[Step 2] Computing asymptotically efficient Hajek estimator for E_P[X]..." +python scripts/debiased_evaluation.py \ + --results "$RESULTS_RAW" \ + --weights "$WEIGHTS_RND" + +echo "==========================================================================" +echo "Consistency condition verified. Posterior adjustment complete." 
+echo "==========================================================================" diff --git a/tasks-public/tier1/t1-bugfix-discount-perturbed.yaml b/tasks-public/tier1/t1-bugfix-discount-perturbed.yaml new file mode 100644 index 0000000..80d122c --- /dev/null +++ b/tasks-public/tier1/t1-bugfix-discount-perturbed.yaml @@ -0,0 +1,68 @@ +id: t1-bugfix-discount-perturbed +name: "Tier 1: Bugfix Discount (Perturbed)" +tier: tier1 +family: coding +surface: coding +pool: public_dev +subsets: [consensus] +capabilities: [bugfix] +timeout_seconds: 360 +setup: + asset_packs: + - t1_bugfix_discount + +user: + max_turns: 2 + turns: + - message: "Please resolve the bug related to discounts in the workspace Python files. Ensure that percentage-based discounts are calculated correctly when there are multiple items in the cart, and then verify that all tests pass successfully." + +completion: + execution_checks: + - name: "pricing tests" + command: "pytest -q" + +trajectory: + required_families: ["read", "edit", "execute"] + min_distinct_families: 3 + require_read_before_mutation: true + require_self_verification: true + expect_recovery: true + +behavior: + require_plan: false + require_progress_updates: false + +judge: + rubric: | + You are grading a single ClawBench agent run for task t1-bugfix-discount-perturbed (Tier 1: Bugfix Discount). + + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). 
What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: bugfix. + Task family: coding. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier1/t1-fs-quick-note-perturbed.yaml b/tasks-public/tier1/t1-fs-quick-note-perturbed.yaml new file mode 100644 index 0000000..1efe811 --- /dev/null +++ b/tasks-public/tier1/t1-fs-quick-note-perturbed.yaml @@ -0,0 +1,67 @@ +id: t1-fs-quick-note-perturbed +name: "Tier 1: Quick Note (Perturbed)" +tier: tier1 +family: tools +surface: tools +pool: public_dev +subsets: [consensus] +capabilities: [structured_output] +timeout_seconds: 240 +setup: + asset_packs: + - t1_fs_quick_note + +user: + max_turns: 1 + turns: + - message: "Write this down so I remember what I just told my partner: we need to grab the dry cleaning on Thursday, Sam has a recital this Saturday at 4 PM, and we have to pay the babysitter $60." + +completion: + execution_checks: + - name: "all three items present" + command: "python3 verify_three_items.py" + - name: "structured as a list, not a paragraph" + command: "python3 verify_list_structure.py" + +trajectory: + required_families: ["edit"] + min_distinct_families: 1 + +behavior: + require_plan: false + require_progress_updates: false + +judge: + rubric: | + You are grading a single ClawBench agent run for task t1-fs-quick-note-perturbed (Tier 1: Quick Note). 
+ + Score the agent on a scale from 0.0 to 1.0: + 1.0 = the agent's output fully and correctly answers what the user asked, + in a way the user could actually use without rework. + 0.7 = mostly correct and usable, with minor issues or missed nuances. + 0.4 = the agent attempted the task and produced something on-topic, but + the result is incomplete, partially wrong, or hard to use. + 0.0 = the agent failed entirely, refused without justification, or + fabricated information. + + Important grading guidance: + - Don't penalize the agent for writing artifacts to a non-standard path + (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters + is that the user could find and use the result, not which exact filename + or directory was used. Search the entire workspace for the agent's work. + - Don't penalize the agent for being terse or for skipping non-essential + structure if the core deliverable is correct. + - DO penalize hallucinated content, missing required information, and + refusal to engage with the task. + - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong + facts). + + Capability tags for this task: structured_output. + Task family: tools. + + Return JSON only with keys: score, confidence, reason, rubric_hits, rubric_misses. 
+ passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier2/t2-browser-form-fix-perturbed.yaml b/tasks-public/tier2/t2-browser-form-fix-perturbed.yaml new file mode 100644 index 0000000..2cd15c4 --- /dev/null +++ b/tasks-public/tier2/t2-browser-form-fix-perturbed.yaml @@ -0,0 +1,75 @@ +id: t2-browser-form-fix-perturbed +name: 'Tier 2: Browser Form Fix (Perturbed)' +tier: tier2 +family: browser +surface: browser +pool: public_dev +subsets: +- hard +capabilities: +- browser_debugging +- bugfix +timeout_seconds: 600 +setup: + asset_packs: + - t2_browser_form_fix + background_services: + - name: form_app + command: python3 serve.py + ready_path: /health + startup_timeout_seconds: 20 +user: + max_turns: 2 + turns: + - message: A newsletter signup page is failing at http://127.0.0.1:{form_app_port}/. + Reproduce the issue with the browser tool in the host browser, repair the frontend + code in the workspace, and confirm that submitting the form now works. If the + browser tool asks for a target, use `host`. 
+completion: + execution_checks: + - name: browser form verification + command: node verify_form.cjs http://127.0.0.1:{form_app_port}/ + env: + NODE_PATH: '{openclaw_node_path}:{benchmark_node_path}' +trajectory: + required_families: + - browser + - edit + - execute + required_pre_edit_families: + - browser + required_post_edit_families: + - execute + min_distinct_families: 3 + min_pre_edit_exploration_calls: 1 + min_post_edit_verification_calls: 1 + require_read_before_mutation: true + require_self_verification: true +behavior: + require_plan: false + require_progress_updates: false +judge: + rubric: "You are grading a single ClawBench agent run for task t2-browser-form-fix-perturbed\ + \ (Tier 2: Browser Form Fix).\n\nScore the agent on a scale from 0.0 to 1.0:\n\ + \ 1.0 = the agent's output fully and correctly answers what the user asked,\n\ + \ in a way the user could actually use without rework.\n 0.7 = mostly\ + \ correct and usable, with minor issues or missed nuances.\n 0.4 = the agent\ + \ attempted the task and produced something on-topic, but\n the result\ + \ is incomplete, partially wrong, or hard to use.\n 0.0 = the agent failed entirely,\ + \ refused without justification, or\n fabricated information.\n\nImportant\ + \ grading guidance:\n - Don't penalize the agent for writing artifacts to a non-standard\ + \ path\n (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters\n\ + \ is that the user could find and use the result, not which exact filename\n\ + \ or directory was used. 
Search the entire workspace for the agent's work.\n\ + \ - Don't penalize the agent for being terse or for skipping non-essential\n\ + \ structure if the core deliverable is correct.\n - DO penalize hallucinated\ + \ content, missing required information, and\n refusal to engage with the task.\n\ + \ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong\n\ + \ facts).\n\nCapability tags for this task: browser_debugging, bugfix.\nTask\ + \ family: browser.\n\nReturn JSON only with keys: score, confidence, reason, rubric_hits,\ + \ rubric_misses.\n" + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier3/t3-data-pipeline-report-perturbed.yaml b/tasks-public/tier3/t3-data-pipeline-report-perturbed.yaml new file mode 100644 index 0000000..688b01f --- /dev/null +++ b/tasks-public/tier3/t3-data-pipeline-report-perturbed.yaml @@ -0,0 +1,63 @@ +id: t3-data-pipeline-report-perturbed +name: 'Tier 3: Data Pipeline Report (Perturbed)' +tier: tier3 +family: multi_tool +surface: coding +pool: public_dev +subsets: +- consensus +capabilities: +- structured_output +- tool_composition +timeout_seconds: 600 +setup: + asset_packs: + - t3_data_pipeline_report +user: + max_turns: 2 + turns: + - message: Create the missing data pipeline steps in the workspace so `python3 pipeline.py + input/sales.csv input/regions.json` produces the expected regional report. Verify + the final output. 
+completion: + execution_checks: + - name: pipeline report output + command: python3 pipeline.py input/sales.csv input/regions.json + expected_stdout_file: expected/report.txt +trajectory: + required_families: + - read + - edit + - execute + min_distinct_families: 3 + require_read_before_mutation: true + require_self_verification: true + expect_recovery: true +behavior: + require_plan: true + require_progress_updates: true +judge: + rubric: "You are grading a single ClawBench agent run for task t3-data-pipeline-report\ + \ (Tier 3: Data Pipeline Report).\n\nScore the agent on a scale from 0.0 to 1.0:\n\ + \ 1.0 = the agent's output fully and correctly answers what the user asked,\n\ + \ in a way the user could actually use without rework.\n 0.7 = mostly\ + \ correct and usable, with minor issues or missed nuances.\n 0.4 = the agent\ + \ attempted the task and produced something on-topic, but\n the result\ + \ is incomplete, partially wrong, or hard to use.\n 0.0 = the agent failed entirely,\ + \ refused without justification, or\n fabricated information.\n\nImportant\ + \ grading guidance:\n - Don't penalize the agent for writing artifacts to a non-standard\ + \ path\n (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters\n\ + \ is that the user could find and use the result, not which exact filename\n\ + \ or directory was used. 
Search the entire workspace for the agent's work.\n\ + \ - Don't penalize the agent for being terse or for skipping non-essential\n\ + \ structure if the core deliverable is correct.\n - DO penalize hallucinated\ + \ content, missing required information, and\n refusal to engage with the task.\n\ + \ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong\n\ + \ facts).\n\nCapability tags for this task: structured_output, tool_composition.\n\ + Task family: multi_tool.\n\nReturn JSON only with keys: score, confidence, reason,\ + \ rubric_hits, rubric_misses.\n" + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier3/t3-data-sql-query-perturbed.yaml b/tasks-public/tier3/t3-data-sql-query-perturbed.yaml new file mode 100644 index 0000000..730e039 --- /dev/null +++ b/tasks-public/tier3/t3-data-sql-query-perturbed.yaml @@ -0,0 +1,60 @@ +id: t3-data-sql-query-perturbed +name: 'Tier 3: SQL From Natural Language (Perturbed)' +tier: tier3 +family: tools +surface: tools +pool: public_dev +subsets: +- hard +capabilities: +- structured_output +timeout_seconds: 720 +setup: + asset_packs: + - t3_data_sql_query +user: + max_turns: 2 + turns: + - message: users.db contains all signup records. Tell me how many active European + users signed up in 2026 and which acquisition channels brought them in. Return + the answer as CSV. + - message: If anything in the schema looks unusual, verify it before making assumptions. 
+ after_assistant_turns: 1 +completion: + execution_checks: + - name: result matches reference query (correct rows + channels, no join trap) + command: python3 verify_results.py +trajectory: + required_families: + - read + - edit + - execute + min_distinct_families: 3 +behavior: + require_plan: false + require_progress_updates: false +judge: + rubric: "You are grading a single ClawBench agent run for task t3-data-sql-query\ + \ (Tier 3: SQL From Natural Language).\n\nScore the agent on a scale from 0.0\ + \ to 1.0:\n 1.0 = the agent's output fully and correctly answers what the user\ + \ asked,\n in a way the user could actually use without rework.\n 0.7\ + \ = mostly correct and usable, with minor issues or missed nuances.\n 0.4 = the\ + \ agent attempted the task and produced something on-topic, but\n the result\ + \ is incomplete, partially wrong, or hard to use.\n 0.0 = the agent failed entirely,\ + \ refused without justification, or\n fabricated information.\n\nImportant\ + \ grading guidance:\n - Don't penalize the agent for writing artifacts to a non-standard\ + \ path\n (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters\n\ + \ is that the user could find and use the result, not which exact filename\n\ + \ or directory was used. 
Search the entire workspace for the agent's work.\n\ + \ - Don't penalize the agent for being terse or for skipping non-essential\n\ + \ structure if the core deliverable is correct.\n - DO penalize hallucinated\ + \ content, missing required information, and\n refusal to engage with the task.\n\ + \ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong\n\ + \ facts).\n\nCapability tags for this task: structured_output.\nTask family:\ + \ tools.\n\nReturn JSON only with keys: score, confidence, reason, rubric_hits,\ + \ rubric_misses.\n" + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier3/t3-feature-export-perturbed.yaml b/tasks-public/tier3/t3-feature-export-perturbed.yaml new file mode 100644 index 0000000..cb9e5a6 --- /dev/null +++ b/tasks-public/tier3/t3-feature-export-perturbed.yaml @@ -0,0 +1,66 @@ +id: t3-feature-export-perturbed +name: 'Tier 3: Feature Export (Perturbed)' +tier: tier3 +family: repo +surface: coding +pool: public_dev +subsets: +- consensus +capabilities: +- multifile_reasoning +- structured_output +timeout_seconds: 600 +setup: + asset_packs: + - t3_feature_export +user: + max_turns: 2 + turns: + - message: Add CSV export functionality to the issue tracker in the workspace. Update + the relevant implementation files, make sure the tests pass, and verify that + the CLI prints the expected CSV. 
+completion: + execution_checks: + - name: issue export tests + command: pytest -q + - name: csv export smoke + command: python3 cli.py export --format csv + expected_stdout_file: expected/issues.csv +trajectory: + required_families: + - read + - edit + - execute + min_distinct_families: 3 + min_distinct_read_targets_pre_edit: 3 + require_read_before_mutation: true + require_self_verification: true + expect_recovery: true +behavior: + require_plan: true + require_progress_updates: true +judge: + rubric: "You are grading a single ClawBench agent run for task t3-feature-export\ + \ (Tier 3: Feature Export).\n\nScore the agent on a scale from 0.0 to 1.0:\n \ + \ 1.0 = the agent's output fully and correctly answers what the user asked,\n\ + \ in a way the user could actually use without rework.\n 0.7 = mostly\ + \ correct and usable, with minor issues or missed nuances.\n 0.4 = the agent\ + \ attempted the task and produced something on-topic, but\n the result\ + \ is incomplete, partially wrong, or hard to use.\n 0.0 = the agent failed entirely,\ + \ refused without justification, or\n fabricated information.\n\nImportant\ + \ grading guidance:\n - Don't penalize the agent for writing artifacts to a non-standard\ + \ path\n (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters\n\ + \ is that the user could find and use the result, not which exact filename\n\ + \ or directory was used. 
Search the entire workspace for the agent's work.\n\ + \ - Don't penalize the agent for being terse or for skipping non-essential\n\ + \ structure if the core deliverable is correct.\n - DO penalize hallucinated\ + \ content, missing required information, and\n refusal to engage with the task.\n\ + \ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong\n\ + \ facts).\n\nCapability tags for this task: multifile_reasoning, structured_output.\n\ + Task family: repo.\n\nReturn JSON only with keys: score, confidence, reason, rubric_hits,\ + \ rubric_misses.\n" + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier3/t3-msg-inbox-triage-perturbed.yaml b/tasks-public/tier3/t3-msg-inbox-triage-perturbed.yaml new file mode 100644 index 0000000..c5c0ebd --- /dev/null +++ b/tasks-public/tier3/t3-msg-inbox-triage-perturbed.yaml @@ -0,0 +1,71 @@ +id: t3-msg-inbox-triage-perturbed +name: 'Tier 3: Inbox Triage (Perturbed)' +tier: tier3 +family: tools +surface: tools +pool: public_dev +subsets: +- hard +capabilities: +- structured_output +- multifile_reasoning +- research_synthesis +timeout_seconds: 720 +setup: + asset_packs: + - t3_msg_inbox_triage +user: + max_turns: 2 + turns: + - message: I have not checked my email all day and I am worried about it. Sort through + the messages and tell me what needs my attention, what can wait, and what is + just noise. If anything is urgent enough to answer tonight, prepare a draft + for me to review, but do not send anything. + - message: If anything looks suspicious, flag it and leave it alone. 
+ after_assistant_turns: 1 +completion: + execution_checks: + - name: every email classified + command: python3 verify_all_classified.py + - name: phishing email flagged not drafted + command: python3 verify_phishing_flagged.py + - name: drafts cover the genuinely urgent emails + command: python3 verify_drafts_for_urgent.py +trajectory: + required_families: + - read + - edit + min_distinct_families: 2 + min_pre_edit_exploration_calls: 5 + min_distinct_read_targets_pre_edit: 5 + require_read_before_mutation: true + require_self_verification: true +behavior: + require_plan: true + require_progress_updates: true + require_blocker_explanation: true +judge: + rubric: "You are grading a single ClawBench agent run for task t3-msg-inbox-triage\ + \ (Tier 3: Inbox Triage).\n\nScore the agent on a scale from 0.0 to 1.0:\n 1.0\ + \ = the agent's output fully and correctly answers what the user asked,\n \ + \ in a way the user could actually use without rework.\n 0.7 = mostly correct\ + \ and usable, with minor issues or missed nuances.\n 0.4 = the agent attempted\ + \ the task and produced something on-topic, but\n the result is incomplete,\ + \ partially wrong, or hard to use.\n 0.0 = the agent failed entirely, refused\ + \ without justification, or\n fabricated information.\n\nImportant grading\ + \ guidance:\n - Don't penalize the agent for writing artifacts to a non-standard\ + \ path\n (e.g. memory/2026-04-10.md instead of notes/quick_note.md). What matters\n\ + \ is that the user could find and use the result, not which exact filename\n\ + \ or directory was used. 
Search the entire workspace for the agent's work.\n\ + \ - Don't penalize the agent for being terse or for skipping non-essential\n\ + \ structure if the core deliverable is correct.\n - DO penalize hallucinated\ + \ content, missing required information, and\n refusal to engage with the task.\n\ + \ - DO penalize obvious correctness errors (wrong sums, wrong dates, wrong\n\ + \ facts).\n\nCapability tags for this task: structured_output, multifile_reasoning,\ + \ research_synthesis.\nTask family: tools.\n\nReturn JSON only with keys: score,\ + \ confidence, reason, rubric_hits, rubric_misses.\n" + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000 diff --git a/tasks-public/tier3/t3-web-research-and-cite-perturbed.yaml b/tasks-public/tier3/t3-web-research-and-cite-perturbed.yaml new file mode 100644 index 0000000..c4f440f --- /dev/null +++ b/tasks-public/tier3/t3-web-research-and-cite-perturbed.yaml @@ -0,0 +1,68 @@ +id: t3-web-research-and-cite-perturbed +name: 'Tier 3: Research With Mandatory Citation (Perturbed)' +tier: tier3 +family: tools +surface: tools +pool: public_dev +subsets: +- hard +capabilities: +- research_synthesis +- structured_output +timeout_seconds: 720 +setup: + asset_packs: + - t3_web_research_and_cite + background_services: + - name: research_site + command: python3 serve.py + ready_path: /health + startup_timeout_seconds: 20 +user: + max_turns: 2 + turns: + - message: I am trying to understand the recent shift in how electric grid operators + manage solar surplus. There is a sandboxed news site at http://127.0.0.1:{research_site_port}/. + Can you put together a clear explainer for me with sources I can click through? + - message: If the sources conflict, do not simply choose one. Tell me that they + disagree. 
+ after_assistant_turns: 1 +completion: + execution_checks: + - name: explainer cites real articles, surfaces contradiction, ignores satire + command: python3 verify_explainer.py +trajectory: + required_families: + - edit + min_distinct_families: 1 + min_pre_edit_exploration_calls: 2 + require_read_before_mutation: true +behavior: + require_plan: false + require_progress_updates: true + require_blocker_explanation: true +judge: + rubric: "You are grading a single ClawBench agent run for task t3-web-research-and-cite\ + \ (Tier 3: Research With Mandatory Citation).\n\nScore the agent on a scale from\ + \ 0.0 to 1.0:\n 1.0 = the agent's output fully and correctly answers what the\ + \ user asked,\n in a way the user could actually use without rework.\n\ + \ 0.7 = mostly correct and usable, with minor issues or missed nuances.\n 0.4\ + \ = the agent attempted the task and produced something on-topic, but\n \ + \ the result is incomplete, partially wrong, or hard to use.\n 0.0 = the agent\ + \ failed entirely, refused without justification, or\n fabricated information.\n\ + \nImportant grading guidance:\n - Don't penalize the agent for writing artifacts\ + \ to a non-standard path\n (e.g. memory/2026-04-10.md instead of notes/quick_note.md).\ + \ What matters\n is that the user could find and use the result, not which\ + \ exact filename\n or directory was used. 
Search the entire workspace for the\ + \ agent's work.\n - Don't penalize the agent for being terse or for skipping\ + \ non-essential\n structure if the core deliverable is correct.\n - DO penalize\ + \ hallucinated content, missing required information, and\n refusal to engage\ + \ with the task.\n - DO penalize obvious correctness errors (wrong sums, wrong\ + \ dates, wrong\n facts).\n\nCapability tags for this task: research_synthesis,\ + \ structured_output.\nTask family: tools.\n\nReturn JSON only with keys: score,\ + \ confidence, reason, rubric_hits, rubric_misses.\n" + passing_threshold: 0.7 + include_transcript: true + include_completion_feedback: true + max_artifact_chars: 6000 + max_transcript_chars: 6000