From 2f39d3f086353261d71caf809c693c5e9cf84f6c Mon Sep 17 00:00:00 2001
From: jif-oai
Date: Wed, 22 Oct 2025 16:31:44 +0100
Subject: [PATCH 1/3] feat: use actual tokenizer for unified_exec truncation

---
 codex-rs/Cargo.lock                 |   1 +
 codex-rs/core/Cargo.toml            |   1 +
 codex-rs/core/src/truncate.rs       | 100 +++++++++++++++++-----------
 codex-rs/utils/tokenizer/src/lib.rs |   7 +-
 4 files changed, 68 insertions(+), 41 deletions(-)

diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock
index 507c7d7bf07..61124d2e328 100644
--- a/codex-rs/Cargo.lock
+++ b/codex-rs/Cargo.lock
@@ -1067,6 +1067,7 @@ dependencies = [
  "codex-rmcp-client",
  "codex-utils-pty",
  "codex-utils-string",
+ "codex-utils-tokenizer",
  "core-foundation 0.9.4",
  "core_test_support",
  "dirs",
diff --git a/codex-rs/core/Cargo.toml b/codex-rs/core/Cargo.toml
index fdc1136f08e..4cd6e703fdc 100644
--- a/codex-rs/core/Cargo.toml
+++ b/codex-rs/core/Cargo.toml
@@ -29,6 +29,7 @@ codex-rmcp-client = { workspace = true }
 codex-async-utils = { workspace = true }
 codex-utils-string = { workspace = true }
 codex-utils-pty = { workspace = true }
+codex-utils-tokenizer = { workspace = true }
 dirs = { workspace = true }
 dunce = { workspace = true }
 env-flags = { workspace = true }
diff --git a/codex-rs/core/src/truncate.rs b/codex-rs/core/src/truncate.rs
index ab0158720b5..87b439a7577 100644
--- a/codex-rs/core/src/truncate.rs
+++ b/codex-rs/core/src/truncate.rs
@@ -1,18 +1,35 @@
 //! Utilities for truncating large chunks of output while preserving a prefix
 //! and suffix on UTF-8 boundaries.
 
+use codex_utils_tokenizer::Tokenizer;
+
 /// Truncate the middle of a UTF-8 string to at most `max_bytes` bytes,
 /// preserving the beginning and the end. Returns the possibly truncated
-/// string and `Some(original_token_count)` (estimated at 4 bytes/token)
+/// string and `Some(original_token_count)` (counted with the local tokenizer;
+/// falls back to a 4-bytes-per-token estimate if the tokenizer cannot be loaded)
 /// if truncation occurred; otherwise returns the original string and `None`.
 pub(crate) fn truncate_middle(s: &str, max_bytes: usize) -> (String, Option<u64>) {
     if s.len() <= max_bytes {
         return (s.to_string(), None);
     }
 
-    let est_tokens = (s.len() as u64).div_ceil(4);
+    // Build a tokenizer for counting (o200k_base by default). If it cannot
+    // be loaded, fall back to a 4-bytes-per-token estimate.
+    let tok = Tokenizer::default().ok();
+    let token_count = |text: &str| -> u64 {
+        if let Some(ref t) = tok {
+            t.count(text) as u64
+        } else {
+            (text.len() as u64).div_ceil(4)
+        }
+    };
+
+    let total_tokens = token_count(s);
     if max_bytes == 0 {
-        return (format!("…{est_tokens} tokens truncated…"), Some(est_tokens));
+        return (
+            format!("…{total_tokens} tokens truncated…"),
+            Some(total_tokens),
+        );
     }
 
     fn truncate_on_boundary(input: &str, max_len: usize) -> &str {
@@ -50,13 +67,17 @@ pub(crate) fn truncate_middle(s: &str, max_bytes: usize) -> (String, Option<u64>
         idx
     }
 
-    let mut guess_tokens = est_tokens;
+    // Iterate until the marker length, keep budget, and boundaries stabilize.
+    let mut guess_tokens: u64 = 1;
     for _ in 0..4 {
         let marker = format!("…{guess_tokens} tokens truncated…");
         let marker_len = marker.len();
         let keep_budget = max_bytes.saturating_sub(marker_len);
         if keep_budget == 0 {
-            return (format!("…{est_tokens} tokens truncated…"), Some(est_tokens));
+            return (
+                format!("…{total_tokens} tokens truncated…"),
+                Some(total_tokens),
+            );
         }
 
         let left_budget = keep_budget / 2;
@@ -67,59 +88,72 @@ pub(crate) fn truncate_middle(s: &str, max_bytes: usize) -> (String, Option<u64>
             suffix_start = prefix_end;
         }
 
-        let kept_content_bytes = prefix_end + (s.len() - suffix_start);
-        let truncated_content_bytes = s.len().saturating_sub(kept_content_bytes);
-        let new_tokens = (truncated_content_bytes as u64).div_ceil(4);
+        // Count the tokens actually removed (the middle slice).
+        let removed_tokens = token_count(&s[prefix_end..suffix_start]);
 
-        if new_tokens == guess_tokens {
-            let mut out = String::with_capacity(marker_len + kept_content_bytes + 1);
+        // If the marker length is unchanged (the digit count is stable), we
+        // can finalize the output.
+        let final_marker = format!("…{removed_tokens} tokens truncated…");
+        if final_marker.len() == marker_len {
+            let kept_content_bytes = prefix_end + (s.len() - suffix_start);
+            let mut out = String::with_capacity(final_marker.len() + kept_content_bytes + 1);
             out.push_str(&s[..prefix_end]);
-            out.push_str(&marker);
+            out.push_str(&final_marker);
             out.push('\n');
             out.push_str(&s[suffix_start..]);
-            return (out, Some(est_tokens));
+            return (out, Some(total_tokens));
         }
-        guess_tokens = new_tokens;
+        guess_tokens = removed_tokens;
    }
 
+    // Fallback after the iteration cap: build the output with the last guess.
     let marker = format!("…{guess_tokens} tokens truncated…");
     let marker_len = marker.len();
     let keep_budget = max_bytes.saturating_sub(marker_len);
     if keep_budget == 0 {
-        return (format!("…{est_tokens} tokens truncated…"), Some(est_tokens));
+        return (
+            format!("…{total_tokens} tokens truncated…"),
+            Some(total_tokens),
+        );
     }
 
     let left_budget = keep_budget / 2;
     let right_budget = keep_budget - left_budget;
     let prefix_end = pick_prefix_end(s, left_budget);
-    let suffix_start = pick_suffix_start(s, right_budget);
+    let mut suffix_start = pick_suffix_start(s, right_budget);
+    if suffix_start < prefix_end {
+        suffix_start = prefix_end;
+    }
 
     let mut out = String::with_capacity(marker_len + prefix_end + (s.len() - suffix_start) + 1);
     out.push_str(&s[..prefix_end]);
     out.push_str(&marker);
     out.push('\n');
     out.push_str(&s[suffix_start..]);
-    (out, Some(est_tokens))
+    (out, Some(total_tokens))
 }
 
 #[cfg(test)]
 mod tests {
     use super::truncate_middle;
+    use codex_utils_tokenizer::Tokenizer;
 
     #[test]
     fn truncate_middle_no_newlines_fallback() {
+        let tok = Tokenizer::default().expect("load tokenizer");
         let s = "abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ*";
         let max_bytes = 32;
         let (out, original) = truncate_middle(s, max_bytes);
         assert!(out.starts_with("abc"));
         assert!(out.contains("tokens truncated"));
         assert!(out.ends_with("XYZ*"));
-        assert_eq!(original, Some((s.len() as u64).div_ceil(4)));
+        assert_eq!(original, Some(tok.count(s) as u64));
     }
 
     #[test]
     fn truncate_middle_prefers_newline_boundaries() {
+        let tok = Tokenizer::default().expect("load tokenizer");
         let mut s = String::new();
         for i in 1..=20 {
             s.push_str(&format!("{i:03}\n"));
         }
@@ -131,50 +165,36 @@ mod tests {
         assert!(out.starts_with("001\n002\n003\n004\n"));
         assert!(out.contains("tokens truncated"));
truncated")); assert!(out.ends_with("017\n018\n019\n020\n")); - assert_eq!(tokens, Some(20)); + assert_eq!(tokens, Some(tok.count(&s) as u64)); } #[test] fn truncate_middle_handles_utf8_content() { + let tok = Tokenizer::default().expect("load tokenizer"); let s = "😀😀😀😀😀😀😀😀😀😀\nsecond line with ascii text\n"; let max_bytes = 32; let (out, tokens) = truncate_middle(s, max_bytes); assert!(out.contains("tokens truncated")); assert!(!out.contains('\u{fffd}')); - assert_eq!(tokens, Some((s.len() as u64).div_ceil(4))); + assert_eq!(tokens, Some(tok.count(s) as u64)); } #[test] fn truncate_middle_prefers_newline_boundaries_2() { + let tok = Tokenizer::default().expect("load tokenizer"); // Build a multi-line string of 20 numbered lines (each "NNN\n"). let mut s = String::new(); for i in 1..=20 { s.push_str(&format!("{i:03}\n")); } - // Total length: 20 lines * 4 bytes per line = 80 bytes. assert_eq!(s.len(), 80); - // Choose a cap that forces truncation while leaving room for - // a few lines on each side after accounting for the marker. let max_bytes = 64; - // Expect exact output: first 4 lines, marker, last 4 lines, and correct token estimate (80/4 = 20). - assert_eq!( - truncate_middle(&s, max_bytes), - ( - r#"001 -002 -003 -004 -…12 tokens truncated… -017 -018 -019 -020 -"# - .to_string(), - Some(20) - ) - ); + let (out, total) = truncate_middle(&s, max_bytes); + assert!(out.starts_with("001\n002\n003\n004\n")); + assert!(out.contains("tokens truncated")); + assert!(out.ends_with("017\n018\n019\n020\n")); + assert_eq!(total, Some(tok.count(&s) as u64)); } } diff --git a/codex-rs/utils/tokenizer/src/lib.rs b/codex-rs/utils/tokenizer/src/lib.rs index 93740889d15..fbd53864f83 100644 --- a/codex-rs/utils/tokenizer/src/lib.rs +++ b/codex-rs/utils/tokenizer/src/lib.rs @@ -55,8 +55,13 @@ impl Tokenizer { Ok(Self { inner }) } + /// Default to `O200kBase` + pub fn default() -> Result { + Self::new(EncodingKind::O200kBase) + } + /// Build a tokenizer using an `OpenAI` model name (maps to an encoding). - /// Falls back to the `o200k_base` encoding when the model is unknown. + /// Falls back to the `O200kBase` encoding when the model is unknown. pub fn for_model(model: &str) -> Result { match tiktoken_rs::get_bpe_from_model(model) { Ok(inner) => Ok(Self { inner }), From 2ef71f468fcfe18ef8bf5a0fb6325fa405cc7d6f Mon Sep 17 00:00:00 2001 From: jif-oai Date: Wed, 22 Oct 2025 16:35:25 +0100 Subject: [PATCH 2/3] Clippy --- codex-rs/core/src/truncate.rs | 2 +- codex-rs/utils/tokenizer/src/lib.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/codex-rs/core/src/truncate.rs b/codex-rs/core/src/truncate.rs index 87b439a7577..a8bf1651fa3 100644 --- a/codex-rs/core/src/truncate.rs +++ b/codex-rs/core/src/truncate.rs @@ -15,7 +15,7 @@ pub(crate) fn truncate_middle(s: &str, max_bytes: usize) -> (String, Option // Build a tokenizer for counting (default to o200k_base; fall back to cl100k_base). // If both fail, fall back to a 4-bytes-per-token estimate. 
-    let tok = Tokenizer::default().ok();
+    let tok = Tokenizer::try_default().ok();
     let token_count = |text: &str| -> u64 {
         if let Some(ref t) = tok {
             t.count(text) as u64
diff --git a/codex-rs/utils/tokenizer/src/lib.rs b/codex-rs/utils/tokenizer/src/lib.rs
index fbd53864f83..6cda6e635f6 100644
--- a/codex-rs/utils/tokenizer/src/lib.rs
+++ b/codex-rs/utils/tokenizer/src/lib.rs
@@ -56,7 +56,7 @@ impl Tokenizer {
     }
 
     /// Defaults to the `O200kBase` encoding.
-    pub fn default() -> Result {
+    pub fn try_default() -> Result {
         Self::new(EncodingKind::O200kBase)
     }
 

From 6c204f40a7852f89323573dbb5c243b35c8c336c Mon Sep 17 00:00:00 2001
From: jif-oai
Date: Wed, 22 Oct 2025 16:36:04 +0100
Subject: [PATCH 3/3] Clippy 2

---
 codex-rs/core/src/truncate.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/codex-rs/core/src/truncate.rs b/codex-rs/core/src/truncate.rs
index a8bf1651fa3..3f0be8fcaf9 100644
--- a/codex-rs/core/src/truncate.rs
+++ b/codex-rs/core/src/truncate.rs
@@ -141,7 +141,7 @@ mod tests {
 
     #[test]
     fn truncate_middle_no_newlines_fallback() {
-        let tok = Tokenizer::default().expect("load tokenizer");
+        let tok = Tokenizer::try_default().expect("load tokenizer");
         let s = "abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ*";
         let max_bytes = 32;
         let (out, original) = truncate_middle(s, max_bytes);
@@ -153,7 +153,7 @@ mod tests {
 
     #[test]
     fn truncate_middle_prefers_newline_boundaries() {
-        let tok = Tokenizer::default().expect("load tokenizer");
+        let tok = Tokenizer::try_default().expect("load tokenizer");
         let mut s = String::new();
         for i in 1..=20 {
             s.push_str(&format!("{i:03}\n"));
@@ -170,7 +170,7 @@ mod tests {
 
     #[test]
     fn truncate_middle_handles_utf8_content() {
-        let tok = Tokenizer::default().expect("load tokenizer");
+        let tok = Tokenizer::try_default().expect("load tokenizer");
         let s = "😀😀😀😀😀😀😀😀😀😀\nsecond line with ascii text\n";
         let max_bytes = 32;
         let (out, tokens) = truncate_middle(s, max_bytes);
@@ -182,7 +182,7 @@ mod tests {
 
     #[test]
     fn truncate_middle_prefers_newline_boundaries_2() {
-        let tok = Tokenizer::default().expect("load tokenizer");
+        let tok = Tokenizer::try_default().expect("load tokenizer");
         // Build a multi-line string of 20 numbered lines (each "NNN\n").
         let mut s = String::new();
        for i in 1..=20 {
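
Illustrative note (not part of the patch series): the subtle piece in these patches is a small fixed-point iteration. The truncation marker embeds the removed-token count, the marker's byte length determines how many bytes of prefix and suffix can be kept, and the kept boundaries in turn determine how many tokens are removed. The sketch below is a simplified, self-contained rendering of that idea; it assumes ASCII-only slicing and uses the 4-bytes-per-token fallback counter in place of `codex-utils-tokenizer`, whereas the patched code also snaps boundaries to UTF-8 and newline positions via `pick_prefix_end`/`pick_suffix_start`:

    fn middle_truncate(s: &str, max_bytes: usize) -> String {
        if s.len() <= max_bytes {
            return s.to_string();
        }
        // Stand-in token counter, matching the patch's fallback path.
        let count = |t: &str| (t.len() as u64).div_ceil(4);
        let mut guess: u64 = 1;
        let mut out = String::new();
        for _ in 0..4 {
            // The marker embeds the guess, so its length depends on the guess.
            let marker = format!("...{guess} tokens truncated...");
            let keep = max_bytes.saturating_sub(marker.len());
            let (left, right) = (keep / 2, keep - keep / 2);
            // ASCII assumed: byte indices are char boundaries here.
            let removed = count(&s[left..s.len() - right]);
            out = format!("{}{}{}", &s[..left], marker, &s[s.len() - right..]);
            // Same digit count means the marker length, and hence the keep
            // budget, is stable: the output is consistent and we can stop.
            if format!("...{removed} tokens truncated...").len() == marker.len() {
                break;
            }
            guess = removed;
        }
        out
    }

In practice the digit count stabilizes after one or two rounds, which is why the patch caps the loop at four iterations and keeps a fallback build afterward.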