1 change: 1 addition & 0 deletions codex-rs/Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions codex-rs/core/Cargo.toml
@@ -29,6 +29,7 @@ codex-rmcp-client = { workspace = true }
codex-async-utils = { workspace = true }
codex-utils-string = { workspace = true }
codex-utils-pty = { workspace = true }
codex-utils-tokenizer = { workspace = true }
dirs = { workspace = true }
dunce = { workspace = true }
env-flags = { workspace = true }
100 changes: 60 additions & 40 deletions codex-rs/core/src/truncate.rs
@@ -1,18 +1,35 @@
//! Utilities for truncating large chunks of output while preserving a prefix
//! and suffix on UTF-8 boundaries.

use codex_utils_tokenizer::Tokenizer;

/// Truncate the middle of a UTF-8 string to at most `max_bytes` bytes,
/// preserving the beginning and the end. Returns the possibly truncated
/// string and `Some(original_token_count)` (estimated at 4 bytes/token)
/// string and `Some(original_token_count)` (counted with the local tokenizer;
/// falls back to a 4-bytes-per-token estimate if the tokenizer cannot load)
/// if truncation occurred; otherwise returns the original string and `None`.
pub(crate) fn truncate_middle(s: &str, max_bytes: usize) -> (String, Option<u64>) {
if s.len() <= max_bytes {
return (s.to_string(), None);
}

let est_tokens = (s.len() as u64).div_ceil(4);
    // Build a tokenizer for counting (defaults to o200k_base via `Tokenizer::try_default`).
    // If it cannot be loaded, fall back to a 4-bytes-per-token estimate.
let tok = Tokenizer::try_default().ok();
let token_count = |text: &str| -> u64 {
if let Some(ref t) = tok {
t.count(text) as u64
} else {
(text.len() as u64).div_ceil(4)
}
};

let total_tokens = token_count(s);
if max_bytes == 0 {
return (format!("…{est_tokens} tokens truncated…"), Some(est_tokens));
return (
format!("…{total_tokens} tokens truncated…"),
Some(total_tokens),
);
}

fn truncate_on_boundary(input: &str, max_len: usize) -> &str {
@@ -50,13 +67,17 @@ pub(crate) fn truncate_middle(s: &str, max_bytes: usize) -> (String, Option<u64>
idx
}

let mut guess_tokens = est_tokens;
// Iterate to stabilize marker length → keep budget → boundaries.
let mut guess_tokens: u64 = 1;
for _ in 0..4 {
let marker = format!("…{guess_tokens} tokens truncated…");
let marker_len = marker.len();
let keep_budget = max_bytes.saturating_sub(marker_len);
if keep_budget == 0 {
return (format!("…{est_tokens} tokens truncated…"), Some(est_tokens));
return (
format!("…{total_tokens} tokens truncated…"),
Some(total_tokens),
);
}

let left_budget = keep_budget / 2;
@@ -67,59 +88,72 @@ pub(crate) fn truncate_middle(s: &str, max_bytes: usize) -> (String, Option<u64>
suffix_start = prefix_end;
}

let kept_content_bytes = prefix_end + (s.len() - suffix_start);
let truncated_content_bytes = s.len().saturating_sub(kept_content_bytes);
let new_tokens = (truncated_content_bytes as u64).div_ceil(4);
// Tokens actually removed (middle slice) using the real tokenizer.
let removed_tokens = token_count(&s[prefix_end..suffix_start]);

if new_tokens == guess_tokens {
let mut out = String::with_capacity(marker_len + kept_content_bytes + 1);
// If the number of digits in the token count does not change the marker length,
// we can finalize output.
let final_marker = format!("…{removed_tokens} tokens truncated…");
if final_marker.len() == marker_len {
let kept_content_bytes = prefix_end + (s.len() - suffix_start);
let mut out = String::with_capacity(final_marker.len() + kept_content_bytes + 1);
out.push_str(&s[..prefix_end]);
out.push_str(&marker);
out.push_str(&final_marker);
out.push('\n');
out.push_str(&s[suffix_start..]);
return (out, Some(est_tokens));
return (out, Some(total_tokens));
}

guess_tokens = new_tokens;
guess_tokens = removed_tokens;
}

// Fallback build after iterations: compute with the last guess.
let marker = format!("…{guess_tokens} tokens truncated…");
let marker_len = marker.len();
let keep_budget = max_bytes.saturating_sub(marker_len);
if keep_budget == 0 {
return (format!("…{est_tokens} tokens truncated…"), Some(est_tokens));
return (
format!("…{total_tokens} tokens truncated…"),
Some(total_tokens),
);
}

let left_budget = keep_budget / 2;
let right_budget = keep_budget - left_budget;
let prefix_end = pick_prefix_end(s, left_budget);
let suffix_start = pick_suffix_start(s, right_budget);
let mut suffix_start = pick_suffix_start(s, right_budget);
if suffix_start < prefix_end {
suffix_start = prefix_end;
}

let mut out = String::with_capacity(marker_len + prefix_end + (s.len() - suffix_start) + 1);
out.push_str(&s[..prefix_end]);
out.push_str(&marker);
out.push('\n');
out.push_str(&s[suffix_start..]);
(out, Some(est_tokens))
(out, Some(total_tokens))
}

#[cfg(test)]
mod tests {
use super::truncate_middle;
use codex_utils_tokenizer::Tokenizer;

#[test]
fn truncate_middle_no_newlines_fallback() {
let tok = Tokenizer::try_default().expect("load tokenizer");
let s = "abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ*";
let max_bytes = 32;
let (out, original) = truncate_middle(s, max_bytes);
assert!(out.starts_with("abc"));
assert!(out.contains("tokens truncated"));
assert!(out.ends_with("XYZ*"));
assert_eq!(original, Some((s.len() as u64).div_ceil(4)));
assert_eq!(original, Some(tok.count(s) as u64));
}

#[test]
fn truncate_middle_prefers_newline_boundaries() {
let tok = Tokenizer::try_default().expect("load tokenizer");
let mut s = String::new();
for i in 1..=20 {
s.push_str(&format!("{i:03}\n"));
@@ -131,50 +165,36 @@ mod tests {
assert!(out.starts_with("001\n002\n003\n004\n"));
assert!(out.contains("tokens truncated"));
assert!(out.ends_with("017\n018\n019\n020\n"));
assert_eq!(tokens, Some(20));
assert_eq!(tokens, Some(tok.count(&s) as u64));
}

#[test]
fn truncate_middle_handles_utf8_content() {
let tok = Tokenizer::try_default().expect("load tokenizer");
let s = "😀😀😀😀😀😀😀😀😀😀\nsecond line with ascii text\n";
let max_bytes = 32;
let (out, tokens) = truncate_middle(s, max_bytes);

assert!(out.contains("tokens truncated"));
assert!(!out.contains('\u{fffd}'));
assert_eq!(tokens, Some((s.len() as u64).div_ceil(4)));
assert_eq!(tokens, Some(tok.count(s) as u64));
}

#[test]
fn truncate_middle_prefers_newline_boundaries_2() {
let tok = Tokenizer::try_default().expect("load tokenizer");
// Build a multi-line string of 20 numbered lines (each "NNN\n").
let mut s = String::new();
for i in 1..=20 {
s.push_str(&format!("{i:03}\n"));
}
// Total length: 20 lines * 4 bytes per line = 80 bytes.
assert_eq!(s.len(), 80);

// Choose a cap that forces truncation while leaving room for
// a few lines on each side after accounting for the marker.
let max_bytes = 64;
// Expect exact output: first 4 lines, marker, last 4 lines, and correct token estimate (80/4 = 20).
assert_eq!(
truncate_middle(&s, max_bytes),
(
r#"001
002
003
004
…12 tokens truncated…
017
018
019
020
"#
.to_string(),
Some(20)
)
);
let (out, total) = truncate_middle(&s, max_bytes);
assert!(out.starts_with("001\n002\n003\n004\n"));
assert!(out.contains("tokens truncated"));
assert!(out.ends_with("017\n018\n019\n020\n"));
assert_eq!(total, Some(tok.count(&s) as u64));
}
}
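
For reviewers skimming the diff, a minimal sketch of the behavior this change targets: token counts now come from the local tokenizer instead of a bytes/4 estimate. `truncate_middle` is `pub(crate)`, so the call is shown as it appears in the in-crate tests above; the crate and method names (`codex_utils_tokenizer::Tokenizer`, `try_default`, `count`) are taken from this PR, and everything else is illustrative.

```rust
// Sketch only: mirrors the in-crate tests above, assuming the
// codex-utils-tokenizer dependency added in this PR is available.
use codex_utils_tokenizer::Tokenizer;

fn main() {
    // 20 numbered lines, 4 bytes each => 80 bytes total.
    let mut s = String::new();
    for i in 1..=20 {
        s.push_str(&format!("{i:03}\n"));
    }

    // Counting now uses the real tokenizer rather than len() / 4.
    let tok = Tokenizer::try_default().expect("load o200k_base");
    println!("total tokens: {}", tok.count(&s));

    // Inside codex-core, the truncation call looks like:
    //     let (out, total) = truncate_middle(&s, 64);
    // `out` keeps a prefix and suffix around an "…N tokens truncated…"
    // marker (N = tokens removed from the middle), and `total` is
    // Some(tok.count(&s) as u64) whenever truncation occurred.
}
```

The digit-stabilization loop exists because the marker text embeds the removed-token count, and the number of digits in that count feeds back into how many bytes of content can be kept; iterating a few times lets the marker length and the keep budget converge before the output is assembled.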
7 changes: 6 additions & 1 deletion codex-rs/utils/tokenizer/src/lib.rs
@@ -55,8 +55,13 @@ impl Tokenizer {
Ok(Self { inner })
}

/// Default to `O200kBase`
pub fn try_default() -> Result<Self, TokenizerError> {
Self::new(EncodingKind::O200kBase)
}

/// Build a tokenizer using an `OpenAI` model name (maps to an encoding).
/// Falls back to the `o200k_base` encoding when the model is unknown.
/// Falls back to the `O200kBase` encoding when the model is unknown.
pub fn for_model(model: &str) -> Result<Self, TokenizerError> {
match tiktoken_rs::get_bpe_from_model(model) {
Ok(inner) => Ok(Self { inner }),
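
A short usage sketch for the two constructors touched in this file; `try_default`, `for_model`, and `count` are as shown in the diff, while the model string and assertions are only illustrative.

```rust
use codex_utils_tokenizer::Tokenizer;

fn demo() {
    // New convenience constructor: the o200k_base encoding.
    let default_tok = Tokenizer::try_default().expect("load o200k_base");
    assert!(default_tok.count("hello world") > 0);

    // Model-name lookup; unknown model names fall back to O200kBase.
    let model_tok = Tokenizer::for_model("gpt-4o").expect("resolve encoding for model");
    let _tokens = model_tok.count("hello world");
}
```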