From 6ea1aa0072829eb6d41922f5bd0ec6461c7f31f9 Mon Sep 17 00:00:00 2001
From: Ahmed Ibrahim
Date: Fri, 14 Nov 2025 14:41:57 -0800
Subject: [PATCH 01/18] fix compact

---
 codex-rs/core/src/compact.rs              |  5 ++++-
 codex-rs/core/templates/compact/prompt.md | 10 +---------
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/codex-rs/core/src/compact.rs b/codex-rs/core/src/compact.rs
index a1c053304781..0831b60d11c8 100644
--- a/codex-rs/core/src/compact.rs
+++ b/codex-rs/core/src/compact.rs
@@ -26,6 +26,7 @@ use futures::prelude::*;
 use tracing::error;
 
 pub const SUMMARIZATION_PROMPT: &str = include_str!("../templates/compact/prompt.md");
+pub const SUMMARY_PREFIX: &str = include_str!("../templates/compact/summary_prefix.md");
 const COMPACT_USER_MESSAGE_MAX_TOKENS: usize = 20_000;
 
 pub(crate) async fn run_inline_auto_compact_task(
@@ -140,7 +141,9 @@ async fn run_compact_task_inner(
     }
 
     let history_snapshot = sess.clone_history().await.get_history();
-    let summary_text = get_last_assistant_message_from_turn(&history_snapshot).unwrap_or_default();
+    let summary_suffix =
+        get_last_assistant_message_from_turn(&history_snapshot).unwrap_or_default();
+    let summary_text = format!("{SUMMARY_PREFIX}\n{summary_suffix}");
     let user_messages = collect_user_messages(&history_snapshot);
     let initial_context = sess.build_initial_context(turn_context.as_ref());
 
diff --git a/codex-rs/core/templates/compact/prompt.md b/codex-rs/core/templates/compact/prompt.md
index 42fae605db8a..0907835c94d8 100644
--- a/codex-rs/core/templates/compact/prompt.md
+++ b/codex-rs/core/templates/compact/prompt.md
@@ -1,9 +1 @@
-You are performing a CONTEXT CHECKPOINT COMPACTION. Create a handoff summary for another LLM that will resume the task.
-
-Include:
-- Current progress and key decisions made
-- Important context, constraints, or user preferences
-- What remains to be done (clear next steps)
-- Any critical data, examples, or references needed to continue
-
-Be concise, structured, and focused on helping the next LLM seamlessly continue the work.
+You have exceeded the maximum number of tokens, please stop and write a summary of your work for the next agent. Your note should summarize what you finished and what still needs work.
\ No newline at end of file

From 50a53333786249f03284067b997c41d84365c7cd Mon Sep 17 00:00:00 2001
From: Ahmed Ibrahim
Date: Fri, 14 Nov 2025 14:49:39 -0800
Subject: [PATCH 02/18] summary

---
 codex-rs/core/templates/compact/summary_prefix.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 codex-rs/core/templates/compact/summary_prefix.md

diff --git a/codex-rs/core/templates/compact/summary_prefix.md b/codex-rs/core/templates/compact/summary_prefix.md
new file mode 100644
index 000000000000..be6313b84720
--- /dev/null
+++ b/codex-rs/core/templates/compact/summary_prefix.md
@@ -0,0 +1 @@
+Another language model started to solve this problem and produced a summary of its thinking process. You also have access to the state of the tools that were used by that language model. Use this to build on the work that has already been done and avoid duplicating work. Here is the summary produced by the other language model, use the information in this summary to assist with your own analysis:
\ No newline at end of file

From a846760a8734386d5218c597ed23b140f79dc1a3 Mon Sep 17 00:00:00 2001
From: Ahmed Ibrahim
Date: Fri, 14 Nov 2025 16:52:50 -0800
Subject: [PATCH 03/18] fix compact

---
 codex-rs/core/src/codex.rs                    |  18 +-
 codex-rs/core/src/compact.rs                  |  12 +-
 codex-rs/core/tests/suite/compact.rs          | 481 +++++++++++++++++-
 .../core/tests/suite/compact_resume_fork.rs   |   3 +-
 4 files changed, 490 insertions(+), 24 deletions(-)

diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs
index a13c7a0c4dd1..52bf15003854 100644
--- a/codex-rs/core/src/codex.rs
+++ b/codex-rs/core/src/codex.rs
@@ -1819,7 +1819,6 @@ pub(crate) async fn run_task(
     // Although from the perspective of codex.rs, TurnDiffTracker has the lifecycle of a Task which contains
     // many turns, from the perspective of the user, it is a single turn.
     let turn_diff_tracker = Arc::new(tokio::sync::Mutex::new(TurnDiffTracker::new()));
-    let mut auto_compact_recently_attempted = false;
 
     loop {
         // Note that pending_input would be something like a message the user
@@ -1871,30 +1870,15 @@ pub(crate) async fn run_task(
         let token_limit_reached = total_usage_tokens
             .map(|tokens| tokens >= limit)
             .unwrap_or(false);
+        eprintln!("token_limit_reached: {token_limit_reached}");
 
         let (responses, items_to_record_in_conversation_history) =
             process_items(processed_items, &sess, &turn_context).await;
 
         if token_limit_reached {
-            if auto_compact_recently_attempted {
-                let limit_str = limit.to_string();
-                let current_tokens = total_usage_tokens
-                    .map(|tokens| tokens.to_string())
-                    .unwrap_or_else(|| "unknown".to_string());
-                let event = EventMsg::Error(ErrorEvent {
-                    message: format!(
-                        "Conversation is still above the token limit after automatic summarization (limit {limit_str}, current {current_tokens}). Please start a new session or trim your input."
- ), - }); - sess.send_event(&turn_context, event).await; - break; - } - auto_compact_recently_attempted = true; compact::run_inline_auto_compact_task(sess.clone(), turn_context.clone()).await; continue; } - auto_compact_recently_attempted = false; - if responses.is_empty() { last_agent_message = get_last_assistant_message_from_turn( &items_to_record_in_conversation_history, diff --git a/codex-rs/core/src/compact.rs b/codex-rs/core/src/compact.rs index 0831b60d11c8..6908faeec2a9 100644 --- a/codex-rs/core/src/compact.rs +++ b/codex-rs/core/src/compact.rs @@ -204,12 +204,22 @@ pub(crate) fn collect_user_messages(items: &[ResponseItem]) -> Vec { items .iter() .filter_map(|item| match crate::event_mapping::parse_turn_item(item) { - Some(TurnItem::UserMessage(user)) => Some(user.message()), + Some(TurnItem::UserMessage(user)) => { + if is_summary_message(&user.message()) { + None + } else { + Some(user.message()) + } + } _ => None, }) .collect() } +pub(crate) fn is_summary_message(message: &str) -> bool { + message.starts_with(format!("{SUMMARY_PREFIX}\n").as_str()) +} + pub(crate) fn build_compacted_history( initial_context: Vec, user_messages: &[String], diff --git a/codex-rs/core/tests/suite/compact.rs b/codex-rs/core/tests/suite/compact.rs index 9560a3ee9e93..64d8d308392b 100644 --- a/codex-rs/core/tests/suite/compact.rs +++ b/codex-rs/core/tests/suite/compact.rs @@ -1,8 +1,10 @@ +#![allow(clippy::expect_used)] use codex_core::CodexAuth; use codex_core::ConversationManager; use codex_core::ModelProviderInfo; use codex_core::NewConversation; use codex_core::built_in_model_providers; +use codex_core::compact::SUMMARY_PREFIX; use codex_core::config::Config; use codex_core::protocol::ErrorEvent; use codex_core::protocol::EventMsg; @@ -12,7 +14,10 @@ use codex_core::protocol::RolloutLine; use codex_core::protocol::WarningEvent; use codex_protocol::user_input::UserInput; use core_test_support::load_default_config_for_test; +use core_test_support::responses::ev_local_shell_call; +use core_test_support::responses::ev_reasoning_item; use core_test_support::skip_if_no_network; +use core_test_support::test_codex::test_codex; use core_test_support::wait_for_event; use core_test_support::wait_for_event_match; use std::collections::VecDeque; @@ -50,10 +55,8 @@ const DUMMY_FUNCTION_NAME: &str = "unsupported_tool"; const DUMMY_CALL_ID: &str = "call-multi-auto"; const FUNCTION_CALL_LIMIT_MSG: &str = "function call limit push"; const POST_AUTO_USER_MSG: &str = "post auto follow-up"; -const COMPACT_PROMPT_MARKER: &str = - "You are performing a CONTEXT CHECKPOINT COMPACTION for a tool."; -pub(super) const TEST_COMPACT_PROMPT: &str = - "You are performing a CONTEXT CHECKPOINT COMPACTION for a tool.\nTest-only compact prompt."; +const COMPACT_PROMPT_MARKER: &str = "You have exceeded the maximum number of tokens, please stop and write a summary of your work for the next agent. Your note should summarize what you finished and what still needs work."; +pub(super) const TEST_COMPACT_PROMPT: &str = "You have exceeded the maximum number of tokens, please stop and write a summary of your work for the next agent. Your note should summarize what you finished and what still needs work."; pub(super) const COMPACT_WARNING_MESSAGE: &str = "Heads up: Long conversations and multiple compactions can cause the model to be less accurate. 
Start a new conversation when possible to keep conversations small and targeted."; @@ -433,6 +436,476 @@ async fn manual_compact_emits_estimated_token_usage_event() { ); } +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() { + skip_if_no_network!(); + + let auto_compact_limit = 5000; + + let server = start_mock_server().await; + + let codex = test_codex() + .with_config(move |config| { + config.model_auto_compact_token_limit = Some(auto_compact_limit); + }) + .build(&server) + .await + .expect("build codex") + .codex; + + // user message + let user_message = "create an app"; + + codex + .submit(Op::UserInput { + items: vec![UserInput::Text { + text: user_message.into(), + }], + }) + .await + .expect("submit user input"); + + let token_count_used = 6000; + let token_count_used_after_compaction = 2000; + let model_reasoning_response_1_sse = sse(vec![ + ev_reasoning_item("m1", &["I will create a react app"], &[]), + ev_local_shell_call("r1-shell", "completed", vec!["echo", "make-react"]), + ev_completed_with_tokens("r1", token_count_used), + ]); + let model_compact_response_1_sse = sse(vec![ + ev_assistant_message( + "m2", + "The task is to create an app. I started to create a react app.", + ), + ev_completed_with_tokens("r2", token_count_used_after_compaction), + ]); + let model_reasoning_response_2_sse = sse(vec![ + ev_reasoning_item("m3", &["I will create a node app"], &[]), + ev_local_shell_call("r3-shell", "completed", vec!["echo", "make-node"]), + ev_completed_with_tokens("r3", token_count_used), + ]); + let model_compact_response_2_sse = sse(vec![ + ev_assistant_message( + "m4", + "The task is to create an app. I started to create a react app. then I realized that I need to create a node app.", + ), + ev_completed_with_tokens("r4", token_count_used_after_compaction), + ]); + let model_reasoning_response_3_sse = sse(vec![ + ev_reasoning_item("m6", &["I will create a python app"], &[]), + ev_local_shell_call("r6-shell", "completed", vec!["echo", "make-python"]), + ev_completed_with_tokens("r6", token_count_used), + ]); + let model_compact_response_3_sse = sse(vec![ + ev_assistant_message( + "m7", + "The task is to create an app. I started to create a react app. then I realized that I need to create a node app. then I realized that I need to create a python app.", + ), + ev_completed_with_tokens("r7", token_count_used_after_compaction), + ]); + let model_final_response_sse = sse(vec![ + ev_assistant_message( + "m8", + "The task is to create an app. I started to create a react app. then I realized that I need to create a node app. 
then I realized that I need to create a python app.", + ), + ev_completed_with_tokens("r8", token_count_used_after_compaction + 1000), + ]); + + let bodies = vec![ + model_reasoning_response_1_sse, + model_compact_response_1_sse, + model_reasoning_response_2_sse, + model_compact_response_2_sse, + model_reasoning_response_3_sse, + model_compact_response_3_sse, + model_final_response_sse, + ]; + + mount_sse_sequence(&server, bodies).await; + wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await; + + let requests_payloads = server.received_requests().await.unwrap(); + + let body = requests_payloads.clone()[0] + .body_json::() + .unwrap(); + let input = body.get("input").and_then(|v| v.as_array()).unwrap(); + let environment_message = input[0]["content"][0]["text"].as_str().unwrap(); + + // test 1: after compaction, we should have one environment message, one user message, and one user message with summary prefix + let compaction_indices = [2, 4, 6]; + for i in compaction_indices { + let body = requests_payloads.clone()[i] + .body_json::() + .unwrap(); + let input = body.get("input").and_then(|v| v.as_array()).unwrap(); + assert_eq!(input.len(), 3); + let environment_message = input[0]["content"][0]["text"].as_str().unwrap(); + let user_message = input[1]["content"][0]["text"].as_str().unwrap(); + let summary_prefix = input[2]["content"][0]["text"].as_str().unwrap(); + assert_eq!(environment_message, environment_message); + assert_eq!(user_message, "create an app"); + assert!(summary_prefix.starts_with(format!("{SUMMARY_PREFIX}\n").as_str())); + } + + let expected_requests_inputs = json!([ + [ + // 0 + { + "content": [ + { + "text": environment_message, + "type": "input_text" + } + ], + "role": "user", + "type": "message" + }, + { + "content": [ + { + "text": "create an app", + "type": "input_text" + } + ], + "role": "user", + "type": "message" + } + ] + , + [ + // 1 + { + "content": [ + { + "text": environment_message, + "type": "input_text" + } + ], + "role": "user", + "type": "message" + }, + { + "content": [ + { + "text": "create an app", + "type": "input_text" + } + ], + "role": "user", + "type": "message" + }, + { + "content": null, + "encrypted_content": null, + "summary": [ + { + "text": "I will create a react app", + "type": "summary_text" + } + ], + "type": "reasoning" + }, + { + "action": { + "command": [ + "echo", + "make-react" + ], + "env": null, + "timeout_ms": null, + "type": "exec", + "user": null, + "working_directory": null + }, + "call_id": "r1-shell", + "status": "completed", + "type": "local_shell_call" + }, + { + "call_id": "r1-shell", + "output": "execution error: Io(Os { code: 2, kind: NotFound, message: \"No such file or directory\" })", + "type": "function_call_output" + }, + { + "content": [ + { + "text": "You have exceeded the maximum number of tokens, please stop and write a summary of your work for the next agent. Your note should summarize what you finished and what still needs work.", + "type": "input_text" + } + ], + "role": "user", + "type": "message" + } + ] + , + [ + // 2 + { + "content": [ + { + "text": environment_message, + "type": "input_text" + } + ], + "role": "user", + "type": "message" + }, + { + "content": [ + { + "text": "create an app", + "type": "input_text" + } + ], + "role": "user", + "type": "message" + }, + { + "content": [ + { + "text": "Another language model started to solve this problem and produced a summary of its thinking process. 
You also have access to the state of the tools that were used by that language model. Use this to build on the work that has already been done and avoid duplicating work. Here is the summary produced by the other language model, use the information in this summary to assist with your own analysis:\nThe task is to create an app. I started to create a react app.", + "type": "input_text" + } + ], + "role": "user", + "type": "message" + } + ] + , + [ + // 3 + { + "content": [ + { + "text": environment_message, + "type": "input_text" + } + ], + "role": "user", + "type": "message" + }, + { + "content": [ + { + "text": "create an app", + "type": "input_text" + } + ], + "role": "user", + "type": "message" + }, + { + "content": [ + { + "text": "Another language model started to solve this problem and produced a summary of its thinking process. You also have access to the state of the tools that were used by that language model. Use this to build on the work that has already been done and avoid duplicating work. Here is the summary produced by the other language model, use the information in this summary to assist with your own analysis:\nThe task is to create an app. I started to create a react app.", + "type": "input_text" + } + ], + "role": "user", + "type": "message" + }, + { + "content": null, + "encrypted_content": null, + "summary": [ + { + "text": "I will create a node app", + "type": "summary_text" + } + ], + "type": "reasoning" + }, + { + "action": { + "command": [ + "echo", + "make-node" + ], + "env": null, + "timeout_ms": null, + "type": "exec", + "user": null, + "working_directory": null + }, + "call_id": "r3-shell", + "status": "completed", + "type": "local_shell_call" + }, + { + "call_id": "r3-shell", + "output": "execution error: Io(Os { code: 2, kind: NotFound, message: \"No such file or directory\" })", + "type": "function_call_output" + }, + { + "content": [ + { + "text": "You have exceeded the maximum number of tokens, please stop and write a summary of your work for the next agent. Your note should summarize what you finished and what still needs work.", + "type": "input_text" + } + ], + "role": "user", + "type": "message" + } + ] + , + // 4 + [ + { + "content": [ + { + "text": environment_message, + "type": "input_text" + } + ], + "role": "user", + "type": "message" + }, + { + "content": [ + { + "text": "create an app", + "type": "input_text" + } + ], + "role": "user", + "type": "message" + }, + { + "content": [ + { + "text": "Another language model started to solve this problem and produced a summary of its thinking process. You also have access to the state of the tools that were used by that language model. Use this to build on the work that has already been done and avoid duplicating work. Here is the summary produced by the other language model, use the information in this summary to assist with your own analysis:\nThe task is to create an app. I started to create a react app. then I realized that I need to create a node app.", + "type": "input_text" + } + ], + "role": "user", + "type": "message" + } + ] + , + [ + // 5 + { + "content": [ + { + "text": environment_message, + "type": "input_text" + } + ], + "role": "user", + "type": "message" + }, + { + "content": [ + { + "text": "create an app", + "type": "input_text" + } + ], + "role": "user", + "type": "message" + }, + { + "content": [ + { + "text": "Another language model started to solve this problem and produced a summary of its thinking process. 
You also have access to the state of the tools that were used by that language model. Use this to build on the work that has already been done and avoid duplicating work. Here is the summary produced by the other language model, use the information in this summary to assist with your own analysis:\nThe task is to create an app. I started to create a react app. then I realized that I need to create a node app.", + "type": "input_text" + } + ], + "role": "user", + "type": "message" + }, + { + "content": null, + "encrypted_content": null, + "summary": [ + { + "text": "I will create a python app", + "type": "summary_text" + } + ], + "type": "reasoning" + }, + { + "action": { + "command": [ + "echo", + "make-python" + ], + "env": null, + "timeout_ms": null, + "type": "exec", + "user": null, + "working_directory": null + }, + "call_id": "r6-shell", + "status": "completed", + "type": "local_shell_call" + }, + { + "call_id": "r6-shell", + "output": "execution error: Io(Os { code: 2, kind: NotFound, message: \"No such file or directory\" })", + "type": "function_call_output" + }, + { + "content": [ + { + "text": "You have exceeded the maximum number of tokens, please stop and write a summary of your work for the next agent. Your note should summarize what you finished and what still needs work.", + "type": "input_text" + } + ], + "role": "user", + "type": "message" + } + ] + , + [ + { + // 6 + "content": [ + { + "text": environment_message, + "type": "input_text" + } + ], + "role": "user", + "type": "message" + }, + { + "content": [ + { + "text": "create an app", + "type": "input_text" + } + ], + "role": "user", + "type": "message" + }, + { + "content": [ + { + "text": "Another language model started to solve this problem and produced a summary of its thinking process. You also have access to the state of the tools that were used by that language model. Use this to build on the work that has already been done and avoid duplicating work. Here is the summary produced by the other language model, use the information in this summary to assist with your own analysis:\nThe task is to create an app. I started to create a react app. then I realized that I need to create a node app. then I realized that I need to create a python app.", + "type": "input_text" + } + ], + "role": "user", + "type": "message" + } + ] + ]); + + assert_eq!(requests_payloads.len(), 7); + + for (i, request) in requests_payloads.iter().enumerate() { + let body = request.body_json::().unwrap(); + let input = body.get("input").and_then(|v| v.as_array()).unwrap(); + assert_eq!( + input.as_slice(), + expected_requests_inputs[i].as_array().unwrap().as_slice() + ); + } +} + // Windows CI only: bump to 4 workers to prevent SSE/event starvation and test timeouts. 
 #[cfg_attr(windows, tokio::test(flavor = "multi_thread", worker_threads = 4))]
 #[cfg_attr(not(windows), tokio::test(flavor = "multi_thread", worker_threads = 2))]
diff --git a/codex-rs/core/tests/suite/compact_resume_fork.rs b/codex-rs/core/tests/suite/compact_resume_fork.rs
index c3c1354bbfa5..c8163c805da2 100644
--- a/codex-rs/core/tests/suite/compact_resume_fork.rs
+++ b/codex-rs/core/tests/suite/compact_resume_fork.rs
@@ -38,8 +38,7 @@
 use tempfile::TempDir;
 use wiremock::MockServer;
 
 const AFTER_SECOND_RESUME: &str = "AFTER_SECOND_RESUME";
-const COMPACT_PROMPT_MARKER: &str =
-    "You are performing a CONTEXT CHECKPOINT COMPACTION for a tool.";
+const COMPACT_PROMPT_MARKER: &str = "You have exceeded the maximum number of tokens, please stop and write a summary of your work for the next agent. Your note should summarize what you finished and what still needs work.";
 
 fn network_disabled() -> bool {
     std::env::var(CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR).is_ok()

From 8c149e91142f6c826dda3f9eeb39d691432f0621 Mon Sep 17 00:00:00 2001
From: Ahmed Ibrahim
Date: Fri, 14 Nov 2025 16:55:08 -0800
Subject: [PATCH 04/18] fix compact

---
 codex-rs/core/tests/suite/compact.rs | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/codex-rs/core/tests/suite/compact.rs b/codex-rs/core/tests/suite/compact.rs
index 64d8d308392b..2a4feaec3f58 100644
--- a/codex-rs/core/tests/suite/compact.rs
+++ b/codex-rs/core/tests/suite/compact.rs
@@ -550,7 +550,7 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
 
     let expected_requests_inputs = json!([
         [
-            // 0
+            // 0: first request of the user message.
             {
                 "content": [
                     {
@@ -574,7 +574,7 @@
         ]
         ,
         [
-            // 1
+            // 1: first automatic compaction request.
            {
                 "content": [
                     {
@@ -640,7 +640,7 @@
         ]
         ,
         [
-            // 2
+            // 2: request after first automatic compaction.
             {
                 "content": [
                     {
@@ -674,7 +674,7 @@
         ]
         ,
         [
-            // 3
+            // 3: request for second automatic compaction.
             {
                 "content": [
                     {
@@ -749,7 +749,7 @@
             }
         ]
         ,
-        // 4
+        // 4: request after second automatic compaction.
         [
             {
                 "content": [
@@ -784,7 +784,7 @@
         ]
         ,
         [
-            // 5
+            // 5: request for third automatic compaction.
             {
                 "content": [
                     {
@@ -861,7 +861,7 @@
         ,
         [
             {
-                // 6
+                // 6: request after third automatic compaction.
"content": [ { "text": environment_message, From f6862b1544561c0bac9c2e14a341cd5ee18079a4 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Fri, 14 Nov 2025 17:04:36 -0800 Subject: [PATCH 05/18] remove eprintln --- codex-rs/core/src/codex.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index 52bf15003854..1f853bb42270 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -1870,7 +1870,6 @@ pub(crate) async fn run_task( let token_limit_reached = total_usage_tokens .map(|tokens| tokens >= limit) .unwrap_or(false); - eprintln!("token_limit_reached: {token_limit_reached}"); let (responses, items_to_record_in_conversation_history) = process_items(processed_items, &sess, &turn_context).await; From fd147482c0d35e6e01476a983f12ccf862afcb77 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Fri, 14 Nov 2025 17:13:07 -0800 Subject: [PATCH 06/18] remove eprintln --- codex-rs/core/src/codex.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index 1f853bb42270..dbde7a4e286c 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -1873,6 +1873,7 @@ pub(crate) async fn run_task( let (responses, items_to_record_in_conversation_history) = process_items(processed_items, &sess, &turn_context).await; + // as long as compaction works well in getting us way below the token limit, we shouldn't worry about being in an infinite loop. if token_limit_reached { compact::run_inline_auto_compact_task(sess.clone(), turn_context.clone()).await; continue; From fa8536caaccbc23db043ffc1e6d43950f7fa4f21 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Fri, 14 Nov 2025 17:14:01 -0800 Subject: [PATCH 07/18] remove eprintln --- codex-rs/core/templates/compact/summary_prefix.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codex-rs/core/templates/compact/summary_prefix.md b/codex-rs/core/templates/compact/summary_prefix.md index be6313b84720..62a7161b89b2 100644 --- a/codex-rs/core/templates/compact/summary_prefix.md +++ b/codex-rs/core/templates/compact/summary_prefix.md @@ -1 +1 @@ -Another language model started to solve this problem and produced a summary of its thinking process. You also have access to the state of the tools that were used by that language model. Use this to build on the work that has already been done and avoid duplicating work. Here is the summary produced by the other language model, use the information in this summary to assist with your own analysis: \ No newline at end of file +Another language model started to solve this problem and produced a summary of its thinking process. You also have access to the state of the tools that were used by that language model. Use this to build on the work that has already been done and avoid duplicating work. 
Here is the summary produced by the other language model, use the information in this summary to assist with your own analysis: \ No newline at end of file From 1dd1f10f00a3b8f2495264d8a63c2bd2b902de6a Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Fri, 14 Nov 2025 17:15:57 -0800 Subject: [PATCH 08/18] tests --- codex-rs/core/tests/suite/compact_resume_fork.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/codex-rs/core/tests/suite/compact_resume_fork.rs b/codex-rs/core/tests/suite/compact_resume_fork.rs index c8163c805da2..44d78e9bf041 100644 --- a/codex-rs/core/tests/suite/compact_resume_fork.rs +++ b/codex-rs/core/tests/suite/compact_resume_fork.rs @@ -80,8 +80,9 @@ fn extract_summary_message(request: &Value, summary_text: &str) -> Value { .and_then(Value::as_array) .and_then(|arr| arr.first()) .and_then(|entry| entry.get("text")) - .and_then(Value::as_str) - == Some(summary_text) + .and_then(Value::as_str). + .map(|text| text.contains(summary_text)) + .unwrap_or(false) }) }) .cloned() From c65c5a131528b5a738e1cad683435cb1343ae7ce Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Fri, 14 Nov 2025 17:30:23 -0800 Subject: [PATCH 09/18] update tests --- codex-rs/core/tests/suite/compact.rs | 81 +++++++++---------- .../core/tests/suite/compact_resume_fork.rs | 15 ++-- 2 files changed, 47 insertions(+), 49 deletions(-) diff --git a/codex-rs/core/tests/suite/compact.rs b/codex-rs/core/tests/suite/compact.rs index 2a4feaec3f58..822a90762fe6 100644 --- a/codex-rs/core/tests/suite/compact.rs +++ b/codex-rs/core/tests/suite/compact.rs @@ -4,6 +4,7 @@ use codex_core::ConversationManager; use codex_core::ModelProviderInfo; use codex_core::NewConversation; use codex_core::built_in_model_providers; +use codex_core::compact::SUMMARIZATION_PROMPT; use codex_core::compact::SUMMARY_PREFIX; use codex_core::config::Config; use codex_core::protocol::ErrorEvent; @@ -55,8 +56,6 @@ const DUMMY_FUNCTION_NAME: &str = "unsupported_tool"; const DUMMY_CALL_ID: &str = "call-multi-auto"; const FUNCTION_CALL_LIMIT_MSG: &str = "function call limit push"; const POST_AUTO_USER_MSG: &str = "post auto follow-up"; -const COMPACT_PROMPT_MARKER: &str = "You have exceeded the maximum number of tokens, please stop and write a summary of your work for the next agent. Your note should summarize what you finished and what still needs work."; -pub(super) const TEST_COMPACT_PROMPT: &str = "You have exceeded the maximum number of tokens, please stop and write a summary of your work for the next agent. Your note should summarize what you finished and what still needs work."; pub(super) const COMPACT_WARNING_MESSAGE: &str = "Heads up: Long conversations and multiple compactions can cause the model to be less accurate. Start a new conversation when possible to keep conversations small and targeted."; @@ -82,7 +81,7 @@ fn drop_call_id(value: &mut serde_json::Value) { } fn set_test_compact_prompt(config: &mut Config) { - config.compact_prompt = Some(TEST_COMPACT_PROMPT.to_string()); + config.compact_prompt = Some(SUMMARIZATION_PROMPT.to_string()); } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] @@ -110,13 +109,13 @@ async fn summarize_context_three_requests_and_instructions() { // Mount three expectations, one per request, matched by body content. 
     let first_matcher = |req: &wiremock::Request| {
         let body = std::str::from_utf8(&req.body).unwrap_or("");
-        body.contains("\"text\":\"hello world\"") && !body.contains(COMPACT_PROMPT_MARKER)
+        body.contains("\"text\":\"hello world\"") && !body.contains(SUMMARIZATION_PROMPT)
     };
     let first_request_mock = mount_sse_once_match(&server, first_matcher, sse1).await;
 
     let second_matcher = |req: &wiremock::Request| {
         let body = std::str::from_utf8(&req.body).unwrap_or("");
-        body.contains(COMPACT_PROMPT_MARKER)
+        body.contains(SUMMARIZATION_PROMPT)
     };
     let second_request_mock = mount_sse_once_match(&server, second_matcher, sse2).await;
@@ -200,7 +199,7 @@ async fn summarize_context_three_requests_and_instructions() {
     assert_eq!(last2.get("role").unwrap().as_str().unwrap(), "user");
     let text2 = last2["content"][0]["text"].as_str().unwrap();
     assert_eq!(
-        text2, TEST_COMPACT_PROMPT,
+        text2, SUMMARIZATION_PROMPT,
         "expected summarize trigger, got `{text2}`"
     );
@@ -257,7 +256,7 @@ async fn summarize_context_three_requests_and_instructions() {
     assert!(
         !messages
             .iter()
-            .any(|(_, text)| text.contains(TEST_COMPACT_PROMPT)),
+            .any(|(_, text)| text.contains(SUMMARIZATION_PROMPT)),
         "third request should not include the summarize trigger"
     );
@@ -361,7 +360,7 @@ async fn manual_compact_uses_custom_prompt() {
             if text == custom_prompt {
                 found_custom_prompt = true;
             }
-            if text == TEST_COMPACT_PROMPT {
+            if text == SUMMARIZATION_PROMPT {
                 found_default_prompt = true;
             }
         }
@@ -526,7 +525,7 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
     let requests_payloads = server.received_requests().await.unwrap();
 
-    let body = requests_payloads.clone()[0]
+    let body = requests_payloads[0]
         .body_json::<serde_json::Value>()
         .unwrap();
     let input = body.get("input").and_then(|v| v.as_array()).unwrap();
     let environment_message = input[0]["content"][0]["text"].as_str().unwrap();
@@ -630,7 +629,7 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
             {
                 "content": [
                     {
-                        "text": "You have exceeded the maximum number of tokens, please stop and write a summary of your work for the next agent. Your note should summarize what you finished and what still needs work.",
+                        "text": SUMMARIZATION_PROMPT,
                         "type": "input_text"
                     }
                 ],
@@ -664,7 +663,7 @@
             {
                 "content": [
                     {
-                        "text": "Another language model started to solve this problem and produced a summary of its thinking process. You also have access to the state of the tools that were used by that language model. Use this to build on the work that has already been done and avoid duplicating work. Here is the summary produced by the other language model, use the information in this summary to assist with your own analysis:\nThe task is to create an app. I started to create a react app.",
+                        "text": format!("{SUMMARY_PREFIX}\nThe task is to create an app. I started to create a react app."),
                         "type": "input_text"
                    }
                 ],
@@ -698,7 +697,7 @@
             {
                 "content": [
                     {
-                        "text": "Another language model started to solve this problem and produced a summary of its thinking process. You also have access to the state of the tools that were used by that language model. Use this to build on the work that has already been done and avoid duplicating work. Here is the summary produced by the other language model, use the information in this summary to assist with your own analysis:\nThe task is to create an app. I started to create a react app.",
+                        "text": format!("{SUMMARY_PREFIX}\nThe task is to create an app. I started to create a react app."),
                         "type": "input_text"
                     }
                 ],
@@ -740,7 +739,7 @@
             {
                 "content": [
                     {
-                        "text": "You have exceeded the maximum number of tokens, please stop and write a summary of your work for the next agent. Your note should summarize what you finished and what still needs work.",
+                        "text": SUMMARIZATION_PROMPT,
                         "type": "input_text"
                     }
                 ],
@@ -774,7 +773,7 @@
             {
                 "content": [
                     {
-                        "text": "Another language model started to solve this problem and produced a summary of its thinking process. You also have access to the state of the tools that were used by that language model. Use this to build on the work that has already been done and avoid duplicating work. Here is the summary produced by the other language model, use the information in this summary to assist with your own analysis:\nThe task is to create an app. I started to create a react app. then I realized that I need to create a node app.",
+                        "text": format!("{SUMMARY_PREFIX}\nThe task is to create an app. I started to create a react app. then I realized that I need to create a node app."),
                         "type": "input_text"
                     }
                 ],
@@ -808,7 +807,7 @@
             {
                 "content": [
                     {
-                        "text": "Another language model started to solve this problem and produced a summary of its thinking process. You also have access to the state of the tools that were used by that language model. Use this to build on the work that has already been done and avoid duplicating work. Here is the summary produced by the other language model, use the information in this summary to assist with your own analysis:\nThe task is to create an app. I started to create a react app. then I realized that I need to create a node app.",
+                        "text": format!("{SUMMARY_PREFIX}\nThe task is to create an app. I started to create a react app. then I realized that I need to create a node app."),
                         "type": "input_text"
                     }
                 ],
@@ -850,7 +849,7 @@
             {
                 "content": [
                     {
-                        "text": "You have exceeded the maximum number of tokens, please stop and write a summary of your work for the next agent. Your note should summarize what you finished and what still needs work.",
+                        "text": SUMMARIZATION_PROMPT,
                         "type": "input_text"
                     }
                 ],
@@ -884,7 +883,7 @@
             {
                 "content": [
                     {
-                        "text": "Another language model started to solve this problem and produced a summary of its thinking process. You also have access to the state of the tools that were used by that language model. Use this to build on the work that has already been done and avoid duplicating work. Here is the summary produced by the other language model, use the information in this summary to assist with your own analysis:\nThe task is to create an app. I started to create a react app. then I realized that I need to create a node app. then I realized that I need to create a python app.",
+                        "text": format!("{SUMMARY_PREFIX}\nThe task is to create an app. I started to create a react app. then I realized that I need to create a node app. then I realized that I need to create a python app."),
                         "type": "input_text"
                     }
                 ],
@@ -938,7 +937,7 @@ async fn auto_compact_runs_after_token_limit_hit() {
         let body = std::str::from_utf8(&req.body).unwrap_or("");
         body.contains(FIRST_AUTO_MSG)
             && !body.contains(SECOND_AUTO_MSG)
-            && !body.contains(COMPACT_PROMPT_MARKER)
+            && !body.contains(SUMMARIZATION_PROMPT)
     };
     mount_sse_once_match(&server, first_matcher, sse1).await;
@@ -946,27 +945,27 @@ async fn auto_compact_runs_after_token_limit_hit() {
     let second_matcher = |req: &wiremock::Request| {
         let body = std::str::from_utf8(&req.body).unwrap_or("");
         body.contains(SECOND_AUTO_MSG)
             && body.contains(FIRST_AUTO_MSG)
-            && !body.contains(COMPACT_PROMPT_MARKER)
+            && !body.contains(SUMMARIZATION_PROMPT)
     };
     mount_sse_once_match(&server, second_matcher, sse2).await;
 
     let third_matcher = |req: &wiremock::Request| {
         let body = std::str::from_utf8(&req.body).unwrap_or("");
-        body.contains(COMPACT_PROMPT_MARKER)
+        body.contains(SUMMARIZATION_PROMPT)
     };
     mount_sse_once_match(&server, third_matcher, sse3).await;
 
     let resume_matcher = |req: &wiremock::Request| {
         let body = std::str::from_utf8(&req.body).unwrap_or("");
         body.contains(AUTO_SUMMARY_TEXT)
-            && !body.contains(COMPACT_PROMPT_MARKER)
+            && !body.contains(SUMMARIZATION_PROMPT)
             && !body.contains(POST_AUTO_USER_MSG)
     };
     mount_sse_once_match(&server, resume_matcher, sse_resume).await;
 
     let fourth_matcher = |req: &wiremock::Request| {
         let body = std::str::from_utf8(&req.body).unwrap_or("");
-        body.contains(POST_AUTO_USER_MSG) && !body.contains(COMPACT_PROMPT_MARKER)
+        body.contains(POST_AUTO_USER_MSG) && !body.contains(SUMMARIZATION_PROMPT)
     };
     mount_sse_once_match(&server, fourth_matcher, sse4).await;
@@ -1030,7 +1029,7 @@ async fn auto_compact_runs_after_token_limit_hit() {
     let is_auto_compact = |req: &wiremock::Request| {
         std::str::from_utf8(&req.body)
             .unwrap_or("")
-            .contains(COMPACT_PROMPT_MARKER)
+            .contains(SUMMARIZATION_PROMPT)
     };
     let auto_compact_count = requests.iter().filter(|req| is_auto_compact(req)).count();
     assert_eq!(
@@ -1053,7 +1052,7 @@ async fn auto_compact_runs_after_token_limit_hit() {
         .find_map(|(idx, req)| {
             let body = std::str::from_utf8(&req.body).unwrap_or("");
             (body.contains(AUTO_SUMMARY_TEXT)
-                && !body.contains(COMPACT_PROMPT_MARKER)
+                && !body.contains(SUMMARIZATION_PROMPT)
                 && !body.contains(POST_AUTO_USER_MSG))
             .then_some(idx)
         })
@@ -1065,7 +1064,7 @@ async fn auto_compact_runs_after_token_limit_hit() {
         .rev()
         .find_map(|(idx, req)| {
             let body = std::str::from_utf8(&req.body).unwrap_or("");
-            (body.contains(POST_AUTO_USER_MSG) && !body.contains(COMPACT_PROMPT_MARKER))
+            (body.contains(POST_AUTO_USER_MSG) && !body.contains(SUMMARIZATION_PROMPT))
                 .then_some(idx)
         })
         .expect("follow-up request missing");
@@ -1112,7 +1111,7 @@ async fn auto_compact_runs_after_token_limit_hit() {
         .and_then(|text| text.as_str())
         .unwrap_or_default();
     assert_eq!(
-        last_text, TEST_COMPACT_PROMPT,
+        last_text, SUMMARIZATION_PROMPT,
         "auto compact should send the summarization prompt as a user message",
     );
@@ -1193,7 +1192,7 @@ async fn auto_compact_persists_rollout_entries() {
         let body = std::str::from_utf8(&req.body).unwrap_or("");
         body.contains(FIRST_AUTO_MSG)
             && !body.contains(SECOND_AUTO_MSG)
-            && !body.contains(COMPACT_PROMPT_MARKER)
+            && !body.contains(SUMMARIZATION_PROMPT)
     };
     mount_sse_once_match(&server, first_matcher, sse1).await;
@@ -1201,13 +1200,13 @@ async fn auto_compact_persists_rollout_entries() {
     let second_matcher = |req: &wiremock::Request| {
         let body = std::str::from_utf8(&req.body).unwrap_or("");
         body.contains(SECOND_AUTO_MSG)
             && body.contains(FIRST_AUTO_MSG)
-            && !body.contains(COMPACT_PROMPT_MARKER)
+            && !body.contains(SUMMARIZATION_PROMPT)
     };
     mount_sse_once_match(&server, second_matcher, sse2).await;
 
     let third_matcher = |req: &wiremock::Request| {
         let body = std::str::from_utf8(&req.body).unwrap_or("");
-        body.contains(COMPACT_PROMPT_MARKER)
+        body.contains(SUMMARIZATION_PROMPT)
     };
     mount_sse_once_match(&server, third_matcher, sse3).await;
@@ -1306,19 +1305,19 @@ async fn auto_compact_stops_after_failed_attempt() {
     let first_matcher = |req: &wiremock::Request| {
         let body = std::str::from_utf8(&req.body).unwrap_or("");
-        body.contains(FIRST_AUTO_MSG) && !body.contains(COMPACT_PROMPT_MARKER)
+        body.contains(FIRST_AUTO_MSG) && !body.contains(SUMMARIZATION_PROMPT)
     };
     mount_sse_once_match(&server, first_matcher, sse1.clone()).await;
 
     let second_matcher = |req: &wiremock::Request| {
         let body = std::str::from_utf8(&req.body).unwrap_or("");
-        body.contains(COMPACT_PROMPT_MARKER)
+        body.contains(SUMMARIZATION_PROMPT)
     };
     mount_sse_once_match(&server, second_matcher, sse2.clone()).await;
 
     let third_matcher = |req: &wiremock::Request| {
         let body = std::str::from_utf8(&req.body).unwrap_or("");
-        !body.contains(COMPACT_PROMPT_MARKER) && body.contains(SUMMARY_TEXT)
+        !body.contains(SUMMARIZATION_PROMPT) && body.contains(SUMMARY_TEXT)
     };
     mount_sse_once_match(&server, third_matcher, sse3.clone()).await;
@@ -1379,7 +1378,7 @@ async fn auto_compact_stops_after_failed_attempt() {
             .and_then(|items| items.first())
             .and_then(|entry| entry.get("text"))
             .and_then(|text| text.as_str())
-            .map(|text| text == TEST_COMPACT_PROMPT)
+            .map(|text| text == SUMMARIZATION_PROMPT)
             .unwrap_or(false)
     });
     assert!(
@@ -1486,7 +1485,7 @@ async fn manual_compact_retries_after_context_window_error() {
             .and_then(|items| items.first())
             .and_then(|entry| entry.get("text"))
             .and_then(|text| text.as_str()),
-        Some(TEST_COMPACT_PROMPT),
+        Some(SUMMARIZATION_PROMPT),
         "compact attempt should include summarization prompt"
     );
     assert_eq!(
@@ -1498,7 +1497,7 @@ async fn manual_compact_retries_after_context_window_error() {
             .and_then(|items| items.first())
             .and_then(|entry| entry.get("text"))
             .and_then(|text| text.as_str()),
-        Some(TEST_COMPACT_PROMPT),
+        Some(SUMMARIZATION_PROMPT),
         "retry attempt should include summarization prompt"
     );
     assert_eq!(
@@ -1643,13 +1642,13 @@ async fn manual_compact_twice_preserves_latest_user_messages() {
         "first turn request missing first user message"
     );
     assert!(
-        !contains_user_text(&first_turn_input, TEST_COMPACT_PROMPT),
+        !contains_user_text(&first_turn_input, SUMMARIZATION_PROMPT),
         "first turn request should not include summarization prompt"
     );
 
     let first_compact_input = requests[1].input();
     assert!(
-        contains_user_text(&first_compact_input, TEST_COMPACT_PROMPT),
+        contains_user_text(&first_compact_input, SUMMARIZATION_PROMPT),
         "first compact request should include summarization prompt"
     );
     assert!(
@@ -1670,7 +1669,7 @@ async fn manual_compact_twice_preserves_latest_user_messages() {
 
     let second_compact_input = requests[3].input();
     assert!(
-        contains_user_text(&second_compact_input, TEST_COMPACT_PROMPT),
+        contains_user_text(&second_compact_input, SUMMARIZATION_PROMPT),
         "second compact request should include summarization prompt"
     );
     assert!(
@@ -1841,7 +1840,7 @@ async fn auto_compact_allows_multiple_attempts_when_interleaved_with_other_turn_
         "first request should contain the user input"
     );
     assert!(
-        request_bodies[1].contains(COMPACT_PROMPT_MARKER),
+        request_bodies[1].contains(SUMMARIZATION_PROMPT),
         "first auto compact request should include the summarization prompt"
     );
     assert!(
@@ -1849,7 +1848,7 @@ async fn auto_compact_allows_multiple_attempts_when_interleaved_with_other_turn_
         "function call output should be sent before the second auto compact"
     );
     assert!(
-        request_bodies[4].contains(COMPACT_PROMPT_MARKER),
+        request_bodies[4].contains(SUMMARIZATION_PROMPT),
         "second auto compact request should include the summarization prompt"
     );
 }
@@ -1945,7 +1944,7 @@ async fn auto_compact_triggers_after_function_call_over_95_percent_usage() {
     let auto_compact_body = auto_compact_mock.single_request().body_json().to_string();
     assert!(
-        auto_compact_body.contains(COMPACT_PROMPT_MARKER),
+        auto_compact_body.contains(SUMMARIZATION_PROMPT),
         "auto compact request should include the summarization prompt after exceeding 95% (limit {limit})"
     );
 }
diff --git a/codex-rs/core/tests/suite/compact_resume_fork.rs b/codex-rs/core/tests/suite/compact_resume_fork.rs
index 44d78e9bf041..7ab0028f8144 100644
--- a/codex-rs/core/tests/suite/compact_resume_fork.rs
+++ b/codex-rs/core/tests/suite/compact_resume_fork.rs
@@ -10,13 +10,13 @@
 use super::compact::COMPACT_WARNING_MESSAGE;
 use super::compact::FIRST_REPLY;
 use super::compact::SUMMARY_TEXT;
-use super::compact::TEST_COMPACT_PROMPT;
 use codex_core::CodexAuth;
 use codex_core::CodexConversation;
 use codex_core::ConversationManager;
 use codex_core::ModelProviderInfo;
 use codex_core::NewConversation;
 use codex_core::built_in_model_providers;
+use codex_core::compact::SUMMARIZATION_PROMPT;
 use codex_core::config::Config;
 use codex_core::config::OPENAI_DEFAULT_MODEL;
 use codex_core::protocol::EventMsg;
@@ -38,8 +38,7 @@
 use tempfile::TempDir;
 use wiremock::MockServer;
 
 const AFTER_SECOND_RESUME: &str = "AFTER_SECOND_RESUME";
-const COMPACT_PROMPT_MARKER: &str = "You have exceeded the maximum number of tokens, please stop and write a summary of your work for the next agent. Your note should summarize what you finished and what still needs work.";
 
 fn network_disabled() -> bool {
     std::env::var(CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR).is_ok()
@@ -80,9 +79,9 @@ fn extract_summary_message(request: &Value, summary_text: &str) -> Value {
                 .and_then(Value::as_array)
                 .and_then(|arr| arr.first())
                 .and_then(|entry| entry.get("text"))
-                .and_then(Value::as_str).
+                .and_then(Value::as_str)
                 .map(|text| text.contains(summary_text))
                 .unwrap_or(false)
             })
         })
         .cloned()
@@ -283,7 +282,7 @@ async fn compact_resume_and_fork_preserve_model_history_view() {
             "content": [
                 {
                     "type": "input_text",
-                    "text": TEST_COMPACT_PROMPT
+                    "text": SUMMARIZATION_PROMPT
                 }
             ]
         }
@@ -741,7 +740,7 @@ async fn mount_initial_flow(server: &MockServer) {
     let match_first = |req: &wiremock::Request| {
         let body = std::str::from_utf8(&req.body).unwrap_or("");
         body.contains("\"text\":\"hello world\"")
-            && !body.contains(COMPACT_PROMPT_MARKER)
+            && !body.contains(SUMMARIZATION_PROMPT)
            && !body.contains(&format!("\"text\":\"{SUMMARY_TEXT}\""))
             && !body.contains("\"text\":\"AFTER_COMPACT\"")
             && !body.contains("\"text\":\"AFTER_RESUME\"")
@@ -751,7 +750,7 @@ async fn mount_initial_flow(server: &MockServer) {
 
     let match_compact = |req: &wiremock::Request| {
         let body = std::str::from_utf8(&req.body).unwrap_or("");
-        body.contains(COMPACT_PROMPT_MARKER)
+        body.contains(SUMMARIZATION_PROMPT)
     };
     mount_sse_once_match(server, match_compact, sse2).await;
@@ -785,7 +784,7 @@ async fn mount_second_compact_flow(server: &MockServer) {
 
     let match_second_compact = |req: &wiremock::Request| {
         let body = std::str::from_utf8(&req.body).unwrap_or("");
-        body.contains(COMPACT_PROMPT_MARKER) && body.contains("AFTER_FORK")
+        body.contains(SUMMARIZATION_PROMPT) && body.contains("AFTER_FORK")
     };
     mount_sse_once_match(server, match_second_compact, sse6).await;
@@ -806,7 +805,7 @@ async fn start_test_conversation(
     let home = TempDir::new().expect("create temp dir");
     let mut config = load_default_config_for_test(&home);
     config.model_provider = model_provider;
-    config.compact_prompt = Some(TEST_COMPACT_PROMPT.to_string());
+    config.compact_prompt = Some(SUMMARIZATION_PROMPT.to_string());
 
     let manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy"));
     let NewConversation { conversation, .. } = manager

From e97150671ac7d769e9daa04f9126758d3cddc7 Mon Sep 17 00:00:00 2001
From: Ahmed Ibrahim
Date: Fri, 14 Nov 2025 17:35:16 -0800
Subject: [PATCH 10/18] update tests

---
 codex-rs/core/tests/suite/compact.rs | 83 +++++++++++++++----------
 1 file changed, 52 insertions(+), 31 deletions(-)

diff --git a/codex-rs/core/tests/suite/compact.rs b/codex-rs/core/tests/suite/compact.rs
index 822a90762fe6..c4bffc2fec78 100644
--- a/codex-rs/core/tests/suite/compact.rs
+++ b/codex-rs/core/tests/suite/compact.rs
@@ -63,6 +63,10 @@ fn auto_summary(summary: &str) -> String {
     summary.to_string()
 }
 
+fn summary_with_prefix(summary: &str) -> String {
+    format!("{SUMMARY_PREFIX}\n{summary}")
+}
+
 fn drop_call_id(value: &mut serde_json::Value) {
     match value {
         serde_json::Value::Object(obj) => {
@@ -212,6 +216,7 @@ async fn summarize_context_three_requests_and_instructions() {
     );
 
     let mut messages: Vec<(String, String)> = Vec::new();
+    let expected_summary_message = summary_with_prefix(SUMMARY_TEXT);
 
     for item in input3 {
         if let Some("message") = item.get("type").and_then(|v| v.as_str()) {
@@ -250,7 +255,7 @@ async fn summarize_context_three_requests_and_instructions() {
     assert!(
         messages
             .iter()
-            .any(|(r, t)| r == "user" && t == SUMMARY_TEXT),
+            .any(|(r, t)| r == "user" && t == expected_summary_message),
         "third request should include the summary message"
     );
     assert!(
@@ -287,7 +292,7 @@ async fn summarize_context_three_requests_and_instructions() {
                 api_turn_count += 1;
             }
             RolloutItem::Compacted(ci) => {
-                if ci.message == SUMMARY_TEXT {
+                if ci.message == expected_summary_message {
                     saw_compacted_summary = true;
                 }
             }
@@ -454,6 +459,14 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
     // user message
     let user_message = "create an app";
 
+    // summary texts from model
+    let first_summary_text = "The task is to create an app. I started to create a react app.";
+    let second_summary_text = "The task is to create an app. I started to create a react app. then I realized that I need to create a node app.";
+    let third_summary_text = "The task is to create an app. I started to create a react app. then I realized that I need to create a node app. then I realized that I need to create a python app.";
+    // summary texts with prefix
+    let prefixed_first_summary = summary_with_prefix(first_summary_text);
+    let prefixed_second_summary = summary_with_prefix(second_summary_text);
+    let prefixed_third_summary = summary_with_prefix(third_summary_text);
 
     codex
         .submit(Op::UserInput {
@@ -485,10 +498,7 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
         ev_completed_with_tokens("r1", token_count_used),
     ]);
     let model_compact_response_1_sse = sse(vec![
-        ev_assistant_message(
-            "m2",
-            "The task is to create an app. I started to create a react app.",
-        ),
+        ev_assistant_message("m2", first_summary_text),
         ev_completed_with_tokens("r2", token_count_used_after_compaction),
     ]);
     let model_reasoning_response_2_sse = sse(vec![
@@ -497,10 +507,7 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
         ev_completed_with_tokens("r3", token_count_used),
     ]);
     let model_compact_response_2_sse = sse(vec![
-        ev_assistant_message(
-            "m4",
-            "The task is to create an app. I started to create a react app. then I realized that I need to create a node app.",
-        ),
+        ev_assistant_message("m4", second_summary_text),
         ev_completed_with_tokens("r4", token_count_used_after_compaction),
     ]);
     let model_reasoning_response_3_sse = sse(vec![
@@ -509,10 +516,7 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
         ev_completed_with_tokens("r6", token_count_used),
     ]);
     let model_compact_response_3_sse = sse(vec![
-        ev_assistant_message(
-            "m7",
-            "The task is to create an app. I started to create a react app. then I realized that I need to create a node app. then I realized that I need to create a python app.",
-        ),
+        ev_assistant_message("m7", third_summary_text),
         ev_completed_with_tokens("r7", token_count_used_after_compaction),
     ]);
     let model_final_response_sse = sse(vec![
@@ -533,7 +537,12 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
     // test 1: after compaction, we should have one environment message, one user message, and one user message with summary prefix
     let compaction_indices = [2, 4, 6];
-    for i in compaction_indices {
+    let expected_summaries = [
+        prefixed_first_summary.as_str(),
+        prefixed_second_summary.as_str(),
+        prefixed_third_summary.as_str(),
+    ];
+    for (i, expected_summary) in compaction_indices.into_iter().zip(expected_summaries) {
         let body = requests_payloads.clone()[i]
             .body_json::<serde_json::Value>()
             .unwrap();
@@ -544,10 +553,13 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
         let input = body.get("input").and_then(|v| v.as_array()).unwrap();
         assert_eq!(input.len(), 3);
         let environment_message = input[0]["content"][0]["text"].as_str().unwrap();
         let user_message = input[1]["content"][0]["text"].as_str().unwrap();
-        let summary_prefix = input[2]["content"][0]["text"].as_str().unwrap();
+        let summary_message = input[2]["content"][0]["text"].as_str().unwrap();
         assert_eq!(environment_message, environment_message);
         assert_eq!(user_message, "create an app");
-        assert!(summary_prefix.starts_with(format!("{SUMMARY_PREFIX}\n").as_str()));
+        assert_eq!(
+            summary_message, expected_summary,
+            "compaction request at index {i} should include the prefixed summary"
+        );
     }
@@ -675,7 +687,7 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
             {
                 "content": [
                     {
-                        "text": format!("{SUMMARY_PREFIX}\nThe task is to create an app. I started to create a react app."),
+                        "text": prefixed_first_summary.clone(),
                         "type": "input_text"
                     }
                 ],
@@ -709,7 +721,7 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
             {
                 "content": [
                     {
-                        "text": format!("{SUMMARY_PREFIX}\nThe task is to create an app. I started to create a react app."),
+                        "text": prefixed_first_summary.clone(),
                         "type": "input_text"
                     }
                 ],
@@ -785,7 +797,7 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
             {
                 "content": [
                     {
-                        "text": format!("{SUMMARY_PREFIX}\nThe task is to create an app. I started to create a react app. then I realized that I need to create a node app."),
+                        "text": prefixed_second_summary.clone(),
                         "type": "input_text"
                     }
                 ],
@@ -819,7 +831,7 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
             {
                 "content": [
                     {
-                        "text": format!("{SUMMARY_PREFIX}\nThe task is to create an app. I started to create a react app. then I realized that I need to create a node app."),
+                        "text": prefixed_second_summary.clone(),
                         "type": "input_text"
                     }
                 ],
@@ -895,7 +907,7 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
             {
                 "content": [
                     {
-                        "text": format!("{SUMMARY_PREFIX}\nThe task is to create an app. I started to create a react app. then I realized that I need to create a node app. then I realized that I need to create a python app."),
+                        "text": prefixed_third_summary.clone(),
                         "type": "input_text"
                     }
                 ],
@@ -944,6 +956,7 @@ async fn auto_compact_runs_after_token_limit_hit() {
         ev_assistant_message("m4", FINAL_REPLY),
         ev_completed_with_tokens("r4", 120),
     ]);
+    let prefixed_auto_summary = summary_with_prefix(AUTO_SUMMARY_TEXT);
 
     let first_matcher = |req: &wiremock::Request| {
@@ -968,9 +981,10 @@ async fn auto_compact_runs_after_token_limit_hit() {
     };
     mount_sse_once_match(&server, third_matcher, sse3).await;
 
-    let resume_matcher = |req: &wiremock::Request| {
+    let resume_marker = prefixed_auto_summary.clone();
+    let resume_matcher = move |req: &wiremock::Request| {
         let body = std::str::from_utf8(&req.body).unwrap_or("");
-        body.contains(AUTO_SUMMARY_TEXT)
+        body.contains(&resume_marker)
             && !body.contains(SUMMARIZATION_PROMPT)
             && !body.contains(POST_AUTO_USER_MSG)
     };
     mount_sse_once_match(&server, resume_matcher, sse_resume).await;
@@ -1046,12 +1060,13 @@ async fn auto_compact_runs_after_token_limit_hit() {
         "auto compact should add a third request"
     );
 
+    let resume_summary_marker = prefixed_auto_summary.clone();
     let resume_index = requests
         .iter()
         .enumerate()
         .find_map(|(idx, req)| {
             let body = std::str::from_utf8(&req.body).unwrap_or("");
-            (body.contains(AUTO_SUMMARY_TEXT)
+            (body.contains(&resume_summary_marker)
                 && !body.contains(SUMMARIZATION_PROMPT)
                 && !body.contains(POST_AUTO_USER_MSG))
             .then_some(idx)
         })
@@ -1135,7 +1141,7 @@ async fn auto_compact_runs_after_token_limit_hit() {
                 .and_then(|arr| arr.first())
                 .and_then(|entry| entry.get("text"))
                 .and_then(|v| v.as_str())
-                == Some(AUTO_SUMMARY_TEXT)
+                == Some(prefixed_auto_summary.as_str())
         }),
         "resume request should include compacted history"
     );
@@ -1170,7 +1176,9 @@ async fn auto_compact_runs_after_token_limit_hit() {
         "auto compact follow-up request should include the new user message"
     );
     assert!(
-        user_texts.iter().any(|text| text == AUTO_SUMMARY_TEXT),
+        user_texts
+            .iter()
+            .any(|text| text == prefixed_auto_summary.as_str()),
         "auto compact follow-up request should include the summary message"
     );
 }
@@ -1297,6 +1305,7 @@ async fn auto_compact_stops_after_failed_attempt() {
         ev_assistant_message("m2", &summary_payload),
         ev_completed_with_tokens("r2", 50),
     ]);
+    let prefixed_summary_text = summary_with_prefix(SUMMARY_TEXT);
 
     let sse3 = sse(vec![
         ev_assistant_message("m3", STILL_TOO_BIG_REPLY),
@@ -1315,9 +1333,10 @@ async fn auto_compact_stops_after_failed_attempt() {
     };
     mount_sse_once_match(&server, second_matcher, sse2.clone()).await;
 
-    let third_matcher = |req: &wiremock::Request| {
+    let summary_marker = prefixed_summary_text.clone();
+    let third_matcher = move |req: &wiremock::Request| {
         let body = std::str::from_utf8(&req.body).unwrap_or("");
-        !body.contains(SUMMARIZATION_PROMPT) && body.contains(SUMMARY_TEXT)
+        !body.contains(SUMMARIZATION_PROMPT) && body.contains(&summary_marker)
     };
     mount_sse_once_match(&server, third_matcher, sse3.clone()).await;
@@ -1525,6 +1544,8 @@ async fn manual_compact_twice_preserves_latest_user_messages() {
     let final_user_message = "post compact follow-up";
     let first_summary = "FIRST_MANUAL_SUMMARY";
     let second_summary = "SECOND_MANUAL_SUMMARY";
+    let expected_first_summary = summary_with_prefix(first_summary);
+    let expected_second_summary = summary_with_prefix(second_summary);
 
     let server = start_mock_server().await;
@@ -1704,7 +1725,7 @@ async fn manual_compact_twice_preserves_latest_user_messages() {
         }),
         json!({
             "content": vec![json!({
-                "text": first_summary,
+                "text": expected_first_summary,
                 "type": "input_text",
             })],
             "role": "user",
@@ -1720,7 +1741,7 @@ async fn manual_compact_twice_preserves_latest_user_messages() {
         json!({
             "content": vec![json!({
-                "text": second_summary,
+                "text": expected_second_summary,
                 "type": "input_text",
             })],
             "role": "user",

From 0b8c7babff48a02e4c76dafbef79cd713be0c07e Mon Sep 17 00:00:00 2001
From: Ahmed Ibrahim
Date: Fri, 14 Nov 2025 17:37:58 -0800
Subject: [PATCH 11/18] simplify

---
 codex-rs/core/tests/suite/compact.rs | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/codex-rs/core/tests/suite/compact.rs b/codex-rs/core/tests/suite/compact.rs
index c4bffc2fec78..3080305665d7 100644
--- a/codex-rs/core/tests/suite/compact.rs
+++ b/codex-rs/core/tests/suite/compact.rs
@@ -255,7 +255,7 @@ async fn summarize_context_three_requests_and_instructions() {
     assert!(
         messages
             .iter()
-            .any(|(r, t)| r == "user" && t == expected_summary_message),
+            .any(|(r, t)| r == "user" && t == &expected_summary_message),
         "third request should include the summary message"
     );
     assert!(
@@ -444,14 +444,9 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
     skip_if_no_network!();
 
-    let auto_compact_limit = 5000;
-
     let server = start_mock_server().await;
 
     let codex = test_codex()
-        .with_config(move |config| {
-            config.model_auto_compact_token_limit = Some(auto_compact_limit);
-        })
         .build(&server)
         .await
         .expect("build codex")
         .codex;
@@ -482,8 +477,8 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
         .await
         .expect("submit user input");
 
-    let token_count_used = 6000;
-    let token_count_used_after_compaction = 2000;
+    let token_count_used = 270_000;
+    let token_count_used_after_compaction = 80000;
     let model_reasoning_response_1_sse = sse(vec![
         ev_reasoning_item("m1", &["I will create a react app"], &[]),

From dad2d6da2c27b46f9591d2472357560e2dba6 Mon Sep 17 00:00:00 2001
From: Ahmed Ibrahim
Date: Fri, 14 Nov 2025 17:41:26 -0800
Subject: [PATCH 12/18] simplify

---
 codex-rs/core/tests/suite/compact.rs | 54 ++++++++++++++++++++--------
 1 file changed, 39 insertions(+), 15 deletions(-)

diff --git a/codex-rs/core/tests/suite/compact.rs b/codex-rs/core/tests/suite/compact.rs
index 3080305665d7..b27f3612fbd4 100644
--- a/codex-rs/core/tests/suite/compact.rs
+++ b/codex-rs/core/tests/suite/compact.rs
@@ -454,6 +454,9 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
 
     // user message
     let user_message = "create an app";
+
+    // Prepare the mock responses from the model
+
     // summary texts from model
     let first_summary_text = "The task is to create an app. I started to create a react app.";
     let second_summary_text = "The task is to create an app. I started to create a react app. then I realized that I need to create a node app.";
     let third_summary_text = "The task is to create an app. I started to create a react app. then I realized that I need to create a node app. then I realized that I need to create a python app.";
@@ -465,45 +468,53 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
     // summary texts with prefix
     let prefixed_first_summary = summary_with_prefix(first_summary_text);
     let prefixed_second_summary = summary_with_prefix(second_summary_text);
     let prefixed_third_summary = summary_with_prefix(third_summary_text);
-
-    codex
-        .submit(Op::UserInput {
-            items: vec![UserInput::Text {
-                text: user_message.into(),
-            }],
-        })
-        .await
-        .expect("submit user input");
-
-    let token_count_used = 270_000;
-    let token_count_used_after_compaction = 80000;
+    // token used count after long work
+    let token_count_used = 270_000;
+    // token used count after compaction
+    let token_count_used_after_compaction = 80000;
+
+    // mock responses from the model
+
+    // first chunk of work
     let model_reasoning_response_1_sse = sse(vec![
         ev_reasoning_item("m1", &["I will create a react app"], &[]),
         ev_local_shell_call("r1-shell", "completed", vec!["echo", "make-react"]),
         ev_completed_with_tokens("r1", token_count_used),
     ]);
+
+    // first compaction response
     let model_compact_response_1_sse = sse(vec![
         ev_assistant_message("m2", first_summary_text),
         ev_completed_with_tokens("r2", token_count_used_after_compaction),
     ]);
+
+    // second chunk of work
     let model_reasoning_response_2_sse = sse(vec![
         ev_reasoning_item("m3", &["I will create a node app"], &[]),
         ev_local_shell_call("r3-shell", "completed", vec!["echo", "make-node"]),
         ev_completed_with_tokens("r3", token_count_used),
     ]);
+
+    // second compaction response
     let model_compact_response_2_sse = sse(vec![
         ev_assistant_message("m4", second_summary_text),
         ev_completed_with_tokens("r4", token_count_used_after_compaction),
     ]);
+
+    // third chunk of work
     let model_reasoning_response_3_sse = sse(vec![
         ev_reasoning_item("m6", &["I will create a python app"], &[]),
         ev_local_shell_call("r6-shell", "completed", vec!["echo", "make-python"]),
         ev_completed_with_tokens("r6", token_count_used),
     ]);
+
+    // third compaction response
     let model_compact_response_3_sse = sse(vec![
         ev_assistant_message("m7", third_summary_text),
         ev_completed_with_tokens("r7", token_count_used_after_compaction),
     ]);
+
+    // final response
     let model_final_response_sse = sse(vec![
         ev_assistant_message(
             "m8",
@@ -520,6 +531,7 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
         ev_completed_with_tokens("r8", token_count_used_after_compaction + 1000),
     ]);
 
+    // mount the mock responses from the model
     let bodies = vec![
         model_reasoning_response_1_sse,
         model_compact_response_1_sse,
@@ -528,10 +540,20 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
         model_reasoning_response_3_sse,
         model_compact_response_3_sse,
         model_final_response_sse,
     ];
-
     mount_sse_sequence(&server, bodies).await;
+
+    // Start the conversation with the user message
+    codex
+        .submit(Op::UserInput {
+            items: vec![UserInput::Text {
+                text: user_message.into(),
+            }],
+        })
+        .await
+        .expect("submit user input");
     wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await;
 
+    // collect the requests payloads from the model
     let requests_payloads = server.received_requests().await.unwrap();
@@ -560,11 +582,13 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
         let input = body.get("input").and_then(|v| v.as_array()).unwrap();
         assert_eq!(input.len(), 3);
         let environment_message = input[0]["content"][0]["text"].as_str().unwrap();
-        let user_message = input[1]["content"][0]["text"].as_str().unwrap();
+        let user_message_received = input[1]["content"][0]["text"].as_str().unwrap();
        let summary_message = 
input[2]["content"][0]["text"].as_str().unwrap(); assert_eq!(environment_message, environment_message); - assert_eq!(user_message, "create an app"); + assert_eq!(user_message_received, user_message); assert_eq!( summary_message, expected_summary, "compaction request at index {i} should include the prefixed summary" ); } + // test 2: the expected requests inputs should be as follows: let expected_requests_inputs = json!([ [ // 0: first request of the user message. @@ -900,8 +923,6 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() { ] ]); - assert_eq!(requests_payloads.len(), 7); - for (i, request) in requests_payloads.iter().enumerate() { let body = request.body_json::().unwrap(); let input = body.get("input").and_then(|v| v.as_array()).unwrap(); @@ -910,6 +931,9 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() { expected_requests_inputs[i].as_array().unwrap().as_slice() ); } + + // test 3: the number of requests should be 7 + assert_eq!(requests_payloads.len(), 7); } // Windows CI only: bump to 4 workers to prevent SSE/event starvation and test timeouts. From 17f95d1661422912659f6cbd47e1e0f19f0fc689 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Fri, 14 Nov 2025 19:23:23 -0800 Subject: [PATCH 13/18] finally --- codex-rs/core/tests/suite/compact.rs | 134 ++------------------------- 1 file changed, 8 insertions(+), 126 deletions(-) diff --git a/codex-rs/core/tests/suite/compact.rs b/codex-rs/core/tests/suite/compact.rs index b27f3612fbd4..48d5429e31f4 100644 --- a/codex-rs/core/tests/suite/compact.rs +++ b/codex-rs/core/tests/suite/compact.rs @@ -7,7 +7,6 @@ use codex_core::built_in_model_providers; use codex_core::compact::SUMMARIZATION_PROMPT; use codex_core::compact::SUMMARY_PREFIX; use codex_core::config::Config; -use codex_core::protocol::ErrorEvent; use codex_core::protocol::EventMsg; use codex_core::protocol::Op; use codex_core::protocol::RolloutItem; @@ -44,7 +43,6 @@ const THIRD_USER_MSG: &str = "next turn"; const AUTO_SUMMARY_TEXT: &str = "AUTO_SUMMARY"; const FIRST_AUTO_MSG: &str = "token limit start"; const SECOND_AUTO_MSG: &str = "token limit push"; -const STILL_TOO_BIG_REPLY: &str = "STILL_TOO_BIG"; const MULTI_AUTO_MSG: &str = "multi auto"; const SECOND_LARGE_REPLY: &str = "SECOND_LARGE_REPLY"; const FIRST_AUTO_SUMMARY: &str = "FIRST_AUTO_SUMMARY"; @@ -963,7 +961,7 @@ async fn auto_compact_runs_after_token_limit_hit() { ev_assistant_message("m4", FINAL_REPLY), ev_completed_with_tokens("r4", 120), ]); - let prefixed_auto_summary = summary_with_prefix(AUTO_SUMMARY_TEXT); + let prefixed_auto_summary = AUTO_SUMMARY_TEXT; let first_matcher = |req: &wiremock::Request| { let body = std::str::from_utf8(&req.body).unwrap_or(""); @@ -987,10 +985,10 @@ async fn auto_compact_runs_after_token_limit_hit() { }; mount_sse_once_match(&server, third_matcher, sse3).await; - let resume_marker = prefixed_auto_summary.clone(); + let resume_marker = prefixed_auto_summary; let resume_matcher = move |req: &wiremock::Request| { let body = std::str::from_utf8(&req.body).unwrap_or(""); - body.contains(&resume_marker) + body.contains(resume_marker) && !body.contains(SUMMARIZATION_PROMPT) && !body.contains(POST_AUTO_USER_MSG) }; @@ -1079,13 +1077,13 @@ async fn auto_compact_runs_after_token_limit_hit() { "auto compact should add a third request" ); - let resume_summary_marker = prefixed_auto_summary.clone(); + let resume_summary_marker = prefixed_auto_summary; let resume_index = requests .iter() .enumerate() .find_map(|(idx, req)| { let body = 
std::str::from_utf8(&req.body).unwrap_or(""); - (body.contains(&resume_summary_marker) + (body.contains(resume_summary_marker) && !body.contains(SUMMARIZATION_PROMPT) && !body.contains(POST_AUTO_USER_MSG)) .then_some(idx) @@ -1160,7 +1158,8 @@ async fn auto_compact_runs_after_token_limit_hit() { .and_then(|arr| arr.first()) .and_then(|entry| entry.get("text")) .and_then(|v| v.as_str()) - == Some(prefixed_auto_summary.as_str()) + .map(|text| text.contains(prefixed_auto_summary)) + .unwrap_or(false) }), "resume request should include compacted history" ); @@ -1197,7 +1196,7 @@ async fn auto_compact_runs_after_token_limit_hit() { assert!( user_texts .iter() - .any(|text| text == prefixed_auto_summary.as_str()), + .any(|text| text.contains(prefixed_auto_summary)), "auto compact follow-up request should include the summary message" ); } @@ -1317,114 +1316,6 @@ async fn auto_compact_persists_rollout_entries() { ); } -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn auto_compact_stops_after_failed_attempt() { - skip_if_no_network!(); - - let server = start_mock_server().await; - - let sse1 = sse(vec![ - ev_assistant_message("m1", FIRST_REPLY), - ev_completed_with_tokens("r1", 500), - ]); - - let summary_payload = auto_summary(SUMMARY_TEXT); - let sse2 = sse(vec![ - ev_assistant_message("m2", &summary_payload), - ev_completed_with_tokens("r2", 50), - ]); - let prefixed_summary_text = summary_with_prefix(SUMMARY_TEXT); - - let sse3 = sse(vec![ - ev_assistant_message("m3", STILL_TOO_BIG_REPLY), - ev_completed_with_tokens("r3", 500), - ]); - - let first_matcher = |req: &wiremock::Request| { - let body = std::str::from_utf8(&req.body).unwrap_or(""); - body.contains(FIRST_AUTO_MSG) && !body.contains(SUMMARIZATION_PROMPT) - }; - mount_sse_once_match(&server, first_matcher, sse1.clone()).await; - - let second_matcher = |req: &wiremock::Request| { - let body = std::str::from_utf8(&req.body).unwrap_or(""); - body.contains(SUMMARIZATION_PROMPT) - }; - mount_sse_once_match(&server, second_matcher, sse2.clone()).await; - - let summary_marker = prefixed_summary_text.clone(); - let third_matcher = move |req: &wiremock::Request| { - let body = std::str::from_utf8(&req.body).unwrap_or(""); - !body.contains(SUMMARIZATION_PROMPT) && body.contains(&summary_marker) - }; - mount_sse_once_match(&server, third_matcher, sse3.clone()).await; - - let model_provider = ModelProviderInfo { - base_url: Some(format!("{}/v1", server.uri())), - ..built_in_model_providers()["openai"].clone() - }; - - let home = TempDir::new().unwrap(); - let mut config = load_default_config_for_test(&home); - config.model_provider = model_provider; - set_test_compact_prompt(&mut config); - config.model_auto_compact_token_limit = Some(200); - let conversation_manager = ConversationManager::with_auth(CodexAuth::from_api_key("dummy")); - let codex = conversation_manager - .new_conversation(config) - .await - .unwrap() - .conversation; - - codex - .submit(Op::UserInput { - items: vec![UserInput::Text { - text: FIRST_AUTO_MSG.into(), - }], - }) - .await - .unwrap(); - - let error_event = wait_for_event(&codex, |ev| matches!(ev, EventMsg::Error(_))).await; - let EventMsg::Error(ErrorEvent { message }) = error_event else { - panic!("expected error event"); - }; - assert!( - message.contains("limit"), - "error message should include limit information: {message}" - ); - wait_for_event(&codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await; - - let requests = server.received_requests().await.unwrap(); - assert_eq!( - 
requests.len(),
-        3,
-        "auto compact should attempt at most one summarization before erroring"
-    );
-
-    let last_body = requests[2].body_json::<serde_json::Value>().unwrap();
-    let input = last_body
-        .get("input")
-        .and_then(|v| v.as_array())
-        .unwrap_or_else(|| panic!("unexpected request format: {last_body}"));
-    let contains_prompt = input.iter().any(|item| {
-        item.get("type").and_then(|v| v.as_str()) == Some("message")
-            && item.get("role").and_then(|v| v.as_str()) == Some("user")
-            && item
-                .get("content")
-                .and_then(|v| v.as_array())
-                .and_then(|items| items.first())
-                .and_then(|entry| entry.get("text"))
-                .and_then(|text| text.as_str())
-                .map(|text| text == SUMMARIZATION_PROMPT)
-                .unwrap_or(false)
-    });
-    assert!(
-        !contains_prompt,
-        "third request should be the follow-up turn, not another summarization",
-    );
-}
-
 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
 async fn manual_compact_retries_after_context_window_error() {
     skip_if_no_network!();
@@ -1563,7 +1454,6 @@ async fn manual_compact_twice_preserves_latest_user_messages() {
     let final_user_message = "post compact follow-up";
     let first_summary = "FIRST_MANUAL_SUMMARY";
     let second_summary = "SECOND_MANUAL_SUMMARY";
-    let expected_first_summary = summary_with_prefix(first_summary);
     let expected_second_summary = summary_with_prefix(second_summary);

     let server = start_mock_server().await;
@@ -1742,14 +1632,6 @@ async fn manual_compact_twice_preserves_latest_user_messages() {
         "role": "user",
         "type": "message",
     }),
-    json!({
-        "content": vec![json!({
-            "text": expected_first_summary,
-            "type": "input_text",
-        })],
-        "role": "user",
-        "type": "message",
-    }),
     json!({
         "content": vec![json!({
             "text": second_user_message,

From 4b41773e926bf1e9eaa6e3c0d76606909db42119 Mon Sep 17 00:00:00 2001
From: Ahmed Ibrahim
Date: Fri, 14 Nov 2025 19:43:32 -0800
Subject: [PATCH 14/18] simplify

---
 codex-rs/core/tests/suite/compact.rs | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/codex-rs/core/tests/suite/compact.rs b/codex-rs/core/tests/suite/compact.rs
index 48d5429e31f4..008a88619733 100644
--- a/codex-rs/core/tests/suite/compact.rs
+++ b/codex-rs/core/tests/suite/compact.rs
@@ -921,13 +921,25 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() {
         ]
     ]);

+    // Ignore local shell call outputs because they differ from one OS to another and are out of scope for this test.
+ fn normalize_inputs(values: &[serde_json::Value]) -> Vec { + values + .iter() + .filter(|value| { + value + .get("type") + .and_then(|ty| ty.as_str()) + .is_none_or(|ty| ty != "local_shell_call") + }) + .cloned() + .collect() + } + for (i, request) in requests_payloads.iter().enumerate() { let body = request.body_json::().unwrap(); let input = body.get("input").and_then(|v| v.as_array()).unwrap(); - assert_eq!( - input.as_slice(), - expected_requests_inputs[i].as_array().unwrap().as_slice() - ); + let expected_input = expected_requests_inputs[i].as_array().unwrap(); + assert_eq!(normalize_inputs(input), normalize_inputs(expected_input)); } // test 3: the number of requests should be 7 From 575ee51497c8702fdca23bdccb9655e3777da100 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Fri, 14 Nov 2025 20:23:13 -0800 Subject: [PATCH 15/18] lint --- codex-rs/core/tests/suite/compact.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codex-rs/core/tests/suite/compact.rs b/codex-rs/core/tests/suite/compact.rs index 008a88619733..06fbaca6f626 100644 --- a/codex-rs/core/tests/suite/compact.rs +++ b/codex-rs/core/tests/suite/compact.rs @@ -929,7 +929,7 @@ async fn multiple_auto_compact_per_task_runs_after_token_limit_hit() { value .get("type") .and_then(|ty| ty.as_str()) - .is_none_or(|ty| ty != "local_shell_call") + .is_none_or(|ty| ty != "function_call_output") }) .cloned() .collect() From 7e2f49348e554c087f89ac5015df03b88b9419ee Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Fri, 14 Nov 2025 22:24:56 -0800 Subject: [PATCH 16/18] fix-compact --- codex-rs/core/templates/compact/prompt.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/codex-rs/core/templates/compact/prompt.md b/codex-rs/core/templates/compact/prompt.md index 0907835c94d8..42fae605db8a 100644 --- a/codex-rs/core/templates/compact/prompt.md +++ b/codex-rs/core/templates/compact/prompt.md @@ -1 +1,9 @@ -You have exceeded the maximum number of tokens, please stop and write a summary of your work for the next agent. Your note should summarize what you finished and what still needs work. \ No newline at end of file +You are performing a CONTEXT CHECKPOINT COMPACTION. Create a handoff summary for another LLM that will resume the task. + +Include: +- Current progress and key decisions made +- Important context, constraints, or user preferences +- What remains to be done (clear next steps) +- Any critical data, examples, or references needed to continue + +Be concise, structured, and focused on helping the next LLM seamlessly continue the work. 
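Patch 16 above restores the original multi-line compaction prompt, and that is what forces the matcher rework in the patch that follows: the raw body.contains(SUMMARIZATION_PROMPT) checks silently stop matching once the prompt contains newlines, because the JSON request body carries them as \n escapes. Below is a minimal standalone sketch (not part of the series) of the failure mode and of the escaping trick the next patch's json_fragment helper applies; the names prompt, body, and fragment are illustrative only, and the one assumed dependency is serde_json, which the suite already uses.

fn main() {
    // Stand-in for the restored multi-line summarization prompt.
    let prompt = "line one\nline two";

    // A serialized request body carries the real newline as the
    // two-character escape sequence \n inside the JSON string.
    let body = serde_json::json!({ "text": prompt }).to_string();

    // So a raw substring check against the unescaped prompt never matches.
    assert!(!body.contains(prompt));

    // Escaping the needle the same way, then dropping the surrounding
    // quotes, yields a fragment that does match the body.
    let fragment = serde_json::to_string(prompt)
        .expect("serialize text to JSON")
        .trim_matches('"')
        .to_string();
    assert!(body.contains(&fragment));
}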
From 6688e413f1bcb19e39ad14fdcacbf939ed06c0b6 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Fri, 14 Nov 2025 23:01:38 -0800 Subject: [PATCH 17/18] :( --- codex-rs/core/tests/suite/compact.rs | 45 +++++++++++-------- .../core/tests/suite/compact_resume_fork.rs | 17 +++++-- 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/codex-rs/core/tests/suite/compact.rs b/codex-rs/core/tests/suite/compact.rs index 06fbaca6f626..6164f14636ce 100644 --- a/codex-rs/core/tests/suite/compact.rs +++ b/codex-rs/core/tests/suite/compact.rs @@ -86,6 +86,17 @@ fn set_test_compact_prompt(config: &mut Config) { config.compact_prompt = Some(SUMMARIZATION_PROMPT.to_string()); } +fn body_contains_text(body: &str, text: &str) -> bool { + body.contains(&json_fragment(text)) +} + +fn json_fragment(text: &str) -> String { + serde_json::to_string(text) + .expect("serialize text to JSON") + .trim_matches('"') + .to_string() +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn summarize_context_three_requests_and_instructions() { skip_if_no_network!(); @@ -111,13 +122,13 @@ async fn summarize_context_three_requests_and_instructions() { // Mount three expectations, one per request, matched by body content. let first_matcher = |req: &wiremock::Request| { let body = std::str::from_utf8(&req.body).unwrap_or(""); - body.contains("\"text\":\"hello world\"") && !body.contains(SUMMARIZATION_PROMPT) + body.contains("\"text\":\"hello world\"") && !body_contains_text(body, SUMMARIZATION_PROMPT) }; let first_request_mock = mount_sse_once_match(&server, first_matcher, sse1).await; let second_matcher = |req: &wiremock::Request| { let body = std::str::from_utf8(&req.body).unwrap_or(""); - body.contains(SUMMARIZATION_PROMPT) + body_contains_text(body, SUMMARIZATION_PROMPT) }; let second_request_mock = mount_sse_once_match(&server, second_matcher, sse2).await; @@ -979,7 +990,7 @@ async fn auto_compact_runs_after_token_limit_hit() { let body = std::str::from_utf8(&req.body).unwrap_or(""); body.contains(FIRST_AUTO_MSG) && !body.contains(SECOND_AUTO_MSG) - && !body.contains(SUMMARIZATION_PROMPT) + && !body_contains_text(body, SUMMARIZATION_PROMPT) }; mount_sse_once_match(&server, first_matcher, sse1).await; @@ -987,13 +998,13 @@ async fn auto_compact_runs_after_token_limit_hit() { let body = std::str::from_utf8(&req.body).unwrap_or(""); body.contains(SECOND_AUTO_MSG) && body.contains(FIRST_AUTO_MSG) - && !body.contains(SUMMARIZATION_PROMPT) + && !body_contains_text(body, SUMMARIZATION_PROMPT) }; mount_sse_once_match(&server, second_matcher, sse2).await; let third_matcher = |req: &wiremock::Request| { let body = std::str::from_utf8(&req.body).unwrap_or(""); - body.contains(SUMMARIZATION_PROMPT) + body_contains_text(body, SUMMARIZATION_PROMPT) }; mount_sse_once_match(&server, third_matcher, sse3).await; @@ -1001,14 +1012,14 @@ async fn auto_compact_runs_after_token_limit_hit() { let resume_matcher = move |req: &wiremock::Request| { let body = std::str::from_utf8(&req.body).unwrap_or(""); body.contains(resume_marker) - && !body.contains(SUMMARIZATION_PROMPT) + && !body_contains_text(body, SUMMARIZATION_PROMPT) && !body.contains(POST_AUTO_USER_MSG) }; mount_sse_once_match(&server, resume_matcher, sse_resume).await; let fourth_matcher = |req: &wiremock::Request| { let body = std::str::from_utf8(&req.body).unwrap_or(""); - body.contains(POST_AUTO_USER_MSG) && !body.contains(SUMMARIZATION_PROMPT) + body.contains(POST_AUTO_USER_MSG) && !body_contains_text(body, SUMMARIZATION_PROMPT) }; 
mount_sse_once_match(&server, fourth_matcher, sse4).await; @@ -1070,9 +1081,7 @@ async fn auto_compact_runs_after_token_limit_hit() { requests.len() ); let is_auto_compact = |req: &wiremock::Request| { - std::str::from_utf8(&req.body) - .unwrap_or("") - .contains(SUMMARIZATION_PROMPT) + body_contains_text(std::str::from_utf8(&req.body).unwrap_or(""), SUMMARIZATION_PROMPT) }; let auto_compact_count = requests.iter().filter(|req| is_auto_compact(req)).count(); assert_eq!( @@ -1096,7 +1105,7 @@ async fn auto_compact_runs_after_token_limit_hit() { .find_map(|(idx, req)| { let body = std::str::from_utf8(&req.body).unwrap_or(""); (body.contains(resume_summary_marker) - && !body.contains(SUMMARIZATION_PROMPT) + && !body_contains_text(body, SUMMARIZATION_PROMPT) && !body.contains(POST_AUTO_USER_MSG)) .then_some(idx) }) @@ -1108,7 +1117,7 @@ async fn auto_compact_runs_after_token_limit_hit() { .rev() .find_map(|(idx, req)| { let body = std::str::from_utf8(&req.body).unwrap_or(""); - (body.contains(POST_AUTO_USER_MSG) && !body.contains(SUMMARIZATION_PROMPT)) + (body.contains(POST_AUTO_USER_MSG) && !body_contains_text(body, SUMMARIZATION_PROMPT)) .then_some(idx) }) .expect("follow-up request missing"); @@ -1239,7 +1248,7 @@ async fn auto_compact_persists_rollout_entries() { let body = std::str::from_utf8(&req.body).unwrap_or(""); body.contains(FIRST_AUTO_MSG) && !body.contains(SECOND_AUTO_MSG) - && !body.contains(SUMMARIZATION_PROMPT) + && !body_contains_text(body, SUMMARIZATION_PROMPT) }; mount_sse_once_match(&server, first_matcher, sse1).await; @@ -1247,13 +1256,13 @@ async fn auto_compact_persists_rollout_entries() { let body = std::str::from_utf8(&req.body).unwrap_or(""); body.contains(SECOND_AUTO_MSG) && body.contains(FIRST_AUTO_MSG) - && !body.contains(SUMMARIZATION_PROMPT) + && !body_contains_text(body, SUMMARIZATION_PROMPT) }; mount_sse_once_match(&server, second_matcher, sse2).await; let third_matcher = |req: &wiremock::Request| { let body = std::str::from_utf8(&req.body).unwrap_or(""); - body.contains(SUMMARIZATION_PROMPT) + body_contains_text(body, SUMMARIZATION_PROMPT) }; mount_sse_once_match(&server, third_matcher, sse3).await; @@ -1774,7 +1783,7 @@ async fn auto_compact_allows_multiple_attempts_when_interleaved_with_other_turn_ "first request should contain the user input" ); assert!( - request_bodies[1].contains(SUMMARIZATION_PROMPT), + body_contains_text(&request_bodies[1], SUMMARIZATION_PROMPT), "first auto compact request should include the summarization prompt" ); assert!( @@ -1782,7 +1791,7 @@ async fn auto_compact_allows_multiple_attempts_when_interleaved_with_other_turn_ "function call output should be sent before the second auto compact" ); assert!( - request_bodies[4].contains(SUMMARIZATION_PROMPT), + body_contains_text(&request_bodies[4], SUMMARIZATION_PROMPT), "second auto compact request should include the summarization prompt" ); } @@ -1878,7 +1887,7 @@ async fn auto_compact_triggers_after_function_call_over_95_percent_usage() { let auto_compact_body = auto_compact_mock.single_request().body_json().to_string(); assert!( - auto_compact_body.contains(SUMMARIZATION_PROMPT), + body_contains_text(&auto_compact_body, SUMMARIZATION_PROMPT), "auto compact request should include the summarization prompt after exceeding 95% (limit {limit})" ); } diff --git a/codex-rs/core/tests/suite/compact_resume_fork.rs b/codex-rs/core/tests/suite/compact_resume_fork.rs index 7ab0028f8144..e10f5748fbe0 100644 --- a/codex-rs/core/tests/suite/compact_resume_fork.rs +++ 
b/codex-rs/core/tests/suite/compact_resume_fork.rs @@ -43,6 +43,17 @@ fn network_disabled() -> bool { std::env::var(CODEX_SANDBOX_NETWORK_DISABLED_ENV_VAR).is_ok() } +fn body_contains_text(body: &str, text: &str) -> bool { + body.contains(&json_fragment(text)) +} + +fn json_fragment(text: &str) -> String { + serde_json::to_string(text) + .expect("serialize text to JSON") + .trim_matches('"') + .to_string() +} + fn filter_out_ghost_snapshot_entries(items: &[Value]) -> Vec { items .iter() @@ -740,7 +751,7 @@ async fn mount_initial_flow(server: &MockServer) { let match_first = |req: &wiremock::Request| { let body = std::str::from_utf8(&req.body).unwrap_or(""); body.contains("\"text\":\"hello world\"") - && !body.contains(SUMMARIZATION_PROMPT) + && !body_contains_text(body, SUMMARIZATION_PROMPT) && !body.contains(&format!("\"text\":\"{SUMMARY_TEXT}\"")) && !body.contains("\"text\":\"AFTER_COMPACT\"") && !body.contains("\"text\":\"AFTER_RESUME\"") @@ -750,7 +761,7 @@ async fn mount_initial_flow(server: &MockServer) { let match_compact = |req: &wiremock::Request| { let body = std::str::from_utf8(&req.body).unwrap_or(""); - body.contains(SUMMARIZATION_PROMPT) + body_contains_text(body, SUMMARIZATION_PROMPT) }; mount_sse_once_match(server, match_compact, sse2).await; @@ -784,7 +795,7 @@ async fn mount_second_compact_flow(server: &MockServer) { let match_second_compact = |req: &wiremock::Request| { let body = std::str::from_utf8(&req.body).unwrap_or(""); - body.contains(SUMMARIZATION_PROMPT) && body.contains("AFTER_FORK") + body_contains_text(body, SUMMARIZATION_PROMPT) && body.contains("AFTER_FORK") }; mount_sse_once_match(server, match_second_compact, sse6).await; From 55b12ceaa5ce57ea6a263b0269ca4a4cf59dde92 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Fri, 14 Nov 2025 23:03:38 -0800 Subject: [PATCH 18/18] tighten_panic_double_truncation --- codex-rs/core/tests/suite/compact.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/codex-rs/core/tests/suite/compact.rs b/codex-rs/core/tests/suite/compact.rs index 6164f14636ce..9808a406104e 100644 --- a/codex-rs/core/tests/suite/compact.rs +++ b/codex-rs/core/tests/suite/compact.rs @@ -1081,7 +1081,10 @@ async fn auto_compact_runs_after_token_limit_hit() { requests.len() ); let is_auto_compact = |req: &wiremock::Request| { - body_contains_text(std::str::from_utf8(&req.body).unwrap_or(""), SUMMARIZATION_PROMPT) + body_contains_text( + std::str::from_utf8(&req.body).unwrap_or(""), + SUMMARIZATION_PROMPT, + ) }; let auto_compact_count = requests.iter().filter(|req| is_auto_compact(req)).count(); assert_eq!(
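One helper the whole suite leans on, summary_with_prefix, is never defined in these hunks. Judging by the expectations it replaced earlier in the series (strings built as format!("{SUMMARY_PREFIX}\n...")), it presumably looks like the reconstruction below; treat the body as a hedged assumption rather than the committed code.

use codex_core::compact::SUMMARY_PREFIX;

// Hypothetical reconstruction of the suite's summary_with_prefix helper; the
// real definition lives in a part of compact.rs these hunks do not show. The
// shape mirrors the asserted expectations: the prefix template, a newline,
// then the model-produced summary.
fn summary_with_prefix(summary: &str) -> String {
    format!("{SUMMARY_PREFIX}\n{summary}")
}

Under that assumption, summary_with_prefix("The task is to create an app. I started to create a react app.") produces exactly the prefixed text the compaction requests are asserted against.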