fix: #1916 openai_chatcompletions.Converter.extract_all_content does not support input_audio type items (#1918)

seratch · web-flow · commit 886b9fcbbf67 · 2025-10-18T06:33:34.000+09:00
diff --git a/src/agents/models/chatcmpl_converter.py b/src/agents/models/chatcmpl_converter.py
@@ -8,6 +8,7 @@
 from openai.types.chat import (
     ChatCompletionAssistantMessageParam,
     ChatCompletionContentPartImageParam,
+    ChatCompletionContentPartInputAudioParam,
     ChatCompletionContentPartParam,
     ChatCompletionContentPartTextParam,
     ChatCompletionDeveloperMessageParam,
@@ -27,6 +28,7 @@
     ResponseFileSearchToolCallParam,
     ResponseFunctionToolCall,
     ResponseFunctionToolCallParam,
+    ResponseInputAudioParam,
     ResponseInputContentParam,
     ResponseInputFileParam,
     ResponseInputImageParam,
@@ -287,6 +289,32 @@ def extract_all_content(
                         },
                     )
                 )
+            elif isinstance(c, dict) and c.get("type") == "input_audio":
+                casted_audio_param = cast(ResponseInputAudioParam, c)
+                audio_payload = casted_audio_param.get("input_audio")
+                if not audio_payload:
+                    raise UserError(
+                        f"Only audio data is supported for input_audio {casted_audio_param}"
+                    )
+                if not isinstance(audio_payload, dict):
+                    raise UserError(
+                        f"input_audio must provide audio data and format {casted_audio_param}"
+                    )
+                audio_data = audio_payload.get("data")
+                audio_format = audio_payload.get("format")
+                if not audio_data or not audio_format:
+                    raise UserError(
+                        f"input_audio requires both data and format {casted_audio_param}"
+                    )
+                out.append(
+                    ChatCompletionContentPartInputAudioParam(
+                        type="input_audio",
+                        input_audio={
+                            "data": audio_data,
+                            "format": audio_format,
+                        },
+                    )
+                )
             elif isinstance(c, dict) and c.get("type") == "input_file":
                 casted_file_param = cast(ResponseInputFileParam, c)
                 if "file_data" not in casted_file_param or not casted_file_param["file_data"]:
diff --git a/tests/test_openai_chatcompletions_converter.py b/tests/test_openai_chatcompletions_converter.py
@@ -32,6 +32,7 @@
 from openai.types.responses import (
     ResponseFunctionToolCall,
     ResponseFunctionToolCallParam,
+    ResponseInputAudioParam,
     ResponseInputTextParam,
     ResponseOutputMessage,
     ResponseOutputRefusal,
@@ -280,6 +281,36 @@ def test_extract_all_and_text_content_for_strings_and_lists():
     assert [p["text"] for p in text_parts] == ["one", "two"]
 
 
+def test_extract_all_content_handles_input_audio():
+    """
+    A valid input_audio item should come back as one input_audio part with the same data/format.
+    """
+    audio: ResponseInputAudioParam = {
+        "type": "input_audio",
+        "input_audio": {"data": "AAA=", "format": "wav"},
+    }
+    parts = Converter.extract_all_content([audio])
+    assert isinstance(parts, list)  # list input is expected to yield a list of parts
+    assert parts == [
+        {
+            "type": "input_audio",
+            "input_audio": {"data": "AAA=", "format": "wav"},
+        }
+    ]
+
+
+def test_extract_all_content_rejects_invalid_input_audio():
+    """
+    An input_audio item whose payload has a format but no data should raise UserError.
+    """
+    audio_missing_data = cast(ResponseInputAudioParam, {
+        "type": "input_audio",
+        "input_audio": {"format": "wav"},  # "data" key deliberately omitted
+    })
+    with pytest.raises(UserError):
+        Converter.extract_all_content([audio_missing_data])
+
+
 def test_items_to_messages_handles_system_and_developer_roles():
     """
     Roles other than `user` (e.g. `system` and `developer`) need to be