From ccef9827b31206fc9ba40d2b6165eeefda7621f5 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Thu, 11 Sep 2025 18:04:26 +0000 Subject: [PATCH 1/3] chore(api): Minor docs and type updates for realtime --- .stats.yml | 4 +- .../input_audio_buffer_timeout_triggered.rb | 30 +- .../realtime/realtime_audio_config_input.rb | 25 +- .../realtime_audio_input_turn_detection.rb | 290 ++++++---- .../models/realtime/realtime_server_event.rb | 14 +- .../models/realtime/realtime_session.rb | 297 ++++++---- .../realtime_session_create_response.rb | 306 ++++++---- ...ltime_transcription_session_audio_input.rb | 27 +- ...tion_session_audio_input_turn_detection.rb | 292 ++++++---- lib/openai/models/responses/response.rb | 16 +- .../responses/response_create_params.rb | 16 +- .../input_audio_buffer_timeout_triggered.rbi | 29 +- .../realtime/realtime_audio_config_input.rbi | 72 ++- .../realtime_audio_input_turn_detection.rbi | 467 ++++++++------- .../models/realtime/realtime_session.rbi | 537 ++++++++++-------- .../realtime_session_create_response.rbi | 530 +++++++++-------- ...time_transcription_session_audio_input.rbi | 67 ++- ...ion_session_audio_input_turn_detection.rbi | 464 ++++++++------- rbi/openai/models/responses/response.rbi | 24 +- .../responses/response_create_params.rbi | 24 +- rbi/openai/resources/responses.rbi | 16 +- .../realtime/realtime_audio_config_input.rbs | 12 +- .../realtime_audio_input_turn_detection.rbs | 156 ++--- .../models/realtime/realtime_session.rbs | 164 +++--- .../realtime_session_create_response.rbs | 168 +++--- ...time_transcription_session_audio_input.rbs | 12 +- ...ion_session_audio_input_turn_detection.rbs | 156 ++--- 27 files changed, 2448 insertions(+), 1767 deletions(-) diff --git a/.stats.yml b/.stats.yml index 2aa16be8..5388f246 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 118 -openapi_spec_url: 
https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-16cb18bed32bae8c5840fb39a1bf664026cc40463ad0c487dcb0df1bd3d72db0.yml -openapi_spec_hash: 4cb51b22f98dee1a90bc7add82d1d132 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c829f9e7f51d4946dae7b02eb37eb857b538a464cf54c7ced5eff1b1c93e07db.yml +openapi_spec_hash: 1b2eaba46b264bcec8831bc496543649 config_hash: 930dac3aa861344867e4ac84f037b5df diff --git a/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb b/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb index 91227a91..2d9af6dd 100644 --- a/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb +++ b/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb @@ -5,13 +5,15 @@ module Models module Realtime class InputAudioBufferTimeoutTriggered < OpenAI::Internal::Type::BaseModel # @!attribute audio_end_ms - # Millisecond offset where speech ended within the buffered audio. + # Millisecond offset of audio written to the input audio buffer at the time the + # timeout was triggered. # # @return [Integer] required :audio_end_ms, Integer # @!attribute audio_start_ms - # Millisecond offset where speech started within the buffered audio. + # Millisecond offset of audio written to the input audio buffer that was after the + # playback time of the last model response. # # @return [Integer] required :audio_start_ms, Integer @@ -35,11 +37,29 @@ class InputAudioBufferTimeoutTriggered < OpenAI::Internal::Type::BaseModel required :type, const: :"input_audio_buffer.timeout_triggered" # @!method initialize(audio_end_ms:, audio_start_ms:, event_id:, item_id:, type: :"input_audio_buffer.timeout_triggered") - # Returned when the server VAD timeout is triggered for the input audio buffer. + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Realtime::InputAudioBufferTimeoutTriggered} for more details. 
# - # @param audio_end_ms [Integer] Millisecond offset where speech ended within the buffered audio. + # Returned when the Server VAD timeout is triggered for the input audio buffer. + # This is configured with `idle_timeout_ms` in the `turn_detection` settings of + # the session, and it indicates that there hasn't been any speech detected for the + # configured duration. # - # @param audio_start_ms [Integer] Millisecond offset where speech started within the buffered audio. + # The `audio_start_ms` and `audio_end_ms` fields indicate the segment of audio + # after the last model response up to the triggering time, as an offset from the + # beginning of audio written to the input audio buffer. This means it demarcates + # the segment of audio that was silent and the difference between the start and + # end values will roughly match the configured timeout. + # + # The empty audio will be committed to the conversation as an `input_audio` item + # (there will be a `input_audio_buffer.committed` event) and a model response will + # be generated. There may be speech that didn't trigger VAD but is still detected + # by the model, so the model may respond with something relevant to the + # conversation or a prompt to continue speaking. + # + # @param audio_end_ms [Integer] Millisecond offset of audio written to the input audio buffer at the time the ti + # + # @param audio_start_ms [Integer] Millisecond offset of audio written to the input audio buffer that was after the # # @param event_id [String] The unique ID of the server event. 
# diff --git a/lib/openai/models/realtime/realtime_audio_config_input.rb b/lib/openai/models/realtime/realtime_audio_config_input.rb index 89f70507..37ca5874 100644 --- a/lib/openai/models/realtime/realtime_audio_config_input.rb +++ b/lib/openai/models/realtime/realtime_audio_config_input.rb @@ -36,17 +36,20 @@ class RealtimeAudioConfigInput < OpenAI::Internal::Type::BaseModel # @!attribute turn_detection # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. + # response. # - # @return [OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection, nil] - optional :turn_detection, -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection } + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. 
+ # + # @return [OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad, nil] + optional :turn_detection, union: -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection }, nil?: true # @!method initialize(format_: nil, noise_reduction: nil, transcription: nil, turn_detection: nil) # Some parameter documentations has been truncated, see @@ -58,7 +61,7 @@ class RealtimeAudioConfigInput < OpenAI::Internal::Type::BaseModel # # @param transcription [OpenAI::Models::Realtime::AudioTranscription] Configuration for input audio transcription, defaults to off and can be set to ` # - # @param turn_detection [OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection] Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # @param turn_detection [OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad, nil] Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # @see OpenAI::Models::Realtime::RealtimeAudioConfigInput#noise_reduction class NoiseReduction < OpenAI::Internal::Type::BaseModel diff --git a/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb b/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb index c1695bed..376f499b 100644 --- a/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb +++ b/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb @@ -3,128 +3,184 @@ module OpenAI module Models module Realtime - class RealtimeAudioInputTurnDetection < OpenAI::Internal::Type::BaseModel - # @!attribute create_response - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - # - # @return [Boolean, nil] - optional :create_response, OpenAI::Internal::Type::Boolean - - # @!attribute eagerness - # Used only for `semantic_vad` mode. The eagerness of the model to respond. 
`low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. - # - # @return [Symbol, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::Eagerness, nil] - optional :eagerness, enum: -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness } - - # @!attribute idle_timeout_ms - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received and emits a `timeout_triggered` event. - # - # @return [Integer, nil] - optional :idle_timeout_ms, Integer, nil?: true - - # @!attribute interrupt_response - # Whether or not to automatically interrupt any ongoing response with output to - # the default conversation (i.e. `conversation` of `auto`) when a VAD start event - # occurs. - # - # @return [Boolean, nil] - optional :interrupt_response, OpenAI::Internal::Type::Boolean - - # @!attribute prefix_padding_ms - # Used only for `server_vad` mode. Amount of audio to include before the VAD - # detected speech (in milliseconds). Defaults to 300ms. - # - # @return [Integer, nil] - optional :prefix_padding_ms, Integer - - # @!attribute silence_duration_ms - # Used only for `server_vad` mode. Duration of silence to detect speech stop (in - # milliseconds). Defaults to 500ms. With shorter values the model will respond - # more quickly, but may jump in on short pauses from the user. - # - # @return [Integer, nil] - optional :silence_duration_ms, Integer - - # @!attribute threshold - # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # defaults to 0.5. A higher threshold will require louder audio to activate the - # model, and thus might perform better in noisy environments. - # - # @return [Float, nil] - optional :threshold, Float - - # @!attribute type - # Type of turn detection. 
- # - # @return [Symbol, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::Type, nil] - optional :type, enum: -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type } - - # @!method initialize(create_response: nil, eagerness: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: nil) - # Some parameter documentations has been truncated, see - # {OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection} for more details. - # - # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - # - # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs - # - # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # - # @param idle_timeout_ms [Integer, nil] Optional idle timeout after which turn detection will auto-timeout when - # - # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th - # - # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. 
Amount of audio to include before the VAD detec - # - # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m - # - # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # - # @param type [Symbol, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::Type] Type of turn detection. - - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. - # - # @see OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection#eagerness - module Eagerness - extend OpenAI::Internal::Type::Enum - - LOW = :low - MEDIUM = :medium - HIGH = :high - AUTO = :auto - - # @!method self.values - # @return [Array] + # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # set to `null` to turn off, in which case the client must manually trigger model + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. 
+ module RealtimeAudioInputTurnDetection + extend OpenAI::Internal::Type::Union + + discriminator :type + + # Server-side voice activity detection (VAD) which flips on when user speech is detected and off after a period of silence. + variant :server_vad, -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad } + + # Server-side semantic turn detection which uses a model to determine when the user has finished speaking. + variant :semantic_vad, -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad } + + class ServerVad < OpenAI::Internal::Type::BaseModel + # @!attribute type + # Type of turn detection, `server_vad` to turn on simple Server VAD. + # + # @return [Symbol, :server_vad] + required :type, const: :server_vad + + # @!attribute create_response + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + # + # @return [Boolean, nil] + optional :create_response, OpenAI::Internal::Type::Boolean + + # @!attribute idle_timeout_ms + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + # + # @return [Integer, nil] + optional :idle_timeout_ms, Integer, nil?: true + + # @!attribute interrupt_response + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. 
+ # + # @return [Boolean, nil] + optional :interrupt_response, OpenAI::Internal::Type::Boolean + + # @!attribute prefix_padding_ms + # Used only for `server_vad` mode. Amount of audio to include before the VAD + # detected speech (in milliseconds). Defaults to 300ms. + # + # @return [Integer, nil] + optional :prefix_padding_ms, Integer + + # @!attribute silence_duration_ms + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in + # milliseconds). Defaults to 500ms. With shorter values the model will respond + # more quickly, but may jump in on short pauses from the user. + # + # @return [Integer, nil] + optional :silence_duration_ms, Integer + + # @!attribute threshold + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # defaults to 0.5. A higher threshold will require louder audio to activate the + # model, and thus might perform better in noisy environments. + # + # @return [Float, nil] + optional :threshold, Float + + # @!method initialize(create_response: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: :server_vad) + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::ServerVad} for more + # details. + # + # Server-side voice activity detection (VAD) which flips on when user speech is + # detected and off after a period of silence. + # + # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs + # + # @param idle_timeout_ms [Integer, nil] Optional timeout after which a model response will be triggered automatically. T + # + # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th + # + # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. 
Amount of audio to include before the VAD detec + # + # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m + # + # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # + # @param type [Symbol, :server_vad] Type of turn detection, `server_vad` to turn on simple Server VAD. end - # Type of turn detection. - # - # @see OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection#type - module Type - extend OpenAI::Internal::Type::Enum + class SemanticVad < OpenAI::Internal::Type::BaseModel + # @!attribute type + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. + # + # @return [Symbol, :semantic_vad] + required :type, const: :semantic_vad + + # @!attribute create_response + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + # + # @return [Boolean, nil] + optional :create_response, OpenAI::Internal::Type::Boolean + + # @!attribute eagerness + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + # + # @return [Symbol, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness, nil] + optional :eagerness, + enum: -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness } - SERVER_VAD = :server_vad - SEMANTIC_VAD = :semantic_vad + # @!attribute interrupt_response + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. 
+ # + # @return [Boolean, nil] + optional :interrupt_response, OpenAI::Internal::Type::Boolean - # @!method self.values - # @return [Array] + # @!method initialize(create_response: nil, eagerness: nil, interrupt_response: nil, type: :semantic_vad) + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad} for + # more details. + # + # Server-side semantic turn detection which uses a model to determine when the + # user has finished speaking. + # + # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs + # + # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # + # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th + # + # @param type [Symbol, :semantic_vad] Type of turn detection, `semantic_vad` to turn on Semantic VAD. + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. 
+ # + # @see OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad#eagerness + module Eagerness + extend OpenAI::Internal::Type::Enum + + LOW = :low + MEDIUM = :medium + HIGH = :high + AUTO = :auto + + # @!method self.values + # @return [Array] + end end + + # @!method self.variants + # @return [Array(OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad)] end end end diff --git a/lib/openai/models/realtime/realtime_server_event.rb b/lib/openai/models/realtime/realtime_server_event.rb index af9f4692..9ea21835 100644 --- a/lib/openai/models/realtime/realtime_server_event.rb +++ b/lib/openai/models/realtime/realtime_server_event.rb @@ -208,7 +208,19 @@ module RealtimeServerEvent # The event will include the full content of the Item except for audio data, which can be retrieved separately with a `conversation.item.retrieve` event if needed. variant :"conversation.item.done", -> { OpenAI::Realtime::ConversationItemDone } - # Returned when the server VAD timeout is triggered for the input audio buffer. + # Returned when the Server VAD timeout is triggered for the input audio buffer. This is configured + # with `idle_timeout_ms` in the `turn_detection` settings of the session, and it indicates that + # there hasn't been any speech detected for the configured duration. + # + # The `audio_start_ms` and `audio_end_ms` fields indicate the segment of audio after the last + # model response up to the triggering time, as an offset from the beginning of audio written + # to the input audio buffer. This means it demarcates the segment of audio that was silent and + # the difference between the start and end values will roughly match the configured timeout. + # + # The empty audio will be committed to the conversation as an `input_audio` item (there will be a + # `input_audio_buffer.committed` event) and a model response will be generated. 
There may be speech + # that didn't trigger VAD but is still detected by the model, so the model may respond with + # something relevant to the conversation or a prompt to continue speaking. variant :"input_audio_buffer.timeout_triggered", -> { OpenAI::Realtime::InputAudioBufferTimeoutTriggered } # Returned when an input audio transcription segment is identified for an item. diff --git a/lib/openai/models/realtime/realtime_session.rb b/lib/openai/models/realtime/realtime_session.rb index 035c49f7..74db4075 100644 --- a/lib/openai/models/realtime/realtime_session.rb +++ b/lib/openai/models/realtime/realtime_session.rb @@ -158,17 +158,20 @@ class RealtimeSession < OpenAI::Internal::Type::BaseModel # @!attribute turn_detection # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - # - # @return [OpenAI::Models::Realtime::RealtimeSession::TurnDetection, nil] - optional :turn_detection, -> { OpenAI::Realtime::RealtimeSession::TurnDetection }, nil?: true + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. 
+ # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. + # + # @return [OpenAI::Models::Realtime::RealtimeSession::TurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad, nil] + optional :turn_detection, union: -> { OpenAI::Realtime::RealtimeSession::TurnDetection }, nil?: true # @!attribute voice # The voice the model uses to respond. Voice cannot be changed during the session @@ -182,7 +185,7 @@ class RealtimeSession < OpenAI::Internal::Type::BaseModel # Some parameter documentations has been truncated, see # {OpenAI::Models::Realtime::RealtimeSession} for more details. # - # Realtime session object. + # Realtime session object for the beta interface. # # @param id [String] Unique identifier for the session that looks like `sess_1234567890abcdef`. # @@ -220,7 +223,7 @@ class RealtimeSession < OpenAI::Internal::Type::BaseModel # # @param tracing [Symbol, :auto, OpenAI::Models::Realtime::RealtimeSession::Tracing::TracingConfiguration, nil] Configuration options for tracing. Set to null to disable tracing. Once # - # @param turn_detection [OpenAI::Models::Realtime::RealtimeSession::TurnDetection, nil] Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # @param turn_detection [OpenAI::Models::Realtime::RealtimeSession::TurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad, nil] Configuration for turn detection, ether Server VAD or Semantic VAD. 
This can be # # @param voice [String, Symbol, OpenAI::Models::Realtime::RealtimeSession::Voice] The voice the model uses to respond. Voice cannot be changed during the @@ -401,127 +404,185 @@ class TracingConfiguration < OpenAI::Internal::Type::BaseModel # @return [Array(Symbol, :auto, OpenAI::Models::Realtime::RealtimeSession::Tracing::TracingConfiguration)] end + # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # set to `null` to turn off, in which case the client must manually trigger model + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. + # # @see OpenAI::Models::Realtime::RealtimeSession#turn_detection - class TurnDetection < OpenAI::Internal::Type::BaseModel - # @!attribute create_response - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - # - # @return [Boolean, nil] - optional :create_response, OpenAI::Internal::Type::Boolean - - # @!attribute eagerness - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. 
- # - # @return [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::Eagerness, nil] - optional :eagerness, enum: -> { OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness } + module TurnDetection + extend OpenAI::Internal::Type::Union - # @!attribute idle_timeout_ms - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received. - # - # @return [Integer, nil] - optional :idle_timeout_ms, Integer, nil?: true + discriminator :type - # @!attribute interrupt_response - # Whether or not to automatically interrupt any ongoing response with output to - # the default conversation (i.e. `conversation` of `auto`) when a VAD start event - # occurs. - # - # @return [Boolean, nil] - optional :interrupt_response, OpenAI::Internal::Type::Boolean + # Server-side voice activity detection (VAD) which flips on when user speech is detected and off after a period of silence. + variant :server_vad, -> { OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad } - # @!attribute prefix_padding_ms - # Used only for `server_vad` mode. Amount of audio to include before the VAD - # detected speech (in milliseconds). Defaults to 300ms. - # - # @return [Integer, nil] - optional :prefix_padding_ms, Integer + # Server-side semantic turn detection which uses a model to determine when the user has finished speaking. + variant :semantic_vad, -> { OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad } - # @!attribute silence_duration_ms - # Used only for `server_vad` mode. Duration of silence to detect speech stop (in - # milliseconds). Defaults to 500ms. With shorter values the model will respond - # more quickly, but may jump in on short pauses from the user. - # - # @return [Integer, nil] - optional :silence_duration_ms, Integer + class ServerVad < OpenAI::Internal::Type::BaseModel + # @!attribute type + # Type of turn detection, `server_vad` to turn on simple Server VAD. 
+ # + # @return [Symbol, :server_vad] + required :type, const: :server_vad - # @!attribute threshold - # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # defaults to 0.5. A higher threshold will require louder audio to activate the - # model, and thus might perform better in noisy environments. - # - # @return [Float, nil] - optional :threshold, Float + # @!attribute create_response + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + # + # @return [Boolean, nil] + optional :create_response, OpenAI::Internal::Type::Boolean + + # @!attribute idle_timeout_ms + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + # + # @return [Integer, nil] + optional :idle_timeout_ms, Integer, nil?: true - # @!attribute type - # Type of turn detection. - # - # @return [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::Type, nil] - optional :type, enum: -> { OpenAI::Realtime::RealtimeSession::TurnDetection::Type } + # @!attribute interrupt_response + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. 
+ # + # @return [Boolean, nil] + optional :interrupt_response, OpenAI::Internal::Type::Boolean - # @!method initialize(create_response: nil, eagerness: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: nil) - # Some parameter documentations has been truncated, see - # {OpenAI::Models::Realtime::RealtimeSession::TurnDetection} for more details. - # - # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - # - # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs - # - # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # - # @param idle_timeout_ms [Integer, nil] Optional idle timeout after which turn detection will auto-timeout when - # - # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th - # - # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec - # - # @param silence_duration_ms [Integer] Used only for `server_vad` mode. 
Duration of silence to detect speech stop (in m - # - # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # - # @param type [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::Type] Type of turn detection. + # @!attribute prefix_padding_ms + # Used only for `server_vad` mode. Amount of audio to include before the VAD + # detected speech (in milliseconds). Defaults to 300ms. + # + # @return [Integer, nil] + optional :prefix_padding_ms, Integer - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. - # - # @see OpenAI::Models::Realtime::RealtimeSession::TurnDetection#eagerness - module Eagerness - extend OpenAI::Internal::Type::Enum + # @!attribute silence_duration_ms + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in + # milliseconds). Defaults to 500ms. With shorter values the model will respond + # more quickly, but may jump in on short pauses from the user. + # + # @return [Integer, nil] + optional :silence_duration_ms, Integer - LOW = :low - MEDIUM = :medium - HIGH = :high - AUTO = :auto + # @!attribute threshold + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # defaults to 0.5. A higher threshold will require louder audio to activate the + # model, and thus might perform better in noisy environments. + # + # @return [Float, nil] + optional :threshold, Float - # @!method self.values - # @return [Array] + # @!method initialize(create_response: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: :server_vad) + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Realtime::RealtimeSession::TurnDetection::ServerVad} for more + # details. 
+ # + # Server-side voice activity detection (VAD) which flips on when user speech is + # detected and off after a period of silence. + # + # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs + # + # @param idle_timeout_ms [Integer, nil] Optional timeout after which a model response will be triggered automatically. T + # + # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th + # + # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec + # + # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m + # + # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # + # @param type [Symbol, :server_vad] Type of turn detection, `server_vad` to turn on simple Server VAD. end - # Type of turn detection. - # - # @see OpenAI::Models::Realtime::RealtimeSession::TurnDetection#type - module Type - extend OpenAI::Internal::Type::Enum + class SemanticVad < OpenAI::Internal::Type::BaseModel + # @!attribute type + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. + # + # @return [Symbol, :semantic_vad] + required :type, const: :semantic_vad + + # @!attribute create_response + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + # + # @return [Boolean, nil] + optional :create_response, OpenAI::Internal::Type::Boolean + + # @!attribute eagerness + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. 
+ # + # @return [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness, nil] + optional :eagerness, enum: -> { OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness } - SERVER_VAD = :server_vad - SEMANTIC_VAD = :semantic_vad + # @!attribute interrupt_response + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + # + # @return [Boolean, nil] + optional :interrupt_response, OpenAI::Internal::Type::Boolean - # @!method self.values - # @return [Array] + # @!method initialize(create_response: nil, eagerness: nil, interrupt_response: nil, type: :semantic_vad) + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad} for more + # details. + # + # Server-side semantic turn detection which uses a model to determine when the + # user has finished speaking. + # + # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs + # + # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # + # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th + # + # @param type [Symbol, :semantic_vad] Type of turn detection, `semantic_vad` to turn on Semantic VAD. + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. 
+ # + # @see OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad#eagerness + module Eagerness + extend OpenAI::Internal::Type::Enum + + LOW = :low + MEDIUM = :medium + HIGH = :high + AUTO = :auto + + # @!method self.values + # @return [Array] + end end + + # @!method self.variants + # @return [Array(OpenAI::Models::Realtime::RealtimeSession::TurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad)] end # The voice the model uses to respond. Voice cannot be changed during the session diff --git a/lib/openai/models/realtime/realtime_session_create_response.rb b/lib/openai/models/realtime/realtime_session_create_response.rb index 3adf8f09..79dcc176 100644 --- a/lib/openai/models/realtime/realtime_session_create_response.rb +++ b/lib/openai/models/realtime/realtime_session_create_response.rb @@ -198,18 +198,24 @@ class Input < OpenAI::Internal::Type::BaseModel # @!attribute turn_detection # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - # - # @return [OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection, nil] + # response. 
+ # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. + # + # @return [OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad, nil] optional :turn_detection, - -> { OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection } + union: -> { + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection + }, + nil?: true # @!method initialize(format_: nil, noise_reduction: nil, transcription: nil, turn_detection: nil) # Some parameter documentations has been truncated, see @@ -222,7 +228,7 @@ class Input < OpenAI::Internal::Type::BaseModel # # @param transcription [OpenAI::Models::Realtime::AudioTranscription] Configuration for input audio transcription, defaults to off and can be set to ` # - # @param turn_detection [OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection] Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # @param turn_detection [OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad, nil] Configuration for turn detection, ether Server VAD or Semantic VAD. 
This can be # @see OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input#noise_reduction class NoiseReduction < OpenAI::Internal::Type::BaseModel @@ -248,132 +254,188 @@ class NoiseReduction < OpenAI::Internal::Type::BaseModel # @param type [Symbol, OpenAI::Models::Realtime::NoiseReductionType] Type of noise reduction. `near_field` is for close-talking microphones such as h end + # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # set to `null` to turn off, in which case the client must manually trigger model + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. + # # @see OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input#turn_detection - class TurnDetection < OpenAI::Internal::Type::BaseModel - # @!attribute create_response - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - # - # @return [Boolean, nil] - optional :create_response, OpenAI::Internal::Type::Boolean - - # @!attribute eagerness - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. 
- # - # @return [Symbol, OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness, nil] - optional :eagerness, - enum: -> { OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness } + module TurnDetection + extend OpenAI::Internal::Type::Union - # @!attribute idle_timeout_ms - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received and emits a `timeout_triggered` event. - # - # @return [Integer, nil] - optional :idle_timeout_ms, Integer, nil?: true + discriminator :type - # @!attribute interrupt_response - # Whether or not to automatically interrupt any ongoing response with output to - # the default conversation (i.e. `conversation` of `auto`) when a VAD start event - # occurs. - # - # @return [Boolean, nil] - optional :interrupt_response, OpenAI::Internal::Type::Boolean + # Server-side voice activity detection (VAD) which flips on when user speech is detected and off after a period of silence. + variant :server_vad, + -> { OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad } - # @!attribute prefix_padding_ms - # Used only for `server_vad` mode. Amount of audio to include before the VAD - # detected speech (in milliseconds). Defaults to 300ms. - # - # @return [Integer, nil] - optional :prefix_padding_ms, Integer + # Server-side semantic turn detection which uses a model to determine when the user has finished speaking. + variant :semantic_vad, + -> { OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad } - # @!attribute silence_duration_ms - # Used only for `server_vad` mode. Duration of silence to detect speech stop (in - # milliseconds). Defaults to 500ms. With shorter values the model will respond - # more quickly, but may jump in on short pauses from the user. 
- # - # @return [Integer, nil] - optional :silence_duration_ms, Integer + class ServerVad < OpenAI::Internal::Type::BaseModel + # @!attribute type + # Type of turn detection, `server_vad` to turn on simple Server VAD. + # + # @return [Symbol, :server_vad] + required :type, const: :server_vad - # @!attribute threshold - # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # defaults to 0.5. A higher threshold will require louder audio to activate the - # model, and thus might perform better in noisy environments. - # - # @return [Float, nil] - optional :threshold, Float + # @!attribute create_response + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + # + # @return [Boolean, nil] + optional :create_response, OpenAI::Internal::Type::Boolean - # @!attribute type - # Type of turn detection. - # - # @return [Symbol, OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type, nil] - optional :type, - enum: -> { OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type } + # @!attribute idle_timeout_ms + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. 
+ # + # @return [Integer, nil] + optional :idle_timeout_ms, Integer, nil?: true - # @!method initialize(create_response: nil, eagerness: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: nil) - # Some parameter documentations has been truncated, see - # {OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection} - # for more details. - # - # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - # - # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs - # - # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # - # @param idle_timeout_ms [Integer, nil] Optional idle timeout after which turn detection will auto-timeout when - # - # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th - # - # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. 
Amount of audio to include before the VAD detec - # - # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m - # - # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # - # @param type [Symbol, OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type] Type of turn detection. + # @!attribute interrupt_response + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + # + # @return [Boolean, nil] + optional :interrupt_response, OpenAI::Internal::Type::Boolean - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. - # - # @see OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection#eagerness - module Eagerness - extend OpenAI::Internal::Type::Enum + # @!attribute prefix_padding_ms + # Used only for `server_vad` mode. Amount of audio to include before the VAD + # detected speech (in milliseconds). Defaults to 300ms. + # + # @return [Integer, nil] + optional :prefix_padding_ms, Integer - LOW = :low - MEDIUM = :medium - HIGH = :high - AUTO = :auto + # @!attribute silence_duration_ms + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in + # milliseconds). Defaults to 500ms. With shorter values the model will respond + # more quickly, but may jump in on short pauses from the user. + # + # @return [Integer, nil] + optional :silence_duration_ms, Integer - # @!method self.values - # @return [Array] + # @!attribute threshold + # Used only for `server_vad` mode. 
Activation threshold for VAD (0.0 to 1.0), this + # defaults to 0.5. A higher threshold will require louder audio to activate the + # model, and thus might perform better in noisy environments. + # + # @return [Float, nil] + optional :threshold, Float + + # @!method initialize(create_response: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: :server_vad) + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad} + # for more details. + # + # Server-side voice activity detection (VAD) which flips on when user speech is + # detected and off after a period of silence. + # + # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs + # + # @param idle_timeout_ms [Integer, nil] Optional timeout after which a model response will be triggered automatically. T + # + # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th + # + # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec + # + # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m + # + # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # + # @param type [Symbol, :server_vad] Type of turn detection, `server_vad` to turn on simple Server VAD. end - # Type of turn detection. - # - # @see OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection#type - module Type - extend OpenAI::Internal::Type::Enum + class SemanticVad < OpenAI::Internal::Type::BaseModel + # @!attribute type + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. 
+ # + # @return [Symbol, :semantic_vad] + required :type, const: :semantic_vad + + # @!attribute create_response + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + # + # @return [Boolean, nil] + optional :create_response, OpenAI::Internal::Type::Boolean - SERVER_VAD = :server_vad - SEMANTIC_VAD = :semantic_vad + # @!attribute eagerness + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + # + # @return [Symbol, OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness, nil] + optional :eagerness, + enum: -> { OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness } + + # @!attribute interrupt_response + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + # + # @return [Boolean, nil] + optional :interrupt_response, OpenAI::Internal::Type::Boolean - # @!method self.values - # @return [Array] + # @!method initialize(create_response: nil, eagerness: nil, interrupt_response: nil, type: :semantic_vad) + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad} + # for more details. + # + # Server-side semantic turn detection which uses a model to determine when the + # user has finished speaking. 
+ # + # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs + # + # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # + # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th + # + # @param type [Symbol, :semantic_vad] Type of turn detection, `semantic_vad` to turn on Semantic VAD. + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + # + # @see OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad#eagerness + module Eagerness + extend OpenAI::Internal::Type::Enum + + LOW = :low + MEDIUM = :medium + HIGH = :high + AUTO = :auto + + # @!method self.values + # @return [Array] + end end + + # @!method self.variants + # @return [Array(OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad)] end end diff --git a/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb b/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb index a4019b2c..fc5fb231 100644 --- a/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb +++ b/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb @@ -36,17 +36,22 @@ class RealtimeTranscriptionSessionAudioInput < OpenAI::Internal::Type::BaseModel # @!attribute turn_detection # Configuration for turn detection, ether Server VAD or Semantic VAD. 
This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. + # response. # - # @return [OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection, nil] - optional :turn_detection, -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection } + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. 
+ # + # @return [OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad, nil] + optional :turn_detection, + union: -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection }, + nil?: true # @!method initialize(format_: nil, noise_reduction: nil, transcription: nil, turn_detection: nil) # Some parameter documentations has been truncated, see @@ -59,7 +64,7 @@ class RealtimeTranscriptionSessionAudioInput < OpenAI::Internal::Type::BaseModel # # @param transcription [OpenAI::Models::Realtime::AudioTranscription] Configuration for input audio transcription, defaults to off and can be set to ` # - # @param turn_detection [OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection] Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # @param turn_detection [OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad, nil] Configuration for turn detection, ether Server VAD or Semantic VAD. 
This can be # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInput#noise_reduction class NoiseReduction < OpenAI::Internal::Type::BaseModel diff --git a/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb b/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb index 984b2774..814e0b8b 100644 --- a/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb +++ b/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb @@ -3,128 +3,186 @@ module OpenAI module Models module Realtime - class RealtimeTranscriptionSessionAudioInputTurnDetection < OpenAI::Internal::Type::BaseModel - # @!attribute create_response - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - # - # @return [Boolean, nil] - optional :create_response, OpenAI::Internal::Type::Boolean - - # @!attribute eagerness - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. - # - # @return [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness, nil] - optional :eagerness, - enum: -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness } - - # @!attribute idle_timeout_ms - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received. - # - # @return [Integer, nil] - optional :idle_timeout_ms, Integer, nil?: true - - # @!attribute interrupt_response - # Whether or not to automatically interrupt any ongoing response with output to - # the default conversation (i.e. `conversation` of `auto`) when a VAD start event - # occurs. 
- # - # @return [Boolean, nil] - optional :interrupt_response, OpenAI::Internal::Type::Boolean - - # @!attribute prefix_padding_ms - # Used only for `server_vad` mode. Amount of audio to include before the VAD - # detected speech (in milliseconds). Defaults to 300ms. - # - # @return [Integer, nil] - optional :prefix_padding_ms, Integer - - # @!attribute silence_duration_ms - # Used only for `server_vad` mode. Duration of silence to detect speech stop (in - # milliseconds). Defaults to 500ms. With shorter values the model will respond - # more quickly, but may jump in on short pauses from the user. - # - # @return [Integer, nil] - optional :silence_duration_ms, Integer - - # @!attribute threshold - # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # defaults to 0.5. A higher threshold will require louder audio to activate the - # model, and thus might perform better in noisy environments. - # - # @return [Float, nil] - optional :threshold, Float - - # @!attribute type - # Type of turn detection. - # - # @return [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type, nil] - optional :type, enum: -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type } - - # @!method initialize(create_response: nil, eagerness: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: nil) - # Some parameter documentations has been truncated, see - # {OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection} - # for more details. - # - # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. 
Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - # - # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs - # - # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # - # @param idle_timeout_ms [Integer, nil] Optional idle timeout after which turn detection will auto-timeout when - # - # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th - # - # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec - # - # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m - # - # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # - # @param type [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type] Type of turn detection. - - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. 
- #
- # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection#eagerness
- module Eagerness
- extend OpenAI::Internal::Type::Enum
-
- LOW = :low
- MEDIUM = :medium
- HIGH = :high
- AUTO = :auto
-
- # @!method self.values
- # @return [Array]
+ # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ # set to `null` to turn off, in which case the client must manually trigger model
+ # response.
+ #
+ # Server VAD means that the model will detect the start and end of speech based on
+ # audio volume and respond at the end of user speech.
+ #
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ # with VAD) to semantically estimate whether the user has finished speaking, then
+ # dynamically sets a timeout based on this probability. For example, if user audio
+ # trails off with "uhhm", the model will score a low probability of turn end and
+ # wait longer for the user to continue speaking. This can be useful for more
+ # natural conversations, but may have a higher latency.
+ module RealtimeTranscriptionSessionAudioInputTurnDetection
+ extend OpenAI::Internal::Type::Union
+
+ discriminator :type
+
+ # Server-side voice activity detection (VAD) which flips on when user speech is detected and off after a period of silence.
+ variant :server_vad,
+ -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad }
+
+ # Server-side semantic turn detection which uses a model to determine when the user has finished speaking.
+ variant :semantic_vad,
+ -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad }
+
+ class ServerVad < OpenAI::Internal::Type::BaseModel
+ # @!attribute type
+ # Type of turn detection, `server_vad` to turn on simple Server VAD.
+ # + # @return [Symbol, :server_vad] + required :type, const: :server_vad + + # @!attribute create_response + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + # + # @return [Boolean, nil] + optional :create_response, OpenAI::Internal::Type::Boolean + + # @!attribute idle_timeout_ms + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + # + # @return [Integer, nil] + optional :idle_timeout_ms, Integer, nil?: true + + # @!attribute interrupt_response + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + # + # @return [Boolean, nil] + optional :interrupt_response, OpenAI::Internal::Type::Boolean + + # @!attribute prefix_padding_ms + # Used only for `server_vad` mode. Amount of audio to include before the VAD + # detected speech (in milliseconds). Defaults to 300ms. + # + # @return [Integer, nil] + optional :prefix_padding_ms, Integer + + # @!attribute silence_duration_ms + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in + # milliseconds). Defaults to 500ms. With shorter values the model will respond + # more quickly, but may jump in on short pauses from the user. 
+ # + # @return [Integer, nil] + optional :silence_duration_ms, Integer + + # @!attribute threshold + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # defaults to 0.5. A higher threshold will require louder audio to activate the + # model, and thus might perform better in noisy environments. + # + # @return [Float, nil] + optional :threshold, Float + + # @!method initialize(create_response: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: :server_vad) + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad} + # for more details. + # + # Server-side voice activity detection (VAD) which flips on when user speech is + # detected and off after a period of silence. + # + # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs + # + # @param idle_timeout_ms [Integer, nil] Optional timeout after which a model response will be triggered automatically. T + # + # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th + # + # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec + # + # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m + # + # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # + # @param type [Symbol, :server_vad] Type of turn detection, `server_vad` to turn on simple Server VAD. end - # Type of turn detection. 
- # - # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection#type - module Type - extend OpenAI::Internal::Type::Enum + class SemanticVad < OpenAI::Internal::Type::BaseModel + # @!attribute type + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. + # + # @return [Symbol, :semantic_vad] + required :type, const: :semantic_vad + + # @!attribute create_response + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + # + # @return [Boolean, nil] + optional :create_response, OpenAI::Internal::Type::Boolean + + # @!attribute eagerness + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + # + # @return [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness, nil] + optional :eagerness, + enum: -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness } - SERVER_VAD = :server_vad - SEMANTIC_VAD = :semantic_vad + # @!attribute interrupt_response + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + # + # @return [Boolean, nil] + optional :interrupt_response, OpenAI::Internal::Type::Boolean - # @!method self.values - # @return [Array] + # @!method initialize(create_response: nil, eagerness: nil, interrupt_response: nil, type: :semantic_vad) + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad} + # for more details. 
+ # + # Server-side semantic turn detection which uses a model to determine when the + # user has finished speaking. + # + # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs + # + # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # + # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th + # + # @param type [Symbol, :semantic_vad] Type of turn detection, `semantic_vad` to turn on Semantic VAD. + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + # + # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad#eagerness + module Eagerness + extend OpenAI::Internal::Type::Enum + + LOW = :low + MEDIUM = :medium + HIGH = :high + AUTO = :auto + + # @!method self.values + # @return [Array] + end end + + # @!method self.variants + # @return [Array(OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad)] end end end diff --git a/lib/openai/models/responses/response.rb b/lib/openai/models/responses/response.rb index 781529d3..b327e434 100644 --- a/lib/openai/models/responses/response.rb +++ b/lib/openai/models/responses/response.rb @@ -259,10 +259,10 @@ class Response < OpenAI::Internal::Type::BaseModel # @!attribute truncation # The truncation strategy to use for the model response. 
# - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. + # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. # # @return [Symbol, OpenAI::Models::Responses::Response::Truncation, nil] @@ -510,10 +510,10 @@ module ServiceTier # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. + # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. 
# # @see OpenAI::Models::Responses::Response#truncation diff --git a/lib/openai/models/responses/response_create_params.rb b/lib/openai/models/responses/response_create_params.rb index 53c8b5c7..1d5036d8 100644 --- a/lib/openai/models/responses/response_create_params.rb +++ b/lib/openai/models/responses/response_create_params.rb @@ -276,10 +276,10 @@ class ResponseCreateParams < OpenAI::Internal::Type::BaseModel # @!attribute truncation # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. + # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. # # @return [Symbol, OpenAI::Models::Responses::ResponseCreateParams::Truncation, nil] @@ -485,10 +485,10 @@ module ToolChoice # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. 
+ # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. module Truncation extend OpenAI::Internal::Type::Enum diff --git a/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi b/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi index 93ec928f..e117efa9 100644 --- a/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi +++ b/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi @@ -12,11 +12,13 @@ module OpenAI ) end - # Millisecond offset where speech ended within the buffered audio. + # Millisecond offset of audio written to the input audio buffer at the time the + # timeout was triggered. sig { returns(Integer) } attr_accessor :audio_end_ms - # Millisecond offset where speech started within the buffered audio. + # Millisecond offset of audio written to the input audio buffer that was after the + # playback time of the last model response. sig { returns(Integer) } attr_accessor :audio_start_ms @@ -32,7 +34,22 @@ module OpenAI sig { returns(Symbol) } attr_accessor :type - # Returned when the server VAD timeout is triggered for the input audio buffer. + # Returned when the Server VAD timeout is triggered for the input audio buffer. + # This is configured with `idle_timeout_ms` in the `turn_detection` settings of + # the session, and it indicates that there hasn't been any speech detected for the + # configured duration. + # + # The `audio_start_ms` and `audio_end_ms` fields indicate the segment of audio + # after the last model response up to the triggering time, as an offset from the + # beginning of audio written to the input audio buffer. This means it demarcates + # the segment of audio that was silent and the difference between the start and + # end values will roughly match the configured timeout. 
+ # + # The empty audio will be committed to the conversation as an `input_audio` item + # (there will be a `input_audio_buffer.committed` event) and a model response will + # be generated. There may be speech that didn't trigger VAD but is still detected + # by the model, so the model may respond with something relevant to the + # conversation or a prompt to continue speaking. sig do params( audio_end_ms: Integer, @@ -43,9 +60,11 @@ module OpenAI ).returns(T.attached_class) end def self.new( - # Millisecond offset where speech ended within the buffered audio. + # Millisecond offset of audio written to the input audio buffer at the time the + # timeout was triggered. audio_end_ms:, - # Millisecond offset where speech started within the buffered audio. + # Millisecond offset of audio written to the input audio buffer that was after the + # playback time of the last model response. audio_start_ms:, # The unique ID of the server event. event_id:, diff --git a/rbi/openai/models/realtime/realtime_audio_config_input.rbi b/rbi/openai/models/realtime/realtime_audio_config_input.rbi index e33b3fce..fb2e2697 100644 --- a/rbi/openai/models/realtime/realtime_audio_config_input.rbi +++ b/rbi/openai/models/realtime/realtime_audio_config_input.rbi @@ -80,26 +80,28 @@ module OpenAI # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. 
This can be useful for more natural - # conversations, but may have a higher latency. + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. sig do - returns(T.nilable(OpenAI::Realtime::RealtimeAudioInputTurnDetection)) - end - attr_reader :turn_detection - - sig do - params( - turn_detection: - OpenAI::Realtime::RealtimeAudioInputTurnDetection::OrHash - ).void + returns( + T.nilable( + T.any( + OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad, + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad + ) + ) + ) end - attr_writer :turn_detection + attr_accessor :turn_detection sig do params( @@ -113,7 +115,12 @@ module OpenAI OpenAI::Realtime::RealtimeAudioConfigInput::NoiseReduction::OrHash, transcription: OpenAI::Realtime::AudioTranscription::OrHash, turn_detection: - OpenAI::Realtime::RealtimeAudioInputTurnDetection::OrHash + T.nilable( + T.any( + OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad::OrHash, + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::OrHash + ) + ) ).returns(T.attached_class) end def self.new( @@ -136,14 +143,17 @@ module OpenAI transcription: nil, # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. 
Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. 
turn_detection: nil
)
end

@@ -160,7 +170,13 @@
noise_reduction:
OpenAI::Realtime::RealtimeAudioConfigInput::NoiseReduction,
transcription: OpenAI::Realtime::AudioTranscription,
- turn_detection: OpenAI::Realtime::RealtimeAudioInputTurnDetection
+ turn_detection:
+ T.nilable(
+ T.any(
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad,
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad
+ )
+ )
}
)
end
diff --git a/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi b/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi
index 08a137c8..062c31fe 100644
--- a/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi
+++ b/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi
@@ -3,259 +3,320 @@
module OpenAI
module Models
module Realtime
- class RealtimeAudioInputTurnDetection < OpenAI::Internal::Type::BaseModel
- OrHash =
+ # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ # set to `null` to turn off, in which case the client must manually trigger model
+ # response.
+ #
+ # Server VAD means that the model will detect the start and end of speech based on
+ # audio volume and respond at the end of user speech.
+ #
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ # with VAD) to semantically estimate whether the user has finished speaking, then
+ # dynamically sets a timeout based on this probability. For example, if user audio
+ # trails off with "uhhm", the model will score a low probability of turn end and
+ # wait longer for the user to continue speaking. This can be useful for more
+ # natural conversations, but may have a higher latency.
+ module RealtimeAudioInputTurnDetection + extend OpenAI::Internal::Type::Union + + Variants = T.type_alias do T.any( - OpenAI::Realtime::RealtimeAudioInputTurnDetection, - OpenAI::Internal::AnyHash + OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad, + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad ) end - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - sig { returns(T.nilable(T::Boolean)) } - attr_reader :create_response - - sig { params(create_response: T::Boolean).void } - attr_writer :create_response - - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. - sig do - returns( - T.nilable( - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::OrSymbol - ) - ) - end - attr_reader :eagerness - - sig do - params( - eagerness: - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::OrSymbol - ).void - end - attr_writer :eagerness - - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received and emits a `timeout_triggered` event. - sig { returns(T.nilable(Integer)) } - attr_accessor :idle_timeout_ms - - # Whether or not to automatically interrupt any ongoing response with output to - # the default conversation (i.e. `conversation` of `auto`) when a VAD start event - # occurs. - sig { returns(T.nilable(T::Boolean)) } - attr_reader :interrupt_response - - sig { params(interrupt_response: T::Boolean).void } - attr_writer :interrupt_response - - # Used only for `server_vad` mode. Amount of audio to include before the VAD - # detected speech (in milliseconds). Defaults to 300ms. 
- sig { returns(T.nilable(Integer)) } - attr_reader :prefix_padding_ms - - sig { params(prefix_padding_ms: Integer).void } - attr_writer :prefix_padding_ms - - # Used only for `server_vad` mode. Duration of silence to detect speech stop (in - # milliseconds). Defaults to 500ms. With shorter values the model will respond - # more quickly, but may jump in on short pauses from the user. - sig { returns(T.nilable(Integer)) } - attr_reader :silence_duration_ms + class ServerVad < OpenAI::Internal::Type::BaseModel + OrHash = + T.type_alias do + T.any( + OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad, + OpenAI::Internal::AnyHash + ) + end - sig { params(silence_duration_ms: Integer).void } - attr_writer :silence_duration_ms + # Type of turn detection, `server_vad` to turn on simple Server VAD. + sig { returns(Symbol) } + attr_accessor :type - # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # defaults to 0.5. A higher threshold will require louder audio to activate the - # model, and thus might perform better in noisy environments. - sig { returns(T.nilable(Float)) } - attr_reader :threshold + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + sig { returns(T.nilable(T::Boolean)) } + attr_reader :create_response - sig { params(threshold: Float).void } - attr_writer :threshold + sig { params(create_response: T::Boolean).void } + attr_writer :create_response - # Type of turn detection. - sig do - returns( - T.nilable( - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::OrSymbol - ) - ) - end - attr_reader :type + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. 
+ # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + sig { returns(T.nilable(Integer)) } + attr_accessor :idle_timeout_ms - sig do - params( - type: - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::OrSymbol - ).void - end - attr_writer :type - - # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - sig do - params( - create_response: T::Boolean, - eagerness: - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::OrSymbol, - idle_timeout_ms: T.nilable(Integer), - interrupt_response: T::Boolean, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::OrSymbol - ).returns(T.attached_class) - end - def self.new( - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - create_response: nil, - # Used only for `semantic_vad` mode. 
The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. - eagerness: nil, - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received and emits a `timeout_triggered` event. - idle_timeout_ms: nil, # Whether or not to automatically interrupt any ongoing response with output to # the default conversation (i.e. `conversation` of `auto`) when a VAD start event # occurs. - interrupt_response: nil, + sig { returns(T.nilable(T::Boolean)) } + attr_reader :interrupt_response + + sig { params(interrupt_response: T::Boolean).void } + attr_writer :interrupt_response + # Used only for `server_vad` mode. Amount of audio to include before the VAD # detected speech (in milliseconds). Defaults to 300ms. - prefix_padding_ms: nil, + sig { returns(T.nilable(Integer)) } + attr_reader :prefix_padding_ms + + sig { params(prefix_padding_ms: Integer).void } + attr_writer :prefix_padding_ms + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in # milliseconds). Defaults to 500ms. With shorter values the model will respond # more quickly, but may jump in on short pauses from the user. - silence_duration_ms: nil, + sig { returns(T.nilable(Integer)) } + attr_reader :silence_duration_ms + + sig { params(silence_duration_ms: Integer).void } + attr_writer :silence_duration_ms + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this # defaults to 0.5. A higher threshold will require louder audio to activate the # model, and thus might perform better in noisy environments. - threshold: nil, - # Type of turn detection. 
- type: nil - ) - end + sig { returns(T.nilable(Float)) } + attr_reader :threshold - sig do - override.returns( - { + sig { params(threshold: Float).void } + attr_writer :threshold + + # Server-side voice activity detection (VAD) which flips on when user speech is + # detected and off after a period of silence. + sig do + params( create_response: T::Boolean, - eagerness: - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::OrSymbol, idle_timeout_ms: T.nilable(Integer), interrupt_response: T::Boolean, prefix_padding_ms: Integer, silence_duration_ms: Integer, threshold: Float, - type: - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::OrSymbol - } + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + create_response: nil, + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + idle_timeout_ms: nil, + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + interrupt_response: nil, + # Used only for `server_vad` mode. Amount of audio to include before the VAD + # detected speech (in milliseconds). Defaults to 300ms. + prefix_padding_ms: nil, + # Used only for `server_vad` mode. 
Duration of silence to detect speech stop (in + # milliseconds). Defaults to 500ms. With shorter values the model will respond + # more quickly, but may jump in on short pauses from the user. + silence_duration_ms: nil, + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # defaults to 0.5. A higher threshold will require louder audio to activate the + # model, and thus might perform better in noisy environments. + threshold: nil, + # Type of turn detection, `server_vad` to turn on simple Server VAD. + type: :server_vad ) - end - def to_hash - end + end - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. - module Eagerness - extend OpenAI::Internal::Type::Enum + sig do + override.returns( + { + type: Symbol, + create_response: T::Boolean, + idle_timeout_ms: T.nilable(Integer), + interrupt_response: T::Boolean, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } + ) + end + def to_hash + end + end - TaggedSymbol = + class SemanticVad < OpenAI::Internal::Type::BaseModel + OrHash = T.type_alias do - T.all( - Symbol, - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness + T.any( + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad, + OpenAI::Internal::AnyHash ) end - OrSymbol = T.type_alias { T.any(Symbol, String) } - LOW = - T.let( - :low, - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::TaggedSymbol - ) - MEDIUM = - T.let( - :medium, - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::TaggedSymbol - ) - HIGH = - T.let( - :high, - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::TaggedSymbol - ) - AUTO = - T.let( - :auto, - 
OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::TaggedSymbol - ) + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. + sig { returns(Symbol) } + attr_accessor :type + + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + sig { returns(T.nilable(T::Boolean)) } + attr_reader :create_response + + sig { params(create_response: T::Boolean).void } + attr_writer :create_response + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. sig do - override.returns( - T::Array[ - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::TaggedSymbol - ] + returns( + T.nilable( + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol + ) ) end - def self.values + attr_reader :eagerness + + sig do + params( + eagerness: + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol + ).void end - end + attr_writer :eagerness - # Type of turn detection. - module Type - extend OpenAI::Internal::Type::Enum + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. 
+ sig { returns(T.nilable(T::Boolean)) } + attr_reader :interrupt_response - TaggedSymbol = - T.type_alias do - T.all( - Symbol, - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type - ) - end - OrSymbol = T.type_alias { T.any(Symbol, String) } + sig { params(interrupt_response: T::Boolean).void } + attr_writer :interrupt_response - SERVER_VAD = - T.let( - :server_vad, - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::TaggedSymbol - ) - SEMANTIC_VAD = - T.let( - :semantic_vad, - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::TaggedSymbol - ) + # Server-side semantic turn detection which uses a model to determine when the + # user has finished speaking. + sig do + params( + create_response: T::Boolean, + eagerness: + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol, + interrupt_response: T::Boolean, + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + create_response: nil, + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + eagerness: nil, + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + interrupt_response: nil, + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. 
+ type: :semantic_vad + ) + end sig do override.returns( - T::Array[ - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::TaggedSymbol - ] + { + type: Symbol, + create_response: T::Boolean, + eagerness: + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol, + interrupt_response: T::Boolean + } ) end - def self.values + def to_hash + end + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + module Eagerness + extend OpenAI::Internal::Type::Enum + + TaggedSymbol = + T.type_alias do + T.all( + Symbol, + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness + ) + end + OrSymbol = T.type_alias { T.any(Symbol, String) } + + LOW = + T.let( + :low, + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + MEDIUM = + T.let( + :medium, + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + HIGH = + T.let( + :high, + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + AUTO = + T.let( + :auto, + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + + sig do + override.returns( + T::Array[ + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ] + ) + end + def self.values + end end end + + sig do + override.returns( + T::Array[ + OpenAI::Realtime::RealtimeAudioInputTurnDetection::Variants + ] + ) + end + def self.variants + end end end end diff --git a/rbi/openai/models/realtime/realtime_session.rbi b/rbi/openai/models/realtime/realtime_session.rbi index 42a7c70d..e305375c 100644 --- a/rbi/openai/models/realtime/realtime_session.rbi +++ 
b/rbi/openai/models/realtime/realtime_session.rbi @@ -256,28 +256,28 @@ module OpenAI # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - sig do - returns(T.nilable(OpenAI::Realtime::RealtimeSession::TurnDetection)) - end - attr_reader :turn_detection - + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. 
sig do - params( - turn_detection: - T.nilable( - OpenAI::Realtime::RealtimeSession::TurnDetection::OrHash + returns( + T.nilable( + T.any( + OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad, + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad ) - ).void + ) + ) end - attr_writer :turn_detection + attr_accessor :turn_detection # The voice the model uses to respond. Voice cannot be changed during the session # once the model has responded with audio at least once. Current voice options are @@ -299,7 +299,7 @@ module OpenAI end attr_writer :voice - # Realtime session object. + # Realtime session object for the beta interface. sig do params( id: String, @@ -336,7 +336,10 @@ module OpenAI ), turn_detection: T.nilable( - OpenAI::Realtime::RealtimeSession::TurnDetection::OrHash + T.any( + OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad::OrHash, + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::OrHash + ) ), voice: T.any(String, OpenAI::Realtime::RealtimeSession::Voice::OrSymbol) @@ -420,14 +423,17 @@ module OpenAI tracing: nil, # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. + # response. 
+ # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. turn_detection: nil, # The voice the model uses to respond. Voice cannot be changed during the session # once the model has responded with audio at least once. Current voice options are @@ -472,7 +478,12 @@ module OpenAI ) ), turn_detection: - T.nilable(OpenAI::Realtime::RealtimeSession::TurnDetection), + T.nilable( + T.any( + OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad, + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad + ) + ), voice: T.any( String, @@ -864,256 +875,320 @@ module OpenAI end end - class TurnDetection < OpenAI::Internal::Type::BaseModel - OrHash = + # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # set to `null` to turn off, in which case the client must manually trigger model + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. 
This can be useful for more + # natural conversations, but may have a higher latency. + module TurnDetection + extend OpenAI::Internal::Type::Union + + Variants = T.type_alias do T.any( - OpenAI::Realtime::RealtimeSession::TurnDetection, - OpenAI::Internal::AnyHash + OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad, + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad ) end - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - sig { returns(T.nilable(T::Boolean)) } - attr_reader :create_response - - sig { params(create_response: T::Boolean).void } - attr_writer :create_response - - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. - sig do - returns( - T.nilable( - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::OrSymbol - ) - ) - end - attr_reader :eagerness - - sig do - params( - eagerness: - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::OrSymbol - ).void - end - attr_writer :eagerness - - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received. - sig { returns(T.nilable(Integer)) } - attr_accessor :idle_timeout_ms - - # Whether or not to automatically interrupt any ongoing response with output to - # the default conversation (i.e. `conversation` of `auto`) when a VAD start event - # occurs. - sig { returns(T.nilable(T::Boolean)) } - attr_reader :interrupt_response - - sig { params(interrupt_response: T::Boolean).void } - attr_writer :interrupt_response - - # Used only for `server_vad` mode. Amount of audio to include before the VAD - # detected speech (in milliseconds). Defaults to 300ms. 
- sig { returns(T.nilable(Integer)) } - attr_reader :prefix_padding_ms - - sig { params(prefix_padding_ms: Integer).void } - attr_writer :prefix_padding_ms - - # Used only for `server_vad` mode. Duration of silence to detect speech stop (in - # milliseconds). Defaults to 500ms. With shorter values the model will respond - # more quickly, but may jump in on short pauses from the user. - sig { returns(T.nilable(Integer)) } - attr_reader :silence_duration_ms - - sig { params(silence_duration_ms: Integer).void } - attr_writer :silence_duration_ms - - # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # defaults to 0.5. A higher threshold will require louder audio to activate the - # model, and thus might perform better in noisy environments. - sig { returns(T.nilable(Float)) } - attr_reader :threshold - - sig { params(threshold: Float).void } - attr_writer :threshold - - # Type of turn detection. - sig do - returns( - T.nilable( - OpenAI::Realtime::RealtimeSession::TurnDetection::Type::OrSymbol - ) - ) - end - attr_reader :type + class ServerVad < OpenAI::Internal::Type::BaseModel + OrHash = + T.type_alias do + T.any( + OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad, + OpenAI::Internal::AnyHash + ) + end - sig do - params( - type: - OpenAI::Realtime::RealtimeSession::TurnDetection::Type::OrSymbol - ).void - end - attr_writer :type + # Type of turn detection, `server_vad` to turn on simple Server VAD. + sig { returns(Symbol) } + attr_accessor :type - # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. 
Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - sig do - params( - create_response: T::Boolean, - eagerness: - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::OrSymbol, - idle_timeout_ms: T.nilable(Integer), - interrupt_response: T::Boolean, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: - OpenAI::Realtime::RealtimeSession::TurnDetection::Type::OrSymbol - ).returns(T.attached_class) - end - def self.new( # Whether or not to automatically generate a response when a VAD stop event # occurs. - create_response: nil, - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. - eagerness: nil, - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received. - idle_timeout_ms: nil, + sig { returns(T.nilable(T::Boolean)) } + attr_reader :create_response + + sig { params(create_response: T::Boolean).void } + attr_writer :create_response + + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. 
it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + sig { returns(T.nilable(Integer)) } + attr_accessor :idle_timeout_ms + # Whether or not to automatically interrupt any ongoing response with output to # the default conversation (i.e. `conversation` of `auto`) when a VAD start event # occurs. - interrupt_response: nil, + sig { returns(T.nilable(T::Boolean)) } + attr_reader :interrupt_response + + sig { params(interrupt_response: T::Boolean).void } + attr_writer :interrupt_response + # Used only for `server_vad` mode. Amount of audio to include before the VAD # detected speech (in milliseconds). Defaults to 300ms. - prefix_padding_ms: nil, + sig { returns(T.nilable(Integer)) } + attr_reader :prefix_padding_ms + + sig { params(prefix_padding_ms: Integer).void } + attr_writer :prefix_padding_ms + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in # milliseconds). Defaults to 500ms. With shorter values the model will respond # more quickly, but may jump in on short pauses from the user. - silence_duration_ms: nil, + sig { returns(T.nilable(Integer)) } + attr_reader :silence_duration_ms + + sig { params(silence_duration_ms: Integer).void } + attr_writer :silence_duration_ms + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this # defaults to 0.5. A higher threshold will require louder audio to activate the # model, and thus might perform better in noisy environments. - threshold: nil, - # Type of turn detection. 
- type: nil - ) - end + sig { returns(T.nilable(Float)) } + attr_reader :threshold - sig do - override.returns( - { + sig { params(threshold: Float).void } + attr_writer :threshold + + # Server-side voice activity detection (VAD) which flips on when user speech is + # detected and off after a period of silence. + sig do + params( create_response: T::Boolean, - eagerness: - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::OrSymbol, idle_timeout_ms: T.nilable(Integer), interrupt_response: T::Boolean, prefix_padding_ms: Integer, silence_duration_ms: Integer, threshold: Float, - type: - OpenAI::Realtime::RealtimeSession::TurnDetection::Type::OrSymbol - } + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + create_response: nil, + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + idle_timeout_ms: nil, + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + interrupt_response: nil, + # Used only for `server_vad` mode. Amount of audio to include before the VAD + # detected speech (in milliseconds). Defaults to 300ms. + prefix_padding_ms: nil, + # Used only for `server_vad` mode. 
Duration of silence to detect speech stop (in + # milliseconds). Defaults to 500ms. With shorter values the model will respond + # more quickly, but may jump in on short pauses from the user. + silence_duration_ms: nil, + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # defaults to 0.5. A higher threshold will require louder audio to activate the + # model, and thus might perform better in noisy environments. + threshold: nil, + # Type of turn detection, `server_vad` to turn on simple Server VAD. + type: :server_vad ) - end - def to_hash - end + end - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. - module Eagerness - extend OpenAI::Internal::Type::Enum + sig do + override.returns( + { + type: Symbol, + create_response: T::Boolean, + idle_timeout_ms: T.nilable(Integer), + interrupt_response: T::Boolean, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } + ) + end + def to_hash + end + end - TaggedSymbol = + class SemanticVad < OpenAI::Internal::Type::BaseModel + OrHash = T.type_alias do - T.all( - Symbol, - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness + T.any( + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad, + OpenAI::Internal::AnyHash ) end - OrSymbol = T.type_alias { T.any(Symbol, String) } - LOW = - T.let( - :low, - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol - ) - MEDIUM = - T.let( - :medium, - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol - ) - HIGH = - T.let( - :high, - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol - ) - AUTO = - T.let( - :auto, - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol - ) + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. 
+ sig { returns(Symbol) } + attr_accessor :type + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + sig { returns(T.nilable(T::Boolean)) } + attr_reader :create_response + + sig { params(create_response: T::Boolean).void } + attr_writer :create_response + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. sig do - override.returns( - T::Array[ - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol - ] + returns( + T.nilable( + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::OrSymbol + ) ) end - def self.values + attr_reader :eagerness + + sig do + params( + eagerness: + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::OrSymbol + ).void end - end + attr_writer :eagerness - # Type of turn detection. - module Type - extend OpenAI::Internal::Type::Enum + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + sig { returns(T.nilable(T::Boolean)) } + attr_reader :interrupt_response - TaggedSymbol = - T.type_alias do - T.all( - Symbol, - OpenAI::Realtime::RealtimeSession::TurnDetection::Type - ) - end - OrSymbol = T.type_alias { T.any(Symbol, String) } + sig { params(interrupt_response: T::Boolean).void } + attr_writer :interrupt_response - SERVER_VAD = - T.let( - :server_vad, - OpenAI::Realtime::RealtimeSession::TurnDetection::Type::TaggedSymbol - ) - SEMANTIC_VAD = - T.let( - :semantic_vad, - OpenAI::Realtime::RealtimeSession::TurnDetection::Type::TaggedSymbol - ) + # Server-side semantic turn detection which uses a model to determine when the + # user has finished speaking. 
+ sig do + params( + create_response: T::Boolean, + eagerness: + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::OrSymbol, + interrupt_response: T::Boolean, + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + create_response: nil, + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + eagerness: nil, + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + interrupt_response: nil, + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. + type: :semantic_vad + ) + end sig do override.returns( - T::Array[ - OpenAI::Realtime::RealtimeSession::TurnDetection::Type::TaggedSymbol - ] + { + type: Symbol, + create_response: T::Boolean, + eagerness: + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::OrSymbol, + interrupt_response: T::Boolean + } ) end - def self.values + def to_hash + end + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. 
+ module Eagerness + extend OpenAI::Internal::Type::Enum + + TaggedSymbol = + T.type_alias do + T.all( + Symbol, + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness + ) + end + OrSymbol = T.type_alias { T.any(Symbol, String) } + + LOW = + T.let( + :low, + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + MEDIUM = + T.let( + :medium, + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + HIGH = + T.let( + :high, + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + AUTO = + T.let( + :auto, + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + + sig do + override.returns( + T::Array[ + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ] + ) + end + def self.values + end end end + + sig do + override.returns( + T::Array[ + OpenAI::Realtime::RealtimeSession::TurnDetection::Variants + ] + ) + end + def self.variants + end end # The voice the model uses to respond. Voice cannot be changed during the session diff --git a/rbi/openai/models/realtime/realtime_session_create_response.rbi b/rbi/openai/models/realtime/realtime_session_create_response.rbi index 0518a759..6cca5872 100644 --- a/rbi/openai/models/realtime/realtime_session_create_response.rbi +++ b/rbi/openai/models/realtime/realtime_session_create_response.rbi @@ -525,30 +525,25 @@ module OpenAI # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. 
Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. 
sig do returns( T.nilable( - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Variants ) ) end - attr_reader :turn_detection - - sig do - params( - turn_detection: - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::OrHash - ).void - end - attr_writer :turn_detection + attr_accessor :turn_detection sig do params( @@ -562,7 +557,12 @@ module OpenAI OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::NoiseReduction::OrHash, transcription: OpenAI::Realtime::AudioTranscription::OrHash, turn_detection: - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::OrHash + T.nilable( + T.any( + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad::OrHash, + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::OrHash + ) + ) ).returns(T.attached_class) end def self.new( @@ -585,14 +585,17 @@ module OpenAI transcription: nil, # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. + # response. 
+ # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. turn_detection: nil ) end @@ -605,7 +608,9 @@ module OpenAI OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::NoiseReduction, transcription: OpenAI::Realtime::AudioTranscription, turn_detection: - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection + T.nilable( + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Variants + ) } ) end @@ -665,259 +670,320 @@ module OpenAI end end - class TurnDetection < OpenAI::Internal::Type::BaseModel - OrHash = + # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # set to `null` to turn off, in which case the client must manually trigger model + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. 
+ module TurnDetection + extend OpenAI::Internal::Type::Union + + Variants = T.type_alias do T.any( - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection, - OpenAI::Internal::AnyHash + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad, + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad ) end - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - sig { returns(T.nilable(T::Boolean)) } - attr_reader :create_response - - sig { params(create_response: T::Boolean).void } - attr_writer :create_response - - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. - sig do - returns( - T.nilable( - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol - ) - ) - end - attr_reader :eagerness - - sig do - params( - eagerness: - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::OrSymbol - ).void - end - attr_writer :eagerness - - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received and emits a `timeout_triggered` event. - sig { returns(T.nilable(Integer)) } - attr_accessor :idle_timeout_ms - - # Whether or not to automatically interrupt any ongoing response with output to - # the default conversation (i.e. `conversation` of `auto`) when a VAD start event - # occurs. - sig { returns(T.nilable(T::Boolean)) } - attr_reader :interrupt_response - - sig { params(interrupt_response: T::Boolean).void } - attr_writer :interrupt_response - - # Used only for `server_vad` mode. Amount of audio to include before the VAD - # detected speech (in milliseconds). 
Defaults to 300ms. - sig { returns(T.nilable(Integer)) } - attr_reader :prefix_padding_ms - - sig { params(prefix_padding_ms: Integer).void } - attr_writer :prefix_padding_ms - - # Used only for `server_vad` mode. Duration of silence to detect speech stop (in - # milliseconds). Defaults to 500ms. With shorter values the model will respond - # more quickly, but may jump in on short pauses from the user. - sig { returns(T.nilable(Integer)) } - attr_reader :silence_duration_ms - - sig { params(silence_duration_ms: Integer).void } - attr_writer :silence_duration_ms - - # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # defaults to 0.5. A higher threshold will require louder audio to activate the - # model, and thus might perform better in noisy environments. - sig { returns(T.nilable(Float)) } - attr_reader :threshold - - sig { params(threshold: Float).void } - attr_writer :threshold - - # Type of turn detection. - sig do - returns( - T.nilable( - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::TaggedSymbol - ) - ) - end - attr_reader :type + class ServerVad < OpenAI::Internal::Type::BaseModel + OrHash = + T.type_alias do + T.any( + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad, + OpenAI::Internal::AnyHash + ) + end - sig do - params( - type: - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::OrSymbol - ).void - end - attr_writer :type + # Type of turn detection, `server_vad` to turn on simple Server VAD. + sig { returns(Symbol) } + attr_accessor :type - # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. 
Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - sig do - params( - create_response: T::Boolean, - eagerness: - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::OrSymbol, - idle_timeout_ms: T.nilable(Integer), - interrupt_response: T::Boolean, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::OrSymbol - ).returns(T.attached_class) - end - def self.new( # Whether or not to automatically generate a response when a VAD stop event # occurs. - create_response: nil, - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. - eagerness: nil, - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received and emits a `timeout_triggered` event. - idle_timeout_ms: nil, + sig { returns(T.nilable(T::Boolean)) } + attr_reader :create_response + + sig { params(create_response: T::Boolean).void } + attr_writer :create_response + + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. 
The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + sig { returns(T.nilable(Integer)) } + attr_accessor :idle_timeout_ms + # Whether or not to automatically interrupt any ongoing response with output to # the default conversation (i.e. `conversation` of `auto`) when a VAD start event # occurs. - interrupt_response: nil, + sig { returns(T.nilable(T::Boolean)) } + attr_reader :interrupt_response + + sig { params(interrupt_response: T::Boolean).void } + attr_writer :interrupt_response + # Used only for `server_vad` mode. Amount of audio to include before the VAD # detected speech (in milliseconds). Defaults to 300ms. - prefix_padding_ms: nil, + sig { returns(T.nilable(Integer)) } + attr_reader :prefix_padding_ms + + sig { params(prefix_padding_ms: Integer).void } + attr_writer :prefix_padding_ms + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in # milliseconds). Defaults to 500ms. With shorter values the model will respond # more quickly, but may jump in on short pauses from the user. - silence_duration_ms: nil, + sig { returns(T.nilable(Integer)) } + attr_reader :silence_duration_ms + + sig { params(silence_duration_ms: Integer).void } + attr_writer :silence_duration_ms + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this # defaults to 0.5. A higher threshold will require louder audio to activate the # model, and thus might perform better in noisy environments. - threshold: nil, - # Type of turn detection. 
- type: nil - ) - end + sig { returns(T.nilable(Float)) } + attr_reader :threshold - sig do - override.returns( - { + sig { params(threshold: Float).void } + attr_writer :threshold + + # Server-side voice activity detection (VAD) which flips on when user speech is + # detected and off after a period of silence. + sig do + params( create_response: T::Boolean, - eagerness: - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol, idle_timeout_ms: T.nilable(Integer), interrupt_response: T::Boolean, prefix_padding_ms: Integer, silence_duration_ms: Integer, threshold: Float, - type: - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::TaggedSymbol - } + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + create_response: nil, + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + idle_timeout_ms: nil, + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + interrupt_response: nil, + # Used only for `server_vad` mode. Amount of audio to include before the VAD + # detected speech (in milliseconds). Defaults to 300ms. 
+ prefix_padding_ms: nil, + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in + # milliseconds). Defaults to 500ms. With shorter values the model will respond + # more quickly, but may jump in on short pauses from the user. + silence_duration_ms: nil, + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # defaults to 0.5. A higher threshold will require louder audio to activate the + # model, and thus might perform better in noisy environments. + threshold: nil, + # Type of turn detection, `server_vad` to turn on simple Server VAD. + type: :server_vad ) - end - def to_hash - end + end - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. - module Eagerness - extend OpenAI::Internal::Type::Enum + sig do + override.returns( + { + type: Symbol, + create_response: T::Boolean, + idle_timeout_ms: T.nilable(Integer), + interrupt_response: T::Boolean, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } + ) + end + def to_hash + end + end - TaggedSymbol = + class SemanticVad < OpenAI::Internal::Type::BaseModel + OrHash = T.type_alias do - T.all( - Symbol, - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness + T.any( + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad, + OpenAI::Internal::AnyHash ) end - OrSymbol = T.type_alias { T.any(Symbol, String) } - LOW = - T.let( - :low, - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol - ) - MEDIUM = - T.let( - :medium, - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol - ) - HIGH = - T.let( - :high, - 
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol - ) - AUTO = - T.let( - :auto, - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol - ) + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. + sig { returns(Symbol) } + attr_accessor :type + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + sig { returns(T.nilable(T::Boolean)) } + attr_reader :create_response + + sig { params(create_response: T::Boolean).void } + attr_writer :create_response + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. sig do - override.returns( - T::Array[ - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol - ] + returns( + T.nilable( + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) ) end - def self.values + attr_reader :eagerness + + sig do + params( + eagerness: + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::OrSymbol + ).void end - end + attr_writer :eagerness - # Type of turn detection. - module Type - extend OpenAI::Internal::Type::Enum + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. 
+ sig { returns(T.nilable(T::Boolean)) } + attr_reader :interrupt_response - TaggedSymbol = - T.type_alias do - T.all( - Symbol, - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type - ) - end - OrSymbol = T.type_alias { T.any(Symbol, String) } + sig { params(interrupt_response: T::Boolean).void } + attr_writer :interrupt_response - SERVER_VAD = - T.let( - :server_vad, - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::TaggedSymbol - ) - SEMANTIC_VAD = - T.let( - :semantic_vad, - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::TaggedSymbol - ) + # Server-side semantic turn detection which uses a model to determine when the + # user has finished speaking. + sig do + params( + create_response: T::Boolean, + eagerness: + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::OrSymbol, + interrupt_response: T::Boolean, + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + create_response: nil, + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + eagerness: nil, + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + interrupt_response: nil, + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. 
+ type: :semantic_vad + ) + end sig do override.returns( - T::Array[ - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::TaggedSymbol - ] + { + type: Symbol, + create_response: T::Boolean, + eagerness: + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol, + interrupt_response: T::Boolean + } ) end - def self.values + def to_hash + end + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + module Eagerness + extend OpenAI::Internal::Type::Enum + + TaggedSymbol = + T.type_alias do + T.all( + Symbol, + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness + ) + end + OrSymbol = T.type_alias { T.any(Symbol, String) } + + LOW = + T.let( + :low, + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + MEDIUM = + T.let( + :medium, + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + HIGH = + T.let( + :high, + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + AUTO = + T.let( + :auto, + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + + sig do + override.returns( + T::Array[ + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ] + ) + end + def self.values + end end end + + sig do + override.returns( + T::Array[ + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Variants + ] + ) + end + def self.variants + 
end end end diff --git a/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi b/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi index 360679f8..a07f9361 100644 --- a/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi +++ b/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi @@ -80,30 +80,28 @@ module OpenAI # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. 
sig do returns( T.nilable( - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection + T.any( + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad, + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad + ) ) ) end - attr_reader :turn_detection - - sig do - params( - turn_detection: - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::OrHash - ).void - end - attr_writer :turn_detection + attr_accessor :turn_detection sig do params( @@ -117,7 +115,12 @@ module OpenAI OpenAI::Realtime::RealtimeTranscriptionSessionAudioInput::NoiseReduction::OrHash, transcription: OpenAI::Realtime::AudioTranscription::OrHash, turn_detection: - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::OrHash + T.nilable( + T.any( + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad::OrHash, + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::OrHash + ) + ) ).returns(T.attached_class) end def self.new( @@ -140,14 +143,17 @@ module OpenAI transcription: nil, # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. + # response. 
+ # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. turn_detection: nil ) end @@ -165,7 +171,12 @@ module OpenAI OpenAI::Realtime::RealtimeTranscriptionSessionAudioInput::NoiseReduction, transcription: OpenAI::Realtime::AudioTranscription, turn_detection: - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection + T.nilable( + T.any( + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad, + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad + ) + ) } ) end diff --git a/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi b/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi index 676cf1eb..3dc51534 100644 --- a/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi +++ b/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi @@ -3,256 +3,320 @@ module OpenAI module Models module Realtime - class RealtimeTranscriptionSessionAudioInputTurnDetection < OpenAI::Internal::Type::BaseModel - OrHash = + # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # set to `null` to turn off, in which case the client must manually trigger model + # response. 
+ # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. + module RealtimeTranscriptionSessionAudioInputTurnDetection + extend OpenAI::Internal::Type::Union + + Variants = T.type_alias do T.any( - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection, - OpenAI::Internal::AnyHash + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad, + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad ) end - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - sig { returns(T.nilable(T::Boolean)) } - attr_reader :create_response - - sig { params(create_response: T::Boolean).void } - attr_writer :create_response - - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. - sig do - returns( - T.nilable( - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::OrSymbol - ) - ) - end - attr_reader :eagerness - - sig do - params( - eagerness: - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::OrSymbol - ).void - end - attr_writer :eagerness - - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received. 
- sig { returns(T.nilable(Integer)) } - attr_accessor :idle_timeout_ms - - # Whether or not to automatically interrupt any ongoing response with output to - # the default conversation (i.e. `conversation` of `auto`) when a VAD start event - # occurs. - sig { returns(T.nilable(T::Boolean)) } - attr_reader :interrupt_response - - sig { params(interrupt_response: T::Boolean).void } - attr_writer :interrupt_response - - # Used only for `server_vad` mode. Amount of audio to include before the VAD - # detected speech (in milliseconds). Defaults to 300ms. - sig { returns(T.nilable(Integer)) } - attr_reader :prefix_padding_ms - - sig { params(prefix_padding_ms: Integer).void } - attr_writer :prefix_padding_ms + class ServerVad < OpenAI::Internal::Type::BaseModel + OrHash = + T.type_alias do + T.any( + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad, + OpenAI::Internal::AnyHash + ) + end - # Used only for `server_vad` mode. Duration of silence to detect speech stop (in - # milliseconds). Defaults to 500ms. With shorter values the model will respond - # more quickly, but may jump in on short pauses from the user. - sig { returns(T.nilable(Integer)) } - attr_reader :silence_duration_ms + # Type of turn detection, `server_vad` to turn on simple Server VAD. + sig { returns(Symbol) } + attr_accessor :type - sig { params(silence_duration_ms: Integer).void } - attr_writer :silence_duration_ms + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + sig { returns(T.nilable(T::Boolean)) } + attr_reader :create_response - # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # defaults to 0.5. A higher threshold will require louder audio to activate the - # model, and thus might perform better in noisy environments. 
- sig { returns(T.nilable(Float)) } - attr_reader :threshold + sig { params(create_response: T::Boolean).void } + attr_writer :create_response - sig { params(threshold: Float).void } - attr_writer :threshold + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + sig { returns(T.nilable(Integer)) } + attr_accessor :idle_timeout_ms - # Type of turn detection. - sig do - returns( - T.nilable( - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::OrSymbol - ) - ) - end - attr_reader :type - - sig do - params( - type: - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::OrSymbol - ).void - end - attr_writer :type - - # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. 
For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - sig do - params( - create_response: T::Boolean, - eagerness: - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::OrSymbol, - idle_timeout_ms: T.nilable(Integer), - interrupt_response: T::Boolean, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::OrSymbol - ).returns(T.attached_class) - end - def self.new( - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - create_response: nil, - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. - eagerness: nil, - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received. - idle_timeout_ms: nil, # Whether or not to automatically interrupt any ongoing response with output to # the default conversation (i.e. `conversation` of `auto`) when a VAD start event # occurs. - interrupt_response: nil, + sig { returns(T.nilable(T::Boolean)) } + attr_reader :interrupt_response + + sig { params(interrupt_response: T::Boolean).void } + attr_writer :interrupt_response + # Used only for `server_vad` mode. Amount of audio to include before the VAD # detected speech (in milliseconds). Defaults to 300ms. - prefix_padding_ms: nil, + sig { returns(T.nilable(Integer)) } + attr_reader :prefix_padding_ms + + sig { params(prefix_padding_ms: Integer).void } + attr_writer :prefix_padding_ms + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in # milliseconds). 
Defaults to 500ms. With shorter values the model will respond # more quickly, but may jump in on short pauses from the user. - silence_duration_ms: nil, + sig { returns(T.nilable(Integer)) } + attr_reader :silence_duration_ms + + sig { params(silence_duration_ms: Integer).void } + attr_writer :silence_duration_ms + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this # defaults to 0.5. A higher threshold will require louder audio to activate the # model, and thus might perform better in noisy environments. - threshold: nil, - # Type of turn detection. - type: nil - ) - end + sig { returns(T.nilable(Float)) } + attr_reader :threshold - sig do - override.returns( - { + sig { params(threshold: Float).void } + attr_writer :threshold + + # Server-side voice activity detection (VAD) which flips on when user speech is + # detected and off after a period of silence. + sig do + params( create_response: T::Boolean, - eagerness: - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::OrSymbol, idle_timeout_ms: T.nilable(Integer), interrupt_response: T::Boolean, prefix_padding_ms: Integer, silence_duration_ms: Integer, threshold: Float, - type: - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::OrSymbol - } + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + create_response: nil, + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. 
+ # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + idle_timeout_ms: nil, + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + interrupt_response: nil, + # Used only for `server_vad` mode. Amount of audio to include before the VAD + # detected speech (in milliseconds). Defaults to 300ms. + prefix_padding_ms: nil, + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in + # milliseconds). Defaults to 500ms. With shorter values the model will respond + # more quickly, but may jump in on short pauses from the user. + silence_duration_ms: nil, + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # defaults to 0.5. A higher threshold will require louder audio to activate the + # model, and thus might perform better in noisy environments. + threshold: nil, + # Type of turn detection, `server_vad` to turn on simple Server VAD. + type: :server_vad ) - end - def to_hash - end + end - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. 
- module Eagerness - extend OpenAI::Internal::Type::Enum + sig do + override.returns( + { + type: Symbol, + create_response: T::Boolean, + idle_timeout_ms: T.nilable(Integer), + interrupt_response: T::Boolean, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } + ) + end + def to_hash + end + end - TaggedSymbol = + class SemanticVad < OpenAI::Internal::Type::BaseModel + OrHash = T.type_alias do - T.all( - Symbol, - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness + T.any( + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad, + OpenAI::Internal::AnyHash ) end - OrSymbol = T.type_alias { T.any(Symbol, String) } - LOW = - T.let( - :low, - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol - ) - MEDIUM = - T.let( - :medium, - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol - ) - HIGH = - T.let( - :high, - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol - ) - AUTO = - T.let( - :auto, - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol - ) + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. + sig { returns(Symbol) } + attr_accessor :type + + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + sig { returns(T.nilable(T::Boolean)) } + attr_reader :create_response + sig { params(create_response: T::Boolean).void } + attr_writer :create_response + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. 
sig do - override.returns( - T::Array[ - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol - ] + returns( + T.nilable( + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol + ) ) end - def self.values + attr_reader :eagerness + + sig do + params( + eagerness: + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol + ).void end - end + attr_writer :eagerness - # Type of turn detection. - module Type - extend OpenAI::Internal::Type::Enum + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + sig { returns(T.nilable(T::Boolean)) } + attr_reader :interrupt_response - TaggedSymbol = - T.type_alias do - T.all( - Symbol, - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type - ) - end - OrSymbol = T.type_alias { T.any(Symbol, String) } + sig { params(interrupt_response: T::Boolean).void } + attr_writer :interrupt_response - SERVER_VAD = - T.let( - :server_vad, - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::TaggedSymbol - ) - SEMANTIC_VAD = - T.let( - :semantic_vad, - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::TaggedSymbol - ) + # Server-side semantic turn detection which uses a model to determine when the + # user has finished speaking. + sig do + params( + create_response: T::Boolean, + eagerness: + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol, + interrupt_response: T::Boolean, + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + create_response: nil, + # Used only for `semantic_vad` mode. The eagerness of the model to respond. 
`low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + eagerness: nil, + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + interrupt_response: nil, + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. + type: :semantic_vad + ) + end sig do override.returns( - T::Array[ - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::TaggedSymbol - ] + { + type: Symbol, + create_response: T::Boolean, + eagerness: + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol, + interrupt_response: T::Boolean + } ) end - def self.values + def to_hash + end + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. 
+ module Eagerness + extend OpenAI::Internal::Type::Enum + + TaggedSymbol = + T.type_alias do + T.all( + Symbol, + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness + ) + end + OrSymbol = T.type_alias { T.any(Symbol, String) } + + LOW = + T.let( + :low, + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + MEDIUM = + T.let( + :medium, + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + HIGH = + T.let( + :high, + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + AUTO = + T.let( + :auto, + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + + sig do + override.returns( + T::Array[ + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ] + ) + end + def self.values + end end end + + sig do + override.returns( + T::Array[ + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Variants + ] + ) + end + def self.variants + end end end end diff --git a/rbi/openai/models/responses/response.rbi b/rbi/openai/models/responses/response.rbi index dd1d405b..80f62655 100644 --- a/rbi/openai/models/responses/response.rbi +++ b/rbi/openai/models/responses/response.rbi @@ -265,10 +265,10 @@ module OpenAI # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. 
- # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. + # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. sig do returns( @@ -521,10 +521,10 @@ module OpenAI top_logprobs: nil, # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. + # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. truncation: nil, # Represents token usage details including input tokens, output tokens, a @@ -819,10 +819,10 @@ module OpenAI # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. 
+ # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. module Truncation extend OpenAI::Internal::Type::Enum diff --git a/rbi/openai/models/responses/response_create_params.rbi b/rbi/openai/models/responses/response_create_params.rbi index 9d019c87..42b3aaa1 100644 --- a/rbi/openai/models/responses/response_create_params.rbi +++ b/rbi/openai/models/responses/response_create_params.rbi @@ -378,10 +378,10 @@ module OpenAI # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. + # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. sig do returns( @@ -637,10 +637,10 @@ module OpenAI top_p: nil, # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. 
+ # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. truncation: nil, # This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use @@ -920,10 +920,10 @@ module OpenAI # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. + # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. module Truncation extend OpenAI::Internal::Type::Enum diff --git a/rbi/openai/resources/responses.rbi b/rbi/openai/resources/responses.rbi index e82f7866..e032693f 100644 --- a/rbi/openai/resources/responses.rbi +++ b/rbi/openai/resources/responses.rbi @@ -258,10 +258,10 @@ module OpenAI top_p: nil, # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. 
+ # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. truncation: nil, # This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use @@ -535,10 +535,10 @@ module OpenAI top_p: nil, # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. + # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. truncation: nil, # This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use diff --git a/sig/openai/models/realtime/realtime_audio_config_input.rbs b/sig/openai/models/realtime/realtime_audio_config_input.rbs index 08d072de..5c1a430e 100644 --- a/sig/openai/models/realtime/realtime_audio_config_input.rbs +++ b/sig/openai/models/realtime/realtime_audio_config_input.rbs @@ -6,7 +6,7 @@ module OpenAI format_: OpenAI::Models::Realtime::realtime_audio_formats, noise_reduction: OpenAI::Realtime::RealtimeAudioConfigInput::NoiseReduction, transcription: OpenAI::Realtime::AudioTranscription, - turn_detection: OpenAI::Realtime::RealtimeAudioInputTurnDetection + turn_detection: OpenAI::Models::Realtime::realtime_audio_input_turn_detection? 
} class RealtimeAudioConfigInput < OpenAI::Internal::Type::BaseModel @@ -28,24 +28,20 @@ module OpenAI OpenAI::Realtime::AudioTranscription ) -> OpenAI::Realtime::AudioTranscription - attr_reader turn_detection: OpenAI::Realtime::RealtimeAudioInputTurnDetection? - - def turn_detection=: ( - OpenAI::Realtime::RealtimeAudioInputTurnDetection - ) -> OpenAI::Realtime::RealtimeAudioInputTurnDetection + attr_accessor turn_detection: OpenAI::Models::Realtime::realtime_audio_input_turn_detection? def initialize: ( ?format_: OpenAI::Models::Realtime::realtime_audio_formats, ?noise_reduction: OpenAI::Realtime::RealtimeAudioConfigInput::NoiseReduction, ?transcription: OpenAI::Realtime::AudioTranscription, - ?turn_detection: OpenAI::Realtime::RealtimeAudioInputTurnDetection + ?turn_detection: OpenAI::Models::Realtime::realtime_audio_input_turn_detection? ) -> void def to_hash: -> { format_: OpenAI::Models::Realtime::realtime_audio_formats, noise_reduction: OpenAI::Realtime::RealtimeAudioConfigInput::NoiseReduction, transcription: OpenAI::Realtime::AudioTranscription, - turn_detection: OpenAI::Realtime::RealtimeAudioInputTurnDetection + turn_detection: OpenAI::Models::Realtime::realtime_audio_input_turn_detection? 
} type noise_reduction = diff --git a/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs b/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs index 3a8b1c9e..4c6593ea 100644 --- a/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs +++ b/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs @@ -2,97 +2,123 @@ module OpenAI module Models module Realtime type realtime_audio_input_turn_detection = - { - create_response: bool, - eagerness: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::eagerness, - idle_timeout_ms: Integer?, - interrupt_response: bool, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::type_ - } + OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad + | OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad - class RealtimeAudioInputTurnDetection < OpenAI::Internal::Type::BaseModel - attr_reader create_response: bool? + module RealtimeAudioInputTurnDetection + extend OpenAI::Internal::Type::Union - def create_response=: (bool) -> bool + type server_vad = + { + type: :server_vad, + create_response: bool, + idle_timeout_ms: Integer?, + interrupt_response: bool, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } - attr_reader eagerness: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::eagerness? + class ServerVad < OpenAI::Internal::Type::BaseModel + attr_accessor type: :server_vad - def eagerness=: ( - OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::eagerness - ) -> OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::eagerness + attr_reader create_response: bool? - attr_accessor idle_timeout_ms: Integer? + def create_response=: (bool) -> bool - attr_reader interrupt_response: bool? + attr_accessor idle_timeout_ms: Integer? - def interrupt_response=: (bool) -> bool + attr_reader interrupt_response: bool? 
- attr_reader prefix_padding_ms: Integer? + def interrupt_response=: (bool) -> bool - def prefix_padding_ms=: (Integer) -> Integer + attr_reader prefix_padding_ms: Integer? - attr_reader silence_duration_ms: Integer? + def prefix_padding_ms=: (Integer) -> Integer - def silence_duration_ms=: (Integer) -> Integer + attr_reader silence_duration_ms: Integer? - attr_reader threshold: Float? + def silence_duration_ms=: (Integer) -> Integer - def threshold=: (Float) -> Float + attr_reader threshold: Float? - attr_reader type: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::type_? + def threshold=: (Float) -> Float - def type=: ( - OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::type_ - ) -> OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::type_ + def initialize: ( + ?create_response: bool, + ?idle_timeout_ms: Integer?, + ?interrupt_response: bool, + ?prefix_padding_ms: Integer, + ?silence_duration_ms: Integer, + ?threshold: Float, + ?type: :server_vad + ) -> void - def initialize: ( - ?create_response: bool, - ?eagerness: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::eagerness, - ?idle_timeout_ms: Integer?, - ?interrupt_response: bool, - ?prefix_padding_ms: Integer, - ?silence_duration_ms: Integer, - ?threshold: Float, - ?type: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::type_ - ) -> void + def to_hash: -> { + type: :server_vad, + create_response: bool, + idle_timeout_ms: Integer?, + interrupt_response: bool, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } + end - def to_hash: -> { - create_response: bool, - eagerness: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::eagerness, - idle_timeout_ms: Integer?, - interrupt_response: bool, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::type_ - } + type semantic_vad = + { + type: :semantic_vad, + create_response: bool, + 
eagerness: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::eagerness, + interrupt_response: bool + } - type eagerness = :low | :medium | :high | :auto + class SemanticVad < OpenAI::Internal::Type::BaseModel + attr_accessor type: :semantic_vad - module Eagerness - extend OpenAI::Internal::Type::Enum + attr_reader create_response: bool? - LOW: :low - MEDIUM: :medium - HIGH: :high - AUTO: :auto + def create_response=: (bool) -> bool - def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::eagerness] - end + attr_reader eagerness: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::eagerness? + + def eagerness=: ( + OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::eagerness + ) -> OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::eagerness + + attr_reader interrupt_response: bool? - type type_ = :server_vad | :semantic_vad + def interrupt_response=: (bool) -> bool - module Type - extend OpenAI::Internal::Type::Enum + def initialize: ( + ?create_response: bool, + ?eagerness: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::eagerness, + ?interrupt_response: bool, + ?type: :semantic_vad + ) -> void - SERVER_VAD: :server_vad - SEMANTIC_VAD: :semantic_vad + def to_hash: -> { + type: :semantic_vad, + create_response: bool, + eagerness: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::eagerness, + interrupt_response: bool + } - def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::type_] + type eagerness = :low | :medium | :high | :auto + + module Eagerness + extend OpenAI::Internal::Type::Enum + + LOW: :low + MEDIUM: :medium + HIGH: :high + AUTO: :auto + + def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::eagerness] + end end + + def self?.variants: -> ::Array[OpenAI::Models::Realtime::realtime_audio_input_turn_detection] end 
end end diff --git a/sig/openai/models/realtime/realtime_session.rbs b/sig/openai/models/realtime/realtime_session.rbs index 3f239c05..480c4857 100644 --- a/sig/openai/models/realtime/realtime_session.rbs +++ b/sig/openai/models/realtime/realtime_session.rbs @@ -21,7 +21,7 @@ module OpenAI tool_choice: String, tools: ::Array[OpenAI::Realtime::RealtimeFunctionTool], tracing: OpenAI::Models::Realtime::RealtimeSession::tracing?, - turn_detection: OpenAI::Realtime::RealtimeSession::TurnDetection?, + turn_detection: OpenAI::Models::Realtime::RealtimeSession::turn_detection?, voice: OpenAI::Models::Realtime::RealtimeSession::voice } @@ -106,7 +106,7 @@ module OpenAI attr_accessor tracing: OpenAI::Models::Realtime::RealtimeSession::tracing? - attr_accessor turn_detection: OpenAI::Realtime::RealtimeSession::TurnDetection? + attr_accessor turn_detection: OpenAI::Models::Realtime::RealtimeSession::turn_detection? attr_reader voice: OpenAI::Models::Realtime::RealtimeSession::voice? @@ -133,7 +133,7 @@ module OpenAI ?tool_choice: String, ?tools: ::Array[OpenAI::Realtime::RealtimeFunctionTool], ?tracing: OpenAI::Models::Realtime::RealtimeSession::tracing?, - ?turn_detection: OpenAI::Realtime::RealtimeSession::TurnDetection?, + ?turn_detection: OpenAI::Models::Realtime::RealtimeSession::turn_detection?, ?voice: OpenAI::Models::Realtime::RealtimeSession::voice ) -> void @@ -156,7 +156,7 @@ module OpenAI tool_choice: String, tools: ::Array[OpenAI::Realtime::RealtimeFunctionTool], tracing: OpenAI::Models::Realtime::RealtimeSession::tracing?, - turn_detection: OpenAI::Realtime::RealtimeSession::TurnDetection?, + turn_detection: OpenAI::Models::Realtime::RealtimeSession::turn_detection?, voice: OpenAI::Models::Realtime::RealtimeSession::voice } @@ -307,97 +307,123 @@ module OpenAI end type turn_detection = - { - create_response: bool, - eagerness: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::eagerness, - idle_timeout_ms: Integer?, - interrupt_response: bool, - 
prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::type_ - } + OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad + | OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad - class TurnDetection < OpenAI::Internal::Type::BaseModel - attr_reader create_response: bool? + module TurnDetection + extend OpenAI::Internal::Type::Union - def create_response=: (bool) -> bool + type server_vad = + { + type: :server_vad, + create_response: bool, + idle_timeout_ms: Integer?, + interrupt_response: bool, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } - attr_reader eagerness: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::eagerness? + class ServerVad < OpenAI::Internal::Type::BaseModel + attr_accessor type: :server_vad - def eagerness=: ( - OpenAI::Models::Realtime::RealtimeSession::TurnDetection::eagerness - ) -> OpenAI::Models::Realtime::RealtimeSession::TurnDetection::eagerness + attr_reader create_response: bool? - attr_accessor idle_timeout_ms: Integer? + def create_response=: (bool) -> bool - attr_reader interrupt_response: bool? + attr_accessor idle_timeout_ms: Integer? - def interrupt_response=: (bool) -> bool + attr_reader interrupt_response: bool? - attr_reader prefix_padding_ms: Integer? + def interrupt_response=: (bool) -> bool - def prefix_padding_ms=: (Integer) -> Integer + attr_reader prefix_padding_ms: Integer? - attr_reader silence_duration_ms: Integer? + def prefix_padding_ms=: (Integer) -> Integer - def silence_duration_ms=: (Integer) -> Integer + attr_reader silence_duration_ms: Integer? - attr_reader threshold: Float? + def silence_duration_ms=: (Integer) -> Integer - def threshold=: (Float) -> Float + attr_reader threshold: Float? - attr_reader type: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::type_? 
+ def threshold=: (Float) -> Float - def type=: ( - OpenAI::Models::Realtime::RealtimeSession::TurnDetection::type_ - ) -> OpenAI::Models::Realtime::RealtimeSession::TurnDetection::type_ + def initialize: ( + ?create_response: bool, + ?idle_timeout_ms: Integer?, + ?interrupt_response: bool, + ?prefix_padding_ms: Integer, + ?silence_duration_ms: Integer, + ?threshold: Float, + ?type: :server_vad + ) -> void - def initialize: ( - ?create_response: bool, - ?eagerness: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::eagerness, - ?idle_timeout_ms: Integer?, - ?interrupt_response: bool, - ?prefix_padding_ms: Integer, - ?silence_duration_ms: Integer, - ?threshold: Float, - ?type: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::type_ - ) -> void + def to_hash: -> { + type: :server_vad, + create_response: bool, + idle_timeout_ms: Integer?, + interrupt_response: bool, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } + end - def to_hash: -> { - create_response: bool, - eagerness: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::eagerness, - idle_timeout_ms: Integer?, - interrupt_response: bool, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::type_ - } + type semantic_vad = + { + type: :semantic_vad, + create_response: bool, + eagerness: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::eagerness, + interrupt_response: bool + } - type eagerness = :low | :medium | :high | :auto + class SemanticVad < OpenAI::Internal::Type::BaseModel + attr_accessor type: :semantic_vad - module Eagerness - extend OpenAI::Internal::Type::Enum + attr_reader create_response: bool? 
- LOW: :low - MEDIUM: :medium - HIGH: :high - AUTO: :auto + def create_response=: (bool) -> bool - def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeSession::TurnDetection::eagerness] - end + attr_reader eagerness: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::eagerness? + + def eagerness=: ( + OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::eagerness + ) -> OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::eagerness + + attr_reader interrupt_response: bool? + + def interrupt_response=: (bool) -> bool + + def initialize: ( + ?create_response: bool, + ?eagerness: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::eagerness, + ?interrupt_response: bool, + ?type: :semantic_vad + ) -> void + + def to_hash: -> { + type: :semantic_vad, + create_response: bool, + eagerness: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::eagerness, + interrupt_response: bool + } - type type_ = :server_vad | :semantic_vad + type eagerness = :low | :medium | :high | :auto - module Type - extend OpenAI::Internal::Type::Enum + module Eagerness + extend OpenAI::Internal::Type::Enum - SERVER_VAD: :server_vad - SEMANTIC_VAD: :semantic_vad + LOW: :low + MEDIUM: :medium + HIGH: :high + AUTO: :auto - def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeSession::TurnDetection::type_] + def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::eagerness] + end end + + def self?.variants: -> ::Array[OpenAI::Models::Realtime::RealtimeSession::turn_detection] end type voice = diff --git a/sig/openai/models/realtime/realtime_session_create_response.rbs b/sig/openai/models/realtime/realtime_session_create_response.rbs index fbecbd5d..e4daa170 100644 --- a/sig/openai/models/realtime/realtime_session_create_response.rbs +++ b/sig/openai/models/realtime/realtime_session_create_response.rbs @@ -147,7 +147,7 @@ module OpenAI format_: 
OpenAI::Models::Realtime::realtime_audio_formats, noise_reduction: OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::NoiseReduction, transcription: OpenAI::Realtime::AudioTranscription, - turn_detection: OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection + turn_detection: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::turn_detection? } class Input < OpenAI::Internal::Type::BaseModel @@ -169,24 +169,20 @@ module OpenAI OpenAI::Realtime::AudioTranscription ) -> OpenAI::Realtime::AudioTranscription - attr_reader turn_detection: OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection? - - def turn_detection=: ( - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection - ) -> OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection + attr_accessor turn_detection: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::turn_detection? def initialize: ( ?format_: OpenAI::Models::Realtime::realtime_audio_formats, ?noise_reduction: OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::NoiseReduction, ?transcription: OpenAI::Realtime::AudioTranscription, - ?turn_detection: OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection + ?turn_detection: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::turn_detection? ) -> void def to_hash: -> { format_: OpenAI::Models::Realtime::realtime_audio_formats, noise_reduction: OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::NoiseReduction, transcription: OpenAI::Realtime::AudioTranscription, - turn_detection: OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection + turn_detection: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::turn_detection? 
} type noise_reduction = @@ -209,97 +205,123 @@ module OpenAI end type turn_detection = - { - create_response: bool, - eagerness: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::eagerness, - idle_timeout_ms: Integer?, - interrupt_response: bool, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::type_ - } + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad + | OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad - class TurnDetection < OpenAI::Internal::Type::BaseModel - attr_reader create_response: bool? + module TurnDetection + extend OpenAI::Internal::Type::Union - def create_response=: (bool) -> bool + type server_vad = + { + type: :server_vad, + create_response: bool, + idle_timeout_ms: Integer?, + interrupt_response: bool, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } - attr_reader eagerness: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::eagerness? + class ServerVad < OpenAI::Internal::Type::BaseModel + attr_accessor type: :server_vad - def eagerness=: ( - OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::eagerness - ) -> OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::eagerness + attr_reader create_response: bool? - attr_accessor idle_timeout_ms: Integer? + def create_response=: (bool) -> bool - attr_reader interrupt_response: bool? + attr_accessor idle_timeout_ms: Integer? - def interrupt_response=: (bool) -> bool + attr_reader interrupt_response: bool? - attr_reader prefix_padding_ms: Integer? + def interrupt_response=: (bool) -> bool - def prefix_padding_ms=: (Integer) -> Integer + attr_reader prefix_padding_ms: Integer? - attr_reader silence_duration_ms: Integer? 
+ def prefix_padding_ms=: (Integer) -> Integer - def silence_duration_ms=: (Integer) -> Integer + attr_reader silence_duration_ms: Integer? - attr_reader threshold: Float? + def silence_duration_ms=: (Integer) -> Integer - def threshold=: (Float) -> Float + attr_reader threshold: Float? - attr_reader type: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::type_? + def threshold=: (Float) -> Float - def type=: ( - OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::type_ - ) -> OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::type_ + def initialize: ( + ?create_response: bool, + ?idle_timeout_ms: Integer?, + ?interrupt_response: bool, + ?prefix_padding_ms: Integer, + ?silence_duration_ms: Integer, + ?threshold: Float, + ?type: :server_vad + ) -> void - def initialize: ( - ?create_response: bool, - ?eagerness: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::eagerness, - ?idle_timeout_ms: Integer?, - ?interrupt_response: bool, - ?prefix_padding_ms: Integer, - ?silence_duration_ms: Integer, - ?threshold: Float, - ?type: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::type_ - ) -> void + def to_hash: -> { + type: :server_vad, + create_response: bool, + idle_timeout_ms: Integer?, + interrupt_response: bool, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } + end - def to_hash: -> { - create_response: bool, - eagerness: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::eagerness, - idle_timeout_ms: Integer?, - interrupt_response: bool, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::type_ - } + type semantic_vad = + { + type: :semantic_vad, + create_response: bool, + eagerness: 
OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::eagerness, + interrupt_response: bool + } - type eagerness = :low | :medium | :high | :auto + class SemanticVad < OpenAI::Internal::Type::BaseModel + attr_accessor type: :semantic_vad - module Eagerness - extend OpenAI::Internal::Type::Enum + attr_reader create_response: bool? - LOW: :low - MEDIUM: :medium - HIGH: :high - AUTO: :auto + def create_response=: (bool) -> bool - def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::eagerness] - end + attr_reader eagerness: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::eagerness? - type type_ = :server_vad | :semantic_vad + def eagerness=: ( + OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::eagerness + ) -> OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::eagerness - module Type - extend OpenAI::Internal::Type::Enum + attr_reader interrupt_response: bool? 
- SERVER_VAD: :server_vad - SEMANTIC_VAD: :semantic_vad + def interrupt_response=: (bool) -> bool - def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::type_] + def initialize: ( + ?create_response: bool, + ?eagerness: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::eagerness, + ?interrupt_response: bool, + ?type: :semantic_vad + ) -> void + + def to_hash: -> { + type: :semantic_vad, + create_response: bool, + eagerness: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::eagerness, + interrupt_response: bool + } + + type eagerness = :low | :medium | :high | :auto + + module Eagerness + extend OpenAI::Internal::Type::Enum + + LOW: :low + MEDIUM: :medium + HIGH: :high + AUTO: :auto + + def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::eagerness] + end end + + def self?.variants: -> ::Array[OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::turn_detection] end end diff --git a/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs b/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs index 44b5b8fa..33d8b172 100644 --- a/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs +++ b/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs @@ -6,7 +6,7 @@ module OpenAI format_: OpenAI::Models::Realtime::realtime_audio_formats, noise_reduction: OpenAI::Realtime::RealtimeTranscriptionSessionAudioInput::NoiseReduction, transcription: OpenAI::Realtime::AudioTranscription, - turn_detection: OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection + turn_detection: OpenAI::Models::Realtime::realtime_transcription_session_audio_input_turn_detection? 
} class RealtimeTranscriptionSessionAudioInput < OpenAI::Internal::Type::BaseModel @@ -28,24 +28,20 @@ module OpenAI OpenAI::Realtime::AudioTranscription ) -> OpenAI::Realtime::AudioTranscription - attr_reader turn_detection: OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection? - - def turn_detection=: ( - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection - ) -> OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection + attr_accessor turn_detection: OpenAI::Models::Realtime::realtime_transcription_session_audio_input_turn_detection? def initialize: ( ?format_: OpenAI::Models::Realtime::realtime_audio_formats, ?noise_reduction: OpenAI::Realtime::RealtimeTranscriptionSessionAudioInput::NoiseReduction, ?transcription: OpenAI::Realtime::AudioTranscription, - ?turn_detection: OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection + ?turn_detection: OpenAI::Models::Realtime::realtime_transcription_session_audio_input_turn_detection? ) -> void def to_hash: -> { format_: OpenAI::Models::Realtime::realtime_audio_formats, noise_reduction: OpenAI::Realtime::RealtimeTranscriptionSessionAudioInput::NoiseReduction, transcription: OpenAI::Realtime::AudioTranscription, - turn_detection: OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection + turn_detection: OpenAI::Models::Realtime::realtime_transcription_session_audio_input_turn_detection? 
} type noise_reduction = diff --git a/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs b/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs index 56ac5314..1a61a5e7 100644 --- a/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs +++ b/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs @@ -2,97 +2,123 @@ module OpenAI module Models module Realtime type realtime_transcription_session_audio_input_turn_detection = - { - create_response: bool, - eagerness: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::eagerness, - idle_timeout_ms: Integer?, - interrupt_response: bool, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::type_ - } + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad + | OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad - class RealtimeTranscriptionSessionAudioInputTurnDetection < OpenAI::Internal::Type::BaseModel - attr_reader create_response: bool? + module RealtimeTranscriptionSessionAudioInputTurnDetection + extend OpenAI::Internal::Type::Union - def create_response=: (bool) -> bool + type server_vad = + { + type: :server_vad, + create_response: bool, + idle_timeout_ms: Integer?, + interrupt_response: bool, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } - attr_reader eagerness: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::eagerness? 
+ class ServerVad < OpenAI::Internal::Type::BaseModel + attr_accessor type: :server_vad - def eagerness=: ( - OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::eagerness - ) -> OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::eagerness + attr_reader create_response: bool? - attr_accessor idle_timeout_ms: Integer? + def create_response=: (bool) -> bool - attr_reader interrupt_response: bool? + attr_accessor idle_timeout_ms: Integer? - def interrupt_response=: (bool) -> bool + attr_reader interrupt_response: bool? - attr_reader prefix_padding_ms: Integer? + def interrupt_response=: (bool) -> bool - def prefix_padding_ms=: (Integer) -> Integer + attr_reader prefix_padding_ms: Integer? - attr_reader silence_duration_ms: Integer? + def prefix_padding_ms=: (Integer) -> Integer - def silence_duration_ms=: (Integer) -> Integer + attr_reader silence_duration_ms: Integer? - attr_reader threshold: Float? + def silence_duration_ms=: (Integer) -> Integer - def threshold=: (Float) -> Float + attr_reader threshold: Float? - attr_reader type: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::type_? 
+ def threshold=: (Float) -> Float - def type=: ( - OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::type_ - ) -> OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::type_ + def initialize: ( + ?create_response: bool, + ?idle_timeout_ms: Integer?, + ?interrupt_response: bool, + ?prefix_padding_ms: Integer, + ?silence_duration_ms: Integer, + ?threshold: Float, + ?type: :server_vad + ) -> void - def initialize: ( - ?create_response: bool, - ?eagerness: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::eagerness, - ?idle_timeout_ms: Integer?, - ?interrupt_response: bool, - ?prefix_padding_ms: Integer, - ?silence_duration_ms: Integer, - ?threshold: Float, - ?type: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::type_ - ) -> void + def to_hash: -> { + type: :server_vad, + create_response: bool, + idle_timeout_ms: Integer?, + interrupt_response: bool, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } + end - def to_hash: -> { - create_response: bool, - eagerness: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::eagerness, - idle_timeout_ms: Integer?, - interrupt_response: bool, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::type_ - } + type semantic_vad = + { + type: :semantic_vad, + create_response: bool, + eagerness: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::eagerness, + interrupt_response: bool + } - type eagerness = :low | :medium | :high | :auto + class SemanticVad < OpenAI::Internal::Type::BaseModel + attr_accessor type: :semantic_vad - module Eagerness - extend OpenAI::Internal::Type::Enum + attr_reader create_response: bool? 
- LOW: :low - MEDIUM: :medium - HIGH: :high - AUTO: :auto + def create_response=: (bool) -> bool - def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::eagerness] - end + attr_reader eagerness: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::eagerness? + + def eagerness=: ( + OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::eagerness + ) -> OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::eagerness + + attr_reader interrupt_response: bool? - type type_ = :server_vad | :semantic_vad + def interrupt_response=: (bool) -> bool - module Type - extend OpenAI::Internal::Type::Enum + def initialize: ( + ?create_response: bool, + ?eagerness: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::eagerness, + ?interrupt_response: bool, + ?type: :semantic_vad + ) -> void - SERVER_VAD: :server_vad - SEMANTIC_VAD: :semantic_vad + def to_hash: -> { + type: :semantic_vad, + create_response: bool, + eagerness: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::eagerness, + interrupt_response: bool + } - def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::type_] + type eagerness = :low | :medium | :high | :auto + + module Eagerness + extend OpenAI::Internal::Type::Enum + + LOW: :low + MEDIUM: :medium + HIGH: :high + AUTO: :auto + + def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::eagerness] + end end + + def self?.variants: -> ::Array[OpenAI::Models::Realtime::realtime_transcription_session_audio_input_turn_detection] end end end From 31fe462be11f92561d9d0083bd28f76f43db0fe7 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Thu, 11 Sep 
2025 18:27:46 +0000 Subject: [PATCH 2/3] codegen metadata --- .stats.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.stats.yml b/.stats.yml index 5388f246..e3897189 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 118 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c829f9e7f51d4946dae7b02eb37eb857b538a464cf54c7ced5eff1b1c93e07db.yml -openapi_spec_hash: 1b2eaba46b264bcec8831bc496543649 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-94b1e3cb0bdc616ff0c2f267c33dadd95f133b1f64e647aab6c64afb292b2793.yml +openapi_spec_hash: 2395319ac9befd59b6536ae7f9564a05 config_hash: 930dac3aa861344867e4ac84f037b5df From cd99af965697a05ec394578270a5ebdfd3e742df Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Thu, 11 Sep 2025 18:28:09 +0000 Subject: [PATCH 3/3] release: 0.23.2 --- .release-please-manifest.json | 2 +- CHANGELOG.md | 8 ++++++++ Gemfile.lock | 2 +- README.md | 2 +- lib/openai/version.rb | 2 +- 5 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 354c2fa8..c9da8cc1 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "0.23.1" + ".": "0.23.2" } \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index d8b3a290..6df92bf4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## 0.23.2 (2025-09-11) + +Full Changelog: [v0.23.1...v0.23.2](https://github.com/openai/openai-ruby/compare/v0.23.1...v0.23.2) + +### Chores + +* **api:** Minor docs and type updates for realtime ([ccef982](https://github.com/openai/openai-ruby/commit/ccef9827b31206fc9ba40d2b6165eeefda7621f5)) + ## 0.23.1 (2025-09-10) Full Changelog: [v0.23.0...v0.23.1](https://github.com/openai/openai-ruby/compare/v0.23.0...v0.23.1) diff --git 
a/Gemfile.lock b/Gemfile.lock index 85e14eb4..04b60951 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -11,7 +11,7 @@ GIT PATH remote: . specs: - openai (0.23.1) + openai (0.23.2) connection_pool GEM diff --git a/README.md b/README.md index ace31963..9a262951 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ To use this gem, install via Bundler by adding the following to your application ```ruby -gem "openai", "~> 0.23.1" +gem "openai", "~> 0.23.2" ``` diff --git a/lib/openai/version.rb b/lib/openai/version.rb index 4ec01a14..fbf5600a 100644 --- a/lib/openai/version.rb +++ b/lib/openai/version.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true module OpenAI - VERSION = "0.23.1" + VERSION = "0.23.2" end