From ccef9827b31206fc9ba40d2b6165eeefda7621f5 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Thu, 11 Sep 2025 18:04:26 +0000 Subject: [PATCH 1/3] chore(api): Minor docs and type updates for realtime --- .stats.yml | 4 +- .../input_audio_buffer_timeout_triggered.rb | 30 +- .../realtime/realtime_audio_config_input.rb | 25 +- .../realtime_audio_input_turn_detection.rb | 290 ++++++---- .../models/realtime/realtime_server_event.rb | 14 +- .../models/realtime/realtime_session.rb | 297 ++++++---- .../realtime_session_create_response.rb | 306 ++++++---- ...ltime_transcription_session_audio_input.rb | 27 +- ...tion_session_audio_input_turn_detection.rb | 292 ++++++---- lib/openai/models/responses/response.rb | 16 +- .../responses/response_create_params.rb | 16 +- .../input_audio_buffer_timeout_triggered.rbi | 29 +- .../realtime/realtime_audio_config_input.rbi | 72 ++- .../realtime_audio_input_turn_detection.rbi | 467 ++++++++------- .../models/realtime/realtime_session.rbi | 537 ++++++++++-------- .../realtime_session_create_response.rbi | 530 +++++++++-------- ...time_transcription_session_audio_input.rbi | 67 ++- ...ion_session_audio_input_turn_detection.rbi | 464 ++++++++------- rbi/openai/models/responses/response.rbi | 24 +- .../responses/response_create_params.rbi | 24 +- rbi/openai/resources/responses.rbi | 16 +- .../realtime/realtime_audio_config_input.rbs | 12 +- .../realtime_audio_input_turn_detection.rbs | 156 ++--- .../models/realtime/realtime_session.rbs | 164 +++--- .../realtime_session_create_response.rbs | 168 +++--- ...time_transcription_session_audio_input.rbs | 12 +- ...ion_session_audio_input_turn_detection.rbs | 156 ++--- 27 files changed, 2448 insertions(+), 1767 deletions(-) diff --git a/.stats.yml b/.stats.yml index 2aa16be8..5388f246 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 118 -openapi_spec_url: 
https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-16cb18bed32bae8c5840fb39a1bf664026cc40463ad0c487dcb0df1bd3d72db0.yml -openapi_spec_hash: 4cb51b22f98dee1a90bc7add82d1d132 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c829f9e7f51d4946dae7b02eb37eb857b538a464cf54c7ced5eff1b1c93e07db.yml +openapi_spec_hash: 1b2eaba46b264bcec8831bc496543649 config_hash: 930dac3aa861344867e4ac84f037b5df diff --git a/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb b/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb index 91227a91..2d9af6dd 100644 --- a/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb +++ b/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb @@ -5,13 +5,15 @@ module Models module Realtime class InputAudioBufferTimeoutTriggered < OpenAI::Internal::Type::BaseModel # @!attribute audio_end_ms - # Millisecond offset where speech ended within the buffered audio. + # Millisecond offset of audio written to the input audio buffer at the time the + # timeout was triggered. # # @return [Integer] required :audio_end_ms, Integer # @!attribute audio_start_ms - # Millisecond offset where speech started within the buffered audio. + # Millisecond offset of audio written to the input audio buffer that was after the + # playback time of the last model response. # # @return [Integer] required :audio_start_ms, Integer @@ -35,11 +37,29 @@ class InputAudioBufferTimeoutTriggered < OpenAI::Internal::Type::BaseModel required :type, const: :"input_audio_buffer.timeout_triggered" # @!method initialize(audio_end_ms:, audio_start_ms:, event_id:, item_id:, type: :"input_audio_buffer.timeout_triggered") - # Returned when the server VAD timeout is triggered for the input audio buffer. + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Realtime::InputAudioBufferTimeoutTriggered} for more details. 
# - # @param audio_end_ms [Integer] Millisecond offset where speech ended within the buffered audio. + # Returned when the Server VAD timeout is triggered for the input audio buffer. + # This is configured with `idle_timeout_ms` in the `turn_detection` settings of + # the session, and it indicates that there hasn't been any speech detected for the + # configured duration. # - # @param audio_start_ms [Integer] Millisecond offset where speech started within the buffered audio. + # The `audio_start_ms` and `audio_end_ms` fields indicate the segment of audio + # after the last model response up to the triggering time, as an offset from the + # beginning of audio written to the input audio buffer. This means it demarcates + # the segment of audio that was silent and the difference between the start and + # end values will roughly match the configured timeout. + # + # The empty audio will be committed to the conversation as an `input_audio` item + # (there will be a `input_audio_buffer.committed` event) and a model response will + # be generated. There may be speech that didn't trigger VAD but is still detected + # by the model, so the model may respond with something relevant to the + # conversation or a prompt to continue speaking. + # + # @param audio_end_ms [Integer] Millisecond offset of audio written to the input audio buffer at the time the ti + # + # @param audio_start_ms [Integer] Millisecond offset of audio written to the input audio buffer that was after the # # @param event_id [String] The unique ID of the server event. 
# diff --git a/lib/openai/models/realtime/realtime_audio_config_input.rb b/lib/openai/models/realtime/realtime_audio_config_input.rb index 89f70507..37ca5874 100644 --- a/lib/openai/models/realtime/realtime_audio_config_input.rb +++ b/lib/openai/models/realtime/realtime_audio_config_input.rb @@ -36,17 +36,20 @@ class RealtimeAudioConfigInput < OpenAI::Internal::Type::BaseModel # @!attribute turn_detection # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. + # response. # - # @return [OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection, nil] - optional :turn_detection, -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection } + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. 
+ # + # @return [OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad, nil] + optional :turn_detection, union: -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection }, nil?: true # @!method initialize(format_: nil, noise_reduction: nil, transcription: nil, turn_detection: nil) # Some parameter documentations has been truncated, see @@ -58,7 +61,7 @@ class RealtimeAudioConfigInput < OpenAI::Internal::Type::BaseModel # # @param transcription [OpenAI::Models::Realtime::AudioTranscription] Configuration for input audio transcription, defaults to off and can be set to ` # - # @param turn_detection [OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection] Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # @param turn_detection [OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad, nil] Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # @see OpenAI::Models::Realtime::RealtimeAudioConfigInput#noise_reduction class NoiseReduction < OpenAI::Internal::Type::BaseModel diff --git a/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb b/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb index c1695bed..376f499b 100644 --- a/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb +++ b/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb @@ -3,128 +3,184 @@ module OpenAI module Models module Realtime - class RealtimeAudioInputTurnDetection < OpenAI::Internal::Type::BaseModel - # @!attribute create_response - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - # - # @return [Boolean, nil] - optional :create_response, OpenAI::Internal::Type::Boolean - - # @!attribute eagerness - # Used only for `semantic_vad` mode. The eagerness of the model to respond. 
`low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. - # - # @return [Symbol, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::Eagerness, nil] - optional :eagerness, enum: -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness } - - # @!attribute idle_timeout_ms - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received and emits a `timeout_triggered` event. - # - # @return [Integer, nil] - optional :idle_timeout_ms, Integer, nil?: true - - # @!attribute interrupt_response - # Whether or not to automatically interrupt any ongoing response with output to - # the default conversation (i.e. `conversation` of `auto`) when a VAD start event - # occurs. - # - # @return [Boolean, nil] - optional :interrupt_response, OpenAI::Internal::Type::Boolean - - # @!attribute prefix_padding_ms - # Used only for `server_vad` mode. Amount of audio to include before the VAD - # detected speech (in milliseconds). Defaults to 300ms. - # - # @return [Integer, nil] - optional :prefix_padding_ms, Integer - - # @!attribute silence_duration_ms - # Used only for `server_vad` mode. Duration of silence to detect speech stop (in - # milliseconds). Defaults to 500ms. With shorter values the model will respond - # more quickly, but may jump in on short pauses from the user. - # - # @return [Integer, nil] - optional :silence_duration_ms, Integer - - # @!attribute threshold - # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # defaults to 0.5. A higher threshold will require louder audio to activate the - # model, and thus might perform better in noisy environments. - # - # @return [Float, nil] - optional :threshold, Float - - # @!attribute type - # Type of turn detection. 
- # - # @return [Symbol, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::Type, nil] - optional :type, enum: -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type } - - # @!method initialize(create_response: nil, eagerness: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: nil) - # Some parameter documentations has been truncated, see - # {OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection} for more details. - # - # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - # - # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs - # - # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # - # @param idle_timeout_ms [Integer, nil] Optional idle timeout after which turn detection will auto-timeout when - # - # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th - # - # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. 
Amount of audio to include before the VAD detec - # - # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m - # - # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # - # @param type [Symbol, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::Type] Type of turn detection. - - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. - # - # @see OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection#eagerness - module Eagerness - extend OpenAI::Internal::Type::Enum - - LOW = :low - MEDIUM = :medium - HIGH = :high - AUTO = :auto - - # @!method self.values - # @return [Array] + # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # set to `null` to turn off, in which case the client must manually trigger model + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. 
+ module RealtimeAudioInputTurnDetection + extend OpenAI::Internal::Type::Union + + discriminator :type + + # Server-side voice activity detection (VAD) which flips on when user speech is detected and off after a period of silence. + variant :server_vad, -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad } + + # Server-side semantic turn detection which uses a model to determine when the user has finished speaking. + variant :semantic_vad, -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad } + + class ServerVad < OpenAI::Internal::Type::BaseModel + # @!attribute type + # Type of turn detection, `server_vad` to turn on simple Server VAD. + # + # @return [Symbol, :server_vad] + required :type, const: :server_vad + + # @!attribute create_response + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + # + # @return [Boolean, nil] + optional :create_response, OpenAI::Internal::Type::Boolean + + # @!attribute idle_timeout_ms + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + # + # @return [Integer, nil] + optional :idle_timeout_ms, Integer, nil?: true + + # @!attribute interrupt_response + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. 
+ # + # @return [Boolean, nil] + optional :interrupt_response, OpenAI::Internal::Type::Boolean + + # @!attribute prefix_padding_ms + # Used only for `server_vad` mode. Amount of audio to include before the VAD + # detected speech (in milliseconds). Defaults to 300ms. + # + # @return [Integer, nil] + optional :prefix_padding_ms, Integer + + # @!attribute silence_duration_ms + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in + # milliseconds). Defaults to 500ms. With shorter values the model will respond + # more quickly, but may jump in on short pauses from the user. + # + # @return [Integer, nil] + optional :silence_duration_ms, Integer + + # @!attribute threshold + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # defaults to 0.5. A higher threshold will require louder audio to activate the + # model, and thus might perform better in noisy environments. + # + # @return [Float, nil] + optional :threshold, Float + + # @!method initialize(create_response: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: :server_vad) + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::ServerVad} for more + # details. + # + # Server-side voice activity detection (VAD) which flips on when user speech is + # detected and off after a period of silence. + # + # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs + # + # @param idle_timeout_ms [Integer, nil] Optional timeout after which a model response will be triggered automatically. T + # + # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th + # + # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. 
Amount of audio to include before the VAD detec + # + # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m + # + # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # + # @param type [Symbol, :server_vad] Type of turn detection, `server_vad` to turn on simple Server VAD. end - # Type of turn detection. - # - # @see OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection#type - module Type - extend OpenAI::Internal::Type::Enum + class SemanticVad < OpenAI::Internal::Type::BaseModel + # @!attribute type + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. + # + # @return [Symbol, :semantic_vad] + required :type, const: :semantic_vad + + # @!attribute create_response + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + # + # @return [Boolean, nil] + optional :create_response, OpenAI::Internal::Type::Boolean + + # @!attribute eagerness + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + # + # @return [Symbol, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness, nil] + optional :eagerness, + enum: -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness } - SERVER_VAD = :server_vad - SEMANTIC_VAD = :semantic_vad + # @!attribute interrupt_response + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. 
+ # + # @return [Boolean, nil] + optional :interrupt_response, OpenAI::Internal::Type::Boolean - # @!method self.values - # @return [Array] + # @!method initialize(create_response: nil, eagerness: nil, interrupt_response: nil, type: :semantic_vad) + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad} for + # more details. + # + # Server-side semantic turn detection which uses a model to determine when the + # user has finished speaking. + # + # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs + # + # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # + # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th + # + # @param type [Symbol, :semantic_vad] Type of turn detection, `semantic_vad` to turn on Semantic VAD. + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. 
+ # + # @see OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad#eagerness + module Eagerness + extend OpenAI::Internal::Type::Enum + + LOW = :low + MEDIUM = :medium + HIGH = :high + AUTO = :auto + + # @!method self.values + # @return [Array] + end end + + # @!method self.variants + # @return [Array(OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad)] end end end diff --git a/lib/openai/models/realtime/realtime_server_event.rb b/lib/openai/models/realtime/realtime_server_event.rb index af9f4692..9ea21835 100644 --- a/lib/openai/models/realtime/realtime_server_event.rb +++ b/lib/openai/models/realtime/realtime_server_event.rb @@ -208,7 +208,19 @@ module RealtimeServerEvent # The event will include the full content of the Item except for audio data, which can be retrieved separately with a `conversation.item.retrieve` event if needed. variant :"conversation.item.done", -> { OpenAI::Realtime::ConversationItemDone } - # Returned when the server VAD timeout is triggered for the input audio buffer. + # Returned when the Server VAD timeout is triggered for the input audio buffer. This is configured + # with `idle_timeout_ms` in the `turn_detection` settings of the session, and it indicates that + # there hasn't been any speech detected for the configured duration. + # + # The `audio_start_ms` and `audio_end_ms` fields indicate the segment of audio after the last + # model response up to the triggering time, as an offset from the beginning of audio written + # to the input audio buffer. This means it demarcates the segment of audio that was silent and + # the difference between the start and end values will roughly match the configured timeout. + # + # The empty audio will be committed to the conversation as an `input_audio` item (there will be a + # `input_audio_buffer.committed` event) and a model response will be generated. 
There may be speech + # that didn't trigger VAD but is still detected by the model, so the model may respond with + # something relevant to the conversation or a prompt to continue speaking. variant :"input_audio_buffer.timeout_triggered", -> { OpenAI::Realtime::InputAudioBufferTimeoutTriggered } # Returned when an input audio transcription segment is identified for an item. diff --git a/lib/openai/models/realtime/realtime_session.rb b/lib/openai/models/realtime/realtime_session.rb index 035c49f7..74db4075 100644 --- a/lib/openai/models/realtime/realtime_session.rb +++ b/lib/openai/models/realtime/realtime_session.rb @@ -158,17 +158,20 @@ class RealtimeSession < OpenAI::Internal::Type::BaseModel # @!attribute turn_detection # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - # - # @return [OpenAI::Models::Realtime::RealtimeSession::TurnDetection, nil] - optional :turn_detection, -> { OpenAI::Realtime::RealtimeSession::TurnDetection }, nil?: true + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. 
+ # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. + # + # @return [OpenAI::Models::Realtime::RealtimeSession::TurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad, nil] + optional :turn_detection, union: -> { OpenAI::Realtime::RealtimeSession::TurnDetection }, nil?: true # @!attribute voice # The voice the model uses to respond. Voice cannot be changed during the session @@ -182,7 +185,7 @@ class RealtimeSession < OpenAI::Internal::Type::BaseModel # Some parameter documentations has been truncated, see # {OpenAI::Models::Realtime::RealtimeSession} for more details. # - # Realtime session object. + # Realtime session object for the beta interface. # # @param id [String] Unique identifier for the session that looks like `sess_1234567890abcdef`. # @@ -220,7 +223,7 @@ class RealtimeSession < OpenAI::Internal::Type::BaseModel # # @param tracing [Symbol, :auto, OpenAI::Models::Realtime::RealtimeSession::Tracing::TracingConfiguration, nil] Configuration options for tracing. Set to null to disable tracing. Once # - # @param turn_detection [OpenAI::Models::Realtime::RealtimeSession::TurnDetection, nil] Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # @param turn_detection [OpenAI::Models::Realtime::RealtimeSession::TurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad, nil] Configuration for turn detection, ether Server VAD or Semantic VAD. 
This can be # # @param voice [String, Symbol, OpenAI::Models::Realtime::RealtimeSession::Voice] The voice the model uses to respond. Voice cannot be changed during the @@ -401,127 +404,185 @@ class TracingConfiguration < OpenAI::Internal::Type::BaseModel # @return [Array(Symbol, :auto, OpenAI::Models::Realtime::RealtimeSession::Tracing::TracingConfiguration)] end + # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # set to `null` to turn off, in which case the client must manually trigger model + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. + # # @see OpenAI::Models::Realtime::RealtimeSession#turn_detection - class TurnDetection < OpenAI::Internal::Type::BaseModel - # @!attribute create_response - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - # - # @return [Boolean, nil] - optional :create_response, OpenAI::Internal::Type::Boolean - - # @!attribute eagerness - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. 
- # - # @return [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::Eagerness, nil] - optional :eagerness, enum: -> { OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness } + module TurnDetection + extend OpenAI::Internal::Type::Union - # @!attribute idle_timeout_ms - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received. - # - # @return [Integer, nil] - optional :idle_timeout_ms, Integer, nil?: true + discriminator :type - # @!attribute interrupt_response - # Whether or not to automatically interrupt any ongoing response with output to - # the default conversation (i.e. `conversation` of `auto`) when a VAD start event - # occurs. - # - # @return [Boolean, nil] - optional :interrupt_response, OpenAI::Internal::Type::Boolean + # Server-side voice activity detection (VAD) which flips on when user speech is detected and off after a period of silence. + variant :server_vad, -> { OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad } - # @!attribute prefix_padding_ms - # Used only for `server_vad` mode. Amount of audio to include before the VAD - # detected speech (in milliseconds). Defaults to 300ms. - # - # @return [Integer, nil] - optional :prefix_padding_ms, Integer + # Server-side semantic turn detection which uses a model to determine when the user has finished speaking. + variant :semantic_vad, -> { OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad } - # @!attribute silence_duration_ms - # Used only for `server_vad` mode. Duration of silence to detect speech stop (in - # milliseconds). Defaults to 500ms. With shorter values the model will respond - # more quickly, but may jump in on short pauses from the user. - # - # @return [Integer, nil] - optional :silence_duration_ms, Integer + class ServerVad < OpenAI::Internal::Type::BaseModel + # @!attribute type + # Type of turn detection, `server_vad` to turn on simple Server VAD. 
+ # + # @return [Symbol, :server_vad] + required :type, const: :server_vad - # @!attribute threshold - # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # defaults to 0.5. A higher threshold will require louder audio to activate the - # model, and thus might perform better in noisy environments. - # - # @return [Float, nil] - optional :threshold, Float + # @!attribute create_response + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + # + # @return [Boolean, nil] + optional :create_response, OpenAI::Internal::Type::Boolean + + # @!attribute idle_timeout_ms + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + # + # @return [Integer, nil] + optional :idle_timeout_ms, Integer, nil?: true - # @!attribute type - # Type of turn detection. - # - # @return [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::Type, nil] - optional :type, enum: -> { OpenAI::Realtime::RealtimeSession::TurnDetection::Type } + # @!attribute interrupt_response + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. 
+ # + # @return [Boolean, nil] + optional :interrupt_response, OpenAI::Internal::Type::Boolean - # @!method initialize(create_response: nil, eagerness: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: nil) - # Some parameter documentations has been truncated, see - # {OpenAI::Models::Realtime::RealtimeSession::TurnDetection} for more details. - # - # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - # - # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs - # - # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # - # @param idle_timeout_ms [Integer, nil] Optional idle timeout after which turn detection will auto-timeout when - # - # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th - # - # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec - # - # @param silence_duration_ms [Integer] Used only for `server_vad` mode. 
Duration of silence to detect speech stop (in m - # - # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # - # @param type [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::Type] Type of turn detection. + # @!attribute prefix_padding_ms + # Used only for `server_vad` mode. Amount of audio to include before the VAD + # detected speech (in milliseconds). Defaults to 300ms. + # + # @return [Integer, nil] + optional :prefix_padding_ms, Integer - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. - # - # @see OpenAI::Models::Realtime::RealtimeSession::TurnDetection#eagerness - module Eagerness - extend OpenAI::Internal::Type::Enum + # @!attribute silence_duration_ms + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in + # milliseconds). Defaults to 500ms. With shorter values the model will respond + # more quickly, but may jump in on short pauses from the user. + # + # @return [Integer, nil] + optional :silence_duration_ms, Integer - LOW = :low - MEDIUM = :medium - HIGH = :high - AUTO = :auto + # @!attribute threshold + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # defaults to 0.5. A higher threshold will require louder audio to activate the + # model, and thus might perform better in noisy environments. + # + # @return [Float, nil] + optional :threshold, Float - # @!method self.values - # @return [Array] + # @!method initialize(create_response: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: :server_vad) + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Realtime::RealtimeSession::TurnDetection::ServerVad} for more + # details. 
+ # + # Server-side voice activity detection (VAD) which flips on when user speech is + # detected and off after a period of silence. + # + # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs + # + # @param idle_timeout_ms [Integer, nil] Optional timeout after which a model response will be triggered automatically. T + # + # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th + # + # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec + # + # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m + # + # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # + # @param type [Symbol, :server_vad] Type of turn detection, `server_vad` to turn on simple Server VAD. end - # Type of turn detection. - # - # @see OpenAI::Models::Realtime::RealtimeSession::TurnDetection#type - module Type - extend OpenAI::Internal::Type::Enum + class SemanticVad < OpenAI::Internal::Type::BaseModel + # @!attribute type + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. + # + # @return [Symbol, :semantic_vad] + required :type, const: :semantic_vad + + # @!attribute create_response + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + # + # @return [Boolean, nil] + optional :create_response, OpenAI::Internal::Type::Boolean + + # @!attribute eagerness + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. 
+ # + # @return [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness, nil] + optional :eagerness, enum: -> { OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness } - SERVER_VAD = :server_vad - SEMANTIC_VAD = :semantic_vad + # @!attribute interrupt_response + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + # + # @return [Boolean, nil] + optional :interrupt_response, OpenAI::Internal::Type::Boolean - # @!method self.values - # @return [Array] + # @!method initialize(create_response: nil, eagerness: nil, interrupt_response: nil, type: :semantic_vad) + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad} for more + # details. + # + # Server-side semantic turn detection which uses a model to determine when the + # user has finished speaking. + # + # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs + # + # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # + # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th + # + # @param type [Symbol, :semantic_vad] Type of turn detection, `semantic_vad` to turn on Semantic VAD. + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. 
+ # + # @see OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad#eagerness + module Eagerness + extend OpenAI::Internal::Type::Enum + + LOW = :low + MEDIUM = :medium + HIGH = :high + AUTO = :auto + + # @!method self.values + # @return [Array] + end end + + # @!method self.variants + # @return [Array(OpenAI::Models::Realtime::RealtimeSession::TurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad)] end # The voice the model uses to respond. Voice cannot be changed during the session diff --git a/lib/openai/models/realtime/realtime_session_create_response.rb b/lib/openai/models/realtime/realtime_session_create_response.rb index 3adf8f09..79dcc176 100644 --- a/lib/openai/models/realtime/realtime_session_create_response.rb +++ b/lib/openai/models/realtime/realtime_session_create_response.rb @@ -198,18 +198,24 @@ class Input < OpenAI::Internal::Type::BaseModel # @!attribute turn_detection # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - # - # @return [OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection, nil] + # response. 
+ # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. + # + # @return [OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad, nil] optional :turn_detection, - -> { OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection } + union: -> { + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection + }, + nil?: true # @!method initialize(format_: nil, noise_reduction: nil, transcription: nil, turn_detection: nil) # Some parameter documentations has been truncated, see @@ -222,7 +228,7 @@ class Input < OpenAI::Internal::Type::BaseModel # # @param transcription [OpenAI::Models::Realtime::AudioTranscription] Configuration for input audio transcription, defaults to off and can be set to ` # - # @param turn_detection [OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection] Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # @param turn_detection [OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad, nil] Configuration for turn detection, ether Server VAD or Semantic VAD. 
This can be # @see OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input#noise_reduction class NoiseReduction < OpenAI::Internal::Type::BaseModel @@ -248,132 +254,188 @@ class NoiseReduction < OpenAI::Internal::Type::BaseModel # @param type [Symbol, OpenAI::Models::Realtime::NoiseReductionType] Type of noise reduction. `near_field` is for close-talking microphones such as h end + # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # set to `null` to turn off, in which case the client must manually trigger model + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. + # # @see OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input#turn_detection - class TurnDetection < OpenAI::Internal::Type::BaseModel - # @!attribute create_response - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - # - # @return [Boolean, nil] - optional :create_response, OpenAI::Internal::Type::Boolean - - # @!attribute eagerness - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. 
- # - # @return [Symbol, OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness, nil] - optional :eagerness, - enum: -> { OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness } + module TurnDetection + extend OpenAI::Internal::Type::Union - # @!attribute idle_timeout_ms - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received and emits a `timeout_triggered` event. - # - # @return [Integer, nil] - optional :idle_timeout_ms, Integer, nil?: true + discriminator :type - # @!attribute interrupt_response - # Whether or not to automatically interrupt any ongoing response with output to - # the default conversation (i.e. `conversation` of `auto`) when a VAD start event - # occurs. - # - # @return [Boolean, nil] - optional :interrupt_response, OpenAI::Internal::Type::Boolean + # Server-side voice activity detection (VAD) which flips on when user speech is detected and off after a period of silence. + variant :server_vad, + -> { OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad } - # @!attribute prefix_padding_ms - # Used only for `server_vad` mode. Amount of audio to include before the VAD - # detected speech (in milliseconds). Defaults to 300ms. - # - # @return [Integer, nil] - optional :prefix_padding_ms, Integer + # Server-side semantic turn detection which uses a model to determine when the user has finished speaking. + variant :semantic_vad, + -> { OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad } - # @!attribute silence_duration_ms - # Used only for `server_vad` mode. Duration of silence to detect speech stop (in - # milliseconds). Defaults to 500ms. With shorter values the model will respond - # more quickly, but may jump in on short pauses from the user. 
- # - # @return [Integer, nil] - optional :silence_duration_ms, Integer + class ServerVad < OpenAI::Internal::Type::BaseModel + # @!attribute type + # Type of turn detection, `server_vad` to turn on simple Server VAD. + # + # @return [Symbol, :server_vad] + required :type, const: :server_vad - # @!attribute threshold - # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # defaults to 0.5. A higher threshold will require louder audio to activate the - # model, and thus might perform better in noisy environments. - # - # @return [Float, nil] - optional :threshold, Float + # @!attribute create_response + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + # + # @return [Boolean, nil] + optional :create_response, OpenAI::Internal::Type::Boolean - # @!attribute type - # Type of turn detection. - # - # @return [Symbol, OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type, nil] - optional :type, - enum: -> { OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type } + # @!attribute idle_timeout_ms + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. 
+ # + # @return [Integer, nil] + optional :idle_timeout_ms, Integer, nil?: true - # @!method initialize(create_response: nil, eagerness: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: nil) - # Some parameter documentations has been truncated, see - # {OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection} - # for more details. - # - # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - # - # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs - # - # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # - # @param idle_timeout_ms [Integer, nil] Optional idle timeout after which turn detection will auto-timeout when - # - # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th - # - # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. 
Amount of audio to include before the VAD detec - # - # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m - # - # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # - # @param type [Symbol, OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type] Type of turn detection. + # @!attribute interrupt_response + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + # + # @return [Boolean, nil] + optional :interrupt_response, OpenAI::Internal::Type::Boolean - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. - # - # @see OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection#eagerness - module Eagerness - extend OpenAI::Internal::Type::Enum + # @!attribute prefix_padding_ms + # Used only for `server_vad` mode. Amount of audio to include before the VAD + # detected speech (in milliseconds). Defaults to 300ms. + # + # @return [Integer, nil] + optional :prefix_padding_ms, Integer - LOW = :low - MEDIUM = :medium - HIGH = :high - AUTO = :auto + # @!attribute silence_duration_ms + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in + # milliseconds). Defaults to 500ms. With shorter values the model will respond + # more quickly, but may jump in on short pauses from the user. + # + # @return [Integer, nil] + optional :silence_duration_ms, Integer - # @!method self.values - # @return [Array] + # @!attribute threshold + # Used only for `server_vad` mode. 
Activation threshold for VAD (0.0 to 1.0), this + # defaults to 0.5. A higher threshold will require louder audio to activate the + # model, and thus might perform better in noisy environments. + # + # @return [Float, nil] + optional :threshold, Float + + # @!method initialize(create_response: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: :server_vad) + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad} + # for more details. + # + # Server-side voice activity detection (VAD) which flips on when user speech is + # detected and off after a period of silence. + # + # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs + # + # @param idle_timeout_ms [Integer, nil] Optional timeout after which a model response will be triggered automatically. T + # + # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th + # + # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec + # + # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m + # + # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # + # @param type [Symbol, :server_vad] Type of turn detection, `server_vad` to turn on simple Server VAD. end - # Type of turn detection. - # - # @see OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection#type - module Type - extend OpenAI::Internal::Type::Enum + class SemanticVad < OpenAI::Internal::Type::BaseModel + # @!attribute type + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. 
+ # + # @return [Symbol, :semantic_vad] + required :type, const: :semantic_vad + + # @!attribute create_response + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + # + # @return [Boolean, nil] + optional :create_response, OpenAI::Internal::Type::Boolean - SERVER_VAD = :server_vad - SEMANTIC_VAD = :semantic_vad + # @!attribute eagerness + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + # + # @return [Symbol, OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness, nil] + optional :eagerness, + enum: -> { OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness } + + # @!attribute interrupt_response + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + # + # @return [Boolean, nil] + optional :interrupt_response, OpenAI::Internal::Type::Boolean - # @!method self.values - # @return [Array] + # @!method initialize(create_response: nil, eagerness: nil, interrupt_response: nil, type: :semantic_vad) + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad} + # for more details. + # + # Server-side semantic turn detection which uses a model to determine when the + # user has finished speaking. 
+ # + # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs + # + # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # + # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th + # + # @param type [Symbol, :semantic_vad] Type of turn detection, `semantic_vad` to turn on Semantic VAD. + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + # + # @see OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad#eagerness + module Eagerness + extend OpenAI::Internal::Type::Enum + + LOW = :low + MEDIUM = :medium + HIGH = :high + AUTO = :auto + + # @!method self.values + # @return [Array] + end end + + # @!method self.variants + # @return [Array(OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad)] end end diff --git a/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb b/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb index a4019b2c..fc5fb231 100644 --- a/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb +++ b/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb @@ -36,17 +36,22 @@ class RealtimeTranscriptionSessionAudioInput < OpenAI::Internal::Type::BaseModel # @!attribute turn_detection # Configuration for turn detection, ether Server VAD or Semantic VAD. 
This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. + # response. # - # @return [OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection, nil] - optional :turn_detection, -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection } + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. 
+ # + # @return [OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad, nil] + optional :turn_detection, + union: -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection }, + nil?: true # @!method initialize(format_: nil, noise_reduction: nil, transcription: nil, turn_detection: nil) # Some parameter documentations has been truncated, see @@ -59,7 +64,7 @@ class RealtimeTranscriptionSessionAudioInput < OpenAI::Internal::Type::BaseModel # # @param transcription [OpenAI::Models::Realtime::AudioTranscription] Configuration for input audio transcription, defaults to off and can be set to ` # - # @param turn_detection [OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection] Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # @param turn_detection [OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad, nil] Configuration for turn detection, ether Server VAD or Semantic VAD. 
This can be # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInput#noise_reduction class NoiseReduction < OpenAI::Internal::Type::BaseModel diff --git a/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb b/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb index 984b2774..814e0b8b 100644 --- a/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb +++ b/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb @@ -3,128 +3,186 @@ module OpenAI module Models module Realtime - class RealtimeTranscriptionSessionAudioInputTurnDetection < OpenAI::Internal::Type::BaseModel - # @!attribute create_response - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - # - # @return [Boolean, nil] - optional :create_response, OpenAI::Internal::Type::Boolean - - # @!attribute eagerness - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. - # - # @return [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness, nil] - optional :eagerness, - enum: -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness } - - # @!attribute idle_timeout_ms - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received. - # - # @return [Integer, nil] - optional :idle_timeout_ms, Integer, nil?: true - - # @!attribute interrupt_response - # Whether or not to automatically interrupt any ongoing response with output to - # the default conversation (i.e. `conversation` of `auto`) when a VAD start event - # occurs. 
- # - # @return [Boolean, nil] - optional :interrupt_response, OpenAI::Internal::Type::Boolean - - # @!attribute prefix_padding_ms - # Used only for `server_vad` mode. Amount of audio to include before the VAD - # detected speech (in milliseconds). Defaults to 300ms. - # - # @return [Integer, nil] - optional :prefix_padding_ms, Integer - - # @!attribute silence_duration_ms - # Used only for `server_vad` mode. Duration of silence to detect speech stop (in - # milliseconds). Defaults to 500ms. With shorter values the model will respond - # more quickly, but may jump in on short pauses from the user. - # - # @return [Integer, nil] - optional :silence_duration_ms, Integer - - # @!attribute threshold - # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # defaults to 0.5. A higher threshold will require louder audio to activate the - # model, and thus might perform better in noisy environments. - # - # @return [Float, nil] - optional :threshold, Float - - # @!attribute type - # Type of turn detection. - # - # @return [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type, nil] - optional :type, enum: -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type } - - # @!method initialize(create_response: nil, eagerness: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: nil) - # Some parameter documentations has been truncated, see - # {OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection} - # for more details. - # - # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. 
Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - # - # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs - # - # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # - # @param idle_timeout_ms [Integer, nil] Optional idle timeout after which turn detection will auto-timeout when - # - # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th - # - # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec - # - # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m - # - # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # - # @param type [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type] Type of turn detection. - - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. 
- #
- # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection#eagerness
- module Eagerness
- extend OpenAI::Internal::Type::Enum
-
- LOW = :low
- MEDIUM = :medium
- HIGH = :high
- AUTO = :auto
-
- # @!method self.values
- # @return [Array]
+ # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ # set to `null` to turn off, in which case the client must manually trigger model
+ # response.
+ #
+ # Server VAD means that the model will detect the start and end of speech based on
+ # audio volume and respond at the end of user speech.
+ #
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ # with VAD) to semantically estimate whether the user has finished speaking, then
+ # dynamically sets a timeout based on this probability. For example, if user audio
+ # trails off with "uhhm", the model will score a low probability of turn end and
+ # wait longer for the user to continue speaking. This can be useful for more
+ # natural conversations, but may have a higher latency.
+ module RealtimeTranscriptionSessionAudioInputTurnDetection
+ extend OpenAI::Internal::Type::Union
+
+ discriminator :type
+
+ # Server-side voice activity detection (VAD) which flips on when user speech is detected and off after a period of silence.
+ variant :server_vad,
+ -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad }
+
+ # Server-side semantic turn detection which uses a model to determine when the user has finished speaking.
+ variant :semantic_vad,
+ -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad }
+
+ class ServerVad < OpenAI::Internal::Type::BaseModel
+ # @!attribute type
+ # Type of turn detection, `server_vad` to turn on simple Server VAD.
+ # + # @return [Symbol, :server_vad] + required :type, const: :server_vad + + # @!attribute create_response + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + # + # @return [Boolean, nil] + optional :create_response, OpenAI::Internal::Type::Boolean + + # @!attribute idle_timeout_ms + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + # + # @return [Integer, nil] + optional :idle_timeout_ms, Integer, nil?: true + + # @!attribute interrupt_response + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + # + # @return [Boolean, nil] + optional :interrupt_response, OpenAI::Internal::Type::Boolean + + # @!attribute prefix_padding_ms + # Used only for `server_vad` mode. Amount of audio to include before the VAD + # detected speech (in milliseconds). Defaults to 300ms. + # + # @return [Integer, nil] + optional :prefix_padding_ms, Integer + + # @!attribute silence_duration_ms + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in + # milliseconds). Defaults to 500ms. With shorter values the model will respond + # more quickly, but may jump in on short pauses from the user. 
+ # + # @return [Integer, nil] + optional :silence_duration_ms, Integer + + # @!attribute threshold + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # defaults to 0.5. A higher threshold will require louder audio to activate the + # model, and thus might perform better in noisy environments. + # + # @return [Float, nil] + optional :threshold, Float + + # @!method initialize(create_response: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: :server_vad) + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad} + # for more details. + # + # Server-side voice activity detection (VAD) which flips on when user speech is + # detected and off after a period of silence. + # + # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs + # + # @param idle_timeout_ms [Integer, nil] Optional timeout after which a model response will be triggered automatically. T + # + # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th + # + # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec + # + # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m + # + # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # + # @param type [Symbol, :server_vad] Type of turn detection, `server_vad` to turn on simple Server VAD. end - # Type of turn detection. 
- # - # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection#type - module Type - extend OpenAI::Internal::Type::Enum + class SemanticVad < OpenAI::Internal::Type::BaseModel + # @!attribute type + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. + # + # @return [Symbol, :semantic_vad] + required :type, const: :semantic_vad + + # @!attribute create_response + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + # + # @return [Boolean, nil] + optional :create_response, OpenAI::Internal::Type::Boolean + + # @!attribute eagerness + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + # + # @return [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness, nil] + optional :eagerness, + enum: -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness } - SERVER_VAD = :server_vad - SEMANTIC_VAD = :semantic_vad + # @!attribute interrupt_response + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + # + # @return [Boolean, nil] + optional :interrupt_response, OpenAI::Internal::Type::Boolean - # @!method self.values - # @return [Array] + # @!method initialize(create_response: nil, eagerness: nil, interrupt_response: nil, type: :semantic_vad) + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad} + # for more details. 
+ # + # Server-side semantic turn detection which uses a model to determine when the + # user has finished speaking. + # + # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs + # + # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # + # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th + # + # @param type [Symbol, :semantic_vad] Type of turn detection, `semantic_vad` to turn on Semantic VAD. + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + # + # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad#eagerness + module Eagerness + extend OpenAI::Internal::Type::Enum + + LOW = :low + MEDIUM = :medium + HIGH = :high + AUTO = :auto + + # @!method self.values + # @return [Array] + end end + + # @!method self.variants + # @return [Array(OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad)] end end end diff --git a/lib/openai/models/responses/response.rb b/lib/openai/models/responses/response.rb index 781529d3..b327e434 100644 --- a/lib/openai/models/responses/response.rb +++ b/lib/openai/models/responses/response.rb @@ -259,10 +259,10 @@ class Response < OpenAI::Internal::Type::BaseModel # @!attribute truncation # The truncation strategy to use for the model response. 
# - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. + # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. # # @return [Symbol, OpenAI::Models::Responses::Response::Truncation, nil] @@ -510,10 +510,10 @@ module ServiceTier # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. + # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. 
# # @see OpenAI::Models::Responses::Response#truncation diff --git a/lib/openai/models/responses/response_create_params.rb b/lib/openai/models/responses/response_create_params.rb index 53c8b5c7..1d5036d8 100644 --- a/lib/openai/models/responses/response_create_params.rb +++ b/lib/openai/models/responses/response_create_params.rb @@ -276,10 +276,10 @@ class ResponseCreateParams < OpenAI::Internal::Type::BaseModel # @!attribute truncation # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. + # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. # # @return [Symbol, OpenAI::Models::Responses::ResponseCreateParams::Truncation, nil] @@ -485,10 +485,10 @@ module ToolChoice # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. 
+ # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. module Truncation extend OpenAI::Internal::Type::Enum diff --git a/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi b/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi index 93ec928f..e117efa9 100644 --- a/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi +++ b/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi @@ -12,11 +12,13 @@ module OpenAI ) end - # Millisecond offset where speech ended within the buffered audio. + # Millisecond offset of audio written to the input audio buffer at the time the + # timeout was triggered. sig { returns(Integer) } attr_accessor :audio_end_ms - # Millisecond offset where speech started within the buffered audio. + # Millisecond offset of audio written to the input audio buffer that was after the + # playback time of the last model response. sig { returns(Integer) } attr_accessor :audio_start_ms @@ -32,7 +34,22 @@ module OpenAI sig { returns(Symbol) } attr_accessor :type - # Returned when the server VAD timeout is triggered for the input audio buffer. + # Returned when the Server VAD timeout is triggered for the input audio buffer. + # This is configured with `idle_timeout_ms` in the `turn_detection` settings of + # the session, and it indicates that there hasn't been any speech detected for the + # configured duration. + # + # The `audio_start_ms` and `audio_end_ms` fields indicate the segment of audio + # after the last model response up to the triggering time, as an offset from the + # beginning of audio written to the input audio buffer. This means it demarcates + # the segment of audio that was silent and the difference between the start and + # end values will roughly match the configured timeout. 
+ # + # The empty audio will be committed to the conversation as an `input_audio` item + # (there will be a `input_audio_buffer.committed` event) and a model response will + # be generated. There may be speech that didn't trigger VAD but is still detected + # by the model, so the model may respond with something relevant to the + # conversation or a prompt to continue speaking. sig do params( audio_end_ms: Integer, @@ -43,9 +60,11 @@ module OpenAI ).returns(T.attached_class) end def self.new( - # Millisecond offset where speech ended within the buffered audio. + # Millisecond offset of audio written to the input audio buffer at the time the + # timeout was triggered. audio_end_ms:, - # Millisecond offset where speech started within the buffered audio. + # Millisecond offset of audio written to the input audio buffer that was after the + # playback time of the last model response. audio_start_ms:, # The unique ID of the server event. event_id:, diff --git a/rbi/openai/models/realtime/realtime_audio_config_input.rbi b/rbi/openai/models/realtime/realtime_audio_config_input.rbi index e33b3fce..fb2e2697 100644 --- a/rbi/openai/models/realtime/realtime_audio_config_input.rbi +++ b/rbi/openai/models/realtime/realtime_audio_config_input.rbi @@ -80,26 +80,28 @@ module OpenAI # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. 
This can be useful for more natural - # conversations, but may have a higher latency. + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. sig do - returns(T.nilable(OpenAI::Realtime::RealtimeAudioInputTurnDetection)) - end - attr_reader :turn_detection - - sig do - params( - turn_detection: - OpenAI::Realtime::RealtimeAudioInputTurnDetection::OrHash - ).void + returns( + T.nilable( + T.any( + OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad, + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad + ) + ) + ) end - attr_writer :turn_detection + attr_accessor :turn_detection sig do params( @@ -113,7 +115,12 @@ module OpenAI OpenAI::Realtime::RealtimeAudioConfigInput::NoiseReduction::OrHash, transcription: OpenAI::Realtime::AudioTranscription::OrHash, turn_detection: - OpenAI::Realtime::RealtimeAudioInputTurnDetection::OrHash + T.nilable( + T.any( + OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad::OrHash, + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::OrHash + ) + ) ).returns(T.attached_class) end def self.new( @@ -136,14 +143,17 @@ module OpenAI transcription: nil, # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. 
Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. 
turn_detection: nil
)
end

@@ -160,7 +170,13 @@
noise_reduction:
OpenAI::Realtime::RealtimeAudioConfigInput::NoiseReduction,
transcription: OpenAI::Realtime::AudioTranscription,
- turn_detection: OpenAI::Realtime::RealtimeAudioInputTurnDetection
+ turn_detection:
+ T.nilable(
+ T.any(
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad,
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad
+ )
+ )
}
)
end
diff --git a/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi b/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi
index 08a137c8..062c31fe 100644
--- a/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi
+++ b/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi
@@ -3,259 +3,320 @@
module OpenAI
module Models
module Realtime
- class RealtimeAudioInputTurnDetection < OpenAI::Internal::Type::BaseModel
- OrHash =
+ # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ # set to `null` to turn off, in which case the client must manually trigger model
+ # response.
+ #
+ # Server VAD means that the model will detect the start and end of speech based on
+ # audio volume and respond at the end of user speech.
+ #
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ # with VAD) to semantically estimate whether the user has finished speaking, then
+ # dynamically sets a timeout based on this probability. For example, if user audio
+ # trails off with "uhhm", the model will score a low probability of turn end and
+ # wait longer for the user to continue speaking. This can be useful for more
+ # natural conversations, but may have a higher latency.
+ module RealtimeAudioInputTurnDetection + extend OpenAI::Internal::Type::Union + + Variants = T.type_alias do T.any( - OpenAI::Realtime::RealtimeAudioInputTurnDetection, - OpenAI::Internal::AnyHash + OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad, + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad ) end - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - sig { returns(T.nilable(T::Boolean)) } - attr_reader :create_response - - sig { params(create_response: T::Boolean).void } - attr_writer :create_response - - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. - sig do - returns( - T.nilable( - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::OrSymbol - ) - ) - end - attr_reader :eagerness - - sig do - params( - eagerness: - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::OrSymbol - ).void - end - attr_writer :eagerness - - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received and emits a `timeout_triggered` event. - sig { returns(T.nilable(Integer)) } - attr_accessor :idle_timeout_ms - - # Whether or not to automatically interrupt any ongoing response with output to - # the default conversation (i.e. `conversation` of `auto`) when a VAD start event - # occurs. - sig { returns(T.nilable(T::Boolean)) } - attr_reader :interrupt_response - - sig { params(interrupt_response: T::Boolean).void } - attr_writer :interrupt_response - - # Used only for `server_vad` mode. Amount of audio to include before the VAD - # detected speech (in milliseconds). Defaults to 300ms. 
- sig { returns(T.nilable(Integer)) } - attr_reader :prefix_padding_ms - - sig { params(prefix_padding_ms: Integer).void } - attr_writer :prefix_padding_ms - - # Used only for `server_vad` mode. Duration of silence to detect speech stop (in - # milliseconds). Defaults to 500ms. With shorter values the model will respond - # more quickly, but may jump in on short pauses from the user. - sig { returns(T.nilable(Integer)) } - attr_reader :silence_duration_ms + class ServerVad < OpenAI::Internal::Type::BaseModel + OrHash = + T.type_alias do + T.any( + OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad, + OpenAI::Internal::AnyHash + ) + end - sig { params(silence_duration_ms: Integer).void } - attr_writer :silence_duration_ms + # Type of turn detection, `server_vad` to turn on simple Server VAD. + sig { returns(Symbol) } + attr_accessor :type - # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # defaults to 0.5. A higher threshold will require louder audio to activate the - # model, and thus might perform better in noisy environments. - sig { returns(T.nilable(Float)) } - attr_reader :threshold + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + sig { returns(T.nilable(T::Boolean)) } + attr_reader :create_response - sig { params(threshold: Float).void } - attr_writer :threshold + sig { params(create_response: T::Boolean).void } + attr_writer :create_response - # Type of turn detection. - sig do - returns( - T.nilable( - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::OrSymbol - ) - ) - end - attr_reader :type + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. 
+ # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + sig { returns(T.nilable(Integer)) } + attr_accessor :idle_timeout_ms - sig do - params( - type: - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::OrSymbol - ).void - end - attr_writer :type - - # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - sig do - params( - create_response: T::Boolean, - eagerness: - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::OrSymbol, - idle_timeout_ms: T.nilable(Integer), - interrupt_response: T::Boolean, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::OrSymbol - ).returns(T.attached_class) - end - def self.new( - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - create_response: nil, - # Used only for `semantic_vad` mode. 
The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. - eagerness: nil, - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received and emits a `timeout_triggered` event. - idle_timeout_ms: nil, # Whether or not to automatically interrupt any ongoing response with output to # the default conversation (i.e. `conversation` of `auto`) when a VAD start event # occurs. - interrupt_response: nil, + sig { returns(T.nilable(T::Boolean)) } + attr_reader :interrupt_response + + sig { params(interrupt_response: T::Boolean).void } + attr_writer :interrupt_response + # Used only for `server_vad` mode. Amount of audio to include before the VAD # detected speech (in milliseconds). Defaults to 300ms. - prefix_padding_ms: nil, + sig { returns(T.nilable(Integer)) } + attr_reader :prefix_padding_ms + + sig { params(prefix_padding_ms: Integer).void } + attr_writer :prefix_padding_ms + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in # milliseconds). Defaults to 500ms. With shorter values the model will respond # more quickly, but may jump in on short pauses from the user. - silence_duration_ms: nil, + sig { returns(T.nilable(Integer)) } + attr_reader :silence_duration_ms + + sig { params(silence_duration_ms: Integer).void } + attr_writer :silence_duration_ms + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this # defaults to 0.5. A higher threshold will require louder audio to activate the # model, and thus might perform better in noisy environments. - threshold: nil, - # Type of turn detection. 
- type: nil - ) - end + sig { returns(T.nilable(Float)) } + attr_reader :threshold - sig do - override.returns( - { + sig { params(threshold: Float).void } + attr_writer :threshold + + # Server-side voice activity detection (VAD) which flips on when user speech is + # detected and off after a period of silence. + sig do + params( create_response: T::Boolean, - eagerness: - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::OrSymbol, idle_timeout_ms: T.nilable(Integer), interrupt_response: T::Boolean, prefix_padding_ms: Integer, silence_duration_ms: Integer, threshold: Float, - type: - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::OrSymbol - } + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + create_response: nil, + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + idle_timeout_ms: nil, + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + interrupt_response: nil, + # Used only for `server_vad` mode. Amount of audio to include before the VAD + # detected speech (in milliseconds). Defaults to 300ms. + prefix_padding_ms: nil, + # Used only for `server_vad` mode. 
Duration of silence to detect speech stop (in + # milliseconds). Defaults to 500ms. With shorter values the model will respond + # more quickly, but may jump in on short pauses from the user. + silence_duration_ms: nil, + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # defaults to 0.5. A higher threshold will require louder audio to activate the + # model, and thus might perform better in noisy environments. + threshold: nil, + # Type of turn detection, `server_vad` to turn on simple Server VAD. + type: :server_vad ) - end - def to_hash - end + end - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. - module Eagerness - extend OpenAI::Internal::Type::Enum + sig do + override.returns( + { + type: Symbol, + create_response: T::Boolean, + idle_timeout_ms: T.nilable(Integer), + interrupt_response: T::Boolean, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } + ) + end + def to_hash + end + end - TaggedSymbol = + class SemanticVad < OpenAI::Internal::Type::BaseModel + OrHash = T.type_alias do - T.all( - Symbol, - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness + T.any( + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad, + OpenAI::Internal::AnyHash ) end - OrSymbol = T.type_alias { T.any(Symbol, String) } - LOW = - T.let( - :low, - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::TaggedSymbol - ) - MEDIUM = - T.let( - :medium, - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::TaggedSymbol - ) - HIGH = - T.let( - :high, - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::TaggedSymbol - ) - AUTO = - T.let( - :auto, - 
OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::TaggedSymbol - ) + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. + sig { returns(Symbol) } + attr_accessor :type + + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + sig { returns(T.nilable(T::Boolean)) } + attr_reader :create_response + + sig { params(create_response: T::Boolean).void } + attr_writer :create_response + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. sig do - override.returns( - T::Array[ - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::TaggedSymbol - ] + returns( + T.nilable( + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol + ) ) end - def self.values + attr_reader :eagerness + + sig do + params( + eagerness: + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol + ).void end - end + attr_writer :eagerness - # Type of turn detection. - module Type - extend OpenAI::Internal::Type::Enum + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. 
+ sig { returns(T.nilable(T::Boolean)) } + attr_reader :interrupt_response - TaggedSymbol = - T.type_alias do - T.all( - Symbol, - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type - ) - end - OrSymbol = T.type_alias { T.any(Symbol, String) } + sig { params(interrupt_response: T::Boolean).void } + attr_writer :interrupt_response - SERVER_VAD = - T.let( - :server_vad, - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::TaggedSymbol - ) - SEMANTIC_VAD = - T.let( - :semantic_vad, - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::TaggedSymbol - ) + # Server-side semantic turn detection which uses a model to determine when the + # user has finished speaking. + sig do + params( + create_response: T::Boolean, + eagerness: + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol, + interrupt_response: T::Boolean, + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + create_response: nil, + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + eagerness: nil, + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + interrupt_response: nil, + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. 
+ type: :semantic_vad + ) + end sig do override.returns( - T::Array[ - OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::TaggedSymbol - ] + { + type: Symbol, + create_response: T::Boolean, + eagerness: + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol, + interrupt_response: T::Boolean + } ) end - def self.values + def to_hash + end + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + module Eagerness + extend OpenAI::Internal::Type::Enum + + TaggedSymbol = + T.type_alias do + T.all( + Symbol, + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness + ) + end + OrSymbol = T.type_alias { T.any(Symbol, String) } + + LOW = + T.let( + :low, + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + MEDIUM = + T.let( + :medium, + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + HIGH = + T.let( + :high, + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + AUTO = + T.let( + :auto, + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + + sig do + override.returns( + T::Array[ + OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ] + ) + end + def self.values + end end end + + sig do + override.returns( + T::Array[ + OpenAI::Realtime::RealtimeAudioInputTurnDetection::Variants + ] + ) + end + def self.variants + end end end end diff --git a/rbi/openai/models/realtime/realtime_session.rbi b/rbi/openai/models/realtime/realtime_session.rbi index 42a7c70d..e305375c 100644 --- a/rbi/openai/models/realtime/realtime_session.rbi +++ 
b/rbi/openai/models/realtime/realtime_session.rbi @@ -256,28 +256,28 @@ module OpenAI # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - sig do - returns(T.nilable(OpenAI::Realtime::RealtimeSession::TurnDetection)) - end - attr_reader :turn_detection - + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. 
sig do - params( - turn_detection: - T.nilable( - OpenAI::Realtime::RealtimeSession::TurnDetection::OrHash + returns( + T.nilable( + T.any( + OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad, + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad ) - ).void + ) + ) end - attr_writer :turn_detection + attr_accessor :turn_detection # The voice the model uses to respond. Voice cannot be changed during the session # once the model has responded with audio at least once. Current voice options are @@ -299,7 +299,7 @@ module OpenAI end attr_writer :voice - # Realtime session object. + # Realtime session object for the beta interface. sig do params( id: String, @@ -336,7 +336,10 @@ module OpenAI ), turn_detection: T.nilable( - OpenAI::Realtime::RealtimeSession::TurnDetection::OrHash + T.any( + OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad::OrHash, + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::OrHash + ) ), voice: T.any(String, OpenAI::Realtime::RealtimeSession::Voice::OrSymbol) @@ -420,14 +423,17 @@ module OpenAI tracing: nil, # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. + # response. 
+ # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. turn_detection: nil, # The voice the model uses to respond. Voice cannot be changed during the session # once the model has responded with audio at least once. Current voice options are @@ -472,7 +478,12 @@ module OpenAI ) ), turn_detection: - T.nilable(OpenAI::Realtime::RealtimeSession::TurnDetection), + T.nilable( + T.any( + OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad, + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad + ) + ), voice: T.any( String, @@ -864,256 +875,320 @@ module OpenAI end end - class TurnDetection < OpenAI::Internal::Type::BaseModel - OrHash = + # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # set to `null` to turn off, in which case the client must manually trigger model + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. 
This can be useful for more + # natural conversations, but may have a higher latency. + module TurnDetection + extend OpenAI::Internal::Type::Union + + Variants = T.type_alias do T.any( - OpenAI::Realtime::RealtimeSession::TurnDetection, - OpenAI::Internal::AnyHash + OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad, + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad ) end - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - sig { returns(T.nilable(T::Boolean)) } - attr_reader :create_response - - sig { params(create_response: T::Boolean).void } - attr_writer :create_response - - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. - sig do - returns( - T.nilable( - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::OrSymbol - ) - ) - end - attr_reader :eagerness - - sig do - params( - eagerness: - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::OrSymbol - ).void - end - attr_writer :eagerness - - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received. - sig { returns(T.nilable(Integer)) } - attr_accessor :idle_timeout_ms - - # Whether or not to automatically interrupt any ongoing response with output to - # the default conversation (i.e. `conversation` of `auto`) when a VAD start event - # occurs. - sig { returns(T.nilable(T::Boolean)) } - attr_reader :interrupt_response - - sig { params(interrupt_response: T::Boolean).void } - attr_writer :interrupt_response - - # Used only for `server_vad` mode. Amount of audio to include before the VAD - # detected speech (in milliseconds). Defaults to 300ms. 
- sig { returns(T.nilable(Integer)) } - attr_reader :prefix_padding_ms - - sig { params(prefix_padding_ms: Integer).void } - attr_writer :prefix_padding_ms - - # Used only for `server_vad` mode. Duration of silence to detect speech stop (in - # milliseconds). Defaults to 500ms. With shorter values the model will respond - # more quickly, but may jump in on short pauses from the user. - sig { returns(T.nilable(Integer)) } - attr_reader :silence_duration_ms - - sig { params(silence_duration_ms: Integer).void } - attr_writer :silence_duration_ms - - # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # defaults to 0.5. A higher threshold will require louder audio to activate the - # model, and thus might perform better in noisy environments. - sig { returns(T.nilable(Float)) } - attr_reader :threshold - - sig { params(threshold: Float).void } - attr_writer :threshold - - # Type of turn detection. - sig do - returns( - T.nilable( - OpenAI::Realtime::RealtimeSession::TurnDetection::Type::OrSymbol - ) - ) - end - attr_reader :type + class ServerVad < OpenAI::Internal::Type::BaseModel + OrHash = + T.type_alias do + T.any( + OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad, + OpenAI::Internal::AnyHash + ) + end - sig do - params( - type: - OpenAI::Realtime::RealtimeSession::TurnDetection::Type::OrSymbol - ).void - end - attr_writer :type + # Type of turn detection, `server_vad` to turn on simple Server VAD. + sig { returns(Symbol) } + attr_accessor :type - # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. 
Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - sig do - params( - create_response: T::Boolean, - eagerness: - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::OrSymbol, - idle_timeout_ms: T.nilable(Integer), - interrupt_response: T::Boolean, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: - OpenAI::Realtime::RealtimeSession::TurnDetection::Type::OrSymbol - ).returns(T.attached_class) - end - def self.new( # Whether or not to automatically generate a response when a VAD stop event # occurs. - create_response: nil, - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. - eagerness: nil, - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received. - idle_timeout_ms: nil, + sig { returns(T.nilable(T::Boolean)) } + attr_reader :create_response + + sig { params(create_response: T::Boolean).void } + attr_writer :create_response + + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. 
it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + sig { returns(T.nilable(Integer)) } + attr_accessor :idle_timeout_ms + # Whether or not to automatically interrupt any ongoing response with output to # the default conversation (i.e. `conversation` of `auto`) when a VAD start event # occurs. - interrupt_response: nil, + sig { returns(T.nilable(T::Boolean)) } + attr_reader :interrupt_response + + sig { params(interrupt_response: T::Boolean).void } + attr_writer :interrupt_response + # Used only for `server_vad` mode. Amount of audio to include before the VAD # detected speech (in milliseconds). Defaults to 300ms. - prefix_padding_ms: nil, + sig { returns(T.nilable(Integer)) } + attr_reader :prefix_padding_ms + + sig { params(prefix_padding_ms: Integer).void } + attr_writer :prefix_padding_ms + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in # milliseconds). Defaults to 500ms. With shorter values the model will respond # more quickly, but may jump in on short pauses from the user. - silence_duration_ms: nil, + sig { returns(T.nilable(Integer)) } + attr_reader :silence_duration_ms + + sig { params(silence_duration_ms: Integer).void } + attr_writer :silence_duration_ms + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this # defaults to 0.5. A higher threshold will require louder audio to activate the # model, and thus might perform better in noisy environments. - threshold: nil, - # Type of turn detection. 
- type: nil - ) - end + sig { returns(T.nilable(Float)) } + attr_reader :threshold - sig do - override.returns( - { + sig { params(threshold: Float).void } + attr_writer :threshold + + # Server-side voice activity detection (VAD) which flips on when user speech is + # detected and off after a period of silence. + sig do + params( create_response: T::Boolean, - eagerness: - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::OrSymbol, idle_timeout_ms: T.nilable(Integer), interrupt_response: T::Boolean, prefix_padding_ms: Integer, silence_duration_ms: Integer, threshold: Float, - type: - OpenAI::Realtime::RealtimeSession::TurnDetection::Type::OrSymbol - } + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + create_response: nil, + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + idle_timeout_ms: nil, + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + interrupt_response: nil, + # Used only for `server_vad` mode. Amount of audio to include before the VAD + # detected speech (in milliseconds). Defaults to 300ms. + prefix_padding_ms: nil, + # Used only for `server_vad` mode. 
Duration of silence to detect speech stop (in + # milliseconds). Defaults to 500ms. With shorter values the model will respond + # more quickly, but may jump in on short pauses from the user. + silence_duration_ms: nil, + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # defaults to 0.5. A higher threshold will require louder audio to activate the + # model, and thus might perform better in noisy environments. + threshold: nil, + # Type of turn detection, `server_vad` to turn on simple Server VAD. + type: :server_vad ) - end - def to_hash - end + end - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. - module Eagerness - extend OpenAI::Internal::Type::Enum + sig do + override.returns( + { + type: Symbol, + create_response: T::Boolean, + idle_timeout_ms: T.nilable(Integer), + interrupt_response: T::Boolean, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } + ) + end + def to_hash + end + end - TaggedSymbol = + class SemanticVad < OpenAI::Internal::Type::BaseModel + OrHash = T.type_alias do - T.all( - Symbol, - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness + T.any( + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad, + OpenAI::Internal::AnyHash ) end - OrSymbol = T.type_alias { T.any(Symbol, String) } - LOW = - T.let( - :low, - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol - ) - MEDIUM = - T.let( - :medium, - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol - ) - HIGH = - T.let( - :high, - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol - ) - AUTO = - T.let( - :auto, - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol - ) + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. 
+ sig { returns(Symbol) } + attr_accessor :type + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + sig { returns(T.nilable(T::Boolean)) } + attr_reader :create_response + + sig { params(create_response: T::Boolean).void } + attr_writer :create_response + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. sig do - override.returns( - T::Array[ - OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol - ] + returns( + T.nilable( + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::OrSymbol + ) ) end - def self.values + attr_reader :eagerness + + sig do + params( + eagerness: + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::OrSymbol + ).void end - end + attr_writer :eagerness - # Type of turn detection. - module Type - extend OpenAI::Internal::Type::Enum + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + sig { returns(T.nilable(T::Boolean)) } + attr_reader :interrupt_response - TaggedSymbol = - T.type_alias do - T.all( - Symbol, - OpenAI::Realtime::RealtimeSession::TurnDetection::Type - ) - end - OrSymbol = T.type_alias { T.any(Symbol, String) } + sig { params(interrupt_response: T::Boolean).void } + attr_writer :interrupt_response - SERVER_VAD = - T.let( - :server_vad, - OpenAI::Realtime::RealtimeSession::TurnDetection::Type::TaggedSymbol - ) - SEMANTIC_VAD = - T.let( - :semantic_vad, - OpenAI::Realtime::RealtimeSession::TurnDetection::Type::TaggedSymbol - ) + # Server-side semantic turn detection which uses a model to determine when the + # user has finished speaking. 
+ sig do + params( + create_response: T::Boolean, + eagerness: + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::OrSymbol, + interrupt_response: T::Boolean, + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + create_response: nil, + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + eagerness: nil, + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + interrupt_response: nil, + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. + type: :semantic_vad + ) + end sig do override.returns( - T::Array[ - OpenAI::Realtime::RealtimeSession::TurnDetection::Type::TaggedSymbol - ] + { + type: Symbol, + create_response: T::Boolean, + eagerness: + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::OrSymbol, + interrupt_response: T::Boolean + } ) end - def self.values + def to_hash + end + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. 
+ module Eagerness + extend OpenAI::Internal::Type::Enum + + TaggedSymbol = + T.type_alias do + T.all( + Symbol, + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness + ) + end + OrSymbol = T.type_alias { T.any(Symbol, String) } + + LOW = + T.let( + :low, + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + MEDIUM = + T.let( + :medium, + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + HIGH = + T.let( + :high, + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + AUTO = + T.let( + :auto, + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + + sig do + override.returns( + T::Array[ + OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ] + ) + end + def self.values + end end end + + sig do + override.returns( + T::Array[ + OpenAI::Realtime::RealtimeSession::TurnDetection::Variants + ] + ) + end + def self.variants + end end # The voice the model uses to respond. Voice cannot be changed during the session diff --git a/rbi/openai/models/realtime/realtime_session_create_response.rbi b/rbi/openai/models/realtime/realtime_session_create_response.rbi index 0518a759..6cca5872 100644 --- a/rbi/openai/models/realtime/realtime_session_create_response.rbi +++ b/rbi/openai/models/realtime/realtime_session_create_response.rbi @@ -525,30 +525,25 @@ module OpenAI # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. 
Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. 
sig do returns( T.nilable( - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Variants ) ) end - attr_reader :turn_detection - - sig do - params( - turn_detection: - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::OrHash - ).void - end - attr_writer :turn_detection + attr_accessor :turn_detection sig do params( @@ -562,7 +557,12 @@ module OpenAI OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::NoiseReduction::OrHash, transcription: OpenAI::Realtime::AudioTranscription::OrHash, turn_detection: - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::OrHash + T.nilable( + T.any( + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad::OrHash, + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::OrHash + ) + ) ).returns(T.attached_class) end def self.new( @@ -585,14 +585,17 @@ module OpenAI transcription: nil, # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. + # response. 
+ # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. turn_detection: nil ) end @@ -605,7 +608,9 @@ module OpenAI OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::NoiseReduction, transcription: OpenAI::Realtime::AudioTranscription, turn_detection: - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection + T.nilable( + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Variants + ) } ) end @@ -665,259 +670,320 @@ module OpenAI end end - class TurnDetection < OpenAI::Internal::Type::BaseModel - OrHash = + # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # set to `null` to turn off, in which case the client must manually trigger model + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. 
+ module TurnDetection + extend OpenAI::Internal::Type::Union + + Variants = T.type_alias do T.any( - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection, - OpenAI::Internal::AnyHash + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad, + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad ) end - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - sig { returns(T.nilable(T::Boolean)) } - attr_reader :create_response - - sig { params(create_response: T::Boolean).void } - attr_writer :create_response - - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. - sig do - returns( - T.nilable( - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol - ) - ) - end - attr_reader :eagerness - - sig do - params( - eagerness: - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::OrSymbol - ).void - end - attr_writer :eagerness - - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received and emits a `timeout_triggered` event. - sig { returns(T.nilable(Integer)) } - attr_accessor :idle_timeout_ms - - # Whether or not to automatically interrupt any ongoing response with output to - # the default conversation (i.e. `conversation` of `auto`) when a VAD start event - # occurs. - sig { returns(T.nilable(T::Boolean)) } - attr_reader :interrupt_response - - sig { params(interrupt_response: T::Boolean).void } - attr_writer :interrupt_response - - # Used only for `server_vad` mode. Amount of audio to include before the VAD - # detected speech (in milliseconds). 
Defaults to 300ms. - sig { returns(T.nilable(Integer)) } - attr_reader :prefix_padding_ms - - sig { params(prefix_padding_ms: Integer).void } - attr_writer :prefix_padding_ms - - # Used only for `server_vad` mode. Duration of silence to detect speech stop (in - # milliseconds). Defaults to 500ms. With shorter values the model will respond - # more quickly, but may jump in on short pauses from the user. - sig { returns(T.nilable(Integer)) } - attr_reader :silence_duration_ms - - sig { params(silence_duration_ms: Integer).void } - attr_writer :silence_duration_ms - - # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # defaults to 0.5. A higher threshold will require louder audio to activate the - # model, and thus might perform better in noisy environments. - sig { returns(T.nilable(Float)) } - attr_reader :threshold - - sig { params(threshold: Float).void } - attr_writer :threshold - - # Type of turn detection. - sig do - returns( - T.nilable( - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::TaggedSymbol - ) - ) - end - attr_reader :type + class ServerVad < OpenAI::Internal::Type::BaseModel + OrHash = + T.type_alias do + T.any( + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad, + OpenAI::Internal::AnyHash + ) + end - sig do - params( - type: - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::OrSymbol - ).void - end - attr_writer :type + # Type of turn detection, `server_vad` to turn on simple Server VAD. + sig { returns(Symbol) } + attr_accessor :type - # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. 
Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - sig do - params( - create_response: T::Boolean, - eagerness: - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::OrSymbol, - idle_timeout_ms: T.nilable(Integer), - interrupt_response: T::Boolean, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::OrSymbol - ).returns(T.attached_class) - end - def self.new( # Whether or not to automatically generate a response when a VAD stop event # occurs. - create_response: nil, - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. - eagerness: nil, - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received and emits a `timeout_triggered` event. - idle_timeout_ms: nil, + sig { returns(T.nilable(T::Boolean)) } + attr_reader :create_response + + sig { params(create_response: T::Boolean).void } + attr_writer :create_response + + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. 
The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + sig { returns(T.nilable(Integer)) } + attr_accessor :idle_timeout_ms + # Whether or not to automatically interrupt any ongoing response with output to # the default conversation (i.e. `conversation` of `auto`) when a VAD start event # occurs. - interrupt_response: nil, + sig { returns(T.nilable(T::Boolean)) } + attr_reader :interrupt_response + + sig { params(interrupt_response: T::Boolean).void } + attr_writer :interrupt_response + # Used only for `server_vad` mode. Amount of audio to include before the VAD # detected speech (in milliseconds). Defaults to 300ms. - prefix_padding_ms: nil, + sig { returns(T.nilable(Integer)) } + attr_reader :prefix_padding_ms + + sig { params(prefix_padding_ms: Integer).void } + attr_writer :prefix_padding_ms + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in # milliseconds). Defaults to 500ms. With shorter values the model will respond # more quickly, but may jump in on short pauses from the user. - silence_duration_ms: nil, + sig { returns(T.nilable(Integer)) } + attr_reader :silence_duration_ms + + sig { params(silence_duration_ms: Integer).void } + attr_writer :silence_duration_ms + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this # defaults to 0.5. A higher threshold will require louder audio to activate the # model, and thus might perform better in noisy environments. - threshold: nil, - # Type of turn detection. 
- type: nil - ) - end + sig { returns(T.nilable(Float)) } + attr_reader :threshold - sig do - override.returns( - { + sig { params(threshold: Float).void } + attr_writer :threshold + + # Server-side voice activity detection (VAD) which flips on when user speech is + # detected and off after a period of silence. + sig do + params( create_response: T::Boolean, - eagerness: - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol, idle_timeout_ms: T.nilable(Integer), interrupt_response: T::Boolean, prefix_padding_ms: Integer, silence_duration_ms: Integer, threshold: Float, - type: - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::TaggedSymbol - } + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + create_response: nil, + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + idle_timeout_ms: nil, + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + interrupt_response: nil, + # Used only for `server_vad` mode. Amount of audio to include before the VAD + # detected speech (in milliseconds). Defaults to 300ms. 
+ prefix_padding_ms: nil, + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in + # milliseconds). Defaults to 500ms. With shorter values the model will respond + # more quickly, but may jump in on short pauses from the user. + silence_duration_ms: nil, + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # defaults to 0.5. A higher threshold will require louder audio to activate the + # model, and thus might perform better in noisy environments. + threshold: nil, + # Type of turn detection, `server_vad` to turn on simple Server VAD. + type: :server_vad ) - end - def to_hash - end + end - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - # and `high` have max timeouts of 8s, 4s, and 2s respectively. - module Eagerness - extend OpenAI::Internal::Type::Enum + sig do + override.returns( + { + type: Symbol, + create_response: T::Boolean, + idle_timeout_ms: T.nilable(Integer), + interrupt_response: T::Boolean, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } + ) + end + def to_hash + end + end - TaggedSymbol = + class SemanticVad < OpenAI::Internal::Type::BaseModel + OrHash = T.type_alias do - T.all( - Symbol, - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness + T.any( + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad, + OpenAI::Internal::AnyHash ) end - OrSymbol = T.type_alias { T.any(Symbol, String) } - LOW = - T.let( - :low, - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol - ) - MEDIUM = - T.let( - :medium, - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol - ) - HIGH = - T.let( - :high, - 
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol - ) - AUTO = - T.let( - :auto, - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol - ) + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. + sig { returns(Symbol) } + attr_accessor :type + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + sig { returns(T.nilable(T::Boolean)) } + attr_reader :create_response + + sig { params(create_response: T::Boolean).void } + attr_writer :create_response + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. sig do - override.returns( - T::Array[ - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol - ] + returns( + T.nilable( + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) ) end - def self.values + attr_reader :eagerness + + sig do + params( + eagerness: + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::OrSymbol + ).void end - end + attr_writer :eagerness - # Type of turn detection. - module Type - extend OpenAI::Internal::Type::Enum + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. 
+ sig { returns(T.nilable(T::Boolean)) } + attr_reader :interrupt_response - TaggedSymbol = - T.type_alias do - T.all( - Symbol, - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type - ) - end - OrSymbol = T.type_alias { T.any(Symbol, String) } + sig { params(interrupt_response: T::Boolean).void } + attr_writer :interrupt_response - SERVER_VAD = - T.let( - :server_vad, - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::TaggedSymbol - ) - SEMANTIC_VAD = - T.let( - :semantic_vad, - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::TaggedSymbol - ) + # Server-side semantic turn detection which uses a model to determine when the + # user has finished speaking. + sig do + params( + create_response: T::Boolean, + eagerness: + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::OrSymbol, + interrupt_response: T::Boolean, + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + create_response: nil, + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + eagerness: nil, + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + interrupt_response: nil, + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. 
+ type: :semantic_vad + ) + end sig do override.returns( - T::Array[ - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::TaggedSymbol - ] + { + type: Symbol, + create_response: T::Boolean, + eagerness: + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol, + interrupt_response: T::Boolean + } ) end - def self.values + def to_hash + end + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + module Eagerness + extend OpenAI::Internal::Type::Enum + + TaggedSymbol = + T.type_alias do + T.all( + Symbol, + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness + ) + end + OrSymbol = T.type_alias { T.any(Symbol, String) } + + LOW = + T.let( + :low, + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + MEDIUM = + T.let( + :medium, + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + HIGH = + T.let( + :high, + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + AUTO = + T.let( + :auto, + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + + sig do + override.returns( + T::Array[ + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol + ] + ) + end + def self.values + end end end + + sig do + override.returns( + T::Array[ + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Variants + ] + ) + end + def self.variants + 
end end end diff --git a/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi b/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi index 360679f8..a07f9361 100644 --- a/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi +++ b/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi @@ -80,30 +80,28 @@ module OpenAI # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. + # response. + # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. 
sig do returns( T.nilable( - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection + T.any( + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad, + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad + ) ) ) end - attr_reader :turn_detection - - sig do - params( - turn_detection: - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::OrHash - ).void - end - attr_writer :turn_detection + attr_accessor :turn_detection sig do params( @@ -117,7 +115,12 @@ module OpenAI OpenAI::Realtime::RealtimeTranscriptionSessionAudioInput::NoiseReduction::OrHash, transcription: OpenAI::Realtime::AudioTranscription::OrHash, turn_detection: - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::OrHash + T.nilable( + T.any( + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad::OrHash, + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::OrHash + ) + ) ).returns(T.attached_class) end def self.new( @@ -140,14 +143,17 @@ module OpenAI transcription: nil, # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. + # response. 
+ # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. turn_detection: nil ) end @@ -165,7 +171,12 @@ module OpenAI OpenAI::Realtime::RealtimeTranscriptionSessionAudioInput::NoiseReduction, transcription: OpenAI::Realtime::AudioTranscription, turn_detection: - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection + T.nilable( + T.any( + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad, + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad + ) + ) } ) end diff --git a/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi b/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi index 676cf1eb..3dc51534 100644 --- a/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi +++ b/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi @@ -3,256 +3,320 @@ module OpenAI module Models module Realtime - class RealtimeTranscriptionSessionAudioInputTurnDetection < OpenAI::Internal::Type::BaseModel - OrHash = + # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + # set to `null` to turn off, in which case the client must manually trigger model + # response. 
+ # + # Server VAD means that the model will detect the start and end of speech based on + # audio volume and respond at the end of user speech. + # + # Semantic VAD is more advanced and uses a turn detection model (in conjunction + # with VAD) to semantically estimate whether the user has finished speaking, then + # dynamically sets a timeout based on this probability. For example, if user audio + # trails off with "uhhm", the model will score a low probability of turn end and + # wait longer for the user to continue speaking. This can be useful for more + # natural conversations, but may have a higher latency. + module RealtimeTranscriptionSessionAudioInputTurnDetection + extend OpenAI::Internal::Type::Union + + Variants = T.type_alias do T.any( - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection, - OpenAI::Internal::AnyHash + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad, + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad ) end - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - sig { returns(T.nilable(T::Boolean)) } - attr_reader :create_response - - sig { params(create_response: T::Boolean).void } - attr_writer :create_response - - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. - sig do - returns( - T.nilable( - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::OrSymbol - ) - ) - end - attr_reader :eagerness - - sig do - params( - eagerness: - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::OrSymbol - ).void - end - attr_writer :eagerness - - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received. 
- sig { returns(T.nilable(Integer)) } - attr_accessor :idle_timeout_ms - - # Whether or not to automatically interrupt any ongoing response with output to - # the default conversation (i.e. `conversation` of `auto`) when a VAD start event - # occurs. - sig { returns(T.nilable(T::Boolean)) } - attr_reader :interrupt_response - - sig { params(interrupt_response: T::Boolean).void } - attr_writer :interrupt_response - - # Used only for `server_vad` mode. Amount of audio to include before the VAD - # detected speech (in milliseconds). Defaults to 300ms. - sig { returns(T.nilable(Integer)) } - attr_reader :prefix_padding_ms - - sig { params(prefix_padding_ms: Integer).void } - attr_writer :prefix_padding_ms + class ServerVad < OpenAI::Internal::Type::BaseModel + OrHash = + T.type_alias do + T.any( + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad, + OpenAI::Internal::AnyHash + ) + end - # Used only for `server_vad` mode. Duration of silence to detect speech stop (in - # milliseconds). Defaults to 500ms. With shorter values the model will respond - # more quickly, but may jump in on short pauses from the user. - sig { returns(T.nilable(Integer)) } - attr_reader :silence_duration_ms + # Type of turn detection, `server_vad` to turn on simple Server VAD. + sig { returns(Symbol) } + attr_accessor :type - sig { params(silence_duration_ms: Integer).void } - attr_writer :silence_duration_ms + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + sig { returns(T.nilable(T::Boolean)) } + attr_reader :create_response - # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this - # defaults to 0.5. A higher threshold will require louder audio to activate the - # model, and thus might perform better in noisy environments. 
- sig { returns(T.nilable(Float)) } - attr_reader :threshold + sig { params(create_response: T::Boolean).void } + attr_writer :create_response - sig { params(threshold: Float).void } - attr_writer :threshold + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. + # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + sig { returns(T.nilable(Integer)) } + attr_accessor :idle_timeout_ms - # Type of turn detection. - sig do - returns( - T.nilable( - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::OrSymbol - ) - ) - end - attr_reader :type - - sig do - params( - type: - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::OrSymbol - ).void - end - attr_writer :type - - # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - # set to `null` to turn off, in which case the client must manually trigger model - # response. Server VAD means that the model will detect the start and end of - # speech based on audio volume and respond at the end of user speech. Semantic VAD - # is more advanced and uses a turn detection model (in conjunction with VAD) to - # semantically estimate whether the user has finished speaking, then dynamically - # sets a timeout based on this probability. 
For example, if user audio trails off - # with "uhhm", the model will score a low probability of turn end and wait longer - # for the user to continue speaking. This can be useful for more natural - # conversations, but may have a higher latency. - sig do - params( - create_response: T::Boolean, - eagerness: - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::OrSymbol, - idle_timeout_ms: T.nilable(Integer), - interrupt_response: T::Boolean, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::OrSymbol - ).returns(T.attached_class) - end - def self.new( - # Whether or not to automatically generate a response when a VAD stop event - # occurs. - create_response: nil, - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. - eagerness: nil, - # Optional idle timeout after which turn detection will auto-timeout when no - # additional audio is received. - idle_timeout_ms: nil, # Whether or not to automatically interrupt any ongoing response with output to # the default conversation (i.e. `conversation` of `auto`) when a VAD start event # occurs. - interrupt_response: nil, + sig { returns(T.nilable(T::Boolean)) } + attr_reader :interrupt_response + + sig { params(interrupt_response: T::Boolean).void } + attr_writer :interrupt_response + # Used only for `server_vad` mode. Amount of audio to include before the VAD # detected speech (in milliseconds). Defaults to 300ms. - prefix_padding_ms: nil, + sig { returns(T.nilable(Integer)) } + attr_reader :prefix_padding_ms + + sig { params(prefix_padding_ms: Integer).void } + attr_writer :prefix_padding_ms + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in # milliseconds). 
Defaults to 500ms. With shorter values the model will respond # more quickly, but may jump in on short pauses from the user. - silence_duration_ms: nil, + sig { returns(T.nilable(Integer)) } + attr_reader :silence_duration_ms + + sig { params(silence_duration_ms: Integer).void } + attr_writer :silence_duration_ms + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this # defaults to 0.5. A higher threshold will require louder audio to activate the # model, and thus might perform better in noisy environments. - threshold: nil, - # Type of turn detection. - type: nil - ) - end + sig { returns(T.nilable(Float)) } + attr_reader :threshold - sig do - override.returns( - { + sig { params(threshold: Float).void } + attr_writer :threshold + + # Server-side voice activity detection (VAD) which flips on when user speech is + # detected and off after a period of silence. + sig do + params( create_response: T::Boolean, - eagerness: - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::OrSymbol, idle_timeout_ms: T.nilable(Integer), interrupt_response: T::Boolean, prefix_padding_ms: Integer, silence_duration_ms: Integer, threshold: Float, - type: - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::OrSymbol - } + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + create_response: nil, + # Optional timeout after which a model response will be triggered automatically. + # This is useful for situations in which a long pause from the user is unexpected, + # such as a phone call. The model will effectively prompt the user to continue the + # conversation based on the current context. + # + # The timeout value will be applied after the last model response's audio has + # finished playing, i.e. it's set to the `response.done` time plus audio playback + # duration. 
+ # + # An `input_audio_buffer.timeout_triggered` event (plus events associated with the + # Response) will be emitted when the timeout is reached. Idle timeout is currently + # only supported for `server_vad` mode. + idle_timeout_ms: nil, + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + interrupt_response: nil, + # Used only for `server_vad` mode. Amount of audio to include before the VAD + # detected speech (in milliseconds). Defaults to 300ms. + prefix_padding_ms: nil, + # Used only for `server_vad` mode. Duration of silence to detect speech stop (in + # milliseconds). Defaults to 500ms. With shorter values the model will respond + # more quickly, but may jump in on short pauses from the user. + silence_duration_ms: nil, + # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this + # defaults to 0.5. A higher threshold will require louder audio to activate the + # model, and thus might perform better in noisy environments. + threshold: nil, + # Type of turn detection, `server_vad` to turn on simple Server VAD. + type: :server_vad ) - end - def to_hash - end + end - # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - # will wait longer for the user to continue speaking, `high` will respond more - # quickly. `auto` is the default and is equivalent to `medium`. 
- module Eagerness - extend OpenAI::Internal::Type::Enum + sig do + override.returns( + { + type: Symbol, + create_response: T::Boolean, + idle_timeout_ms: T.nilable(Integer), + interrupt_response: T::Boolean, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } + ) + end + def to_hash + end + end - TaggedSymbol = + class SemanticVad < OpenAI::Internal::Type::BaseModel + OrHash = T.type_alias do - T.all( - Symbol, - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness + T.any( + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad, + OpenAI::Internal::AnyHash ) end - OrSymbol = T.type_alias { T.any(Symbol, String) } - LOW = - T.let( - :low, - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol - ) - MEDIUM = - T.let( - :medium, - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol - ) - HIGH = - T.let( - :high, - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol - ) - AUTO = - T.let( - :auto, - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol - ) + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. + sig { returns(Symbol) } + attr_accessor :type + + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + sig { returns(T.nilable(T::Boolean)) } + attr_reader :create_response + sig { params(create_response: T::Boolean).void } + attr_writer :create_response + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. 
sig do - override.returns( - T::Array[ - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol - ] + returns( + T.nilable( + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol + ) ) end - def self.values + attr_reader :eagerness + + sig do + params( + eagerness: + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol + ).void end - end + attr_writer :eagerness - # Type of turn detection. - module Type - extend OpenAI::Internal::Type::Enum + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + sig { returns(T.nilable(T::Boolean)) } + attr_reader :interrupt_response - TaggedSymbol = - T.type_alias do - T.all( - Symbol, - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type - ) - end - OrSymbol = T.type_alias { T.any(Symbol, String) } + sig { params(interrupt_response: T::Boolean).void } + attr_writer :interrupt_response - SERVER_VAD = - T.let( - :server_vad, - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::TaggedSymbol - ) - SEMANTIC_VAD = - T.let( - :semantic_vad, - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::TaggedSymbol - ) + # Server-side semantic turn detection which uses a model to determine when the + # user has finished speaking. + sig do + params( + create_response: T::Boolean, + eagerness: + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol, + interrupt_response: T::Boolean, + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Whether or not to automatically generate a response when a VAD stop event + # occurs. + create_response: nil, + # Used only for `semantic_vad` mode. The eagerness of the model to respond. 
`low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. + eagerness: nil, + # Whether or not to automatically interrupt any ongoing response with output to + # the default conversation (i.e. `conversation` of `auto`) when a VAD start event + # occurs. + interrupt_response: nil, + # Type of turn detection, `semantic_vad` to turn on Semantic VAD. + type: :semantic_vad + ) + end sig do override.returns( - T::Array[ - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::TaggedSymbol - ] + { + type: Symbol, + create_response: T::Boolean, + eagerness: + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol, + interrupt_response: T::Boolean + } ) end - def self.values + def to_hash + end + + # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + # will wait longer for the user to continue speaking, `high` will respond more + # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + # and `high` have max timeouts of 8s, 4s, and 2s respectively. 
+ module Eagerness + extend OpenAI::Internal::Type::Enum + + TaggedSymbol = + T.type_alias do + T.all( + Symbol, + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness + ) + end + OrSymbol = T.type_alias { T.any(Symbol, String) } + + LOW = + T.let( + :low, + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + MEDIUM = + T.let( + :medium, + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + HIGH = + T.let( + :high, + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + AUTO = + T.let( + :auto, + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ) + + sig do + override.returns( + T::Array[ + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol + ] + ) + end + def self.values + end end end + + sig do + override.returns( + T::Array[ + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Variants + ] + ) + end + def self.variants + end end end end diff --git a/rbi/openai/models/responses/response.rbi b/rbi/openai/models/responses/response.rbi index dd1d405b..80f62655 100644 --- a/rbi/openai/models/responses/response.rbi +++ b/rbi/openai/models/responses/response.rbi @@ -265,10 +265,10 @@ module OpenAI # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. 
- # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. + # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. sig do returns( @@ -521,10 +521,10 @@ module OpenAI top_logprobs: nil, # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. + # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. truncation: nil, # Represents token usage details including input tokens, output tokens, a @@ -819,10 +819,10 @@ module OpenAI # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. 
+ # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. module Truncation extend OpenAI::Internal::Type::Enum diff --git a/rbi/openai/models/responses/response_create_params.rbi b/rbi/openai/models/responses/response_create_params.rbi index 9d019c87..42b3aaa1 100644 --- a/rbi/openai/models/responses/response_create_params.rbi +++ b/rbi/openai/models/responses/response_create_params.rbi @@ -378,10 +378,10 @@ module OpenAI # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. + # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. sig do returns( @@ -637,10 +637,10 @@ module OpenAI top_p: nil, # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. 
+ # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. truncation: nil, # This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use @@ -920,10 +920,10 @@ module OpenAI # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. + # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. module Truncation extend OpenAI::Internal::Type::Enum diff --git a/rbi/openai/resources/responses.rbi b/rbi/openai/resources/responses.rbi index e82f7866..e032693f 100644 --- a/rbi/openai/resources/responses.rbi +++ b/rbi/openai/resources/responses.rbi @@ -258,10 +258,10 @@ module OpenAI top_p: nil, # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. 
+ # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. truncation: nil, # This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use @@ -535,10 +535,10 @@ module OpenAI top_p: nil, # The truncation strategy to use for the model response. # - # - `auto`: If the context of this response and previous ones exceeds the model's - # context window size, the model will truncate the response to fit the context - # window by dropping input items in the middle of the conversation. - # - `disabled` (default): If a model response will exceed the context window size + # - `auto`: If the input to this Response exceeds the model's context window size, + # the model will truncate the response to fit the context window by dropping + # items from the beginning of the conversation. + # - `disabled` (default): If the input size will exceed the context window size # for a model, the request will fail with a 400 error. truncation: nil, # This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use diff --git a/sig/openai/models/realtime/realtime_audio_config_input.rbs b/sig/openai/models/realtime/realtime_audio_config_input.rbs index 08d072de..5c1a430e 100644 --- a/sig/openai/models/realtime/realtime_audio_config_input.rbs +++ b/sig/openai/models/realtime/realtime_audio_config_input.rbs @@ -6,7 +6,7 @@ module OpenAI format_: OpenAI::Models::Realtime::realtime_audio_formats, noise_reduction: OpenAI::Realtime::RealtimeAudioConfigInput::NoiseReduction, transcription: OpenAI::Realtime::AudioTranscription, - turn_detection: OpenAI::Realtime::RealtimeAudioInputTurnDetection + turn_detection: OpenAI::Models::Realtime::realtime_audio_input_turn_detection? 
} class RealtimeAudioConfigInput < OpenAI::Internal::Type::BaseModel @@ -28,24 +28,20 @@ module OpenAI OpenAI::Realtime::AudioTranscription ) -> OpenAI::Realtime::AudioTranscription - attr_reader turn_detection: OpenAI::Realtime::RealtimeAudioInputTurnDetection? - - def turn_detection=: ( - OpenAI::Realtime::RealtimeAudioInputTurnDetection - ) -> OpenAI::Realtime::RealtimeAudioInputTurnDetection + attr_accessor turn_detection: OpenAI::Models::Realtime::realtime_audio_input_turn_detection? def initialize: ( ?format_: OpenAI::Models::Realtime::realtime_audio_formats, ?noise_reduction: OpenAI::Realtime::RealtimeAudioConfigInput::NoiseReduction, ?transcription: OpenAI::Realtime::AudioTranscription, - ?turn_detection: OpenAI::Realtime::RealtimeAudioInputTurnDetection + ?turn_detection: OpenAI::Models::Realtime::realtime_audio_input_turn_detection? ) -> void def to_hash: -> { format_: OpenAI::Models::Realtime::realtime_audio_formats, noise_reduction: OpenAI::Realtime::RealtimeAudioConfigInput::NoiseReduction, transcription: OpenAI::Realtime::AudioTranscription, - turn_detection: OpenAI::Realtime::RealtimeAudioInputTurnDetection + turn_detection: OpenAI::Models::Realtime::realtime_audio_input_turn_detection? 
} type noise_reduction = diff --git a/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs b/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs index 3a8b1c9e..4c6593ea 100644 --- a/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs +++ b/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs @@ -2,97 +2,123 @@ module OpenAI module Models module Realtime type realtime_audio_input_turn_detection = - { - create_response: bool, - eagerness: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::eagerness, - idle_timeout_ms: Integer?, - interrupt_response: bool, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::type_ - } + OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad + | OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad - class RealtimeAudioInputTurnDetection < OpenAI::Internal::Type::BaseModel - attr_reader create_response: bool? + module RealtimeAudioInputTurnDetection + extend OpenAI::Internal::Type::Union - def create_response=: (bool) -> bool + type server_vad = + { + type: :server_vad, + create_response: bool, + idle_timeout_ms: Integer?, + interrupt_response: bool, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } - attr_reader eagerness: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::eagerness? + class ServerVad < OpenAI::Internal::Type::BaseModel + attr_accessor type: :server_vad - def eagerness=: ( - OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::eagerness - ) -> OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::eagerness + attr_reader create_response: bool? - attr_accessor idle_timeout_ms: Integer? + def create_response=: (bool) -> bool - attr_reader interrupt_response: bool? + attr_accessor idle_timeout_ms: Integer? - def interrupt_response=: (bool) -> bool + attr_reader interrupt_response: bool? 
- attr_reader prefix_padding_ms: Integer? + def interrupt_response=: (bool) -> bool - def prefix_padding_ms=: (Integer) -> Integer + attr_reader prefix_padding_ms: Integer? - attr_reader silence_duration_ms: Integer? + def prefix_padding_ms=: (Integer) -> Integer - def silence_duration_ms=: (Integer) -> Integer + attr_reader silence_duration_ms: Integer? - attr_reader threshold: Float? + def silence_duration_ms=: (Integer) -> Integer - def threshold=: (Float) -> Float + attr_reader threshold: Float? - attr_reader type: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::type_? + def threshold=: (Float) -> Float - def type=: ( - OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::type_ - ) -> OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::type_ + def initialize: ( + ?create_response: bool, + ?idle_timeout_ms: Integer?, + ?interrupt_response: bool, + ?prefix_padding_ms: Integer, + ?silence_duration_ms: Integer, + ?threshold: Float, + ?type: :server_vad + ) -> void - def initialize: ( - ?create_response: bool, - ?eagerness: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::eagerness, - ?idle_timeout_ms: Integer?, - ?interrupt_response: bool, - ?prefix_padding_ms: Integer, - ?silence_duration_ms: Integer, - ?threshold: Float, - ?type: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::type_ - ) -> void + def to_hash: -> { + type: :server_vad, + create_response: bool, + idle_timeout_ms: Integer?, + interrupt_response: bool, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } + end - def to_hash: -> { - create_response: bool, - eagerness: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::eagerness, - idle_timeout_ms: Integer?, - interrupt_response: bool, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::type_ - } + type semantic_vad = + { + type: :semantic_vad, + create_response: bool, + 
eagerness: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::eagerness, + interrupt_response: bool + } - type eagerness = :low | :medium | :high | :auto + class SemanticVad < OpenAI::Internal::Type::BaseModel + attr_accessor type: :semantic_vad - module Eagerness - extend OpenAI::Internal::Type::Enum + attr_reader create_response: bool? - LOW: :low - MEDIUM: :medium - HIGH: :high - AUTO: :auto + def create_response=: (bool) -> bool - def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::eagerness] - end + attr_reader eagerness: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::eagerness? + + def eagerness=: ( + OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::eagerness + ) -> OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::eagerness + + attr_reader interrupt_response: bool? - type type_ = :server_vad | :semantic_vad + def interrupt_response=: (bool) -> bool - module Type - extend OpenAI::Internal::Type::Enum + def initialize: ( + ?create_response: bool, + ?eagerness: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::eagerness, + ?interrupt_response: bool, + ?type: :semantic_vad + ) -> void - SERVER_VAD: :server_vad - SEMANTIC_VAD: :semantic_vad + def to_hash: -> { + type: :semantic_vad, + create_response: bool, + eagerness: OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::eagerness, + interrupt_response: bool + } - def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::type_] + type eagerness = :low | :medium | :high | :auto + + module Eagerness + extend OpenAI::Internal::Type::Enum + + LOW: :low + MEDIUM: :medium + HIGH: :high + AUTO: :auto + + def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::eagerness] + end end + + def self?.variants: -> ::Array[OpenAI::Models::Realtime::realtime_audio_input_turn_detection] end 
end end diff --git a/sig/openai/models/realtime/realtime_session.rbs b/sig/openai/models/realtime/realtime_session.rbs index 3f239c05..480c4857 100644 --- a/sig/openai/models/realtime/realtime_session.rbs +++ b/sig/openai/models/realtime/realtime_session.rbs @@ -21,7 +21,7 @@ module OpenAI tool_choice: String, tools: ::Array[OpenAI::Realtime::RealtimeFunctionTool], tracing: OpenAI::Models::Realtime::RealtimeSession::tracing?, - turn_detection: OpenAI::Realtime::RealtimeSession::TurnDetection?, + turn_detection: OpenAI::Models::Realtime::RealtimeSession::turn_detection?, voice: OpenAI::Models::Realtime::RealtimeSession::voice } @@ -106,7 +106,7 @@ module OpenAI attr_accessor tracing: OpenAI::Models::Realtime::RealtimeSession::tracing? - attr_accessor turn_detection: OpenAI::Realtime::RealtimeSession::TurnDetection? + attr_accessor turn_detection: OpenAI::Models::Realtime::RealtimeSession::turn_detection? attr_reader voice: OpenAI::Models::Realtime::RealtimeSession::voice? @@ -133,7 +133,7 @@ module OpenAI ?tool_choice: String, ?tools: ::Array[OpenAI::Realtime::RealtimeFunctionTool], ?tracing: OpenAI::Models::Realtime::RealtimeSession::tracing?, - ?turn_detection: OpenAI::Realtime::RealtimeSession::TurnDetection?, + ?turn_detection: OpenAI::Models::Realtime::RealtimeSession::turn_detection?, ?voice: OpenAI::Models::Realtime::RealtimeSession::voice ) -> void @@ -156,7 +156,7 @@ module OpenAI tool_choice: String, tools: ::Array[OpenAI::Realtime::RealtimeFunctionTool], tracing: OpenAI::Models::Realtime::RealtimeSession::tracing?, - turn_detection: OpenAI::Realtime::RealtimeSession::TurnDetection?, + turn_detection: OpenAI::Models::Realtime::RealtimeSession::turn_detection?, voice: OpenAI::Models::Realtime::RealtimeSession::voice } @@ -307,97 +307,123 @@ module OpenAI end type turn_detection = - { - create_response: bool, - eagerness: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::eagerness, - idle_timeout_ms: Integer?, - interrupt_response: bool, - 
prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::type_ - } + OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad + | OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad - class TurnDetection < OpenAI::Internal::Type::BaseModel - attr_reader create_response: bool? + module TurnDetection + extend OpenAI::Internal::Type::Union - def create_response=: (bool) -> bool + type server_vad = + { + type: :server_vad, + create_response: bool, + idle_timeout_ms: Integer?, + interrupt_response: bool, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } - attr_reader eagerness: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::eagerness? + class ServerVad < OpenAI::Internal::Type::BaseModel + attr_accessor type: :server_vad - def eagerness=: ( - OpenAI::Models::Realtime::RealtimeSession::TurnDetection::eagerness - ) -> OpenAI::Models::Realtime::RealtimeSession::TurnDetection::eagerness + attr_reader create_response: bool? - attr_accessor idle_timeout_ms: Integer? + def create_response=: (bool) -> bool - attr_reader interrupt_response: bool? + attr_accessor idle_timeout_ms: Integer? - def interrupt_response=: (bool) -> bool + attr_reader interrupt_response: bool? - attr_reader prefix_padding_ms: Integer? + def interrupt_response=: (bool) -> bool - def prefix_padding_ms=: (Integer) -> Integer + attr_reader prefix_padding_ms: Integer? - attr_reader silence_duration_ms: Integer? + def prefix_padding_ms=: (Integer) -> Integer - def silence_duration_ms=: (Integer) -> Integer + attr_reader silence_duration_ms: Integer? - attr_reader threshold: Float? + def silence_duration_ms=: (Integer) -> Integer - def threshold=: (Float) -> Float + attr_reader threshold: Float? - attr_reader type: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::type_? 
+ def threshold=: (Float) -> Float - def type=: ( - OpenAI::Models::Realtime::RealtimeSession::TurnDetection::type_ - ) -> OpenAI::Models::Realtime::RealtimeSession::TurnDetection::type_ + def initialize: ( + ?create_response: bool, + ?idle_timeout_ms: Integer?, + ?interrupt_response: bool, + ?prefix_padding_ms: Integer, + ?silence_duration_ms: Integer, + ?threshold: Float, + ?type: :server_vad + ) -> void - def initialize: ( - ?create_response: bool, - ?eagerness: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::eagerness, - ?idle_timeout_ms: Integer?, - ?interrupt_response: bool, - ?prefix_padding_ms: Integer, - ?silence_duration_ms: Integer, - ?threshold: Float, - ?type: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::type_ - ) -> void + def to_hash: -> { + type: :server_vad, + create_response: bool, + idle_timeout_ms: Integer?, + interrupt_response: bool, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } + end - def to_hash: -> { - create_response: bool, - eagerness: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::eagerness, - idle_timeout_ms: Integer?, - interrupt_response: bool, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::type_ - } + type semantic_vad = + { + type: :semantic_vad, + create_response: bool, + eagerness: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::eagerness, + interrupt_response: bool + } - type eagerness = :low | :medium | :high | :auto + class SemanticVad < OpenAI::Internal::Type::BaseModel + attr_accessor type: :semantic_vad - module Eagerness - extend OpenAI::Internal::Type::Enum + attr_reader create_response: bool? 
- LOW: :low - MEDIUM: :medium - HIGH: :high - AUTO: :auto + def create_response=: (bool) -> bool - def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeSession::TurnDetection::eagerness] - end + attr_reader eagerness: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::eagerness? + + def eagerness=: ( + OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::eagerness + ) -> OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::eagerness + + attr_reader interrupt_response: bool? + + def interrupt_response=: (bool) -> bool + + def initialize: ( + ?create_response: bool, + ?eagerness: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::eagerness, + ?interrupt_response: bool, + ?type: :semantic_vad + ) -> void + + def to_hash: -> { + type: :semantic_vad, + create_response: bool, + eagerness: OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::eagerness, + interrupt_response: bool + } - type type_ = :server_vad | :semantic_vad + type eagerness = :low | :medium | :high | :auto - module Type - extend OpenAI::Internal::Type::Enum + module Eagerness + extend OpenAI::Internal::Type::Enum - SERVER_VAD: :server_vad - SEMANTIC_VAD: :semantic_vad + LOW: :low + MEDIUM: :medium + HIGH: :high + AUTO: :auto - def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeSession::TurnDetection::type_] + def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::eagerness] + end end + + def self?.variants: -> ::Array[OpenAI::Models::Realtime::RealtimeSession::turn_detection] end type voice = diff --git a/sig/openai/models/realtime/realtime_session_create_response.rbs b/sig/openai/models/realtime/realtime_session_create_response.rbs index fbecbd5d..e4daa170 100644 --- a/sig/openai/models/realtime/realtime_session_create_response.rbs +++ b/sig/openai/models/realtime/realtime_session_create_response.rbs @@ -147,7 +147,7 @@ module OpenAI format_: 
OpenAI::Models::Realtime::realtime_audio_formats, noise_reduction: OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::NoiseReduction, transcription: OpenAI::Realtime::AudioTranscription, - turn_detection: OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection + turn_detection: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::turn_detection? } class Input < OpenAI::Internal::Type::BaseModel @@ -169,24 +169,20 @@ module OpenAI OpenAI::Realtime::AudioTranscription ) -> OpenAI::Realtime::AudioTranscription - attr_reader turn_detection: OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection? - - def turn_detection=: ( - OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection - ) -> OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection + attr_accessor turn_detection: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::turn_detection? def initialize: ( ?format_: OpenAI::Models::Realtime::realtime_audio_formats, ?noise_reduction: OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::NoiseReduction, ?transcription: OpenAI::Realtime::AudioTranscription, - ?turn_detection: OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection + ?turn_detection: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::turn_detection? ) -> void def to_hash: -> { format_: OpenAI::Models::Realtime::realtime_audio_formats, noise_reduction: OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::NoiseReduction, transcription: OpenAI::Realtime::AudioTranscription, - turn_detection: OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection + turn_detection: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::turn_detection? 
} type noise_reduction = @@ -209,97 +205,123 @@ module OpenAI end type turn_detection = - { - create_response: bool, - eagerness: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::eagerness, - idle_timeout_ms: Integer?, - interrupt_response: bool, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::type_ - } + OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad + | OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad - class TurnDetection < OpenAI::Internal::Type::BaseModel - attr_reader create_response: bool? + module TurnDetection + extend OpenAI::Internal::Type::Union - def create_response=: (bool) -> bool + type server_vad = + { + type: :server_vad, + create_response: bool, + idle_timeout_ms: Integer?, + interrupt_response: bool, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } - attr_reader eagerness: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::eagerness? + class ServerVad < OpenAI::Internal::Type::BaseModel + attr_accessor type: :server_vad - def eagerness=: ( - OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::eagerness - ) -> OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::eagerness + attr_reader create_response: bool? - attr_accessor idle_timeout_ms: Integer? + def create_response=: (bool) -> bool - attr_reader interrupt_response: bool? + attr_accessor idle_timeout_ms: Integer? - def interrupt_response=: (bool) -> bool + attr_reader interrupt_response: bool? - attr_reader prefix_padding_ms: Integer? + def interrupt_response=: (bool) -> bool - def prefix_padding_ms=: (Integer) -> Integer + attr_reader prefix_padding_ms: Integer? - attr_reader silence_duration_ms: Integer? 
+ def prefix_padding_ms=: (Integer) -> Integer - def silence_duration_ms=: (Integer) -> Integer + attr_reader silence_duration_ms: Integer? - attr_reader threshold: Float? + def silence_duration_ms=: (Integer) -> Integer - def threshold=: (Float) -> Float + attr_reader threshold: Float? - attr_reader type: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::type_? + def threshold=: (Float) -> Float - def type=: ( - OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::type_ - ) -> OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::type_ + def initialize: ( + ?create_response: bool, + ?idle_timeout_ms: Integer?, + ?interrupt_response: bool, + ?prefix_padding_ms: Integer, + ?silence_duration_ms: Integer, + ?threshold: Float, + ?type: :server_vad + ) -> void - def initialize: ( - ?create_response: bool, - ?eagerness: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::eagerness, - ?idle_timeout_ms: Integer?, - ?interrupt_response: bool, - ?prefix_padding_ms: Integer, - ?silence_duration_ms: Integer, - ?threshold: Float, - ?type: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::type_ - ) -> void + def to_hash: -> { + type: :server_vad, + create_response: bool, + idle_timeout_ms: Integer?, + interrupt_response: bool, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } + end - def to_hash: -> { - create_response: bool, - eagerness: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::eagerness, - idle_timeout_ms: Integer?, - interrupt_response: bool, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::type_ - } + type semantic_vad = + { + type: :semantic_vad, + create_response: bool, + eagerness: 
OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::eagerness, + interrupt_response: bool + } - type eagerness = :low | :medium | :high | :auto + class SemanticVad < OpenAI::Internal::Type::BaseModel + attr_accessor type: :semantic_vad - module Eagerness - extend OpenAI::Internal::Type::Enum + attr_reader create_response: bool? - LOW: :low - MEDIUM: :medium - HIGH: :high - AUTO: :auto + def create_response=: (bool) -> bool - def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::eagerness] - end + attr_reader eagerness: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::eagerness? - type type_ = :server_vad | :semantic_vad + def eagerness=: ( + OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::eagerness + ) -> OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::eagerness - module Type - extend OpenAI::Internal::Type::Enum + attr_reader interrupt_response: bool? 
- SERVER_VAD: :server_vad - SEMANTIC_VAD: :semantic_vad + def interrupt_response=: (bool) -> bool - def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::type_] + def initialize: ( + ?create_response: bool, + ?eagerness: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::eagerness, + ?interrupt_response: bool, + ?type: :semantic_vad + ) -> void + + def to_hash: -> { + type: :semantic_vad, + create_response: bool, + eagerness: OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::eagerness, + interrupt_response: bool + } + + type eagerness = :low | :medium | :high | :auto + + module Eagerness + extend OpenAI::Internal::Type::Enum + + LOW: :low + MEDIUM: :medium + HIGH: :high + AUTO: :auto + + def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::eagerness] + end end + + def self?.variants: -> ::Array[OpenAI::Models::Realtime::RealtimeSessionCreateResponse::Audio::Input::turn_detection] end end diff --git a/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs b/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs index 44b5b8fa..33d8b172 100644 --- a/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs +++ b/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs @@ -6,7 +6,7 @@ module OpenAI format_: OpenAI::Models::Realtime::realtime_audio_formats, noise_reduction: OpenAI::Realtime::RealtimeTranscriptionSessionAudioInput::NoiseReduction, transcription: OpenAI::Realtime::AudioTranscription, - turn_detection: OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection + turn_detection: OpenAI::Models::Realtime::realtime_transcription_session_audio_input_turn_detection? 
} class RealtimeTranscriptionSessionAudioInput < OpenAI::Internal::Type::BaseModel @@ -28,24 +28,20 @@ module OpenAI OpenAI::Realtime::AudioTranscription ) -> OpenAI::Realtime::AudioTranscription - attr_reader turn_detection: OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection? - - def turn_detection=: ( - OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection - ) -> OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection + attr_accessor turn_detection: OpenAI::Models::Realtime::realtime_transcription_session_audio_input_turn_detection? def initialize: ( ?format_: OpenAI::Models::Realtime::realtime_audio_formats, ?noise_reduction: OpenAI::Realtime::RealtimeTranscriptionSessionAudioInput::NoiseReduction, ?transcription: OpenAI::Realtime::AudioTranscription, - ?turn_detection: OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection + ?turn_detection: OpenAI::Models::Realtime::realtime_transcription_session_audio_input_turn_detection? ) -> void def to_hash: -> { format_: OpenAI::Models::Realtime::realtime_audio_formats, noise_reduction: OpenAI::Realtime::RealtimeTranscriptionSessionAudioInput::NoiseReduction, transcription: OpenAI::Realtime::AudioTranscription, - turn_detection: OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection + turn_detection: OpenAI::Models::Realtime::realtime_transcription_session_audio_input_turn_detection? 
} type noise_reduction = diff --git a/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs b/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs index 56ac5314..1a61a5e7 100644 --- a/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs +++ b/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs @@ -2,97 +2,123 @@ module OpenAI module Models module Realtime type realtime_transcription_session_audio_input_turn_detection = - { - create_response: bool, - eagerness: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::eagerness, - idle_timeout_ms: Integer?, - interrupt_response: bool, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::type_ - } + OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad + | OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad - class RealtimeTranscriptionSessionAudioInputTurnDetection < OpenAI::Internal::Type::BaseModel - attr_reader create_response: bool? + module RealtimeTranscriptionSessionAudioInputTurnDetection + extend OpenAI::Internal::Type::Union - def create_response=: (bool) -> bool + type server_vad = + { + type: :server_vad, + create_response: bool, + idle_timeout_ms: Integer?, + interrupt_response: bool, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } - attr_reader eagerness: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::eagerness? 
+ class ServerVad < OpenAI::Internal::Type::BaseModel + attr_accessor type: :server_vad - def eagerness=: ( - OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::eagerness - ) -> OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::eagerness + attr_reader create_response: bool? - attr_accessor idle_timeout_ms: Integer? + def create_response=: (bool) -> bool - attr_reader interrupt_response: bool? + attr_accessor idle_timeout_ms: Integer? - def interrupt_response=: (bool) -> bool + attr_reader interrupt_response: bool? - attr_reader prefix_padding_ms: Integer? + def interrupt_response=: (bool) -> bool - def prefix_padding_ms=: (Integer) -> Integer + attr_reader prefix_padding_ms: Integer? - attr_reader silence_duration_ms: Integer? + def prefix_padding_ms=: (Integer) -> Integer - def silence_duration_ms=: (Integer) -> Integer + attr_reader silence_duration_ms: Integer? - attr_reader threshold: Float? + def silence_duration_ms=: (Integer) -> Integer - def threshold=: (Float) -> Float + attr_reader threshold: Float? - attr_reader type: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::type_? 
+ def threshold=: (Float) -> Float - def type=: ( - OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::type_ - ) -> OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::type_ + def initialize: ( + ?create_response: bool, + ?idle_timeout_ms: Integer?, + ?interrupt_response: bool, + ?prefix_padding_ms: Integer, + ?silence_duration_ms: Integer, + ?threshold: Float, + ?type: :server_vad + ) -> void - def initialize: ( - ?create_response: bool, - ?eagerness: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::eagerness, - ?idle_timeout_ms: Integer?, - ?interrupt_response: bool, - ?prefix_padding_ms: Integer, - ?silence_duration_ms: Integer, - ?threshold: Float, - ?type: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::type_ - ) -> void + def to_hash: -> { + type: :server_vad, + create_response: bool, + idle_timeout_ms: Integer?, + interrupt_response: bool, + prefix_padding_ms: Integer, + silence_duration_ms: Integer, + threshold: Float + } + end - def to_hash: -> { - create_response: bool, - eagerness: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::eagerness, - idle_timeout_ms: Integer?, - interrupt_response: bool, - prefix_padding_ms: Integer, - silence_duration_ms: Integer, - threshold: Float, - type: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::type_ - } + type semantic_vad = + { + type: :semantic_vad, + create_response: bool, + eagerness: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::eagerness, + interrupt_response: bool + } - type eagerness = :low | :medium | :high | :auto + class SemanticVad < OpenAI::Internal::Type::BaseModel + attr_accessor type: :semantic_vad - module Eagerness - extend OpenAI::Internal::Type::Enum + attr_reader create_response: bool? 
- LOW: :low - MEDIUM: :medium - HIGH: :high - AUTO: :auto + def create_response=: (bool) -> bool - def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::eagerness] - end + attr_reader eagerness: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::eagerness? + + def eagerness=: ( + OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::eagerness + ) -> OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::eagerness + + attr_reader interrupt_response: bool? - type type_ = :server_vad | :semantic_vad + def interrupt_response=: (bool) -> bool - module Type - extend OpenAI::Internal::Type::Enum + def initialize: ( + ?create_response: bool, + ?eagerness: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::eagerness, + ?interrupt_response: bool, + ?type: :semantic_vad + ) -> void - SERVER_VAD: :server_vad - SEMANTIC_VAD: :semantic_vad + def to_hash: -> { + type: :semantic_vad, + create_response: bool, + eagerness: OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::eagerness, + interrupt_response: bool + } - def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::type_] + type eagerness = :low | :medium | :high | :auto + + module Eagerness + extend OpenAI::Internal::Type::Enum + + LOW: :low + MEDIUM: :medium + HIGH: :high + AUTO: :auto + + def self?.values: -> ::Array[OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::eagerness] + end end + + def self?.variants: -> ::Array[OpenAI::Models::Realtime::realtime_transcription_session_audio_input_turn_detection] end end end From 31fe462be11f92561d9d0083bd28f76f43db0fe7 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Thu, 11 Sep 
2025 18:27:46 +0000 Subject: [PATCH 2/3] codegen metadata --- .stats.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.stats.yml b/.stats.yml index 5388f246..e3897189 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 118 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c829f9e7f51d4946dae7b02eb37eb857b538a464cf54c7ced5eff1b1c93e07db.yml -openapi_spec_hash: 1b2eaba46b264bcec8831bc496543649 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-94b1e3cb0bdc616ff0c2f267c33dadd95f133b1f64e647aab6c64afb292b2793.yml +openapi_spec_hash: 2395319ac9befd59b6536ae7f9564a05 config_hash: 930dac3aa861344867e4ac84f037b5df From cd99af965697a05ec394578270a5ebdfd3e742df Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Thu, 11 Sep 2025 18:28:09 +0000 Subject: [PATCH 3/3] release: 0.23.2 --- .release-please-manifest.json | 2 +- CHANGELOG.md | 8 ++++++++ Gemfile.lock | 2 +- README.md | 2 +- lib/openai/version.rb | 2 +- 5 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 354c2fa8..c9da8cc1 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "0.23.1" + ".": "0.23.2" } \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index d8b3a290..6df92bf4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## 0.23.2 (2025-09-11) + +Full Changelog: [v0.23.1...v0.23.2](https://github.com/openai/openai-ruby/compare/v0.23.1...v0.23.2) + +### Chores + +* **api:** Minor docs and type updates for realtime ([ccef982](https://github.com/openai/openai-ruby/commit/ccef9827b31206fc9ba40d2b6165eeefda7621f5)) + ## 0.23.1 (2025-09-10) Full Changelog: [v0.23.0...v0.23.1](https://github.com/openai/openai-ruby/compare/v0.23.0...v0.23.1) diff --git 
a/Gemfile.lock b/Gemfile.lock index 85e14eb4..04b60951 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -11,7 +11,7 @@ GIT PATH remote: . specs: - openai (0.23.1) + openai (0.23.2) connection_pool GEM diff --git a/README.md b/README.md index ace31963..9a262951 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ To use this gem, install via Bundler by adding the following to your application ```ruby -gem "openai", "~> 0.23.1" +gem "openai", "~> 0.23.2" ``` diff --git a/lib/openai/version.rb b/lib/openai/version.rb index 4ec01a14..fbf5600a 100644 --- a/lib/openai/version.rb +++ b/lib/openai/version.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true module OpenAI - VERSION = "0.23.1" + VERSION = "0.23.2" end