Skip to content

Commit 6ff7e8f

Browse files
danodoesdesign authored and steipete committed
talk: add configurable silence timeout
1 parent 097c588 commit 6ff7e8f

File tree

18 files changed

+162
-9
lines changed

18 files changed

+162
-9
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ Docs: https://docs.openclaw.ai
88

99
- TUI: infer the active agent from the current workspace when launched inside a configured agent workspace, while preserving explicit `agent:` session targets. (#39591) thanks @arceus77-7.
1010
- Tools/Brave web search: add opt-in `tools.web.search.brave.mode: "llm-context"` so `web_search` can call Brave's LLM Context endpoint and return extracted grounding snippets with source metadata, plus config/docs/test coverage. (#33383) Thanks @thirumaleshp.
11+
- Talk mode: add top-level `talk.silenceTimeoutMs` config so Talk waits a configurable amount of silence before auto-sending the current transcript, while keeping each platform's existing default pause window when unset. (#39607) Thanks @danodoesdesign. Fixes #17147.
1112

1213
### Fixes
1314

apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeManager.kt

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@ class TalkModeManager(
5959
private const val tag = "TalkMode"
6060
private const val defaultModelIdFallback = "eleven_v3"
6161
private const val defaultOutputFormatFallback = "pcm_24000"
62-
private const val defaultTalkProvider = "elevenlabs"
63-
private const val silenceWindowMs = 500L
62+
private const val defaultTalkProvider = "elevenlabs"
63+
private const val defaultSilenceTimeoutMs = 700L
6464
private const val listenWatchdogMs = 12_000L
6565
private const val chatFinalWaitWithSubscribeMs = 45_000L
6666
private const val chatFinalWaitWithoutSubscribeMs = 6_000L
@@ -105,6 +105,14 @@ private const val defaultTalkProvider = "elevenlabs"
105105
normalizedPayload = false,
106106
)
107107
}
108+
109+
/**
 * Resolves the Talk silence timeout, in milliseconds, from the `talk` config object.
 *
 * Returns [defaultSilenceTimeoutMs] when `silenceTimeoutMs` is absent, when `asDoubleOrNull()`
 * yields null for it, or when the value is not a positive whole number that fits in a [Long].
 *
 * @param talk the `talk` config object, or null when none is available.
 * @return the configured timeout, or the default when the value is missing or invalid.
 */
internal fun resolvedSilenceTimeoutMs(talk: JsonObject?): Long {
  val timeout = talk?.get("silenceTimeoutMs").asDoubleOrNull() ?: return defaultSilenceTimeoutMs
  // NaN and infinities fail this check too, because `x % 1.0` is NaN for them.
  val isWholeNumber = timeout % 1.0 == 0.0
  // Long.MAX_VALUE.toDouble() rounds up to 2^63, which is already outside the Long range.
  // The bound therefore has to be exclusive; otherwise toLong() would silently clamp 2^63
  // to Long.MAX_VALUE instead of treating the value as invalid.
  val fitsInLong = timeout < Long.MAX_VALUE.toDouble()
  if (timeout <= 0 || !isWholeNumber || !fitsInLong) {
    return defaultSilenceTimeoutMs
  }
  return timeout.toLong()
}
108116
}
109117

110118
private val mainHandler = Handler(Looper.getMainLooper())
@@ -134,7 +142,7 @@ private const val defaultTalkProvider = "elevenlabs"
134142
private var listeningMode = false
135143

136144
private var silenceJob: Job? = null
137-
private val silenceWindowMs = 700L
145+
private var silenceWindowMs = defaultSilenceTimeoutMs
138146
private var lastTranscript: String = ""
139147
private var lastHeardAtMs: Long? = null
140148
private var lastSpokenText: String? = null
@@ -1411,6 +1419,7 @@ private const val defaultTalkProvider = "elevenlabs"
14111419
activeConfig?.get("outputFormat")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
14121420
val key = activeConfig?.get("apiKey")?.asStringOrNull()?.trim()?.takeIf { it.isNotEmpty() }
14131421
val interrupt = talk?.get("interruptOnSpeech")?.asBooleanOrNull()
1422+
val silenceTimeoutMs = resolvedSilenceTimeoutMs(talk)
14141423

14151424
if (!isCanonicalMainSessionKey(mainSessionKey)) {
14161425
mainSessionKey = mainKey
@@ -1427,7 +1436,11 @@ private const val defaultTalkProvider = "elevenlabs"
14271436
if (!modelOverrideActive) currentModelId = defaultModelId
14281437
defaultOutputFormat = outputFormat ?: defaultOutputFormatFallback
14291438
apiKey = key ?: envKey?.takeIf { it.isNotEmpty() }
1430-
Log.d(tag, "reloadConfig apiKey=${if (apiKey != null) "set" else "null"} voiceId=$defaultVoiceId")
1439+
silenceWindowMs = silenceTimeoutMs
1440+
Log.d(
1441+
tag,
1442+
"reloadConfig apiKey=${if (apiKey != null) "set" else "null"} voiceId=$defaultVoiceId silenceTimeoutMs=$silenceTimeoutMs",
1443+
)
14311444
if (interrupt != null) interruptOnSpeech = interrupt
14321445
activeProviderIsElevenLabs = activeProvider == defaultTalkProvider
14331446
if (!activeProviderIsElevenLabs) {
@@ -1441,6 +1454,7 @@ private const val defaultTalkProvider = "elevenlabs"
14411454
}
14421455
configLoaded = true
14431456
} catch (_: Throwable) {
1457+
silenceWindowMs = defaultSilenceTimeoutMs
14441458
defaultVoiceId = envVoice?.takeIf { it.isNotEmpty() } ?: sagVoice?.takeIf { it.isNotEmpty() }
14451459
defaultModelId = defaultModelIdFallback
14461460
if (!modelOverrideActive) currentModelId = defaultModelId

apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeConfigParsingTest.kt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,4 +54,23 @@ class TalkModeConfigParsingTest {
5454
assertEquals("voice-legacy", selection?.config?.get("voiceId")?.jsonPrimitive?.content)
5555
assertEquals("legacy-key", selection?.config?.get("apiKey")?.jsonPrimitive?.content)
5656
}
57+
58+
@Test
fun readsConfiguredSilenceTimeoutMs() {
  // A positive whole-number `silenceTimeoutMs` is expected to come back unchanged.
  val resolved =
    TalkModeManager.resolvedSilenceTimeoutMs(
      buildJsonObject { put("silenceTimeoutMs", 1500) },
    )

  assertEquals(1500L, resolved)
}
64+
65+
@Test
fun defaultsSilenceTimeoutMsWhenMissing() {
  // With no talk config at all, the resolver is expected to yield 700 ms.
  val resolved = TalkModeManager.resolvedSilenceTimeoutMs(null)

  assertEquals(700L, resolved)
}
69+
70+
@Test
fun defaultsSilenceTimeoutMsWhenInvalid() {
  // Zero is not a positive timeout, so the resolver is expected to yield 700 ms.
  val resolved =
    TalkModeManager.resolvedSilenceTimeoutMs(
      buildJsonObject { put("silenceTimeoutMs", 0) },
    )

  assertEquals(700L, resolved)
}
5776
}

apps/ios/Sources/Voice/TalkModeManager.swift

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ final class TalkModeManager: NSObject {
3434
private typealias SpeechRequest = SFSpeechAudioBufferRecognitionRequest
3535
private static let defaultModelIdFallback = "eleven_v3"
3636
private static let defaultTalkProvider = "elevenlabs"
37+
private static let defaultSilenceTimeoutMs = 900
3738
private static let redactedConfigSentinel = "__OPENCLAW_REDACTED__"
3839
var isEnabled: Bool = false
3940
var isListening: Bool = false
@@ -97,7 +98,7 @@ final class TalkModeManager: NSObject {
9798

9899
private var gateway: GatewayNodeSession?
99100
private var gatewayConnected = false
100-
private let silenceWindow: TimeInterval = 0.9
101+
private var silenceWindow: TimeInterval = TimeInterval(Self.defaultSilenceTimeoutMs) / 1000
101102
private var lastAudioActivity: Date?
102103
private var noiseFloorSamples: [Double] = []
103104
private var noiseFloor: Double?
@@ -2001,6 +2002,24 @@ extension TalkModeManager {
20012002
config: normalizedProviders[providerID] ?? [:])
20022003
}
20032004

2005+
/// Resolves the Talk silence timeout, in milliseconds, from the `talk` config dictionary.
///
/// The value under `silenceTimeoutMs` is accepted when it casts to `Int`, `Double`, or
/// `NSNumber` and is a positive whole number representable as `Int`. Anything else
/// (missing key, other types, zero, negatives, fractions, out-of-range values) yields
/// `defaultSilenceTimeoutMs`.
///
/// - Parameter talk: The `talk` config dictionary, or `nil` when none is available.
/// - Returns: The configured timeout, or the default when the value is missing or invalid.
static func resolvedSilenceTimeoutMs(_ talk: [String: Any]?) -> Int {
    let candidate: Double
    switch talk?["silenceTimeoutMs"] {
    case let timeout as Int where timeout > 0:
        return timeout
    case let timeout as Double:
        candidate = timeout
    case let timeout as NSNumber:
        candidate = timeout.doubleValue
    default:
        return Self.defaultSilenceTimeoutMs
    }
    // `Double(Int.max)` rounds up to 2^63, so a `<= Double(Int.max)` guard lets 2^63 through
    // and `Int(_:)` then traps. `Int(exactly:)` returns nil for fractional, non-finite, and
    // out-of-range values instead of crashing.
    guard candidate > 0, let timeout = Int(exactly: candidate) else {
        return Self.defaultSilenceTimeoutMs
    }
    return timeout
}
2022+
20042023
func reloadConfig() async {
20052024
guard let gateway else { return }
20062025
self.pcmFormatUnavailable = false
@@ -2020,6 +2039,7 @@ extension TalkModeManager {
20202039
}
20212040
let activeProvider = selection?.provider ?? Self.defaultTalkProvider
20222041
let activeConfig = selection?.config
2042+
let silenceTimeoutMs = Self.resolvedSilenceTimeoutMs(talk)
20232043
self.defaultVoiceId = (activeConfig?["voiceId"] as? String)?
20242044
.trimmingCharacters(in: .whitespacesAndNewlines)
20252045
if let aliases = activeConfig?["voiceAliases"] as? [String: Any] {
@@ -2067,8 +2087,9 @@ extension TalkModeManager {
20672087
if let interrupt = talk?["interruptOnSpeech"] as? Bool {
20682088
self.interruptOnSpeech = interrupt
20692089
}
2090+
self.silenceWindow = TimeInterval(silenceTimeoutMs) / 1000
20702091
if selection != nil {
2071-
GatewayDiagnostics.log("talk config provider=\(activeProvider)")
2092+
GatewayDiagnostics.log("talk config provider=\(activeProvider) silenceTimeoutMs=\(silenceTimeoutMs)")
20722093
}
20732094
} catch {
20742095
self.defaultModelId = Self.defaultModelIdFallback
@@ -2079,6 +2100,7 @@ extension TalkModeManager {
20792100
self.gatewayTalkDefaultModelId = nil
20802101
self.gatewayTalkApiKeyConfigured = false
20812102
self.gatewayTalkConfigLoaded = false
2103+
self.silenceWindow = TimeInterval(Self.defaultSilenceTimeoutMs) / 1000
20822104
}
20832105
}
20842106

apps/ios/Tests/TalkModeConfigParsingTests.swift

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,4 +47,24 @@ import Testing
4747
userInfo: [NSLocalizedDescriptionKey: "queue enqueue failed"])
4848
#expect(TalkModeManager._test_isPCMFormatRejectedByAPI(error) == false)
4949
}
50+
51+
@Test func readsConfiguredSilenceTimeoutMs() {
    // A positive whole-number `silenceTimeoutMs` is expected to come back unchanged.
    let talk: [String: Any] = ["silenceTimeoutMs": 1500]
    let resolved = TalkModeManager.resolvedSilenceTimeoutMs(talk)

    #expect(resolved == 1500)
}
58+
59+
@Test func defaultsSilenceTimeoutMsWhenMissing() {
    // With no talk config at all, the resolver is expected to yield 900 ms.
    let resolved = TalkModeManager.resolvedSilenceTimeoutMs(nil)

    #expect(resolved == 900)
}
62+
63+
@Test func defaultsSilenceTimeoutMsWhenInvalid() {
    // Zero is not a positive timeout, so the resolver is expected to yield 900 ms.
    let talk: [String: Any] = ["silenceTimeoutMs": 0]
    let resolved = TalkModeManager.resolvedSilenceTimeoutMs(talk)

    #expect(resolved == 900)
}
5070
}

apps/macos/Sources/OpenClaw/TalkModeRuntime.swift

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ actor TalkModeRuntime {
1212
private let ttsLogger = Logger(subsystem: "ai.openclaw", category: "talk.tts")
1313
private static let defaultModelIdFallback = "eleven_v3"
1414
private static let defaultTalkProvider = "elevenlabs"
15+
private static let defaultSilenceTimeoutMs = 700
1516

1617
private final class RMSMeter: @unchecked Sendable {
1718
private let lock = NSLock()
@@ -66,7 +67,7 @@ actor TalkModeRuntime {
6667
private var fallbackVoiceId: String?
6768
private var lastPlaybackWasPCM: Bool = false
6869

69-
private let silenceWindow: TimeInterval = 0.7
70+
private var silenceWindow: TimeInterval = TimeInterval(TalkModeRuntime.defaultSilenceTimeoutMs) / 1000
7071
private let minSpeechRMS: Double = 1e-3
7172
private let speechBoostFactor: Double = 6.0
7273

@@ -783,6 +784,7 @@ extension TalkModeRuntime {
783784
}
784785
self.defaultOutputFormat = cfg.outputFormat
785786
self.interruptOnSpeech = cfg.interruptOnSpeech
787+
self.silenceWindow = TimeInterval(cfg.silenceTimeoutMs) / 1000
786788
self.apiKey = cfg.apiKey
787789
let hasApiKey = (cfg.apiKey?.isEmpty == false)
788790
let voiceLabel = (cfg.voiceId?.isEmpty == false) ? cfg.voiceId! : "none"
@@ -792,7 +794,8 @@ extension TalkModeRuntime {
792794
"talk config voiceId=\(voiceLabel, privacy: .public) " +
793795
"modelId=\(modelLabel, privacy: .public) " +
794796
"apiKey=\(hasApiKey, privacy: .public) " +
795-
"interrupt=\(cfg.interruptOnSpeech, privacy: .public)")
797+
"interrupt=\(cfg.interruptOnSpeech, privacy: .public) " +
798+
"silenceTimeoutMs=\(cfg.silenceTimeoutMs, privacy: .public)")
796799
}
797800

798801
private struct TalkRuntimeConfig {
@@ -801,6 +804,7 @@ extension TalkModeRuntime {
801804
let modelId: String?
802805
let outputFormat: String?
803806
let interruptOnSpeech: Bool
807+
let silenceTimeoutMs: Int
804808
let apiKey: String?
805809
}
806810

@@ -880,6 +884,21 @@ extension TalkModeRuntime {
880884
normalizedPayload: false)
881885
}
882886

887+
/// Resolves the Talk silence timeout, in milliseconds, from the `talk` config dictionary.
///
/// The value under `silenceTimeoutMs` is accepted when its `intValue` is positive, or when
/// its `doubleValue` is a positive whole number representable as `Int`. Anything else
/// yields `defaultSilenceTimeoutMs`.
///
/// - Parameter talk: The `talk` config dictionary, or `nil` when none is available.
/// - Returns: The configured timeout, or the default when the value is missing or invalid.
static func resolvedSilenceTimeoutMs(_ talk: [String: AnyCodable]?) -> Int {
    let raw = talk?["silenceTimeoutMs"]
    if let timeout = raw?.intValue, timeout > 0 {
        return timeout
    }
    // `Double(Int.max)` rounds up to 2^63, so a `<= Double(Int.max)` guard lets 2^63 through
    // and `Int(_:)` then traps. `Int(exactly:)` returns nil for fractional, non-finite, and
    // out-of-range values instead of crashing.
    if let value = raw?.doubleValue, value > 0, let timeout = Int(exactly: value) {
        return timeout
    }
    return Self.defaultSilenceTimeoutMs
}
901+
883902
private func fetchTalkConfig() async -> TalkRuntimeConfig {
884903
let env = ProcessInfo.processInfo.environment
885904
let envVoice = env["ELEVENLABS_VOICE_ID"]?.trimmingCharacters(in: .whitespacesAndNewlines)
@@ -895,6 +914,7 @@ extension TalkModeRuntime {
895914
let selection = Self.selectTalkProviderConfig(talk)
896915
let activeProvider = selection?.provider ?? Self.defaultTalkProvider
897916
let activeConfig = selection?.config
917+
let silenceTimeoutMs = Self.resolvedSilenceTimeoutMs(talk)
898918
let ui = snap.config?["ui"]?.dictionaryValue
899919
let rawSeam = ui?["seamColor"]?.stringValue?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
900920
await MainActor.run {
@@ -939,6 +959,7 @@ extension TalkModeRuntime {
939959
modelId: resolvedModel,
940960
outputFormat: outputFormat,
941961
interruptOnSpeech: interrupt ?? true,
962+
silenceTimeoutMs: silenceTimeoutMs,
942963
apiKey: resolvedApiKey)
943964
} catch {
944965
let resolvedVoice =
@@ -951,6 +972,7 @@ extension TalkModeRuntime {
951972
modelId: Self.defaultModelIdFallback,
952973
outputFormat: nil,
953974
interruptOnSpeech: true,
975+
silenceTimeoutMs: Self.defaultSilenceTimeoutMs,
954976
apiKey: resolvedApiKey)
955977
}
956978
}

apps/macos/Tests/OpenClawIPCTests/TalkModeConfigParsingTests.swift

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,24 @@ struct TalkModeConfigParsingTests {
3232
#expect(selection?.config["voiceId"]?.stringValue == "voice-legacy")
3333
#expect(selection?.config["apiKey"]?.stringValue == "legacy-key")
3434
}
35+
36+
@Test func readsConfiguredSilenceTimeoutMs() {
    // A positive whole-number `silenceTimeoutMs` is expected to come back unchanged.
    let talk: [String: AnyCodable] = ["silenceTimeoutMs": AnyCodable(1500)]
    let resolved = TalkModeRuntime.resolvedSilenceTimeoutMs(talk)

    #expect(resolved == 1500)
}
43+
44+
@Test func defaultsSilenceTimeoutMsWhenMissing() {
    // With no talk config at all, the resolver is expected to yield 700 ms.
    let resolved = TalkModeRuntime.resolvedSilenceTimeoutMs(nil)

    #expect(resolved == 700)
}
47+
48+
@Test func defaultsSilenceTimeoutMsWhenInvalid() {
    // Zero is not a positive timeout, so the resolver is expected to yield 700 ms.
    let talk: [String: AnyCodable] = ["silenceTimeoutMs": AnyCodable(0)]
    let resolved = TalkModeRuntime.resolvedSilenceTimeoutMs(talk)

    #expect(resolved == 700)
}
3555
}

docs/gateway/configuration-reference.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1659,6 +1659,7 @@ Defaults for Talk mode (macOS/iOS/Android).
16591659
modelId: "eleven_v3",
16601660
outputFormat: "mp3_44100_128",
16611661
apiKey: "elevenlabs_api_key",
1662+
silenceTimeoutMs: 1500,
16621663
interruptOnSpeech: true,
16631664
},
16641665
}
@@ -1668,6 +1669,7 @@ Defaults for Talk mode (macOS/iOS/Android).
16681669
- `apiKey` and `providers.*.apiKey` accept plaintext strings or SecretRef objects.
16691670
- `ELEVENLABS_API_KEY` fallback applies only when no Talk API key is configured.
16701671
- `voiceAliases` lets Talk directives use friendly names.
1672+
- `silenceTimeoutMs` controls how long Talk mode waits after user silence before it sends the transcript. Unset keeps the platform default pause window (`700` ms on macOS and Android, `900` ms on iOS).
16711673

16721674
---
16731675

docs/nodes/talk.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ Supported keys:
5656
modelId: "eleven_v3",
5757
outputFormat: "mp3_44100_128",
5858
apiKey: "elevenlabs_api_key",
59+
silenceTimeoutMs: 1500,
5960
interruptOnSpeech: true,
6061
},
6162
}
@@ -64,6 +65,7 @@ Supported keys:
6465
Defaults:
6566

6667
- `interruptOnSpeech`: true
68+
- `silenceTimeoutMs`: when unset, Talk keeps the platform default pause window before sending the transcript (`700` ms on macOS and Android, `900` ms on iOS)
6769
- `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID` (or first ElevenLabs voice when API key is available)
6870
- `modelId`: defaults to `eleven_v3` when unset
6971
- `apiKey`: falls back to `ELEVENLABS_API_KEY` (or gateway shell profile if available)

src/config/schema.help.quality.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,7 @@ const TARGET_KEYS = [
305305
"talk.modelId",
306306
"talk.outputFormat",
307307
"talk.interruptOnSpeech",
308+
"talk.silenceTimeoutMs",
308309
"meta",
309310
"env",
310311
"env.shellEnv",

0 commit comments

Comments
 (0)