From e1543823f42fecc55bddfe028edf53634cd8edee Mon Sep 17 00:00:00 2001 From: fqscfqj Date: Tue, 26 May 2026 08:51:06 +0000 Subject: [PATCH 1/7] fix(qwen-asr): enable timestamp output when forced_aligner is configured Two bugs prevented timestamps from working in the qwen-asr backend: 1. transcribe() was called without return_time_stamps=True, so the forced aligner was loaded but never invoked. Now we pass return_time_stamps=True when a forced_aligner is present. 2. The timestamp parsing code expected (list, tuple) items, but the qwen_asr library returns ForcedAlignItem dataclass instances with .text, .start_time, .end_time attributes. Added hasattr() check to handle this correctly, falling back to tuple parsing for backward compatibility. --- backend/python/qwen-asr/backend.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/backend/python/qwen-asr/backend.py b/backend/python/qwen-asr/backend.py index 2d1940afc053..c437470a9ff9 100644 --- a/backend/python/qwen-asr/backend.py +++ b/backend/python/qwen-asr/backend.py @@ -151,7 +151,11 @@ def AudioTranscription(self, request, context): if request.prompt and request.prompt.strip(): context = request.prompt.strip() - results = self.model.transcribe(audio=audio_path, language=language, context=context) + has_aligner = getattr(self.model, 'forced_aligner', None) is not None + results = self.model.transcribe( + audio=audio_path, language=language, context=context, + return_time_stamps=has_aligner, + ) if not results: return backend_pb2.TranscriptResult(segments=[], text="") @@ -164,7 +168,12 @@ def AudioTranscription(self, request, context): start_ms = 0 end_ms = 0 seg_text = text - if isinstance(ts, (list, tuple)) and len(ts) >= 3: + if hasattr(ts, 'start_time') and hasattr(ts, 'end_time') and hasattr(ts, 'text'): + # ForcedAlignItem dataclass (from qwen_asr forced aligner) + start_ms = int(ts.start_time * 1000) if ts.start_time is not None else 0 + end_ms = int(ts.end_time * 1000) if 
ts.end_time is not None else 0 + seg_text = ts.text or "" + elif isinstance(ts, (list, tuple)) and len(ts) >= 3: start_ms = int(float(ts[0]) * 1000) if ts[0] is not None else 0 end_ms = int(float(ts[1]) * 1000) if ts[1] is not None else 0 seg_text = ts[2] if len(ts) > 2 and ts[2] is not None else "" From 346c5d21157ea053d2f459e82e63f613a173516f Mon Sep 17 00:00:00 2001 From: fqscfqj Date: Tue, 26 May 2026 08:51:17 +0000 Subject: [PATCH 2/7] refactor: address Copilot review for qwen-asr timestamps - Wrap return_time_stamps kwarg in try/except TypeError for safety - Add defensive float() normalization for timestamp times - Use str() for text extraction to ensure string type --- backend/python/qwen-asr/backend.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/backend/python/qwen-asr/backend.py b/backend/python/qwen-asr/backend.py index c437470a9ff9..26ea14920115 100644 --- a/backend/python/qwen-asr/backend.py +++ b/backend/python/qwen-asr/backend.py @@ -152,10 +152,13 @@ def AudioTranscription(self, request, context): context = request.prompt.strip() has_aligner = getattr(self.model, 'forced_aligner', None) is not None - results = self.model.transcribe( - audio=audio_path, language=language, context=context, - return_time_stamps=has_aligner, - ) + try: + results = self.model.transcribe( + audio=audio_path, language=language, context=context, + return_time_stamps=has_aligner, + ) + except TypeError: + results = self.model.transcribe(audio=audio_path, language=language, context=context) if not results: return backend_pb2.TranscriptResult(segments=[], text="") @@ -170,9 +173,9 @@ def AudioTranscription(self, request, context): seg_text = text if hasattr(ts, 'start_time') and hasattr(ts, 'end_time') and hasattr(ts, 'text'): # ForcedAlignItem dataclass (from qwen_asr forced aligner) - start_ms = int(ts.start_time * 1000) if ts.start_time is not None else 0 - end_ms = int(ts.end_time * 1000) if ts.end_time is not None else 0 - seg_text 
= ts.text or "" + start_ms = int(float(ts.start_time) * 1000) if ts.start_time is not None else 0 + end_ms = int(float(ts.end_time) * 1000) if ts.end_time is not None else 0 + seg_text = str(ts.text) if ts.text else "" elif isinstance(ts, (list, tuple)) and len(ts) >= 3: start_ms = int(float(ts[0]) * 1000) if ts[0] is not None else 0 end_ms = int(float(ts[1]) * 1000) if ts[1] is not None else 0 From ee2fc0b9c003818f82061686bec9478093890b01 Mon Sep 17 00:00:00 2001 From: fqscfqj Date: Tue, 26 May 2026 09:58:07 +0000 Subject: [PATCH 3/7] fix(qwen-asr): convert seconds to nanoseconds for Go time.Duration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Go server reads TranscriptSegment.start/end via time.Duration, which is in nanoseconds. Previously the backend sent milliseconds (* 1000), causing timestamps to be 1,000,000x too small (e.g. 8e-8 instead of 0.08). Convert seconds → nanoseconds (* 1e9) instead. Also applies to the legacy tuple path for consistency. --- backend/python/qwen-asr/backend.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/backend/python/qwen-asr/backend.py b/backend/python/qwen-asr/backend.py index 26ea14920115..835285bdf97b 100644 --- a/backend/python/qwen-asr/backend.py +++ b/backend/python/qwen-asr/backend.py @@ -168,20 +168,21 @@ def AudioTranscription(self, request, context): if getattr(r, 'time_stamps', None) and len(r.time_stamps) > 0: for idx, ts in enumerate(r.time_stamps): - start_ms = 0 - end_ms = 0 + start_ns = 0 + end_ns = 0 seg_text = text + # Go's time.Duration is in nanoseconds, so convert seconds → ns. 
if hasattr(ts, 'start_time') and hasattr(ts, 'end_time') and hasattr(ts, 'text'): # ForcedAlignItem dataclass (from qwen_asr forced aligner) - start_ms = int(float(ts.start_time) * 1000) if ts.start_time is not None else 0 - end_ms = int(float(ts.end_time) * 1000) if ts.end_time is not None else 0 + start_ns = int(float(ts.start_time) * 1_000_000_000) if ts.start_time is not None else 0 + end_ns = int(float(ts.end_time) * 1_000_000_000) if ts.end_time is not None else 0 seg_text = str(ts.text) if ts.text else "" elif isinstance(ts, (list, tuple)) and len(ts) >= 3: - start_ms = int(float(ts[0]) * 1000) if ts[0] is not None else 0 - end_ms = int(float(ts[1]) * 1000) if ts[1] is not None else 0 + start_ns = int(float(ts[0]) * 1_000_000_000) if ts[0] is not None else 0 + end_ns = int(float(ts[1]) * 1_000_000_000) if ts[1] is not None else 0 seg_text = ts[2] if len(ts) > 2 and ts[2] is not None else "" result_segments.append(backend_pb2.TranscriptSegment( - id=idx, start=start_ms, end=end_ms, text=seg_text + id=idx, start=start_ns, end=end_ns, text=seg_text )) else: if text: From 72d0442de2030be57bd59f443146a1701d32dae1 Mon Sep 17 00:00:00 2001 From: fqscfqj Date: Tue, 26 May 2026 10:05:51 +0000 Subject: [PATCH 4/7] feat(qwen-asr): respect timestamp_granularities (segment vs word) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Read request.timestamp_granularities from the gRPC request. - 'word': return one segment per aligned item (character / word) - 'segment' (default): merge consecutive items at sentence boundaries Sentence boundaries detected via CJK punctuation (。!?;…) and Latin endings (. ! ? ;). This matches the OpenAI Whisper API contract where omitting the parameter defaults to segment-level. 
--- backend/python/qwen-asr/backend.py | 111 +++++++++++++++++++++++------ 1 file changed, 90 insertions(+), 21 deletions(-) diff --git a/backend/python/qwen-asr/backend.py b/backend/python/qwen-asr/backend.py index 835285bdf97b..d5830941e7da 100644 --- a/backend/python/qwen-asr/backend.py +++ b/backend/python/qwen-asr/backend.py @@ -134,6 +134,87 @@ def LoadModel(self, request, context): return backend_pb2.Result(message="Model loaded successfully", success=True) + @staticmethod + def _is_sentence_end(text): + """Check if text ends with sentence-terminating punctuation.""" + if not text: + return False + # CJK + common punctuation + endings = set('。!?;…♪~»))】」』"'》>)') + # Latin endings + endings.update('.!?;') + return text[-1] in endings + + @staticmethod + def _extract_word_info(ts): + """Return (start_sec, end_sec, text) from a ForcedAlignItem or tuple.""" + if hasattr(ts, 'start_time') and hasattr(ts, 'end_time') and hasattr(ts, 'text'): + return ( + float(ts.start_time) if ts.start_time is not None else 0.0, + float(ts.end_time) if ts.end_time is not None else 0.0, + str(ts.text) if ts.text else "", + ) + elif isinstance(ts, (list, tuple)) and len(ts) >= 3: + return ( + float(ts[0]) if ts[0] is not None else 0.0, + float(ts[1]) if ts[1] is not None else 0.0, + ts[2] if len(ts) > 2 and ts[2] is not None else "", + ) + return (0.0, 0.0, "") + + def _build_segments(self, time_stamps, granularity): + """Build TranscriptSegment list from forced-aligner output. 
+ + granularity: + - "word": one segment per aligned item (character / word) + - "segment" (default): merge consecutive items at sentence boundaries + """ + if granularity == "word": + result = [] + for idx, ts in enumerate(time_stamps): + s, e, t = self._extract_word_info(ts) + result.append(backend_pb2.TranscriptSegment( + id=idx, + start=int(s * 1_000_000_000), + end=int(e * 1_000_000_000), + text=t, + )) + return result + + # segment mode — merge at sentence boundaries + result = [] + buf_text = [] + buf_start = None + buf_end = 0.0 + + for ts in time_stamps: + s, e, t = self._extract_word_info(ts) + if buf_start is None: + buf_start = s + buf_text.append(t) + buf_end = e + + if self._is_sentence_end(t): + result.append(backend_pb2.TranscriptSegment( + id=len(result), + start=int(buf_start * 1_000_000_000), + end=int(buf_end * 1_000_000_000), + text="".join(buf_text), + )) + buf_text = [] + buf_start = None + + # flush remaining + if buf_text and buf_start is not None: + result.append(backend_pb2.TranscriptSegment( + id=len(result), + start=int(buf_start * 1_000_000_000), + end=int(buf_end * 1_000_000_000), + text="".join(buf_text), + )) + + return result + def AudioTranscription(self, request, context): result_segments = [] text = "" @@ -147,18 +228,22 @@ def AudioTranscription(self, request, context): if request.language and request.language.strip(): language = request.language.strip() - context = "" + ctx = "" if request.prompt and request.prompt.strip(): - context = request.prompt.strip() + ctx = request.prompt.strip() + + # Determine requested granularity (default: segment) + granularities = list(request.timestamp_granularities) if request.timestamp_granularities else [] + granularity = "word" if "word" in granularities else "segment" has_aligner = getattr(self.model, 'forced_aligner', None) is not None try: results = self.model.transcribe( - audio=audio_path, language=language, context=context, + audio=audio_path, language=language, context=ctx, 
return_time_stamps=has_aligner, ) except TypeError: - results = self.model.transcribe(audio=audio_path, language=language, context=context) + results = self.model.transcribe(audio=audio_path, language=language, context=ctx) if not results: return backend_pb2.TranscriptResult(segments=[], text="") @@ -167,23 +252,7 @@ def AudioTranscription(self, request, context): text = r.text or "" if getattr(r, 'time_stamps', None) and len(r.time_stamps) > 0: - for idx, ts in enumerate(r.time_stamps): - start_ns = 0 - end_ns = 0 - seg_text = text - # Go's time.Duration is in nanoseconds, so convert seconds → ns. - if hasattr(ts, 'start_time') and hasattr(ts, 'end_time') and hasattr(ts, 'text'): - # ForcedAlignItem dataclass (from qwen_asr forced aligner) - start_ns = int(float(ts.start_time) * 1_000_000_000) if ts.start_time is not None else 0 - end_ns = int(float(ts.end_time) * 1_000_000_000) if ts.end_time is not None else 0 - seg_text = str(ts.text) if ts.text else "" - elif isinstance(ts, (list, tuple)) and len(ts) >= 3: - start_ns = int(float(ts[0]) * 1_000_000_000) if ts[0] is not None else 0 - end_ns = int(float(ts[1]) * 1_000_000_000) if ts[1] is not None else 0 - seg_text = ts[2] if len(ts) > 2 and ts[2] is not None else "" - result_segments.append(backend_pb2.TranscriptSegment( - id=idx, start=start_ns, end=end_ns, text=seg_text - )) + result_segments = self._build_segments(r.time_stamps, granularity) else: if text: result_segments.append(backend_pb2.TranscriptSegment( From dd4e86b6ec267ebd758b7c0878ae29772367d3d3 Mon Sep 17 00:00:00 2001 From: fqscfqj Date: Tue, 26 May 2026 10:09:33 +0000 Subject: [PATCH 5/7] fix(qwen-asr): escape smart quotes in punctuation set Unicode curly quotes (U+2018/2019) were being interpreted as Python string delimiters, causing SyntaxError. Use explicit unicode escapes. 
--- backend/python/qwen-asr/backend.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/python/qwen-asr/backend.py b/backend/python/qwen-asr/backend.py index d5830941e7da..65d310b14a91 100644 --- a/backend/python/qwen-asr/backend.py +++ b/backend/python/qwen-asr/backend.py @@ -140,7 +140,9 @@ def _is_sentence_end(text): if not text: return False # CJK + common punctuation - endings = set('。!?;…♪~»))】」』"'》>)') + endings = set("。!?;…♪~»))】」』》>)") + # Smart quotes (U+201C, U+201D, U+2018, U+2019) + endings.update(["\u201c", "\u201d", "\u2018", "\u2019"]) # Latin endings endings.update('.!?;') return text[-1] in endings From 5b4abcb3ab357b8121b0b88f8acbabf0327c98f6 Mon Sep 17 00:00:00 2001 From: fqscfqj Date: Tue, 26 May 2026 10:12:31 +0000 Subject: [PATCH 6/7] fix(qwen-asr): use time-gap threshold for segment boundaries The forced aligner strips punctuation from its output, so text-based sentence detection doesn't work. Instead, detect segment boundaries by measuring time gaps between consecutive aligned items. Threshold = max(median_gap * 4, 0.3s). This cleanly separates intra-sentence gaps (< 0.24s) from inter-sentence gaps (> 0.3s) across Chinese, English, and other languages. 
--- backend/python/qwen-asr/backend.py | 54 +++++++++++++++++++----------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/backend/python/qwen-asr/backend.py b/backend/python/qwen-asr/backend.py index 65d310b14a91..238af01f5100 100644 --- a/backend/python/qwen-asr/backend.py +++ b/backend/python/qwen-asr/backend.py @@ -134,19 +134,6 @@ def LoadModel(self, request, context): return backend_pb2.Result(message="Model loaded successfully", success=True) - @staticmethod - def _is_sentence_end(text): - """Check if text ends with sentence-terminating punctuation.""" - if not text: - return False - # CJK + common punctuation - endings = set("。!?;…♪~»))】」』》>)") - # Smart quotes (U+201C, U+201D, U+2018, U+2019) - endings.update(["\u201c", "\u201d", "\u2018", "\u2019"]) - # Latin endings - endings.update('.!?;') - return text[-1] in endings - @staticmethod def _extract_word_info(ts): """Return (start_sec, end_sec, text) from a ForcedAlignItem or tuple.""" @@ -164,12 +151,34 @@ def _extract_word_info(ts): ) return (0.0, 0.0, "") + @staticmethod + def _compute_gap_threshold(time_stamps): + """Compute a gap threshold for sentence boundary detection. + + Uses the median inter-item gap multiplied by a factor, with a + minimum floor of 0.3s. Returns 0 if there are too few items. + """ + if len(time_stamps) < 2: + return 0.0 + gaps = [] + for i in range(1, len(time_stamps)): + prev_s, prev_e, _ = BackendServicer._extract_word_info(time_stamps[i - 1]) + curr_s, _, _ = BackendServicer._extract_word_info(time_stamps[i]) + gaps.append(curr_s - prev_e) + if not gaps: + return 0.0 + gaps.sort() + median = gaps[len(gaps) // 2] + # threshold = max(median * 4, 0.3s) + return max(median * 4, 0.3) + def _build_segments(self, time_stamps, granularity): """Build TranscriptSegment list from forced-aligner output. 
granularity: - "word": one segment per aligned item (character / word) - - "segment" (default): merge consecutive items at sentence boundaries + - "segment" (default): merge consecutive items, splitting at + time gaps that exceed a dynamic threshold (sentence boundaries). """ if granularity == "word": result = [] @@ -183,20 +192,19 @@ def _build_segments(self, time_stamps, granularity): )) return result - # segment mode — merge at sentence boundaries + # segment mode — merge at time-gap boundaries + threshold = self._compute_gap_threshold(time_stamps) result = [] buf_text = [] buf_start = None buf_end = 0.0 + prev_end = None for ts in time_stamps: s, e, t = self._extract_word_info(ts) - if buf_start is None: - buf_start = s - buf_text.append(t) - buf_end = e - if self._is_sentence_end(t): + # Detect sentence boundary via time gap + if prev_end is not None and (s - prev_end) >= threshold and buf_text: result.append(backend_pb2.TranscriptSegment( id=len(result), start=int(buf_start * 1_000_000_000), @@ -206,6 +214,12 @@ def _build_segments(self, time_stamps, granularity): buf_text = [] buf_start = None + if buf_start is None: + buf_start = s + buf_text.append(t) + buf_end = e + prev_end = e + # flush remaining if buf_text and buf_start is not None: result.append(backend_pb2.TranscriptSegment( From e58c5a37d27cfc4b89261c25d1ff66d824cd816d Mon Sep 17 00:00:00 2001 From: fqscfqj Date: Tue, 26 May 2026 10:25:40 +0000 Subject: [PATCH 7/7] fix(qwen-asr): smart join with spaces for non-CJK tokens The forced aligner strips whitespace from tokenized text, so English words like ['hello', 'world'] were joined as 'helloworld'. Add _smart_join() that inserts spaces between non-CJK tokens while keeping CJK characters and punctuation unspaced. Works for Chinese, English, Korean, Japanese, and mixed-language text. 
--- backend/python/qwen-asr/backend.py | 57 ++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/backend/python/qwen-asr/backend.py b/backend/python/qwen-asr/backend.py index 238af01f5100..196f8f439fb4 100644 --- a/backend/python/qwen-asr/backend.py +++ b/backend/python/qwen-asr/backend.py @@ -134,6 +134,59 @@ def LoadModel(self, request, context): return backend_pb2.Result(message="Model loaded successfully", success=True) + @staticmethod + def _is_cjk(ch): + """Check if a character is CJK (Chinese/Japanese/Korean).""" + cp = ord(ch) + return ( + 0x4E00 <= cp <= 0x9FFF # CJK Unified Ideographs + or 0x3400 <= cp <= 0x4DBF # Extension A + or 0x20000 <= cp <= 0x2A6DF # Extension B + or 0xF900 <= cp <= 0xFAFF # Compatibility Ideographs + or 0x3040 <= cp <= 0x309F # Hiragana + or 0x30A0 <= cp <= 0x30FF # Katakana + or 0xAC00 <= cp <= 0xD7AF # Hangul Syllables + ) + + @staticmethod + def _is_punct(ch): + """Check if a character is punctuation (no space before it).""" + import unicodedata + cat = unicodedata.category(ch) + return cat.startswith('P') + + @staticmethod + def _smart_join(tokens): + """Join tokens with spaces for non-CJK text, without spaces for CJK. 
+ + Rules: + - Between two CJK chars: no space + - Between two non-CJK tokens: space + - Before punctuation: no space + - CJK adjacent to non-CJK: no space (smooth mixed-text transition) + """ + if not tokens: + return "" + result = [tokens[0]] + for token in tokens[1:]: + if not token: + continue + prev_ch = result[-1][-1] if result[-1] else '' + curr_ch = token[0] + # Punctuation never gets a space before it + if BackendServicer._is_punct(curr_ch): + result.append(token) + # CJK to CJK: no space + elif prev_ch and BackendServicer._is_cjk(prev_ch) and BackendServicer._is_cjk(curr_ch): + result.append(token) + # CJK adjacent to non-CJK or vice versa: no space + elif prev_ch and (BackendServicer._is_cjk(prev_ch) or BackendServicer._is_cjk(curr_ch)): + result.append(token) + # Both non-CJK (Latin, Cyrillic, etc.): add space + else: + result.append(' ' + token) + return "".join(result) + @staticmethod def _extract_word_info(ts): """Return (start_sec, end_sec, text) from a ForcedAlignItem or tuple.""" @@ -209,7 +262,7 @@ def _build_segments(self, time_stamps, granularity): id=len(result), start=int(buf_start * 1_000_000_000), end=int(buf_end * 1_000_000_000), - text="".join(buf_text), + text=self._smart_join(buf_text), )) buf_text = [] buf_start = None @@ -226,7 +279,7 @@ def _build_segments(self, time_stamps, granularity): id=len(result), start=int(buf_start * 1_000_000_000), end=int(buf_end * 1_000_000_000), - text="".join(buf_text), + text=self._smart_join(buf_text), )) return result