From 487bf51fac02363dea67cd3ad227dd2e05858a4c Mon Sep 17 00:00:00 2001
From: Christopher Glasz
Date: Mon, 13 Feb 2023 18:33:15 -0500
Subject: [PATCH 1/2] Update SPEAKER_ID logic, remove LONG_SPEAKER_ID

---
 python/AzureSpeechDetection/README.md              |  3 +--
 .../acs_speech_component/acs_speech_component.py   |  9 ---------
 .../acs_speech_component/acs_speech_processor.py   |  4 +++-
 .../AzureSpeechDetection/tests/test_acs_speech.py  | 14 +-------------
 4 files changed, 5 insertions(+), 25 deletions(-)

diff --git a/python/AzureSpeechDetection/README.md b/python/AzureSpeechDetection/README.md
index fa8c07ff..a77b4d92 100644
--- a/python/AzureSpeechDetection/README.md
+++ b/python/AzureSpeechDetection/README.md
@@ -36,8 +36,7 @@ Returned `AudioTrack` objects have the following members in their `detection_pro

 | Property Key        | Description |
 |---------------------|-------------|
-| `SPEAKER_ID`        | An integer speaker identifier, indexed from 1. When a job has been segmented by the Workflow Manager, the ID for all utterances will be overwritten by zero, to avoid confusion (as speaker IDs are not consistent between subjobs). |
-| `LONG_SPEAKER_ID`   | A unique speaker identifier, of the form `<start_offset>-<stop_offset>-<#>`, where `<start_offset>` and `<stop_offset>` are integers indicating the segment range (in frame counts for video jobs, milliseconds for audio jobs) for sub-jobs when a job has been segmented by the Workflow Manager. The final `#` portion of the ID is a 1-indexed counter for speaker identity within the indicated segment range. When jobs are not segmented, or not submitted through the Workflow Manager at all, `<stop_offset>` may instead be `EOF`, indicating that the job extends to the end of the file. |
+| `SPEAKER_ID`        | A unique speaker identifier, of the form `<start_offset>-<stop_offset>-<#>`, where `<start_offset>` and `<stop_offset>` are integers indicating the segment range (in frame counts for video jobs, milliseconds for audio jobs) for sub-jobs when a job has been segmented by the Workflow Manager. The final `#` portion of the ID is a 1-indexed counter for speaker identity within the indicated segment range. When jobs are not segmented, or not submitted through the Workflow Manager at all, `<stop_offset>` may instead be `EOF`, indicating that the job extends to the end of the file. |
 | `GENDER`            | Only present if supplied by an upstream component. The gender of the speaker. |
 | `GENDER_CONFIDENCE` | Only present if supplied by an upstream component. The confidence of the gender classification. |
 | `TRANSCRIPT`        | The text of the utterance transcript. Words are space-separated. |
diff --git a/python/AzureSpeechDetection/acs_speech_component/acs_speech_component.py b/python/AzureSpeechDetection/acs_speech_component/acs_speech_component.py
index d5e83c06..4f6a5d7e 100644
--- a/python/AzureSpeechDetection/acs_speech_component/acs_speech_component.py
+++ b/python/AzureSpeechDetection/acs_speech_component/acs_speech_component.py
@@ -70,15 +70,6 @@ def get_detections_from_job(
             logger.exception(f'Exception raised while processing audio: {e}')
             raise

-        for track in audio_tracks:
-            sid = track.detection_properties['SPEAKER_ID']
-            if job_config.is_triggered_job:
-                track.detection_properties['LONG_SPEAKER_ID'] = sid
-            else:
-                track.detection_properties['LONG_SPEAKER_ID'] = job_config.speaker_id_prefix + sid
-            if job_config.overwrite_ids:
-                track.detection_properties['SPEAKER_ID'] = '0'
-
         logger.info('Processing complete. Found %d tracks.' % len(audio_tracks))
         return audio_tracks

diff --git a/python/AzureSpeechDetection/acs_speech_component/acs_speech_processor.py b/python/AzureSpeechDetection/acs_speech_component/acs_speech_processor.py
index cfca7f7b..e3ad50dc 100644
--- a/python/AzureSpeechDetection/acs_speech_component/acs_speech_processor.py
+++ b/python/AzureSpeechDetection/acs_speech_component/acs_speech_processor.py
@@ -73,6 +73,7 @@ def __init__(self):
     @staticmethod
     def convert_word_timing(
             recognized_phrases: Iterable[Mapping[str, Any]],
+            job_config: AzureJobConfig,
             speaker: Optional[mpf_util.SpeakerInfo] = None
             ) -> Iterable[Utterance]:
         """ Convert ACS recognized_phrases structure to utterances with correct
@@ -96,7 +97,7 @@ def convert_word_timing(
             confidence = phrase['nBest'][0]['confidence']
             word_segments = list(map(get_seg, phrase['nBest'][0]['words']))
             word_confidences = [w['confidence'] for w in phrase['nBest'][0]['words']]
-            speaker_id = str(phrase.get('speaker', '0'))
+            speaker_id = job_config.speaker_id_prefix + str(phrase.get('speaker', '0'))

             # Ensure display text tokens are one-to-one with word segments
             # If not, replace with bare words. This loses punctuation and
@@ -336,6 +337,7 @@ def process_audio(self, job_config: AzureJobConfig) -> List[mpf.AudioTrack]:
         recognized_phrases = transcription['recognizedPhrases']
         utterances = self.convert_word_timing(
             recognized_phrases=recognized_phrases,
+            job_config=job_config,
             speaker=job_config.speaker)

         logger.info('Completed process audio')
diff --git a/python/AzureSpeechDetection/tests/test_acs_speech.py b/python/AzureSpeechDetection/tests/test_acs_speech.py
index 46e82b17..60136ce5 100644
--- a/python/AzureSpeechDetection/tests/test_acs_speech.py
+++ b/python/AzureSpeechDetection/tests/test_acs_speech.py
@@ -184,7 +184,7 @@ def test_diarization(self):
         # There should be two speakers with diarization, one without
         len_raw, len_dia = [
             len(set([
-                track.detection_properties['LONG_SPEAKER_ID']
+                track.detection_properties['SPEAKER_ID']
                 for track in result
             ]))
             for result in results
@@ -192,18 +192,6 @@
         self.assertEqual(1, len_raw)
         self.assertEqual(2, len_dia)

-        # A nonzero start_time indicates to the component that this is a
-        # subjob, so all SPEAKER_IDs should be equal to 0
-        ids_raw, ids_dia = [
-            set([
-                track.detection_properties['SPEAKER_ID']
-                for track in result
-            ])
-            for result in results
-        ]
-        self.assertEqual({'0'}, ids_raw)
-        self.assertEqual({'0'}, ids_dia)
-
     def test_language(self):
         job_en = mpf.AudioJob(
             job_name='test_bilingual_english',

From d8223f9bc4d8824b44063b857299581b95ca8bf5 Mon Sep 17 00:00:00 2001
From: Christopher Glasz
Date: Mon, 20 Feb 2023 13:37:06 -0500
Subject: [PATCH 2/2] Temporarily replace SPEAKER_ID with LONG_SPEAKER_ID

---
 python/AzureSpeechDetection/README.md                 | 3 ++-
 .../acs_speech_component/acs_speech_component.py      | 5 +++++
 python/AzureSpeechDetection/tests/test_acs_speech.py  | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/python/AzureSpeechDetection/README.md b/python/AzureSpeechDetection/README.md
index a77b4d92..95c196e6 100644
--- a/python/AzureSpeechDetection/README.md
+++ b/python/AzureSpeechDetection/README.md
@@ -36,7 +36,8 @@ Returned `AudioTrack` objects have the following members in their `detection_pro

 | Property Key        | Description |
 |---------------------|-------------|
-| `SPEAKER_ID`        | A unique speaker identifier, of the form `<start_offset>-<stop_offset>-<#>`, where `<start_offset>` and `<stop_offset>` are integers indicating the segment range (in frame counts for video jobs, milliseconds for audio jobs) for sub-jobs when a job has been segmented by the Workflow Manager. The final `#` portion of the ID is a 1-indexed counter for speaker identity within the indicated segment range. When jobs are not segmented, or not submitted through the Workflow Manager at all, `<stop_offset>` may instead be `EOF`, indicating that the job extends to the end of the file. |
+| `LONG_SPEAKER_ID`   | A unique speaker identifier, of the form `<start_offset>-<stop_offset>-<#>`, where `<start_offset>` and `<stop_offset>` are integers indicating the segment range (in frame counts for video jobs, milliseconds for audio jobs) for sub-jobs when a job has been segmented by the Workflow Manager. The final `#` portion of the ID is a 1-indexed counter for speaker identity within the indicated segment range. When jobs are not segmented, or not submitted through the Workflow Manager at all, `<stop_offset>` may instead be `EOF`, indicating that the job extends to the end of the file. |
+| `SPEAKER_ID`        | A dummy field set to "0". |
 | `GENDER`            | Only present if supplied by an upstream component. The gender of the speaker. |
 | `GENDER_CONFIDENCE` | Only present if supplied by an upstream component. The confidence of the gender classification. |
 | `TRANSCRIPT`        | The text of the utterance transcript. Words are space-separated. |
diff --git a/python/AzureSpeechDetection/acs_speech_component/acs_speech_component.py b/python/AzureSpeechDetection/acs_speech_component/acs_speech_component.py
index 4f6a5d7e..d283943f 100644
--- a/python/AzureSpeechDetection/acs_speech_component/acs_speech_component.py
+++ b/python/AzureSpeechDetection/acs_speech_component/acs_speech_component.py
@@ -70,6 +70,11 @@ def get_detections_from_job(
             logger.exception(f'Exception raised while processing audio: {e}')
             raise

+        # Remove this block to drop LONG_SPEAKER_ID
+        for track in audio_tracks:
+            track.detection_properties['LONG_SPEAKER_ID'] = track.detection_properties['SPEAKER_ID']
+            track.detection_properties['SPEAKER_ID'] = '0'
+
         logger.info('Processing complete. Found %d tracks.' % len(audio_tracks))
         return audio_tracks

diff --git a/python/AzureSpeechDetection/tests/test_acs_speech.py b/python/AzureSpeechDetection/tests/test_acs_speech.py
index 60136ce5..8304f040 100644
--- a/python/AzureSpeechDetection/tests/test_acs_speech.py
+++ b/python/AzureSpeechDetection/tests/test_acs_speech.py
@@ -184,7 +184,7 @@ def test_diarization(self):
         # There should be two speakers with diarization, one without
         len_raw, len_dia = [
             len(set([
-                track.detection_properties['SPEAKER_ID']
+                track.detection_properties['LONG_SPEAKER_ID']
                 for track in result
             ]))
             for result in results
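
Note: the following is a minimal sketch, not part of either patch, of how a downstream
consumer might interpret the LONG_SPEAKER_ID format described in the README table above.
The helper name parse_long_speaker_id and the example offset values are hypothetical.

    from typing import Optional, Tuple

    def parse_long_speaker_id(long_id: str) -> Tuple[int, Optional[int], int]:
        """Split a '<start_offset>-<stop_offset>-<#>' identifier into its parts.

        The stop offset may be the literal 'EOF' (the job extends to the end of
        the file), in which case None is returned for it.
        """
        start, stop, counter = long_id.split('-')
        stop_offset = None if stop == 'EOF' else int(stop)
        return int(start), stop_offset, int(counter)

    # Example (hypothetical values): the first speaker in a sub-job covering
    # milliseconds 0 through 30000 of an audio job, and an unsegmented job.
    assert parse_long_speaker_id('0-30000-1') == (0, 30000, 1)
    assert parse_long_speaker_id('0-EOF-2') == (0, None, 2)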