From 487bf51fac02363dea67cd3ad227dd2e05858a4c Mon Sep 17 00:00:00 2001
From: Christopher Glasz
Date: Mon, 13 Feb 2023 18:33:15 -0500
Subject: [PATCH 1/2] Update SPEAKER_ID logic, remove LONG_SPEAKER_ID

---
 python/AzureSpeechDetection/README.md              |  3 +--
 .../acs_speech_component/acs_speech_component.py   |  9 ---------
 .../acs_speech_component/acs_speech_processor.py   |  4 +++-
 .../AzureSpeechDetection/tests/test_acs_speech.py  | 14 +-------------
 4 files changed, 5 insertions(+), 25 deletions(-)

diff --git a/python/AzureSpeechDetection/README.md b/python/AzureSpeechDetection/README.md
index fa8c07ff..a77b4d92 100644
--- a/python/AzureSpeechDetection/README.md
+++ b/python/AzureSpeechDetection/README.md
@@ -36,8 +36,7 @@ Returned `AudioTrack` objects have the following members in their `detection_pro

 | Property Key        | Description |
 |---------------------|-------------|
-| `SPEAKER_ID`        | An integer speaker identifier, indexed from 1. When a job has been segmented by the Workflow Manager, the ID for all utterances will be overwritten by zero, to avoid confusion (as speaker IDs are not consistent between subjobs). |
-| `LONG_SPEAKER_ID`   | A unique speaker identifier, of the form `<start_offset>-<stop_offset>-<#>`, where `<start_offset>` and `<stop_offset>` are integers indicating the segment range (in frame counts for video jobs, milliseconds for audio jobs) for sub-jobs when a job has been segmented by the Workflow Manager. The final `#` portion of the ID is a 1-indexed counter for speaker identity within the indicated segment range. When jobs are not segmented, or not submitted through the Workflow Manager at all, `<stop_offset>` may instead be `EOF`, indicating that the job extends to the end of the file. |
+| `SPEAKER_ID`        | A unique speaker identifier, of the form `<start_offset>-<stop_offset>-<#>`, where `<start_offset>` and `<stop_offset>` are integers indicating the segment range (in frame counts for video jobs, milliseconds for audio jobs) for sub-jobs when a job has been segmented by the Workflow Manager. The final `#` portion of the ID is a 1-indexed counter for speaker identity within the indicated segment range. When jobs are not segmented, or not submitted through the Workflow Manager at all, `<stop_offset>` may instead be `EOF`, indicating that the job extends to the end of the file. |
 | `GENDER`            | Only present if supplied by an upstream component. The gender of the speaker. |
 | `GENDER_CONFIDENCE` | Only present if supplied by an upstream component. The confidence of the gender classification. |
 | `TRANSCRIPT`        | The text of the utterance transcript. Words are space-separated. |
diff --git a/python/AzureSpeechDetection/acs_speech_component/acs_speech_component.py b/python/AzureSpeechDetection/acs_speech_component/acs_speech_component.py
index d5e83c06..4f6a5d7e 100644
--- a/python/AzureSpeechDetection/acs_speech_component/acs_speech_component.py
+++ b/python/AzureSpeechDetection/acs_speech_component/acs_speech_component.py
@@ -70,15 +70,6 @@ def get_detections_from_job(
             logger.exception(f'Exception raised while processing audio: {e}')
             raise

-        for track in audio_tracks:
-            sid = track.detection_properties['SPEAKER_ID']
-            if job_config.is_triggered_job:
-                track.detection_properties['LONG_SPEAKER_ID'] = sid
-            else:
-                track.detection_properties['LONG_SPEAKER_ID'] = job_config.speaker_id_prefix + sid
-            if job_config.overwrite_ids:
-                track.detection_properties['SPEAKER_ID'] = '0'
-
         logger.info('Processing complete. Found %d tracks.' % len(audio_tracks))
         return audio_tracks

diff --git a/python/AzureSpeechDetection/acs_speech_component/acs_speech_processor.py b/python/AzureSpeechDetection/acs_speech_component/acs_speech_processor.py
index cfca7f7b..e3ad50dc 100644
--- a/python/AzureSpeechDetection/acs_speech_component/acs_speech_processor.py
+++ b/python/AzureSpeechDetection/acs_speech_component/acs_speech_processor.py
@@ -73,6 +73,7 @@ def __init__(self):
     @staticmethod
     def convert_word_timing(
             recognized_phrases: Iterable[Mapping[str, Any]],
+            job_config: AzureJobConfig,
             speaker: Optional[mpf_util.SpeakerInfo] = None
             ) -> Iterable[Utterance]:
         """ Convert ACS recognized_phrases structure to utterances with correct
@@ -96,7 +97,7 @@ def convert_word_timing(
             confidence = phrase['nBest'][0]['confidence']
             word_segments = list(map(get_seg, phrase['nBest'][0]['words']))
             word_confidences = [w['confidence'] for w in phrase['nBest'][0]['words']]
-            speaker_id = str(phrase.get('speaker', '0'))
+            speaker_id = job_config.speaker_id_prefix + str(phrase.get('speaker', '0'))

             # Ensure display text tokens are one-to-one with word segments
             # If not, replace with bare words. This loses punctuation and
@@ -336,6 +337,7 @@ def process_audio(self, job_config: AzureJobConfig) -> List[mpf.AudioTrack]:
         recognized_phrases = transcription['recognizedPhrases']
         utterances = self.convert_word_timing(
             recognized_phrases=recognized_phrases,
+            job_config=job_config,
             speaker=job_config.speaker)

         logger.info('Completed process audio')
diff --git a/python/AzureSpeechDetection/tests/test_acs_speech.py b/python/AzureSpeechDetection/tests/test_acs_speech.py
index 46e82b17..60136ce5 100644
--- a/python/AzureSpeechDetection/tests/test_acs_speech.py
+++ b/python/AzureSpeechDetection/tests/test_acs_speech.py
@@ -184,7 +184,7 @@ def test_diarization(self):
         # There should be two speakers with diarization, one without
         len_raw, len_dia = [
             len(set([
-                track.detection_properties['LONG_SPEAKER_ID']
+                track.detection_properties['SPEAKER_ID']
                 for track in result
             ]))
             for result in results
@@ -192,18 +192,6 @@
         self.assertEqual(1, len_raw)
         self.assertEqual(2, len_dia)

-        # A nonzero start_time indicates to the component that this is a
-        # subjob, so all SPEAKER_IDs should be equal to 0
-        ids_raw, ids_dia = [
-            set([
-                track.detection_properties['SPEAKER_ID']
-                for track in result
-            ])
-            for result in results
-        ]
-        self.assertEqual({'0'}, ids_raw)
-        self.assertEqual({'0'}, ids_dia)
-
     def test_language(self):
         job_en = mpf.AudioJob(
             job_name='test_bilingual_english',

From d8223f9bc4d8824b44063b857299581b95ca8bf5 Mon Sep 17 00:00:00 2001
From: Christopher Glasz
Date: Mon, 20 Feb 2023 13:37:06 -0500
Subject: [PATCH 2/2] Temporarily replace SPEAKER_ID with LONG_SPEAKER_ID

---
 python/AzureSpeechDetection/README.md                 | 3 ++-
 .../acs_speech_component/acs_speech_component.py      | 5 +++++
 python/AzureSpeechDetection/tests/test_acs_speech.py  | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/python/AzureSpeechDetection/README.md b/python/AzureSpeechDetection/README.md
index a77b4d92..95c196e6 100644
--- a/python/AzureSpeechDetection/README.md
+++ b/python/AzureSpeechDetection/README.md
@@ -36,7 +36,8 @@ Returned `AudioTrack` objects have the following members in their `detection_pro

 | Property Key        | Description |
 |---------------------|-------------|
-| `SPEAKER_ID`        | A unique speaker identifier, of the form `<start_offset>-<stop_offset>-<#>`, where `<start_offset>` and `<stop_offset>` are integers indicating the segment range (in frame counts for video jobs, milliseconds for audio jobs) for sub-jobs when a job has been segmented by the Workflow Manager. The final `#` portion of the ID is a 1-indexed counter for speaker identity within the indicated segment range. When jobs are not segmented, or not submitted through the Workflow Manager at all, `<stop_offset>` may instead be `EOF`, indicating that the job extends to the end of the file. |
+| `LONG_SPEAKER_ID`   | A unique speaker identifier, of the form `<start_offset>-<stop_offset>-<#>`, where `<start_offset>` and `<stop_offset>` are integers indicating the segment range (in frame counts for video jobs, milliseconds for audio jobs) for sub-jobs when a job has been segmented by the Workflow Manager. The final `#` portion of the ID is a 1-indexed counter for speaker identity within the indicated segment range. When jobs are not segmented, or not submitted through the Workflow Manager at all, `<stop_offset>` may instead be `EOF`, indicating that the job extends to the end of the file. |
+| `SPEAKER_ID`        | A dummy field set to "0". |
 | `GENDER`            | Only present if supplied by an upstream component. The gender of the speaker. |
 | `GENDER_CONFIDENCE` | Only present if supplied by an upstream component. The confidence of the gender classification. |
 | `TRANSCRIPT`        | The text of the utterance transcript. Words are space-separated. |
diff --git a/python/AzureSpeechDetection/acs_speech_component/acs_speech_component.py b/python/AzureSpeechDetection/acs_speech_component/acs_speech_component.py
index 4f6a5d7e..d283943f 100644
--- a/python/AzureSpeechDetection/acs_speech_component/acs_speech_component.py
+++ b/python/AzureSpeechDetection/acs_speech_component/acs_speech_component.py
@@ -70,6 +70,11 @@ def get_detections_from_job(
             logger.exception(f'Exception raised while processing audio: {e}')
             raise

+        # Remove this block to drop LONG_SPEAKER_ID
+        for track in audio_tracks:
+            track.detection_properties['LONG_SPEAKER_ID'] = track.detection_properties['SPEAKER_ID']
+            track.detection_properties['SPEAKER_ID'] = '0'
+
         logger.info('Processing complete. Found %d tracks.' % len(audio_tracks))
         return audio_tracks

diff --git a/python/AzureSpeechDetection/tests/test_acs_speech.py b/python/AzureSpeechDetection/tests/test_acs_speech.py
index 60136ce5..8304f040 100644
--- a/python/AzureSpeechDetection/tests/test_acs_speech.py
+++ b/python/AzureSpeechDetection/tests/test_acs_speech.py
@@ -184,7 +184,7 @@ def test_diarization(self):
         # There should be two speakers with diarization, one without
         len_raw, len_dia = [
             len(set([
-                track.detection_properties['SPEAKER_ID']
+                track.detection_properties['LONG_SPEAKER_ID']
                 for track in result
             ]))
             for result in results
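
Note: the following is a minimal sketch, not part of either patch, of how a downstream
consumer might interpret the LONG_SPEAKER_ID format described in the README table above.
The helper name parse_long_speaker_id and the example offset values are hypothetical.

    from typing import Optional, Tuple

    def parse_long_speaker_id(long_id: str) -> Tuple[int, Optional[int], int]:
        """Split a '<start_offset>-<stop_offset>-<#>' identifier into its parts.

        The stop offset may be the literal 'EOF' (the job extends to the end of
        the file), in which case None is returned for it.
        """
        start, stop, counter = long_id.split('-')
        stop_offset = None if stop == 'EOF' else int(stop)
        return int(start), stop_offset, int(counter)

    # Example (hypothetical values): the first speaker in a sub-job covering
    # milliseconds 0 through 30000 of an audio job, and an unsegmented job.
    assert parse_long_speaker_id('0-30000-1') == (0, 30000, 1)
    assert parse_long_speaker_id('0-EOF-2') == (0, None, 2)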