fix: repair failing speech tests (#2179)

* fix: repair failing speech tests
* update SAS URL for audio2 test file

---------

Co-authored-by: Brendan Walsh <brwals@outlook.com>
Co-authored-by: Mark Hamilton <mhamilton723@gmail.com>
microsoft · Mar 20, 2024 · 9aea411 · 9aea411
1 parent 3f4e12c
commit 9aea411
Show file tree

Hide file tree

Showing 4 changed files with 35 additions and 22 deletions.
diff --git a/...c/main/scala/com/microsoft/azure/synapse/ml/services/speech/SpeakerEmotionInference.scala b/...c/main/scala/com/microsoft/azure/synapse/ml/services/speech/SpeakerEmotionInference.scala
@@ -29,7 +29,7 @@ class SpeakerEmotionInference(override val uid: String)
 
   setDefault(
     locale -> Left("en-US"),
-    voiceName -> Left("en-US-JennyNeural"),
+    voiceName -> Left("en-US-JaneNeural"),
     text -> Left(this.uid + "_text"))
 
   def urlPath: String = "cognitiveservices/v1"
@@ -54,7 +54,7 @@ class SpeakerEmotionInference(override val uid: String)
   override protected def prepareEntity: Row => Option[AbstractHttpEntity] = { row =>
     val body: String =
       s"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='https://www.w3.org/2001/mstts'" +
-        s" xml:lang='en-US'><voice name='Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)'>" +
+        s" xml:lang='en-US'><voice name='Microsoft Server Speech Text to Speech Voice (en-US, JaneNeural)'>" +
         s"<mstts:task name ='RoleStyle'/>${getValue(row, text)}</voice></speak>"
     Some(new StringEntity(body))
   }

diff --git a/...t/scala/com/microsoft/azure/synapse/ml/services/speech/SpeakerEmotionInferenceSuite.scala b/...t/scala/com/microsoft/azure/synapse/ml/services/speech/SpeakerEmotionInferenceSuite.scala
@@ -20,42 +20,55 @@ class SpeakerEmotionInferenceSuite extends TransformerFuzzing[SpeakerEmotionInfe
     .setLocation("eastus")
     .setSubscriptionKey(cognitiveKey)
     .setLocale("en-US")
-    .setVoiceName("en-US-JennyNeural")
+    .setVoiceName("en-US-JaneNeural")
     .setTextCol("text")
     .setOutputCol("ssml")
 
   val testData: Map[String, String] = Map[String, String](
     ("\"A\" \"B\" \"C\"",
       "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='https://www.w3.org/2001/mstts' " +
-        "xml:lang='en-US'><voice name='en-US-JennyNeural'>" +
+        "xml:lang='en-US'><voice name='en-US-JaneNeural'>" +
         "<mstts:express-as role='male' style='calm'>\"A\"</mstts:express-as> " +
         "<mstts:express-as role='male' style='calm'>\"B\"</mstts:express-as> " +
         "<mstts:express-as role='male' style='calm'>\"C\"</mstts:express-as></voice></speak>\n"),
     ("\"I'm shouting excitedly!\" she shouted excitedly.",
       "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' " +
-        "xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'><voice name='en-US-JennyNeural'>" +
+        "xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'><voice name='en-US-JaneNeural'>" +
         "<mstts:express-as role='female' style='excited'>\"I'm shouting excitedly!\"</mstts:express-as> she shouted " +
         "excitedly.</voice></speak>\n"),
     ("This text has no quotes in it, so isValid should be false",
       "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' " +
-        "xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'><voice name='en-US-JennyNeural'>" +
+        "xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'><voice name='en-US-JaneNeural'>" +
         "This text has no quotes in it, so isValid should be false</voice></speak>\n"),
     ("\"This is an example of a sentence with unmatched quotes,\" she said.\"",
       "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' " +
-        "xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'><voice name='en-US-JennyNeural'>" +
+        "xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'><voice name='en-US-JaneNeural'>" +
         "<mstts:express-as role='female' style='calm'>\"This is an example of a sentence with unmatched quotes,\"" +
         "</mstts:express-as> she said.\"</voice></speak>\n"))
 
   lazy val df: DataFrame = testData.keys.toSeq.toDF("text")
 
+  def normalizeSSML(ssml: String): String = {
+    val ignoredAttributes: List[String] = List("name", "style", "role")
+    ignoredAttributes.foldLeft(ssml)((acc, attr) =>
+      acc.replaceAll(s"""\\s+$attr='[^']*'""", s"$attr="))
+  }
+
+  /*
+    We test the structure of the returned SSML, not the quality of the API, so name, role and style values are ignored
+   */
+  def assertFuzzyEquals(actualSSML: String, expectedSSML: String): Unit = {
+    assert(normalizeSSML(expectedSSML).equals(normalizeSSML(actualSSML)))
+  }
+
   test("basic") {
     val transformed = ssmlGenerator.transform(df)
     transformed.show(truncate = false)
-    transformed.collect().map(row => {
+    transformed.collect().foreach { row =>
       val actual = testData.getOrElse(row.getString(0), "")
       val expected = row.getString(2)
-      assert(actual.equals(expected))
-    })
+      assertFuzzyEquals(actual, expected)
+    }
   }
 
   test("arbitrary df size") {
@@ -65,9 +78,9 @@ class SpeakerEmotionInferenceSuite extends TransformerFuzzing[SpeakerEmotionInfe
         val actual = row.getString(5)
         val expected =
           """<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' """ +
-            """xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'><voice name='en-US-JennyNeural'>""" +
+            """xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'><voice name='en-US-JaneNeural'>""" +
             s"""Hello</voice></speak>\n"""
-        assert(actual.equals(expected))
+        assertFuzzyEquals(actual, expected)
       })
   }
 
@@ -77,7 +90,7 @@ class SpeakerEmotionInferenceSuite extends TransformerFuzzing[SpeakerEmotionInfe
       SSMLConversation(5, 8, """"B"""", "male", "calm"),
       SSMLConversation(10, 13, """"C"""", "male", "calm")))) ->
       ("""<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' """ +
-        """xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'><voice name='en-US-JennyNeural'>""" +
+        """xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'><voice name='en-US-JaneNeural'>""" +
         """<mstts:express-as role='male' style='calm'>"A"</mstts:express-as>, """ +
         """<mstts:express-as role='male' style='calm'>"B"</mstts:express-as>, """ +
         """<mstts:express-as role='male' style='calm'>"C"</mstts:express-as></voice></speak>""" + "\n")),
@@ -86,7 +99,7 @@ class SpeakerEmotionInferenceSuite extends TransformerFuzzing[SpeakerEmotionInfe
       SSMLConversation(5, 8, """"B"""", "male", "calm"),
       SSMLConversation(9, 12, """"C"""", "male", "calm")))) ->
       ("""<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' """ +
-        """xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'><voice name='en-US-JennyNeural'>Z""" +
+        """xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'><voice name='en-US-JaneNeural'>Z""" +
         """<mstts:express-as role='male' style='calm'>"A"</mstts:express-as>Z<mstts:express-as role='male' """ +
         """style='calm'>"B"</mstts:express-as>Z<mstts:express-as role='male' style='calm'>"C"""" +
         """</mstts:express-as>Z</voice></speak>""" + "\n")),
@@ -96,7 +109,7 @@ class SpeakerEmotionInferenceSuite extends TransformerFuzzing[SpeakerEmotionInfe
       SSMLConversation(6, 9, """"C"""", "male", "calm")))) ->
       ("""<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' """ +
         """xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'>""" +
-        """<voice name='en-US-JennyNeural'><mstts:express-as role='male' style='calm'>"A"""" +
+        """<voice name='en-US-JaneNeural'><mstts:express-as role='male' style='calm'>"A"""" +
         """</mstts:express-as><mstts:express-as role='male' style='calm'>"B"</mstts:express-as>""" +
         """<mstts:express-as role='male' style='calm'>"C"</mstts:express-as></voice></speak>""" + "\n")))
 
@@ -105,7 +118,7 @@ class SpeakerEmotionInferenceSuite extends TransformerFuzzing[SpeakerEmotionInfe
       val result = ssmlGenerator.formatSSML(
         test._1._1,
         "en-US",
-        "en-US-JennyNeural",
+        "en-US-JaneNeural",
         test._1._2)
       assertResult(test._2)(result)
     })

diff --git a/.../src/test/scala/com/microsoft/azure/synapse/ml/services/speech/SpeechToTextSDKSuite.scala b/.../src/test/scala/com/microsoft/azure/synapse/ml/services/speech/SpeechToTextSDKSuite.scala
@@ -232,9 +232,9 @@ class SpeechToTextSDKSuite extends TransformerFuzzing[SpeechToTextSDK] with Spee
   }
 
   test("SAS URL based access") {
-    val sasURL = "https://mmlspark.blob.core.windows.net/datasets/Speech/audio2.wav?sv=2019-12-12" +
-      "?sv=2021-10-04&st=2024-02-28T16%3A17%3A55Z&se=2026-03-30T15%3A33%3A00Z" +
-      "&sr=c&sp=rl&sig=5Oy6pEaF4hN3lj8uo6daLN%2F%2BiV9VD6XFNSy%2FZ8Upeeg%3D"
+    val sasURL = "https://mmlspark.blob.core.windows.net/datasets/Speech/audio2.wav" +
+      "?sp=r&st=2024-03-18T20:17:56Z&se=9999-03-19T04:17:56Z&spr=https&sv=2022-11-02" +
+      "&sr=b&sig=JUU1ojKzTbb45bSP7rOAVXajwrUEp9Ux20oCiD8%2Bb%2FM%3D"
 
     tryWithRetries(Array(100, 500)) { () => //For handling flaky build machines
       val uriDf = Seq(Tuple1(sasURL))
@@ -429,8 +429,8 @@ class ConversationTranscriptionSuite extends TransformerFuzzing[ConversationTran
 
   test("SAS URL based access") {
     val sasURL = "https://mmlspark.blob.core.windows.net/datasets/Speech/audio2.wav" +
-      "?sv=2021-10-04&st=2024-02-28T16%3A17%3A55Z&se=2026-03-30T15%3A33%3A00Z" +
-      "&sr=c&sp=rl&sig=5Oy6pEaF4hN3lj8uo6daLN%2F%2BiV9VD6XFNSy%2FZ8Upeeg%3D"
+      "?sp=r&st=2024-03-18T20:17:56Z&se=9999-03-19T04:17:56Z&spr=https&sv=2022-11-02" +
+      "&sr=b&sig=JUU1ojKzTbb45bSP7rOAVXajwrUEp9Ux20oCiD8%2Bb%2FM%3D"
 
     tryWithRetries(Array(100, 500)) { () => //For handling flaky build machines
       val uriDf = Seq(Tuple1(sasURL))

diff --git a/...ive/src/test/scala/com/microsoft/azure/synapse/ml/services/speech/TextToSpeechSuite.scala b/...ive/src/test/scala/com/microsoft/azure/synapse/ml/services/speech/TextToSpeechSuite.scala
@@ -43,7 +43,7 @@ class TextToSpeechSuite extends TransformerFuzzing[TextToSpeech] with CognitiveK
     """<speak xmlns="http://www.w3.org/2001/10/synthesis" """ +
       """xmlns:mstts="http://www.w3.org/2001/mstts" """ +
       """xmlns:emo="http://www.w3.org/2009/10/emotionml" version="1.0" xml:lang="en-US">""" +
-      """<voice name="en-US-JennyNeural"><mstts:express-as role='female' style='terrified'>""" +
+      """<voice name="en-US-JaneNeural"><mstts:express-as role='female' style='terrified'>""" +
       """This is how I sound right now.</mstts:express-as></voice></speak>""",
     new File(saveDir, "test1.mp3").toString)).toDF("text", "filename")