Skip to content

Commit

Permalink
add getWaveFormat API to AudioStreamFormat to support non-PCM wav formats (#460)

Browse files Browse the repository at this point in the history

* add getWaveFormat API to AudioStreamFormat to support non-PCM wav formats, include test and test files

* extend gitattributes for mulaw, alaw

* Use CTS specific key for CTS tests

* use different CTS key

* key change again
  • Loading branch information
glharper committed Dec 3, 2021
1 parent 0757c64 commit a552ae3
Show file tree
Hide file tree
Showing 9 changed files with 103 additions and 64 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,5 @@ LICENSE text

# Binary extensions:
*.wav binary
*.alaw binary
*.mulaw binary
2 changes: 2 additions & 0 deletions ci/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ jobs:
BotRegion:$(BotRegionJS) ^
SpeakerIDSubscriptionKey:$(SpeakerRecognition-WestUS-Key) ^
SpeakerIDRegion:westus ^
ConversationTranscriptionKey:$(ConverstationTranscriptionKeyWestUSOnline) ^
ConversationTranscriptionRegion:westus ^
CustomVoiceSubscriptionKey:$(speech-ne-s0-key1) ^
CustomVoiceRegion:northeurope
Expand Down
79 changes: 50 additions & 29 deletions src/sdk/Audio/AudioStreamFormat.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,22 @@ export abstract class AudioStreamFormat {
return AudioStreamFormatImpl.getDefaultInputFormat();
}

/**
 * Builds an audio stream format object from the supplied waveform characteristics.
 * Unlike getWaveFormatPCM, this overload also accepts non-PCM encodings.
 * @member AudioStreamFormat.getWaveFormat
 * @function
 * @public
 * @param {number} samplesPerSecond - Sample rate, in samples per second (Hertz).
 * @param {number} bitsPerSample - Bits per sample, typically 16.
 * @param {number} channels - Number of channels in the waveform-audio data; 1 for
 * mono, 2 for stereo.
 * @param {AudioFormatTag} format - Audio format (PCM, alaw or mulaw).
 * @returns {AudioStreamFormat} The audio stream format being created.
 */
public static getWaveFormat(samplesPerSecond: number, bitsPerSample: number, channels: number, format: AudioFormatTag): AudioStreamFormat {
    const streamFormat: AudioStreamFormatImpl = new AudioStreamFormatImpl(samplesPerSecond, bitsPerSample, channels, format);
    return streamFormat;
}

/**
* Creates an audio stream format object with the specified pcm waveformat characteristics.
* @member AudioStreamFormat.getWaveFormatPCM
Expand Down Expand Up @@ -72,6 +88,8 @@ export class AudioStreamFormatImpl extends AudioStreamFormat {
*/
/**
 * Creates a format instance and, for RIFF/WAVE encodings (PCM, alaw, mulaw),
 * pre-builds the standard 44-byte wav header describing the stream.
 * @constructor
 * @param {number} samplesPerSec - Sample rate, in samples per second (Hertz). Defaults to 16000.
 * @param {number} bitsPerSample - Bits per sample. Defaults to 16.
 * @param {number} channels - Number of channels. Defaults to 1 (mono).
 * @param {AudioFormatTag} format - Audio encoding. Defaults to PCM.
 */
public constructor(samplesPerSec: number = 16000, bitsPerSample: number = 16, channels: number = 1, format: AudioFormatTag = AudioFormatTag.PCM) {
    super();

    let isWavFormat: boolean = true;
    /* WAVEFORMATEX wFormatTag values: 1 for PCM; 6 for alaw; 7 for mulaw */
    switch (format) {
        case AudioFormatTag.PCM:
            this.formatTag = 1;
            break;
        case AudioFormatTag.ALaw:
            this.formatTag = 6;
            break;
        case AudioFormatTag.MuLaw:
            this.formatTag = 7;
            break;
        default:
            // Unknown encoding: no RIFF header can be built for it below.
            isWavFormat = false;
    }
    this.bitsPerSample = bitsPerSample;
    this.samplesPerSec = samplesPerSec;
    this.channels = channels;
    this.avgBytesPerSec = this.samplesPerSec * this.channels * (this.bitsPerSample / 8);
    // BUGFIX: block align is channels * BYTES per sample (minimum 1 byte), not bits.
    // The previous formula, channels * Math.max(bitsPerSample, 8), produced 16 for
    // 16-bit mono audio instead of 2 and disagreed with the block-align value this
    // constructor itself writes into the wav header at offset 32 below.
    this.blockAlign = this.channels * Math.max(this.bitsPerSample / 8, 1);

    if (isWavFormat) {
        this.privHeader = new ArrayBuffer(44);

        // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/DataView
        const view = new DataView(this.privHeader);

        /* RIFF identifier */
        this.setString(view, 0, "RIFF");
        /* file length */
        view.setUint32(4, 0, true);
        /* RIFF type & Format */
        this.setString(view, 8, "WAVEfmt ");
        /* format chunk length */
        view.setUint32(16, 16, true);
        /* audio format */
        view.setUint16(20, this.formatTag, true);
        /* channel count */
        view.setUint16(22, this.channels, true);
        /* sample rate */
        view.setUint32(24, this.samplesPerSec, true);
        /* byte rate (sample rate * block align) */
        view.setUint32(28, this.avgBytesPerSec, true);
        /* block align (channel count * bytes per sample) */
        view.setUint16(32, this.channels * (this.bitsPerSample / 8), true);
        /* bits per sample */
        view.setUint16(34, this.bitsPerSample, true);
        /* data chunk identifier */
        this.setString(view, 36, "data");
        /* data chunk length */
        view.setUint32(40, 0, true);
    }
}

/**
Expand Down
2 changes: 1 addition & 1 deletion src/sdk/Exports.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// Licensed under the MIT license.

export { AudioConfig } from "./Audio/AudioConfig";
export { AudioStreamFormat } from "./Audio/AudioStreamFormat";
export { AudioStreamFormat, AudioFormatTag } from "./Audio/AudioStreamFormat";
export { AudioInputStream, PullAudioInputStream, PushAudioInputStream } from "./Audio/AudioInputStream";
export { AudioOutputStream, PullAudioOutputStream, PushAudioOutputStream} from "./Audio/AudioOutputStream";
export { CancellationReason } from "./CancellationReason";
Expand Down
2 changes: 1 addition & 1 deletion tests/ConversationTranscriberTests.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ const CreateConversation: (speechConfig?: sdk.SpeechTranslationConfig) => Promis
};

const BuildSpeechConfig: () => sdk.SpeechTranslationConfig = (): sdk.SpeechTranslationConfig => {
const s: sdk.SpeechTranslationConfig = sdk.SpeechTranslationConfig.fromSubscription(Settings.SpeakerIDSubscriptionKey, Settings.SpeakerIDRegion);
const s: sdk.SpeechTranslationConfig = sdk.SpeechTranslationConfig.fromSubscription(Settings.ConversationTranscriptionKey, Settings.ConversationTranscriptionRegion);
expect(s).not.toBeUndefined();
return s;
};
Expand Down
2 changes: 2 additions & 0 deletions tests/Settings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ export class Settings {
public static WaveFile: string = Settings.InputDir + "whatstheweatherlike.wav";
public static WaveFile8ch: string = Settings.InputDir + "Speech016_30s_xmos_8ch.wav";
public static WaveFile44k: string = Settings.InputDir + "whatstheweatherlike.44khz.wav";
public static WaveFileMulaw: string = Settings.InputDir + "whatstheweatherlike.mulaw";
public static WaveFileAlaw: string = Settings.InputDir + "whatstheweatherlike.alaw";
public static LongerWaveFile: string = Settings.InputDir + "StreamingEnrollment.wav";
public static MonoChannelAlignedWaveFile: string = Settings.InputDir + "only-a-test.wav";
public static WaveFileLanguage: string = "en-US";
Expand Down
78 changes: 45 additions & 33 deletions tests/SpeechRecognizerTests.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1113,53 +1113,65 @@ describe.each([true])("Service based tests", (forceNodeWebSocket: boolean) => {
r.startContinuousRecognitionAsync();
}, 15000);

test("PushStream44K file", (done: jest.DoneCallback) => {
test("PushStream44K, muLaw, Alaw files", async (done: jest.DoneCallback) => {
// tslint:disable-next-line:no-console
console.info("Name: PushStream44K file");
console.info("Name: PushStream44K, muLaw, Alaw files");
const s: sdk.SpeechConfig = BuildSpeechConfig();
objsToClose.push(s);

const format: sdk.AudioStreamFormat = sdk.AudioStreamFormat.getWaveFormatPCM(44100, 16, 1);
const f: ArrayBuffer = WaveFileAudioInput.LoadArrayFromFile(Settings.WaveFile44k);
const p: sdk.PushAudioInputStream = sdk.AudioInputStream.createPushStream(format);
const config: sdk.AudioConfig = sdk.AudioConfig.fromStreamInput(p);
let success: number = 0;

p.write(f);
p.close();
const formatTestFiles: { file: string, sampleRate: number, bitRate: number, channels: number, formatTag: sdk.AudioFormatTag }[] = [
{ file: Settings.WaveFile44k, sampleRate: 44100, bitRate: 16, channels: 1, formatTag: sdk.AudioFormatTag.PCM },
{ file: Settings.WaveFileAlaw, sampleRate: 16000, bitRate: 16, channels: 1, formatTag: sdk.AudioFormatTag.ALaw },
{ file: Settings.WaveFileMulaw, sampleRate: 16000, bitRate: 16, channels: 1, formatTag: sdk.AudioFormatTag.MuLaw },
];

const r: sdk.SpeechRecognizer = new sdk.SpeechRecognizer(s, config);
objsToClose.push(r);
for (const testFile of formatTestFiles) {
const format: sdk.AudioStreamFormat = sdk.AudioStreamFormat.getWaveFormat(testFile.sampleRate, testFile.bitRate, testFile.channels, testFile.formatTag);
const f: ArrayBuffer = WaveFileAudioInput.LoadArrayFromFile(testFile.file);
const p: sdk.PushAudioInputStream = sdk.AudioInputStream.createPushStream(format);
const config: sdk.AudioConfig = sdk.AudioConfig.fromStreamInput(p);

expect(r).not.toBeUndefined();
expect(r instanceof sdk.Recognizer);
p.write(f);
p.close();

r.canceled = (o: sdk.Recognizer, e: sdk.SpeechRecognitionCanceledEventArgs): void => {
try {
expect(e.errorDetails).toBeUndefined();
} catch (error) {
done.fail(error);
}
};
const r: sdk.SpeechRecognizer = new sdk.SpeechRecognizer(s, config);
objsToClose.push(r);

r.recognizeOnceAsync(
(p2: sdk.SpeechRecognitionResult) => {
const res: sdk.SpeechRecognitionResult = p2;
try {
expect(res).not.toBeUndefined();
expect(sdk.ResultReason[res.reason]).toEqual(sdk.ResultReason[sdk.ResultReason.RecognizedSpeech]);
expect(res.text).toEqual("What's the weather like?");
expect(res.properties).not.toBeUndefined();
expect(res.properties.getProperty(sdk.PropertyId.SpeechServiceResponse_JsonResult)).not.toBeUndefined();
expect(r).not.toBeUndefined();
expect(r instanceof sdk.Recognizer);

done();
r.canceled = (o: sdk.Recognizer, e: sdk.SpeechRecognitionCanceledEventArgs): void => {
try {
expect(e.errorDetails).toBeUndefined();
} catch (error) {
done.fail(error);
}
};

},
(error: string) => {
done.fail(error);
});
r.recognizeOnceAsync(
(p2: sdk.SpeechRecognitionResult) => {
const res: sdk.SpeechRecognitionResult = p2;
try {
expect(res).not.toBeUndefined();
expect(sdk.ResultReason[res.reason]).toEqual(sdk.ResultReason[sdk.ResultReason.RecognizedSpeech]);
expect(res.text).toEqual("What's the weather like?");
expect(res.properties).not.toBeUndefined();
expect(res.properties.getProperty(sdk.PropertyId.SpeechServiceResponse_JsonResult)).not.toBeUndefined();

success++;
} catch (error) {
done.fail(error);
}

},
(error: string) => {
done.fail(error);
});

}
WaitForCondition(() => success === 3, done);
});

test("PushStream4KPostRecognizePush", (done: jest.DoneCallback) => {
Expand Down
Binary file added tests/input/audio/whatstheweatherlike.alaw
Binary file not shown.
Binary file added tests/input/audio/whatstheweatherlike.mulaw
Binary file not shown.

0 comments on commit a552ae3

Please sign in to comment.