diff --git a/python/WhisperSpeechDetection/Dockerfile b/python/WhisperSpeechDetection/Dockerfile new file mode 100644 index 00000000..a35c8ea5 --- /dev/null +++ b/python/WhisperSpeechDetection/Dockerfile @@ -0,0 +1,51 @@ +# syntax=docker/dockerfile:1.2 + +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +ARG BUILD_REGISTRY +ARG BUILD_TAG=latest +FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} + +ARG RUN_TESTS=false + +RUN pip install --no-cache-dir openai-whisper==20230314 'numpy<1.24,>=1.18' + +RUN python -c 'import whisper; whisper.load_model("base")' +RUN python -c 'import whisper; whisper.load_model("base.en")' +RUN python -c 'import whisper; whisper.load_model("tiny")' +RUN python -c 'from tiktoken_ext.openai_public import gpt2; gpt2()' + +RUN --mount=target=.,readwrite \ + install-component.sh; \ + if [ "${RUN_TESTS,,}" == true ]; then python tests/test_whisper_detection.py; fi + +LABEL org.label-schema.license="Apache 2.0" \ + org.label-schema.name="OpenMPF Whisper Speech Detection" \ + org.label-schema.schema-version="1.0" \ + org.label-schema.url="https://openmpf.github.io" \ + org.label-schema.vcs-url="https://github.com/openmpf/openmpf-components" \ + org.label-schema.vendor="MITRE" diff --git a/python/WhisperSpeechDetection/LICENSE b/python/WhisperSpeechDetection/LICENSE new file mode 100644 index 00000000..6c2e4117 --- /dev/null +++ b/python/WhisperSpeechDetection/LICENSE @@ -0,0 +1,31 @@ +/****************************************************************************** + * Copyright 2023 The MITRE Corporation * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. 
* + ******************************************************************************/ + + This project contains content developed by The MITRE Corporation. If this code + is used in a deployment or embedded within another project, it is requested + that you send an email to opensource@mitre.org in order to let us know where + this software is being used. + + + + This software makes use of source code derived from third party software: + + ------------------------------------------------------------------------------ + + Whisper is licensed under MIT License. + Copyright (c) 2022 OpenAI + + Project page: https://github.com/openai/whisper \ No newline at end of file diff --git a/python/WhisperSpeechDetection/README.md b/python/WhisperSpeechDetection/README.md new file mode 100644 index 00000000..40fa79e6 --- /dev/null +++ b/python/WhisperSpeechDetection/README.md @@ -0,0 +1,109 @@ +# Overview + +This repository contains source code and model data for the OpenMPF Whisper Speech Detection component. +This component uses the OpenAI Whisper model. + +# Introduction + +This component identifies the language spoken in audio and video clips. + +# Input Properties +- `WHISPER_MODEL_SIZE`: Size of the Whisper model. Whisper has `tiny`, `base`, `small`, `medium`, and `large` models available for multilingual models. English-only models are available in `tiny`, `base`, `small`, and `medium`. +- `WHISPER_MODEL_LANG`: Whisper has English-only models and multilingual models. Set to `en` for English-only models and `multi` for multilingual models. +- `WHISPER_MODE`: Determines whether Whisper will perform language detection, speech-to-text transcription, or speech translation. English-only models can only transcribe English audio. Set to `LANGUAGE_DETECTION` for spoken language detection, `TRANSCRIPTION` for speech-to-text transcription, and `SPEECH_TRANSLATION` for speech translation. 
+- `AUDIO_LANGUAGE`: Optional property that indicates the language to use for audio translation or transcription. If left as an empty string, Whisper will automatically detect a single language from the first 30 seconds of audio. + +# Output Properties +- `DETECTED_LANGUAGE`: Language with the highest confidence value. +- `DETECTED_LANGUAGE_CONFIDENCE`: The confidence value of the language with the highest confidence. +- `TRANSCRIPT`: Returns transcript of audio for transcription and translation runs. +- `TRANSLATED_AUDIO`: Returns the translated text for translated audio runs. + +# Behavior +Some quirks in Whisper's behavior when transcribing or translating audio with multiple languages have been observed. + +### Transcribe ### + +Size | Provided Language | Result for Spanish Part | Result for English Part +------|-------------------|-------------------------|------------------------- +base | Auto-detected | Correctly transcribed | Gibberish +large | Auto-detected | Correctly transcribed | Translated to Spanish +base | English | Translated to English | Correctly transcribed +large | English | Translated to English | Correctly transcribed + + +### Translate ### + +Size | Provided Language | Result for Spanish Part | Result for English Part +------|-------------------|-------------------------|------------------------ +base | Auto-detected | Correctly translated | Not included in output +base | English | Correctly translated | Transcribed +large | Auto-detected | Correctly translated | Mostly skipped +large | English | Correctly translated | Mostly skipped + + +See [whisper_behavior_notes.md](whisper_behavior_notes.md) for more details. + +# Language Identifiers +The following are the ISO 639-1 codes, the ISO 639-3 codes, and their corresponding languages which Whisper can translate to English. + +All translations are to English. 
+ +| ISO-639-1 | ISO-639-3 | Language | +| --- |---|------------------| +| `af` | `afr` | Afrikaans | +| `ar` | `ara` | Arabic | +| `hy` | `hye` | Armenian | +| `az` | `aze` | Azerbaijani | +| `be` | `bel` | Belarusian | +| `bs` | `bos` | Bosnian | +| `bg` | `bul` | Bulgarian | +| `ca` | `cat` | Catalan | +| `zh` | `zho` | Chinese | +| `hr` | `hrv` | Croatian | +| `cs` | `ces` | Czech | +| `da` | `dan` | Danish | +| `nl` | `nld` | Dutch | +| `en` | `eng` | English | +| `fi` | `fin` | Finnish | +| `fr` | `fra` | French | +| `gl` | `glg` | Galician | +| `de` | `deu` | German | +| `el` | `ell` | Greek | +| `he` | `heb` | Hebrew | +| `hi` | `hin` | Hindi | +| `hu` | `hun` | Hungarian | +| `is` | `isl` | Icelandic | +| `id` | `ind` | Indonesian | +| `it` | `ita` | Italian | +| `ja` | `jpn` | Japanese | +| `kn` | `kan` | Kannada | +| `kk` | `kaz` | Kazakh | +| `ko` | `kor` | Korean | +| `lv` | `lav` | Latvian | +| `lt` | `lit` | Lithuanian | +| `mk` | `mkd` | Macedonian | +| `ms` | `msa` | Malay | +| `mi` | `mri` | Maori | +| `mr` | `mar` | Marathi | +| `ne` | `nep` | Nepali | +| `no` | `nor` | Norwegian | +| `fa` | `fas` | Persian | +| `pl` | `pol` | Polish | +| `pt` | `por` | Portuguese | +| `ro` | `ron` | Romanian | +| `ru` | `rus` | Russian | +| `sr` | `srp` | Serbian | +| `sk` | `slk` | Slovak | +| `sl` | `slv` | Slovenian | +| `es` | `spa` | Spanish | +| `sw` | `swa` | Swahili | +| `sv` | `swe` | Swedish | +| `tl` | `tgl` | Tagalog | +| `ta` | `tam` | Tamil | +| `th` | `tha` | Thai | +| `tr` | `tur` | Turkish | +| `uk` | `ukr` | Ukrainian | +| `ur` | `urd` | Urdu | +| `vi` | `vie` | Vietnamese | +| `cy` | `cym` | Welsh | \ No newline at end of file diff --git a/python/WhisperSpeechDetection/plugin-files/descriptor/descriptor.json b/python/WhisperSpeechDetection/plugin-files/descriptor/descriptor.json new file mode 100644 index 00000000..1c0cae20 --- /dev/null +++ b/python/WhisperSpeechDetection/plugin-files/descriptor/descriptor.json @@ -0,0 +1,130 @@ +{ + 
"componentName": "WhisperSpeechDetection", + "componentVersion": "7.2", + "middlewareVersion": "7.2", + "sourceLanguage": "python", + "batchLibrary": "WhisperSpeechDetection", + "environmentVariables": [], + "algorithm": { + "name": "WHISPERSPEECH", + "description": "Uses OpenAI's Whisper model to perform language detection in speech.", + "actionType": "DETECTION", + "requiresCollection": { + "states": [] + }, + "providesCollection": { + "states": [ + "DETECTION", + "DETECTION_SPEECH", + "DETECTION_SPEECH_WHISPER" + ], + "properties": [ + { + "name": "WHISPER_MODEL_LANG", + "description": "Whisper has English-only models and multilingual models. Set to 'en' for English-only models and 'multi' for multilingual models.", + "type": "STRING", + "defaultValue": "multi" + }, + { + "name": "WHISPER_MODEL_SIZE", + "description": "Whisper models come in multiple sizes; 'tiny', 'base', 'small', 'medium', and 'large'. Multilingual models are available in all 5 sizes. English-only models are not available in 'large' size, but are available in the other four sizes.", + "type": "STRING", + "defaultValue": "base" + }, + { + "name": "WHISPER_MODE", + "description": "Set to 'LANGUAGE_DETECTION' for spoken language detection, 'TRANSCRIPTION' for speech-to-text transcription, and 'SPEECH_TRANSLATION' for speech translation.", + "type": "STRING", + "defaultValue": "LANGUAGE_DETECTION" + }, + { + "name": "AUDIO_LANGUAGE", + "description": "Optional property that indicates the language to use for audio translation or transcription. 
If left as an empty string, Whisper will automatically detect a single language from the first 30 seconds of audio.", + "type": "STRING", + "defaultValue": "" + } + ] + } + }, + "actions": [ + { + "name": "WHISPER SPEECH LANGUAGE DETECTION ACTION", + "description": "Uses OpenAI's Whisper model to perform language detection in speech.", + "algorithm": "WHISPERSPEECH", + "properties": [ + { + "name": "WHISPER_MODE", + "value": "LANGUAGE_DETECTION" + } + ] + }, + { + "name": "WHISPER SPEECH DETECTION ACTION", + "description": "Uses OpenAI's Whisper model to convert speech to text.", + "algorithm": "WHISPERSPEECH", + "properties": [ + { + "name": "WHISPER_MODE", + "value": "TRANSCRIPTION" + } + ] + }, + { + "name": "WHISPER SPEECH DETECTION WITH TRANSLATION ACTION", + "description": "Uses OpenAI's Whisper model to convert speech to text and translate it to English.", + "algorithm": "WHISPERSPEECH", + "properties": [ + { + "name": "WHISPER_MODE", + "value": "SPEECH_TRANSLATION" + } + ] + } + ], + "tasks": [ + { + "name": "WHISPER SPEECH LANGUAGE DETECTION TASK", + "description": "Uses OpenAI's Whisper model to perform language detection in speech.", + "actions": [ + "WHISPER SPEECH LANGUAGE DETECTION ACTION" + ] + }, + { + "name": "WHISPER SPEECH DETECTION TASK", + "description": "Uses OpenAI's Whisper model to convert speech to text.", + "actions": [ + "WHISPER SPEECH DETECTION ACTION" + ] + }, + { + "name": "WHISPER SPEECH DETECTION WITH TRANSLATION TASK", + "description": "Uses OpenAI's Whisper model to convert speech to text and translate it to English.", + "actions": [ + "WHISPER SPEECH DETECTION WITH TRANSLATION ACTION" + ] + } + ], + "pipelines": [ + { + "name": "WHISPER SPEECH LANGUAGE DETECTION PIPELINE", + "description": "Uses OpenAI's Whisper model to perform language detection in speech.", + "tasks": [ + "WHISPER SPEECH LANGUAGE DETECTION TASK" + ] + }, + { + "name": "WHISPER SPEECH DETECTION PIPELINE", + "description": "Uses OpenAI's Whisper model to 
convert speech to text.", + "tasks": [ + "WHISPER SPEECH DETECTION TASK" + ] + }, + { + "name": "WHISPER SPEECH DETECTION WITH TRANSLATION PIPELINE", + "description": "Uses OpenAI's Whisper model to convert speech to text and translate it to English.", + "tasks": [ + "WHISPER SPEECH DETECTION WITH TRANSLATION TASK" + ] + } + ] +} diff --git a/python/WhisperSpeechDetection/pyproject.toml b/python/WhisperSpeechDetection/pyproject.toml new file mode 100644 index 00000000..d2aa20ab --- /dev/null +++ b/python/WhisperSpeechDetection/pyproject.toml @@ -0,0 +1,29 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" diff --git a/python/WhisperSpeechDetection/sample_whisper_speech_detector.py b/python/WhisperSpeechDetection/sample_whisper_speech_detector.py new file mode 100644 index 00000000..62db2cda --- /dev/null +++ b/python/WhisperSpeechDetection/sample_whisper_speech_detector.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 + +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +import sys + +from whisper_speech_detection_component import WhisperSpeechDetectionWrapper + + +def main(): + if len(sys.argv) != 3: + sys.exit(f'Usage {sys.argv[0]} <audio_file> <whisper_mode>') + + _, audio_file, whisper_mode = sys.argv + + + job_props = {"WHISPER_MODE": whisper_mode} + + + audio_tracks = WhisperSpeechDetectionWrapper().process_audio(audio_file, 0, 0, job_props) + print(audio_tracks) + detection_props = audio_tracks[0].detection_properties + + if whisper_mode == 0: + print('DETECTED LANGUAGE:', detection_props['DETECTED_LANGUAGE']) + print('DETECTED LANGUAGE CONFIDENCE:', audio_tracks[0].confidence) + elif whisper_mode == 1: + print('DECODED LANGUAGE:', detection_props['DECODED_LANGUAGE']) + print('TRANSCRIPT:', detection_props['TRANSCRIPT']) + elif whisper_mode == 2: + print('DECODED LANGUAGE:', detection_props['DECODED_LANGUAGE']) + print('TRANSCRIPT:', detection_props['TRANSCRIPT']) + print('TRANSLATED AUDIO', detection_props['TRANSLATED_AUDIO']) + + +if __name__ == '__main__': + main() diff --git a/python/WhisperSpeechDetection/setup.cfg b/python/WhisperSpeechDetection/setup.cfg new file mode 100644 index 00000000..e39b3611 --- /dev/null +++ b/python/WhisperSpeechDetection/setup.cfg @@ -0,0 +1,41 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. 
# +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +[metadata] +name = WhisperSpeechDetection +version = 7.2 + +[options] +packages = whisper_speech_detection_component +install_requires = + mpf_component_api>=7.2 + mpf_component_util>=7.2 + openai-whisper==20230314 + numpy<1.24,>=1.18 + +[options.entry_points] +mpf.exported_component = + component = whisper_speech_detection_component.whisper_speech_detection_component:WhisperSpeechDetectionComponent diff --git a/python/WhisperSpeechDetection/tests/data/NOTICE b/python/WhisperSpeechDetection/tests/data/NOTICE new file mode 100644 index 00000000..9ea65fc1 --- /dev/null +++ b/python/WhisperSpeechDetection/tests/data/NOTICE @@ -0,0 +1,48 @@ +# bilingual.mp3 +Copyright 2016 Spanish in Texas Project +Creative Commons Attribution 3.0 Unported License: https://creativecommons.org/licenses/by/3.0 +https://www.youtube.com/watch?v=eBTWD7Zh-AA + +# left.wav +Sphinx-4 Speech Recognition System +Copyright 1999-2015 Carnegie Mellon University. All Rights Reserved. http://cmusphinx.sourceforge.net/ +Source code: https://github.com/cmusphinx/sphinx4 + +Copyright 1999-2015 Carnegie Mellon University. +Portions Copyright 2002-2008 Sun Microsystems, Inc. +Portions Copyright 2002-2008 Mitsubishi Electric Research Laboratories. +Portions Copyright 2013-2015 Alpha Cephei, Inc. + +All Rights Reserved. Use is subject to license terms. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Original authors' names are not deleted. + +4. The authors' names are not used to endorse or promote products derived from this software without specific prior written permission. + +This work was supported in part by funding from the Defense Advanced +Research Projects Agency and the National Science Foundation of the +United States of America, the CMU Sphinx Speech Consortium, and +Sun Microsystems, Inc. + +CARNEGIE MELLON UNIVERSITY, SUN MICROSYSTEMS, INC., MITSUBISHI +ELECTRONIC RESEARCH LABORATORIES AND THE CONTRIBUTORS TO THIS WORK +DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL +CARNEGIE MELLON UNIVERSITY, SUN MICROSYSTEMS, INC., MITSUBISHI +ELECTRONIC RESEARCH LABORATORIES NOR THE CONTRIBUTORS BE LIABLE FOR +ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + +# left.avi +Created by combining left.wav with public domain footage. 
\ No newline at end of file diff --git a/python/WhisperSpeechDetection/tests/data/bilingual.mp3 b/python/WhisperSpeechDetection/tests/data/bilingual.mp3 new file mode 100644 index 00000000..e9f099b3 Binary files /dev/null and b/python/WhisperSpeechDetection/tests/data/bilingual.mp3 differ diff --git a/python/WhisperSpeechDetection/tests/data/left.avi b/python/WhisperSpeechDetection/tests/data/left.avi new file mode 100644 index 00000000..6d7c1911 Binary files /dev/null and b/python/WhisperSpeechDetection/tests/data/left.avi differ diff --git a/python/WhisperSpeechDetection/tests/data/left.wav b/python/WhisperSpeechDetection/tests/data/left.wav new file mode 100644 index 00000000..f91e31f9 Binary files /dev/null and b/python/WhisperSpeechDetection/tests/data/left.wav differ diff --git a/python/WhisperSpeechDetection/tests/test_whisper_detection.py b/python/WhisperSpeechDetection/tests/test_whisper_detection.py new file mode 100644 index 00000000..e389a878 --- /dev/null +++ b/python/WhisperSpeechDetection/tests/test_whisper_detection.py @@ -0,0 +1,180 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. 
# +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +import unittest +import os + +import mpf_component_api as mpf +import warnings +from whisper_speech_detection_component import WhisperSpeechDetectionComponent + + +class TestWhisperSpeechDetection(unittest.TestCase): + def setUp(self): + warnings.simplefilter('ignore', category=ResourceWarning) + warnings.simplefilter('ignore', category=DeprecationWarning) + + @staticmethod + def _get_test_file(filename): + return os.path.join( + os.path.dirname(__file__), + 'data', + filename + ) + + def test_audio_job(self): + job = mpf.AudioJob('test', self._get_test_file('left.wav'), 0, -1, {}, {}) + comp = WhisperSpeechDetectionComponent() + result = comp.get_detections_from_audio(job) + + self.assertEqual(1, len(result)) + self.assertEqual('en', result[0].detection_properties['DETECTED_LANGUAGE']) + self.assertEqual('eng', result[0].detection_properties['ISO_LANGUAGE']) + + def test_video_job(self): + media_properties = dict(FPS='24') + + job = mpf.VideoJob('test', self._get_test_file('left.avi'), 0, -1, {}, media_properties) + comp = WhisperSpeechDetectionComponent() + result = comp.get_detections_from_video(job) + + self.assertEqual(1, len(result)) + self.assertEqual('en', result[0].detection_properties['DETECTED_LANGUAGE']) + self.assertEqual('eng', result[0].detection_properties['ISO_LANGUAGE']) + + def test_load_different_models_lang_detection(self): + job = mpf.AudioJob('test', self._get_test_file('left.wav'), 0, -1, {}, {}) + comp = 
WhisperSpeechDetectionComponent() + + self.assertFalse(comp.wrapper.initialized) + + result = comp.get_detections_from_audio(job) + + self.assertEqual(1, len(result)) + self.assertEqual('en', result[0].detection_properties['DETECTED_LANGUAGE']) + self.assertEqual('eng', result[0].detection_properties['ISO_LANGUAGE']) + + self.assertEqual("multi", comp.wrapper.lang) + self.assertEqual("base", comp.wrapper.size) + self.assertTrue(comp.wrapper.initialized) + + job_props = dict(WHISPER_MODEL_SIZE='tiny', WHISPER_MODEL_LANG='multi') + job = mpf.AudioJob('test', self._get_test_file('left.wav'), 0, -1, job_props, {}) + + result = comp.get_detections_from_audio(job) + + self.assertEqual("multi", comp.wrapper.lang) + self.assertEqual("tiny", comp.wrapper.size) + self.assertTrue(comp.wrapper.initialized) + + self.assertEqual(1, len(result)) + self.assertEqual('en', result[0].detection_properties['DETECTED_LANGUAGE']) + + def test_transcribe(self): + job_props = dict(WHISPER_MODE="TRANSCRIPTION") + job = mpf.AudioJob('test', self._get_test_file('left.wav'), 0, -1, job_props, {}) + + comp = WhisperSpeechDetectionComponent() + result = comp.get_detections_from_audio(job) + + expected_text = "There's three left on the left side, the one closest to us." + + self.assertEqual(1, len(result)) + self.assertEqual('en', result[0].detection_properties['DECODED_LANGUAGE']) + self.assertEqual('eng', result[0].detection_properties['ISO_LANGUAGE']) + self.assertEqual(expected_text, result[0].detection_properties["TRANSCRIPT"]) + + def test_transcribe_given_language(self): + expected_text = ( + "Me comunico con diversas personas en inglés y en español o mezclando ambas lenguas. " + "Hay tantas razones. Yo lo hago por obviamente solidaridad, porque me reciben en esa comunidad. " + "Como crecién el lado mexicano no acostumbraba en ese pasado. 
" + "Luego, al cruzar la frontera metafórica porque no existe derecho, me di cuenta, hablan " + "diferente, salpican verdad su conversación con palabras en inglés y porque no. " + "Entonces no es fácil hacerlo porque he tratado de hacerlo y lo aprecio y lo entiendo " + "mucho más por la experiencia que tenido todos estos. Y lo hago para tratar de pertenecer, " + "para no ser diferente que me consideren, como parte de esa comunidad." + ) + + job_props = dict(WHISPER_MODE="TRANSCRIPTION") + job = mpf.AudioJob('test', self._get_test_file('bilingual.mp3'), 0, -1, job_props, {}) + + comp = WhisperSpeechDetectionComponent() + + result = comp.get_detections_from_audio(job) + + self.assertEqual(1, len(result)) + self.assertEqual('es', result[0].detection_properties['DECODED_LANGUAGE']) + self.assertEqual('spa', result[0].detection_properties['ISO_LANGUAGE']) + + # Results for the English portion of the audio are non-deterministic + self.assertTrue(expected_text in result[0].detection_properties["TRANSCRIPT"]) + + job_props = dict(WHISPER_MODE="TRANSCRIPTION", AUDIO_LANGUAGE='es') + job = mpf.AudioJob('test', self._get_test_file('bilingual.mp3'), 0, -1, job_props, {}) + + result = comp.get_detections_from_audio(job) + + self.assertEqual(1, len(result)) + + # Results for the English portion of the audio are non-deterministic + self.assertTrue(expected_text in result[0].detection_properties["TRANSCRIPT"]) + + def test_translation(self): + expected_text = ( + 'I communicate with different people in English and Spanish or mixing both languages. ' + 'There are so many reasons. I do it because obviously solidarity, because they receive ' + 'me in that community. As the Mexican people believe, it is not used in that past. Then, ' + 'when crossing the border, metaphorically, because there is no right, I realize, talking ' + 'different, they get out of the truth, their conversation, with words in English. And why ' + 'not? So it is not easy to do it, because I tried to do it. 
And I appreciate it, and I ' + 'understand it much more, because of the experience I had all these years. And I do it to ' + 'try to be, to not be different than I consider myself to be. As part of that community.' + ) + + job_props = dict(WHISPER_MODE="SPEECH_TRANSLATION") + job = mpf.AudioJob('test', self._get_test_file('bilingual.mp3'), 0, -1, job_props, {}) + + comp = WhisperSpeechDetectionComponent() + + result = comp.get_detections_from_audio(job) + + self.assertEqual(1, len(result)) + self.assertEqual('es', result[0].detection_properties['DECODED_LANGUAGE']) + self.assertEqual('spa', result[0].detection_properties['ISO_LANGUAGE']) + self.assertTrue(expected_text in result[0].detection_properties["TRANSLATED_AUDIO"]) + + job_props = dict(WHISPER_MODE="SPEECH_TRANSLATION", AUDIO_LANGUAGE='es') + job = mpf.AudioJob('test', self._get_test_file('bilingual.mp3'), 0, -1, job_props, {}) + + result = comp.get_detections_from_audio(job) + + self.assertEqual(1, len(result)) + self.assertTrue(expected_text in result[0].detection_properties["TRANSLATED_AUDIO"]) + + +if __name__ == '__main__': + unittest.main(verbosity=2) diff --git a/python/WhisperSpeechDetection/whisper_behavior_notes.md b/python/WhisperSpeechDetection/whisper_behavior_notes.md new file mode 100644 index 00000000..3badea5e --- /dev/null +++ b/python/WhisperSpeechDetection/whisper_behavior_notes.md @@ -0,0 +1,194 @@ +# Whisper Multi-Lingual Audio Investigation + +When set to transcription mode, sometimes Whisper would also translate +the audio + +Translating when it's supposed to be transcribing seems to happen more +with larger models and when the audio language is specified, where the +audio gets translated to the specified language. Spanish+English audio +where the model is told the language is in Spanish will have the English +audio translated to Spanish and vice-versa. These translations are very +similar to the translation output when the model is set to actually +translate. 
+ +Example: First half of audio is in Spanish, **second half in English** + +Default settings (base size multilingual model, model auto-detects +language) output: + +- **Bolded text** is English audio, returns different results each + run + +- Detected language is Spanish + +- Me comunico con diversas personas en inglés y en español o + mezclando ambas lenguas. Hay tantas razones. Yo lo hago por + obviamente solidaridad, porque me reciben en esa comunidad. Como + crecién el lado mexicano no acostumbraba en ese pasado. Luego, al + cruzar la frontera metafórica porque no existe derecho, me di + cuenta, hablan diferente, salpican verdad su conversación con + palabras en inglés y porque no. Entonces no es fácil hacerlo porque + he tratado de hacerlo y lo aprecio y lo entiendo mucho más por la + experiencia que he tenido todos estos años. Y lo hago para tratar de + pertenecer, para no ser diferente que me consideren, como parte de + esa comunidad. **Gracias por crear este estilo y ver cómo se trabaja + Speaking inont不錯. O sea, el flour departador de la creación имеет + su** + + +Large multilingual model, auto-detect language output: + +- **Bolded text** is English audio, which has been translated to + Spanish. Returns the same/almost the same each run + +- Detected language is Spanish + +- Me comunico con diversas personas en inglés y en español o + mezclando ambas lenguas. Hay tantas razones. Yo lo hago por + obviamente solidaridad, porque me reciben en esa comunidad. Como + crecí en el lado mexicano, no acostumbraba en ese pasado. Luego, al + cruzar la frontera metafórica, porque no existe de hecho, me di + cuenta que hablan diferente. Salpican su conversación con palabras + en inglés. ¿Y por qué no? Entonces, no es fácil hacerlo, porque he + tratado de hacerlo. Lo aprecio y lo entiendo mucho más por la + experiencia que he tenido todos estos años. Y lo hago para tratar de + pertenecer, para no ser diferente, que me consideren como parte de + esa comunidad. 
**Lo hago todo el tiempo. En la escuela, con mis + colegas. A veces, cuando estoy enseñando, no mezclo tanto. Trato de + hablar más español en mis clases, porque estoy enseñando en español. + Si trabajo con gente que son amigas de mí, que hablan español, yo le + digo la palabra en la lengua que sea más fácil.** + + + +Say we wanted to transcribe the English portion of the audio, so we tell +the model the audio is in English + +Base multilingual model, English audio output: + +- **Bolded text** is Spanish audio, and it's been translated to + English + +- **I communicate with different people in English and Spanish or in + English. There are so many reasons. I do it because I obviously + solidarity because I receive in that community. As I grew up in + Mexico, I didn\'t use that language. Then, when crossing the border, + I was afraid because there was no right. I realized that speaking + different people, they really get their conversation with English + words. I didn\'t do it because I didn\'t do it. I didn\'t try to do + it because I was afraid. I appreciated it and I understand it much + more because of the experience I had in all these years. I do it to + try to keep it from being different, not to be considered as part of + that community.** I do all the time. At school with my colleagues. + Usually when I\'m teaching, I don\'t mix as much. I try to speak + more Spanish in my Spanish classes because we\'re teaching in + Spanish. I will mix if I\'m working with people that are friends of + mine, that are bilinguals, say the word in the language that comes + easiest. + + +Large multilingual model, English audio output: + +- **Bolded text** is Spanish audio that has been translated to + English + +- **I communicate with different people in English and Spanish or + mixing both languages. There are so many reasons. I do it for + solidarity, because they receive me in that community. As I grew up + in the Mexican side, I wasn\'t used to that past. 
Then, when I + crossed the metaphorical border, because it doesn\'t exist, I + realized that they speak differently. They speak in English, and why + not? So, it\'s not easy to do it, because I\'ve tried to do it. I + appreciate it and understand it more because of the experience I\'ve + had all these years. I do it to try to belong, not to be different.** + I do it all the time, at school with my colleagues. Usually when + I\'m teaching, I don\'t mix as much. I try to speak more Spanish in + my Spanish classes, because we\'re teaching in Spanish. I will mix + if I\'m working with people that are friends of mine, that are + bilingual, I\'ll say the word in the language that comes easiest. + + +If we tell the model to translate instead of transcribe, the +translations are very similar to the ones we've seen above + +Base multilingual model, auto-detect language translation results: + +- Detected language is Spanish + +- Only translated Spanish portion of the audio + +- Telling the model the audio is in Spanish returns the same results + +- I communicate with different people in English and Spanish or + mixing both languages. There are so many reasons. I do it because + obviously solidarity, because they receive me in that community. As + the Mexican people believe, it is not used in that past. Then, when + crossing the border, metaphorically, because there is no right, I + realize, talking different, they get out of the truth, their + conversation, with words in English. Why not? It is not easy to do + it because I try to do it. I appreciate it and I understand it much + more because of the experience I had all these years. I do it to try + to keep it from being, not to be different, and I consider it to be + a community. + + +Base multilingual model, English audio translation results: + +- **Bolded text** is Spanish audio + +- **I communicate with different people in English and Spanish or in + English. There are so many reasons. 
I do it because I obviously + solidarity because I receive in that community. As I grew up in + Mexico, I didn\'t use that language. Then, when crossing the border, + I was afraid because there was no right. I realized that speaking + different people, they really get their conversation with speaking + English. I didn\'t do it because I didn\'t want to do it. I tried to + do it because I was afraid. I appreciated it and I understand it + more because of the experience I had in all these years. I do it to + try to keep it from being different, not to be considered as part of + that community.** I do all the time. At school with my colleagues. + Usually when I\'m teaching, I don\'t mix as much. I try to speak + more Spanish in my Spanish classes because we\'re teaching in + Spanish. I will mix if I\'m working with people that are friends of + mine that are bilingual. Say the word in the language that comes + easiest. + + +Large multilingual model, auto-detect language translation results: + +- **Bolded text** is English audio. It's the last sentence in the English + audio so the English portion of the audio has been mostly skipped + over/ignored + +- I communicate with different people in English and Spanish or + mixing both languages. There are so many reasons. I do it for + solidarity, because they receive me in that community. As I grew up + in the Mexican side, I was not used to that past. Then, when I + crossed the metaphorical border, because it does not exist, I + realized that they speak differently. They speak in English, and why + not? So it is not easy to do it, because I have tried to do it. I + appreciate it and understand it much more because of the experience + I have had all these years. I do it to try to belong, not to be + different, to be considered as part of that community. **I will say + the word in the language that comes easiest.** + + +Large multilingual model, English audio translation results: + +- **Bolded text** is Spanish. 
Again seems to only want to translate + Spanish portion of audio but caught the last sentence of the English + audio as well + +- **I communicate with different people in English and Spanish or + mixing both languages. There are so many reasons. I do it for + solidarity, because they receive me in that community. As I grew up + in the Mexican side, I was not used to that past. Then, when I + crossed the metaphorical border, because it does not exist, I + realized that they speak differently. They speak in English, and why + not? So it is not easy to do it, because I have tried to do it. I + appreciate it and understand it much more because of the experience + I have had all these years. I do it to try to belong, not to be + different, to be considered as part of that community.** I think it is + the language that comes easiest. + + diff --git a/python/WhisperSpeechDetection/whisper_speech_detection_component/__init__.py b/python/WhisperSpeechDetection/whisper_speech_detection_component/__init__.py new file mode 100644 index 00000000..eee59e99 --- /dev/null +++ b/python/WhisperSpeechDetection/whisper_speech_detection_component/__init__.py @@ -0,0 +1,27 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. 
# +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +from .whisper_speech_detection_component import WhisperSpeechDetectionComponent, WhisperSpeechDetectionWrapper diff --git a/python/WhisperSpeechDetection/whisper_speech_detection_component/whisper_speech_detection_component.py b/python/WhisperSpeechDetection/whisper_speech_detection_component/whisper_speech_detection_component.py new file mode 100644 index 00000000..cbd67dbe --- /dev/null +++ b/python/WhisperSpeechDetection/whisper_speech_detection_component/whisper_speech_detection_component.py @@ -0,0 +1,321 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at                                   #
#                                                                           #
#    http://www.apache.org/licenses/LICENSE-2.0                             #
#                                                                           #
# Unless required by applicable law or agreed to in writing, software       #
# distributed under the License is distributed on an "AS IS" BASIS,         #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
# See the License for the specific language governing permissions and       #
# limitations under the License.                                            #
#############################################################################

import logging
import warnings
from typing import Sequence

import whisper

import mpf_component_api as mpf
import mpf_component_util as mpf_util

logger = logging.getLogger('WhisperDetectionComponent')

# Whisper always warns when forced onto CPU; the fallback is expected here.
warnings.filterwarnings('ignore', 'FP16 is not supported on CPU; using FP32 instead', UserWarning)
warnings.filterwarnings('ignore', category=ResourceWarning, module='multilingual.tiktoken')


class WhisperSpeechDetectionComponent:
    """OpenMPF component wrapping OpenAI Whisper for language detection,
    transcription, and speech-to-English translation of audio and video jobs.
    """

    detection_type = 'SPEECH'

    def __init__(self):
        logger.info('Creating instance of WhisperSpeechDetectionComponent')
        # One wrapper instance is reused across jobs so the loaded model is cached.
        self.wrapper = WhisperSpeechDetectionWrapper()
        logger.info('WhisperSpeechDetectionComponent created')

    def get_detections_from_video(self, job: mpf.VideoJob) -> Sequence[mpf.VideoTrack]:
        """Process the audio stream of a video job.

        Converts the job's frame range to a start/stop time in milliseconds,
        runs the audio pipeline, then wraps each audio track in a
        :class:`mpf.VideoTrack` with a single placeholder (zero-sized)
        frame location at frame 0.

        :param job: Video job whose media properties must include FPS;
            FRAME_COUNT and DURATION are used when available.
        :return: One video track per audio track produced by the wrapper.
        :raises: Re-raises any exception from audio processing or
            track conversion after logging it.
        """
        logger.info('Received video job')

        media_properties = job.media_properties
        media_frame_count = int(media_properties.get('FRAME_COUNT', -1))
        media_duration = float(media_properties.get('DURATION', -1))

        # Frames-per-millisecond; FPS is a required media property for video.
        fpms = float(media_properties['FPS']) / 1000.0
        start_time = int(job.start_frame / fpms)

        # Prefer an explicit sub-range stop frame; otherwise fall back to the
        # media duration, then to a stop time derived from the frame count.
        if 0 < job.stop_frame < media_frame_count - 1:
            stop_time = int(job.stop_frame / fpms)
        elif media_duration > 0:
            stop_time = int(media_duration)
        elif media_frame_count > 0:
            stop_time = int(media_frame_count / fpms)
        else:
            # Unknown length: let the wrapper process to the end of the file.
            stop_time = None

        try:
            audio_tracks = self.wrapper.process_audio(
                target_file=job.data_uri,
                start_time=start_time,
                stop_time=stop_time,
                job_properties=job.job_properties
            )
        except Exception as e:
            # Log with traceback, then pass the exception up to the framework.
            logger.exception('Exception raised while processing audio: %s', e)
            raise

        try:
            # Convert audio tracks to video tracks with placeholder frame locs.
            video_tracks = []
            for track in audio_tracks:
                video_track = mpf.VideoTrack(
                    start_frame=0,
                    stop_frame=-1,
                    confidence=track.confidence,
                    detection_properties=track.detection_properties
                )
                # Zero-sized location: speech has no spatial extent in a frame.
                video_track.frame_locations[0] = mpf.ImageLocation(
                    x_left_upper=0,
                    y_left_upper=0,
                    width=0,
                    height=0,
                    confidence=track.confidence,
                    detection_properties=track.detection_properties
                )
                video_tracks.append(video_track)
        except Exception as e:
            logger.exception('Exception raised while converting to video track: %s', e)
            raise

        logger.info('Processing complete. Found %d tracks.', len(video_tracks))
        return video_tracks

    def get_detections_from_audio(self, job: mpf.AudioJob) -> Sequence[mpf.AudioTrack]:
        """Process an audio job and return the wrapper's audio tracks.

        :param job: Audio job; start/stop times are forwarded unchanged.
        :raises: Re-raises any exception from audio processing after logging.
        """
        logger.info("Received audio job")

        try:
            audio_tracks = self.wrapper.process_audio(
                target_file=job.data_uri,
                start_time=job.start_time,
                stop_time=job.stop_time,
                job_properties=job.job_properties
            )
        except Exception as e:
            logger.exception("Exception raised while processing audio: %s", e)
            raise

        logger.info('Processing complete. Found %d tracks.', len(audio_tracks))
        return audio_tracks


class WhisperSpeechDetectionWrapper:
    """Loads and caches a Whisper model and runs one of three modes:
    LANGUAGE_DETECTION, TRANSCRIPTION, or SPEECH_TRANSLATION.
    """

    # Maps Whisper's ISO 639-1 language codes to ISO 639-3 codes.
    # Class-level constant: shared by all instances, built once at import time.
    iso_map = {
        'af': 'afr',
        'ar': 'ara',
        'hy': 'hye',
        'az': 'aze',
        'be': 'bel',
        'bs': 'bos',
        'bg': 'bul',
        'ca': 'cat',
        'zh': 'zho',
        'cs': 'ces',
        'da': 'dan',
        'nl': 'nld',
        'en': 'eng',
        'et': 'est',
        'fi': 'fin',
        'fr': 'fra',
        'de': 'deu',
        'el': 'ell',
        'he': 'heb',
        'hi': 'hin',
        'hr': 'hrv',
        'hu': 'hun',
        'id': 'ind',
        'gl': 'glg',
        'it': 'ita',
        'is': 'isl',
        'ja': 'jpn',
        'kn': 'kan',
        'kk': 'kaz',
        'ko': 'kor',
        'lv': 'lav',
        'lt': 'lit',
        'mr': 'mar',
        'mi': 'mri',
        'mk': 'mkd',
        'ms': 'msa',
        'ne': 'nep',
        'no': 'nor',
        'fa': 'fas',
        'pl': 'pol',
        'pt': 'por',
        'ro': 'ron',
        'ru': 'rus',
        'sr': 'srp',
        'sk': 'slk',
        'sl': 'slv',
        'es': 'spa',
        'sw': 'swa',
        'sv': 'swe',
        'ta': 'tam',
        'tl': 'tgl',
        'th': 'tha',
        'tr': 'tur',
        'uk': 'ukr',
        'ur': 'urd',
        'vi': 'vie',
        'cy': 'cym'
    }

    def __init__(self):
        # Model is loaded lazily on the first job so construction stays cheap.
        self.model = None
        self.initialized = False
        self.size = None
        self.lang = None

    def process_audio(self, target_file, start_time, stop_time, job_properties):
        """Run the configured Whisper mode against *target_file*.

        :param target_file: Path/URI of the media to process.
        :param start_time: Track start time in milliseconds (copied onto tracks).
        :param stop_time: Track stop time in milliseconds, or None if unknown.
        :param job_properties: Job properties; reads WHISPER_MODEL_SIZE,
            WHISPER_MODEL_LANG, WHISPER_MODE, and (indirectly) AUDIO_LANGUAGE.
        :return: List of :class:`mpf.AudioTrack`.
        :raises mpf.DetectionException: INVALID_PROPERTY for unsupported
            model/mode combinations or an unrecognized WHISPER_MODE.
        """
        model_size = mpf_util.get_property(job_properties, 'WHISPER_MODEL_SIZE', "base")
        model_lang = mpf_util.get_property(job_properties, 'WHISPER_MODEL_LANG', "multi")
        mode = mpf_util.get_property(job_properties, "WHISPER_MODE", "LANGUAGE_DETECTION")

        if model_lang == "en" and model_size == "large":
            raise mpf.DetectionError.INVALID_PROPERTY.exception("Whisper does not have a large English model available.")

        # (Re)load the model on first use or when the requested size/lang changed.
        if not self.initialized or self.size != model_size or self.lang != model_lang:
            self._load_model(model_size, model_lang)
            self.initialized = True

        audio_tracks = []

        if mode == "LANGUAGE_DETECTION":
            if model_lang != "multi":
                raise mpf.DetectionError.INVALID_PROPERTY.exception("Whisper does not support language detection "
                                                                    "using English models. Please use the multilingual "
                                                                    "models.")

            # Whisper detects language from the first 30 seconds of audio.
            audio = whisper.load_audio(target_file)
            audio = whisper.pad_or_trim(audio)

            mel = whisper.log_mel_spectrogram(audio).to(self.model.device)
            _, probs = self.model.detect_language(mel)

            detected_language = max(probs, key=probs.get)
            detected_lang_conf = probs[detected_language]
            logger.info(f"Detected language: {detected_language}")

            iso_639_3 = self.iso_map.get(detected_language, 'UNKNOWN')

            audio_tracks.append(mpf.AudioTrack(
                start_time=start_time,
                stop_time=stop_time,
                confidence=detected_lang_conf,
                detection_properties=dict(
                    DETECTED_LANGUAGE=detected_language,
                    ISO_LANGUAGE=iso_639_3
                )
            ))

            logger.debug('Completed process audio')

        elif mode == "TRANSCRIPTION":
            properties = self._transcribe_text(target_file, job_properties)

            audio_tracks.append(mpf.AudioTrack(
                start_time=start_time,
                stop_time=stop_time,
                detection_properties=properties
            ))

        elif mode == "SPEECH_TRANSLATION":
            # Transcribe first so the track also carries the original-language
            # transcript and language properties, then translate to English.
            properties = self._transcribe_text(target_file, job_properties)
            result = self.model.transcribe(target_file, task="translate")
            properties["TRANSLATED_AUDIO"] = result['text'].strip()

            audio_tracks.append(mpf.AudioTrack(
                start_time=start_time,
                stop_time=stop_time,
                detection_properties=properties
            ))

        else:
            # Previously an unknown mode silently produced zero tracks; fail
            # loudly instead so misconfigured pipelines are caught.
            raise mpf.DetectionError.INVALID_PROPERTY.exception(
                'Unrecognized WHISPER_MODE: ' + mode)

        return audio_tracks

    def _load_model(self, model_size, model_lang):
        """Load the Whisper checkpoint for *model_size*/*model_lang* and cache it."""
        self.size = model_size
        self.lang = model_lang

        # English-only checkpoints are named "<size>.en"; multilingual are "<size>".
        if self.lang == "en":
            model_string = self.size + "." + "en"
        else:
            model_string = self.size

        logger.info(f'Loading the "{model_string}" model...')
        self.model = whisper.load_model(model_string)

    def _transcribe_text(self, target_file, job_properties):
        """Transcribe *target_file*, honoring an optional AUDIO_LANGUAGE hint.

        When AUDIO_LANGUAGE is empty, Whisper auto-detects the language and the
        detected code is reported; otherwise the supplied code is both passed to
        Whisper and reported.

        :return: dict with DECODED_LANGUAGE, ISO_LANGUAGE, and TRANSCRIPT keys.
        """
        language = mpf_util.get_property(job_properties, 'AUDIO_LANGUAGE', "")

        if language == "":
            result = self.model.transcribe(target_file)
            decoded_language = result['language']
        else:
            result = self.model.transcribe(target_file, language=language)
            decoded_language = language

        return dict(
            DECODED_LANGUAGE=decoded_language,
            ISO_LANGUAGE=self.iso_map.get(decoded_language, 'UNKNOWN'),
            TRANSCRIPT=result['text'].strip()
        )