openmpf · mcrensh · Sep 1, 2023 · Oct 21, 2022 · Nov 8, 2022 · Nov 18, 2022
diff --git a/python/WhisperSpeechDetection/Dockerfile b/python/WhisperSpeechDetection/Dockerfile
@@ -0,0 +1,51 @@
+# syntax=docker/dockerfile:1.2
+
+#############################################################################
+# NOTICE                                                                    #
+#                                                                           #
+# This software (or technical data) was produced for the U.S. Government    #
+# under contract, and is subject to the Rights in Data-General Clause       #
+# 52.227-14, Alt. IV (DEC 2007).                                            #
+#                                                                           #
+# Copyright 2023 The MITRE Corporation. All Rights Reserved.                #
+#############################################################################
+
+#############################################################################
+# Copyright 2023 The MITRE Corporation                                      #
+#                                                                           #
+# Licensed under the Apache License, Version 2.0 (the "License");           #
+# you may not use this file except in compliance with the License.          #
+# You may obtain a copy of the License at                                   #
+#                                                                           #
+#    http://www.apache.org/licenses/LICENSE-2.0                             #
+#                                                                           #
+# Unless required by applicable law or agreed to in writing, software       #
+# distributed under the License is distributed on an "AS IS" BASIS,         #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+# See the License for the specific language governing permissions and       #
+# limitations under the License.                                            #
+#############################################################################
+
+# Build arguments that select the base image. They are declared before FROM
+# so that they can be referenced in the FROM line.
+# BUILD_REGISTRY has no default and is prepended to the image name without a
+# separator, so a supplied value must already end with the delimiter
+# (e.g. a trailing "/"); when omitted the bare image name is used.
+ARG BUILD_REGISTRY
+# Tag of the base image; defaults to "latest" when not overridden.
+ARG BUILD_TAG=latest
+FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG}
+
+# Install Whisper at a pinned release together with a constrained NumPy range.
+RUN pip install --no-cache-dir openai-whisper==20230314 'numpy<1.24,>=1.18'
+
+# Load each model once at build time. Presumably this is done so that the
+# model files are fetched into the image instead of on first use at run time
+# -- confirm against whisper.load_model's download/cache behavior.
+RUN python -c 'import whisper; whisper.load_model("base")'
+RUN python -c 'import whisper; whisper.load_model("base.en")'
+RUN python -c 'import whisper; whisper.load_model("tiny")'
+# Same idea for the GPT-2 encoding exposed by tiktoken (assumed to fetch and
+# cache its vocabulary files when called -- TODO confirm).
+RUN python -c 'from tiktoken_ext.openai_public import gpt2; gpt2()'
+
+# RUN_TESTS is declared here, immediately before the only instruction that
+# reads it, rather than directly after FROM. Every RUN that follows an ARG
+# sees it as an environment variable, so declaring it earlier would make
+# "--build-arg RUN_TESTS=true" invalidate the cached pip install and model
+# download layers above.
+ARG RUN_TESTS=false
+
+# Mount the build context over the working directory (writable; writes to the
+# mount are not persisted in the image), install the component, and run the
+# unit tests only when RUN_TESTS is "true" (compared case-insensitively via
+# the bash-only "${VAR,,}" lower-casing expansion).
+# NOTE(review): the steps are joined with ";", so a failing
+# install-component.sh only aborts the build if the shell inherited from the
+# base image runs with errexit -- confirm against the base image.
+RUN --mount=target=.,readwrite \
+    install-component.sh; \
+    if [ "${RUN_TESTS,,}" == true ]; then python tests/test_whisper_detection.py; fi
+
+# Image metadata, using the org.label-schema key namespace (schema version 1.0).
+LABEL \
+    org.label-schema.license="Apache 2.0" \
+    org.label-schema.name="OpenMPF Whisper Speech Detection" \
+    org.label-schema.schema-version="1.0" \
+    org.label-schema.url="https://openmpf.github.io" \
+    org.label-schema.vcs-url="https://github.com/openmpf/openmpf-components" \
+    org.label-schema.vendor="MITRE"
diff --git a/python/WhisperSpeechDetection/LICENSE b/python/WhisperSpeechDetection/LICENSE
@@ -0,0 +1,31 @@
+/******************************************************************************
+ * Copyright 2023 The MITRE Corporation                                       *
+ *                                                                            *
+ * Licensed under the Apache License, Version 2.0 (the "License");            *
+ * you may not use this file except in compliance with the License.           *
+ * You may obtain a copy of the License at                                    *
+ *                                                                            *
+ *    http://www.apache.org/licenses/LICENSE-2.0                              *
+ *                                                                            *
+ * Unless required by applicable law or agreed to in writing, software        *
+ * distributed under the License is distributed on an "AS IS" BASIS,          *
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   *
+ * See the License for the specific language governing permissions and        *
+ * limitations under the License.                                             *
+ ******************************************************************************/
+
+ This project contains content developed by The MITRE Corporation. If this code
+ is used in a deployment or embedded within another project, it is requested
+ that you send an email to opensource@mitre.org in order to let us know where
+ this software is being used.
+
+
+
+ This software makes use of source code derived from third party software:
+
+ ------------------------------------------------------------------------------
+
+ Whisper is licensed under MIT License.
+ Copyright (c) 2022 OpenAI
+
+ Project page: https://github.com/openai/whisper
diff --git a/python/WhisperSpeechDetection/README.md b/python/WhisperSpeechDetection/README.md
@@ -0,0 +1,109 @@
+# Overview
+
+This repository contains source code and model data for the OpenMPF Whisper Speech Detection component.
+This component uses the OpenAI Whisper model.
+
+# Introduction
+
+This component identifies the language spoken in audio and video clips. It can also transcribe the speech to text or translate it to English.
+
+# Input Properties
+- `WHISPER_MODEL_SIZE`: Size of the Whisper model. Multilingual models are available in `tiny`, `base`, `small`, `medium`, and `large` sizes. English-only models are available in `tiny`, `base`, `small`, and `medium` sizes.
+- `WHISPER_MODEL_LANG`: Whisper has English-only models and multilingual models. Set to `en` for English-only models and `multi` for multilingual models.
+- `WHISPER_MODE`: Determines whether Whisper will perform language detection, speech-to-text transcription, or speech translation. English-only models can only transcribe English audio. Set to `LANGUAGE_DETECTION` for spoken language detection, `TRANSCRIPTION` for speech-to-text transcription, and `SPEECH_TRANSLATION` for speech translation.
+- `AUDIO_LANGUAGE`: Optional property that indicates the language to use for audio translation or transcription. If left as an empty string, Whisper will automatically detect a single language from the first 30 seconds of audio.
+
+# Output Properties
+- `DETECTED_LANGUAGE`: Language with the highest confidence value.
+- `DETECTED_LANGUAGE_CONFIDENCE`: The confidence value of the language with the highest confidence.
+- `TRANSCRIPT`: Returns transcript of audio for transcription and translation runs.
+- `TRANSLATED_AUDIO`: Returns the translated text for translated audio runs.
+
+# Behavior
+Some quirks in Whisper's behavior when transcribing or translating audio with multiple languages have been observed.
+
+### Transcribe ###
+
+Size  | Provided Language | Result for Spanish Part | Result for English Part
+------|-------------------|-------------------------|-------------------------
+base  | Auto-detected     | Correctly transcribed   | Gibberish
+large | Auto-detected     | Correctly transcribed   | Translated to Spanish
+base  | English           | Translated to English   | Correctly transcribed 
+large | English           | Translated to English   | Correctly transcribed 
+
+
+### Translate ###
+
+Size  | Provided Language | Result for Spanish Part | Result for English Part
+------|-------------------|-------------------------|------------------------
+base  | Auto-detected     | Correctly translated    | Not included in output
+base  | English           | Correctly translated    | Transcribed
+large | Auto-detected     | Correctly translated    | Mostly skipped
+large | English           | Correctly translated    | Mostly skipped
+
+
+See [whisper_behavior_notes.md](whisper_behavior_notes.md) for more details.
+
+# Language Identifiers
+The following are the ISO 639-1 codes, the ISO 639-3 codes, and their corresponding languages which Whisper can translate to English.
+
+All translations are to English.
+
+| ISO-639-1 | ISO-639-3 | Language         |
+| --- |---|------------------|
+| `af` | `afr` | Afrikaans        |
+| `ar` | `ara` | Arabic           |
+| `hy` | `hye` | Armenian         |
+| `az` | `aze` | Azerbaijani      |
+| `be` | `bel` | Belarusian       |
+| `bs` | `bos` | Bosnian          |
+| `bg` | `bul` | Bulgarian        |
+| `ca` | `cat` | Catalan          |
+| `zh` | `zho` | Chinese          |
+| `hr` | `hrv` | Croatian         |
+| `cs` | `ces` | Czech            |
+| `da` | `dan` | Danish           |
+| `nl` | `nld` | Dutch            |
+| `en` | `eng` | English          |
+| `fi` | `fin` | Finnish          |
+| `fr` | `fra` | French           |
+| `gl` | `glg` | Galician         |
+| `de` | `deu` | German           |
+| `el` | `ell` | Greek            |
+| `he` | `heb` | Hebrew           |
+| `hi` | `hin` | Hindi            |
+| `hu` | `hun` | Hungarian        |
+| `is` | `isl` | Icelandic        |
+| `id` | `ind` | Indonesian       |
+| `it` | `ita` | Italian          |
+| `ja` | `jpn` | Japanese         |
+| `kn` | `kan` | Kannada          |
+| `kk` | `kaz` | Kazakh           |
+| `ko` | `kor` | Korean           |
+| `lv` | `lav` | Latvian          |
+| `lt` | `lit` | Lithuanian       |
+| `mk` | `mkd` | Macedonian       |
+| `ms` | `msa` | Malay            |
+| `mi` | `mri` | Maori            |
+| `mr` | `mar` | Marathi          |
+| `ne` | `nep` | Nepali           |
+| `no` | `nor` | Norwegian        |
+| `fa` | `fas` | Persian          |
+| `pl` | `pol` | Polish           |
+| `pt` | `por` | Portuguese       |
+| `ro` | `ron` | Romanian         |
+| `ru` | `rus` | Russian          |
+| `sr` | `srp` | Serbian          |
+| `sk` | `slk` | Slovak           |
+| `sl` | `slv` | Slovenian        |
+| `es` | `spa` | Spanish          |
+| `sw` | `swa` | Swahili          |
+| `sv` | `swe` | Swedish          |
+| `tl` | `tgl` | Tagalog          |
+| `ta` | `tam` | Tamil            |
+| `th` | `tha` | Thai             |
+| `tr` | `tur` | Turkish          |
+| `uk` | `ukr` | Ukrainian        |
+| `ur` | `urd` | Urdu             |
+| `vi` | `vie` | Vietnamese       |
+| `cy` | `cym` | Welsh            |
diff --git a/python/WhisperSpeechDetection/plugin-files/descriptor/descriptor.json b/python/WhisperSpeechDetection/plugin-files/descriptor/descriptor.json
@@ -0,0 +1,130 @@
+{
+  "componentName": "WhisperSpeechDetection",
+  "componentVersion": "7.2",
+  "middlewareVersion": "7.2",
+  "sourceLanguage": "python",
+  "batchLibrary": "WhisperSpeechDetection",
+  "environmentVariables": [],
+  "algorithm": {
+    "name": "WHISPERSPEECH",
+    "description": "Uses OpenAI's Whisper model to perform spoken language detection, speech-to-text transcription, or speech translation to English.",
+    "actionType": "DETECTION",
+    "requiresCollection": {
+      "states": []
+    },
+    "providesCollection": {
+      "states": [
+        "DETECTION",
+        "DETECTION_SPEECH",
+        "DETECTION_SPEECH_WHISPER"
+      ],
+      "properties": [
+        {
+          "name": "WHISPER_MODEL_LANG",
+          "description": "Whisper has English-only models and multilingual models. Set to 'en' for English-only models and 'multi' for multilingual models.",
+          "type": "STRING",
+          "defaultValue": "multi"
+        },
+        {
+          "name": "WHISPER_MODEL_SIZE",
+          "description": "Whisper models come in multiple sizes: 'tiny', 'base', 'small', 'medium', and 'large'. Multilingual models are available in all 5 sizes. English-only models are not available in 'large' size, but are available in the other four sizes.",
+          "type": "STRING",
+          "defaultValue": "base"
+        },
+        {
+          "name": "WHISPER_MODE",
+          "description": "Set to 'LANGUAGE_DETECTION' for spoken language detection, 'TRANSCRIPTION' for speech-to-text transcription, and 'SPEECH_TRANSLATION' for speech translation.",
+          "type": "STRING",
+          "defaultValue": "LANGUAGE_DETECTION"
+        },
+        {
+          "name": "AUDIO_LANGUAGE",
+          "description": "Optional property that indicates the language to use for audio translation or transcription. If left as an empty string, Whisper will automatically detect a single language from the first 30 seconds of audio.",
+          "type": "STRING",
+          "defaultValue": ""
+        }
+      ]
+    }
+  },
+  "actions": [
+    {
+      "name": "WHISPER SPEECH LANGUAGE DETECTION ACTION",
+      "description": "Uses OpenAI's Whisper model to perform language detection in speech.",
+      "algorithm": "WHISPERSPEECH",
+      "properties": [
+        {
+          "name": "WHISPER_MODE",
+          "value": "LANGUAGE_DETECTION"
+        }
+      ]
+    },
+    {
+      "name": "WHISPER SPEECH DETECTION ACTION",
+      "description": "Uses OpenAI's Whisper model to convert speech to text.",
+      "algorithm": "WHISPERSPEECH",
+      "properties": [
+        {
+          "name": "WHISPER_MODE",
+          "value": "TRANSCRIPTION"
+        }
+      ]
+    },
+    {
+      "name": "WHISPER SPEECH DETECTION WITH TRANSLATION ACTION",
+      "description": "Uses OpenAI's Whisper model to convert speech to text and translate it to English.",
+      "algorithm": "WHISPERSPEECH",
+      "properties": [
+        {
+          "name": "WHISPER_MODE",
+          "value": "SPEECH_TRANSLATION"
+        }
+      ]
+    }
+  ],
+  "tasks": [
+    {
+      "name": "WHISPER SPEECH LANGUAGE DETECTION TASK",
+      "description": "Uses OpenAI's Whisper model to perform language detection in speech.",
+      "actions": [
+        "WHISPER SPEECH LANGUAGE DETECTION ACTION"
+      ]
+    },
+    {
+      "name": "WHISPER SPEECH DETECTION TASK",
+      "description": "Uses OpenAI's Whisper model to convert speech to text.",
+      "actions": [
+        "WHISPER SPEECH DETECTION ACTION"
+      ]
+    },
+    {
+      "name": "WHISPER SPEECH DETECTION WITH TRANSLATION TASK",
+      "description": "Uses OpenAI's Whisper model to convert speech to text and translate it to English.",
+      "actions": [
+        "WHISPER SPEECH DETECTION WITH TRANSLATION ACTION"
+      ]
+    }
+  ],
+  "pipelines": [
+    {
+      "name": "WHISPER SPEECH LANGUAGE DETECTION PIPELINE",
+      "description": "Uses OpenAI's Whisper model to perform language detection in speech.",
+      "tasks": [
+        "WHISPER SPEECH LANGUAGE DETECTION TASK"
+      ]
+    },
+    {
+      "name": "WHISPER SPEECH DETECTION PIPELINE",
+      "description": "Uses OpenAI's Whisper model to convert speech to text.",
+      "tasks": [
+        "WHISPER SPEECH DETECTION TASK"
+      ]
+    },
+    {
+      "name": "WHISPER SPEECH DETECTION WITH TRANSLATION PIPELINE",
+      "description": "Uses OpenAI's Whisper model to convert speech to text and translate it to English.",
+      "tasks": [
+        "WHISPER SPEECH DETECTION WITH TRANSLATION TASK"
+      ]
+    }
+  ]
+}
diff --git a/python/WhisperSpeechDetection/pyproject.toml b/python/WhisperSpeechDetection/pyproject.toml
@@ -0,0 +1,29 @@
+#############################################################################
+# NOTICE                                                                    #
+#                                                                           #
+# This software (or technical data) was produced for the U.S. Government    #
+# under contract, and is subject to the Rights in Data-General Clause       #
+# 52.227-14, Alt. IV (DEC 2007).                                            #
+#                                                                           #
+# Copyright 2023 The MITRE Corporation. All Rights Reserved.                #
+#############################################################################
+
+#############################################################################
+# Copyright 2023 The MITRE Corporation                                      #
+#                                                                           #
+# Licensed under the Apache License, Version 2.0 (the "License");           #
+# you may not use this file except in compliance with the License.          #
+# You may obtain a copy of the License at                                   #
+#                                                                           #
+#    http://www.apache.org/licenses/LICENSE-2.0                             #
+#                                                                           #
+# Unless required by applicable law or agreed to in writing, software       #
+# distributed under the License is distributed on an "AS IS" BASIS,         #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+# See the License for the specific language governing permissions and       #
+# limitations under the License.                                            #
+#############################################################################
+
+# Build configuration: the package is built with setuptools, using
+# setuptools.build_meta as the build backend.
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"