From e9f80b41f024e860d1aedf3574774b64733125a1 Mon Sep 17 00:00:00 2001 From: Young Han Date: Wed, 12 Nov 2025 16:18:36 -0800 Subject: [PATCH 1/4] fix: rm model_name, executorch #15798 --- metal/whisper/run.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/metal/whisper/run.sh b/metal/whisper/run.sh index 4a4db3be..de7eb954 100755 --- a/metal/whisper/run.sh +++ b/metal/whisper/run.sh @@ -45,5 +45,4 @@ done --tokenizer_path "$ARTIFACT_DIR"/ \ --audio_path "$INPUT_AUDIO_PATH" \ --processor_path "$ARTIFACT_DIR"/whisper_preprocessor.pte \ - --model_name "large-v3-turbo" \ --temperature 0 From fed95279fbd5e437fcadb3aa8d60af7ee9fb9278 Mon Sep 17 00:00:00 2001 From: Young Han Date: Wed, 12 Nov 2025 16:21:08 -0800 Subject: [PATCH 2/4] feat: add model_name param to support various whisper models --- metal/whisper/e2e.sh | 10 +++++++++- metal/whisper/export.sh | 14 +++++++++++--- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/metal/whisper/e2e.sh b/metal/whisper/e2e.sh index 8162bf4a..9978f827 100755 --- a/metal/whisper/e2e.sh +++ b/metal/whisper/e2e.sh @@ -18,6 +18,7 @@ EXECUTORCH_PATH="" ARTIFACT_DIR="" ENV_NAME="" AUDIO_PATH="" +MODEL_NAME="openai/whisper-large-v3-turbo" echo "Current script path: $(realpath "$0")" SCRIPT_DIR="$(realpath "$(dirname "$(realpath "$0")")")" @@ -37,6 +38,7 @@ usage() { echo " --create-env Create the Python environment" echo " --setup-env Set up the Python environment" echo " --export Export the Whisper model" + echo " --model-name NAME HuggingFace model name (optional, default: openai/whisper-large-v3-turbo)" echo " --build Build the Whisper runner" echo " --audio-path PATH Path to the input audio file" echo " --run Run the Whisper model" @@ -44,6 +46,7 @@ usage() { echo "" echo "Example:" echo " $0 --env-name metal-backend --setup-env --export --build --audio-path audio.wav --run" + echo " $0 --env-name metal-backend --export --model-name openai/whisper-small --build --audio-path audio.wav --run" exit 1 } @@ -78,6 +81,10 @@ 
while [[ $# -gt 0 ]]; do EXPORT=true shift ;; + --model-name) + MODEL_NAME="$2" + shift 2 + ;; --build) BUILD=true shift @@ -160,8 +167,9 @@ fi # Execute export if [ "$EXPORT" = true ]; then echo "Exporting Whisper model to $ARTIFACT_DIR ..." + echo " - Model: $MODEL_NAME" echo " - Script: $SCRIPT_DIR/export.sh" - conda run -n "$ENV_NAME" "$SCRIPT_DIR/export.sh" "$ARTIFACT_DIR" + conda run -n "$ENV_NAME" "$SCRIPT_DIR/export.sh" "$ARTIFACT_DIR" "$MODEL_NAME" fi # Execute build diff --git a/metal/whisper/export.sh b/metal/whisper/export.sh index 3e8be16b..b1903195 100755 --- a/metal/whisper/export.sh +++ b/metal/whisper/export.sh @@ -6,17 +6,25 @@ # LICENSE file in the root directory of this source tree. ARTIFACT_DIR="$1" +MODEL_NAME="${2:-openai/whisper-large-v3-turbo}" if [ -z "$ARTIFACT_DIR" ]; then echo "Error: ARTIFACT_DIR argument not provided." - echo "Usage: $0 " + echo "Usage: $0 [MODEL_NAME]" + echo "" + echo "Arguments:" + echo " Directory to store exported model files (required)" + echo " [MODEL_NAME] HuggingFace model name (optional, default: openai/whisper-large-v3-turbo)" + echo "" + echo "Example:" + echo " $0 ~/Desktop/whisper openai/whisper-small" exit 1 fi mkdir -p "$ARTIFACT_DIR" optimum-cli export executorch \ - --model "openai/whisper-large-v3-turbo" \ + --model "$MODEL_NAME" \ --task "automatic-speech-recognition" \ --recipe "metal" \ --dtype bfloat16 \ @@ -28,7 +36,7 @@ python -m executorch.extension.audio.mel_spectrogram \ --max_audio_len 300 \ --output_file "$ARTIFACT_DIR"/whisper_preprocessor.pte -TOKENIZER_URL="https://huggingface.co/openai/whisper-large-v3-turbo/resolve/main" +TOKENIZER_URL="https://huggingface.co/$MODEL_NAME/resolve/main" curl -L $TOKENIZER_URL/tokenizer.json -o $ARTIFACT_DIR/tokenizer.json curl -L $TOKENIZER_URL/tokenizer_config.json -o $ARTIFACT_DIR/tokenizer_config.json From 88401de95fbfa17716819ad5b1c1f380c82dee75 Mon Sep 17 00:00:00 2001 From: Young Han Date: Wed, 12 Nov 2025 16:21:49 -0800 Subject: [PATCH 3/4] 
feat: support standard model's FEATURE_SIZE with 80 --- metal/whisper/export.sh | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/metal/whisper/export.sh b/metal/whisper/export.sh index b1903195..45aee964 100755 --- a/metal/whisper/export.sh +++ b/metal/whisper/export.sh @@ -23,6 +23,18 @@ fi mkdir -p "$ARTIFACT_DIR" +echo "Exporting model: $MODEL_NAME" + +# Determine feature_size based on model name +# large-v3 and large-v3-turbo use 128 mel features, all others use 80 +if [[ "$MODEL_NAME" == *"large-v3"* ]]; then + FEATURE_SIZE=128 + echo "Using feature_size=128 for large-v3/large-v3-turbo model" +else + FEATURE_SIZE=80 + echo "Using feature_size=80 for standard Whisper model" +fi + optimum-cli export executorch \ --model "$MODEL_NAME" \ --task "automatic-speech-recognition" \ @@ -31,7 +43,7 @@ optimum-cli export executorch \ --output_dir "$ARTIFACT_DIR" python -m executorch.extension.audio.mel_spectrogram \ - --feature_size 128 \ + --feature_size $FEATURE_SIZE \ --stack_output \ --max_audio_len 300 \ --output_file "$ARTIFACT_DIR"/whisper_preprocessor.pte From 968c34c40a27504dd5d2f24a96e61e6c6cff8eb9 Mon Sep 17 00:00:00 2001 From: Young Han Date: Wed, 12 Nov 2025 16:22:59 -0800 Subject: [PATCH 4/4] docs: update model_name requirement, different FEATURE_SIZE and various model support --- metal/whisper/README.md | 101 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 91 insertions(+), 10 deletions(-) diff --git a/metal/whisper/README.md b/metal/whisper/README.md index 7d964a87..63172a66 100644 --- a/metal/whisper/README.md +++ b/metal/whisper/README.md @@ -21,6 +21,7 @@ The Metal backend scripts are located under `metal/` ## Whisper Example The Whisper example demonstrates how to: + 1. Set up your environment 2. Export the Whisper model to ExecuTorch format 3. 
Build the Whisper Metal runner @@ -37,7 +38,8 @@ metal/whisper/e2e.sh \ --setup-env \ --export \ --build \ - --run --audio-path <audio-path> + --run --audio-path <audio-path> \ + [--model-name <model-name>] ``` **Required Arguments:** @@ -46,7 +48,13 @@ metal/whisper/e2e.sh \ - `<env-name>` - Name of the conda environment to create (e.g., `whisper-example`) - `<audio-path>` - Path to your audio file for inference (e.g., `~/Desktop/audio.wav`) -**Example:** +**Optional Arguments:** + +- `<model-name>` - HuggingFace Whisper model name (default: `openai/whisper-large-v3-turbo`) + - Available models: `openai/whisper-tiny`, `openai/whisper-base`, `openai/whisper-small`, + `openai/whisper-medium`, `openai/whisper-large`, `openai/whisper-large-v3`, `openai/whisper-large-v3-turbo` + +**Example (default large-v3-turbo model):** ```bash metal/whisper/e2e.sh \ @@ -60,11 +68,29 @@ metal/whisper/e2e.sh \ --run --audio-path ~/Desktop/audio.wav ``` +**Example (using small model):** + +```bash +metal/whisper/e2e.sh \ + --artifact-dir ~/Desktop/whisper \ + --env-name whisper-example \ + --clone-et \ + --create-env \ + --setup-env \ + --export \ + --model-name openai/whisper-small \ + --build \ + --run --audio-path ~/Desktop/audio.wav +``` + This will automatically: + 1. Setup the environment: - - Clone the executorch repo - - Create a conda environment named `whisper-example` - - Install all dependencies + +- Clone the executorch repo +- Create a conda environment named `whisper-example` +- Install all dependencies + 2. Export the Whisper model to the `~/Desktop/whisper` directory 3. Build the Whisper Metal runner 4.
Run inference on `~/Desktop/whisper/audio.wav` @@ -86,14 +112,29 @@ conda activate <env-name> #### Step 2: Export the Model ```bash -/path/to/metal/whisper/export.sh <artifact-dir> +/path/to/metal/whisper/export.sh <artifact-dir> [model_name] ``` **Arguments:** -- `<artifact-dir>` - Directory to store exported model files (e.g., `~/Desktop/whisper`) + +- `<artifact-dir>` - Directory to store exported model files (required, e.g., `~/Desktop/whisper`) +- `[model_name]` - HuggingFace Whisper model name (optional, default: `openai/whisper-large-v3-turbo`) + - Available models: `openai/whisper-tiny`, `openai/whisper-base`, `openai/whisper-small`, + `openai/whisper-medium`, `openai/whisper-large`, `openai/whisper-large-v3`, `openai/whisper-large-v3-turbo` + +**Examples:** + +```bash +# Export default large-v3-turbo model +/path/to/metal/whisper/export.sh ~/Desktop/whisper + +# Export small model +/path/to/metal/whisper/export.sh ~/Desktop/whisper openai/whisper-small +``` This will: -- Download the Whisper model + +- Download the specified Whisper model from HuggingFace - Export it to ExecuTorch format with Metal optimizations - Save model files (`.pte`), metadata, and preprocessor to the specified directory @@ -110,11 +151,51 @@ This will: ``` **Arguments:** -- `<audio-path>` - Path to your audio file (e.g., `/path/to/audio.wav`) -- `<artifact-dir>` - Directory containing exported model files (e.g., `~/Desktop/whisper`) + +- `<audio-path>` - Path to your audio file (required, e.g., `/path/to/audio.wav`) +- `<artifact-dir>` - Directory containing exported model files (required, e.g., `~/Desktop/whisper`) + +**Example:** + +```bash +/path/to/metal/whisper/run.sh ~/Desktop/audio.wav ~/Desktop/whisper +``` This will: + - Validate that all required model files exist - Load the model and preprocessor - Run inference on the provided audio - Display timing information + +## Available Whisper Models + +The following Whisper models are supported: + +| Model Name | HuggingFace ID (for export) | Parameters | Mel Features | Relative Speed | Use Case | +| -------------- | ------------------------------- | ---------- | 
------------ | -------------- | ------------------------------ | +| Tiny | `openai/whisper-tiny` | 39M | 80 | Fastest | Quick transcription, real-time | +| Base | `openai/whisper-base` | 74M | 80 | Very Fast | Good balance for real-time | +| Small | `openai/whisper-small` | 244M | 80 | Fast | Recommended for most use cases | +| Medium | `openai/whisper-medium` | 769M | 80 | Moderate | Higher accuracy needed | +| Large | `openai/whisper-large` | 1550M | 80 | Slower | Best accuracy | +| Large V3 | `openai/whisper-large-v3` | 1550M | **128** | Slower | Latest architecture | +| Large V3 Turbo | `openai/whisper-large-v3-turbo` | 809M | **128** | Fast | Default, good balance | + +### Mel Features Configuration + +The export script (`export.sh`) automatically selects the mel feature size from the model name: any name containing `large-v3` is exported with 128 mel features, and every other name with 80: + +- **80 mel features**: Used by all standard models (tiny, base, small, medium, large, large-v2) +- **128 mel features**: Used only by large-v3 and large-v3-turbo variants + +**Important:** The preprocessor must match the model's expected feature size, or you'll encounter tensor shape mismatch errors. `export.sh` handles this automatically for the models listed above; because the choice is made purely from the model name, a custom or fine-tuned checkpoint based on large-v3 must keep `large-v3` in its name to get 128 mel features. + +### Tokenizer Configuration + +**Important Note:** All multilingual Whisper models listed above, as downloaded from HuggingFace, now use the updated tokenizer format where: + +- Token `50257` = `<|endoftext|>` +- Token `50258` = `<|startoftranscript|>` (used as `decoder_start_token_id`) + +The whisper_runner automatically uses `decoder_start_token_id=50258` for all models, so you don't need to worry about tokenizer compatibility when exporting and running any of the Whisper variants listed above. English-only (`.en`) checkpoints use a different vocabulary with different special-token IDs and are not covered by this guarantee.