# Transcribe 2 hours of audio in less than 2 minutes with Whisper


This tutorial demonstrates how to use the `pruna` package to optimize any custom Whisper model. In this case, the smash function wraps the model into an efficient pipeline, which will transcribe 2 hours of audio in under 2 minutes on an A100 GPU. We will use the `openai/whisper-large-v3` model as an example.

In [2]:
# if you are not running the latest version of this tutorial, make sure to install the matching version of pruna
# the following command will install the latest version of pruna

%pip install pruna

Collecting pruna
  Downloading pruna-0.2.9-py3-none-any.whl.metadata (29 kB)
Collecting aenum (from pruna)
  Downloading aenum-3.1.16-py3-none-any.whl.metadata (3.8 kB)
Collecting bitsandbytes (from pruna)
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting codecarbon (from pruna)
  Downloading codecarbon-3.0.4-py3-none-any.whl.metadata (11 kB)
Collecting colorama (from pruna)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting configspace>=1.2.1 (from pruna)
  Downloading configspace-1.2.1.tar.gz (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.0/131.0 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting ctranslate2==4.6.0 (from pruna)
  Downloading ctranslate2-4.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2

In [1]:
pip show pruna

[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/hqq_aten-0.0.0-py3.12-linux-x86_64.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0mName: pruna
Version: 0.2.9
Summary: Smash your AI models
Home-page: 
Author: 
Author-email: Pruna AI <hello@pruna.ai>
License: Copyright 2025 - Pruna AI GmbH. All rights reserved.

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is gr

### 1. Loading the ASR model

First, load your ASR model.

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq

# Use the GPU with half precision when CUDA is present, otherwise the CPU with full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id_whisper = "openai/whisper-large-v3"

model_whisper = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id_whisper,
    torch_dtype=torch_dtype,
    use_safetensors=True,
    low_cpu_mem_usage=True,
)
# Last expression of the cell: moves the model to the chosen device and displays its module tree.
model_whisper.to(device)

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bia

In [27]:
import torch
from transformers import AutoModelForSpeechSeq2Seq

# Probe CUDA once and derive both the target device and the weight dtype from it.
cuda_available = torch.cuda.is_available()
device = "cuda" if cuda_available else "cpu"
torch_dtype = torch.float16 if cuda_available else torch.float32

model_id = "unsloth/whisper-large-v3-turbo"

# Keyword arguments forwarded to from_pretrained.
load_options = dict(
    torch_dtype=torch_dtype,
    use_safetensors=True,
    low_cpu_mem_usage=True,
)
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, **load_options)
# Last expression of the cell: moves the model to the chosen device and displays its module tree.
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bia

### 2. Initializing the Smash Config

Next, initialize the smash_config. Since the compiler requires a processor, we add it to the smash_config.

In [39]:
from pruna import SmashConfig
import os
from transformers import AutoTokenizer, AutoProcessor
import numpy as np  # NOTE(review): np is not referenced in this cell

# Local directory that will hold a complete set of tokenizer/processor files
complete_model_path = "./unsloth-whisper-complete"
os.makedirs(complete_model_path, exist_ok=True)

# Download tokenizer and processor from OpenAI (has complete files including tokenizer.json)
tokenizer = AutoTokenizer.from_pretrained("openai/whisper-large-v3-turbo")
processor = AutoProcessor.from_pretrained("openai/whisper-large-v3-turbo")

# Save the tokenizer and processor files locally so the SmashConfig can load them from one path
tokenizer.save_pretrained(complete_model_path)
processor.save_pretrained(complete_model_path)


# Initialize the SmashConfig
smash_config = SmashConfig(
    batch_size=1,         # one audio input per batch
    device="cuda"          # run on the GPU (the value is "cuda", not "cpu")
)
# Register the locally saved tokenizer and processor with the config
smash_config.add_tokenizer(complete_model_path)
smash_config.add_processor(complete_model_path)
# Select the compiler and batcher algorithms.
# NOTE(review): the prefixed keys below ('c_whisper_*', 'whisper_s2t_*') presumably configure
# the algorithms chosen here, so keep the selection before them.
smash_config['compiler'] = 'c_whisper'
smash_config['batcher'] = 'whisper_s2t'
smash_config['c_whisper_weight_bits'] = 8
smash_config['whisper_s2t_int8'] = True

In [40]:
from pruna import smash

# Apply the algorithms selected in smash_config to the loaded model.
smashed_model = smash(model=model, smash_config=smash_config)

INFO - Starting compiler c_whisper...
INFO - compiler c_whisper was applied successfully.
INFO - Starting batcher whisper_s2t...
INFO - Preparing model for inference with batch size 1...
INFO - batcher whisper_s2t was applied successfully.
  return datetime.utcnow().replace(tzinfo=utc)


In [25]:
# NOTE: this configuration does not work with the unsloth model

from pruna import SmashConfig
import os
from transformers import AutoTokenizer, AutoProcessor

# Create directory with complete tokenizer files
complete_model_path = "./unsloth-whisper-complete"
os.makedirs(complete_model_path, exist_ok=True)

# Download tokenizer and processor from OpenAI (has complete files including tokenizer.json)
tokenizer = AutoTokenizer.from_pretrained("openai/whisper-large-v3-turbo")
processor = AutoProcessor.from_pretrained("openai/whisper-large-v3-turbo")

# Save the tokenizer and processor files locally
tokenizer.save_pretrained(complete_model_path)
processor.save_pretrained(complete_model_path)


# Initialize the SmashConfig; no device is passed, so pruna logs the device it selects
smash_config = SmashConfig()
smash_config.add_tokenizer(complete_model_path)
smash_config.add_processor(complete_model_path)
smash_config['compiler'] = 'c_translate'
smash_config['batcher'] = 'whisper_s2t'
smash_config['c_translate_weight_bits'] = 8

# NOTE: 8-bit weights are already requested above through 'c_translate_weight_bits'.
# The commented-out key below is the one used together with the 'c_whisper' compiler,
# which is not the compiler selected in this cell.
# smash_config['c_whisper_weight_bits'] = 8

INFO - Using best available device: 'cuda'


### 3. Smashing the Model

Now, smash the model. This will take approximately 2 minutes on a T4 GPU.

In [26]:
from pruna import smash

# Smash the plain transformers model loaded in step 1.
# `m_model` must not be passed here: it is an already-smashed PrunaModel that is only
# defined in a later cell, and the recorded run of this cell with `m_model` ended in
# "AttributeError: 'WhisperModelCT2' object has no attribute 'config'".
smashed_model = smash(
    model=model,
    smash_config=smash_config,
)

  return datetime.utcnow().replace(tzinfo=utc)


AttributeError: 'WhisperModelCT2' object has no attribute 'config'

In [23]:
from pruna import PrunaModel

# Load an already-smashed checkpoint from the Hub, pinned to a single commit.
smashed_repo_id = "manohar03/unsloth-whisper-large-v3-turbo-pruna-smashed"
smashed_revision = "6820cb19a08c7fab47805ccb698208c5b998a3d4"  # commit hash

m_model = PrunaModel.from_pretrained(smashed_repo_id, revision=smashed_revision)


Fetching 20 files:   0%|          | 0/20 [00:00<?, ?it/s]

INFO - Using best available device: 'cuda'
INFO - Starting compiler c_whisper...
INFO - compiler c_whisper was applied successfully.
INFO - Starting batcher whisper_s2t...
INFO - Preparing model for inference with batch size 1...
INFO - batcher whisper_s2t was applied successfully.
  return datetime.utcnow().replace(tzinfo=utc)


### 4. Preparing the Input

In [30]:
import requests

audio_url = "https://huggingface.co/datasets/reach-vb/random-audios/resolve/main/sam_altman_lex_podcast_367.flac"
audio_sample = 'sam_altman_lex_podcast_367.flac'

# The timeout bounds the connect and per-read waits so a stalled server cannot hang the cell.
response = requests.get(audio_url, timeout=60)
# Fail loudly on 4xx/5xx so an HTML error page is never written to disk as a .flac file.
response.raise_for_status()

# Save the content to the specified file
with open(audio_sample, 'wb') as f:
    f.write(response.content)

### 5. Running the Model

Finally, run the model to transcribe the audio file. Make sure you have `ffmpeg` installed.

In [41]:
# Transcribe the downloaded audio file; as the last expression of the cell,
# the returned transcript is displayed as the cell output
smashed_model(audio_sample)

Transcribing: 100%|██████████| 100/100 [02:44<00:00,  1.64s/it]


"we have been a misunderstood and badly mocked org for a long time like when we started we like announced the org at the end of 2015 and said we're going to work on agi like people thought we were batshit insane yeah you know like i i remember at the time a eminent ai scientist at a large industrial AI lab was DMing individual reporters being like, these people aren't very good and it's ridiculous to talk about AGI and I can't believe you're giving them time of day. And it's like, that was the level of pettiness and rancor in the field at a new group of people saying, we're going to try to build AGI. So OpenAI and DeepMind was a small collection of folks who were brave enough to talk about AGI in the face of mockery. We don't get mocked as much now. Don't get mocked as much now. The following is a conversation with Sam Altman, CEO of OpenAI, the company behind GPT-4, JAD-GPT, DALI, Codex, and many other AI technologies, which both individually and together constitute some of the greate

In [38]:
# Debug inspection of the smashed model object.
# NOTE: a notebook cell only displays the value of its last expression,
# so the dir() result on the next line is computed and then discarded.
dir(smashed_model)
# Instance attributes: the wrapped model, its smash_config and the inference handler
vars(smashed_model)

{'model': <pruna.algorithms.batching.ws2t.WhisperS2TWrapper at 0x7c6951923650>,
 'smash_config': SmashConfig(
   'batcher': 'whisper_s2t',
   'compiler': 'c_whisper',
   'c_whisper_weight_bits': 8,
   'whisper_s2t_int8': False,
 ),
 'inference_handler': <pruna.engine.handler.handler_standard.StandardHandler at 0x7c6951920320>}

### Wrap Up

Congratulations! You have successfully smashed an ASR model. You can now use the `pruna` package to optimize any custom ASR model. The only parts that you should modify are steps 1, 4, and 5 to fit your use case.

In [42]:
from huggingface_hub import HfApi
import json
import os

# Define repo name
repo_name = "manohar03/unsloth-whisper-large-v3-turbo-pruna-8bit"

# Create save directory
save_path = "./my-smashed-whisper"
os.makedirs(save_path, exist_ok=True)

# Save the original (un-smashed) model together with the tokenizer and processor files.
# The Pruna optimizations are re-applied by the consumer from smash_config.json (see README Option 2).
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
processor.save_pretrained(save_path)

# Save the SmashConfig for reproducibility
config_dict = dict(smash_config)
with open(f"{save_path}/smash_config.json", "w", encoding="utf-8") as f:
    json.dump(config_dict, f, indent=2)

# Create a README with model details.
# This is an f-string: every repository reference goes through {repo_name} so the
# usage snippets always match the repository that is actually uploaded.
readme_content = f"""---
license: apache-2.0
base_model: openai/whisper-large-v3-turbo
tags:
- whisper
- speech-to-text
- pruna
- quantized
- 8bit
- optimized
library_name: transformers
---

# Unsloth Whisper Large V3 Turbo - Pruna 8bit Optimized

This model is a Pruna-optimized version of `openai/whisper-large-v3-turbo` with 8-bit quantization optimizations.

## Optimizations Applied
- **Batcher Optimization**: int8 enabled (`whisper_s2t_int8: True`)
- **Compiler**: `c_whisper`
- **Batcher**: `whisper_s2t`

## Usage

### Option 1: Standard Transformers (Recommended for most users)

```python
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

# Simple loading - no Pruna installation required
model = AutoModelForSpeechSeq2Seq.from_pretrained("{repo_name}")
processor = AutoProcessor.from_pretrained("{repo_name}")

# Use normally
result = model.generate(inputs, ...)
```

### Option 2: With Pruna Optimization (Maximum Performance)

```python
from huggingface_hub import hf_hub_download
from pruna import smash, SmashConfig
from transformers import AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoProcessor
import json

# Load model and tokenizer
model = AutoModelForSpeechSeq2Seq.from_pretrained("{repo_name}")
tokenizer = AutoTokenizer.from_pretrained("{repo_name}")
processor = AutoProcessor.from_pretrained("{repo_name}")

# Load SmashConfig (downloaded from this repository)
config_path = hf_hub_download(repo_id="{repo_name}", filename="smash_config.json")
with open(config_path, "r") as f:
    config_dict = json.load(f)

# Recreate SmashConfig
smash_config = SmashConfig()
for key, value in config_dict.items():
    smash_config[key] = value

# Apply Pruna optimizations
smashed_model = smash(
    model=model,
    smash_config=smash_config
)

# Use the optimized model
result = smashed_model(audio_input)
```

## Performance Benefits

- Reduced memory usage from 8-bit weight quantization
- Optimized inference pipeline with int8 batcher
- Maintained audio transcription quality

## Base Model

This model is based on `unsloth/whisper-large-v3-turbo`, which itself is optimized from `openai/whisper-large-v3-turbo`. It retains all the capabilities of both base models while providing additional Pruna performance improvements.
"""

with open(f"{save_path}/README.md", "w", encoding="utf-8") as f:
    f.write(readme_content)

# Upload to HuggingFace
print("Uploading to HuggingFace...")
api = HfApi()

# Create the repository if it doesn't exist. exist_ok=True tolerates only the
# "already exists" case; authentication or network failures still raise instead
# of being swallowed and followed by a doomed upload.
api.create_repo(repo_id=repo_name, repo_type="model", private=False, exist_ok=True)
print(f"Repository ready: {repo_name}")

# Upload all files
api.upload_folder(
    folder_path=save_path,
    repo_id=repo_name,
    repo_type="model",
    commit_message="Upload Pruna-optimized 8-bit Whisper model"
)

print(f"✅ Successfully uploaded model to: https://huggingface.co/{repo_name}")
print("\nFiles uploaded:")
print("- Model weights and config")
print("- Tokenizer files")
print("- Processor config")
print("- SmashConfig (smash_config.json)")
print("- README with usage instructions")

  return datetime.utcnow().replace(tzinfo=utc)


Uploading to HuggingFace...
Created repository: manohar03/unsloth-whisper-large-v3-turbo-pruna-8bit


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...y-smashed-whisper/model.safetensors:   3%|2         | 41.9MB / 1.62GB            



✅ Successfully uploaded model to: https://huggingface.co/manohar03/unsloth-whisper-large-v3-turbo-pruna-8bit

Files uploaded:
- Model weights and config
- Tokenizer files
- Processor config
- SmashConfig (smash_config.json)
- README with usage instructions
