# Updating Whisper's default vocabulary

This notebook removes certain tokens from the default Whisper vocabulary.

**Goal**: Prevent `whisper-tiny` from occasionally outputting English profanity when given noisy input with no speech.

In [1]:
!pip install --upgrade pip
# jiwer is used for the word error rate (WER) metric
!pip install --upgrade datasets[audio] transformers evaluate jiwer



In [2]:
import wandb
# Initialise Weights & Biases in 'disabled' mode for this session.
# See https://discuss.huggingface.co/t/how-to-turn-wandb-off-in-trainer/6237/10
wandb.init(mode='disabled')

In [3]:
from pathlib import Path
# Absolute path of the checkpoint directory.
# NOTE(review): no later cell visible here reads `checkpoint_path` -- confirm it is still needed.
checkpoint_path = Path('./whisper/checkpoints').resolve()

In [4]:
import shutil


## Create the feature extractor and tokenizer

We'll be fine-tuning the `openai/whisper-tiny` model. Here, the feature extractor and tokenizer for this model are fetched from Hugging Face:

In [5]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer

# Model ID that the feature extractor, tokenizer and (later) the model weights are loaded from.
finetune_from_id = 'openai/whisper-tiny'
feature_extractor = WhisperFeatureExtractor.from_pretrained(finetune_from_id,  task='transcribe')
# Kept under the name `tokenizer_original`; a customized `tokenizer` is built from it further down.
tokenizer_original = WhisperTokenizer.from_pretrained(finetune_from_id, task='transcribe')

We'll create a customized tokenizer based on `tokenizer_original` in the next section.

## Vocabulary adjustments

Next, we remove several unwanted tokens from the vocabulary:

In [6]:
# Step 1: Save the vocabulary to a file
# save_pretrained writes vocab.json and merges.txt (among other files) into
# `tokenizer_directory`; the following cells read and edit those two files.
tokenizer_directory = Path('whisper-default-tokenizer')
tokenizer_original.save_pretrained(tokenizer_directory)


('whisper-default-tokenizer/tokenizer_config.json',
 'whisper-default-tokenizer/special_tokens_map.json',
 'whisper-default-tokenizer/vocab.json',
 'whisper-default-tokenizer/merges.txt',
 'whisper-default-tokenizer/normalizer.json',
 'whisper-default-tokenizer/added_tokens.json')

Now that the tokenizer is saved in `tokenizer_directory`, we can load `tokenizer_directory/vocab.json` and modify it:

In [7]:
# Step 2: Get vocab.json
import json

def json_from_path(path: Path):
    """ Reads the UTF-8 encoded file at `path` and returns its parsed JSON content. """
    with open(path, 'r', encoding='utf-8') as json_file:
        parsed = json.load(json_file)
    return parsed

# `vocab` maps each token's text to its integer token ID.
vocab = json_from_path(tokenizer_directory / 'vocab.json')

In [8]:
import re
# One or more consecutive separator characters: space, tab and common punctuation.
# Newlines are not separators, so a trailing "\n" stays attached to the last piece.
NONWORD_REGEX = re.compile(r'[ \t?.,;!()/\-«»]+')
def split_by_word(text: str):
    """
    Breaks `text` apart at every run of separator characters and returns the
    resulting pieces as a list. A separator at the very start or end of `text`
    produces an empty string at that end of the list.
    """
    pieces = NONWORD_REGEX.split(text)
    return pieces


# Tokens in vocab.json that begin a new word are prefixed with this character
word_start_char = 'Ġ'

In [9]:
# Step 3: Replace!
new_vocab = {}

# Maps a token ID to the text that should replace that token's original text.
# Token IDs can be found by inspecting the original vocab.json
token_id_remappings = {
    22676: word_start_char + "BS",
    19186: "s***",
    30748: word_start_char + "s*****",
    4611: word_start_char + "s***",
    19593: word_start_char + "S***",
    10965: word_start_char + "F***",
    26154: word_start_char + "F!!!",
    33342: word_start_char + "F******",
    # FIXME: ID 47069 was listed twice: here as "f***(0)" and again below as
    # "m****(1)". Python keeps only the last value of a duplicated key, so this
    # entry never had any effect. One of the two IDs is presumably a typo: look
    # up the intended ID in vocab.json, then re-enable this line.
    # 47069: word_start_char + "f***(0)",
    3275: word_start_char + "f***(1)",
    22518: word_start_char + "f***(2)",
    20022: word_start_char + "f***(3)",
    5546: word_start_char + "f******",
    47069: word_start_char + "m****(1)",
    29537: word_start_char + "m****(2)",
    11960: word_start_char + "b****",
    42094: word_start_char + "b******",
    40678: word_start_char + "B****"
}

# Original texts of the tokens that were renamed; used below to filter merges.txt.
replaced_keys = set()

for key, token_id in vocab.items():
    new_key = token_id_remappings.get(token_id)
    if new_key is None:
        new_vocab[key] = token_id
    else:
        new_vocab[new_key] = token_id
        replaced_keys.add(key)

# Every token ID must survive the rename. If a replacement text is identical to
# the text of another token, one dictionary entry overwrites the other and a
# token ID silently disappears from the vocabulary.
if len(new_vocab) != len(vocab):
    raise ValueError(
        f'{len(vocab) - len(new_vocab)} token(s) were lost while renaming: '
        'a replacement text in token_id_remappings collides with the text of another token.'
    )

# A remapped ID that does not exist in vocab.json is most likely a typo.
missing_token_ids = sorted(set(token_id_remappings) - set(vocab.values()))
if missing_token_ids:
    print('Warning: these token IDs were not found in vocab.json:', missing_token_ids)

new_merges = []
with open(tokenizer_directory / 'merges.txt', 'r', encoding='utf-8') as merges:
    for line in merges:
        stripped_line = line.strip()
        # Lines read from the file still end in "\n", so blank lines are
        # detected on the stripped text.
        if not stripped_line:
            continue
        # Drop merge rules whose first piece is a token that was renamed.
        # NOTE(review): rules that *produce* a renamed token are kept -- confirm
        # that this is intended.
        words = split_by_word(line)
        if words[0] not in replaced_keys:
            new_merges.append(stripped_line)

To find other token IDs that may need replacing (keeping in mind that the model's output should still be multilingual), we could do something like this:
```python
!pip install better_profanity==0.7.0

from better_profanity import profanity

profanity.load_censor_words()
for key in new_vocab:
    word = key
    if key.startswith(word_start_char):
        word = key[1:]
    if profanity.contains_profanity(word):
        print("Consider replacing", key, new_vocab[key])
```

Great! We now have an updated vocabulary (`new_vocab`) and an updated list of merge rules (`new_merges`). Next, write both to disk:

In [10]:
# Write to a file
# Start from a fresh copy of the original tokenizer directory, then overwrite
# vocab.json and merges.txt with the edited versions.
tokenizer_fr_directory = Path('updated-tokenizer')
if tokenizer_fr_directory.exists():
    shutil.rmtree(tokenizer_fr_directory)
shutil.copytree(tokenizer_directory, tokenizer_fr_directory)
with open(tokenizer_fr_directory / 'vocab.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII token text (e.g. 'Ġ') readable in the file
    json.dump(new_vocab, f, ensure_ascii=False)


# The file must end with a newline: the slow WhisperTokenizer splits merges.txt
# on "\n" and discards the last element (see tokenization_whisper.py in
# transformers), so without the trailing newline the final merge rule is dropped.
with open(tokenizer_fr_directory / 'merges.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(new_merges) + '\n')

In [11]:
from transformers import WhisperTokenizer

# Use a normal WhisperTokenizer -- WhisperTokenizerFast has trouble with the updated
# vocabulary.
# The tokenizer is built directly from the edited vocab.json / merges.txt (and the
# copied normalizer.json) in `tokenizer_fr_directory`.
tokenizer = WhisperTokenizer(
    tokenizer_fr_directory / 'vocab.json',
    tokenizer_fr_directory / 'merges.txt',
    tokenizer_fr_directory / 'normalizer.json',
    bos_token='<|startoftranscript|>',
    unk_token='',
    pad_token='<|endoftext|>',
)

# Copy the special tokens of the original tokenizer into the new one. This call
# is the last expression of the cell, so its return value is what the cell displays.
# See https://discuss.huggingface.co/t/fine-tuning-whisper-on-my-own-dataset-with-a-customized-tokenizer/25903
tokenizer.add_special_tokens(tokenizer_original.special_tokens_map)

105

In [12]:
# For debugging, update the output directory:
# delete the hand-edited copy, then re-create the directory from the in-memory
# `tokenizer` so the files on disk can be inspected.
shutil.rmtree(tokenizer_fr_directory)
tokenizer.save_pretrained(tokenizer_fr_directory)

('updated-tokenizer/tokenizer_config.json',
 'updated-tokenizer/special_tokens_map.json',
 'updated-tokenizer/vocab.json',
 'updated-tokenizer/merges.txt',
 'updated-tokenizer/normalizer.json',
 'updated-tokenizer/added_tokens.json')

## Create the processor

Next, load the `WhisperProcessor`, which combines a feature extractor and tokenizer.

In [13]:
from transformers import WhisperProcessor

# Pair the unmodified feature extractor with the customized tokenizer.
processor = WhisperProcessor(feature_extractor, tokenizer)

Next, build the model:

In [14]:
from transformers import WhisperForConditionalGeneration

# Load the pretrained weights from the same model ID the tokenizer was loaded from.
model = WhisperForConditionalGeneration.from_pretrained(finetune_from_id)
# Clear `forced_decoder_ids` on the model's generation config.
model.generation_config.forced_decoder_ids = None


In [15]:
from dataclasses import dataclass
from typing import Any
import torch
# See the linked blog post and https://huggingface.co/docs/transformers/main_classes/data_collator

@dataclass
class DataCollatorWithPadding:
    '''
    Turns a list of examples into a single padded batch for the model.

    Each example is read through its 'input_features' and 'labels' entries.
    The returned batch is whatever the feature extractor's `pad` returns, with
    a 'labels' entry added.
    '''
    # Object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`
    processor: Any
    # Token ID that is stripped from the front of the labels when every row starts with it
    decoder_start_token_id: int

    def __call__(self, features: list) -> dict[str, torch.Tensor]:
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({'input_features': example['input_features']})
            label_items.append({'input_ids': example['labels']})

        # Inputs and labels are padded independently of each other (the
        # original comment attributes this to their different final lengths)
        # and are only joined at the end.
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors='pt')
        padded_labels = self.processor.tokenizer.pad(label_items, return_tensors='pt')

        # Positions outside the attention mask are padding; transformers uses
        # -100 to mark label positions that should be masked.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels['input_ids'].masked_fill(is_padding, -100)

        # Avoid double-prepending the beginning-of-sequence token: drop the
        # first column only when every row starts with it.
        every_row_has_start_token = (labels[:, 0] == self.decoder_start_token_id).all()
        if every_row_has_start_token.cpu().item():
            labels = labels[:, 1:]

        batch['labels'] = labels
        return batch

# The start token ID comes from the model config, so the collator strips the same token the model uses.
data_collator = DataCollatorWithPadding(processor=processor, decoder_start_token_id=model.config.decoder_start_token_id)

In [16]:
# Save the model and the customized tokenizer side by side in one directory;
# the conversion step below is pointed at this directory.
model_output_dir = Path('./final-model').resolve()
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)



('/content/final-model/tokenizer_config.json',
 '/content/final-model/special_tokens_map.json',
 '/content/final-model/vocab.json',
 '/content/final-model/merges.txt',
 '/content/final-model/normalizer.json',
 '/content/final-model/added_tokens.json')

# Model conversion

Next, we need to convert the model into a format usable by Joplin. This next step converts the model from PyTorch to GGML.

In [17]:
!git clone https://github.com/openai/whisper whisper-github
!git clone https://github.com/ggerganov/whisper.cpp
!cd whisper.cpp && git checkout v1.7.4

fatal: destination path 'whisper-github' already exists and is not an empty directory.
fatal: destination path 'whisper.cpp' already exists and is not an empty directory.
M	models/convert-h5-to-ggml.py
HEAD is now at 8a9ad78 release : v1.7.4


In [18]:
# Patch convert-h5-to-ggml to work with more recent model versions
conversion_script_path = Path('whisper.cpp/models/convert-h5-to-ggml.py')
bad_if_statement = 'if "max_length" not in hparams:'
replaced_if_statement = 'if "max_length" not in hparams or hparams["max_length"] == None:'

# Read and compute the new content *before* opening the file for writing, so a
# failure cannot leave a truncated script behind. The encoding is explicit so
# the result does not depend on the platform's default locale.
conversion_script_content = conversion_script_path.read_text(encoding='utf-8')
if bad_if_statement in conversion_script_content:
    conversion_script_path.write_text(
        conversion_script_content.replace(bad_if_statement, replaced_if_statement),
        encoding='utf-8',
    )
elif replaced_if_statement not in conversion_script_content:
    # Neither the original nor the patched line is present: the script has
    # changed, and silently doing nothing would hide that.
    print(f'Warning: could not find the line to patch in {conversion_script_path}')

In [19]:
!mkdir ./ggml
!python whisper.cpp/models/convert-h5-to-ggml.py ./final-model ./whisper-github ./ggml
!mv ./ggml/ggml-model.bin ./ggml/ggml-clean.bin

mkdir: cannot create directory ‘./ggml’: File exists
2025-02-26 16:43:14.799406: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740588194.849513   14409 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740588194.864821   14409 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
model.encoder.conv1.weight  ->  encoder.conv1.weight
encoder.conv1.weight 3 (384, 80, 3)
model.encoder.conv1.bias  ->  encoder.conv1.bias
  Reshaped variable:  encoder.conv1.bias  to shape:  (384, 1)
encoder.conv1.bias 2 (384, 1)
  Converting to float32
model.encoder.conv2.weight  ->  encoder.conv2.weight
encoder.conv2.weight 3 (384, 384, 3)
model.encoder.conv2.bias  ->  encoder.co

For smaller size and better performance, we can also quantize the GGML model:

In [20]:
# Configure and build whisper.cpp; this produces the `quantize` and `whisper-cli`
# binaries under whisper.cpp/build/bin that the following commands use.
!cd whisper.cpp && cmake -B build && cmake --build build --config Release
# Arguments: input model, output model, quantization type
!./whisper.cpp/build/bin/quantize ./ggml/ggml-clean.bin ./ggml/ggml-clean-q8_0.bin q8_0

  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value.  Or, use the <min>...<max> syntax
  to tell CMake that the project requires at least <min> but has been updated
  to work with policies introduced by <max> or earlier.

[0m
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- Including CPU backend
-- x86 detected
-- Adding CPU backend variant ggml-cpu: -march=native 
-- Configuring done (0.1s)
-- Generating done (0.0s)
-- Build files have been written to: /content/whisper.cpp/build
[ 17%] Built target ggml-base
[ 40%] Built target ggml-cpu
[ 45%] Built target ggml
[ 50%] Built target whisper
[ 60%] Built target common
[ 65%] Built target whisper-cli
[ 70%] Built target whisper-bench
[ 75%] Built target whisper-server
[ 80%] Built target quantize
[ 85%] Built target main
[ 90%] Built target bench
[ 95%] Built target stream
[100%] Built target command
whisper_model_quantize: loading model from './ggml/ggml-clean.bin'
whisper_m

Now, let's make sure that the GGML model works. Start by downloading some test audio:

In [21]:
!mkdir ./test-audio
# Download the first chapter of Alice in Wonderland (in French)
!wget -P ./test-audio/ https://www.archive.org/download/alice_au_pays_des_merveilles_1811_librivox/aliceaupays_01_carroll_128kb.mp3
!wget -P ./test-audio/ https://www.archive.org/download/alice_in_wonderland_librivox/wonderland_ch_01.mp3
# Convert it to a format that's understandable by whisper.cpp:
# -t 30                 Take the first 30s
# -i ...                Input path
# -ar 16000             Sample rate of 16000 HZ
# -ac 1                 1 audio channel
# -codec:a pcm_s16le    Audio codec
!ffmpeg -t 30 -i ./test-audio/aliceaupays_01_carroll_128kb.mp3 -ar 16000 -ac 1 -codec:a pcm_s16le ./test-audio/recording-fr.wav
!ffmpeg -t 30 -i ./test-audio/wonderland_ch_01.mp3 -ar 16000 -ac 1 -codec:a pcm_s16le ./test-audio/recording-en.wav

mkdir: cannot create directory ‘./test-audio’: File exists
--2025-02-26 16:43:23--  https://www.archive.org/download/alice_au_pays_des_merveilles_1811_librivox/aliceaupays_01_carroll_128kb.mp3
Resolving www.archive.org (www.archive.org)... 207.241.224.2
Connecting to www.archive.org (www.archive.org)|207.241.224.2|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://archive.org/download/alice_au_pays_des_merveilles_1811_librivox/aliceaupays_01_carroll_128kb.mp3 [following]
--2025-02-26 16:43:24--  https://archive.org/download/alice_au_pays_des_merveilles_1811_librivox/aliceaupays_01_carroll_128kb.mp3
Resolving archive.org (archive.org)... 207.241.224.2
Connecting to archive.org (archive.org)|207.241.224.2|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://ia803201.us.archive.org/25/items/alice_au_pays_des_merveilles_1811_librivox/aliceaupays_01_carroll_128kb.mp3 [following]
--2025-02-26 16:43:24--  https:

Next, use the `whisper-cli` command to transcribe the audio using our GGML model:

In [22]:
# Test converting the WAV file to text using the GGML file that we built
# Both runs use the unquantized model (ggml-clean.bin): once on the French
# recording and once on the English one.
!./whisper.cpp/build/bin/whisper-cli --language fr --no-timestamps -m ./ggml/ggml-clean.bin ./test-audio/recording-fr.wav
!./whisper.cpp/build/bin/whisper-cli --language en --no-timestamps -m ./ggml/ggml-clean.bin ./test-audio/recording-en.wav

whisper_init_from_file_with_params_no_state: loading model from './ggml/ggml-clean.bin'
whisper_init_with_params_no_state: use gpu    = 1
whisper_init_with_params_no_state: flash attn = 0
whisper_init_with_params_no_state: gpu_device = 0
whisper_init_with_params_no_state: dtw        = 0
whisper_init_with_params_no_state: devices    = 1
whisper_init_with_params_no_state: backends   = 1
whisper_model_load: loading model
whisper_model_load: n_vocab       = 51865
whisper_model_load: n_audio_ctx   = 1500
whisper_model_load: n_audio_state = 384
whisper_model_load: n_audio_head  = 6
whisper_model_load: n_audio_layer = 4
whisper_model_load: n_text_ctx    = 448
whisper_model_load: n_text_state  = 384
whisper_model_load: n_text_head   = 6
whisper_model_load: n_text_layer  = 4
whisper_model_load: n_mels        = 80
whisper_model_load: ftype         = 1
whisper_model_load: qntvr         = 0
whisper_model_load: type          = 1 (tiny)
whisper_model_load: adding 1608 extra tokens
whisper_model_load

In [23]:
# Compare with the upstream model
!mkdir ./ggml-upstream/
!sh ./whisper.cpp/models/download-ggml-model.sh tiny ./ggml-upstream/
!./whisper.cpp/build/bin/whisper-cli --language fr --no-timestamps -m ./ggml-upstream/ggml-tiny.bin ./test-audio/recording-fr.wav

mkdir: cannot create directory ‘./ggml-upstream/’: File exists
Downloading ggml model tiny from 'https://huggingface.co/ggerganov/whisper.cpp' ...
Model tiny already exists. Skipping download.
whisper_init_from_file_with_params_no_state: loading model from './ggml-upstream/ggml-tiny.bin'
whisper_init_with_params_no_state: use gpu    = 1
whisper_init_with_params_no_state: flash attn = 0
whisper_init_with_params_no_state: gpu_device = 0
whisper_init_with_params_no_state: dtw        = 0
whisper_init_with_params_no_state: devices    = 1
whisper_init_with_params_no_state: backends   = 1
whisper_model_load: loading model
whisper_model_load: n_vocab       = 51865
whisper_model_load: n_audio_ctx   = 1500
whisper_model_load: n_audio_state = 384
whisper_model_load: n_audio_head  = 6
whisper_model_load: n_audio_layer = 4
whisper_model_load: n_text_ctx    = 448
whisper_model_load: n_text_state  = 384
whisper_model_load: n_text_head   = 6
whisper_model_load: n_text_layer  = 4
whisper_model_load: n_