In [2]:
# Fetch the whisper.cpp sources into ./whisper.cpp (shell escape; needs git and network access).
!git clone https://github.com/ggerganov/whisper.cpp.git

Cloning into 'whisper.cpp'...
remote: Enumerating objects: 4782, done.
remote: Total 4782 (delta 0), reused 0 (delta 0), pack-reused 4782
Receiving objects: 100% (4782/4782), 8.04 MiB | 18.76 MiB/s, done.
Resolving deltas: 100% (3052/3052), done.


In [3]:
# Fetch the OpenAI whisper sources into ./whisper; the conversion cell below reads
# the mel filters and tokenizer files from this checkout's whisper/assets directory.
!git clone https://github.com/openai/whisper.git

Cloning into 'whisper'...
remote: Enumerating objects: 619, done.
remote: Counting objects: 100% (338/338), done.
remote: Compressing objects: 100% (82/82), done.
remote: Total 619 (delta 292), reused 269 (delta 253), pack-reused 281
Receiving objects: 100% (619/619), 8.16 MiB | 23.74 MiB/s, done.
Resolving deltas: 100% (369/369), done.


In [None]:
!wget https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt
!wget https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt

In [18]:
import io
import sys
import json
import struct
import base64
import torch
import numpy as np
from pathlib import Path

def bytes_to_unicode():
    """Build a reversible mapping from every byte value (0-255) to a single character.

    Bytes inside the three ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the
    character with the same code point. Every other byte is assigned, in
    ascending byte order, the next unused code point starting at 256, so all
    256 mapped characters are distinct.

    Returns:
        dict[int, str]: byte value -> character. Keys are inserted with the
        identity-mapped bytes first, followed by the shifted ones.
    """
    identity_bytes = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    identity_lookup = set(identity_bytes)

    # Bytes outside the identity ranges, in ascending order.
    shifted_bytes = [value for value in range(2**8) if value not in identity_lookup]

    mapping = {value: chr(value) for value in identity_bytes}
    for offset, value in enumerate(shifted_bytes):
        mapping[value] = chr(2**8 + offset)
    return mapping

# Inputs and output location (absolute Colab paths; adjust when running elsewhere).
fname_inp = "/content/tiny.en.pt"
dir_whisper = "/content/whisper"
dir_out = "/content/"

# try to load PyTorch binary data
# NOTE(review): torch.load unpickles the checkpoint, which can execute arbitrary
# code. Only run this on checkpoints obtained from a trusted source.
try:
    # Read through a context manager so the file handle is closed deterministically.
    with open(fname_inp, "rb") as f:
        model_bytes = f.read()
    with io.BytesIO(model_bytes) as fp:
        checkpoint = torch.load(fp, map_location="cpu")
except Exception as exc:
    print("Error: failed to load PyTorch model file:", fname_inp)
    # Surface the underlying cause instead of silently discarding it.
    print("Reason:", repr(exc))
    sys.exit(1)

hparams = checkpoint["dims"]
print("hparams:", hparams)

# Not used in this cell; kept at module level in case later cells rely on it.
list_vars = checkpoint["model_state_dict"]

# load mel filters
assets_dir = Path(dir_whisper) / "whisper" / "assets"
n_mels = hparams["n_mels"]
with np.load(assets_dir / "mel_filters.npz") as f:
    filters = torch.from_numpy(f[f"mel_{n_mels}"])

# load tokenizer
# A vocabulary size of 51865 selects the multilingual tokenizer files;
# any other size falls back to the gpt2 ones.
multilingual = hparams["n_vocab"] == 51865
tokenizer = assets_dir / ("multilingual.tiktoken" if multilingual else "gpt2.tiktoken")
tokenizer_type = "tiktoken"
if not tokenizer.is_file():
    # Fall back to the HF-transformers style layout (<name>/vocab.json).
    tokenizer = assets_dir / ("multilingual" if multilingual else "gpt2") / "vocab.json"
    tokenizer_type = "hf_transformers"
    if not tokenizer.is_file():
        print("Error: failed to find either tiktoken or hf_transformers tokenizer file:", tokenizer)
        sys.exit(1)

byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

if tokenizer_type == "tiktoken":
    # Each non-empty line is "<base64 token> <rank>".
    with open(tokenizer, "rb") as f:
        contents = f.read()
    tokens = {
        base64.b64decode(token): int(rank)
        for token, rank in (line.split() for line in contents.splitlines() if line)
    }
elif tokenizer_type == "hf_transformers":
    # vocab.json maps a byte-to-unicode encoded token string to its index;
    # decode each string back to raw bytes with byte_decoder.
    with open(tokenizer, "r", encoding="utf8") as f:
        _tokens_raw = json.load(f)
    if '<|endoftext|>' in _tokens_raw:
        del _tokens_raw['<|endoftext|>']
    tokens = {bytes([byte_decoder[c] for c in token]): int(idx) for token, idx in _tokens_raw.items()}

# write the combined mel-filter + vocabulary file into dir_out
fname_out = Path(dir_out) / "tflt-vocab-mel.bin"

# The context manager guarantees the file is flushed and closed even if a write fails.
with fname_out.open("wb") as fout:
    # magic number
    fout.write(struct.pack("i", 0x74666C74))

    # write mel filters: two dimensions, then the values as native float32 in
    # row-major order. A single contiguous write yields the same bytes as packing
    # each element with struct.pack("f", ...), without the per-element loop.
    fout.write(struct.pack("i", filters.shape[0]))
    fout.write(struct.pack("i", filters.shape[1]))
    fout.write(np.ascontiguousarray(filters.numpy(), dtype=np.float32).tobytes())

    # write tokenizer: token count, then (length, raw bytes) per token.
    # The format stores no explicit id, so a token's position is the only id a
    # reader can recover; sort by rank instead of relying on dict insertion order.
    fout.write(struct.pack("i", len(tokens)))
    for key, _rank in sorted(tokens.items(), key=lambda item: item[1]):
        fout.write(struct.pack("i", len(key)))
        fout.write(key)

print("Done. Output file: " , fname_out)
print("")


hparams: {'n_mels': 80, 'n_vocab': 51864, 'n_audio_ctx': 1500, 'n_audio_state': 384, 'n_audio_head': 6, 'n_audio_layer': 4, 'n_text_ctx': 448, 'n_text_state': 384, 'n_text_head': 6, 'n_text_layer': 4}
Done. Output file:  /content/tflt-vocab-mel.bin



In [19]:
# List the working directory to confirm that tflt-vocab-mel.bin was written.
%ls -la

total 148224
drwxr-xr-x  1 root root     4096 Oct 25 03:54 ./
drwxr-xr-x  1 root root     4096 Oct 25 03:12 ../
drwxr-xr-x  4 root root     4096 Oct 23 13:26 .config/
drwxr-xr-x  2 root root     4096 Oct 25 03:22 .ipynb_checkpoints/
drwxr-xr-x  1 root root     4096 Oct 23 13:26 sample_data/
-rw-r--r--  1 root root        7 Oct 25 03:24 test.py
-rw-r--r--  1 root root   586174 Oct 25 04:03 tflt-vocab-mel.bin
-rw-r--r--  1 root root 75571315 Sep 20  2022 tiny.en.pt
-rw-r--r--  1 root root 75572083 Sep 20  2022 tiny.pt
drwxr-xr-x  8 root root     4096 Oct 25 03:42 whisper/
drwxr-xr-x 14 root root     4096 Oct 25 03:22 whisper.cpp/
