In [2]:
import torch
from transformers import GenerationConfig, WhisperConfig, WhisperFeatureExtractor, WhisperForConditionalGeneration, WhisperProcessor, WhisperTokenizerFast

In [3]:
from streaming import LocalDataset

dataset = LocalDataset('mosaic-combine-stt')

In [4]:
dataset[0]

{'audio_filename': 'part1-mp3/006711469.mp3',
 'new_text': '<|startoftranscript|><|en|><|transcribe|> Our athletes have been training hard for the next Olympics.<|endoftext|>'}

In [5]:
!ls distil-large-v3

added_tokens.json	  generation_config.json    tokenizer.json
checkpoint-38200-epoch-0  merges.txt		    tokenizer_config.json
checkpoint-38300-epoch-0  normalizer.json	    vocab.json
checkpoint-38400-epoch-0  preprocessor_config.json
config.json		  special_tokens_map.json


In [6]:
latest = './distil-large-v3/checkpoint-38400-epoch-0/pytorch_model.bin'

In [7]:
config = WhisperConfig.from_pretrained('distil-large-v3')
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    'distil-large-v3'
)
tokenizer = WhisperTokenizerFast.from_pretrained(
    'distil-large-v3'
)
processor = WhisperProcessor.from_pretrained(
    'distil-large-v3'
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
model = WhisperForConditionalGeneration(config = config)
model.generation_config = GenerationConfig.from_pretrained('openai/whisper-large-v3')

In [9]:
checkpoint = torch.load(latest)

In [10]:
model.load_state_dict(checkpoint)

<All keys matched successfully>

In [11]:
import soundfile as sf
from glob import glob

y, sr = sf.read(dataset[0]['audio_filename'])
y2, sr = sf.read(dataset[1]['audio_filename'])

In [12]:
p = processor([y, y2], return_tensors='pt')

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [13]:
r = model.generate(p['input_features'],
            output_scores=True,
            return_dict_in_generate=True,
            language='en',
            return_timestamps=True)

In [16]:
processor.tokenizer.decode(r.sequences[0])

'<|startoftranscript|><|en|><|transcribe|> have been training hard for the past our Olympics.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'

In [17]:
processor.tokenizer.decode(r.sequences[1])

"<|startoftranscript|><|en|><|transcribe|> so and the social social one you don't have to, no no but when you don't have to, the kind of situation that you don't have to, and people, you don't have to, and people are better, then you don't have to, then people are bad, then you don't know, then they do not talk, and they do not talk<|endoftext|>"

In [20]:
import IPython.display as ipd
ipd.Audio(dataset[1]['audio_filename'])

In [19]:
model = model.type(torch.bfloat16)

In [22]:
model.push_to_hub('malaysian-distil-whisper-large-v3', organization='mesolitica', safe_serialization=True)



model.safetensors:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-distil-whisper-large-v3/commit/3e358d827c62f074710fd1139ccf4734c0194c33', commit_message='Upload WhisperForConditionalGeneration', commit_description='', oid='3e358d827c62f074710fd1139ccf4734c0194c33', pr_url=None, pr_revision=None, pr_num=None)

In [23]:
feature_extractor.push_to_hub('malaysian-distil-whisper-large-v3', organization='mesolitica', safe_serialization=True)



CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-distil-whisper-large-v3/commit/2e06af0f014cf69a894d25f9142282ee50dd8d1a', commit_message='Upload feature extractor', commit_description='', oid='2e06af0f014cf69a894d25f9142282ee50dd8d1a', pr_url=None, pr_revision=None, pr_num=None)

In [24]:
tokenizer.push_to_hub('malaysian-distil-whisper-large-v3', organization='mesolitica', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-distil-whisper-large-v3/commit/4400f65d326366a8a4b095b8245e7b25597d52ef', commit_message='Upload tokenizer', commit_description='', oid='4400f65d326366a8a4b095b8245e7b25597d52ef', pr_url=None, pr_revision=None, pr_num=None)

In [25]:
processor.push_to_hub('malaysian-distil-whisper-large-v3', organization='mesolitica', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-distil-whisper-large-v3/commit/74543bbf2b64e45fd6681df86c003f19d4c3fb83', commit_message='Upload processor', commit_description='', oid='74543bbf2b64e45fd6681df86c003f19d4c3fb83', pr_url=None, pr_revision=None, pr_num=None)