In [1]:
import torch
from transformers import GenerationConfig, WhisperConfig, WhisperFeatureExtractor, WhisperForConditionalGeneration, WhisperProcessor, WhisperTokenizerFast

In [2]:
from transformers.trainer_utils import get_last_checkpoint

latest = get_last_checkpoint("small")
latest

'small/checkpoint-7000'

In [3]:
from streaming import LocalDataset

dataset = LocalDataset('mosaic-combine-stt')

In [4]:
config = WhisperConfig.from_pretrained('openai/whisper-small')
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    'openai/whisper-small'
)
tokenizer = WhisperTokenizerFast.from_pretrained(
    'openai/whisper-small'
)
processor = WhisperProcessor.from_pretrained(
    'openai/whisper-small'
)

In [5]:
model = WhisperForConditionalGeneration.from_pretrained(latest)

In [6]:
import soundfile as sf
from glob import glob

y, sr = sf.read(dataset[0]['audio_filename'])
y2, sr = sf.read(dataset[1]['audio_filename'])
y3, sr = sf.read(dataset[2]['audio_filename'])

In [11]:
dataset[0]

{'audio_filename': 'part3-separate-audio-mp3/conf_2523_2523_00862076-266.mp3',
 'new_text': "<|startoftranscript|><|en|><|transcribe|> he's wearing orange shorts right, okay, okay maybe maybe you tell me the or the wave on the top right<|endoftext|>"}

In [12]:
p = processor([y], return_tensors='pt')

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [13]:
r = model.generate(p['input_features'],
            output_scores=True,
            return_dict_in_generate=True,
            language='en',
            return_timestamps=True)
processor.tokenizer.decode(r.sequences[0])

"<|startoftranscript|><|en|><|transcribe|> he's wearing orange shorts right, okay, okay maybe maybe you tell me the, or the wave on the top right<|endoftext|>"

In [14]:
r = model.generate(p['input_features'],
            output_scores=True,
            return_dict_in_generate=True,
            language='ms',
            return_timestamps=True)
processor.tokenizer.decode(r.sequences[0])

'<|startoftranscript|><|ms|><|transcribe|> dia sangat berbual, baiklah mungkin mungkin anda beritahu saya atau jalan di atas kan<|endoftext|>'

In [15]:
model = model.type(torch.bfloat16)

In [16]:
model.push_to_hub('malaysian-whisper-small', organization='mesolitica', safe_serialization=True)



model.safetensors:   0%|          | 0.00/484M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-whisper-small/commit/20df138280825ec3a18e8a7ead70f6561ec1628c', commit_message='Upload WhisperForConditionalGeneration', commit_description='', oid='20df138280825ec3a18e8a7ead70f6561ec1628c', pr_url=None, pr_revision=None, pr_num=None)

In [17]:
feature_extractor.push_to_hub('malaysian-whisper-small', organization='mesolitica', safe_serialization=True)



CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-whisper-small/commit/1eff286c2669e84a4a0a4497f68756a58e52dc75', commit_message='Upload feature extractor', commit_description='', oid='1eff286c2669e84a4a0a4497f68756a58e52dc75', pr_url=None, pr_revision=None, pr_num=None)

In [18]:
tokenizer.push_to_hub('malaysian-whisper-small', organization='mesolitica', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-whisper-small/commit/582927b0c788abec54401128be64766ba8d3e259', commit_message='Upload tokenizer', commit_description='', oid='582927b0c788abec54401128be64766ba8d3e259', pr_url=None, pr_revision=None, pr_num=None)

In [19]:
processor.push_to_hub('malaysian-whisper-small', organization='mesolitica', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-whisper-small/commit/9a975b97801d946463d9062a4c25890440eef6b8', commit_message='Upload processor', commit_description='', oid='9a975b97801d946463d9062a4c25890440eef6b8', pr_url=None, pr_revision=None, pr_num=None)