In [2]:
import torch
from transformers import GenerationConfig, WhisperConfig, WhisperFeatureExtractor, WhisperForConditionalGeneration, WhisperProcessor, WhisperTokenizerFast

In [3]:
from transformers.trainer_utils import get_last_checkpoint

latest = get_last_checkpoint("tiny")
latest

'tiny/checkpoint-5400'

In [4]:
from streaming import LocalDataset

dataset = LocalDataset('mosaic-combine-stt')

In [5]:
config = WhisperConfig.from_pretrained('openai/whisper-tiny')
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    'openai/whisper-tiny'
)
tokenizer = WhisperTokenizerFast.from_pretrained(
    'openai/whisper-tiny'
)
processor = WhisperProcessor.from_pretrained(
    'openai/whisper-tiny'
)

In [6]:
model = WhisperForConditionalGeneration.from_pretrained(latest)

In [7]:
import soundfile as sf
from glob import glob

y, sr = sf.read(dataset[0]['audio_filename'])
y2, sr = sf.read(dataset[1]['audio_filename'])
y3, sr = sf.read(dataset[10]['audio_filename'])

In [8]:
dataset[10]['audio_filename']

'output-audio/3-1875-24.mp3'

In [9]:
p = processor([y3], return_tensors='pt')

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [10]:
r = model.generate(p['input_features'],
            output_scores=True,
            return_dict_in_generate=True,
            language='ms',
            return_timestamps=True)
processor.tokenizer.decode(r.sequences[0])

'<|startoftranscript|><|ms|><|transcribe|> Assembly on Aging, Divina Australia, Australia yang telah diadakan pada tahun 1982 dan berasaskan unjuran tersebut maka jabatan perangkaan Malaysia menganggarkan menjelang tahun 2005 sejumlah 15% penduduk kita adalah daripada kalangan warga emas. Untuk makluman Tuan Yang Pertua dan juga Alian Bohon, pembangunan sistem pendafiran warga emas ataupun kita sebutkan event adalah usaha kerajaan ke arah merealisasikan objektif yang telah digangkatkan<|endoftext|>'

In [None]:
import IPython.display as ipd
ipd.Audio(dataset[10]['audio_filename'])

In [None]:
r = model.generate(p['input_features'],
            output_scores=True,
            return_dict_in_generate=True,
            language='ms',
            return_timestamps=True)
processor.tokenizer.decode(r.sequences[0])

In [11]:
model = model.type(torch.bfloat16)

In [12]:
model.push_to_hub('malaysian-whisper-tiny', organization='mesolitica', safe_serialization=True)



model.safetensors:   0%|          | 0.00/75.5M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-whisper-tiny/commit/bb95df421d2111428ac98d66f73103567c5c842d', commit_message='Upload WhisperForConditionalGeneration', commit_description='', oid='bb95df421d2111428ac98d66f73103567c5c842d', pr_url=None, pr_revision=None, pr_num=None)

In [13]:
feature_extractor.push_to_hub('malaysian-whisper-tiny', organization='mesolitica', safe_serialization=True)



CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-whisper-tiny/commit/8768a3672a60eb9e9d22ea8a9c22204f7807b737', commit_message='Upload feature extractor', commit_description='', oid='8768a3672a60eb9e9d22ea8a9c22204f7807b737', pr_url=None, pr_revision=None, pr_num=None)

In [14]:
tokenizer.push_to_hub('malaysian-whisper-tiny', organization='mesolitica', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-whisper-tiny/commit/83ceea7d7c23293704fb9ec88bcdeec2bc88d8eb', commit_message='Upload tokenizer', commit_description='', oid='83ceea7d7c23293704fb9ec88bcdeec2bc88d8eb', pr_url=None, pr_revision=None, pr_num=None)

In [15]:
processor.push_to_hub('malaysian-whisper-tiny', organization='mesolitica', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-whisper-tiny/commit/05d5876a3e660566e553ef05de070b05e8b30d82', commit_message='Upload processor', commit_description='', oid='05d5876a3e660566e553ef05de070b05e8b30d82', pr_url=None, pr_revision=None, pr_num=None)