In [1]:
import torch
from transformers import GenerationConfig, WhisperConfig, WhisperFeatureExtractor, WhisperForConditionalGeneration, WhisperProcessor, WhisperTokenizerFast

In [2]:
from transformers.trainer_utils import get_last_checkpoint

latest = get_last_checkpoint("base")
latest

'base/checkpoint-8100'

In [3]:
from streaming import LocalDataset

dataset = LocalDataset('mosaic-combine-stt')

In [4]:
config = WhisperConfig.from_pretrained('openai/whisper-base')
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    'openai/whisper-base'
)
tokenizer = WhisperTokenizerFast.from_pretrained(
    'openai/whisper-base'
)
processor = WhisperProcessor.from_pretrained(
    'openai/whisper-base'
)

In [5]:
model = WhisperForConditionalGeneration.from_pretrained(latest)

In [17]:
import soundfile as sf
from glob import glob

y, sr = sf.read(dataset[0]['audio_filename'])
y2, sr = sf.read(dataset[1]['audio_filename'])
y3, sr = sf.read(dataset[10]['audio_filename'])

In [15]:
dataset[10]['audio_filename']

'output-audio/3-1875-24.mp3'

In [18]:
p = processor([y3], return_tensors='pt')

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [19]:
r = model.generate(p['input_features'],
            output_scores=True,
            return_dict_in_generate=True,
            language='ms',
            return_timestamps=True)
processor.tokenizer.decode(r.sequences[0])

'<|startoftranscript|><|ms|><|transcribe|> Assembly on Aging di Vienna, Australia yang telah diadakan pada tahun 1982 dan berasaskan unjuran tersebut, maka Jabatan Perangkaan Malaysia menganggarkan menjelang tahun 2035, sejumlah 15% penduduk kita adalah daripada kalangan warga emas. Untuk makluman Tuan Neri Petua dan juga Alian Bohon Mat, pembangunan sistem pendaftaran warga emas ataupun kita sebutkan event adalah usaha kerajaan kearah merealisasikan<|endoftext|>'

In [16]:
import IPython.display as ipd
ipd.Audio(dataset[10]['audio_filename'])

In [10]:
r = model.generate(p['input_features'],
            output_scores=True,
            return_dict_in_generate=True,
            language='ms',
            return_timestamps=True)
processor.tokenizer.decode(r.sequences[0])

'<|startoftranscript|><|ms|><|transcribe|> dia sangat berbual dengan benda yang berbual, baiklah, baiklah, mungkin mungkin anda beritahu saya, satu cara untuk top kan<|endoftext|>'

In [11]:
model = model.type(torch.bfloat16)

In [12]:
model.push_to_hub('malaysian-whisper-base', organization='mesolitica', safe_serialization=True)



model.safetensors:   0%|          | 0.00/145M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-whisper-base/commit/7ecbd1a5631d78e67f27f7fbba5ddfa82930aceb', commit_message='Upload WhisperForConditionalGeneration', commit_description='', oid='7ecbd1a5631d78e67f27f7fbba5ddfa82930aceb', pr_url=None, pr_revision=None, pr_num=None)

In [13]:
feature_extractor.push_to_hub('malaysian-whisper-base', organization='mesolitica', safe_serialization=True)



CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-whisper-base/commit/65335a7c7e375ea24fe9689aea317605abac97c8', commit_message='Upload feature extractor', commit_description='', oid='65335a7c7e375ea24fe9689aea317605abac97c8', pr_url=None, pr_revision=None, pr_num=None)

In [14]:
tokenizer.push_to_hub('malaysian-whisper-base', organization='mesolitica', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-whisper-base/commit/b4a23194412c17b82c3e4fdf1123879a0b65ddfb', commit_message='Upload tokenizer', commit_description='', oid='b4a23194412c17b82c3e4fdf1123879a0b65ddfb', pr_url=None, pr_revision=None, pr_num=None)

In [15]:
processor.push_to_hub('malaysian-whisper-base', organization='mesolitica', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-whisper-base/commit/acd4fcd7a44c07d426fc4b32b8214ded4a0c22b6', commit_message='Upload processor', commit_description='', oid='acd4fcd7a44c07d426fc4b32b8214ded4a0c22b6', pr_url=None, pr_revision=None, pr_num=None)