In [2]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
from transformers import (
    AutoConfig,
    AutoFeatureExtractor,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoTokenizer,
    HfArgumentParser,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)

In [4]:
model_name = 'openai/whisper-small'

In [5]:
from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything

class Model(LightningModule):
    def __init__(
        self,
        model_name_or_path: str,
        learning_rate: float = 2e-5,
        adam_epsilon: float = 1e-8,
        warmup_steps: int = 1000,
        weight_decay: float = 0.0,
        eval_splits=None,
        **kwargs,
    ):
        super().__init__()

        self.save_hyperparameters()
        self.config = AutoConfig.from_pretrained(model_name_or_path)
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name_or_path, config=self.config)

    def forward(self, **inputs):
        return self.model(**inputs)

In [6]:
!ls openai-whisper-small-16-v2

'model-epoch=01-step=275000.ckpt'  'model-epoch=01-step=290000.ckpt'
'model-epoch=01-step=280000.ckpt'  'model-epoch=01-step=295000.ckpt'
'model-epoch=01-step=285000.ckpt'


In [7]:
model = Model.load_from_checkpoint("openai-whisper-small-16-v2/model-epoch=01-step=295000.ckpt")

In [9]:
model.model.save_pretrained('./ms-small-v2')

In [33]:
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [34]:
feature_extractor.save_pretrained('./ms-small-v2')
tokenizer.save_pretrained('./ms-small-v2')

('./ms-small-v2/tokenizer_config.json',
 './ms-small-v2/special_tokens_map.json',
 './ms-small-v2/vocab.json',
 './ms-small-v2/merges.txt',
 './ms-small-v2/normalizer.json',
 './ms-small-v2/added_tokens.json')

In [35]:
processor = AutoProcessor.from_pretrained('./ms-small-v2')

In [36]:
processor.push_to_hub('finetune-whisper-small-ms-singlish-v2', organization='mesolitica')

CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-whisper-small-ms-singlish-v2/commit/0a9bc6b9c47e901bd87ef0e3703e47ce8a404754', commit_message='Upload processor', commit_description='', oid='0a9bc6b9c47e901bd87ef0e3703e47ce8a404754', pr_url=None, pr_revision=None, pr_num=None)

In [37]:
model.model.push_to_hub('finetune-whisper-small-ms-singlish-v2', organization='mesolitica')

CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-whisper-small-ms-singlish-v2/commit/a7f4f72a48788dad4e5cfff6f5beafcdb64d2bbf', commit_message='Upload WhisperForConditionalGeneration', commit_description='', oid='a7f4f72a48788dad4e5cfff6f5beafcdb64d2bbf', pr_url=None, pr_revision=None, pr_num=None)

In [38]:
feature_extractor.push_to_hub('finetune-whisper-small-ms-singlish-v2', organization='mesolitica')

CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-whisper-small-ms-singlish-v2/commit/1e71836baa93de5afe684f2a9461141147a87024', commit_message='Upload feature extractor', commit_description='', oid='1e71836baa93de5afe684f2a9461141147a87024', pr_url=None, pr_revision=None, pr_num=None)

In [39]:
tokenizer.push_to_hub('finetune-whisper-small-ms-singlish-v2', organization='mesolitica')

CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-whisper-small-ms-singlish-v2/commit/1a1cf2ab4db181001ae659659032cd8f08158509', commit_message='Upload tokenizer', commit_description='', oid='1a1cf2ab4db181001ae659659032cd8f08158509', pr_url=None, pr_revision=None, pr_num=None)

In [10]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, WhisperProcessor

In [11]:
new_model = AutoModelForSpeechSeq2Seq.from_pretrained('ms-small-v2')

In [12]:
processor = AutoProcessor.from_pretrained('ms-tiny')

In [13]:
import malaya_speech
import json

with open('/home/husein/ssd1/speech-bahasa/malay-asr-test.json') as fopen:
    test_set = json.load(fopen)

In [14]:
ys = [malaya_speech.load(f)[0] for f in test_set['X'][:3] + ['singlish0.wav', 'husein-zolkepli.wav']]

In [15]:
input_features = processor(ys, return_tensors="pt").input_features
input_features.shape

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


torch.Size([5, 80, 3000])

In [16]:
o = new_model.generate(input_features, max_length = 256)
processor.tokenizer.batch_decode(o, skip_special_tokens = True)

['ms ialah penyakit yang mempengaruhi sistem saraf pusat yang dibentuk daripada otak saraf tunjang dan saraf optik',
 'm s ialah penyakit yang mempengaruhi sistem saraf pusat yang dibentuk daripada otak saraf tunjang dan saraf o t',
 'jimnastik a s dan joas mempunyai matlamat yang sama menjadikan sukan jim panas dan lain lain selamat bagi para atlet untuk mengejar impian mereka dalam persekitaran yang selamat positif dan berdaya maju',
 'and then see how they grow it in film okay actually',
 'testing nama saya hussein bin zulkifli']

In [17]:
test_set['Y'][:3]

['ms ialah penyakit yang mempengaruhi sistem saraf pusat yang dibentuk daripada otak saraf tunjang dan saraf optik',
 'ms ialah penyakit yang mempengaruhi sistem saraf pusat yang dibentuk daripada otak saraf tunjang dan saraf optik',
 'gimnastik as dan joas mempunyai matlamat yang sama menjadikan sukan gimnastik dan lain lain selamat bagi para atlet untuk mengejar impian mereka dalam persekitaran yang selamat positif dan berdaya maju']

In [18]:
from glob import glob

files = glob('speech/example-speaker/*.wav')
ys = [malaya_speech.load(f)[0] for f in files]
input_features = processor(ys, sampling_rate = 16000, return_tensors="pt").input_features
o = new_model.generate(input_features, max_length = 256)
processor.tokenizer.batch_decode(o, skip_special_tokens = True)

['nama syafiqa hidayu',
 'sebut perkataan uncle',
 'testing nama saya hussein bin zulkifli',
 'takkan orang yang seperti abang fakar itu mahu juga dia menjaganya baik baik orang yang tidak bertimbang rasa tu',
 'sebagai pembangkang yang matang dan sejahtera pas akan menghadapi pilihan raya umum dan tidak menumbang kerajaan dari pintu belakang',
 'pengaturcaraan adalah suatu kaedah memberi arahan atau perintah kepada komputer untuk menjalankan sesuatu tugas atau mana mana mesin elektronik',
 'tolong sebut antikata',
 'apa khabar semua saya doakan saudara dan saudari sihat walafiat hari ini saya sekali lagi menemui saudara']

In [19]:
def calculate_cer(actual, hyp):
    """
    Calculate CER using `python-Levenshtein`.
    """
    import Levenshtein as Lev

    actual = actual.replace(' ', '')
    hyp = hyp.replace(' ', '')
    return Lev.distance(actual, hyp) / len(actual)


def calculate_wer(actual, hyp):
    """
    Calculate WER using `python-Levenshtein`.
    """
    import Levenshtein as Lev

    b = set(actual.split() + hyp.split())
    word2char = dict(zip(b, range(len(b))))

    w1 = [chr(word2char[w]) for w in actual.split()]
    w2 = [chr(word2char[w]) for w in hyp.split()]

    return Lev.distance(''.join(w1), ''.join(w2)) / len(actual.split())

In [20]:
import whisper
import numpy as np

In [21]:
# %%time

# mels = []
# inputs = ys
# for k in range(len(inputs)):
#     audio = whisper.pad_or_trim(inputs[k].astype(np.float32).flatten())
#     mel = whisper.log_mel_spectrogram(audio)
#     mels.append({'input_features': mel})
    
# batch = processor.feature_extractor.pad(mels, return_tensors="pt")
# batch.input_features

In [22]:
_ = new_model.cuda()

In [29]:
from tqdm import tqdm

wer, cer = [], []

batch_size = 2
for i in tqdm(range(0, len(test_set['X']), batch_size)):
    batch_y = test_set['Y'][i: i + batch_size]
    ys = [malaya_speech.load(f)[0] for f in test_set['X'][i: i + batch_size]]
    mels = []
    for k in range(len(ys)):
        audio = whisper.pad_or_trim(ys[k].astype(np.float32))
        mel = whisper.log_mel_spectrogram(audio)
        mels.append({'input_features': mel})

    batch = processor.feature_extractor.pad(mels, return_tensors="pt")
    input_features = batch.input_features
    o = new_model.generate(input_features.cuda(), max_length = 256)
    pred = processor.tokenizer.batch_decode(o, skip_special_tokens = True)
    
    for k in range(len(pred)):
        wer.append(calculate_wer(batch_y[k], pred[k]))
        cer.append(calculate_cer(batch_y[k], pred[k]))

100%|█████████████████████████████████████████| 370/370 [01:29<00:00,  4.13it/s]


In [30]:
np.mean(wer), np.mean(cer)

(0.13715545051811856, 0.04763123857792153)

In [31]:
with open('postprocess-malaya-malay-test-set.json') as fopen:
    malaya_malay = json.load(fopen)

In [None]:
wer, cer = [], []

for i in tqdm(range(len(malaya_malay))):
    if not malaya_malay[i]['accept']:
        continue
    
    batch_y = [malaya_malay[i]['cleaned']]
    ys = [malaya_speech.load(f)[0] for f in [f'malay-test/{i}.wav']]
    mels = []
    for k in range(len(ys)):
        audio = whisper.pad_or_trim(ys[k].astype(np.float32))
        mel = whisper.log_mel_spectrogram(audio)
        mels.append({'input_features': mel})

    batch = processor.feature_extractor.pad(mels, return_tensors="pt")
    input_features = batch.input_features
    o = new_model.generate(input_features.cuda(), max_length = 256)
    pred = processor.tokenizer.batch_decode(o, skip_special_tokens = True)
    
    for k in range(len(pred)):
        wer.append(calculate_wer(batch_y[k], pred[k]))
        cer.append(calculate_cer(batch_y[k], pred[k]))

 29%|████████████                             | 224/765 [00:22<00:53, 10.07it/s]

In [None]:
np.mean(wer), np.mean(cer)

In [28]:
with open('singlish-test.json') as fopen:
    singlish = json.load(fopen)

In [29]:
wer, cer = [], []

for i in tqdm(range(len(singlish))):
    
    batch_y = [singlish[i]]
    ys = [malaya_speech.load(f)[0] for f in [f'singlish-test/{i}.wav']]
    mels = []
    for k in range(len(ys)):
        audio = whisper.pad_or_trim(ys[k].astype(np.float32))
        mel = whisper.log_mel_spectrogram(audio)
        mels.append({'input_features': mel})

    batch = processor.feature_extractor.pad(mels, return_tensors="pt")
    input_features = batch.input_features
    o = new_model.generate(input_features.cuda(), max_length = 256)
    pred = processor.tokenizer.batch_decode(o, skip_special_tokens = True)
    
    for k in range(len(pred)):
        wer.append(calculate_wer(batch_y[k], pred[k]))
        cer.append(calculate_cer(batch_y[k], pred[k]))

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 3579/3579 [07:17<00:00,  8.18it/s]


In [31]:
import numpy as np

In [30]:
np.mean(wer), np.mean(cer)

(0.09489335668166453, 0.05045327551710344)