In [1]:
import os

# Expose only the CUDA device with index 1 to this process.
# NOTE(review): presumably this has to run before any CUDA-using library
# initialises — confirm it stays the first cell.
os.environ.update({'CUDA_VISIBLE_DEVICES': '1'})

In [2]:
from transformers import (
    AutoConfig,
    AutoFeatureExtractor,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoTokenizer,
    HfArgumentParser,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)

In [3]:
# Hub id of the base checkpoint; used below to fetch the stock feature
# extractor and tokenizer.
model_name = "openai/whisper-base"

In [4]:
from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything

class Model(LightningModule):
    """
    LightningModule wrapper around a Hugging Face speech seq2seq model.

    Only construction and `forward` are defined here. In this notebook the
    class is used to restore a Lightning checkpoint so that the wrapped
    `self.model` can be exported with `save_pretrained` / `push_to_hub`.
    """
    def __init__(
        self,
        model_name_or_path: str,
        learning_rate: float = 2e-5,
        adam_epsilon: float = 1e-8,
        warmup_steps: int = 1000,
        weight_decay: float = 0.0,
        eval_splits=None,
        **kwargs,
    ):
        """
        Build the config and the model from `model_name_or_path`.

        Parameters
        ----------
        model_name_or_path : str
            Passed to `AutoConfig.from_pretrained` and
            `AutoModelForSpeechSeq2Seq.from_pretrained`.
        learning_rate, adam_epsilon, warmup_steps, weight_decay, eval_splits, **kwargs
            Not read anywhere in the visible class body; they are only
            available to `save_hyperparameters()`.
            NOTE(review): presumably kept so the signature matches the
            training-time module — confirm against the training script.
        """
        super().__init__()

        # NOTE(review): presumably records the constructor arguments so that
        # `load_from_checkpoint` can re-instantiate the module — confirm
        # against the Lightning documentation.
        self.save_hyperparameters()
        self.config = AutoConfig.from_pretrained(model_name_or_path)
        # NOTE(review): the attribute name `model` is presumably part of the
        # checkpoint's state-dict keys; renaming it would likely break loading.
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name_or_path, config=self.config)

    def forward(self, **inputs):
        """Call the wrapped model with all keyword inputs passed through unchanged."""
        return self.model(**inputs)



In [9]:
# IPython shell escape: list the contents of the checkpoint directory so the
# checkpoint file name to load can be picked from the output.
!ls openai-whisper-base-16-v2

'model-epoch=02-step=345000.ckpt'  'model-epoch=02-step=355000.ckpt'
'model-epoch=02-step=350000.ckpt'


In [11]:
# Restore the Lightning wrapper from the checkpoint with the highest step
# count among the files listed above.
model = Model.load_from_checkpoint(
    checkpoint_path="openai-whisper-base-16-v2/model-epoch=02-step=355000.ckpt",
)

In [12]:
# Export only the wrapped Hugging Face model (not the Lightning wrapper).
model.model.save_pretrained(save_directory='./ms-base-v2')

In [8]:
# Feature extractor and tokenizer are taken from the base checkpoint named in
# `model_name`, not from the fine-tuning checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

In [9]:
# Write both preprocessing artefacts next to the exported model weights; the
# tokenizer call stays last so its returned file list is the cell output.
feature_extractor.save_pretrained(save_directory='./ms-base-v2')
tokenizer.save_pretrained(save_directory='./ms-base-v2')

('./ms-base-v2/tokenizer_config.json',
 './ms-base-v2/special_tokens_map.json',
 './ms-base-v2/vocab.json',
 './ms-base-v2/merges.txt',
 './ms-base-v2/normalizer.json',
 './ms-base-v2/added_tokens.json')

In [10]:
# Reload the feature extractor + tokenizer pair from the export directory as
# a single processor object.
processor = AutoProcessor.from_pretrained(
    pretrained_model_name_or_path='./ms-base-v2',
)

In [44]:
# Upload the processor files. The namespace is given as part of the repo id
# because the separate `organization=` argument is deprecated in Transformers.
processor.push_to_hub('mesolitica/finetune-whisper-base-ms-singlish-v2')



CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-whisper-base-ms-singlish-v2/commit/746c0bb942b30a23ccb5ebb4d663b8203cbc7aa4', commit_message='Upload processor', commit_description='', oid='746c0bb942b30a23ccb5ebb4d663b8203cbc7aa4', pr_url=None, pr_revision=None, pr_num=None)

In [45]:
# Upload the wrapped Hugging Face model weights and config. The namespace is
# given as part of the repo id because the separate `organization=` argument
# is deprecated in Transformers.
model.model.push_to_hub('mesolitica/finetune-whisper-base-ms-singlish-v2')

CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-whisper-base-ms-singlish-v2/commit/ca430c0f0429e8cbc8a4f5ac89e6b91a726a09d2', commit_message='Upload WhisperForConditionalGeneration', commit_description='', oid='ca430c0f0429e8cbc8a4f5ac89e6b91a726a09d2', pr_url=None, pr_revision=None, pr_num=None)

In [46]:
# Upload the feature extractor config. The namespace is given as part of the
# repo id because the separate `organization=` argument is deprecated in
# Transformers.
feature_extractor.push_to_hub('mesolitica/finetune-whisper-base-ms-singlish-v2')

CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-whisper-base-ms-singlish-v2/commit/4b42864d5296e10a5045e701e05df9c20e841272', commit_message='Upload feature extractor', commit_description='', oid='4b42864d5296e10a5045e701e05df9c20e841272', pr_url=None, pr_revision=None, pr_num=None)

In [47]:
# Upload the tokenizer files. The namespace is given as part of the repo id
# because the separate `organization=` argument is deprecated in Transformers.
tokenizer.push_to_hub('mesolitica/finetune-whisper-base-ms-singlish-v2')

CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-whisper-base-ms-singlish-v2/commit/305c61c30c713677c060c3cafef0ad65219a6372', commit_message='Upload tokenizer', commit_description='', oid='305c61c30c713677c060c3cafef0ad65219a6372', pr_url=None, pr_revision=None, pr_num=None)

In [13]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, WhisperProcessor

In [14]:
# Reload the exported weights through the plain Transformers API to check
# that the export is usable without the Lightning wrapper.
new_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    pretrained_model_name_or_path='ms-base-v2',
)

In [15]:
# Load the processor from the same export directory as the model under test.
# The previous 'ms-tiny' path is never created in this notebook, so the cell
# only worked when a directory from another run happened to exist.
processor = AutoProcessor.from_pretrained('./ms-base-v2')

In [16]:
import malaya_speech
import json

# Load the test manifest; later cells read its 'X' (audio paths) and 'Y'
# (reference transcripts) entries.
with open('/home/husein/ssd1/speech-bahasa/malay-asr-test.json') as manifest:
    test_set = json.loads(manifest.read())

In [17]:
# First three test-set recordings plus two local sample files; only the first
# element returned by `malaya_speech.load` is kept.
ys = [
    malaya_speech.load(path)[0]
    for path in [*test_set['X'][:3], 'singlish0.wav', 'husein-zolkepli.wav']
]

In [18]:
# Pass the sampling rate explicitly: without it the processor only warns and
# cannot detect a rate mismatch. 16000 matches the value already used for the
# example-speaker cell further down.
# NOTE(review): assumes `malaya_speech.load` returns 16 kHz audio — confirm.
input_features = processor(ys, sampling_rate=16000, return_tensors="pt").input_features
input_features.shape

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


torch.Size([5, 80, 3000])

In [19]:
# Generate token ids for the sample batch, then decode them to text with the
# special tokens removed.
o = new_model.generate(
    input_features,
    max_length=256,
)
processor.tokenizer.batch_decode(
    o,
    skip_special_tokens=True,
)

['ms ialah penyakit yang mempengaruhi sistem saraf pusat yang dibentuk daripada otak saraf tunjang dan saraf optik',
 'ms ialah penyakit yang mempengaruhi sistem saraf pusat yang dibantu daripada otak saraf puncang dan saraf optik',
 'jimnas tik as dan joash mempunyai matlamat yang sama menjadikan sukan jimnas tik dan lain lain selamat pagi para atlet untuk mengejar impian mereka dalam persekitaran yang selamat positif dan berdaya maju',
 'and then see how they roll it and film okay actually',
 'testing nama saya hussein bin also kipli']

In [20]:
# Reference transcripts for the three test-set recordings decoded above.
test_set['Y'][0:3]

['ms ialah penyakit yang mempengaruhi sistem saraf pusat yang dibentuk daripada otak saraf tunjang dan saraf optik',
 'ms ialah penyakit yang mempengaruhi sistem saraf pusat yang dibentuk daripada otak saraf tunjang dan saraf optik',
 'gimnastik as dan joas mempunyai matlamat yang sama menjadikan sukan gimnastik dan lain lain selamat bagi para atlet untuk mengejar impian mereka dalam persekitaran yang selamat positif dan berdaya maju']

In [21]:
from glob import glob

# `glob` returns matches in arbitrary, filesystem-dependent order; sorting
# keeps the order of the decoded transcripts stable between runs.
files = sorted(glob('speech/example-speaker/*.wav'))
ys = [malaya_speech.load(f)[0] for f in files]
input_features = processor(ys, sampling_rate=16000, return_tensors="pt").input_features
o = new_model.generate(input_features, max_length=256)
processor.tokenizer.batch_decode(o, skip_special_tokens=True)

['saya syafiqah hidayu',
 'sebut perkataan uncle',
 'testing nama saya hussein bin also kipli',
 'takkan orang yang seperti abang pakar itu mahu juga dia menjaganya baik baik orang yang tidak bertimbang rasa tu',
 'sebagai pembangkang yang matang dan sejahtera pas akan menghadapi pilihan raya umum dan tidak menumbang kerajaan dari pintu belakang',
 'pengadu caraan adalah suatu kaedah memberi arahan atau perintah kepada komputer untuk menjalankan sesuatu juga atau mana mana mesin elektronik',
 'tolong sebut antikata',
 'apa khabar semua saya doakan saudara dan saudari sihat walafiat hari ini saya sekali lagi menemui saudara']

In [22]:
def calculate_cer(actual, hyp):
    """
    Calculate the character error rate (CER) using `python-Levenshtein`.

    Spaces are removed from both strings before they are compared.

    Parameters
    ----------
    actual : str
        Reference transcript.
    hyp : str
        Hypothesis (predicted) transcript.

    Returns
    -------
    float
        Edit distance divided by the number of non-space reference
        characters. When the reference has no such characters the
        denominator is treated as 1, so the result is the number of
        hypothesis characters (0.0 when both are empty) instead of a
        ZeroDivisionError.
    """
    actual = actual.replace(' ', '')
    hyp = hyp.replace(' ', '')

    if not actual:
        # Against an empty reference every hypothesis character is an insertion.
        return float(len(hyp))

    # Imported here so the function stays self-contained; the empty-reference
    # path above does not need the library at all.
    import Levenshtein as Lev

    return Lev.distance(actual, hyp) / len(actual)


def calculate_wer(actual, hyp):
    """
    Calculate the word error rate (WER) using `python-Levenshtein`.

    Both strings are split on whitespace. Each distinct word is mapped to a
    single character so that the character-level edit distance between the
    two encoded strings equals the word-level edit distance.

    Parameters
    ----------
    actual : str
        Reference transcript.
    hyp : str
        Hypothesis (predicted) transcript.

    Returns
    -------
    float
        Word-level edit distance divided by the number of reference words.
        When the reference has no words the denominator is treated as 1, so
        the result is the number of hypothesis words (0.0 when both are
        empty) instead of a ZeroDivisionError.
    """
    ref_words = actual.split()
    hyp_words = hyp.split()

    if not ref_words:
        # Against an empty reference every hypothesis word is an insertion.
        return float(len(hyp_words))

    # Imported here so the function stays self-contained; the empty-reference
    # path above does not need the library at all.
    import Levenshtein as Lev

    # Assign code points in first-seen order; which character a word gets does
    # not affect the distance, only that distinct words get distinct characters.
    word2char = {}
    for word in ref_words + hyp_words:
        word2char.setdefault(word, chr(len(word2char)))

    encoded_ref = ''.join(word2char[w] for w in ref_words)
    encoded_hyp = ''.join(word2char[w] for w in hyp_words)

    return Lev.distance(encoded_ref, encoded_hyp) / len(ref_words)

In [23]:
import whisper
import numpy as np

In [24]:
# %%time

# mels = []
# inputs = ys
# for k in range(len(inputs)):
#     audio = whisper.pad_or_trim(inputs[k].astype(np.float32).flatten())
#     mel = whisper.log_mel_spectrogram(audio)
#     mels.append({'input_features': mel})
    
# batch = processor.feature_extractor.pad(mels, return_tensors="pt")
# batch.input_features

In [25]:
# Move the model to the GPU for the evaluation loops; the result is bound to
# `_` so the model repr is not printed as cell output.
_ = new_model.to('cuda')

In [26]:
from tqdm import tqdm

wer, cer = [], []

batch_size = 2
for i in tqdm(range(0, len(test_set['X']), batch_size)):
    batch_y = test_set['Y'][i: i + batch_size]
    ys = [malaya_speech.load(f)[0] for f in test_set['X'][i: i + batch_size]]

    # Features are built with the `whisper` package helpers (pad/trim, then
    # log-mel) rather than by calling the processor on raw audio.
    mels = [
        {'input_features': whisper.log_mel_spectrogram(whisper.pad_or_trim(y.astype(np.float32)))}
        for y in ys
    ]

    batch = processor.feature_extractor.pad(mels, return_tensors="pt")
    input_features = batch.input_features
    o = new_model.generate(input_features.cuda(), max_length=256)
    pred = processor.tokenizer.batch_decode(o, skip_special_tokens=True)

    # Score every hypothesis against the reference at the same batch position.
    for k, hypothesis in enumerate(pred):
        wer.append(calculate_wer(batch_y[k], hypothesis))
        cer.append(calculate_cer(batch_y[k], hypothesis))

100%|█████████████████████████████████████████| 370/370 [01:02<00:00,  5.95it/s]


In [27]:
# Mean per-utterance WER and CER over the test set evaluated above.
np.asarray(wer).mean(), np.asarray(cer).mean()

(0.1892628902256146, 0.07414237301735345)

In [28]:
# Load the post-processed Malay test set; the evaluation loop below reads the
# 'accept' and 'cleaned' fields of each entry.
with open('postprocess-malaya-malay-test-set.json') as manifest:
    malaya_malay = json.loads(manifest.read())

In [29]:
wer, cer = [], []

for i, entry in enumerate(tqdm(malaya_malay)):
    # Entries whose 'accept' flag is falsy are not scored.
    if not entry['accept']:
        continue

    batch_y = [entry['cleaned']]
    # The audio file is addressed by the entry's position in the list.
    ys = [malaya_speech.load(f'malay-test/{i}.wav')[0]]

    # Features are built with the `whisper` package helpers (pad/trim, then
    # log-mel) rather than by calling the processor on raw audio.
    mels = [
        {'input_features': whisper.log_mel_spectrogram(whisper.pad_or_trim(y.astype(np.float32)))}
        for y in ys
    ]

    batch = processor.feature_extractor.pad(mels, return_tensors="pt")
    input_features = batch.input_features
    o = new_model.generate(input_features.cuda(), max_length=256)
    pred = processor.tokenizer.batch_decode(o, skip_special_tokens=True)

    # Score every hypothesis against the reference at the same batch position.
    for k, hypothesis in enumerate(pred):
        wer.append(calculate_wer(batch_y[k], hypothesis))
        cer.append(calculate_cer(batch_y[k], hypothesis))

100%|█████████████████████████████████████████| 765/765 [00:33<00:00, 23.02it/s]


In [30]:
# Mean per-utterance WER and CER over the accepted entries evaluated above.
np.asarray(wer).mean(), np.asarray(cer).mean()

(0.1789545288557146, 0.06458568742050953)

In [31]:
# Load the Singlish test set; each element is used directly as the reference
# transcript in the evaluation loop below.
with open('singlish-test.json') as manifest:
    singlish = json.loads(manifest.read())

In [32]:
wer, cer = [], []

for i, transcript in enumerate(tqdm(singlish)):

    batch_y = [transcript]
    # The audio file is addressed by the transcript's position in the list.
    ys = [malaya_speech.load(f'singlish-test/{i}.wav')[0]]

    # Features are built with the `whisper` package helpers (pad/trim, then
    # log-mel) rather than by calling the processor on raw audio.
    mels = [
        {'input_features': whisper.log_mel_spectrogram(whisper.pad_or_trim(y.astype(np.float32)))}
        for y in ys
    ]

    batch = processor.feature_extractor.pad(mels, return_tensors="pt")
    input_features = batch.input_features
    o = new_model.generate(input_features.cuda(), max_length=256)
    pred = processor.tokenizer.batch_decode(o, skip_special_tokens=True)

    # Score every hypothesis against the reference at the same batch position.
    for k, hypothesis in enumerate(pred):
        wer.append(calculate_wer(batch_y[k], hypothesis))
        cer.append(calculate_cer(batch_y[k], hypothesis))

100%|███████████████████████████████████████| 3579/3579 [02:33<00:00, 23.30it/s]


In [33]:
# Mean per-utterance WER and CER over the Singlish test set evaluated above.
np.asarray(wer).mean(), np.asarray(cer).mean()

(0.11699458363762497, 0.06190362020536486)