In [1]:
import os

# Expose only the GPU with index 1 to CUDA-based libraries used in this notebook.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
from transformers import (
    AutoConfig,
    AutoFeatureExtractor,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoTokenizer,
    HfArgumentParser,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)

In [3]:
# Hub identifier of the base checkpoint; used further down to load the
# feature extractor and tokenizer.
model_name = 'openai/whisper-tiny'

In [4]:
from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything

class Model(LightningModule):
    """LightningModule wrapper around a Hugging Face speech seq2seq model.

    The wrapped network is stored on ``self.model`` and its configuration on
    ``self.config``; both are loaded from ``model_name_or_path``.
    """

    def __init__(
        self,
        model_name_or_path: str,
        learning_rate: float = 2e-5,
        adam_epsilon: float = 1e-8,
        warmup_steps: int = 1000,
        weight_decay: float = 0.0,
        eval_splits=None,
        **kwargs,
    ):
        """Load the configuration and pretrained weights.

        Args:
            model_name_or_path: Identifier or path handed to
                ``AutoConfig.from_pretrained`` and
                ``AutoModelForSpeechSeq2Seq.from_pretrained``.
            learning_rate: Not read in the code shown here.
            adam_epsilon: Not read in the code shown here.
            warmup_steps: Not read in the code shown here.
            weight_decay: Not read in the code shown here.
            eval_splits: Not read in the code shown here.
            **kwargs: Accepted and otherwise unused in the code shown here.
        """
        super().__init__()

        # Presumably records the constructor arguments on the module so that a
        # checkpoint can rebuild it — confirm against the Lightning version in use.
        self.save_hyperparameters()
        self.config = AutoConfig.from_pretrained(model_name_or_path)
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name_or_path, config=self.config)

    def forward(self, **inputs):
        """Forward all keyword arguments to the wrapped model and return its output."""
        return self.model(**inputs)

In [5]:
# List the contents of the checkpoint directory.
!ls openai-whisper-tiny-16

'model-epoch=05-step=880500.ckpt'  'model-epoch=05-step=882000.ckpt'
'model-epoch=05-step=881000.ckpt'  'model-epoch=05-step=882500.ckpt'
'model-epoch=05-step=881500.ckpt'


In [6]:
# Rebuild the Lightning wrapper from the step-882500 checkpoint file.
model = Model.load_from_checkpoint("openai-whisper-tiny-16/model-epoch=05-step=882500.ckpt")

In [7]:
# Export the wrapped Hugging Face model (not the Lightning module) to ./ms-tiny.
model.model.save_pretrained('./ms-tiny')

In [None]:
# Load the feature extractor and tokenizer of the base checkpoint named by `model_name`.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Save both into ./ms-tiny, the same directory the model weights were exported to.
feature_extractor.save_pretrained('./ms-tiny')
tokenizer.save_pretrained('./ms-tiny')

In [None]:
# Load a processor from the exported ./ms-tiny directory.
processor = AutoProcessor.from_pretrained('./ms-tiny')

In [None]:
# Upload the processor to the `mesolitica` organisation on the Hub.
# The organisation is part of the repo id because the separate `organization=`
# keyword is deprecated in transformers.
processor.push_to_hub('mesolitica/finetune-whisper-tiny-ms-singlish')

In [None]:
# Upload the wrapped Hugging Face model to the `mesolitica` organisation on the Hub.
# The organisation is part of the repo id because the separate `organization=`
# keyword is deprecated in transformers.
model.model.push_to_hub('mesolitica/finetune-whisper-tiny-ms-singlish')

In [None]:
# Upload the feature extractor to the `mesolitica` organisation on the Hub.
# The organisation is part of the repo id because the separate `organization=`
# keyword is deprecated in transformers.
feature_extractor.push_to_hub('mesolitica/finetune-whisper-tiny-ms-singlish')

In [None]:
# Upload the tokenizer to the `mesolitica` organisation on the Hub.
# The organisation is part of the repo id because the separate `organization=`
# keyword is deprecated in transformers.
tokenizer.push_to_hub('mesolitica/finetune-whisper-tiny-ms-singlish')

In [8]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, WhisperProcessor

In [9]:
# Reload the exported weights from the ms-tiny directory through the plain transformers API.
new_model = AutoModelForSpeechSeq2Seq.from_pretrained('ms-tiny')

In [10]:
# Load the processor from the same ms-tiny directory.
processor = AutoProcessor.from_pretrained('ms-tiny')

In [11]:
import malaya_speech
import json

# Load the Malay ASR test manifest; later cells read its 'X' (audio file paths)
# and 'Y' (reference transcripts) entries.
# The encoding is given explicitly so decoding does not depend on the locale default.
with open('/home/husein/ssd1/speech-bahasa/malay-asr-test.json', encoding='utf-8') as fopen:
    test_set = json.load(fopen)

In [12]:
# Smoke-test inputs: the first three test-set files plus two extra local recordings.
sample_paths = test_set['X'][:3] + ['singlish0.wav', 'husein-zolkepli.wav']
# Keep only the first element returned by malaya_speech.load for each file.
ys = [malaya_speech.load(path)[0] for path in sample_paths]

In [13]:
# Pass the sampling rate explicitly so the feature extractor can check it against
# the rate it was configured for, instead of warning and trusting the input.
# NOTE(review): 16000 assumes malaya_speech.load returns 16 kHz audio — confirm.
input_features = processor(ys, sampling_rate=16000, return_tensors="pt").input_features
input_features.shape

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


torch.Size([5, 80, 3000])

In [14]:
# Generate token ids for the sample batch (max_length of 256) and decode them to
# text with special tokens stripped.
o = new_model.generate(input_features, max_length = 256)
processor.tokenizer.batch_decode(o, skip_special_tokens = True)

['ms ialah penyakit yang mempengaruhi sistem saraf pusat yang dibentuk daripada otak saraf tunjang dan saraf optik',
 'msial penyakit yang mempengaruhi sistem saraf pesat yang dibentuk daripada otak saraf punjang dan saraf otak',
 'genistik s dan joash mempunyai matlamat yang sama menjadikan sukan jimnas tik dan lain lain selamat bagi para atlet untuk mengejar impian mereka dalam persekitaran yang selamat positif dan berdaya maju',
 'and then see how they roll it in film okay actually',
 'testing nama saya hussein bin zulkifli']

In [15]:
# Reference transcripts for the first three test files, for comparison with the predictions above.
test_set['Y'][:3]

['ms ialah penyakit yang mempengaruhi sistem saraf pusat yang dibentuk daripada otak saraf tunjang dan saraf optik',
 'ms ialah penyakit yang mempengaruhi sistem saraf pusat yang dibentuk daripada otak saraf tunjang dan saraf optik',
 'gimnastik as dan joas mempunyai matlamat yang sama menjadikan sukan gimnastik dan lain lain selamat bagi para atlet untuk mengejar impian mereka dalam persekitaran yang selamat positif dan berdaya maju']

In [16]:
def _edit_distance(source, target):
    """
    Return the Levenshtein distance between two strings.

    Uses `python-Levenshtein` when it can be imported and otherwise falls back
    to a pure-Python dynamic-programming implementation, so the metrics below
    still work where the extension is not installed.
    """
    try:
        import Levenshtein as Lev
    except ImportError:
        Lev = None

    if Lev is not None:
        return Lev.distance(source, target)

    # Iterate the inner loop over the shorter string so the rolling row stays small.
    if len(source) < len(target):
        source, target = target, source

    previous_row = list(range(len(target) + 1))
    for row, source_item in enumerate(source, start=1):
        current_row = [row]
        for col, target_item in enumerate(target, start=1):
            current_row.append(min(
                previous_row[col] + 1,                                   # deletion
                current_row[col - 1] + 1,                                # insertion
                previous_row[col - 1] + (source_item != target_item),   # substitution
            ))
        previous_row = current_row
    return previous_row[-1]


def calculate_cer(actual, hyp):
    """
    Calculate CER using `python-Levenshtein` (pure-Python fallback if missing).

    Spaces are removed from both strings before comparing.

    Args:
        actual: Reference transcript.
        hyp: Hypothesis (predicted) transcript.

    Returns:
        Character edit distance divided by the number of reference characters.
        An empty reference is treated as having length 1 so the call returns
        the raw edit distance instead of raising ZeroDivisionError.
    """
    actual = actual.replace(' ', '')
    hyp = hyp.replace(' ', '')
    return _edit_distance(actual, hyp) / max(len(actual), 1)


def calculate_wer(actual, hyp):
    """
    Calculate WER using `python-Levenshtein` (pure-Python fallback if missing).

    Args:
        actual: Reference transcript; words are separated by whitespace.
        hyp: Hypothesis (predicted) transcript.

    Returns:
        Word edit distance divided by the number of reference words.
        An empty reference is treated as having length 1 so the call returns
        the raw edit distance instead of raising ZeroDivisionError.
    """
    actual_words = actual.split()
    hyp_words = hyp.split()

    # Map every distinct word to a single character so that a character-level
    # edit distance over the encoded strings equals the word-level distance.
    word2char = {}
    for word in actual_words + hyp_words:
        word2char.setdefault(word, chr(len(word2char)))

    w1 = ''.join(word2char[w] for w in actual_words)
    w2 = ''.join(word2char[w] for w in hyp_words)

    return _edit_distance(w1, w2) / max(len(actual_words), 1)

In [19]:
import whisper
import numpy as np

In [20]:
%%time

inputs = ys
mels = []
for waveform in inputs:
    # Flatten to a 1-D float32 signal, pad/trim it with whisper's helper,
    # then compute the log-mel spectrogram.
    padded = whisper.pad_or_trim(waveform.astype(np.float32).flatten())
    mels.append({'input_features': whisper.log_mel_spectrogram(padded)})

# Collate the per-sample features into one batch of PyTorch tensors.
batch = processor.feature_extractor.pad(mels, return_tensors="pt")
batch.input_features

CPU times: user 148 ms, sys: 33.1 ms, total: 182 ms
Wall time: 20.9 ms


tensor([[[-0.0516,  0.4085,  0.3821,  ..., -0.5135, -0.5135, -0.5135],
         [-0.1247,  0.4903,  0.4155,  ..., -0.5135, -0.5135, -0.5135],
         [-0.1859,  0.4564,  0.4616,  ..., -0.5135, -0.5135, -0.5135],
         ...,
         [-0.3687, -0.2347, -0.3009,  ..., -0.5135, -0.5135, -0.5135],
         [-0.3810, -0.2193, -0.2694,  ..., -0.5135, -0.5135, -0.5135],
         [-0.5135, -0.4485, -0.4505,  ..., -0.5135, -0.5135, -0.5135]],

        [[-0.1295,  0.4186,  0.3930,  ..., -0.6736, -0.6736, -0.6736],
         [-0.3456,  0.3302,  0.2207,  ..., -0.6736, -0.6736, -0.6736],
         [-0.1814,  0.1878,  0.0392,  ..., -0.6736, -0.6736, -0.6736],
         ...,
         [-0.3620, -0.1888, -0.4334,  ..., -0.6736, -0.6736, -0.6736],
         [-0.3190, -0.1748, -0.3538,  ..., -0.6736, -0.6736, -0.6736],
         [-0.4876, -0.3338, -0.4731,  ..., -0.6736, -0.6736, -0.6736]],

        [[ 0.7996,  1.2242,  1.1701,  ..., -0.6703, -0.6703, -0.6703],
         [ 0.7265,  1.0669,  0.9106,  ..., -0

In [21]:
# Move the model to the GPU; assigning to `_` keeps the module repr out of the cell output.
_ = new_model.cuda()

In [24]:
from tqdm import tqdm

wer, cer = [], []

batch_size = 2
for i in tqdm(range(0, len(test_set['X']), batch_size)):
    batch_end = i + batch_size
    batch_y = test_set['Y'][i: batch_end]
    ys = [malaya_speech.load(path)[0] for path in test_set['X'][i: batch_end]]

    # One log-mel feature dict per waveform, after whisper's pad/trim helper.
    mels = [
        {'input_features': whisper.log_mel_spectrogram(whisper.pad_or_trim(waveform.astype(np.float32)))}
        for waveform in ys
    ]

    batch = processor.feature_extractor.pad(mels, return_tensors="pt")
    input_features = batch.input_features
    o = new_model.generate(input_features.cuda(), max_length=256)
    pred = processor.tokenizer.batch_decode(o, skip_special_tokens=True)

    # Score every prediction against the reference at the same batch position.
    for k, hypothesis in enumerate(pred):
        wer.append(calculate_wer(batch_y[k], hypothesis))
        cer.append(calculate_cer(batch_y[k], hypothesis))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 370/370 [00:52<00:00,  6.99it/s]


In [25]:
# Mean of the per-utterance WER and CER collected by the loop above.
np.mean(wer), np.mean(cer)

(0.23568097573673807, 0.09868808772213333)

In [26]:
# Load the post-processed Malaya Malay test set; the evaluation loop below reads
# the 'accept' and 'cleaned' fields of each entry.
# The encoding is given explicitly so decoding does not depend on the locale default.
with open('postprocess-malaya-malay-test-set.json', encoding='utf-8') as fopen:
    malaya_malay = json.load(fopen)

In [30]:
wer, cer = [], []

for i in tqdm(range(len(malaya_malay))):
    entry = malaya_malay[i]
    # Entries not marked as accepted are left out of the evaluation.
    if not entry['accept']:
        continue

    batch_y = [entry['cleaned']]
    # The audio file for an entry is named after its index in the list.
    ys = [malaya_speech.load(f'malay-test/{i}.wav')[0]]

    # One log-mel feature dict per waveform, after whisper's pad/trim helper.
    mels = [
        {'input_features': whisper.log_mel_spectrogram(whisper.pad_or_trim(waveform.astype(np.float32)))}
        for waveform in ys
    ]

    batch = processor.feature_extractor.pad(mels, return_tensors="pt")
    input_features = batch.input_features
    o = new_model.generate(input_features.cuda(), max_length=256)
    pred = processor.tokenizer.batch_decode(o, skip_special_tokens=True)

    # Score every prediction against the reference at the same position.
    for k, hypothesis in enumerate(pred):
        wer.append(calculate_wer(batch_y[k], hypothesis))
        cer.append(calculate_cer(batch_y[k], hypothesis))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 765/765 [00:26<00:00, 28.53it/s]


In [31]:
# Mean of the per-utterance WER and CER over the accepted entries scored above.
np.mean(wer), np.mean(cer)

(0.20141585284865918, 0.0719649087284043)

In [32]:
# Load the Singlish test transcripts; the evaluation loop below indexes them by position.
# The encoding is given explicitly so decoding does not depend on the locale default.
with open('singlish-test.json', encoding='utf-8') as fopen:
    singlish = json.load(fopen)

In [33]:
wer, cer = [], []

for i in tqdm(range(len(singlish))):
    batch_y = [singlish[i]]
    # The audio file for a transcript is named after its index in the list.
    ys = [malaya_speech.load(f'singlish-test/{i}.wav')[0]]

    # One log-mel feature dict per waveform, after whisper's pad/trim helper.
    mels = [
        {'input_features': whisper.log_mel_spectrogram(whisper.pad_or_trim(waveform.astype(np.float32)))}
        for waveform in ys
    ]

    batch = processor.feature_extractor.pad(mels, return_tensors="pt")
    input_features = batch.input_features
    o = new_model.generate(input_features.cuda(), max_length=256)
    pred = processor.tokenizer.batch_decode(o, skip_special_tokens=True)

    # Score every prediction against the reference at the same position.
    for k, hypothesis in enumerate(pred):
        wer.append(calculate_wer(batch_y[k], hypothesis))
        cer.append(calculate_cer(batch_y[k], hypothesis))

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 3579/3579 [02:04<00:00, 28.85it/s]


In [34]:
# Mean of the per-utterance WER and CER collected by the Singlish loop above.
np.mean(wer), np.mean(cer)

(0.09045121293531239, 0.0481965006993222)