In [1]:
# Optional environment setup, kept commented out: CPU wheels for torch/torchaudio
# (see the extra index URL), PyTorch Lightning, and the audio I/O libraries.
# !~/huggingface/bin/pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
# !~/huggingface/bin/pip3 install pytorch-lightning
# !~/huggingface/bin/pip3 install soundfile librosa

In [2]:
import os

# Expose only CUDA device index 1 to this process. This runs before the
# transformers / Lightning imports below — presumably so the setting is in place
# before any CUDA initialisation happens (TODO confirm).
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
from transformers import (
    AutoConfig,
    AutoFeatureExtractor,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoTokenizer,
    HfArgumentParser,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)

In [4]:
# Hub id of the base Whisper checkpoint; the feature extractor and tokenizer
# are loaded from this id further down.
model_name = 'openai/whisper-tiny'

In [5]:
from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything

class Model(LightningModule):
    """Lightning wrapper around a Hugging Face speech seq2seq model.

    Only construction and ``forward`` are defined here; in this notebook the
    class is used to restore a training checkpoint with ``load_from_checkpoint``
    and to reach the wrapped model through ``self.model``.
    """

    def __init__(
        self,
        model_name_or_path: str,
        learning_rate: float = 2e-5,
        adam_epsilon: float = 1e-8,
        warmup_steps: int = 1000,
        weight_decay: float = 0.0,
        eval_splits=None,
        **kwargs,
    ):
        """Load the config and pretrained weights for ``model_name_or_path``.

        Args:
            model_name_or_path: Identifier or path handed to ``from_pretrained``.
            learning_rate: Not read in this body; only captured as a hyperparameter.
            adam_epsilon: Not read in this body; only captured as a hyperparameter.
            warmup_steps: Not read in this body; only captured as a hyperparameter.
            weight_decay: Not read in this body; only captured as a hyperparameter.
            eval_splits: Not read in this body; only captured as a hyperparameter.
            **kwargs: Accepted but not otherwise used in this body.
        """
        super().__init__()

        # Lightning helper that records the constructor arguments on the module;
        # it takes no explicit arguments here, so it must stay inside __init__.
        self.save_hyperparameters()
        self.config = AutoConfig.from_pretrained(model_name_or_path)
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name_or_path, config=self.config)

    def forward(self, **inputs):
        """Pass all keyword inputs straight to the wrapped model and return its output."""
        return self.model(**inputs)

In [6]:
# List the checkpoint files saved under openai-whisper-tiny-16-v2.
!ls openai-whisper-tiny-16-v2

'model-epoch=05-step=465000.ckpt'  'model-epoch=05-step=480000.ckpt'
'model-epoch=05-step=470000.ckpt'  'model-epoch=05-step=485000.ckpt'
'model-epoch=05-step=475000.ckpt'


In [7]:
# Rebuild the Lightning `Model` wrapper from the step=485000 checkpoint file.
model = Model.load_from_checkpoint("openai-whisper-tiny-16-v2/model-epoch=05-step=485000.ckpt")

In [8]:
# Export the wrapped Hugging Face model (not the Lightning wrapper) to ./ms-tiny-v2.
model.model.save_pretrained('./ms-tiny-v2')

In [None]:
# The feature extractor and tokenizer are taken from the base checkpoint named by
# `model_name`, not from the fine-tuned checkpoint loaded above.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Save both next to the exported model weights; a later cell loads an
# AutoProcessor from this same ./ms-tiny-v2 directory.
feature_extractor.save_pretrained('./ms-tiny-v2')
tokenizer.save_pretrained('./ms-tiny-v2')

In [None]:
# Load the combined processor back from the local export directory ./ms-tiny-v2.
processor = AutoProcessor.from_pretrained('./ms-tiny-v2')

In [None]:
# Upload the processor to the Hub. The organisation is given as the namespace of
# the repo id; the separate `organization` keyword is deprecated in transformers.
processor.push_to_hub('mesolitica/finetune-whisper-tiny-ms-singlish-v2')

In [9]:
# Upload the fine-tuned weights to the Hub. The organisation is given as the
# namespace of the repo id; the separate `organization` keyword is deprecated
# in transformers.
model.model.push_to_hub('mesolitica/finetune-whisper-tiny-ms-singlish-v2')



CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-whisper-tiny-ms-singlish-v2/commit/3917e8f4f704b751197569d942bfb18f8fe17d4f', commit_message='Upload WhisperForConditionalGeneration', commit_description='', oid='3917e8f4f704b751197569d942bfb18f8fe17d4f', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload the feature extractor to the Hub. The organisation is given as the
# namespace of the repo id; the separate `organization` keyword is deprecated
# in transformers.
feature_extractor.push_to_hub('mesolitica/finetune-whisper-tiny-ms-singlish-v2')

In [None]:
# Upload the tokenizer to the Hub. The organisation is given as the namespace of
# the repo id; the separate `organization` keyword is deprecated in transformers.
tokenizer.push_to_hub('mesolitica/finetune-whisper-tiny-ms-singlish-v2')

In [7]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, WhisperProcessor

In [8]:
# Reload the exported model from the local ms-tiny-v2 directory; the inference
# and scoring cells below all go through `new_model`.
new_model = AutoModelForSpeechSeq2Seq.from_pretrained('ms-tiny-v2')

In [9]:
# Load the processor from the same export directory as the model. This notebook
# saves the feature extractor and tokenizer into ms-tiny-v2; the previous
# 'ms-tiny' path is never created here and only worked from leftover disk state.
processor = AutoProcessor.from_pretrained('ms-tiny-v2')

In [10]:
import librosa
import json

# Load the Malay ASR test set. Cells below read test_set['X'] as audio file paths
# and test_set['Y'] as the matching reference transcripts.
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
with open('/home/husein/ssd1/speech-bahasa/malay-asr-test.json') as fopen:
    test_set = json.load(fopen)

In [11]:
# Smoke-test inputs: the first three test-set files plus two local recordings,
# each loaded as a mono 16 kHz waveform (only the samples are kept, not the rate).
sample_files = test_set['X'][:3] + ['singlish0.wav', 'husein-zolkepli.wav']
ys = [
    librosa.load(path, sr = 16000, mono = True)[0]
    for path in sample_files
]

In [12]:
# Pass the rate the audio was loaded at (16 kHz, see the librosa.load call) so the
# feature extractor can check it instead of warning about a missing sampling_rate.
input_features = processor(ys, sampling_rate = 16000, return_tensors="pt").input_features
input_features.shape

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


torch.Size([5, 80, 3000])

In [13]:
# Generate token ids for the whole batch, with max_length set to 256.
o = new_model.generate(input_features, max_length = 256)

In [14]:
# Convert the generated ids to text, dropping special tokens.
processor.tokenizer.batch_decode(o, skip_special_tokens = True)

['ms ialah penyakit yang mempengaruhi sistem saraf pusat yang dibentuk daripada otak saraf tunjang dan saraf optik',
 'ms ialah penyakit yang mempengaruhi sistem saraposan yang dibentuk daripada otak sarah puncang dan sarap optik',
 'jimnastik a s dan joas mempunyai matlamat yang sama menjadikan sukan jimnastik dan lain lain selamat pagi para atlet untuk mengejar impian mereka dalam persekitaran yang selamat positif dan berdaya maju',
 'and then see how they grow it in film okay actually',
 'testing nama saya hussein bin zulkifli']

In [15]:
# Reference transcripts for the first three test files, to compare by eye with
# the first three predictions above.
test_set['Y'][:3]

['ms ialah penyakit yang mempengaruhi sistem saraf pusat yang dibentuk daripada otak saraf tunjang dan saraf optik',
 'ms ialah penyakit yang mempengaruhi sistem saraf pusat yang dibentuk daripada otak saraf tunjang dan saraf optik',
 'gimnastik as dan joas mempunyai matlamat yang sama menjadikan sukan gimnastik dan lain lain selamat bagi para atlet untuk mengejar impian mereka dalam persekitaran yang selamat positif dan berdaya maju']

In [16]:
from glob import glob
import malaya_speech

# glob() returns paths in arbitrary order; sort them so the transcripts below
# line up with a stable, reproducible file order across runs and machines.
files = sorted(glob('speech/example-speaker/*.wav'))
ys = [malaya_speech.load(f)[0] for f in files]
input_features = processor(ys, sampling_rate = 16000, return_tensors="pt").input_features
o = new_model.generate(input_features, max_length = 256)
processor.tokenizer.batch_decode(o, skip_special_tokens = True)

['nama saya syafiqah aida you',
 'sebut perkataan uncle',
 'testing nama saya hussein bin zulkifli',
 'takkan orang yang seperti abang fakar itu mahu juga dia menjaganya baik baik orang yang tidak bertimbang rasa tu',
 'sebagai pembangkang yang matang dan sejahtera pas akan menghadapi pilihan raya umum dan tidak menumbang kerajaan dari pintu belakang',
 'pengaturcaraan adalah suatu kaedah memberi arahan atau perintah kepada komputer untuk menjalankan sesuatu juga atau mana mana mesin elektronik',
 'tolong sebut and theukata',
 'apa khabar semua saya doakan saudara dan saudari sihat walafiat hari ini saya sekali lagi menemui saudara']

In [17]:
def calculate_cer(actual, hyp):
    """
    Calculate CER using `python-Levenshtein`.

    Space characters (``' '`` only) are removed from both strings before the
    character-level edit distance is computed.

    Args:
        actual: Reference transcript.
        hyp: Hypothesis transcript.

    Returns:
        Edit distance divided by the number of non-space reference characters.
        If the reference has no such characters the ratio is undefined; in that
        case 0.0 is returned when the hypothesis is empty too, otherwise 1.0.
    """
    actual = actual.replace(' ', '')
    hyp = hyp.replace(' ', '')

    if not actual:
        # Avoid ZeroDivisionError on an empty reference: a matching empty
        # hypothesis is a perfect score, anything else counts as fully wrong.
        return 0.0 if not hyp else 1.0

    import Levenshtein as Lev

    return Lev.distance(actual, hyp) / len(actual)


def calculate_wer(actual, hyp):
    """
    Calculate WER using `python-Levenshtein`.

    Each distinct word is mapped to a single character so that the string edit
    distance over the encoded sequences equals the word-level edit distance.

    Args:
        actual: Reference transcript (words separated by whitespace).
        hyp: Hypothesis transcript (words separated by whitespace).

    Returns:
        Word-level edit distance divided by the number of reference words.
        If the reference has no words the ratio is undefined; in that case 0.0
        is returned when the hypothesis has no words either, otherwise 1.0.
    """
    ref_words = actual.split()
    hyp_words = hyp.split()

    if not ref_words:
        # Avoid ZeroDivisionError on an empty reference: a matching empty
        # hypothesis is a perfect score, anything else counts as fully wrong.
        return 0.0 if not hyp_words else 1.0

    import Levenshtein as Lev

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # word -> character mapping is the same on every run.
    vocabulary = dict.fromkeys(ref_words + hyp_words)
    word2char = {word: chr(index) for index, word in enumerate(vocabulary)}

    ref_encoded = ''.join(word2char[w] for w in ref_words)
    hyp_encoded = ''.join(word2char[w] for w in hyp_words)

    return Lev.distance(ref_encoded, hyp_encoded) / len(ref_words)

In [18]:
import whisper
import numpy as np

In [19]:
%%time

# Time the alternative front end: the `whisper` package pads/trims each waveform
# and computes its log-mel spectrogram; the HF feature extractor then only batches.
inputs = ys
mels = [
    {'input_features': whisper.log_mel_spectrogram(
        whisper.pad_or_trim(waveform.astype(np.float32).flatten())
    )}
    for waveform in inputs
]

batch = processor.feature_extractor.pad(mels, return_tensors="pt")
batch.input_features

CPU times: user 206 ms, sys: 0 ns, total: 206 ms
Wall time: 19.1 ms


tensor([[[-1.7919e-02,  3.7591e-01,  6.7061e-01,  ..., -4.9661e-01,
          -4.9661e-01, -4.9661e-01],
         [ 1.4865e-01,  5.2505e-01,  7.3631e-01,  ..., -4.9661e-01,
          -4.9661e-01, -4.9661e-01],
         [ 3.0252e-01,  5.7934e-01,  7.0872e-01,  ..., -4.9661e-01,
          -4.9661e-01, -4.9661e-01],
         ...,
         [-1.0308e-01, -3.9721e-02,  4.8087e-02,  ..., -4.9661e-01,
          -4.9661e-01, -4.9661e-01],
         [-1.2300e-01, -3.3460e-02,  4.4005e-02,  ..., -4.9661e-01,
          -4.9661e-01, -4.9661e-01],
         [-1.4860e-01, -2.4000e-02,  7.9799e-04,  ..., -4.9661e-01,
          -4.9661e-01, -4.9661e-01]],

        [[ 9.5621e-01,  9.2088e-01,  9.8969e-01,  ..., -4.8367e-01,
          -4.8367e-01, -4.8367e-01],
         [ 7.0786e-01,  9.7993e-01,  9.7543e-01,  ..., -4.8367e-01,
          -4.8367e-01, -4.8367e-01],
         [ 8.1467e-01,  8.3065e-01,  7.8870e-01,  ..., -4.8367e-01,
          -4.8367e-01, -4.8367e-01],
         ...,
         [-3.9725e-02, -6

In [20]:
# Move the model to the GPU; assigning the result to `_` keeps the returned
# module from being echoed as the cell output.
_ = new_model.cuda()

In [26]:
from tqdm import tqdm

# Score the main Malay test set in mini-batches, collecting one WER and one CER
# value per utterance.
wer, cer = [], []

batch_size = 2
for i in tqdm(range(0, len(test_set['X']), batch_size)):
    batch_y = test_set['Y'][i: i + batch_size]
    ys = [malaya_speech.load(f)[0] for f in test_set['X'][i: i + batch_size]]

    # Pad/trim every waveform and turn it into a log-mel spectrogram.
    mels = [
        {'input_features': whisper.log_mel_spectrogram(
            whisper.pad_or_trim(waveform.astype(np.float32))
        )}
        for waveform in ys
    ]

    batch = processor.feature_extractor.pad(mels, return_tensors="pt")
    input_features = batch.input_features
    o = new_model.generate(input_features.cuda(), max_length = 256)
    pred = processor.tokenizer.batch_decode(o, skip_special_tokens = True)

    # Predictions come back in input order, so index k pairs with batch_y[k].
    for k, hypothesis in enumerate(pred):
        wer.append(calculate_wer(batch_y[k], hypothesis))
        cer.append(calculate_cer(batch_y[k], hypothesis))

100%|█████████████████████████████████████████| 370/370 [00:34<00:00, 10.81it/s]


In [27]:
# Unweighted mean of the per-utterance WER and CER collected in the loop above
# (an average over utterances, not a corpus-level error rate).
np.mean(wer), np.mean(cer)

(0.22459602785237057, 0.08940646925178528)

In [28]:
# Load the post-processed Malaya Malay test set; the scoring loop below reads
# each entry's 'accept' flag and 'cleaned' transcript.
with open('postprocess-malaya-malay-test-set.json') as fopen:
    malaya_malay = json.load(fopen)

In [36]:
# Score the Malaya Malay set one utterance at a time, skipping entries whose
# 'accept' flag is falsy.
wer, cer = [], []

for i in tqdm(range(len(malaya_malay))):
    entry = malaya_malay[i]
    if entry['accept']:
        batch_y = [entry['cleaned']]
        ys = [malaya_speech.load(f'malay-test/{i}.wav')[0]]

        # Pad/trim the waveform and turn it into a log-mel spectrogram.
        mels = [
            {'input_features': whisper.log_mel_spectrogram(
                whisper.pad_or_trim(waveform.astype(np.float32))
            )}
            for waveform in ys
        ]

        batch = processor.feature_extractor.pad(mels, return_tensors="pt")
        input_features = batch.input_features
        o = new_model.generate(input_features.cuda(), max_length = 256)
        pred = processor.tokenizer.batch_decode(o, skip_special_tokens = True)

        for k, hypothesis in enumerate(pred):
            wer.append(calculate_wer(batch_y[k], hypothesis))
            cer.append(calculate_cer(batch_y[k], hypothesis))

100%|█████████████████████████████████████████| 765/765 [00:26<00:00, 28.70it/s]


In [37]:
# Unweighted mean of the per-utterance WER and CER over the accepted Malaya Malay
# entries scored in the loop above.
np.mean(wer), np.mean(cer)

(0.2030751600909704, 0.07452196517713668)

In [31]:
# Load the Singlish test set; the scoring loop below uses entry i directly as the
# reference transcript for singlish-test/{i}.wav.
with open('singlish-test.json') as fopen:
    singlish = json.load(fopen)

In [32]:
# Score the Singlish set one utterance at a time; entry i is the reference for
# singlish-test/{i}.wav.
wer, cer = [], []

for i in tqdm(range(len(singlish))):
    batch_y = [singlish[i]]
    ys = [malaya_speech.load(f'singlish-test/{i}.wav')[0]]

    # Pad/trim the waveform and turn it into a log-mel spectrogram.
    mels = [
        {'input_features': whisper.log_mel_spectrogram(
            whisper.pad_or_trim(waveform.astype(np.float32))
        )}
        for waveform in ys
    ]

    batch = processor.feature_extractor.pad(mels, return_tensors="pt")
    input_features = batch.input_features
    o = new_model.generate(input_features.cuda(), max_length = 256)
    pred = processor.tokenizer.batch_decode(o, skip_special_tokens = True)

    for k, hypothesis in enumerate(pred):
        wer.append(calculate_wer(batch_y[k], hypothesis))
        cer.append(calculate_cer(batch_y[k], hypothesis))

100%|███████████████████████████████████████| 3579/3579 [02:05<00:00, 28.61it/s]


In [33]:
# Unweighted mean of the per-utterance WER and CER over the Singlish set scored
# in the loop above.
np.mean(wer), np.mean(cer)

(0.1388829716298684, 0.07492980731417179)