In [7]:
# !wget https://f000.backblazeb2.com/file/malaya-speech-model/data/audio-iium.zip
# !wget https://f000.backblazeb2.com/file/malaya-speech-model/collections/shuffled-iium.json
# !unzip audio-iium.zip -d iium

In [2]:
# !wget https://f000.backblazeb2.com/file/malaya-speech-model/data/audio-wattpad.zip
# !wget https://f000.backblazeb2.com/file/malaya-speech-model/collections/transcript-wattpad.json
# !unzip audio-wattpad.zip -d wattpad

In [3]:
# !wget https://f000.backblazeb2.com/file/malaya-speech-model/data/text-audiobook.tar.gz
# !wget https://f000.backblazeb2.com/file/malaya-speech-model/data/testset-audiobook.tar.gz
# !tar -zxf text-audiobook.tar.gz
# !tar -xf testset-audiobook.tar.gz

In [1]:
import os
from glob import glob
from tqdm import tqdm
import json

In [2]:
import unicodedata
import re

# Characters that survive normalisation; every other character is dropped.
vocabs = [" ", "a", "e", "n", "i", "t", "o", "u", "s", "k", "r", "l", "h", "d", "m", "g", "y", "b", "p", "w", "c", "f", "j", "v", "'", "-", "z", "0", "1", "x", "2", "q", "*", "5", "3", "4", "6", "9", "8", "7", "%", "$", "\"", "/", "&", ":", "+"]
def preprocessing_text(string):
    """Normalise a transcript so references and hypotheses are comparable.

    The text is lower-cased, NFC-normalised, stripped of every character
    that is not listed in ``vocabs``, and finally has runs of spaces
    collapsed to one space with leading/trailing spaces removed.

    Parameters
    ----------
    string : str
        Raw transcript text.

    Returns
    -------
    str
        The normalised text (possibly empty).
    """
    lowered = string.lower()
    # Compose first so a base letter plus combining mark is judged as the
    # single composed character when filtering against `vocabs`.
    composed = unicodedata.normalize('NFC', lowered)
    kept = ''.join(ch for ch in composed if ch in vocabs)
    collapsed = re.sub(r'[ ]+', ' ', kept)
    return collapsed.strip()

In [3]:
# Pair every Wattpad recording with its reference transcript as
# (wav_path, text) tuples.
wattpad = []
# Sorted so the evaluation order is identical on every run; glob() returns
# matches in arbitrary, filesystem-dependent order.
wavs = sorted(glob('wattpad/audio-wattpad/*wav'))

# JSON is UTF-8 by specification, so do not rely on the platform default.
with open('transcript-wattpad.json', encoding='utf-8') as fopen:
    transcript = json.load(fopen)

for i in tqdm(wavs):
    # The file stem is the integer index of the matching transcript entry.
    # basename/splitext remove only the directory and the final extension,
    # whereas str.replace('.wav', '') would delete every '.wav' in the name
    # and splitting on '/' assumes POSIX separators.
    index = os.path.splitext(os.path.basename(i))[0]
    text = transcript[int(index)]
    wattpad.append((i, text))

100%|██████████| 146/146 [00:00<00:00, 346558.23it/s]


In [4]:
# Pair every IIUM recording with its reference transcript as
# (wav_path, text) tuples.
iium = []
# Sorted so the evaluation order is identical on every run; glob() returns
# matches in arbitrary, filesystem-dependent order.
wavs = sorted(glob('iium/audio-iium/*wav'))

# JSON is UTF-8 by specification, so do not rely on the platform default.
with open('shuffled-iium.json', encoding='utf-8') as fopen:
    transcript = json.load(fopen)

for i in tqdm(wavs):
    # The file stem is the integer index of the matching transcript entry.
    # basename/splitext remove only the directory and the final extension,
    # whereas str.replace('.wav', '') would delete every '.wav' in the name
    # and splitting on '/' assumes POSIX separators.
    index = os.path.splitext(os.path.basename(i))[0]
    text = transcript[int(index)]
    iium.append((i, text))

100%|██████████| 97/97 [00:00<00:00, 225026.27it/s]


In [5]:
# Pair every audiobook test recording with its reference transcript as
# (wav_path, text) tuples.
audiobook = []
wavs = glob('test-set/*wav')
for i in tqdm(wavs):
    # The wav name uses '<>' where the transcript path has '/': drop
    # everything up to the first '<>' and turn the remaining ones into '/'.
    t = i.partition('<>')[2].replace('<>', '/')
    # Keep only what precedes the first '.wav', then point at the text tree.
    t = t.partition('.wav')[0].replace('output-wav', 'output-text')
    with open(f'text-audiobook/{t}.wav.txt') as fopen:
        text = fopen.read()
    audiobook.append((i, text))

100%|██████████| 300/300 [00:00<00:00, 4057.58it/s]


In [8]:
# Concatenate the three test sets and split the (path, text) pairs into two
# parallel tuples: wav paths and raw reference transcripts.
audios, texts = zip(*(wattpad + iium + audiobook))

In [9]:
# Normalised reference transcripts, index-aligned with `audios`.
processed_text = list(map(preprocessing_text, texts))

In [11]:
import malaya_speech
import speech_recognition as sr

# Recognizer instance the evaluation loop uses to read each audio file
# (`r.record`) and transcribe it (`r.recognize_google`).
r = sr.Recognizer()

In [19]:
from tqdm import tqdm

# Transcribe every recording with `r.recognize_google` and score the result
# against its normalised reference transcript.
wer, cer = [], []
# (index, path, exception) for every recording that could not be scored.
failed = []
for i in tqdm(range(len(audios))):
    try:
        with sr.AudioFile(audios[i]) as source:
            a = r.record(source)

        text = r.recognize_google(a, language = 'ms')
        # Hypotheses go through the same normalisation as the references.
        text = preprocessing_text(text)

        # Compute both metrics before storing either one, so `wer` and `cer`
        # stay the same length even if the second computation raises.
        wer_i = malaya_speech.metrics.calculate_wer(processed_text[i], text)
        cer_i = malaya_speech.metrics.calculate_cer(processed_text[i], text)
        wer.append(wer_i)
        cer.append(cer_i)
    except Exception as e:
        # Best effort: keep evaluating, but record and report which file
        # failed and why (a bare `print(e)` does not identify the recording).
        # Failed recordings add nothing to `wer`/`cer`, so the averages cover
        # only the recordings that were successfully scored.
        failed.append((i, audios[i], e))
        print(f'{audios[i]}: {type(e).__name__}: {e}')

100%|██████████| 543/543 [04:09<00:00,  2.18it/s]


In [20]:
import numpy as np

# (mean WER, mean CER) over the recordings that were scored.
tuple(np.mean(scores) for scores in (wer, cer))

(0.14270426731923877, 0.046821430168219824)