In [1]:
# One-time data setup (kept commented out so re-running the notebook does not
# re-download): fetches the FEMALE_01.zip audio archive and the texts.json
# transcript file from the mesolitica/IMDA-TTS dataset repo, then unpacks the
# archive. Uncomment on a fresh machine.
# !wget https://huggingface.co/datasets/mesolitica/IMDA-TTS/resolve/main/FEMALE_01.zip
# !wget https://huggingface.co/datasets/mesolitica/IMDA-TTS/resolve/main/texts.json
# !unzip -q FEMALE_01.zip

In [2]:
import json
import os
from glob import glob

import jiwer
import numpy as np
import torch
from datasets import Audio
from tqdm import tqdm
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline

# Sampling rate (Hz) used to configure the `Audio` feature below.
sr = 16000
# `datasets.Audio` feature used later to encode/decode each wav file at `sr`.
audio = Audio(sampling_rate=sr)
# Characters removed from both hypothesis and reference before scoring.
# Note: the hyphen '-' is not in this set, so hyphens are kept, while the
# apostrophe is in the set and is removed.
PUNCTUATION = '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'

In [3]:
# Load the processor and the speech seq2seq model from the same checkpoint.
# The checkpoint id is named once so the two cannot drift apart.
MODEL_ID = 'openai/whisper-tiny'

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID,
    # `use_flash_attention_2=True` is deprecated (see the warning this cell
    # used to print); `attn_implementation` is the supported spelling.
    attn_implementation = 'flash_attention_2',
    torch_dtype = torch.bfloat16,
)

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


generation_config.json:   0%|          | 0.00/3.72k [00:00<?, ?B/s]

In [4]:
# Move the model to the GPU. The return value is bound to `_` so the cell
# does not display it as its output.
_ = model.cuda()

In [5]:
# Load the transcript records. The encoding is set explicitly because
# `open()` otherwise falls back to the platform locale, which can mis-decode
# non-ASCII text in the JSON file.
with open('texts.json', encoding = 'utf-8') as fopen:
    data = json.load(fopen)

In [6]:
# Sanity check: number of records loaded from texts.json.
len(data)

6033

In [7]:
# Peek at the first record to see which fields each entry carries.
data[0]

{'filename': '0000.wav',
 'text': 'Author of the danger trail, Philip Steels, etc.'}

In [8]:
# Transcribe the first N_EVAL utterances and score each one against its
# reference text. `wer` and `cer` end up holding one value per utterance,
# each capped at 1.0.
N_EVAL = 700  # number of leading records of `data` to evaluate

# Translation table that deletes every character in PUNCTUATION in one pass.
strip_punct = str.maketrans('', '', PUNCTUATION)

wer, cer = [], []

for row in tqdm(data[:N_EVAL]):
    f = os.path.join('FEMALE_01', row['filename'])
    # Decode the wav file through the `Audio` feature configured with `sr`.
    y = audio.decode_example(audio.encode_example(f))['array']
    # Use `sr` rather than a second hard-coded 16000 so the two cannot diverge.
    inputs = processor([y], return_tensors = 'pt', sampling_rate = sr)
    # Follow the model's own device/dtype instead of hard-coding bfloat16 + cuda.
    features = inputs['input_features'].to(device = model.device, dtype = model.dtype)
    r = model.generate(features, language='en', return_timestamps=True)

    out = processor.tokenizer.decode(r[0], skip_special_tokens = True).strip()
    # The reference is round-tripped through the tokenizer (encode then decode)
    # before scoring, so it goes through the same decode step as the hypothesis.
    actual = processor.tokenizer.decode(
        processor.tokenizer.encode(row['text']), skip_special_tokens = True
    ).strip()

    # Normalise both sides identically: drop punctuation, then lowercase.
    out = out.translate(strip_punct).lower()
    actual = actual.translate(strip_punct).lower()

    # Error rates can exceed 1 when the hypothesis has many insertions; cap
    # each utterance at 1.0.
    wer.append(min(jiwer.wer(actual, out), 1.0))
    cer.append(min(jiwer.cer(actual, out), 1.0))

100%|██████████| 700/700 [02:15<00:00,  5.17it/s]


In [10]:
# Mean of the per-utterance (capped) WER and CER values. This is an average
# over utterances, not a corpus-level error rate pooled over all words.
np.mean(wer), np.mean(cer)

(0.11150629529200957, 0.048805770734828904)