In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [9]:
from transformers import AutoFeatureExtractor, AutoModel, AutoTokenizer
import librosa
import torch
from glob import glob
from jiwer import cer
from tqdm import tqdm
import json
import os

torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f3357fcbee0>

In [3]:
from datasets import load_dataset

ds = load_dataset("malaysia-ai/common_voice_17_0")

In [4]:
df_test = ds['test'].to_pandas()

In [5]:
df_test.shape

(533642, 14)

In [6]:
model_id = "mesolitica/whisper-conv-large-v3-turbo"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id, trust_remote_code = True, torch_dtype = 'auto').cuda()
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [7]:
!mkdir evaluate-whisper-conv-large-v3-turbo

mkdir: cannot create directory ‘evaluate-whisper-conv-large-v3-turbo’: File exists


In [8]:
from tqdm import tqdm
import json
import os

template = '<|startoftranscript|><|{locale}|><|transcribe|><|notimestamps|>'

for i in tqdm(range((len(df_test) // 2) * 0, (len(df_test) // 2) * 1, 1)):
    
    filename = f'evaluate-whisper-conv-large-v3-turbo/{i}.json'
    try:
        if os.path.exists(filename):
            with open(filename) as fopen:
                json.load(fopen)
                continue
    except:
        pass
    
    t = df_test['sentence'].iloc[i]

    if not isinstance(t, str):
        continue

    if len(t) < 5:
        continue

    l = df_test['locale'].iloc[i]

    if not isinstance(l, str):
        continue

    if len(l) < 2:
        continue
            
    y, sr = librosa.load(df_test.iloc[i]['audio_filename'], sr = feature_extractor.sampling_rate)
    input_ids = tokenizer(template.replace('{locale}', l), 
        add_special_tokens = False, return_tensors = 'pt')['input_ids']
    features = feature_extractor(
        [y], 
        return_tensors = 'pt', 
        return_attention_mask = True,
        sampling_rate = 16000,
    )
    features['input_features'] = features['input_features'].cuda()
    features['attention_mask'] = features['attention_mask'].cuda()
    features['decoder_input_ids'] = input_ids.cuda()
    generate_kwargs = dict(
        **features,
        max_new_tokens=256,
        temperature=0.1,
        do_sample=True
    )
    generation_output = model.generate(**generate_kwargs)
    decoded = tokenizer.decode(generation_output[0])
    with open(filename, 'w') as fopen:
        json.dump({'predict': decoded, 'actual': t}, fopen)

100%|███████████████████████████████████████████████████████████████████████████████| 266821/266821 [3:45:52<00:00, 19.69it/s]


In [10]:
from collections import defaultdict
import string

def clean(s):
    s = ''.join([c for c in s.lower() if c not in string.punctuation])
    return s.strip()

languages = defaultdict(list)

files = glob('evaluate-whisper-conv-large-v3-turbo/*')
for f in tqdm(files):
    with open(f) as fopen:
        d = json.load(fopen)
    l = d['predict'].split('<|')[2].split('|>')[0]
    predict = d['predict'].split('<|notimestamps|>')[1].replace('<|endoftext|>', '')
    actual = d['actual']
    predict = clean(predict)
    actual = clean(actual)
    score = cer(actual, predict)
    languages[l].append(score)

len(files)

100%|██████████████████████████████████████████████████████████████████████████████| 532364/532364 [00:26<00:00, 20420.93it/s]


532364

In [12]:
with open('languages-evaluate-whisper-conv-large-v3-turbo.json', 'w') as fopen:
    json.dump(languages, fopen)

In [13]:
import numpy as np

means = []
for k, v in languages.items():
    mean = np.mean(v)
    if mean >= 1.0:
        mean = 1.0
    print(f'lang: {k}, samples: {len(v)}, CER: {mean}')
    means.append(mean)
    
print('\naverage CER:', np.mean(means))

lang: gl, samples: 9949, CER: 0.042740443121340566
lang: en, samples: 16379, CER: 0.060986384009768274
lang: ar, samples: 10458, CER: 0.22266123579844427
lang: kab, samples: 14972, CER: 0.3244665236586341
lang: ml, samples: 703, CER: 0.42335890521056685
lang: kk, samples: 514, CER: 0.17043440799796145
lang: ltg, samples: 2904, CER: 0.23117590536047175
lang: fr, samples: 16145, CER: 0.048485631588568376
lang: de, samples: 16170, CER: 0.026314971778193794
lang: fi, samples: 1554, CER: 0.05055169332273527
lang: pt, samples: 9432, CER: 0.04087286366709751
lang: ia, samples: 1816, CER: 0.05992562427372291
lang: eu, samples: 13621, CER: 0.0512883172324828
lang: ro, samples: 3896, CER: 0.05076617371579273
lang: sw, samples: 12086, CER: 0.1507494503501684
lang: sv-SE, samples: 5247, CER: 0.061493613079958064
lang: ta, samples: 8263, CER: 0.13906399211712145
lang: et, samples: 2653, CER: 0.0940406805612152
lang: lg, samples: 11902, CER: 0.1739333269639051
lang: it, samples: 15154, CER: 0.023851