In [None]:
import os
os.makedirs('/content/cache/')
os.makedirs('/content/cache/datasets')
os.makedirs('/content/cache/models')

%env HF_HOME=/content/cache
%env HF_DATASETS_CACHE=/content/cache/datasets
%env TRANSFORMERS_CACHE=/content/cache/models
%env TORCH_HOME=/content/cache

In [2]:
# Sets the same four cache environment variables as the setup cell above
# (this cell does not create the directories).
%env HF_HOME=/content/cache
%env HF_DATASETS_CACHE=/content/cache/datasets
%env TRANSFORMERS_CACHE=/content/cache/models
%env TORCH_HOME=/content/cache

env: HF_HOME=/content/cache
env: HF_DATASETS_CACHE=/content/cache/datasets
env: TRANSFORMERS_CACHE=/content/cache/models
env: TORCH_HOME=/content/cache


In [None]:
!pip install jiwer
!pip install datasets

In [3]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset
import re
import jiwer
import jiwer.transforms as tr
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm

def _shared_normalization_steps():
    """Return fresh instances of the clean-up steps common to both pipelines."""
    return [
        jiwer.ToLowerCase(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.Strip(),
    ]


# Character-level pipeline: normalize, then split every sentence into characters.
cer_transform = tr.Compose(
    _shared_normalization_steps() + [jiwer.ReduceToListOfListOfChars()]
)

# Word-level pipeline: normalize, then split every sentence into words.
# NOTE(review): presumably jiwer's default WER pipeline with lower-casing added
# in front -- confirm against the installed jiwer version.
wer_transform = jiwer.Compose(
    _shared_normalization_steps() + [jiwer.ReduceToListOfListOfWords()]
)

def compute_cer(reference, hypothesis):
    """Return the character error rate of `hypothesis` against `reference`.

    Both inputs are passed through `cer_transform`, which already lower-cases
    them, so no separate lower-casing is done here.

    The reference pipeline is passed as `reference_transform`; this replaces the
    `truth_transform` keyword, which jiwer deprecated in its 3.0 release.

    Args:
        reference: ground-truth transcription.
        hypothesis: model transcription.

    Returns:
        The error rate computed by `jiwer.wer` over character tokens.
    """
    return jiwer.wer(
        reference,
        hypothesis,
        reference_transform=cer_transform,
        hypothesis_transform=cer_transform,
    )

def compute_wer(reference, hypothesis):
    """Return the word error rate of `hypothesis` against `reference`.

    Both inputs are passed through `wer_transform`, which already lower-cases
    them, so no separate lower-casing is done here.

    The reference pipeline is passed as `reference_transform`; this replaces the
    `truth_transform` keyword, which jiwer deprecated in its 3.0 release.

    Args:
        reference: ground-truth transcription.
        hypothesis: model transcription.

    Returns:
        The error rate computed by `jiwer.wer` over word tokens.
    """
    return jiwer.wer(
        reference,
        hypothesis,
        reference_transform=wer_transform,
        hypothesis_transform=wer_transform,
    )


# Characters that survive normalization; every other character becomes a space.
# The string is interpolated into a regex character class, where the
# backslash-hyphen pair is an escaped "-" and the final backslash escapes the
# trailing space. The backslash before "-" is written as "\\" because "\-" is
# not a valid Python string escape (SyntaxWarning on Python 3.12+); the runtime
# value of the string is unchanged.
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZÇÃÀÁÂÊÉÍÓÔÕÚÛabcdefghijklmnopqrstuvwxyzçãàáâêéíóôõũúû1234567890%\\-\n/\\ "

# Spelling variants of hesitation/filler words mapped to one canonical form.
# Built once at module level instead of on every call.
_FILLER_WORD_MAP = {
    "éh": "eh",
    "ehm": "eh",
    "ehn": "eh",
    "hum": "uh",
    "hm": "uh",
    "uhm": "uh",
    "hã": "ah",
    "ãh": "ah",
    "ã":  "ah",
    "hmm": "uh",
    "mm": "uh",
    "mhm": "uh"
}

def replace_special_tokens_and_normalize(text):
    """Normalize a transcription before it is scored.

    Steps, in order:
      1. lower-case the text;
      2. collapse every run of "h" into a single "h";
      3. replace each character outside `alphabet` with a space;
      4. collapse runs of spaces;
      5. map filler-word variants to their canonical spelling.

    Args:
        text: raw transcription string.

    Returns:
        The normalized words joined by single spaces ("" when nothing is left).
    """
    text = text.lower()

    text = re.sub("h+", "h", text)
    # The pattern is built at call time so it always reflects the current `alphabet`.
    text = re.sub("[^{}]".format(alphabet+" "), " ", text)
    text = re.sub("[ ]+", " ", text)

    # Splitting on a single space leaves empty strings at the edges; skip them.
    return " ".join(
        _FILLER_WORD_MAP.get(word, word) for word in text.split(' ') if word
    )

def calculate_wer_cer(reference, hypothesis):
    """Return the `(wer, cer)` pair for one reference/hypothesis pair.

    When either string is blank after stripping, both rates are reported as 1
    and the metric helpers are not called.
    """
    if not reference.strip() or not hypothesis.strip():
        return 1, 1
    return compute_wer(reference, hypothesis), compute_cer(reference, hypothesis)

# Run on the first CUDA device in half precision when available; otherwise on CPU in float32.
_cuda_available = torch.cuda.is_available()
if _cuda_available:
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32





In [4]:
# Copy the CORAA v1 archive from the GCS bucket into the current working directory.
# NOTE(review): the evaluation function loads an "audiofolder" dataset from /content,
# which presumably requires this archive to be extracted first -- no extraction
# step is visible in this notebook; confirm and add one if it is missing.
!gsutil cp gs://usp_pos_doc_sid/coraav1/CORAA-v1.zip .

Copying gs://usp_pos_doc_sid/coraav1/CORAA-v1.zip...
/ [0 files][    0.0 B/  2.2 GiB]                                                ==> NOTE: You are downloading one or more large file(s), which would
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

/
Operation completed over 1 objects/2.2 GiB.                                      


In [10]:
def calculate_wer_cer_model_dataset(model_id, dataset_id, dataset_size, experiment_name, text_field_name, path_field_name):

    #    dataset = load_dataset(dataset_id, "pt", split="test",  token="yyyyyyy")
    dataset = load_dataset('audiofolder', data_dir="/content", split="test")

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        torch_dtype=torch_dtype,
        device=device,
    )

    results = []
    i = 0
    with open(f"{experiment_name}-test-log.txt", mode="w", encoding="utf-8") as f:
        for out in tqdm(pipe(KeyDataset(dataset, "audio"))):
            #f.write(f"{i} - {dataset[i]['audio_name']} - {dataset[i]['file_path']} - {dataset[i]['speaker_type']} - {dataset[i]['speaker_code']} - {dataset[i]['speaker_gender']}\n")
            f.write(f"{i} - {dataset[i]['audio']['path']}\n")
            results.append(out['text'])
            i+=1

    total_wer = 0.0
    total_cer = 0.0
    qtd = 0
    file_name_results = f"{experiment_name}-test-output.tsv"
    with open(file_name_results, mode="w", encoding="utf-8") as f:
        #f.write("audio_name\tspeaker_type\tspeaker_code\tspeaker_gender\tfile_path\toriginal\tprediction\toriginal normalized\tprediction normalized\twer\tcer\n")
        f.write("file_path\toriginal\tprediction\toriginal normalized\tprediction normalized\twer\tcer\n")
        for i, text in enumerate(results):
            qtd += 1
            audio_path = dataset[i]['audio']['path']
            #audio_name = dataset[i]['audio_name']
            #speaker_type = dataset[i]['speaker_type']
            #speaker_code = dataset[i]['speaker_code']
            #speaker_gender = dataset[i]['speaker_gender']
            text_original = dataset[i][text_field_name]
            text = text.replace('\n','')
            text_original = text_original.replace('\n','')
            text_original_norm = replace_special_tokens_and_normalize(text_original)
            text_norm = replace_special_tokens_and_normalize(text)
            wer, cer = calculate_wer_cer(text_original_norm, text_norm)
            total_wer += wer
            total_cer += cer
            file_path = audio_path.replace('\\', '/')[20:]
            #f.write(f"{audio_name}\t{speaker_type}\t{speaker_code}\t{speaker_gender}\t{file_path}\t{text_original}\t{text}\t{text_original_norm}\t{text_norm}\t{wer}\t{cer}\n")
            f.write(f"{file_path}\t{text_original}\t{text}\t{text_original_norm}\t{text_norm}\t{wer}\t{cer}\n")

    print(experiment_name, qtd, "WER:", total_wer, total_wer/qtd, "CER:", total_cer, total_cer/qtd)
    file_name_results = f"{experiment_name}-test-output.tsv"
    #copy file_name_results to storage
    !gsutil cp {file_name_results} gs://usp_pos_doc_sid/mupe_coling/{file_name_results}


In [11]:
# Datasets:
# CORAA ASR, CORAA NURC-SP and Common Voice
# NOTE(review): the evaluation function loads its data from the local /content
# audiofolder; the dataset id and size passed here are not used by it.
# NOTE(review): the first commented-out call below passes only four arguments,
# while the function takes six -- update it before re-enabling.
#calculate_wer_cer_model_dataset("sidleal/distil-whisper-coraa-mupe-asr-2", "sidleal/CORAA-MUPE-ASR-1", 22000, 'mupe_mupe')
calculate_wer_cer_model_dataset("sidleal/distil-whisper-coraa-mupe-asr-2", "gabrielrstan/CORAA-v1.1", 13000, 'mupe_coraav1', 'text', 'file_name')
#calculate_wer_cer_model_dataset("sidleal/distil-whisper-coraa-mupe-asr-2", "RodrigoLimaRFL/nurc-sp-hugging-face", 7000, 'mupe_nurcsp', 'text', 'file_path')
# Common Voice dataset ids: "mozilla-foundation/common_voice_13_0" "mozilla-foundation/common_voice_17_0"
#calculate_wer_cer_model_dataset("sidleal/distil-whisper-coraa-mupe-asr-2", "mozilla-foundation/common_voice_13_0", 10000, 'mupe_commonvoice', 'sentence', 'path')
#calculate_wer_cer_model_dataset("openai/whisper-large-v3", "sidleal/CORAA-MUPE-ASR-1", 22000, 'whisp_large_mupe', 'original_text', 'file_path')


Resolving data files:   0%|          | 0/12666 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/12666 [00:00<?, ?it/s]



mupe_coraav1 12666 WER: 3310.8521342251183 0.26139682095571753 CER: 1797.7884795558193 0.14193813986703138
Copying file://mupe_coraav1-test-output.tsv [Content-Type=text/tab-separated-values]...
/ [1 files][  2.8 MiB/  2.8 MiB]                                                
Operation completed over 1 objects/2.8 MiB.                                      


common
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%
 9072/9072 [24:36<00:00,  5.63it/s]
mupe_commonvoice 9072 WER: 1485.7992285492246 0.16377857457553183 CER: 468.9457563812754 0.0516915516293293
Copying file://mupe_commonvoice-test-output.tsv [Content-Type=text/tab-separated-values]...
/ [1 files][  2.8 MiB/  2.8 MiB]                                                
Operation completed over 1 objects/2.8 MiB.        