In [27]:
# Root directories of the two evaluation corpora.
tedlium_path = "/mnt/data/es-es/data"
mls_path = "/mnt/data/mls_spanish_opus"

# Whisper checkpoint to evaluate.
# Other options: openai/whisper-medium, openai/whisper-small, openai/whisper-base
model_id = 'openai/whisper-tiny'

In [28]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so the notebook still runs (slowly) on machines without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float32

# Load the checkpoint selected by `model_id` and move it to the chosen device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# The processor bundles the tokenizer and the feature extractor of the checkpoint.
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline shared by the evaluation cells below.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [29]:
from pathlib import Path
import pandas as pd
import soundfile as sf
from tqdm import tqdm
from yaml import safe_load

def extract_audio_metadata(path):
    """Return the sample rate and duration of an audio file.

    Args:
        path: Location of the audio file, passed unchanged to ``sf.info``.

    Returns:
        dict with keys ``'filename'`` (the ``path`` argument as given),
        ``'samplerate'`` and ``'duration'`` (both read from ``sf.info``).
    """
    audio_info = sf.info(path)
    metadata = {'filename': path}
    metadata['samplerate'] = audio_info.samplerate
    metadata['duration'] = audio_info.duration
    return metadata

def read_mls(path):
    """Build a DataFrame describing the MLS utterances found under ``path``.

    For each split (currently only ``'test'``) the tab-separated file
    ``<path>/<split>/transcripts.txt`` is read as two columns, ``idx`` and
    ``transcription``. Every ``idx`` is resolved to the ``*.opus`` file with
    the same stem found recursively under ``<path>/<split>/audio``.

    Args:
        path: Root directory of the MLS dataset.

    Returns:
        pandas.DataFrame with columns ``idx``, ``transcription``, ``filename``,
        ``partition``, ``start`` (always 0), ``samplerate``, ``duration`` and
        ``dataset`` (always ``'mls'``).

    Raises:
        KeyError: if an ``idx`` has no matching ``.opus`` file.
    """
    root = Path(path)
    frames = []
    for split in ['test']:
        split_dir = root / split
        split_df = pd.read_csv(
            split_dir / 'transcripts.txt',
            sep='\t',
            header=None,
            names=['idx', 'transcription'],
        )

        # Index the audio files by stem so each transcript id can be looked up.
        stem_to_file = {}
        for audio_file in (split_dir / 'audio').rglob('*.opus'):
            stem_to_file[audio_file.stem] = str(audio_file.resolve())

        split_df['filename'] = split_df['idx'].apply(lambda stem: stem_to_file[stem])
        split_df['partition'] = split
        split_df['start'] = 0
        frames.append(split_df)

    combined = pd.concat(frames)

    # Probe every audio file once and join the result back on the file name.
    file_infos = pd.DataFrame(
        [extract_audio_metadata(name) for name in tqdm(combined['filename'])]
    )
    combined = pd.merge(combined, file_infos, left_on='filename', right_on='filename')
    combined['dataset'] = 'mls'
    return combined

def read_tedlium(path):
    """Build a DataFrame describing the segments of the corpus under ``path``.

    For each split (currently only ``'test'``) this reads the transcripts from
    ``<path>/<split>/txt/<split>.es`` and the per-segment metadata from
    ``<path>/<split>/txt/<split>.yaml``; the i-th transcript is paired with
    the i-th metadata entry.

    Args:
        path: Root directory of the dataset.

    Returns:
        pandas.DataFrame with one row per segment. Each row carries the keys
        of its YAML entry except ``wav``, ``offset`` and ``speaker_id``, plus
        ``idx``, ``transcription``, ``partition``, ``filename`` (absolute path
        of the audio file under ``<path>/<split>/wav``), ``samplerate``,
        ``start`` (the YAML ``offset``) and ``dataset`` (always ``'tedlium'``).
        NOTE(review): ``evaluate_df`` reads a ``duration`` column, which is
        presumably supplied by the YAML entries - confirm against the data.

    Raises:
        IndexError: if there are fewer transcripts than metadata entries.
        KeyError: if a metadata entry lacks ``wav``, ``offset`` or
            ``speaker_id``.
    """
    all_dfs = []
    for split in ['test']:
        txt_path, wav_path = Path(path, split, 'txt'), Path(path, split, 'wav')
        transcripts = load_tedlium_transcripts(txt_path / f'{split}.es')
        # Read the metadata as UTF-8 regardless of the system locale.
        with (txt_path / f'{split}.yaml').open('r', encoding='utf-8') as f:
            audio_metadata = safe_load(f)
        for i, audio in enumerate(tqdm(audio_metadata)):
            audio['transcription'] = transcripts[i]
            # The metadata names '.wav' files but the audio is read as '.flac'.
            # Swap only the extension: a plain str.replace('wav', 'flac')
            # would also rewrite a 'wav' substring inside the file stem.
            audio_name = Path(audio['wav'])
            if audio_name.suffix == '.wav':
                audio_name = audio_name.with_suffix('.flac')
            audio['partition'] = split if split != 'valid' else 'dev'
            audio['filename'] = str((wav_path / audio_name).resolve())
            audio['samplerate'] = extract_audio_metadata(audio['filename'])['samplerate']
            audio['start'] = audio['offset']
            # Drop the raw fields that have been replaced or are not needed.
            del audio['wav']
            del audio['offset']
            del audio['speaker_id']
        # The position of each segment within the split becomes its 'idx'.
        split_df = pd.DataFrame(audio_metadata).reset_index(names='idx')
        all_dfs.append(split_df)
    df = pd.concat(all_dfs)
    df['dataset'] = 'tedlium'
    return df
        
def load_tedlium_transcripts(file_path):
    """Read one lower-cased transcript per line from ``file_path``.

    Args:
        file_path: ``pathlib.Path`` of the transcript file (one segment per
            line).

    Returns:
        list[str]: every line lower-cased with its last two characters
        removed.
    """
    # Decode explicitly as UTF-8 so accented Spanish characters are read
    # correctly whatever the system locale is.
    with file_path.open('r', encoding='utf-8') as f:
        # NOTE(review): [:-2] drops the newline plus the character before it
        # (presumably trailing punctuation); a final line without a newline
        # would lose two real characters - confirm against the data.
        return [line.lower()[:-2] for line in f]

In [30]:
# Build the evaluation tables for both corpora; each reader shows its own
# tqdm progress bar while it probes the audio files.
df_mls = read_mls(mls_path)
df_tedlium = read_tedlium(tedlium_path)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2385/2385 [00:00<00:00, 16616.09it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1012/1012 [00:00<00:00, 23664.27it/s]


In [31]:
import librosa
import time
from tqdm import tqdm
from whisper_normalizer.basic import BasicTextNormalizer
from jiwer import wer, cer
from pandas import DataFrame
from nemo_text_processing.text_normalization.normalize import Normalizer

def evaluate_df(df):
    """Transcribe every row of ``df`` with the global ``pipe`` and score it.

    Reference and hypothesis are both passed through the NeMo Spanish text
    normalizer and then through ``BasicTextNormalizer`` before WER and CER
    are computed with jiwer.

    Args:
        df: DataFrame with the columns ``filename``, ``start``, ``duration``
            and ``transcription``. ``start`` and ``duration`` are forwarded
            to ``librosa.load`` as ``offset`` and ``duration``.

    Returns:
        pandas.DataFrame with one row per input row and the columns ``gt``
        (normalized reference), ``pred`` (normalized hypothesis), ``wer``,
        ``cer`` and ``time`` (seconds spent inside the ``pipe`` call).
    """
    results = {'gt': [], 'pred': [], 'wer': [], 'cer': [], 'time': []}
    text_normalizer = Normalizer(input_case='cased', lang='es')
    second_normalizer = BasicTextNormalizer()
    # A bare numpy array is taken by the pipeline to be sampled at the feature
    # extractor's rate already, so the audio must be loaded at that rate.
    # librosa would otherwise resample everything to its 22050 Hz default.
    target_sr = pipe.feature_extractor.sampling_rate
    for _, row in tqdm(df.iterrows(), total=len(df)):
        start = row['start']
        duration = row['duration']
        filename = row['filename']
        og_transcription = second_normalizer(text_normalizer.normalize(row['transcription']))
        wav, _sr = librosa.load(filename, sr=target_sr, offset=start, duration=duration)
        # Time only the model call; loading and normalization are excluded.
        start_time = time.perf_counter()
        transcription = pipe(wav, generate_kwargs={"language": "spanish"})
        results['time'].append(round(time.perf_counter() - start_time, 10))
        transcription = second_normalizer(text_normalizer.normalize(transcription['text']))
        results['gt'].append(og_transcription)
        results['pred'].append(transcription)
        results['wer'].append(wer(og_transcription, transcription))
        results['cer'].append(cer(og_transcription, transcription))
    return DataFrame(results)

In [32]:
# Checkpoint name without the organisation prefix; used to tag the CSV files.
model_name = model_id.rpartition('/')[2]

# Evaluate one corpus at a time and write its results before starting the
# next one.
df_tedlium_results = evaluate_df(df_tedlium)
df_tedlium_results.to_csv(path_or_buf=f'tedlium_results_{model_name}.csv', index=False)

df_mls_results = evaluate_df(df_mls)
df_mls_results.to_csv(path_or_buf=f'mls_results_{model_name}.csv', index=False)

 NeMo-text-processing :: INFO     :: Creating ClassifyFst grammars. This might take some time...
  return F.conv1d(input, weight, bias, self.stride,
57it [00:13,  4.09it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
120it [00:29,  2.78it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
207it [00:55,  2.67it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
215it [00:57,  5.49it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
337it [01:29,  3.80it/s]Whis