In [5]:
import glob
import os
import torch
import random
import shutil

# Seed Python's `random` module so that the `random.sample` draw made later
# in this notebook is repeatable when given the same population in the same order.
random.seed(1234)

from pathlib import PurePath
from speechbrain.pretrained import EncoderDecoderASR

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Root directory that is searched for audio and transcript files.
path = 'data/test-clean/'

# Pretrained SpeechBrain encoder-decoder ASR model; the selected device is
# handed over as a string through `run_opts`.
asr = EncoderDecoderASR.from_hparams(
    source="speechbrain/asr-transformer-transformerlm-librispeech",
    savedir="data/pretrained_models/asr-transformer-transformerlm-librispeech",
    run_opts={"device": str(device)},
    freeze_params=True,
)

In [6]:
# Recursively collect every audio (.flac) and transcript (.txt) file under `path`.
flac_files = glob.glob(os.path.join(path, "**/*.flac"), recursive=True)
text_files = glob.glob(os.path.join(path, "**/*.txt"), recursive=True)

# Build a list of (utterance id, list of words) pairs. Each transcript line is
# upper-cased, stripped of newline characters and split on single spaces:
# the first token is taken as the utterance id, the remaining tokens as words.
samples = []

for transcript_path in text_files:
    with open(transcript_path, "r") as transcript:
        for raw_line in transcript.readlines():
            tokens = raw_line.upper().strip("\n").split(" ")
            samples.append((tokens[0], tokens[1:]))

In [7]:
# Keep only utterances whose transcript has between 5 and 10 words (inclusive),
# mapping utterance id -> transcript re-joined into one space-separated string.
filtered = {
    utt_id: " ".join(w)
    for utt_id, w in samples
    if 5 <= len(w) <= 10
}

In [8]:
# Map utterance id -> audio path for every filtered utterance that the ASR
# model transcribes exactly as written in the reference transcript.
sample_audios = {}

for file_path in flac_files:
    # Use os.path.basename instead of splitting on "/" so the file name is
    # extracted correctly whatever path separator glob returned (on Windows
    # the paths contain backslashes and a "/" split would yield a wrong id).
    file = os.path.basename(file_path)
    # The utterance id is the part of the file name before the first dot.
    utt_id = file.split(".")[0]

    # Skip audio whose transcript was filtered out (or never seen).
    if utt_id not in filtered:
        continue

    # Keep the file only when the model output equals the reference string.
    output = asr.transcribe_file(file_path)
    if output == filtered[utt_id]:
        sample_audios[utt_id] = file_path

In [9]:
# Draw 100 distinct utterance ids at random and build the benchmark mapping
# utterance id -> audio path from them.
#
# `random.sample` requires a sequence: passing a dict keys view is deprecated
# since Python 3.9 and raises TypeError from Python 3.11 on. Converting to a
# list keeps the keys in dict insertion order, i.e. the same population order
# older interpreters used when they converted the view internally.
# Raises ValueError if fewer than 100 utterances are available.
random_subset = random.sample(list(sample_audios), 100)
benchmark = {x: sample_audios[x] for x in random_subset}

In [11]:
# Create the output directory in pure Python rather than through the shell:
# this works on every platform and, thanks to exist_ok=True, does not fail
# when the notebook is re-run and the directory already exists.
os.makedirs("data/benchmark", exist_ok=True)

In [12]:
# Copy every benchmark audio file into data/benchmark and record the selected
# utterance ids, one per line, in data/benchmark.txt.

# Make sure the destination is a directory: if it were missing, shutil.copy
# would instead write each audio file to a single file named "data/benchmark",
# silently overwriting it on every iteration.
os.makedirs("data/benchmark", exist_ok=True)

# The context manager closes the id list even if a copy fails part-way.
with open("data/benchmark.txt", "w") as f:
    for k in benchmark:
        shutil.copy(benchmark[k], 'data/benchmark')
        f.write(k + '\n')