In [1]:
from datasets import load_dataset, concatenate_datasets
import numpy as np

# Load every split of the CMU Arctic dataset and merge them into one dataset.
cmu = load_dataset('CLAPv2/CMU_Arctic')
cmu = np.array(concatenate_datasets(list(cmu.values())))

# NOTE(review): np.array(...) is presumably producing a 1-D object array whose
# elements are mutable per-recording dicts -- confirm. The loop below relies on
# that to edit each row in place.
for row in cmu:
    # Decode the audio once and keep only the raw samples and sample rate.
    decoded = row['audio'].get_all_samples()
    row['samples'] = decoded.data.numpy()
    row['samplerate'] = decoded.sample_rate
    # Drop the original audio entry now that its contents have been extracted.
    del row['audio']

Resolving data files:   0%|          | 0/119 [00:00<?, ?it/s]

In [38]:
from collections import defaultdict
import re

def get_count(items):
    """Return the total number of recording indices stored in `items`.

    `items` maps each spoken text to a list of recording indices; the result
    is the sum of the lengths of those lists.
    """
    return sum(len(indices) for indices in items.values())


# Group recording indices by the sentence found between double quotes in each
# row's 'text' field.
audios = defaultdict(list)
for idx, audio in enumerate(cmu):
    match = re.search(r'"(.*?)"', audio['text'])
    if match is None:
        # Fail with the offending row instead of an opaque AttributeError
        # from calling .group() on None.
        raise ValueError(f'Recording {idx} has no quoted text: {audio["text"]!r}')
    audios[match.group(1)].append(idx)
n_audios = get_count(audios)

print(f'Number of Spoken Texts: {len(audios)}')
# Reuse the total computed above rather than summing a second time.
print(f'Total Number of Recordings: {n_audios}')

Number of Spoken Texts: 1248
Total Number of Recordings: 13192


In [39]:
def has_multiple(text):
    """Return True when more than one recording index is stored for `text`."""
    return len(audios[text]) > 1


# Keep only the spoken texts that were recorded more than once.
audios = {
    text: indices
    for text, indices in audios.items()
    if has_multiple(text)
}
n_audios = get_count(audios)

print(f'Number of Spoken Texts with Multiple Recordings: {len(audios)}')
print(f'Total Number of Repeated Recordings: {n_audios}')

Number of Spoken Texts with Multiple Recordings: 1132
Total Number of Repeated Recordings: 13076


In [44]:
from itertools import combinations

# Two recordings of the same text are treated as interchangeable when their
# 'audio_len' values differ by less than this fraction of their mean.
MAX_RELATIVE_LEN_DIFF = 0.01

pairs = []
for indices in audios.values():
    # Every unordered pair (a, b) of recordings of the same text, in the same
    # order the nested index loops would visit them.
    for a, b in combinations(indices, 2):
        t1, t2 = cmu[a]['audio_len'], cmu[b]['audio_len']
        total = t1 + t2
        # Symmetric relative difference: |t1 - t2| over the mean of the two.
        # Two zero lengths are identical, so report 0.0 instead of dividing
        # by zero.
        dt = 2 * abs(t1 - t2) / total if total else 0.0
        if dt < MAX_RELATIVE_LEN_DIFF:
            pairs.append([a, b])

print(f'Total Number of Interchangeable Recording Pairs: {len(pairs)}')
# The grouping is no longer needed once the pairs are built. Note that after
# this `del`, re-running this cell requires re-running the cells that create
# `audios` and `n_audios` first.
del audios, n_audios

Total Number of Interchangeable Recording Pairs: 3581


In [50]:
from IPython.display import Audio, display

def display_pair(dataset, i, j):
    """Show an audio player for the rows of `dataset` at positions `i` and `j`.

    `dataset` is indexed with the list `[i, j]`, so it must support that form
    of indexing. Each selected row must provide 'samples' and 'samplerate'
    entries, which are passed to `Audio` as the data and the `rate`.
    """
    for row in dataset[[i, j]]:
        player = Audio(row['samples'], rate=row['samplerate'])
        display(player)

# Show players for the two recordings whose indices are stored at pairs[50].
display_pair(cmu, *pairs[50])