In [1]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/voxceleb/vox2_test_aac.zip
# !unzip vox2_test_aac.zip

In [2]:
# !wget https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/vox2_meta.csv

In [3]:
from glob import glob

files = glob('aac/**/*.m4a', recursive = True)
len(files)

36237

In [14]:
!mkdir voxceleb-wav

In [15]:
import mp
from tqdm import tqdm
from pydub import AudioSegment

In [16]:
def loop(args):
    files = args[0]
    index = args[1]
    for file in tqdm(files):
        audio = AudioSegment.from_file(file)
        new_file = file.replace('dev', 'wav').replace('/', '-').replace('.m4a', '.wav')
        new_file = f'voxceleb-wav/{new_file}'
        audio.set_frame_rate(16000).set_channels(1).export(new_file, format="wav")

In [17]:
r = mp.multiprocessing(files, loop, cores = 100, returned = False)

 97%|█████████▋| 350/362 [02:47<00:05,  2.16it/s]
 98%|█████████▊| 355/362 [02:49<00:03,  2.03it/s]
100%|██████████| 362/362 [02:49<00:00,  2.13it/s]
 99%|█████████▉| 358/362 [02:49<00:02,  1.88it/s]
 99%|█████████▊| 357/362 [02:50<00:02,  2.04it/s]
 98%|█████████▊| 353/362 [02:50<00:04,  1.93it/s]
 98%|█████████▊| 355/362 [02:50<00:03,  1.97it/s]
 99%|█████████▉| 359/362 [02:50<00:01,  2.26it/s]
100%|██████████| 362/362 [02:50<00:00,  2.12it/s]
 98%|█████████▊| 356/362 [02:50<00:02,  2.34it/s]
100%|██████████| 362/362 [02:50<00:00,  1.76it/s]
 99%|█████████▉| 359/362 [02:50<00:01,  2.17it/s]
 98%|█████████▊| 356/362 [02:50<00:02,  2.14it/s]
100%|██████████| 362/362 [02:51<00:00,  2.12it/s]
 99%|█████████▉| 358/362 [02:51<00:02,  1.93it/s]
 99%|█████████▉| 358/362 [02:51<00:01,  2.13it/s]
100%|██████████| 362/362 [02:51<00:00,  2.11it/s]
 98%|█████████▊| 354/362 [02:51<00:04,  1.97it/s]
 98%|█████████▊| 355/362 [02:51<00:03,  1.87it/s]
 99%|█████████▉| 359/362 [02:51<00:01,  2.22it/s]


In [18]:
!du -hs voxceleb-wav

8.7G	voxceleb-wav


In [8]:
files = glob('/home/husein/youtube/voxceleb-wav/*.wav', recursive = True)
files[:5]

['/home/husein/youtube/voxceleb-wav/aac-id06310-DkACPQtkHZI-00050.wav',
 '/home/husein/youtube/voxceleb-wav/aac-id04232-Ui7YgRJZ8YQ-00274.wav',
 '/home/husein/youtube/voxceleb-wav/aac-id06811-QibFE4o9De0-00134.wav',
 '/home/husein/youtube/voxceleb-wav/aac-id07961-HHtJ97YaLS4-00126.wav',
 '/home/husein/youtube/voxceleb-wav/aac-id03382-PQzU-p_X-4Y-00018.wav']

In [9]:
import pandas as pd

df = pd.read_csv('/home/husein/youtube/vox2_meta.csv')
df = df[df['Set '] == 'test ']
speakers = df['VoxCeleb2 ID '].unique().tolist()
speakers = [s.strip() for s in speakers]

In [10]:
from collections import defaultdict

speakers_idx = defaultdict(list)

for speaker in speakers:
    for file in files:
        if speaker in file:
            speakers_idx[speaker].append(file)

In [21]:
from tqdm import tqdm
import os

def get_id(file):
    return os.path.split(file)[1].split('-')[1]

get_id(speakers_idx['id07426'][0])

'id07426'

In [22]:
import random

sample_files = random.sample(files, 10000)
k = 5
labels = []

for file in tqdm(sample_files):
    left_speaker = get_id(file)
    for speaker in speakers:
        if left_speaker == speaker:
            label = 1
        else:
            label = 0
        samples = random.sample(speakers_idx[speaker], min(k, len(speakers_idx[speaker])))
        for s in samples:
            labels.append((label, file, s))

100%|██████████| 10000/10000 [00:10<00:00, 943.37it/s]


In [23]:
len(labels)

5900000

In [24]:
unique_files = []
for l in labels:
    unique_files.extend(l[1:])
    
len(set(unique_files))

36237

In [26]:
import json

with open('voxceleb2-test-sample.json', 'w') as fopen:
    json.dump(sample_files, fopen)

In [27]:
import pickle

with open('voxceleb2-test-labels.pkl', 'wb') as fopen:
    pickle.dump(labels, fopen)

In [19]:
!tar -cf voxceleb2-test-wav.tar voxceleb-wav

In [29]:
b2_application_key_id = os.environ['b2_application_key_id']
b2_application_key = os.environ['b2_application_key']

In [30]:
from b2sdk.v1 import *
info = InMemoryAccountInfo()
b2_api = B2Api(info)
application_key_id = b2_application_key_id
application_key = b2_application_key
b2_api.authorize_account("production", application_key_id, application_key)
file_info = {'how': 'good-file'}
b2_bucket = b2_api.get_bucket_by_name('malay-dataset')

In [31]:
file = 'voxceleb2-test-sample.json'
outPutname = 'voxceleb/voxceleb2-test-sample.json'
b2_bucket.upload_local_file(
    local_file=file,
    file_name=outPutname,
    file_infos=file_info,
)

<b2sdk.file_version.FileVersionInfo at 0x7f47ab248320>

In [None]:
file = 'voxceleb2-test-labels.pkl'
outPutname = 'voxceleb/voxceleb2-test-labels.pkl'
b2_bucket.upload_local_file(
    local_file=file,
    file_name=outPutname,
    file_infos=file_info,
)

In [23]:
file = 'voxceleb2-test-wav.tar'
outPutname = 'voxceleb/voxceleb2-test-wav.tar'
b2_bucket.upload_local_file(
    local_file=file,
    file_name=outPutname,
    file_infos=file_info,
)

<b2sdk.file_version.FileVersionInfo at 0x7ff18a238748>